Spaces:

Ransaka
/

sinhala-embedding-space

Running

App Files Files Community

Ransaka committed on Sep 24, 2023

Commit

d06496c

1 Parent(s): 4d31406

Added files

Browse files

Files changed (12) hide show

.gitignore +160 -0
app.py +92 -0
clustering/clustering.py +58 -0
data/top_cluster_dataset.csv +0 -0
embeddings/__int__.py +0 -0
embeddings/embeddings.py +100 -0
plots/chart.html +0 -0
plots/clusters.png +0 -0
requirements.txt +12 -0
search_demo.py +35 -0
vector_search/__init__.py +0 -0
vector_search/vector_search.py +102 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,160 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

app.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import altair as alt
+from PIL import Image
+from embeddings.embeddings import load_model
+from sentence_transformers import  util
+# Create sample data
+data = pd.DataFrame({
+    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
+    'Age': [25, 30, 22, 35]
+})
+# Sample PNG file
+image = Image.open('plots\clusters.png')
+# Sample HTML chart
+chart_data = pd.read_csv(r"data\top_cluster_dataset.csv",dtype={'Headline': str, 'x': np.float64, 'y': np.float64, 'labels': str})
+# Create a Streamlit app
+st.set_page_config(page_title="Sample Webpage", page_icon=":bar_chart:")
+# Define tabs
+tabs = ["Search", "Clustering Results"]
+selected_tab = st.sidebar.radio("Select a Tab", tabs)
+# Main content
+if selected_tab == "Search":
+    sample_sentences = chart_data['Headline'].sample(10, random_state=1).tolist()
+    st.title("Calculate Sentences Similarity")
+    # select model to use dropdown
+    st.subheader("Select a model to use")
+    model_list = ["Ransaka/SinhalaRoberta","keshan/SinhalaBERTo"]
+    selected_model = st.selectbox("Select Model", model_list)
+    model = load_model(selected_model)
+    sentence1 = st.text_input("Enter Sentence 1", "")
+    sentence2 = st.text_input("Enter Sentence 2", "")
+    if sentence1 and sentence2:
+        # add button to calculate similarity
+        if st.button("Calculate Similarity"):
+            with st.spinner('Calculating Similarity...'):
+                # Calculate similarity
+                similarity = util.pytorch_cos_sim(model.encode(sentence1), model.encode(sentence2))[0][0]
+                if similarity > 0.7:
+                    st.success(f"Sentences are similar (Score: {similarity:.3f})")
+                elif similarity > 0.5:
+                    st.warning(f"Sentences are somewhat similar (Score: {similarity:.3f})")
+                else:
+                    st.error(f"Sentences are not similar (Score: {similarity:.3f})")
+    else:
+        st.write("Enter two sentences to calculate similarity. Or start with sample sentences below.")
+        # change radio button to randomize sentences and show sample sentences
+        if st.button("Randomize Sentences"):
+            sample_sentences = chart_data['Headline'].sample(10).tolist()
+        for sentence in sample_sentences:
+            # show sample sentences in small font
+            st.write(sentence)
+elif selected_tab == "Clustering Results":
+    st.title("Clustering Results Tab")
+    # Display PNG image
+    st.subheader("Static PNG File")
+    st.image(image, use_column_width=False, caption='Static PNG File',width=750)
+    altair_chart = alt.Chart(chart_data).mark_circle().encode(
+        x='x',
+        y='y',
+        color='labels',
+        tooltip='Headline'
+    ).properties(
+        width=750,
+        height=500
+    ).interactive()
+    # Display chart
+    st.subheader("Interactive Chart for top clusters")
+    st.altair_chart(altair_chart, use_container_width=False, theme="streamlit")
+    # Dropdown functionality to update DataFrame
+    st.subheader("Select a cluster")
+    unique_clusters = chart_data['labels'].unique().tolist()
+    selected_value = st.selectbox("Select Value", unique_clusters)
+    # Filter and display results based on selected cluster
+    if selected_value:
+        filtered_data = chart_data[chart_data['labels'].str.contains(selected_value, case=False)].sample(10)[['Headline']].reset_index(drop=True)
+        st.dataframe(filtered_data,width=750)
+    else:
+        st.write("Select a cluster to display results.")

clustering/clustering.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import hdbscan
+import umap
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+def load_data():
+    # Load data
+    embeddings = np.load(r'data\top_cluster_embeddings.npy')
+    return embeddings
+def get_clusters(embeddings):
+    # Get clusters
+    umap_embeddings = umap.UMAP(
+        n_neighbors=15,
+        n_components=15,
+        metric='cosine'
+        ).fit_transform(embeddings)
+    cluster = hdbscan.HDBSCAN(
+        min_cluster_size=30,
+        metric='euclidean',
+        cluster_selection_method='eom'
+        ).fit(umap_embeddings)
+    return cluster.labels_
+def get_2d_data_for_plotting(embeddings):
+    # Get 2D data for plotting
+    umap_embeddings = umap.UMAP(
+        n_neighbors=15,
+        n_components=2,
+        metric='cosine'
+        ).fit_transform(embeddings)
+    return umap_embeddings
+def plot_clusters(embeddings, cluster_labels):
+    umap_data = get_2d_data_for_plotting(embeddings)
+    result = pd.DataFrame(umap_data, columns=['x', 'y'])
+    result['labels'] = cluster_labels
+    # Visualize clusters
+    fig, ax = plt.subplots(figsize=(20, 10))
+    outliers = result.loc[result.labels == -1, :]
+    clustered = result.loc[result.labels != -1, :]
+    plt.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
+    plt.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.05, cmap='hsv_r')
+    plt.colorbar()
+    plt.savefig(r'plots\clusters.png', dpi=300)
+def main():
+    embeddings = load_data()
+    cluster_labels = get_clusters(embeddings)
+    plot_clusters(embeddings, cluster_labels)
+if __name__ == '__main__':
+    main()

data/top_cluster_dataset.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

embeddings/__int__.py ADDED Viewed

File without changes

embeddings/embeddings.py ADDED Viewed

	@@ -0,0 +1,100 @@

+"""
+This file contains the code for the embeddings.
+    Tested models as follows:
+        - Ransaka/SinhalaRoberta
+        - keshan/SinhalaBERTo
+This file uses the Ransaka/SinhalaRoberta model for the embeddings.
+You can download the model from huggingface.co
+    - https://huggingface.co/Ransaka/SinhalaRoberta
+    - https://huggingface.co/keshan/SinhalaBERTo
+You can download dataset from kaggle.com
+    - https://www.kaggle.com/datasets/ransakaravihara/hiru-news-set3
+"""
+import random
+import numpy as np
+import pandas as pd
+import torch
+from sentence_transformers import SentenceTransformer, models,util
+model_id = "Ransaka/SinhalaRoberta"  # model identifier passed to load_model when this file runs as a script
+def load_and_process_data(file_path:str)->list:
+    """
+    This function loads the data from the file path and process it.
+    """
+    def processor(text:str)->str:
+        """Only addresses the most common issues in the dataset"""
+        return text\
+            .replace("\u200d","")\
+            .replace("Read More..","")\
+            .replace("ඡායාරූප","")\
+            .replace("\xa0","")\
+            .replace("වීඩියෝ","")\
+            .replace("()","")
+    def basic_processing(series:pd.Series)->pd.Series:
+        """Applies the processor function to a pandas series"""
+        return series\
+        .apply(processor)
+    df  = pd.read_csv(file_path)
+    df.dropna(inplace=True)
+    df['Headline'] = basic_processing(df['Headline'])
+    # df['fullText'] = basic_processing(df['fullText'])
+    #only headlines used for the embeddings
+    sentences = df['Headline'].values.tolist()
+    random.shuffle(sentences)
+    return sentences
+def load_model(model_id:str)->SentenceTransformer:
+    """
+    This function loads the model from the huggingface.co
+    """
+    word_embedding_model = models.Transformer(model_id, max_seq_length=514)
+    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+    return model
+def get_embeddings(model: SentenceTransformer, sentences: list)->list:
+    """
+    This function returns the embeddings for the given sentences.
+    """
+    return model.encode(sentences)
+def save_embeddings(embeddings: list, file_path: str):
+    """
+    This function saves the embeddings to the given file path.
+    """
+    np.save(file_path, embeddings)
+def load_embeddings(file_path: str)->list:
+    """
+    This function loads the embeddings from the given file path.
+    """
+    return np.load(file_path)
+def get_similar(model:SentenceTransformer,embeddings: list, query: str, top_k: int = 5)->list:
+    """
+    This function returns the top k similar sentences for the given query.
+    """
+    query_embedding = model.encode([query])[0]
+    cos_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
+    top_results = torch.topk(cos_scores, k=top_k)
+    return top_results
+if __name__ == "__main__":
+    file_path = r"data\top_cluster_dataset.csv"
+    #load and process data
+    sentences = load_and_process_data(file_path)
+    model = load_model(model_id)
+    #get embeddings
+    embeddings = get_embeddings(model, sentences)
+    save_embeddings(embeddings, r"data\embeddings.npy")

plots/chart.html ADDED Viewed

The diff for this file is too large to render. See raw diff

plots/clusters.png ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+altair==5.1.1
+faiss-cpu==1.7.4
+hdbscan==0.8.1
+numba==0.58.0
+numpy==1.25.2
+sentence-transformers==2.2.2
+sentencepiece==0.1.99
+streamlit==1.27.0
+tokenizers==0.13.3
+torch==2.0.1
+transformers==4.33.2
+umap-learn==0.5.4

search_demo.py ADDED Viewed

	@@ -0,0 +1,35 @@

+"""
+Sample results:
+Query:  ක්ෂය රෝග මර්දන ව්යාපාරයේ පී.සී.ආර්. යන්ත්ර 36 භාවිතයට ගන්නැයි ඉල්ලීමක්
+Results:
+        - ක්ෂය රෝග මර්දන ව්යාපාරයේ පී.සී.ආර්. යන්ත්ර 36 භාවිතයට ගන්නැයි ඉල්ලීමක්
+        - ජාතික රෝහලේ අද සිට දිනකට පී.සී.ආර් පරීක්ෂණ 200 ක්
+        - පී.සී.ආර්.සාම්පල රසායනාගාරවල ගොඩගැසී ඇතැයි වෛද්ය සංගමයෙන් චෝදනා
+        - කොරෝනා සොයන්න දිනකට පී.සී.ආර්. පරීක්ෂණ, 6000 ක් කිරීමේ සැලසුම්
+Query:  පොළොන්නරුව මහරෝහලේ අකුරට වැඩ කිරීමේ වෘත්තීය ක්රියාමාර්ගයක්
+Results:
+        - පොළොන්නරුව මහරෝහලේ අකුරට වැඩ කිරීමේ වෘත්තීය ක්රියාමාර්ගයක්
+        - අකුරට වැඩ කළ රේගු වෘත්තීය සමිති, වර්ජනයකට සැරසේ
+        - ජාතික සත්ත්වෝද්යාන වෘත්තීය සමිති වැඩ වර්ජනයක
+        - ජල සම්පාදන වෘත්තීය සමිති ඒකාබද්ධ සන්ධානයෙන් වෘත්තීය ක්රියාමාර්ගවලට
+Query:  අංගොඩ අයි ඩී එච් රෝහලේ ඩෙංගු විශේෂ ප්රතිකාර ඒකකය තවම නැහැ
+Results:
+        - අංගොඩ අයි ඩී එච් රෝහලේ ඩෙංගු විශේෂ ප්රතිකාර ඒකකය තවම නැහැ
+        - අයි.ඩී.එච්. රෝහලෙන් පැන ගිය කොරෝනා ආසාදිත කාන්තාව සොයා තවදුරටත් මෙහෙයුම්
+        - අයි.ඩී.එච්. රෝහලෙන් පැන්න කොරෝනා ආසාදිත කාන්තාව සොයන මෙහෙයුම අඛණ්ඩව
+        - කොරෝනා වෛරසය ආසාදනය වී ඇත්දැයි සැකයෙන්, සතියක් තුල 71ක් අයි.ඩී.එච් රෝහලට
+Query:  කමිටු ගැන විශ්වාසයක් නැහැ - මාළඹේ පෞද්ගලික වෛද්ය විද්යාලයීය දෙමාපිය සංසදය
+Results:
+        - කමිටු ගැන විශ්වාසයක් නැහැ - මාළඹේ පෞද්ගලික වෛද්ය විද්යාලයීය දෙමාපිය සංසදය
+        - මාළඹේ වෛද්ය විද්යාලයීය දෙමාපිය සංසදය ජනපති ලේකම් කාර්යාලයට
+        - සයිටම් ගැටළුව වෙනතකට යොමුකිරීමට ආණ්ඩුව උපක්රම යොදනවා - වෛද්ය පීඨ ශිෂ්ය ක්රියාකාරී කමිටුව
+        - එකම විසඳුම සයිටම් අහෝසි කිරීමයි - වෛද්ය පීඨ ශිෂ්ය ක්රියාකාරී කමිටුව
+"""
+from vector_search.vector_search import search_demo
+if __name__ == "__main__":  # run the demo only when this file is executed as a script
+    search_demo(top_k=4)  # request the 4 nearest headlines for each query

vector_search/__init__.py ADDED Viewed

File without changes

vector_search/vector_search.py ADDED Viewed

	@@ -0,0 +1,102 @@

+"""
+This file searches for the most similar vectors in the database using the faiss library.
+The Indexer class is taken from the daily-llama repo (https://github.com/Ransaka/daily-llama).
+"""
+import numpy as np
+import pandas as pd
+from embeddings.embeddings import load_model, model_id
+# from daily llama repo
+import faiss
+class Indexer:
+  def __init__(self, embed_vec):
+    self.embeddings_vec = embed_vec
+    self.build_index()
+  def build_index(self):
+    """
+    Build the index for the embeddings.
+    This function initializes the index for the embeddings. It calculates the dimension (self.d)
+    of the embeddings vector and creates an IndexFlatL2 object (self.index) for the given dimension.
+    It then adds the embeddings vector (self.embeddings_vec) to the index.
+    Parameters:
+    - None
+    Return:
+    - None
+    """
+    self.d = self.embeddings_vec.shape[1]
+    self.index = faiss.IndexFlatL2(self.d)
+    self.index.add(self.embeddings_vec)
+  def topk(self, vector, k = 4):
+    """
+        A function that takes in a vector and an optional parameter k and returns the indices of the k nearest neighbors in the index.
+        Parameters:
+            vector: A numpy array representing the input vector.
+            k (optional): An integer representing the number of nearest neighbors to retrieve. Defaults to 4 if not specified.
+        Returns:
+            I: A numpy array containing the indices of the k nearest neighbors in the index.
+    """
+    # vec = self.retreaver.encode(text)['embeddings'].detach().cpu().numpy()
+    _, I = self.index.search(vector, k)
+    return I
+def get_embeddings_vec(file_path):
+    """
+    This function loads the embeddings from the given file path.
+    Parameters:
+    - file_path: A string representing the path to the embeddings file.
+    Return:
+    - embeddings_vec: A numpy array containing the embeddings.
+    """
+    return np.load(file_path)
+def get_similar(indexer, text_embeddings, top_k = 5):
+    """
+    This function returns the top k similar sentences for the given query.
+    Parameters:
+    - indexer: An Indexer object representing the indexer for the embeddings.
+    - text_embeddings: A np.array representing the query embeddings.
+    - top_k (optional): An integer representing the number of nearest neighbors to retrieve. Defaults to 4 if not specified.
+    Return:
+    - top_results: A numpy array containing the indices of the k nearest neighbors in the index.
+    """
+    return indexer.topk(text_embeddings,k=top_k).flatten()
+def search_demo(test_queries:list=None,top_k:int=1):
+    """
+    This function returns the top k similar sentences for the given query.
+    """
+    model = load_model(model_id)
+    embeddings_vec = get_embeddings_vec(r"data\top_cluster_embeddings.npy")
+    indexer = Indexer(embeddings_vec)
+    cluster_dataset = pd.read_csv(r"data\top_cluster_dataset.csv",usecols=['Headline'])
+    search_space = cluster_dataset['Headline'].values.tolist()
+    if test_queries is None:
+        test_queries = [
+            "ක්ෂය රෝග මර්දන ව්යාපාරයේ පී.සී.ආර්. යන්ත්ර 36 භාවිතයට ගන්නැයි ඉල්ලීමක්",
+            "පොළොන්නරුව මහරෝහලේ අකුරට වැඩ කිරීමේ වෘත්තීය ක්රියාමාර්ගයක්",
+            "අංගොඩ අයි ඩී එච් රෝහලේ ඩෙංගු විශේෂ ප්රතිකාර ඒකකය තවම නැහැ ",
+            "කමිටු ගැන විශ්වාසයක් නැහැ - මාළඹේ පෞද්ගලික වෛද්ය විද්යාලයීය දෙමාපිය සංසදය"
+        ]
+    for query in test_queries:
+        query_embeddings = model.encode(query).reshape(1,-1)
+        print("Query: ", query)
+        print("Results: ")
+        for index in get_similar(indexer, query_embeddings, top_k = top_k):
+            print("\t-",search_space[index])
+        print()