File size: 3,459 Bytes
d06496c
 
 
 
b409a80
d06496c
 
 
9615424
d06496c
2e39235
9615424
 
b409a80
d06496c
b409a80
d06496c
 
b409a80
d06496c
 
2113989
d06496c
 
 
2113989
d06496c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c8d160
d06496c
 
4c8d160
d06496c
 
4c8d160
 
 
 
 
d06496c
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import streamlit as st
import pandas as pd
import numpy as np
import altair as alt
import os
from PIL import Image
from embeddings.embeddings import load_model
from sentence_transformers import  util
import warnings

warnings.filterwarnings('ignore')

# cluster PNG file
image = Image.open('plots/clusters.png')

chart_data = pd.read_csv(r"data/top_cluster_dataset.csv",dtype={'Headline': str, 'x': np.float64, 'y': np.float64, 'labels': str})

# Create a Streamlit app
st.set_page_config(page_title="Sinhala Embedding Space", page_icon=":bar_chart:")

# Define tabs
tabs = ["Clustering Results","Sentences Similarity"]
selected_tab = st.sidebar.radio("Select a Tab", tabs)

# Main content
if selected_tab == "Sentences Similarity":
    sample_sentences = chart_data['Headline'].sample(10, random_state=1).tolist()
    st.title("Calculate Sentences Similarity")
    # select model to use dropdown
    st.subheader("Select a model to use")
    model_list = ["Ransaka/SinhalaRoberta","keshan/SinhalaBERTo"]
    selected_model = st.selectbox("Select Model", model_list)
    model = load_model(selected_model)
    
    sentence1 = st.text_input("Enter Sentence 1", "")
    sentence2 = st.text_input("Enter Sentence 2", "")

    if sentence1 and sentence2:
        # add button to calculate similarity
        if st.button("Calculate Similarity"):
            with st.spinner('Calculating Similarity...'):
                # Calculate similarity
                similarity = util.pytorch_cos_sim(model.encode(sentence1), model.encode(sentence2))[0][0]
                if similarity > 0.7:
                    st.success(f"Sentences are similar (Score: {similarity:.3f})")
                elif similarity > 0.5:
                    st.warning(f"Sentences are somewhat similar (Score: {similarity:.3f})")
                else:
                    st.error(f"Sentences are not similar (Score: {similarity:.3f})")
    else:
        st.write("Enter two sentences to calculate similarity. Or start with sample sentences below.")
        # change radio button to randomize sentences and show sample sentences
        if st.button("Randomize Sentences"):
            sample_sentences = chart_data['Headline'].sample(10).tolist()
        for sentence in sample_sentences:
            # show sample sentences in small font
            st.write(sentence)

elif selected_tab == "Clustering Results":
    st.title("Clustering Results")
    
    # Display PNG image
    st.subheader("Full Clustering Results")
    st.image(image, use_column_width=False, caption='Static PNG File',width=750)
    
    with st.spinner('Loading Interactive Results...'):
        # Display Altair chart
        st.subheader("Interactive Chart")
        chart = alt.Chart(chart_data).mark_circle(size=60).encode(x='x', y='y', color='labels', tooltip=['Headline']).interactive()
        st.altair_chart(chart, use_container_width=True)
    
    # Dropdown functionality to update DataFrame
    st.subheader("Select a cluster")
    unique_clusters = chart_data['labels'].unique().tolist()
    selected_value = st.selectbox("Select Value", unique_clusters)
    
    # Filter and display results based on selected cluster
    if selected_value:
        filtered_data = chart_data[chart_data['labels'].str.contains(selected_value, case=False)].sample(10)[['Headline']].reset_index(drop=True)
        st.dataframe(filtered_data,width=750)
    else:
        st.write("Select a cluster to display results.")