Spaces:

GroNLP
/

agalma

Sleeping

App Files Community

Mark7549 committed on May 11, 2024

Commit

8fb441e

1 Parent(s): 7088ca8

nn function now compares vectors of target word only with vectors within the same model

Browse files

Files changed (2) hide show

app.py +33 -34
word2vec.py +67 -6

app.py CHANGED Viewed

@@ -12,6 +12,9 @@ from streamlit_tags import st_tags, st_tags_sidebar
 st.set_page_config(page_title="Ancient Greek Word2Vec", layout="centered")
 # Horizontal menu
 active_tab = option_menu(None, ["Nearest neighbours", "Cosine similarity", "3D graph", 'Dictionary'],
     menu_icon="cast", default_index=0, orientation="horizontal")
@@ -29,59 +32,55 @@ if active_tab == "Nearest neighbours":
     all_words = load_compressed_word_list(compressed_word_list_filename)
     eligible_models = ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"]
     with st.container():
-        with col1:
-            word = st.multiselect("Enter a word", all_words, max_selections=1)
-            if len(word) > 0:
-                word = word[0]
-                # Check which models contain the word
-                eligible_models = check_word_in_models(word)
-        with col2:
-            time_slice = st.selectbox("Time slice", eligible_models)
         models = st.multiselect(
             "Select models to search for neighbours",
-            ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"]
             )
         n = st.slider("Number of neighbours", 1, 50, 15)
-        nearest_neighbours_button = st.button("Find nearest neighbours")
         # If the button to calculate nearest neighbours is clicked
-        if nearest_neighbours_button:
-            # Rewrite timeslices to model names: Archaic -> archaic_cbow
-            if time_slice == 'Hellenistic':
-                time_slice = 'hellen'
-            elif time_slice == 'Early Roman':
-                time_slice = 'early_roman'
-            elif time_slice == 'Late Roman':
-                time_slice = 'late_roman'
-            time_slice = time_slice.lower() + "_cbow"
             # Check if all fields are filled in
-            if validate_nearest_neighbours(word, time_slice, n, models) == False:
                 st.error('Please fill in all fields')
             else:
                 # Rewrite models to list of all loaded models
                 models = load_selected_models(models)
-                nearest_neighbours = get_nearest_neighbours(word, time_slice, n, models)
-                df = pd.DataFrame(
-                    nearest_neighbours,
-                    columns=["Word", "Time slice", "Similarity"],
-                    index = range(1, len(nearest_neighbours) + 1)
-                )
-                st.table(df)
                 # Store content in a temporary file
-                tmp_file = store_df_in_temp_file(df)
                 # Open the temporary file and read its content
                 with open(tmp_file, "rb") as file:
@@ -91,7 +90,7 @@ if active_tab == "Nearest neighbours":
                     st.download_button(
                         "Download results",
                         data=file_byte,
-                        file_name = f'nearest_neighbours_{word}_{time_slice}.xlsx',
                         mime='application/octet-stream'
                         )

 st.set_page_config(page_title="Ancient Greek Word2Vec", layout="centered")
+def click_nn_button():
+    st.session_state.nearest_neighbours = not st.session_state.nearest_neighbours
 # Horizontal menu
 active_tab = option_menu(None, ["Nearest neighbours", "Cosine similarity", "3D graph", 'Dictionary'],
     menu_icon="cast", default_index=0, orientation="horizontal")
     all_words = load_compressed_word_list(compressed_word_list_filename)
     eligible_models = ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"]
+    if 'nearest_neighbours' not in st.session_state:
+        st.session_state.nearest_neighbours = False
     with st.container():
+        word = st.multiselect("Enter a word", all_words, max_selections=1)
+        if len(word) > 0:
+            word = word[0]
+            # Check which models contain the word
+            eligible_models = check_word_in_models(word)
         models = st.multiselect(
             "Select models to search for neighbours",
+            eligible_models
             )
         n = st.slider("Number of neighbours", 1, 50, 15)
+        nearest_neighbours_button = st.button("Find nearest neighbours", on_click = click_nn_button)
         # If the button to calculate nearest neighbours is clicked
+        if st.session_state.nearest_neighbours:
             # Check if all fields are filled in
+            if validate_nearest_neighbours(word, n, models) == False:
                 st.error('Please fill in all fields')
             else:
                 # Rewrite models to list of all loaded models
                 models = load_selected_models(models)
+                nearest_neighbours = get_nearest_neighbours(word, n, models)
+                all_dfs = []
+                # Create dataframes
+                for model in nearest_neighbours.keys():
+                    st.write(f"### {model}")
+                    df = pd.DataFrame(
+                        nearest_neighbours[model],
+                        columns = ['Word', 'Cosine Similarity']
+                    )
+                    all_dfs.append((model, df))
+                    st.table(df)
                 # Store content in a temporary file
+                tmp_file = store_df_in_temp_file(all_dfs)
                 # Open the temporary file and read its content
                 with open(tmp_file, "rb") as file:
                     st.download_button(
                         "Download results",
                         data=file_byte,
+                        file_name = f'nearest_neighbours_{word}_TEST.xlsx',
                         mime='application/octet-stream'
                         )

word2vec.py CHANGED Viewed

@@ -148,11 +148,11 @@ def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
-def validate_nearest_neighbours(word, time_slice_model, n, models):
     '''
         Validate the input of the nearest neighbours function
     '''
-    if word == '' or time_slice_model == [] or n == '' or models == []:
         return False
     return True
@@ -198,7 +198,7 @@ def convert_time_name_to_model(time_name):
     elif time_name == 'archaic':
         return 'Archaic'
-def get_nearest_neighbours(word, time_slice_model, n=10, models=load_all_models()):
     '''
         Return the nearest neighbours of a word
@@ -243,6 +243,51 @@ def get_nearest_neighbours(word, time_slice_model, n=10, models=load_all_models(
     return sorted(nearest_neighbours, key=lambda x: x[2], reverse=True)
 def get_nearest_neighbours_vectors(word, time_slice_model, n=15):
@@ -287,7 +332,7 @@ def write_to_file(data):
     return temp_file_path
-def store_df_in_temp_file(df):
     '''
         Store the dataframe in a temporary file
     '''
@@ -300,9 +345,25 @@ def store_df_in_temp_file(df):
     # Create random tmp file name
     _, temp_file_path = tempfile.mkstemp(prefix="temp_", suffix=".xlsx", dir=temp_dir)
-    # Write data to the temporary file
     with pd.ExcelWriter(temp_file_path, engine='xlsxwriter') as writer:
-        df.to_excel(writer, index=False)
     return temp_file_path

+def validate_nearest_neighbours(word, n, models):
     '''
         Validate the input of the nearest neighbours function
     '''
+    if word == '' or n == '' or models == []:
         return False
     return True
     elif time_name == 'archaic':
         return 'Archaic'
+def get_nearest_neighbours2(word, n=10, models=load_all_models()):
     '''
         Return the nearest neighbours of a word
     return sorted(nearest_neighbours, key=lambda x: x[2], reverse=True)
+def get_nearest_neighbours(target_word, n=10, models=load_all_models()):
+    """
+    Return the nearest neighbours of a word for the given models
+    word: the word for which the nearest neighbours are calculated
+    n: the number of nearest neighbours to return (default: 10)
+    models: list of tuples with the name of the time slice and the word2vec model (default: all in ./models)
+    Return: { 'model_name': [(word, cosine_similarity), ...], ... }
+    """
+    nearest_neighbours = {}
+    # Iterate over models and compute nearest neighbours
+    for model in models:
+        model_neighbours = []
+        model_name = convert_model_to_time_name(model[0])
+        model = model[1]
+        vector_1 = get_word_vector(model, target_word)
+        # Iterate over all words of the model
+        for word, index in model.wv.key_to_index.items():
+            vector_2 = get_word_vector(model, word)
+            cosine_sim = cosine_similarity(vector_1, vector_2)
+            # If the list of nearest neighbours is not full yet, add the current word
+            if len(model_neighbours) < n:
+                model_neighbours.append((word, cosine_sim))
+            else:
+                # If the list of nearest neighbours is full, replace the word with the smallest cosine similarity
+                smallest_neighbour = min(model_neighbours, key=lambda x: x[1])
+                if cosine_sim > smallest_neighbour[1]:
+                    model_neighbours.remove(smallest_neighbour)
+                    model_neighbours.append((word, cosine_sim))
+        # Sort the nearest neighbours by cosine similarity
+        model_neighbours = sorted(model_neighbours, key=lambda x: x[1], reverse=True)
+        # Add the model name and the nearest neighbours to the dictionary
+        nearest_neighbours[model_name] = model_neighbours
+    return nearest_neighbours
 def get_nearest_neighbours_vectors(word, time_slice_model, n=15):
     return temp_file_path
+def store_df_in_temp_file(all_dfs):
     '''
         Store the dataframe in a temporary file
     '''
     # Create random tmp file name
     _, temp_file_path = tempfile.mkstemp(prefix="temp_", suffix=".xlsx", dir=temp_dir)
+    # Concatenate all dataframes
+    df = pd.concat([df for _, df in all_dfs], axis=1, keys=[model for model, _ in all_dfs])
+    # Create an ExcelWriter object
     with pd.ExcelWriter(temp_file_path, engine='xlsxwriter') as writer:
+        # Create a new sheet
+        worksheet = writer.book.add_worksheet('Results')
+        # Write text before DataFrames
+        start_row = 0
+        for model, df in all_dfs:
+            # Write model name as text
+            worksheet.write(start_row, 0, f"Model: {model}")
+            # Write DataFrame
+            df.to_excel(writer, sheet_name='Results', index=False, startrow=start_row + 1, startcol=0)
+            # Update start_row for the next model
+            start_row += df.shape[0] + 3  # Add some space between models
     return temp_file_path