Spaces:

GroNLP
/

agalma

Sleeping

App Files Files Community

Mark7549 committed on Apr 12, 2024

Commit

7b3478d

1 Parent(s): d24cb74

Created 3d graph functionality, not optimal yet

Browse files

Files changed (4) hide show

app.py +37 -1
plots.py +144 -0
vector_graph.py +73 -0
word2vec.py +26 -0

app.py CHANGED Viewed

@@ -3,6 +3,8 @@ from streamlit_option_menu import option_menu
 from word2vec import *
 import pandas as pd
 from autocomplete import *
 st.set_page_config(page_title="Ancient Greek Word2Vec", layout="centered")
@@ -112,9 +114,43 @@ elif active_tab == "Cosine similarity":
 # 3D graph tab
 elif active_tab == "3D graph":
     with st.container():
-        st.write("3D graph tab")
 # Dictionary tab
 elif active_tab == "Dictionary":
     with st.container():

 from word2vec import *
 import pandas as pd
 from autocomplete import *
+from vector_graph import *
+from plots import *
 st.set_page_config(page_title="Ancient Greek Word2Vec", layout="centered")
 # 3D graph tab
 elif active_tab == "3D graph":
+    col1, col2 = st.columns(2)
+    # Load compressed word list
+    compressed_word_list_filename = 'corpora/compass_filtered.pkl.gz'
+    all_words = load_compressed_word_list(compressed_word_list_filename)
     with st.container():
+        with col1:
+            word = st.multiselect("Enter a word", all_words, max_selections=1)
+            if len(word) > 0:
+                word = word[0]
+        with col2:
+            time_slice = st.selectbox("Time slice", ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"])
+        n = st.slider("Number of words", 1, 50, 15)
+        graph_button = st.button("Create 3D graph")
+        if graph_button:
+            time_slice_model = convert_time_name_to_model(time_slice)
+            nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
+            # nearest_neighbours_3d_vectors = create_3d_vectors(word, time_slice_model, nearest_neighbours_vectors)
+            st.dataframe(nearest_neighbours_vectors)
+            # new_3d_vectors = nearest_neighbours_to_pca_vectors(word, time_slice, nearest_neighbours_vectors)
+            # st.dataframe(new_3d_vectors)
+            fig, df = make_3d_plot4(nearest_neighbours_vectors, word, time_slice_model)
+            st.dataframe(df)
+            st.plotly_chart(fig)
 # Dictionary tab
 elif active_tab == "Dictionary":
     with st.container():

plots.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import streamlit as st
+import matplotlib.pyplot as plt
+import numpy as np
+from mpl_toolkits.mplot3d import Axes3D
+import umap
+import pandas as pd
+from word2vec import *
+from sklearn.preprocessing import StandardScaler
+def make_3d_plot(new_3d_vectors):
+    """
+    Turn DataFrame of 3D vectors into a 3D plot
+    DataFrame structure: ['word', 'cosine_sim', '3d_vector']
+    """
+    fig = plt.figure()
+    ax = fig.add_subplot(projection='3d')
+    plt.ion()
+    # Unpack vectors and labels from DataFrame
+    labels = new_3d_vectors['word']
+    x = new_3d_vectors['3d_vector'].apply(lambda v: v[0])
+    y = new_3d_vectors['3d_vector'].apply(lambda v: v[1])
+    z = new_3d_vectors['3d_vector'].apply(lambda v: v[2])
+    # Plot points
+    ax.scatter(x, y, z)
+    # Add labels
+    for i, label in enumerate(labels):
+        ax.text(x[i], y[i], z[i], label)
+    # Set labels and title
+    ax.set_xlabel('X')
+    ax.set_ylabel('Y')
+    ax.set_zlabel('Z')
+    ax.set_title('3D plot of word vectors')
+    return fig
+import plotly.express as px
+def make_3d_plot2(df):
+    """
+        Turn DataFrame of 3D vectors into a 3D plot using plotly
+        DataFrame structure: ['word', 'cosine_sim', '3d_vector']
+    """
+    vectors = df['3d_vector'].tolist()
+    fig = px.scatter_3d(df, x=[v[0] for v in vectors], y=[v[1] for v in vectors], z=[v[2] for v in vectors], text=df['word'])
+    return fig
+def make_3d_plot3(vectors_list, word, time_slice_model):
+    """
+    Turn list of 100D vectors into a 3D plot using UMAP and Plotly.
+    List structure: [(word, model_name, vector, cosine_sim)]
+    """
+    # Load model
+    model = load_word2vec_model(f'models/{time_slice_model}.model')
+    # Make UMAP model and fit it to the vectors
+    umap_model = umap.UMAP(n_components=3)
+    umap_model.fit(model.wv.vectors)
+    # Transform the vectors to 3D
+    transformed_vectors = umap_model.transform(model.wv.vectors)
+    # Create DataFrame from the transformed vectors
+    df = pd.DataFrame(transformed_vectors, columns=['x', 'y', 'z'])
+    # Add word and cosine similarity to DataFrame
+    df['word'] = model.wv.index_to_key
+    # Filter the DataFrame for words in vectors_list and add cosine similarity
+    word_list = [v[0] for v in vectors_list]
+    cosine_sim_list = [v[3] for v in vectors_list]
+    # Ensure that the word list and cosine similarity list are aligned properly
+    df = df[df['word'].isin(word_list)]
+    df['cosine_sim'] = cosine_sim_list
+    # Create plot
+    fig = px.scatter_3d(df, x='x', y='y', z='z', text='word', color='cosine_sim', color_continuous_scale='Reds')
+    fig.update_traces(marker=dict(size=5))
+    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
+    return fig, df
+def make_3d_plot4(vectors_list, word, time_slice_model):
+    """
+    Turn list of 100D vectors into a 3D plot using UMAP and Plotly.
+    List structure: [(word, model_name, vector, cosine_sim)]
+    """
+    # Load model
+    model = load_word2vec_model(f'models/{time_slice_model}.model')
+    model_dict = model_dictionary(model)
+    # Extract vectors and names from model_dict
+    all_vector_names = list(model_dict.keys())
+    all_vectors = list(model_dict.values())
+    # Scale the vectors
+    scaler = StandardScaler()
+    vectors_scaled = scaler.fit_transform(all_vectors)
+    # Make UMAP model and fit it to the scaled vectors
+    umap_model = umap.UMAP(n_components=3)
+    umap_result = umap_model.fit_transform(vectors_scaled)
+    # Now umap_result contains the 3D representations of the vectors
+    # Associate the names with the 3D representations
+    result_with_names = [(all_vector_names[i], umap_result[i]) for i in range(len(all_vector_names))]
+    # Only keep the vectors that are in vectors_list and their cosine similarities
+    result_with_names = [r for r in result_with_names if r[0] in [v[0] for v in vectors_list]]
+    result_with_names = [(r[0], r[1], [v[3] for v in vectors_list if v[0] == r[0]][0]) for r in result_with_names]
+    # Create DataFrame from the transformed vectors
+    df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])
+    # Sort dataframe by cosine_sim
+    df = df.sort_values(by='cosine_sim', ascending=False)
+    x = df['3d_vector'].apply(lambda v: v[0])
+    y = df['3d_vector'].apply(lambda v: v[1])
+    z = df['3d_vector'].apply(lambda v: v[2])
+    # Create plot
+    fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
+    fig.update_traces(marker=dict(size=5))
+    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
+    return fig, df

vector_graph.py ADDED Viewed

	@@ -0,0 +1,73 @@

+from word2vec import *
+import numpy as np
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+import pandas as pd
+import gensim
+import umap
+def create_3d_vectors(word, time_slice, nearest_neighbours_vectors):
+    """
+    Turn word vectors into 3D vectors
+    """
+    model = load_word2vec_model(f'models/{time_slice}.model')
+    # Compress all vectors to 3D
+    model_df = pd.DataFrame(model.wv.vectors)
+    pca_vectors = PCA(n_components=3)
+    pca_model = pca_vectors.fit_transform(model_df)
+    pca_model_df = pd.DataFrame(
+        data = pca_model,
+        columns = ['x', 'y', 'z']
+    )
+    pca_model_df.insert(0, 'word', model.wv.index_to_key)
+    return pca_model_df
+def create_3d_models(time_slice):
+    """
+    Create 3D models for each time slice
+    """
+    time_slice_model = convert_time_name_to_model(time_slice)
+    model = load_word2vec_model(f'models/{time_slice_model}.model')
+    # Compress all vectors to 3D
+    model_df = pd.DataFrame(model.wv.vectors)
+    pca_vectors = PCA(n_components=3)
+    pca_model = pca_vectors.fit_transform(model_df)
+    pca_model_df = pd.DataFrame(
+        data = pca_model,
+        columns = ['x', 'y', 'z']
+    )
+    pca_model_df.insert(0, 'word', model.wv.index_to_key)
+    pca_model_df.to_csv(f'3d_models/{time_slice}_3d.csv', index=False)
+    return pca_model_df, pca_vectors
+def nearest_neighbours_to_pca_vectors(word, time_slice, nearest_neighbours_vectors):
+    """
+    Turn nearest neighbours into 3D vectors
+    """
+    model_df = pd.read_csv(f'3d_models/{time_slice}_3d.csv')
+    new_data = []
+    # Get the word vector for the nearest neighbours
+    for neighbour in nearest_neighbours_vectors:
+        word = neighbour[0]
+        cosine_sim = neighbour[3]
+        vector_3d = model_df[model_df['word'] == word][['x', 'y', 'z']].values[0]
+        # Add word, cosine_sim and 3D vector to new data list
+        new_data.append({'word': word, 'cosine_sim': cosine_sim, '3d_vector': vector_3d})
+    # Convert the list of dictionaries to a DataFrame
+    new_df = pd.DataFrame(new_data)
+    return new_df

word2vec.py CHANGED Viewed

@@ -235,6 +235,32 @@ def get_nearest_neighbours(word, time_slice_model, n=10, models=load_all_models(
     return sorted(nearest_neighbours, key=lambda x: x[2], reverse=True)
 def write_to_file(data):
     '''
         Write the data to a file

     return sorted(nearest_neighbours, key=lambda x: x[2], reverse=True)
+def get_nearest_neighbours_vectors(word, time_slice_model, n=15):
+    """
+        Returns the vectors of the nearest neighbours of a word
+    """
+    model_name = convert_model_to_time_name(time_slice_model)
+    time_slice_model = load_word2vec_model(f'models/{time_slice_model}.model')
+    vector_1 = get_word_vector(time_slice_model, word)
+    nearest_neighbours = []
+    for word, index in time_slice_model.wv.key_to_index.items():
+        vector_2 = get_word_vector(time_slice_model, word)
+        cosine_sim = cosine_similarity(vector_1, vector_2)
+        if len(nearest_neighbours) < n:
+            nearest_neighbours.append((word, model_name, vector_2, cosine_sim))
+        else:
+            smallest_neighbour = min(nearest_neighbours, key=lambda x: x[3])
+            if cosine_sim > smallest_neighbour[3]:
+                nearest_neighbours.remove(smallest_neighbour)
+                nearest_neighbours.append((word, model_name, vector_2, cosine_sim))
+    return sorted(nearest_neighbours, key=lambda x: x[3], reverse=True)
 def write_to_file(data):
     '''
         Write the data to a file