Mark7549 committed on
Commit 7b3478d · 1 Parent(s): d24cb74

Created 3d graph functionality, not optimal yet

Files changed (4):
  1. app.py +37 -1
  2. plots.py +144 -0
  3. vector_graph.py +73 -0
  4. word2vec.py +26 -0
app.py CHANGED
@@ -3,6 +3,8 @@ from streamlit_option_menu import option_menu
 from word2vec import *
 import pandas as pd
 from autocomplete import *
+from vector_graph import *
+from plots import *
 
 st.set_page_config(page_title="Ancient Greek Word2Vec", layout="centered")
 
@@ -112,9 +114,43 @@ elif active_tab == "Cosine similarity":
 
 # 3D graph tab
 elif active_tab == "3D graph":
+    col1, col2 = st.columns(2)
+
+    # Load compressed word list
+    compressed_word_list_filename = 'corpora/compass_filtered.pkl.gz'
+    all_words = load_compressed_word_list(compressed_word_list_filename)
+
     with st.container():
-        st.write("3D graph tab")
+        with col1:
+            word = st.multiselect("Enter a word", all_words, max_selections=1)
+            if len(word) > 0:
+                word = word[0]
+
+        with col2:
+            time_slice = st.selectbox("Time slice", ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"])
+
+        n = st.slider("Number of words", 1, 50, 15)
 
+        graph_button = st.button("Create 3D graph")
+
+        if graph_button:
+            time_slice_model = convert_time_name_to_model(time_slice)
+            nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
+            # nearest_neighbours_3d_vectors = create_3d_vectors(word, time_slice_model, nearest_neighbours_vectors)
+            st.dataframe(nearest_neighbours_vectors)
+            # new_3d_vectors = nearest_neighbours_to_pca_vectors(word, time_slice, nearest_neighbours_vectors)
+            # st.dataframe(new_3d_vectors)
+
+            fig, df = make_3d_plot4(nearest_neighbours_vectors, word, time_slice_model)
+
+            st.dataframe(df)
+
+            st.plotly_chart(fig)
+
 
 # Dictionary tab
 elif active_tab == "Dictionary":
     with st.container():
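The tab above chains the pieces added in this commit: resolve the selected time slice to a model name, collect the n closest words with their vectors, and pass them to the UMAP-based plot. A minimal sketch of that flow outside Streamlit, assuming the `models/` files referenced in this commit are available locally and using a hypothetical query word:

```python
# Hypothetical standalone run of the pipeline wired up in the "3D graph" tab.
# Assumes the models/ directory and the helpers added/used in this commit.
from word2vec import *   # convert_time_name_to_model, get_nearest_neighbours_vectors, ...
from plots import make_3d_plot4

query_word = "λόγος"     # placeholder example word
time_slice_model = convert_time_name_to_model("Classical")

# (word, model_name, vector, cosine_sim) tuples for the 15 closest words
neighbours = get_nearest_neighbours_vectors(query_word, time_slice_model, n=15)

# Reduce the vocabulary to 3D with UMAP and build the Plotly figure
fig, df = make_3d_plot4(neighbours, query_word, time_slice_model)
fig.show()
```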
plots.py ADDED
@@ -0,0 +1,144 @@
+import streamlit as st
+import matplotlib.pyplot as plt
+import numpy as np
+from mpl_toolkits.mplot3d import Axes3D
+import umap
+import pandas as pd
+import plotly.express as px
+from word2vec import *
+from sklearn.preprocessing import StandardScaler
+
+
+def make_3d_plot(new_3d_vectors):
+    """
+    Turn a DataFrame of 3D vectors into a 3D matplotlib plot.
+    DataFrame structure: ['word', 'cosine_sim', '3d_vector']
+    """
+    fig = plt.figure()
+    ax = fig.add_subplot(projection='3d')
+
+    # Unpack vectors and labels from DataFrame
+    labels = new_3d_vectors['word']
+    x = new_3d_vectors['3d_vector'].apply(lambda v: v[0])
+    y = new_3d_vectors['3d_vector'].apply(lambda v: v[1])
+    z = new_3d_vectors['3d_vector'].apply(lambda v: v[2])
+
+    # Plot points
+    ax.scatter(x, y, z)
+
+    # Add labels
+    for i, label in enumerate(labels):
+        ax.text(x.iloc[i], y.iloc[i], z.iloc[i], label)
+
+    # Set labels and title
+    ax.set_xlabel('X')
+    ax.set_ylabel('Y')
+    ax.set_zlabel('Z')
+    ax.set_title('3D plot of word vectors')
+
+    return fig
+
+
+def make_3d_plot2(df):
+    """
+    Turn a DataFrame of 3D vectors into a 3D plot using Plotly.
+    DataFrame structure: ['word', 'cosine_sim', '3d_vector']
+    """
+    vectors = df['3d_vector'].tolist()
+    fig = px.scatter_3d(df, x=[v[0] for v in vectors], y=[v[1] for v in vectors], z=[v[2] for v in vectors], text=df['word'])
+    return fig
+
+
+def make_3d_plot3(vectors_list, word, time_slice_model):
+    """
+    Turn a list of 100D vectors into a 3D plot using UMAP and Plotly.
+    List structure: [(word, model_name, vector, cosine_sim)]
+    """
+    # Load model
+    model = load_word2vec_model(f'models/{time_slice_model}.model')
+
+    # Make UMAP model and fit it to the model's vectors
+    umap_model = umap.UMAP(n_components=3)
+    umap_model.fit(model.wv.vectors)
+
+    # Transform the vectors to 3D
+    transformed_vectors = umap_model.transform(model.wv.vectors)
+
+    # Create DataFrame from the transformed vectors
+    df = pd.DataFrame(transformed_vectors, columns=['x', 'y', 'z'])
+
+    # Add words to DataFrame
+    df['word'] = model.wv.index_to_key
+
+    # Keep only the words in vectors_list and map each word to its cosine
+    # similarity, so the values stay aligned regardless of row order
+    word_list = [v[0] for v in vectors_list]
+    cosine_sim_by_word = {v[0]: v[3] for v in vectors_list}
+
+    df = df[df['word'].isin(word_list)].copy()
+    df['cosine_sim'] = df['word'].map(cosine_sim_by_word)
+
+    # Create plot
+    fig = px.scatter_3d(df, x='x', y='y', z='z', text='word', color='cosine_sim', color_continuous_scale='Reds')
+    fig.update_traces(marker=dict(size=5))
+    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
+
+    return fig, df
+
+
+def make_3d_plot4(vectors_list, word, time_slice_model):
+    """
+    Turn a list of 100D vectors into a 3D plot using UMAP and Plotly.
+    List structure: [(word, model_name, vector, cosine_sim)]
+    """
+    # Load model
+    model = load_word2vec_model(f'models/{time_slice_model}.model')
+    model_dict = model_dictionary(model)
+
+    # Extract vectors and names from model_dict
+    all_vector_names = list(model_dict.keys())
+    all_vectors = list(model_dict.values())
+
+    # Scale the vectors
+    scaler = StandardScaler()
+    vectors_scaled = scaler.fit_transform(all_vectors)
+
+    # Make UMAP model and fit it to the scaled vectors
+    umap_model = umap.UMAP(n_components=3)
+    umap_result = umap_model.fit_transform(vectors_scaled)
+
+    # umap_result now contains the 3D representations of the vectors;
+    # associate the names with the 3D representations
+    result_with_names = [(all_vector_names[i], umap_result[i]) for i in range(len(all_vector_names))]
+
+    # Only keep the vectors that are in vectors_list and attach their cosine similarities
+    cosine_sim_by_word = {v[0]: v[3] for v in vectors_list}
+    result_with_names = [(name, vec, cosine_sim_by_word[name]) for name, vec in result_with_names if name in cosine_sim_by_word]
+
+    # Create DataFrame from the transformed vectors
+    df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])
+
+    # Sort DataFrame by cosine_sim
+    df = df.sort_values(by='cosine_sim', ascending=False)
+
+    x = df['3d_vector'].apply(lambda v: v[0])
+    y = df['3d_vector'].apply(lambda v: v[1])
+    z = df['3d_vector'].apply(lambda v: v[2])
+
+    # Create plot
+    fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
+    fig.update_traces(marker=dict(size=5))
+    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
+
+    return fig, df
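make_3d_plot4 standardises every vector in the model, fits UMAP on the full vocabulary, and only then filters down to the requested neighbours, so the layout is driven by the whole embedding space rather than just the top-n words. A self-contained sketch of that same reduce-and-plot pattern on synthetic 100-D vectors (placeholder data and names, no model files needed):

```python
# Illustration of the reduction used by make_3d_plot4:
# standardise 100-D vectors, project to 3D with UMAP, plot with Plotly.
# Random vectors stand in for the word2vec embeddings.
import numpy as np
import pandas as pd
import plotly.express as px
import umap
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
vectors = rng.normal(size=(50, 100))      # 50 fake "word" vectors
words = [f"word_{i}" for i in range(50)]
sims = rng.uniform(0.3, 1.0, size=50)     # fake cosine similarities

scaled = StandardScaler().fit_transform(vectors)
coords = umap.UMAP(n_components=3, random_state=42).fit_transform(scaled)

df = pd.DataFrame(coords, columns=['x', 'y', 'z'])
df['word'] = words
df['cosine_sim'] = sims

fig = px.scatter_3d(df, x='x', y='y', z='z', text='word',
                    color='cosine_sim', color_continuous_scale='Reds')
fig.update_traces(marker=dict(size=5))
fig.show()
```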
vector_graph.py ADDED
@@ -0,0 +1,73 @@
+from word2vec import *
+import numpy as np
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+import pandas as pd
+import gensim
+import umap
+
+
+def create_3d_vectors(word, time_slice, nearest_neighbours_vectors):
+    """
+    Turn word vectors into 3D vectors
+    """
+    model = load_word2vec_model(f'models/{time_slice}.model')
+
+    # Compress all vectors to 3D
+    model_df = pd.DataFrame(model.wv.vectors)
+    pca_vectors = PCA(n_components=3)
+    pca_model = pca_vectors.fit_transform(model_df)
+    pca_model_df = pd.DataFrame(data=pca_model, columns=['x', 'y', 'z'])
+    pca_model_df.insert(0, 'word', model.wv.index_to_key)
+
+    return pca_model_df
+
+
+def create_3d_models(time_slice):
+    """
+    Create 3D models for each time slice
+    """
+    time_slice_model = convert_time_name_to_model(time_slice)
+    model = load_word2vec_model(f'models/{time_slice_model}.model')
+
+    # Compress all vectors to 3D
+    model_df = pd.DataFrame(model.wv.vectors)
+    pca_vectors = PCA(n_components=3)
+    pca_model = pca_vectors.fit_transform(model_df)
+    pca_model_df = pd.DataFrame(data=pca_model, columns=['x', 'y', 'z'])
+    pca_model_df.insert(0, 'word', model.wv.index_to_key)
+
+    pca_model_df.to_csv(f'3d_models/{time_slice}_3d.csv', index=False)
+    return pca_model_df, pca_vectors
+
+
+def nearest_neighbours_to_pca_vectors(word, time_slice, nearest_neighbours_vectors):
+    """
+    Turn nearest neighbours into 3D vectors
+    """
+    model_df = pd.read_csv(f'3d_models/{time_slice}_3d.csv')
+
+    new_data = []
+
+    # Get the precomputed 3D vector for each nearest neighbour
+    for neighbour in nearest_neighbours_vectors:
+        neighbour_word = neighbour[0]
+        cosine_sim = neighbour[3]
+        vector_3d = model_df[model_df['word'] == neighbour_word][['x', 'y', 'z']].values[0]
+
+        # Add word, cosine_sim and 3D vector to new data list
+        new_data.append({'word': neighbour_word, 'cosine_sim': cosine_sim, '3d_vector': vector_3d})
+
+    # Convert the list of dictionaries to a DataFrame
+    new_df = pd.DataFrame(new_data)
+
+    return new_df
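Unlike the UMAP path in plots.py, this module precomputes a PCA projection of the whole vocabulary per time slice (create_3d_models writes it to 3d_models/{time_slice}_3d.csv) and then only looks neighbours up in that cache at query time; app.py still has these calls commented out. A hypothetical usage sketch, assuming the models/ files, an existing 3d_models/ directory, and the word2vec helpers from this commit:

```python
# Hypothetical usage of the PCA precompute/lookup pair in vector_graph.py.
# Assumes models/, a 3d_models/ directory, and the helpers from word2vec.py.
from word2vec import *
from vector_graph import create_3d_models, nearest_neighbours_to_pca_vectors

time_slice = "Classical"

# One-off: project the whole vocabulary to 3D and cache it as CSV
create_3d_models(time_slice)

# At query time: look the neighbours up in the cached projection
time_slice_model = convert_time_name_to_model(time_slice)
neighbours = get_nearest_neighbours_vectors("λόγος", time_slice_model, n=15)  # placeholder word
df_3d = nearest_neighbours_to_pca_vectors("λόγος", time_slice, neighbours)
print(df_3d.head())
```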
word2vec.py CHANGED
@@ -235,6 +235,32 @@ def get_nearest_neighbours(word, time_slice_model, n=10, models=load_all_models()
     return sorted(nearest_neighbours, key=lambda x: x[2], reverse=True)
 
 
+def get_nearest_neighbours_vectors(word, time_slice_model, n=15):
+    """
+    Return the nearest neighbours of a word with their vectors, as (word, model_name, vector, cosine_sim) tuples
+    """
+    model_name = convert_model_to_time_name(time_slice_model)
+    time_slice_model = load_word2vec_model(f'models/{time_slice_model}.model')
+    vector_1 = get_word_vector(time_slice_model, word)
+    nearest_neighbours = []
+
+    for other_word, index in time_slice_model.wv.key_to_index.items():
+        vector_2 = get_word_vector(time_slice_model, other_word)
+        cosine_sim = cosine_similarity(vector_1, vector_2)
+
+        if len(nearest_neighbours) < n:
+            nearest_neighbours.append((other_word, model_name, vector_2, cosine_sim))
+        else:
+            smallest_neighbour = min(nearest_neighbours, key=lambda x: x[3])
+            if cosine_sim > smallest_neighbour[3]:
+                nearest_neighbours.remove(smallest_neighbour)
+                nearest_neighbours.append((other_word, model_name, vector_2, cosine_sim))
+
+    return sorted(nearest_neighbours, key=lambda x: x[3], reverse=True)
+
+
 def write_to_file(data):
     '''
     Write the data to a file
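get_nearest_neighbours_vectors keeps a running top-n by cosine similarity, re-scanning the candidate list with min() whenever it is full. For illustration, an equivalent selection can be written with heapq.nlargest, which maintains a bounded heap and returns the results already sorted; a sketch using the same helpers this commit relies on (not part of the commit itself):

```python
# Sketch: the same top-n neighbour selection expressed with heapq.nlargest.
# Equivalent in result to the running-minimum loop above (ties aside).
import heapq

def get_nearest_neighbours_vectors_heapq(word, time_slice_model, n=15):
    model_name = convert_model_to_time_name(time_slice_model)
    model = load_word2vec_model(f'models/{time_slice_model}.model')
    vector_1 = get_word_vector(model, word)

    def scored():
        # Yield (word, model_name, vector, cosine_sim) for every vocabulary word
        for other_word in model.wv.key_to_index:
            vector_2 = get_word_vector(model, other_word)
            yield (other_word, model_name, vector_2, cosine_similarity(vector_1, vector_2))

    # nlargest keeps a bounded heap of size n and returns it sorted descending by key
    return heapq.nlargest(n, scored(), key=lambda x: x[3])
```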