File size: 2,145 Bytes
7b3478d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
from word2vec import *
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import gensim
import umap
def create_3d_vectors(word, time_slice, nearest_neighbours_vectors):
    """
    Project every word vector of one time slice's model into 3D using PCA.

    Loads ``models/{time_slice}.model``, fits a 3-component PCA on the
    model's complete vector matrix and returns the projected coordinates.

    Args:
        word: Not used by this function.
        time_slice: Name substituted into the model path
            ``models/{time_slice}.model``.
        nearest_neighbours_vectors: Not used by this function.

    Returns:
        DataFrame with columns ``word``, ``x``, ``y``, ``z``; the ``word``
        column is filled from ``model.wv.index_to_key``.
    """
    w2v = load_word2vec_model(f'models/{time_slice}.model')

    # Reduce the full vector matrix to three principal components.
    vectors_frame = pd.DataFrame(w2v.wv.vectors)
    reducer = PCA(n_components=3)
    coords = reducer.fit_transform(vectors_frame)

    # Label the components and put the vocabulary in the first column.
    projected = pd.DataFrame(coords, columns=['x', 'y', 'z'])
    projected.insert(0, 'word', w2v.wv.index_to_key)
    return projected
def create_3d_models(time_slice):
    """
    Build and persist the 3D PCA projection of one time slice's model.

    The time-slice name is translated to a model name with
    ``convert_time_name_to_model``, the model is loaded from
    ``models/{model_name}.model``, and all of its word vectors are reduced
    to three PCA components. The result is written to
    ``3d_models/{time_slice}_3d.csv`` (without the index column).

    Args:
        time_slice: Time-slice name; used both to resolve the model name
            and to name the output CSV.

    Returns:
        Tuple ``(pca_model_df, pca_vectors)``: the DataFrame that was
        written (columns ``word``, ``x``, ``y``, ``z``) and the fitted
        ``PCA`` object.
    """
    # Local import: only needed to prepare the output directory.
    import os

    time_slice_model = convert_time_name_to_model(time_slice)
    model = load_word2vec_model(f'models/{time_slice_model}.model')

    # Compress all vectors to 3D
    model_df = pd.DataFrame(model.wv.vectors)
    pca_vectors = PCA(n_components=3)
    pca_model = pca_vectors.fit_transform(model_df)

    pca_model_df = pd.DataFrame(
        data=pca_model,
        columns=['x', 'y', 'z'],
    )
    pca_model_df.insert(0, 'word', model.wv.index_to_key)

    # to_csv does not create missing parent directories; make sure the
    # output folder exists so a fresh checkout does not fail here.
    os.makedirs('3d_models', exist_ok=True)
    pca_model_df.to_csv(f'3d_models/{time_slice}_3d.csv', index=False)

    return pca_model_df, pca_vectors
def nearest_neighbours_to_pca_vectors(word, time_slice, nearest_neighbours_vectors):
    """
    Look up the stored 3D coordinates of each nearest neighbour.

    Reads ``3d_models/{time_slice}_3d.csv`` (columns ``word``, ``x``,
    ``y``, ``z``) and pairs every neighbour with its 3D vector.

    Args:
        word: Not used by this function; kept for interface compatibility.
        time_slice: Name substituted into the CSV path
            ``3d_models/{time_slice}_3d.csv``.
        nearest_neighbours_vectors: Iterable of indexable records where
            element 0 is the neighbour word and element 3 is its cosine
            similarity.

    Returns:
        DataFrame with columns ``word``, ``cosine_sim`` and ``3d_vector``
        (a NumPy array ``[x, y, z]``), one row per neighbour in input
        order. For empty input the frame is empty but still has these
        three columns.

    Raises:
        IndexError: If a neighbour word has no row in the CSV.
    """
    csv_path = f'3d_models/{time_slice}_3d.csv'

    # Read words strictly as text: with pandas' default NA handling,
    # vocabulary entries such as "null", "nan" or "NA" would be turned into
    # NaN and could never be matched again.
    model_df = pd.read_csv(csv_path, dtype={'word': str}, keep_default_na=False)

    # Index the coordinates by word once instead of scanning the whole
    # column for every neighbour. keep='first' mirrors the first-match
    # semantics of the previous ``.values[0]`` lookup.
    coords = (
        model_df
        .drop_duplicates(subset='word', keep='first')
        .set_index('word')[['x', 'y', 'z']]
    )

    new_data = []
    for neighbour in nearest_neighbours_vectors:
        # Distinct local name so the ``word`` parameter is not shadowed.
        neighbour_word = neighbour[0]
        cosine_sim = neighbour[3]

        if neighbour_word not in coords.index:
            raise IndexError(
                f'word {neighbour_word!r} not found in {csv_path}'
            )
        vector_3d = coords.loc[neighbour_word].to_numpy()

        new_data.append({
            'word': neighbour_word,
            'cosine_sim': cosine_sim,
            '3d_vector': vector_3d,
        })

    # Explicit columns keep the schema stable even when there are no rows.
    return pd.DataFrame(new_data, columns=['word', 'cosine_sim', '3d_vector'])
|