File size: 2,145 Bytes
7b3478d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from word2vec import *
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import gensim
import umap


def create_3d_vectors(word, time_slice, nearest_neighbours_vectors):
    """
    Project every word vector of one time slice down to three dimensions.

    The word2vec model stored at ``models/{time_slice}.model`` is loaded and
    all of its vectors are reduced with a 3-component PCA.

    Parameters:
        word: accepted for interface compatibility; not used by this function.
        time_slice: name used to build the model path ``models/{time_slice}.model``.
        nearest_neighbours_vectors: accepted for interface compatibility;
            not used by this function.

    Returns:
        A DataFrame with columns ``word``, ``x``, ``y``, ``z`` — one row per
        entry of the model's ``wv.index_to_key``, in the same order.
    """
    embedding = load_word2vec_model(f'models/{time_slice}.model')
    keyed_vectors = embedding.wv

    # Fit the PCA on the full vector matrix and keep the first three components
    all_vectors = pd.DataFrame(keyed_vectors.vectors)
    reducer = PCA(n_components=3)
    coordinates = reducer.fit_transform(all_vectors)

    # Label the three axes, then put the vocabulary in front as the first column
    projected = pd.DataFrame(coordinates, columns=['x', 'y', 'z'])
    projected.insert(0, 'word', keyed_vectors.index_to_key)
    return projected




def create_3d_models(time_slice):
    """
    Create and persist the 3D PCA model for one time slice.

    The time-slice name is mapped to a model name with
    ``convert_time_name_to_model``, the word2vec model at
    ``models/{model name}.model`` is loaded, and all of its vectors are
    reduced to three PCA components. The result is written to
    ``3d_models/{time_slice}_3d.csv`` (without the index column).

    Parameters:
        time_slice: time-slice name; used both to look up the model and to
            name the output CSV.

    Returns:
        A tuple ``(pca_model_df, pca_vectors)``: the DataFrame with columns
        ``word``, ``x``, ``y``, ``z`` and the fitted PCA object.
    """
    # Local import so this block does not depend on the file's import header.
    import os

    time_slice_model = convert_time_name_to_model(time_slice)
    model = load_word2vec_model(f'models/{time_slice_model}.model')

    # Compress all vectors to 3D
    model_df = pd.DataFrame(model.wv.vectors)
    pca_vectors = PCA(n_components=3)
    pca_model = pca_vectors.fit_transform(model_df)
    pca_model_df = pd.DataFrame(
        data=pca_model,
        columns=['x', 'y', 'z']
    )

    pca_model_df.insert(0, 'word', model.wv.index_to_key)

    # to_csv does not create missing parent directories, so make sure the
    # output folder exists before writing (no-op when it is already there).
    output_path = f'3d_models/{time_slice}_3d.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    pca_model_df.to_csv(output_path, index=False)
    return pca_model_df, pca_vectors


def nearest_neighbours_to_pca_vectors(word, time_slice, nearest_neighbours_vectors):
    """
    Turn nearest neighbours into 3D vectors

    Looks up each neighbour's 3D coordinates in the CSV
    ``3d_models/{time_slice}_3d.csv`` (columns ``word``, ``x``, ``y``, ``z``).

    Parameters:
        word: accepted for interface compatibility; not used by this function.
        time_slice: name used to build the CSV path.
        nearest_neighbours_vectors: iterable of indexable records; element 0
            is read as the neighbour word and element 3 as its cosine
            similarity.

    Returns:
        A DataFrame with columns ``word``, ``cosine_sim`` and ``3d_vector``
        (an array of the x, y, z values), one row per neighbour in input
        order. An empty input yields an empty DataFrame with those columns.

    Raises:
        IndexError: if a neighbour word has no row in the CSV.
    """
    # Read the word column verbatim as strings. With pandas' default NA
    # handling, vocabulary entries such as "null", "nan" or "NA" are parsed
    # as NaN and can then never be matched by the lookup below.
    model_df = pd.read_csv(
        f'3d_models/{time_slice}_3d.csv',
        dtype={'word': str},
        keep_default_na=False,
    )

    new_data = []

    # Get the 3D vector for each nearest neighbour
    for neighbour in nearest_neighbours_vectors:
        # Separate name so the loop does not overwrite the `word` parameter
        neighbour_word = neighbour[0]
        cosine_sim = neighbour[3]
        # First matching row; .values[0] raises IndexError when there is none
        vector_3d = model_df[model_df['word'] == neighbour_word][['x', 'y', 'z']].values[0]

        # Add word, cosine_sim and 3D vector to new data list
        new_data.append({'word': neighbour_word, 'cosine_sim': cosine_sim, '3d_vector': vector_3d})

    # Convert the list of dictionaries to a DataFrame; naming the columns
    # keeps the schema stable even when there are no neighbours.
    new_df = pd.DataFrame(new_data, columns=['word', 'cosine_sim', '3d_vector'])

    return new_df