File size: 2,161 Bytes
7b3478d
 
f30d304
f9c30de
f30d304
7b3478d
88d7eed
7b3478d
c7d8395
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b3478d
88d7eed
 
 
 
 
 
 
 
 
7b3478d
 
 
 
88d7eed
 
7b3478d
 
 
 
 
 
88d7eed
7b3478d
 
 
88d7eed
05fa263
7b3478d
 
88d7eed
7b3478d
 
05fa263
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import pandas as pd
from word2vec import *
import plotly.express as px
import pickle


def make_3d_plot_tSNE(vectors_list, target_word, time_slice_model):
    """
        Build a Plotly 3D scatter plot of the nearest neighbours of a target word.

        The 3D coordinates are NOT computed here: they are read from the
        pickled file ./3d_models/{time_slice_model}.model, which is iterated as
        (word, 3d_vector) pairs. (Presumably those coordinates were produced
        beforehand by t-SNE, as the function name suggests -- confirm against
        the code that writes the file.)

        vectors_list: list of tuples containing (word, model_name, vector, cosine_sim)
            - word: the word in the model
            - model_name: the name of the model (not used by this function)
            - vector: the original vector representation of the word
              (not used by this function)
            - cosine_sim: the cosine similarity of the word to the target word

        target_word: the word whose nearest neighbours are plotted; only used
            in the figure title

        time_slice_model: the time slice model name used to locate the pickled
            3D vector representations

        Return: a tuple containing:
            - fig: the Plotly 3D scatter plot figure
            - df: a pandas DataFrame with columns 'word', '3d_vector' and
              'cosine_sim', sorted by descending cosine similarity

        Raises:
            KeyError: if a word in vectors_list has no entry in the 3D model file
            OSError: if the 3D model file cannot be opened
    """
    model_path = f'./3d_models/{time_slice_model}.model'

    # SECURITY NOTE: pickle.load can execute arbitrary code while
    # deserialising. Only call this with model files produced by this project;
    # never let time_slice_model point at untrusted data.
    with open(model_path, 'rb') as f:
        word_vector_pairs = pickle.load(f)

    # Index the precomputed 3D coordinates by word for O(1) lookup.
    coords_by_word = {name: coords for name, coords in word_vector_pairs}

    # Keep only the words requested in vectors_list, paired with their cosine
    # similarity. The model name and the original vector are not needed.
    rows = []
    for neighbour, _model_name, _vector, cosine_sim in vectors_list:
        try:
            coords = coords_by_word[neighbour]
        except KeyError:
            raise KeyError(
                f'word {neighbour!r} has no 3D vector in {model_path}'
            ) from None
        rows.append((neighbour, coords, cosine_sim))

    df = pd.DataFrame(rows, columns=['word', '3d_vector', 'cosine_sim'])

    # Most similar neighbours first.
    df = df.sort_values(by='cosine_sim', ascending=False)

    # Split the 3-component vectors into one Series per plot axis.
    x = df['3d_vector'].apply(lambda v: v[0])
    y = df['3d_vector'].apply(lambda v: v[1])
    z = df['3d_vector'].apply(lambda v: v[2])

    # Plot: each point is labelled with its word and coloured by similarity.
    fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
    fig.update_traces(marker=dict(size=5))
    fig.update_layout(title=f'3D plot of nearest neighbours to {target_word}')

    return fig, df