Spaces:

GroNLP
/

agalma

Sleeping

App Files Files Community

agalma / plots.py

Mark7549

added docstring and removed unnecessary imported modules

c7d8395 8 months ago

raw

history blame

2.16 kB

	import pandas as pd
	from word2vec import *
	import plotly.express as px
	import pickle


	def make_3d_plot_tSNE(vectors_list, target_word, time_slice_model):
	    """
	    Build a Plotly 3D scatter plot of the nearest neighbours of a target word.

	    No dimensionality reduction is computed here: the 3D coordinates are read
	    from the pickled file ./3d_models/{time_slice_model}.model, which must
	    contain an iterable of (word, 3d_vector) pairs (presumably pre-computed
	    with t-SNE, going by the function name -- confirm against the code that
	    writes those files).

	    vectors_list: list of tuples containing (word, model_name, vector, cosine_sim)
	    - word: the word in the model
	    - model_name: the name of the model (not used by this function)
	    - vector: the high-dimensional vector of the word (not used by this function)
	    - cosine_sim: the cosine similarity of the word to the target word

	    target_word: the word whose nearest neighbours are plotted; only used in
	    the plot title

	    time_slice_model: the time slice model name used to locate the file with
	    the 3D vector representations

	    Return: a tuple containing:
	    - fig: the Plotly 3D scatter plot figure
	    - df: a pandas DataFrame with the columns 'word', '3d_vector' and
	      'cosine_sim', sorted by descending cosine similarity

	    Raises:
	    - KeyError: if one or more words of vectors_list have no entry in the
	      3D model; the message names the model and every missing word
	    - OSError: if the model file cannot be opened
	    """
	    # Materialise once so the input can safely be traversed twice below.
	    neighbours = list(vectors_list)

	    # NOTE(security): pickle.load can execute arbitrary code embedded in the
	    # file, and time_slice_model is interpolated into the path unchecked.
	    # Only call this with model names and model files from a trusted source.
	    with open(f'./3d_models/{time_slice_model}.model', 'rb') as f:
	        all_vectors = {name: vector for name, vector in pickle.load(f)}

	    # Fail early with a message that tells the caller which model is
	    # incomplete, instead of a bare KeyError carrying only the first word.
	    missing = [name for name, _, _, _ in neighbours if name not in all_vectors]
	    if missing:
	        raise KeyError(
	            f"no 3D vector in model '{time_slice_model}' for word(s): {missing}"
	        )

	    # Only keep the 3D vectors of the requested words plus their similarities
	    rows = [
	        (name, all_vectors[name], cosine_sim)
	        for name, _, _, cosine_sim in neighbours
	    ]
	    df = pd.DataFrame(rows, columns=['word', '3d_vector', 'cosine_sim'])

	    # Most similar words first
	    df = df.sort_values(by='cosine_sim', ascending=False)

	    # Split the stored vectors into one coordinate series per axis; these
	    # are handed to Plotly directly so the returned DataFrame keeps exactly
	    # the three documented columns.
	    x = df['3d_vector'].apply(lambda v: v[0])
	    y = df['3d_vector'].apply(lambda v: v[1])
	    z = df['3d_vector'].apply(lambda v: v[2])

	    # Plot
	    fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
	    fig.update_traces(marker=dict(size=5))
	    fig.update_layout(title=f'3D plot of nearest neighbours to {target_word}')

	    return fig, df