rnn-embeddings-songs / project /models /embeddings.py

Igor Santana

rnn model sent from github to huggingface

9c58361 over 1 year ago

6.86 kB

	import sys

	import pickle
	import pandas as pd
	import numpy as np
	from os import makedirs
	from os.path import exists
	from gensim.models import Word2Vec, Doc2Vec
	from gensim.models.doc2vec import TaggedDocument
	from datetime import datetime
	from glove import Glove, Corpus
	from project.models.rnn import rnn
	from project.models.setups import Setups
	from project.models.seq2seq import start as rnn_start

	def data_prep(model, df):
	    """Group the ``song`` column of *df* into per-user or per-session sequences.

	    Args:
	        model: One of ``'user'``, ``'session'``, ``'user_doc'`` or
	            ``'session_doc'``. The part before ``_doc`` names the column
	            the frame is grouped by; the ``_doc`` variants wrap every
	            group in a ``TaggedDocument`` tagged with the group key.
	        df: DataFrame holding a ``song`` column plus the grouping column
	            (``user`` or ``session``).

	    Returns:
	        A list with one entry per group: a plain list of songs for the
	        ``'user'``/``'session'`` layouts, a ``TaggedDocument`` for the
	        ``*_doc`` layouts.

	    Raises:
	        ValueError: If *model* is not one of the four supported layouts.
	            Previously this fell through and returned ``None``, which the
	            training wrappers would then pass on as their corpus.
	    """
	    # layout name -> (grouping column, wrap each group in a TaggedDocument?)
	    layouts = {
	        'user': ('user', False),
	        'user_doc': ('user', True),
	        'session': ('session', False),
	        'session_doc': ('session', True),
	    }
	    if model not in layouts:
	        raise ValueError(
	            'unknown data_prep model {!r}; expected one of {}'.format(
	                model, sorted(layouts)))

	    column, tagged = layouts[model]
	    songs = df.groupby(by=column)['song']
	    if tagged:
	        # x.name is the group key (the user / session identifier).
	        grouped = songs.apply(
	            lambda x: TaggedDocument(words=x.tolist(), tags=[x.name]))
	    else:
	        grouped = songs.apply(list)
	    return grouped.values.tolist()

	def music2vec(data, w2v_type, dim, lr, window, down, neg_sample, epochs):
	    """Fit a gensim ``Word2Vec`` model on song sequences built from *data*.

	    Args:
	        data: DataFrame forwarded to ``data_prep``.
	        w2v_type: Layout name forwarded to ``data_prep`` (the caller in
	            this file passes ``'user'`` or ``'session'``).
	        dim: Passed as ``size``.
	        lr: Passed as ``alpha``.
	        window: Passed as ``window``.
	        down: Passed as ``sample``.
	        neg_sample: Passed as ``negative``.
	        epochs: Passed as ``iter``.

	    Returns:
	        The ``Word2Vec`` instance returned by the constructor.
	    """
	    song_sequences = data_prep(w2v_type, data)
	    # NOTE(review): ``size`` / ``iter`` are the pre-4.0 gensim keyword
	    # names - confirm the pinned gensim version before upgrading.
	    hyperparams = {
	        'size': dim,
	        'alpha': lr,
	        'window': window,
	        'sample': down,
	        'sg': 1,
	        'hs': 0,
	        'negative': neg_sample,
	        'iter': epochs,
	        'min_count': 1,
	        'compute_loss': True,
	    }
	    return Word2Vec(song_sequences, **hyperparams)

	def doc2vec(data, d2v_type, dim, lr, window, down, neg_sample, epochs):
	    """Fit a gensim ``Doc2Vec`` model on tagged song sequences from *data*.

	    Args:
	        data: DataFrame forwarded to ``data_prep``.
	        d2v_type: Layout name forwarded to ``data_prep`` (the caller in
	            this file passes ``'user_doc'`` or ``'session_doc'``).
	        dim: Passed as ``vector_size``.
	        lr: Passed as ``alpha``.
	        window: Passed as ``window``.
	        down: Passed as ``sample``.
	        neg_sample: Passed as ``negative``.
	        epochs: Passed as ``epochs``.

	    Returns:
	        The ``Doc2Vec`` instance returned by the constructor.
	    """
	    tagged_docs = data_prep(d2v_type, data)
	    hyperparams = {
	        'dm': 1,
	        'vector_size': dim,
	        'alpha': lr,
	        'window': window,
	        'sample': down,
	        'negative': neg_sample,
	        'epochs': epochs,
	        'min_count': 1,
	        'compute_loss': True,
	    }
	    return Doc2Vec(tagged_docs, **hyperparams)

	def glove(data, glove_type, window, dim, lr, epochs, no_threads=4, verbose=True):
	    """Fit a GloVe model on song sequences built from *data*.

	    Args:
	        data: DataFrame forwarded to ``data_prep``.
	        glove_type: Layout name forwarded to ``data_prep`` (the caller in
	            this file passes ``'user'`` or ``'session'``).
	        window: Window passed to ``Corpus.fit``.
	        dim: Passed to ``Glove`` as ``no_components``.
	        lr: Passed to ``Glove`` as ``learning_rate``.
	        epochs: Passed to ``Glove.fit`` as ``epochs``.
	        no_threads: Passed to ``Glove.fit``. Defaults to 4, the value
	            that used to be hard-coded.
	        verbose: Passed to ``Glove.fit``. Defaults to True, the value
	            that used to be hard-coded.

	    Returns:
	        The fitted ``Glove`` instance with the corpus dictionary attached.
	    """
	    sentences = data_prep(glove_type, data)

	    corpus = Corpus()
	    corpus.fit(sentences, window=window)

	    # Named ``model`` so the local no longer shadows this function's name.
	    model = Glove(no_components=dim, learning_rate=lr)
	    model.fit(corpus.matrix, epochs=epochs, no_threads=no_threads,
	              verbose=verbose)
	    model.add_dictionary(corpus.dictionary)
	    return model

	def _model_paths(cwd, method, c_id, ext):
	    """Return the (user-level, session-level) file paths for one setup."""
	    user_path = '{}/{}__{}.{}'.format(cwd, method, c_id, ext)
	    session_path = '{}/s{}__{}.{}'.format(cwd, method, c_id, ext)
	    return user_path, session_path


	def _genre_path(ds, setup):
	    """Return the user-level pickle path registered for a genre setup."""
	    relative = {
	        'add-all': 'all_genres/add/all_add.pickle',
	        'mul-all': 'all_genres/mul/all_mul.pickle',
	        'avg-all': 'all_genres/avg/all_avg.pickle',
	        'add-ran': 'random_genres/add/ran_add.pickle',
	        'mul-ran': 'random_genres/mul/ran_mul.pickle',
	        'avg-ran': 'random_genres/avg/ran_avg.pickle',
	    }
	    # An unrecognised setup keeps the bare experiments directory, as before.
	    return 'tmp/{}/experiments/'.format(ds) + relative.get(setup, '')


	def embeddings(df, conf):
	    """Train (or reuse) every configured embedding model and save an id table.

	    For each ``(method, generator)`` pair from ``Setups(conf).get_generators()``
	    and each setup ``s`` the generator yields, a user-level and a
	    session-level model are trained and written under ``tmp/<ds>/models``
	    unless the user-level file already exists. A running counter ``c_id``
	    is shared across all methods.

	    Args:
	        df: DataFrame forwarded to the training functions.
	        conf: Configuration mapping; ``conf['evaluation']['dataset']`` is
	            read here and the whole mapping is passed to ``Setups``.

	    Returns:
	        None. Side effects: creates ``tmp/<ds>/models`` if missing, writes
	        model files there, prints each setup description, and saves the
	        ``[c_id, description, user_path]`` rows via
	        ``np.save('tmp/<ds>/models/ids', ...)``.

	    NOTE(review): only the user-level path is checked before training, so
	    a missing session-level file alone does not trigger a retrain.
	    NOTE(review): glove setups are trained and consume a ``c_id`` but are
	    never added to the saved id table - confirm whether that is intended.
	    """
	    ds = conf['evaluation']['dataset']
	    cwd = 'tmp/{}/models'.format(ds)

	    if not exists(cwd):
	        makedirs(cwd)

	    setups = Setups(conf)

	    c_id = 0
	    setups_id = []
	    for method, generator in setups.get_generators():
	        if method == 'rnn':
	            for s in generator:
	                to_str = setups.setup_to_string(c_id, s, method)
	                print(to_str)
	                path, path_s = _model_paths(cwd, method, c_id, 'pickle')

	                if not exists(path):
	                    user, session = rnn(df, ds, s['model'], s['window'],
	                                        s['epochs'], s['batch'], s['dim'],
	                                        s['num_units'], s['bidi'])
	                    # Context managers close both files even if a dump fails.
	                    with open(path, 'wb') as fu, open(path_s, 'wb') as fs:
	                        pickle.dump(user, fu,
	                                    protocol=pickle.HIGHEST_PROTOCOL)
	                        pickle.dump(session, fs,
	                                    protocol=pickle.HIGHEST_PROTOCOL)

	                setups_id.append([c_id, to_str, path])
	                c_id += 1

	        elif method == 'music2vec':
	            for s in generator:
	                to_str = setups.setup_to_string(c_id, s, method)
	                print(to_str)
	                path, path_s = _model_paths(cwd, method, c_id, 'model')

	                if not exists(path):
	                    m2v = music2vec(df, 'user', s['dim'], s['lr'],
	                                    s['window'], s['down'], s['neg_sample'],
	                                    s['epochs'])
	                    sm2v = music2vec(df, 'session', s['dim'], s['lr'],
	                                     s['window'], s['down'], s['neg_sample'],
	                                     s['epochs'])
	                    m2v.save(path)
	                    sm2v.save(path_s)

	                setups_id.append([c_id, to_str, path])
	                c_id += 1

	        elif method == 'doc2vec':
	            for s in generator:
	                to_str = setups.setup_to_string(c_id, s, method)
	                path, path_s = _model_paths(cwd, method, c_id, 'model')
	                print(to_str)

	                if not exists(path):
	                    d2v = doc2vec(df, 'user_doc', s['dim'], s['lr'],
	                                  s['window'], s['down'], s['neg_sample'],
	                                  s['epochs'])
	                    sd2v = doc2vec(df, 'session_doc', s['dim'], s['lr'],
	                                   s['window'], s['down'], s['neg_sample'],
	                                   s['epochs'])
	                    d2v.save(path)
	                    sd2v.save(path_s)

	                setups_id.append([c_id, to_str, path])
	                c_id += 1

	        elif method == 'glove':
	            for s in generator:
	                to_str = setups.setup_to_string(c_id, s, method)
	                path, path_s = _model_paths(cwd, method, c_id, 'model')
	                print(to_str)

	                if not exists(path):
	                    glv = glove(df, 'user', s['window'], s['dim'], s['lr'],
	                                s['epochs'])
	                    sglv = glove(df, 'session', s['window'], s['dim'],
	                                 s['lr'], s['epochs'])
	                    glv.save(path)
	                    sglv.save(path_s)

	                # Not registered in setups_id (see NOTE in the docstring).
	                c_id += 1

	        elif method == 'genres':
	            for s in generator:
	                # Genre setups are plain strings and are never trained here;
	                # only the path of an existing pickle is registered.
	                to_str = s
	                print(to_str)
	                path = _genre_path(ds, s)

	                setups_id.append([c_id, to_str, path])
	                c_id += 1

	    setups_id = np.stack(setups_id, axis=0)

	    np.save('{}/ids'.format(cwd), setups_id)