import sys
import pickle
import pandas as pd
import numpy as np
from os import makedirs
from os.path import exists
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from datetime import datetime
from glove import Glove, Corpus
from project.models.rnn import rnn
from project.models.setups import Setups
from project.models.seq2seq import start as rnn_start


# User-level genre pickles, relative to ``tmp/<dataset>/experiments/``.
# Each has a session-level sibling in the same directory whose file name
# carries an extra leading "s" (e.g. ``sall_add.pickle``); only the
# user-level path is recorded by ``embeddings``.
_GENRE_PICKLES = {
    'add-all': 'all_genres/add/all_add.pickle',
    'mul-all': 'all_genres/mul/all_mul.pickle',
    'avg-all': 'all_genres/avg/all_avg.pickle',
    'add-ran': 'random_genres/add/ran_add.pickle',
    'mul-ran': 'random_genres/mul/ran_mul.pickle',
    'avg-ran': 'random_genres/avg/ran_avg.pickle',
}


def data_prep(model, df):
    """Turn the listening log into training sequences.

    Args:
        model: One of ``'user'``, ``'session'``, ``'user_doc'`` or
            ``'session_doc'``.  The plain variants group by that column;
            the ``*_doc`` variants group by the same column but wrap each
            group in a ``TaggedDocument``.
        df: DataFrame with a ``'song'`` column and the grouping column
            (``'user'`` or ``'session'``).

    Returns:
        A list with one entry per group: either a list of the group's
        songs, or a ``TaggedDocument`` whose words are the songs and whose
        single tag is the group key.

    Raises:
        ValueError: If ``model`` is not one of the four supported values
            (previously this silently returned ``None``).
    """
    if model in ('user', 'session'):
        return df.groupby(by=model)['song'].apply(list).values.tolist()

    if model in ('user_doc', 'session_doc'):
        key = 'user' if model == 'user_doc' else 'session'
        tagged = df.groupby(by=key)['song'].apply(
            # ``songs.name`` is the group key pandas attaches to each group.
            lambda songs: TaggedDocument(words=songs.tolist(), tags=[songs.name])
        )
        return tagged.values.tolist()

    raise ValueError(
        "unknown data_prep model {!r}; expected 'user', 'user_doc', "
        "'session' or 'session_doc'".format(model)
    )


def music2vec(data, w2v_type, dim, lr, window, down, neg_sample, epochs):
    """Train a skip-gram Word2Vec model over song sequences.

    Args:
        data: Listening-log DataFrame, forwarded to ``data_prep``.
        w2v_type: Grouping passed to ``data_prep`` (``'user'`` or ``'session'``).
        dim: Embedding size.
        lr: Initial learning rate (gensim ``alpha``).
        window: Context window size.
        down: Down-sampling threshold (gensim ``sample``).
        neg_sample: Number of negative samples.
        epochs: Number of training iterations.

    Returns:
        The trained ``Word2Vec`` model.
    """
    sentences = data_prep(w2v_type, data)
    # sg=1 / hs=0: skip-gram trained with negative sampling only.
    return Word2Vec(
        sentences,
        size=dim,
        alpha=lr,
        window=window,
        sample=down,
        sg=1,
        hs=0,
        negative=neg_sample,
        iter=epochs,
        min_count=1,
        compute_loss=True,
    )


def doc2vec(data, d2v_type, dim, lr, window, down, neg_sample, epochs):
    """Train a Doc2Vec model (``dm=1``) over tagged song sequences.

    Args:
        data: Listening-log DataFrame, forwarded to ``data_prep``.
        d2v_type: Grouping passed to ``data_prep`` (``'user_doc'`` or
            ``'session_doc'``).
        dim: Embedding size.
        lr: Initial learning rate (gensim ``alpha``).
        window: Context window size.
        down: Down-sampling threshold (gensim ``sample``).
        neg_sample: Number of negative samples.
        epochs: Number of training epochs.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    sequence = data_prep(d2v_type, data)
    return Doc2Vec(
        sequence,
        dm=1,
        vector_size=dim,
        alpha=lr,
        window=window,
        sample=down,
        negative=neg_sample,
        epochs=epochs,
        min_count=1,
        compute_loss=True,
    )


def glove(data, glove_type, window, dim, lr, epochs):
    """Train a GloVe model over song sequences.

    Args:
        data: Listening-log DataFrame, forwarded to ``data_prep``.
        glove_type: Grouping passed to ``data_prep`` (``'user'`` or ``'session'``).
        window: Co-occurrence window size.
        dim: Number of embedding components.
        lr: Learning rate.
        epochs: Number of training epochs.

    Returns:
        The fitted ``Glove`` model with the corpus dictionary attached.
    """
    sentences = data_prep(glove_type, data)
    corpus = Corpus()
    corpus.fit(sentences, window=window)
    # Named ``model`` so the local no longer shadows this function's name.
    model = Glove(no_components=dim, learning_rate=lr)
    model.fit(corpus.matrix, epochs=epochs, no_threads=4, verbose=True)
    model.add_dictionary(corpus.dictionary)
    return model


def _artifact_paths(cwd, method, c_id, ext):
    """Return the (user-level, session-level) artifact paths for one setup."""
    path = '{}/{}__{}.{}'.format(cwd, method, c_id, ext)
    path_s = '{}/s{}__{}.{}'.format(cwd, method, c_id, ext)
    return path, path_s


def _is_cached(path, path_s):
    """Return True only when both artifacts of a setup are already on disk."""
    return exists(path) and exists(path_s)


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file even if dumping fails."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)


def embeddings(df, conf):
    """Train (or reuse) every configured embedding model and index them.

    For each ``(method, generator)`` pair produced by
    ``Setups(conf).get_generators()``, every setup gets a running id.  A
    user-level and a session-level model are trained and stored under
    ``tmp/<dataset>/models`` unless both files already exist.  The
    ``'genres'`` method trains nothing; it only records the path of a
    pickle under ``tmp/<dataset>/experiments``.

    The collected ``[id, description, user-level path]`` rows are saved
    with ``np.save`` to ``tmp/<dataset>/models/ids``.

    Args:
        df: Listening-log DataFrame handed to the individual trainers.
        conf: Configuration mapping; ``conf['evaluation']['dataset']``
            names the dataset, and the whole mapping is given to ``Setups``.
    """
    ds = conf['evaluation']['dataset']
    cwd = 'tmp/{}/models'.format(ds)
    if not exists(cwd):
        makedirs(cwd)

    setups = Setups(conf)
    c_id = 0
    setups_id = []

    for method, generator in setups.get_generators():
        if method == 'rnn':
            for s in generator:
                to_str = setups.setup_to_string(c_id, s, method)
                print(to_str)
                path, path_s = _artifact_paths(cwd, method, c_id, 'pickle')
                # Both files must exist: an interrupted run can leave only
                # the user-level artifact behind.
                if not _is_cached(path, path_s):
                    user, session = rnn(
                        df, ds, s['model'], s['window'], s['epochs'],
                        s['batch'], s['dim'], s['num_units'], s['bidi'],
                    )
                    _dump_pickle(user, path)
                    _dump_pickle(session, path_s)
                setups_id.append([c_id, to_str, path])
                c_id += 1

        elif method == 'music2vec':
            for s in generator:
                to_str = setups.setup_to_string(c_id, s, method)
                print(to_str)
                path, path_s = _artifact_paths(cwd, method, c_id, 'model')
                if not _is_cached(path, path_s):
                    m2v = music2vec(
                        df, 'user', s['dim'], s['lr'], s['window'],
                        s['down'], s['neg_sample'], s['epochs'],
                    )
                    sm2v = music2vec(
                        df, 'session', s['dim'], s['lr'], s['window'],
                        s['down'], s['neg_sample'], s['epochs'],
                    )
                    m2v.save(path)
                    sm2v.save(path_s)
                setups_id.append([c_id, to_str, path])
                c_id += 1

        elif method == 'doc2vec':
            for s in generator:
                to_str = setups.setup_to_string(c_id, s, method)
                print(to_str)
                path, path_s = _artifact_paths(cwd, method, c_id, 'model')
                if not _is_cached(path, path_s):
                    d2v = doc2vec(
                        df, 'user_doc', s['dim'], s['lr'], s['window'],
                        s['down'], s['neg_sample'], s['epochs'],
                    )
                    sd2v = doc2vec(
                        df, 'session_doc', s['dim'], s['lr'], s['window'],
                        s['down'], s['neg_sample'], s['epochs'],
                    )
                    d2v.save(path)
                    sd2v.save(path_s)
                setups_id.append([c_id, to_str, path])
                c_id += 1

        elif method == 'glove':
            for s in generator:
                to_str = setups.setup_to_string(c_id, s, method)
                print(to_str)
                path, path_s = _artifact_paths(cwd, method, c_id, 'model')
                if not _is_cached(path, path_s):
                    glv = glove(
                        df, 'user', s['window'], s['dim'], s['lr'], s['epochs'],
                    )
                    sglv = glove(
                        df, 'session', s['window'], s['dim'], s['lr'], s['epochs'],
                    )
                    glv.save(path)
                    sglv.save(path_s)
                # NOTE(review): unlike every other method, glove setups
                # consume an id but are never added to ``setups_id``.  Left
                # as-is because the consumers of the ids file are not
                # visible here -- confirm whether this is intentional.
                c_id += 1

        elif method == 'genres':
            for s in generator:
                # Genre setups are described by their bare name (no id prefix).
                to_str = s
                print(to_str)
                # Unknown names keep the bare experiments directory as path.
                path = 'tmp/{}/experiments/'.format(ds) + _GENRE_PICKLES.get(s, '')
                setups_id.append([c_id, to_str, path])
                c_id += 1

    if setups_id:
        setups_table = np.stack(setups_id, axis=0)
    else:
        # np.stack rejects an empty sequence; store an empty 3-column
        # table instead of crashing when no setup was recorded.
        setups_table = np.empty((0, 3), dtype=str)
    np.save('{}/ids'.format(cwd), setups_table)