|
import sys |
|
|
|
import pickle |
|
import pandas as pd |
|
import numpy as np |
|
from os import makedirs |
|
from os.path import exists |
|
from gensim.models import Word2Vec, Doc2Vec |
|
from gensim.models.doc2vec import TaggedDocument |
|
from datetime import datetime |
|
from glove import Glove, Corpus |
|
from project.models.rnn import rnn |
|
from project.models.setups import Setups |
|
from project.models.seq2seq import start as rnn_start |
|
|
|
def data_prep(model, df):
    """Group the song log into one sequence per user or per session.

    Args:
        model: Layout to build. 'user' and 'session' produce plain song
            lists grouped by that column; 'user_doc' and 'session_doc'
            wrap each group in a ``TaggedDocument`` tagged with the group
            key.
        df: DataFrame with a 'song' column plus the grouping column
            ('user' or 'session') required by ``model``.

    Returns:
        A list with one entry per group: a list of songs for the plain
        layouts, a ``TaggedDocument`` for the '*_doc' layouts.

    Raises:
        ValueError: If ``model`` is not one of the four supported layouts.
            Previously this fell through and returned ``None``, which the
            trainers would accept without complaint.
    """
    # layout name -> (grouping column, wrap each group in a TaggedDocument?)
    layouts = {
        'user': ('user', False),
        'user_doc': ('user', True),
        'session': ('session', False),
        'session_doc': ('session', True),
    }
    if model not in layouts:
        raise ValueError(
            'unknown data_prep model {!r}; expected one of {}'.format(
                model, sorted(layouts)))

    column, tagged = layouts[model]
    songs = df.groupby(by=column)['song']
    if tagged:
        # x.name is the key of the group being processed.
        grouped = songs.apply(
            lambda x: TaggedDocument(words=x.tolist(), tags=[x.name]))
    else:
        grouped = songs.apply(list)
    return grouped.values.tolist()
|
|
|
def music2vec(data, w2v_type, dim, lr, window, down, neg_sample, epochs):
    """Train a Word2Vec model over song sequences.

    Args:
        data: DataFrame handed to ``data_prep``.
        w2v_type: Grouping passed to ``data_prep`` (callers in this file
            use 'user' or 'session').
        dim: Embedding size (``size``).
        lr: Initial learning rate (``alpha``).
        window: Context window size.
        down: Down-sampling threshold (``sample``).
        neg_sample: Number of negative samples (``negative``).
        epochs: Number of training iterations (``iter``).

    Returns:
        The ``Word2Vec`` instance built from the grouped sequences.
    """
    corpus = data_prep(w2v_type, data)
    # sg=1 / hs=0 are fixed here; min_count=1 keeps every song in the vocab.
    options = dict(
        size=dim,
        alpha=lr,
        window=window,
        sample=down,
        sg=1,
        hs=0,
        negative=neg_sample,
        iter=epochs,
        min_count=1,
        compute_loss=True,
    )
    return Word2Vec(corpus, **options)
|
|
|
def doc2vec(data, d2v_type, dim, lr, window, down, neg_sample, epochs):
    """Train a Doc2Vec model over tagged song sequences.

    Args:
        data: DataFrame handed to ``data_prep``.
        d2v_type: Grouping passed to ``data_prep`` (callers in this file
            use 'user_doc' or 'session_doc').
        dim: Embedding size (``vector_size``).
        lr: Initial learning rate (``alpha``).
        window: Context window size.
        down: Down-sampling threshold (``sample``).
        neg_sample: Number of negative samples (``negative``).
        epochs: Number of training epochs.

    Returns:
        The ``Doc2Vec`` instance built from the tagged documents.
    """
    tagged_docs = data_prep(d2v_type, data)
    # dm=1 is fixed here; min_count=1 keeps every song in the vocab.
    options = dict(
        dm=1,
        vector_size=dim,
        alpha=lr,
        window=window,
        sample=down,
        negative=neg_sample,
        epochs=epochs,
        min_count=1,
        compute_loss=True,
    )
    return Doc2Vec(tagged_docs, **options)
|
|
|
def glove(data, glove_type, window, dim, lr, epochs):
    """Fit a GloVe model over song sequences.

    Args:
        data: DataFrame handed to ``data_prep``.
        glove_type: Grouping passed to ``data_prep`` (callers in this file
            use 'user' or 'session').
        window: Window size used when fitting the co-occurrence corpus.
        dim: Number of embedding components (``no_components``).
        lr: Learning rate.
        epochs: Number of training epochs.

    Returns:
        The fitted ``Glove`` model with the corpus dictionary attached.
    """
    sequences = data_prep(glove_type, data)

    # Build the co-occurrence statistics first; the model trains on its matrix.
    cooccurrence = Corpus()
    cooccurrence.fit(sequences, window=window)

    model = Glove(no_components=dim, learning_rate=lr)
    model.fit(cooccurrence.matrix, epochs=epochs, no_threads=4, verbose=True)
    model.add_dictionary(cooccurrence.dictionary)
    return model
|
|
|
# Path suffix (below tmp/<dataset>/experiments/) registered for each genre
# setup name. Names not listed here register the bare directory path.
_GENRE_FILES = {
    'add-all': 'all_genres/add/all_add.pickle',
    'mul-all': 'all_genres/mul/all_mul.pickle',
    'avg-all': 'all_genres/avg/all_avg.pickle',
    'add-ran': 'random_genres/add/ran_add.pickle',
    'mul-ran': 'random_genres/mul/ran_mul.pickle',
    'avg-ran': 'random_genres/avg/ran_avg.pickle',
}


def _model_paths(cwd, method, c_id, ext):
    """Return the (user-level, session-level) file paths of one setup."""
    path = '{}/{}__{}.{}'.format(cwd, method, c_id, ext)
    path_s = '{}/s{}__{}.{}'.format(cwd, method, c_id, ext)
    return path, path_s


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``; the file is closed even if dumping fails."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)


def embeddings(df, conf):
    """Train (or reuse) the models of every configured setup and index them.

    For each ``(method, generator)`` pair from ``Setups(conf)``, every setup
    ``s`` gets a running id ``c_id``. For 'rnn', 'music2vec', 'doc2vec' and
    'glove', a user-level and a session-level model are trained and stored
    under ``tmp/<dataset>/models`` unless both files already exist. 'genres'
    setups train nothing; they only register a path under
    ``tmp/<dataset>/experiments``. The collected ``[c_id, description, path]``
    rows are saved to ``tmp/<dataset>/models/ids`` with ``np.save``.

    Args:
        df: Listening data passed through to the individual trainers.
        conf: Configuration mapping; ``conf['evaluation']['dataset']`` names
            the dataset and the whole mapping is handed to ``Setups``.

    Returns:
        None. Results are written to disk.
    """
    ds = conf['evaluation']['dataset']
    cwd = 'tmp/{}/models'.format(ds)

    if not exists(cwd):
        makedirs(cwd)

    setups = Setups(conf)

    c_id = 0
    setups_id = []
    for method, generator in setups.get_generators():
        if method == 'rnn':
            for s in generator:
                to_str = setups.setup_to_string(c_id, s, method)
                print(to_str)

                path, path_s = _model_paths(cwd, method, c_id, 'pickle')

                # Retrain when either half is missing, so an interrupted run
                # cannot leave a user model without its session counterpart.
                if not (exists(path) and exists(path_s)):
                    user, session = rnn(df, ds, s['model'], s['window'], s['epochs'],
                                        s['batch'], s['dim'], s['num_units'], s['bidi'])
                    _dump_pickle(user, path)
                    _dump_pickle(session, path_s)

                setups_id.append([c_id, to_str, path])
                c_id += 1
        elif method == 'music2vec':
            for s in generator:
                to_str = setups.setup_to_string(c_id, s, method)
                print(to_str)

                path, path_s = _model_paths(cwd, method, c_id, 'model')

                if not (exists(path) and exists(path_s)):
                    m2v = music2vec(df, 'user', s['dim'], s['lr'], s['window'],
                                    s['down'], s['neg_sample'], s['epochs'])
                    sm2v = music2vec(df, 'session', s['dim'], s['lr'], s['window'],
                                     s['down'], s['neg_sample'], s['epochs'])

                    m2v.save(path)
                    sm2v.save(path_s)

                setups_id.append([c_id, to_str, path])
                c_id += 1
        elif method == 'doc2vec':
            for s in generator:
                to_str = setups.setup_to_string(c_id, s, method)
                print(to_str)

                path, path_s = _model_paths(cwd, method, c_id, 'model')

                if not (exists(path) and exists(path_s)):
                    d2v = doc2vec(df, 'user_doc', s['dim'], s['lr'], s['window'],
                                  s['down'], s['neg_sample'], s['epochs'])
                    sd2v = doc2vec(df, 'session_doc', s['dim'], s['lr'], s['window'],
                                   s['down'], s['neg_sample'], s['epochs'])

                    d2v.save(path)
                    sd2v.save(path_s)

                setups_id.append([c_id, to_str, path])
                c_id += 1
        elif method == 'glove':
            for s in generator:
                to_str = setups.setup_to_string(c_id, s, method)
                print(to_str)

                path, path_s = _model_paths(cwd, method, c_id, 'model')

                if not (exists(path) and exists(path_s)):
                    glv = glove(df, 'user', s['window'], s['dim'], s['lr'], s['epochs'])
                    sglv = glove(df, 'session', s['window'], s['dim'], s['lr'], s['epochs'])

                    glv.save(path)
                    sglv.save(path_s)

                # NOTE(review): unlike every other method, glove setups are
                # not appended to setups_id, so they never appear in the saved
                # index. Behaviour kept as found - confirm it is intentional.
                c_id += 1
        elif method == 'genres':
            for s in generator:
                to_str = s
                print(to_str)

                # Nothing is trained for genres; only a path is registered.
                path = 'tmp/{}/experiments/'.format(ds) + _GENRE_FILES.get(s, '')

                setups_id.append([c_id, to_str, path])
                c_id += 1

    if setups_id:
        index = np.stack(setups_id, axis=0)
    else:
        # np.stack rejects an empty list; store an empty three-column index
        # instead of failing after all the training work has been done.
        index = np.empty((0, 3), dtype=str)

    np.save('{}/ids'.format(cwd), index)
|
|