Igor Santana
commited on
Commit
·
9c58361
1
Parent(s):
c9be590
rnn model sent from github to huggingface
Browse files- .editorconfig +12 -0
- .gitignore +12 -0
- LICENSE +21 -0
- README.md +82 -2
- analysis/main.py +70 -0
- configs/config_sample.yml +46 -0
- environment.yml +129 -0
- main.py +39 -0
- project/__init__.py +0 -0
- project/data/preparation.py +87 -0
- project/data/preprocess.py +82 -0
- project/evaluation/ResultReport.py +27 -0
- project/evaluation/metrics.py +30 -0
- project/evaluation/ranking_metrics.py +240 -0
- project/evaluation/run.py +69 -0
- project/models/embeddings.py +166 -0
- project/models/rnn.py +180 -0
- project/models/seq2seq.py +201 -0
- project/models/setups.py +65 -0
- project/recsys/algorithms.py +92 -0
- project/recsys/helper.py +92 -0
.editorconfig
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# top-most EditorConfig file
|
2 |
+
root = true
|
3 |
+
|
4 |
+
# Unix-style newlines with a newline ending every file
|
5 |
+
[*]
|
6 |
+
end_of_line = lf
|
7 |
+
insert_final_newline = true
|
8 |
+
|
9 |
+
# Tab indentation, displayed 4 columns wide
|
10 |
+
[*.py]
|
11 |
+
indent_style = tab
|
12 |
+
indent_size = 4
|
.gitignore
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset/*
|
2 |
+
tmp/*
|
3 |
+
**/*.pyc
|
4 |
+
**/*.cpython-37.pyc
|
5 |
+
.ipynb_checkpoints/*
|
6 |
+
.history
|
7 |
+
.vscode
|
8 |
+
tmp
|
9 |
+
project/data/__pycache__/*.pyc
|
10 |
+
project/evaluation/__pycache__/*.pyc
|
11 |
+
project/recsys/__pycache__/*.pyc
|
12 |
+
project/__pycache__/*.pyc
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2020 Igor André P. Santana
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,3 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
-
|
3 |
-
|
|
|
1 |
+
# RNN Embeddings
|
2 |
+
|
3 |
+
## Jointly learning music embeddings with Recurrent Neural Networks
|
4 |
+
|
5 |
+
This repository contains all the code that I wrote during my master's at the State University of Maringá. I do not intend to add new features to this project, as I will not continue it in a PhD. To better understand the goal of this project, here is a quote from my thesis that summarizes what I did:
|
6 |
+
|
7 |
+
> This work's goal is to use Recurrent Neural Networks to acquire contextual information for each song, given the sequence of songs that each user has listened to using embeddings.
|
8 |
+
|
9 |
+
|
10 |
+
If you have any doubts about the code, or want to use it in your project, let me know! I will be glad to help you in anything you need.
|
11 |
+
|
12 |
+
### Installation and Setup
|
13 |
+
|
14 |
+
As this code was written in Python, I highly recommend that you use [conda](https://docs.conda.io/en/latest/) to install all the dependencies that you'll need to run it. I have provided the [environment file](environment.yml) that I ended up with, and to create the environment using this file, you should run the following command (assuming you already have conda):
|
15 |
+
|
16 |
+
```
|
17 |
+
conda env create -f environment.yml
|
18 |
+
```
|
19 |
+
|
20 |
+
It is important to know that I used Tensorflow 1.14.0, Cuda 9.2 and Python 3.6.9 to run the experiments. If you cannot run the project with the environment file that I have provided, perhaps it's because of one of those versions.
|
21 |
+
|
22 |
+
### Directory Structure and General Instructions
|
23 |
+
|
24 |
+
```
|
25 |
+
.
|
26 |
+
|-- analysis
|
27 |
+
|-- configs
|
28 |
+
|-- dataset
|
29 |
+
| |-- dataset #1
|
30 |
+
| |-- dataset #2
|
31 |
+
| `-- ...
|
32 |
+
|-- outputs
|
33 |
+
|-- project
|
34 |
+
| |-- data
|
35 |
+
| |-- evaluation
|
36 |
+
| |-- models
|
37 |
+
| `-- recsys
|
38 |
+
|-- tmp
|
39 |
+
```
|
40 |
+
|
41 |
+
This project follows this directory structure in order to work. The main python files are in the **project** folder, and any change that you'll want to do in the code must be done in the files in this folder. The **outputs** folder will contain the output file for the models that you built.
|
42 |
+
|
43 |
+
The **dataset** contains all the datasets that you'll use in the project, and for each dataset, you should create a separate folder for it inside the **dataset** folder. The project will then look for a `listening_history.csv` file inside of this folder to run it. This file **must be** comma-separated.
|
44 |
+
|
45 |
+
A temporary folder, **tmp**, will be created while the project works. For each dataset that you'll run this project with, a folder inside the **tmp** folder will be created. There you can find the cross-validation folds, the models that you built and the individual recommendations for each user, as well as some auxiliary matrices used in the UserKNN algorithm.
|
46 |
+
|
47 |
+
I have also included an **analysis** folder that I used to create some graphs with the results. You just have to point the `main.py` file in the analysis folder to where the results are, and it will show a graphical comparison between the models with all the metrics.
|
48 |
+
|
49 |
+
The project will only work if you provide a configuration file to it. In my case, I stored my configuration files in the **configs** folder, but feel free to delete the folder if you don't want it. The configuration file contains the parameters for the models, and I don't recommend deleting any parameter even if you are not going to use it. I've included a [sample configuration](configs/config_sample.yml) file that you can use as a guideline for your project.
|
50 |
+
|
51 |
+
|
52 |
+
To run the project, you have to pass the config to the `main.py` as a parameter.
|
53 |
+
|
54 |
+
```
|
55 |
+
$ python main.py --config=configs/config_sample.yml
|
56 |
+
```
|
57 |
+
|
58 |
+
|
59 |
+
###### DISCLAIMER:
|
60 |
+
|
61 |
+
The `model` and `bi` parameters in the `models/rnn` configuration object are not working, as I hardcoded it in my project. If you want to change the layer (to a GRU or a Simple RNN), you should do it [directly in the code](project/models/rnn.py#L147).
|
62 |
+
|
63 |
+
|
64 |
+
### What is included in this project?
|
65 |
+
|
66 |
+
To better understand the project, I highly recommend you to go check the work that I used as a baseline for my model:
|
67 |
+
|
68 |
+
- [link](https://doi.org/10.1007/s10791-017-9317-7) - Wang, D., Deng, S. & Xu, G. Sequence-based context-aware music recommendation. Information Retrieval Journal (2018)
|
69 |
+
|
70 |
+
Their work, *music2vec*, is one of the baselines for my RNN model. The following embeddings are implemented in this project:
|
71 |
+
|
72 |
+
- music2vec
|
73 |
+
- doc2vec - [link](https://cs.stanford.edu/~quocle/paragraph_vector.pdf)
|
74 |
+
- GloVe - [link](https://nlp.stanford.edu/projects/glove/)
|
75 |
+
|
76 |
+
To evaluate these embedding models, the context-aware recommender systems (CARS) that are implemented are the ones that were proposed by Wang et al. (M-TN, SM-TN, CSM-TN, CSM-UK). Besides the metrics that were used in the paper, I have included MAP, NDCG@5 and Precision@5 as well. The cutoff of these metrics is not configurable, sorry.
|
77 |
+
|
78 |
+
|
79 |
+
|
80 |
+
|
81 |
---
|
82 |
+
|
83 |
+
If you have any doubts about this project, feel free to contact me!
|
analysis/main.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import seaborn as sns
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
|
6 |
+
# Plot the mean precision / recall / F1 of every parameter set, one subplot
# per recommender algorithm, from a tab-separated results file.
sns.set(font_scale=1, style='whitegrid', context='paper')
colors = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71", '#f1c40f']
palette = sns.color_palette(colors)

df = pd.read_csv('data/xiami.csv', sep='\t')
df['id'] = df.params

# (value of the `algo` column in the results file, subplot title)
algorithms = [
    ('m2vTN', 'M-TN'),
    ('sm2vTN', 'SM-TN'),
    ('csm2vTN', 'CSM-TN'),
    ('csm2vUK', 'CSM-UK'),
]
titles = [title for _, title in algorithms]


def _melted_scores(results, algo):
    """Return the mean prec/rec/f1 per parameter set of `algo`, in long format.

    Rows are ordered by parameter-set id, descending, and the frame has the
    columns `id`, `variable` (metric name) and `value`.
    """
    scores = results[results.algo == algo][['id', 'prec', 'rec', 'f1']]
    scores = scores.groupby(by='id').mean().sort_index(ascending=False)
    scores['id'] = scores.index
    return pd.melt(scores, id_vars='id')


fig, axes = plt.subplots(2, 2, figsize=(25, 25))

# barplot is the axes-level function, so it can draw straight onto the
# subplot it is given. catplot is figure-level: it builds its own figure,
# which previously had to be closed by hard-coded figure number.
for ax, (algo, _) in zip(axes.flatten(), algorithms):
    sns.barplot(x='variable', y='value', hue='id', data=_melted_scores(df, algo),
                palette=palette, ax=ax)

# Use the legend entries of the last subplot as one shared figure legend.
last = axes.flatten()[-1]
handles, labels = last.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper left')

for ax, title in zip(axes.flatten(), titles):
    ax.get_legend().remove()
    ax.set(yticks=np.arange(0, 0.21, 0.025))
    ax.set(xlabel='Metrics Used', ylabel='Valor')
    ax.set(title=title)

plt.subplots_adjust(hspace=0.4)
fig.suptitle('Metrics', fontsize=18, y=.98)
plt.show()
|
69 |
+
|
70 |
+
|
configs/config_sample.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
models:
|
2 |
+
rnn:
|
3 |
+
embedding_dim: [256]
|
4 |
+
batch: 64
|
5 |
+
epochs: [50]
|
6 |
+
model: ['LSTM']
|
7 |
+
window: [3]
|
8 |
+
bi: [False]
|
9 |
+
num_units: [512]
|
10 |
+
music2vec:
|
11 |
+
window: [5]
|
12 |
+
epochs: [5]
|
13 |
+
down_sample: [1e-3]
|
14 |
+
learning_rate: [0.025]
|
15 |
+
embedding_dim: [300]
|
16 |
+
negative_sample: [20]
|
17 |
+
doc2vec:
|
18 |
+
window: [10]
|
19 |
+
epochs: [10]
|
20 |
+
down_sample: [1e-4]
|
21 |
+
learning_rate: [0.025]
|
22 |
+
embedding_dim: [50]
|
23 |
+
negative_sample: [10]
|
24 |
+
glove:
|
25 |
+
window: [10]
|
26 |
+
embedding_dim: [100]
|
27 |
+
epochs: [15]
|
28 |
+
learning_rate: [0.025]
|
29 |
+
session:
|
30 |
+
interval: 30
|
31 |
+
evaluation:
|
32 |
+
dataset: 'sample'
|
33 |
+
cross-validation: 5
|
34 |
+
k: 5
|
35 |
+
topN: 5
|
36 |
+
results:
|
37 |
+
full: 'outputs/sample.csv'
|
38 |
+
embeddings:
|
39 |
+
music2vec:
|
40 |
+
usage: True
|
41 |
+
doc2vec:
|
42 |
+
usage: False
|
43 |
+
glove:
|
44 |
+
usage: False
|
45 |
+
rnn:
|
46 |
+
usage: False
|
environment.yml
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: rnn-embeddings
|
2 |
+
channels:
|
3 |
+
- anaconda
|
4 |
+
- defaults
|
5 |
+
dependencies:
|
6 |
+
- _libgcc_mutex=0.1=main
|
7 |
+
- _tflow_select=2.1.0=gpu
|
8 |
+
- absl-py=0.8.1=py36_0
|
9 |
+
- astor=0.8.0=py36_0
|
10 |
+
- astroid=2.3.3=py36_0
|
11 |
+
- blas=1.0=mkl
|
12 |
+
- c-ares=1.15.0=h7b6447c_1001
|
13 |
+
- ca-certificates=2019.11.27=0
|
14 |
+
- cairo=1.14.12=h8948797_3
|
15 |
+
- certifi=2019.11.28=py36_0
|
16 |
+
- cudatoolkit=9.2=0
|
17 |
+
- cudnn=7.6.4=cuda9.2_0
|
18 |
+
- cupti=9.2.148=0
|
19 |
+
- cycler=0.10.0=py36_0
|
20 |
+
- dbus=1.13.12=h746ee38_0
|
21 |
+
- expat=2.2.6=he6710b0_0
|
22 |
+
- fontconfig=2.13.0=h9420a91_0
|
23 |
+
- freetype=2.9.1=h8a8886c_1
|
24 |
+
- fribidi=1.0.5=h7b6447c_0
|
25 |
+
- gast=0.3.2=py_0
|
26 |
+
- glib=2.63.1=h5a9c865_0
|
27 |
+
- google-pasta=0.1.8=py_0
|
28 |
+
- graphite2=1.3.13=h23475e2_0
|
29 |
+
- graphviz=2.40.1=h21bd128_2
|
30 |
+
- grpcio=1.16.1=py36hf8bcb03_1
|
31 |
+
- gst-plugins-base=1.14.0=hbbd80ab_1
|
32 |
+
- gstreamer=1.14.0=hb453b48_1
|
33 |
+
- h5py=2.9.0=py36h7918eee_0
|
34 |
+
- harfbuzz=1.8.8=hffaf4a1_0
|
35 |
+
- hdf5=1.10.4=hb1b8bf9_0
|
36 |
+
- icu=58.2=h9c2bf20_1
|
37 |
+
- intel-openmp=2019.4=243
|
38 |
+
- isort=4.3.21=py36_0
|
39 |
+
- joblib=0.14.0=py_0
|
40 |
+
- jpeg=9b=h024ee3a_2
|
41 |
+
- keras=2.2.4=0
|
42 |
+
- keras-applications=1.0.8=py_0
|
43 |
+
- keras-base=2.2.4=py36_0
|
44 |
+
- keras-preprocessing=1.1.0=py_1
|
45 |
+
- kiwisolver=1.1.0=py36he6710b0_0
|
46 |
+
- lazy-object-proxy=1.4.3=py36h7b6447c_0
|
47 |
+
- libedit=3.1.20181209=hc058e9b_0
|
48 |
+
- libffi=3.2.1=hd88cf55_4
|
49 |
+
- libgcc-ng=9.1.0=hdf63c60_0
|
50 |
+
- libgfortran-ng=7.3.0=hdf63c60_0
|
51 |
+
- libpng=1.6.37=hbc83047_0
|
52 |
+
- libprotobuf=3.10.1=hd408876_0
|
53 |
+
- libstdcxx-ng=9.1.0=hdf63c60_0
|
54 |
+
- libtiff=4.1.0=h2733197_0
|
55 |
+
- libuuid=1.0.3=h1bed415_2
|
56 |
+
- libxcb=1.13=h1bed415_1
|
57 |
+
- libxml2=2.9.9=hea5a465_1
|
58 |
+
- markdown=3.1.1=py36_0
|
59 |
+
- matplotlib=3.1.1=py36h5429711_0
|
60 |
+
- mccabe=0.6.1=py36_1
|
61 |
+
- mkl=2019.4=243
|
62 |
+
- mkl-service=2.3.0=py36he904b0f_0
|
63 |
+
- mkl_fft=1.0.15=py36ha843d7b_0
|
64 |
+
- mkl_random=1.1.0=py36hd6b4f25_0
|
65 |
+
- mock=3.0.5=py36_0
|
66 |
+
- ncurses=6.1=he6710b0_1
|
67 |
+
- openssl=1.1.1=h7b6447c_0
|
68 |
+
- pandas=0.25.3=py36he6710b0_0
|
69 |
+
- pango=1.42.4=h049681c_0
|
70 |
+
- patsy=0.5.1=py36_0
|
71 |
+
- pcre=8.43=he6710b0_0
|
72 |
+
- pip=19.3.1=py36_0
|
73 |
+
- pixman=0.38.0=h7b6447c_0
|
74 |
+
- protobuf=3.10.1=py36he6710b0_0
|
75 |
+
- pylint=2.4.4=py36_0
|
76 |
+
- pyparsing=2.4.5=py_0
|
77 |
+
- pyqt=5.9.2=py36h05f1152_2
|
78 |
+
- python=3.6.9=h265db76_0
|
79 |
+
- pytz=2019.3=py_0
|
80 |
+
- qt=5.9.7=h5867ecd_1
|
81 |
+
- readline=7.0=h7b6447c_5
|
82 |
+
- scikit-learn=0.21.3=py36hd81dba3_0
|
83 |
+
- scipy=1.3.1=py36h7c811a0_0
|
84 |
+
- seaborn=0.9.0=pyh91ea838_1
|
85 |
+
- setuptools=42.0.2=py36_0
|
86 |
+
- sip=4.19.8=py36hf484d3e_0
|
87 |
+
- six=1.13.0=py36_0
|
88 |
+
- sqlite=3.30.1=h7b6447c_0
|
89 |
+
- statsmodels=0.10.1=py36hdd07704_0
|
90 |
+
- tensorboard=1.14.0=py36hf484d3e_0
|
91 |
+
- tensorflow=1.14.0=gpu_py36hfc5689a_0
|
92 |
+
- tensorflow-base=1.14.0=gpu_py36h611c6d2_0
|
93 |
+
- tensorflow-estimator=1.14.0=py_0
|
94 |
+
- tensorflow-gpu=1.14.0=h0d30ee6_0
|
95 |
+
- termcolor=1.1.0=py36_1
|
96 |
+
- tk=8.6.8=hbc83047_0
|
97 |
+
- tornado=6.0.3=py36h7b6447c_0
|
98 |
+
- typed-ast=1.4.0=py36h7b6447c_0
|
99 |
+
- werkzeug=0.16.0=py_0
|
100 |
+
- wheel=0.33.6=py36_0
|
101 |
+
- wrapt=1.11.2=py36h7b6447c_0
|
102 |
+
- xz=5.2.4=h14c3975_4
|
103 |
+
- yaml=0.1.7=had09818_2
|
104 |
+
- zlib=1.2.11=h7b6447c_3
|
105 |
+
- zstd=1.3.7=h0b5b093_0
|
106 |
+
- pip:
|
107 |
+
- bilm==0.1.post5
|
108 |
+
- blessings==1.7
|
109 |
+
- boto==2.49.0
|
110 |
+
- boto3==1.10.33
|
111 |
+
- botocore==1.13.33
|
112 |
+
- chardet==3.0.4
|
113 |
+
- docutils==0.15.2
|
114 |
+
- gensim==3.8.1
|
115 |
+
- glove-python==0.1.0
|
116 |
+
- gpustat==0.6.0
|
117 |
+
- idna==2.8
|
118 |
+
- jmespath==0.9.4
|
119 |
+
- ml-metrics==0.1.4
|
120 |
+
- numpy==1.16.4
|
121 |
+
- nvidia-ml-py3==7.352.0
|
122 |
+
- psutil==5.6.7
|
123 |
+
- pydot==1.4.1
|
124 |
+
- python-dateutil==2.8.0
|
125 |
+
- pyyaml>=3.11, <6.0
|
126 |
+
- requests==2.22.0
|
127 |
+
- s3transfer==0.2.1
|
128 |
+
- smart-open==1.9.0
|
129 |
+
- urllib3==1.25.7
|
main.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import os
|
3 |
+
import yaml
|
4 |
+
import pickle
|
5 |
+
import argparse
|
6 |
+
import pandas as pd
|
7 |
+
import numpy as np
|
8 |
+
import multiprocessing as mp
|
9 |
+
import project.evaluation.run as r
|
10 |
+
from os.path import exists
|
11 |
+
from datetime import datetime
|
12 |
+
from project.data.preprocess import preprocess, remove_sessions
|
13 |
+
from project.models.embeddings import embeddings
|
14 |
+
from project.evaluation.run import cross_validation
|
15 |
+
|
16 |
+
|
17 |
+
if __name__ == '__main__':
    # Entry point: read the YAML configuration, sessionize the dataset,
    # build the embeddings if they are not cached yet and run the
    # cross-validation.

    # Reduce TensorFlow's native log output.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    parser = argparse.ArgumentParser(description='RNN Embeddings')
    # Required: without it the script used to crash later with open(None).
    parser.add_argument('--config', help='Configuration file', type=str, required=True)
    args = parser.parse_args()
    # Close the configuration file as soon as it has been parsed.
    with open(args.config) as config_file:
        conf = yaml.safe_load(config_file)

    print('The configuration file "%s" was read.' % args.config)
    print('Pre-process started for dataset "%s"' % conf['evaluation']['dataset'])

    preprocess(conf)

    ds = conf['evaluation']['dataset']
    df = pd.read_csv('dataset/{}/session_listening_history.csv'.format(ds), sep=',')

    emb_path = 'tmp/{}/models/ids.npy'.format(ds)

    # The ids file doubles as the "embeddings already trained" marker.
    if not exists(emb_path):
        embeddings(df, conf)
    ids = np.load(emb_path)
    cross_validation(df, conf, ids)
|
project/__init__.py
ADDED
File without changes
|
project/data/preparation.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import pandas as pd
|
3 |
+
import random
|
4 |
+
import numpy as np
|
5 |
+
import pickle
|
6 |
+
from os import makedirs
|
7 |
+
from os.path import exists
|
8 |
+
from gensim.models import Word2Vec, Doc2Vec
|
9 |
+
from glove import Glove
|
10 |
+
from sklearn.model_selection import KFold
|
11 |
+
|
12 |
+
def _rnn_load(path, songs):
    """Load pickled RNN embeddings and keep only the requested songs.

    Parameters
    ----------
    path : str
        Pickle file holding a mapping that can be indexed by song.
    songs : iterable
        Songs whose embeddings are wanted.

    Returns
    -------
    dict
        ``{song: embedding}`` for every entry of `songs`.

    Raises
    ------
    KeyError
        If a requested song is missing from the pickled mapping.
    """
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by this project.
    with open(path, 'rb') as handle:  # closed deterministically
        data = pickle.load(handle)
    return {song: data[song] for song in songs}
|
18 |
+
|
19 |
+
def __w2v_load(path, songs):
    """Return ``{song: vector}`` from the gensim model saved at `path`.

    The model is loaded with ``Word2Vec.load`` and its ``wv`` keyed vectors
    are indexed once per entry of `songs`.
    """
    vectors = Word2Vec.load(path).wv
    return {song: vectors[song] for song in songs}
|
25 |
+
|
26 |
+
def __g_load(path, songs):
    """Return ``{song: vector}`` from the GloVe model saved at `path`.

    Each song is translated to its row through ``glove.dictionary`` and that
    row is read from ``glove.word_vectors``.
    """
    glove = Glove.load(path)
    return {
        song: glove.word_vectors[glove.dictionary[song]]
        for song in songs
    }
|
32 |
+
|
33 |
+
def __load_exp(path, songs):
    """Return the object pickled at `path`, unfiltered.

    `songs` is accepted only so that this loader has the same signature as
    the other embedding loaders; it is not used.
    """
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by this project.
    with open(path, 'rb') as handle:  # closed deterministically
        return pickle.load(handle)
|
36 |
+
|
37 |
+
|
38 |
+
def get_embeddings(path, songs):
    """Load the user-level and session-level embeddings for `songs`.

    `path` is the user-level file. The session-level file is expected next
    to it, under the same file name prefixed with ``s``. The loader is
    picked by the first of these keywords found in `path`, in this order:
    ``experiments``, ``glove``, ``music2vec``, ``doc2vec``, ``rnn``.

    Returns
    -------
    tuple
        ``(user_embeddings, session_embeddings)``; two empty dicts when no
        keyword matches.
    """
    directory, separator, filename = path.rpartition('/')
    session_file = directory + separator + 's' + filename
    user_file = path

    if 'experiments' in path:
        loader = __load_exp
    elif 'glove' in path:
        loader = __g_load
    elif 'music2vec' in path or 'doc2vec' in path:
        # Both are stored as gensim models.
        loader = __w2v_load
    elif 'rnn' in path:
        loader = _rnn_load
    else:
        return {}, {}

    user_embeddings = loader(user_file, songs)
    session_embeddings = loader(session_file, songs)
    return user_embeddings, session_embeddings
|
54 |
+
|
55 |
+
def prepare_data(df, conf):
    """Return the cross-validation folds as a list of ``(train, test)`` frames.

    Folds are split by user. Each frame is indexed by user and has a single
    ``history`` column: the song lists of that user's distinct sessions.
    The folds are pickled under ``tmp/<dataset>/kfold/``; when that folder
    already exists they are read back instead of being recomputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Listening history with ``user``, ``song``, ``timestamp`` and
        ``session`` columns.
    conf : dict
        Configuration; ``conf['evaluation']['dataset']`` names the dataset
        and ``conf['evaluation']['k']`` is used as the number of folds.
    """
    dataset = conf['evaluation']['dataset']
    fold_dir = 'tmp/{}/kfold/'.format(dataset)

    if exists(fold_dir):
        # Reuse the folds written by a previous run.
        cached = []
        for number in range(1, conf['evaluation']['k'] + 1):
            train = pd.read_pickle(fold_dir + 'train_{}.pkl'.format(number))
            test = pd.read_pickle(fold_dir + 'test_{}.pkl'.format(number))
            cached.append((train, test))
        return cached

    makedirs(fold_dir)
    songs_by_session = df.groupby('session')['song'].apply(lambda x: x.tolist())
    users = df.groupby('user').agg(list)
    users['history'] = users['session'].apply(
        lambda session_ids: [songs_by_session[sid] for sid in list(set(session_ids))]
    )
    users = users.drop(['song', 'timestamp', 'session'], axis=1)
    unique_users = df.user.unique()
    splitter = KFold(n_splits=conf['evaluation']['k'], shuffle=True)

    folds = []
    for number, (train_idx, test_idx) in enumerate(splitter.split(unique_users), start=1):
        train_df = users[users.index.isin(unique_users[train_idx])]
        test_df = users[users.index.isin(unique_users[test_idx])]
        train_df.to_pickle(fold_dir + 'train_{}.pkl'.format(number))
        test_df.to_pickle(fold_dir + 'test_{}.pkl'.format(number))
        folds.append((train_df, test_df))
    return folds
|
83 |
+
|
84 |
+
|
85 |
+
|
86 |
+
|
87 |
+
|
project/data/preprocess.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from os import path
|
2 |
+
import csv
|
3 |
+
import math
|
4 |
+
import json
|
5 |
+
import yaml
|
6 |
+
import numpy as np
|
7 |
+
import pandas as pd
|
8 |
+
import multiprocessing as mp
|
9 |
+
from datetime import datetime, timedelta
|
10 |
+
|
11 |
+
def remove_sessions(df, leq=1):
    """Drop every session that has `leq` rows or fewer.

    Parameters
    ----------
    df : pandas.DataFrame
        Listening history with at least ``session`` and ``song`` columns.
    leq : int, optional (default=1)
        Sessions need strictly more than this many rows to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` that belong to the kept sessions.
    """
    # size() counts the rows of each session directly, instead of collecting
    # every column into Python lists just to take their length.
    session_sizes = df.groupby(by='session')['song'].size()
    to_stay = session_sizes[session_sizes > leq].index.values
    return df[df.session.isin(to_stay)]
|
16 |
+
|
17 |
+
|
18 |
+
def sessionize_user(ds, session_time, s_path):
    """Split a listening history into sessions and write it to `s_path`.

    Reads ``dataset/<ds>/listening_history.csv``, whose first three columns
    are written out as user, song and timestamp. A new session starts when
    the user changes or when the gap to the previous row is at least
    `session_time` minutes. Session ids are ``s0``, ``s1``, ... in file
    order.

    Parameters
    ----------
    ds : str
        Dataset folder name under ``dataset/``.
    session_time : int or float
        Minimum gap, in minutes, that opens a new session.
    s_path : str
        Output CSV path (header ``user,song,timestamp,session``).
    """
    df = pd.read_csv('dataset/{}/listening_history.csv'.format(ds), sep=',')
    # to_datetime also works on pandas versions that reject a unit-less
    # astype('datetime64').
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    # The first row has no predecessor (NaT), which compares False, so it
    # never opens a session by time.
    is_gap = df['timestamp'].diff() >= pd.Timedelta(minutes=session_time)

    session_no = 0
    last_user = None
    # The context manager guarantees the file is flushed and closed before
    # a caller reads it back.
    with open(s_path, 'w+') as out:
        print(','.join(['user', 'song', 'timestamp', 'session']), file=out)
        print('Sessionized "%s" data file: %s' % (ds, s_path))
        rows = zip(df.iloc[:, 0], df.iloc[:, 1], df.iloc[:, 2], is_gap)
        for position, (user, song, stamp, gap) in enumerate(rows):
            # Only the very first row is exempt from the user-change check.
            # Previously the check was skipped for every row until the first
            # time gap, merging different users into session s0.
            if position > 0 and (gap or user != last_user):
                session_no += 1
            last_user = user
            # str() keeps numeric user/song ids from breaking the join.
            fields = [str(user), str(song), str(stamp), 's{}'.format(session_no)]
            print(','.join(fields), file=out)
|
37 |
+
|
38 |
+
def _write_context_windows(sequences, num_w, handle):
    """Write one ``song<TAB>window`` line per song occurrence in `sequences`.

    A window is the `num_w` songs before the occurrence, the song itself and
    the `num_w` songs after it; positions outside the sequence are padded
    with ``'-'``. Lines are grouped by song, in order of first appearance.
    """
    windows_by_song = {}
    for sequence in sequences:
        length = len(sequence)
        for ix, song in enumerate(sequence):
            before = [sequence[i] if i >= 0 else '-' for i in range(ix - num_w, ix)]
            after = [sequence[i] if i < length else '-'
                     for i in range(ix + 1, ix + num_w + 1)]
            windows_by_song.setdefault(song, []).append(before + [song] + after)
    for song, windows in windows_by_song.items():
        for window in windows:
            print('{}\t{}'.format(song, window), file=handle)


def gen_seq_files(df, pwd, window_size):
    """Write the context windows of every song, per session and per user.

    Creates ``<pwd>c_seqs.csv`` (windows taken inside each session) and
    ``<pwd>u_seqs.csv`` (windows taken over each user's whole history).
    Each line is ``song<TAB>window``, the window being printed as a Python
    list.

    Parameters
    ----------
    df : pandas.DataFrame
        Listening history with ``user``, ``song`` and ``session`` columns.
    pwd : str
        Output prefix, concatenated as-is with the file names (so a folder
        must end with a path separator).
    window_size : int
        Total window width; ``window_size // 2`` songs are taken on each
        side of the centre song.
    """
    c_sessions = df.groupby('session')['song'].agg(list)
    u_sessions = df.groupby('user')['song'].agg(list)
    num_w = window_size // 2
    # Context managers close both files; they were previously left open.
    with open(pwd + 'c_seqs.csv', 'w+') as fc:
        _write_context_windows(c_sessions, num_w, fc)
    with open(pwd + 'u_seqs.csv', 'w+') as fu:
        _write_context_windows(u_sessions, num_w, fu)
|
71 |
+
|
72 |
+
|
73 |
+
def preprocess(conf):
    """Sessionize the configured dataset unless that was already done.

    The dataset name comes from ``conf['evaluation']['dataset']`` and the
    session gap from ``conf['session']['interval']``. The output is
    ``dataset/<dataset>/session_listening_history.csv``; when that file
    exists, only a message is printed.
    """
    dataset = conf['evaluation']['dataset']
    gap_minutes = conf['session']['interval']
    target = 'dataset/{}/session_listening_history.csv'.format(dataset)

    if not path.exists(target):
        print('Started to sessionize dataset "%s"' % dataset)
        sessionize_user(dataset, gap_minutes, target)
        return

    print('The "%s" dataset is already sessionized' % dataset)
|
81 |
+
|
82 |
+
|
project/evaluation/ResultReport.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
|
5 |
+
class Results():
    """Builds the per-fold result table of the four recommenders."""

    def __init__(self, setups, k):
        """Store `k` and start with empty metrics and an empty final frame.

        `setups` is accepted but not used here.
        """
        self.metrics = {}
        self.k = k
        self.final_df = pd.DataFrame()

    def fold_results(self, params, m2vTN, sm2vTN, csm2vTN, csm2vUK, fold):
        """Return one fold's metrics as a four-row frame, one row per algorithm.

        Each of `m2vTN`, `sm2vTN`, `csm2vTN` and `csm2vUK` is a sequence
        whose first six values are read as prec, rec, f1, map, ndcg@5 and
        p@5. `params` and `fold` are repeated on every row.
        """
        stacked = np.vstack([m2vTN, sm2vTN, csm2vTN, csm2vUK])
        print()
        columns = {
            'params': [params] * 4,
            'algo': ['m2vTN', 'sm2vTN', 'csm2vTN', 'csm2vUK'],
            'folds': [fold] * 4,
        }
        metric_names = ('prec', 'rec', 'f1', 'map', 'ndcg@5', 'p@5')
        for position, label in enumerate(metric_names):
            columns[label] = stacked[:, position]
        return pd.DataFrame(columns)
|
27 |
+
|
project/evaluation/metrics.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from project.evaluation.ranking_metrics import mean_average_precision, ndcg_at, precision_at
|
2 |
+
|
3 |
+
def __Prec(topn, test):
    """Return the share of recommended items that are in the ground truth.

    Parameters
    ----------
    topn : sequence
        Recommended items (hashable).
    test : iterable
        Ground-truth items (hashable).

    Returns
    -------
    float
        ``|set(topn) & set(test)| / len(topn)``; 0.0 when `topn` is empty
        instead of raising ZeroDivisionError.
    """
    num_rec = len(topn)
    if num_rec == 0:
        return 0.0
    num_intersect = len(set(topn) & set(test))
    return num_intersect / num_rec
|
7 |
+
|
8 |
+
def __Rec(topn, test):
    """Return the share of distinct ground-truth items that were recommended.

    Parameters
    ----------
    topn : iterable
        Recommended items (hashable).
    test : iterable
        Ground-truth items (hashable); duplicates count once.

    Returns
    -------
    float
        ``|set(topn) & set(test)| / |set(test)|``; 0.0 when `test` is empty
        instead of raising ZeroDivisionError.
    """
    relevant = set(test)
    if not relevant:
        return 0.0
    return len(set(topn) & relevant) / len(relevant)
|
12 |
+
|
13 |
+
def Hitrate(topn, test):
    """Return the number of distinct ground-truth hits divided by ``len(topn)``.

    Parameters
    ----------
    topn : sequence
        Recommended items (hashable).
    test : iterable
        Ground-truth items (hashable); duplicates count once.

    Returns
    -------
    float
        0.0 when `topn` is empty instead of raising ZeroDivisionError.
    """
    num_rec = len(topn)
    if num_rec == 0:
        return 0.0
    # Set lookup replaces a list scan per ground-truth item.
    recommended = set(topn)
    num_intersect = sum(1 for value in set(test) if value in recommended)
    return num_intersect / num_rec
|
17 |
+
|
18 |
+
def __F1(prec, rec):
    """Return the harmonic mean of `prec` and `rec`, or 0 when their sum is not positive."""
    total = prec + rec
    if not total > 0:
        return 0
    return 2 * ((prec * rec) / total)
|
20 |
+
|
21 |
+
|
22 |
+
def get_metrics(topn, test):
    """Compute every evaluation metric for one recommendation list.

    Parameters
    ----------
    topn : sequence
        Recommended items, best first.
    test : sequence
        Ground-truth items.

    Returns
    -------
    list
        ``[precision, recall, f1, MAP, NDCG@5, precision@5]``.
    """
    prec = __Prec(topn, test)
    rec = __Rec(topn, test)
    f = __F1(prec, rec)
    # precision_at is declared as (predictions, labels, ...), so the
    # recommendation list goes first and the ground truth second; the two
    # were previously passed the other way round.
    # NOTE(review): mean_average_precision and ndcg_at are assumed to share
    # that argument order - confirm against ranking_metrics.py.
    MAP = mean_average_precision([topn], [test], assume_unique=False)
    ndcg_5 = ndcg_at([topn], [test], k=5, assume_unique=False)
    p_5 = precision_at([topn], [test], k=5, assume_unique=False)

    return [prec, rec, f, MAP, ndcg_5, p_5]
|
project/evaluation/ranking_metrics.py
ADDED
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
#
|
3 |
+
# Author: Taylor G Smith
|
4 |
+
#
|
5 |
+
# Recommender system ranking metrics derived from Spark source for use with
|
6 |
+
# Python-based recommender libraries (i.e., implicit,
|
7 |
+
# http://github.com/benfred/implicit/). These metrics are derived from the
|
8 |
+
# original Spark Scala source code for recommender metrics.
|
9 |
+
# https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
|
10 |
+
|
11 |
+
import numpy as np
|
12 |
+
|
13 |
+
import warnings
|
14 |
+
|
15 |
+
__all__ = [
|
16 |
+
'mean_average_precision',
|
17 |
+
'ndcg_at',
|
18 |
+
'precision_at',
|
19 |
+
]
|
20 |
+
def _require_positive_k(k):
    """Raise ValueError when the ranking cutoff `k` is zero or negative."""
    if not k <= 0:
        return
    raise ValueError("ranking position k should be positive")
|
24 |
+
|
25 |
+
|
26 |
+
def _mean_ranking_metric(predictions, labels, metric):
    """Average `metric` over every (prediction, label) pair.

    `predictions` is walked exactly once, so it may be a generator, while
    `labels` is indexed by position. Both members of a pair are converted
    with ``np.asarray`` before `metric` is called.

    FIXME (kept from the original): the two inputs are not checked for equal
    length, because that cannot be done without consuming a generator.
    """
    scores = []
    for position, predicted in enumerate(predictions):
        expected = labels[position]
        scores.append(metric(np.asarray(predicted), np.asarray(expected)))
    return np.mean(scores)
|
44 |
+
|
45 |
+
|
46 |
+
def _warn_for_empty_labels():
|
47 |
+
"""Helper for missing ground truth sets"""
|
48 |
+
warnings.warn("Empty ground truth set! Check input data")
|
49 |
+
return 0.
|
50 |
+
|
51 |
+
|
52 |
+
def precision_at(predictions, labels, k=10, assume_unique=True):
    """Compute the precision at K.

    Compute the average precision of all the queries, truncated at
    ranking position k. If for a query, the ranking algorithm returns
    n (n is less than k) results, the precision value will be computed
    as #(relevant items retrieved) / k. This formula also applies when
    the size of the ground truth set is less than k.

    If a query has an empty ground truth set, zero will be used as
    precision together with a warning.

    Parameters
    ----------
    predictions : iterable of array-like
        One entry per query: the items that were predicted, in descending
        order of relevance.

    labels : sequence of array-like
        One entry per query: the labels (positively-rated items).

    k : int, optional (default=10)
        The rank at which to measure the precision.

    assume_unique : bool, optional (default=True)
        Whether to assume the items in the labels and predictions are each
        unique. That is, the same item is not predicted multiple times or
        rated multiple times.

    Raises
    ------
    ValueError
        If ``k`` is not positive.

    Examples
    --------
    >>> # predictions for 3 users
    >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5],
    ...          [4, 1, 5, 6, 2, 7, 3, 8, 9, 10],
    ...          [1, 2, 3, 4, 5]]
    >>> # labels for the 3 users
    >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []]
    >>> precision_at(preds, labels, 1)
    0.33333333333333331
    >>> precision_at(preds, labels, 5)
    0.26666666666666666
    >>> precision_at(preds, labels, 15)
    0.17777777777777778
    """
    # validate K
    _require_positive_k(k)

    def _inner_pk(pred, lab):
        # A query without ground truth scores zero (and warns).
        if lab.shape[0] == 0:
            return _warn_for_empty_labels()

        # Count the top-k predictions that are present in the labels.
        # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
        n = min(pred.shape[0], k)
        cnt = np.isin(pred[:n], lab, assume_unique=assume_unique).sum()

        # Divide by k (not n) so that short prediction lists are penalised.
        return float(cnt) / k

    return _mean_ranking_metric(predictions, labels, _inner_pk)
|
104 |
+
|
105 |
+
|
106 |
+
def mean_average_precision(predictions, labels, assume_unique=True):
    """Compute the mean average precision on predictions and labels.

    Returns the mean average precision (MAP) of all the queries. If a query
    has an empty ground truth set, the average precision will be zero and a
    warning is generated.

    Parameters
    ----------
    predictions : iterable of array-like
        One entry per query: the items that were predicted, in descending
        order of relevance.

    labels : sequence of array-like
        One entry per query: the labels (positively-rated items).

    assume_unique : bool, optional (default=True)
        Whether to assume the items in the labels and predictions are each
        unique. That is, the same item is not predicted multiple times or
        rated multiple times. When False, duplicate labels are collapsed
        before the average precision is normalised.

    Examples
    --------
    >>> # predictions for 3 users
    >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5],
    ...          [4, 1, 5, 6, 2, 7, 3, 8, 9, 10],
    ...          [1, 2, 3, 4, 5]]
    >>> # labels for the 3 users
    >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []]
    >>> mean_average_precision(preds, labels)
    0.35502645502645497
    """
    def _inner_map(pred, lab):
        # A query without ground truth scores zero (and warns).
        if lab.shape[0] == 0:
            return _warn_for_empty_labels()

        # The Spark reference divides by the size of the label *set*, so
        # duplicated labels must not inflate the denominator.
        if not assume_unique:
            lab = np.unique(lab)

        # Vectorised form of the Spark loop:
        #   for each rank i (1-based) whose prediction is a label:
        #       cnt += 1; precSum += cnt / i
        #   return precSum / labSet.size
        # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
        present = np.isin(pred, lab, assume_unique=assume_unique)
        ranks = np.arange(pred.shape[0], dtype=np.float32) + 1.
        hit_counts = np.ones(present.sum()).cumsum()
        return (hit_counts / ranks[present]).sum() / lab.shape[0]

    return _mean_ranking_metric(predictions, labels, _inner_map)
|
164 |
+
|
165 |
+
|
166 |
+
def ndcg_at(predictions, labels, k=10, assume_unique=True):
    """Compute the normalized discounted cumulative gain at K.

    Compute the average NDCG value of all the queries, truncated at ranking
    position k. The discounted cumulative gain at position k is computed as:

        DCG@k = sum_{i=1..k} (2^{relevance of i-th item} - 1) / log2(i + 1)

    and the NDCG is obtained by dividing by the DCG value of the ground
    truth set (the ideal ranking). In the current implementation, the
    relevance value is binary.

    If a query has an empty ground truth set, zero will be used as
    NDCG together with a warning.

    Parameters
    ----------
    predictions : iterable of array-like
        One entry per query: the items that were predicted, in descending
        order of relevance.

    labels : sequence of array-like
        One entry per query: the labels (positively-rated items).

    k : int, optional (default=10)
        The rank at which to measure the NDCG.

    assume_unique : bool, optional (default=True)
        Whether to assume the items in the labels and predictions are each
        unique. That is, the same item is not predicted multiple times or
        rated multiple times.

    Raises
    ------
    ValueError
        If ``k`` is not positive.

    Examples
    --------
    >>> # predictions for 3 users
    >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5],
    ...          [4, 1, 5, 6, 2, 7, 3, 8, 9, 10],
    ...          [1, 2, 3, 4, 5]]
    >>> # labels for the 3 users
    >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []]
    >>> ndcg_at(preds, labels, 3)
    0.3333333432674408
    >>> ndcg_at(preds, labels, 10)
    0.48791273434956867

    References
    ----------
    .. [1] K. Jarvelin and J. Kekalainen, "IR evaluation methods for
           retrieving highly relevant documents."
    """
    # validate K
    _require_positive_k(k)

    def _inner_ndcg(pred, lab):
        # A query without ground truth scores zero (and warns).
        if lab.shape[0] == 0:
            return _warn_for_empty_labels()

        # if we do NOT assume uniqueness, the set is a bit different here
        if not assume_unique:
            lab = np.unique(lab)

        n_lab = lab.shape[0]
        n_pred = pred.shape[0]
        n = min(max(n_pred, n_lab), k)

        # Discount for the 0-based rank i is 1 / log2(i + 2).
        gains = 1. / np.log2(np.arange(n, dtype=np.float32) + 2.)

        # Gains collected where the prediction is present in the labels.
        # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
        dcg_mask = np.isin(pred[:n], lab, assume_unique=assume_unique)
        dcg = gains[:dcg_mask.shape[0]][dcg_mask].sum()

        # The ideal DCG covers the first min(n_lab, n) ranks. It must not be
        # shortened to the number of predictions, otherwise a short
        # prediction list is scored against an artificially small ideal.
        max_dcg = gains[:n_lab].sum()
        return dcg / max_dcg

    return _mean_ranking_metric(predictions, labels, _inner_ndcg)
|
project/evaluation/run.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import csv
|
3 |
+
import os
|
4 |
+
import yaml
|
5 |
+
import pickle
|
6 |
+
import numpy as np
|
7 |
+
import pandas as pd
|
8 |
+
import project.evaluation.metrics as m
|
9 |
+
from os.path import exists
|
10 |
+
from project.data.preparation import prepare_data, get_embeddings
|
11 |
+
from project.recsys.helper import Helper
|
12 |
+
from datetime import datetime
|
13 |
+
from project.recsys.algorithms import execute_algo
|
14 |
+
from project.evaluation.ResultReport import Results
|
15 |
+
from keras.models import model_from_yaml
|
16 |
+
|
17 |
+
def get_rnn():
    """Rebuild the trained RNN from its YAML architecture and HDF5 weights.

    Reads ``training_model.yaml`` and ``training_weights.h5`` from the
    current working directory.

    Returns
    -------
    The Keras model with the stored weights loaded.
    """
    # NOTE(review): model_from_yaml parses YAML; only use it on files this
    # project produced itself, never on untrusted input.
    # Read the architecture inside a context manager so the handle is closed.
    with open('training_model.yaml', 'r') as yaml_file:
        model = model_from_yaml(yaml_file.read())
    model.load_weights('training_weights.h5')
    return model
|
21 |
+
|
22 |
+
def skip_all(executed, params, k):
    """Tell whether a setup already has results for all ``k`` folds.

    Parameters
    ----------
    executed : pandas.DataFrame
        Previously stored results with ``params`` and ``folds`` columns.
    params : str
        Identifier of the setup to look up.
    k : int
        Total number of folds.

    Returns
    -------
    Truthy when the highest fold recorded for ``params`` equals ``k``.
    """
    same_setup = executed['params'] == params
    highest_fold = executed.loc[same_setup, 'folds'].max()
    return highest_fold == k
|
25 |
+
|
26 |
+
def skip_fold(executed, params, fold):
    """Tell whether a given fold of a setup was already evaluated.

    Parameters
    ----------
    executed : pandas.DataFrame
        Previously stored results with ``params`` and ``folds`` columns.
    params : str
        Identifier of the setup to look up.
    fold : int
        Fold number about to be run.

    Returns
    -------
    Truthy when the highest fold recorded for ``params`` is at least
    ``fold``.
    """
    same_setup = executed['params'] == params
    highest_fold = executed.loc[same_setup, 'folds'].max()
    return highest_fold >= fold
|
29 |
+
|
30 |
+
def cross_validation(df, conf, setups):
    """Run the k-fold evaluation of every embedding setup and store results.

    Results are appended, one fold at a time, to the tab-separated file
    named by ``conf['results']['full']``. Setups and folds already present
    in that file are skipped, so an interrupted run can be resumed.

    Parameters
    ----------
    df : pandas.DataFrame
        Listening data; must contain a ``song`` column.
    conf : dict
        Configuration with an ``evaluation`` section (``dataset``, ``topN``,
        ``k``) and a ``results`` section (``full``).
    setups : iterable
        Triples whose second item is the setup identifier and whose third
        item is the path handed to ``get_embeddings``.
    """
    params = conf['evaluation']
    r_paths = conf['results']

    kfold = prepare_data(df, conf)
    dataset = params['dataset']
    topN = int(params['topN'])
    k = int(params['k'])
    results = Results(setups, k)
    exec_path = r_paths['full']
    pwd_rec = 'tmp/{}/rec/'.format(dataset)

    # makedirs also creates the missing parents (tmp/<dataset>) and does not
    # fail when the directory already exists.
    os.makedirs(pwd_rec, exist_ok=True)
    if not exists(exec_path):
        # Start the results file with only its header row.
        pd.DataFrame({},columns=['params','algo','folds','prec','rec','f1','map','ndcg@5','p@5']).to_csv(exec_path,index=None,sep='\t')

    executed = pd.read_csv(exec_path, sep='\t')

    for setup in setups:
        _, params, path = setup
        os.makedirs(pwd_rec + params, exist_ok=True)
        if skip_all(executed, params, k):
            continue
        songs = df['song'].unique().tolist()
        m2v, sm2v = get_embeddings(path, songs)
        songs = pd.DataFrame({ 'm2v': [m2v[x] for x in songs], 'sm2v': [sm2v[x] for x in songs]}, index=songs, columns=['m2v','sm2v'])
        # NOTE(review): kfold is iterated once per setup; if prepare_data
        # returns a one-shot generator only the first setup sees any fold.
        for fold, (train, test) in enumerate(kfold, 1):
            if skip_fold(executed, params, fold):
                continue
            stamp = datetime.now().strftime('%d/%m/%Y %H:%M')
            print('%s | fold-%d | Running recsys w/ k-fold with the following params: %s' % (stamp, fold, params))
            helper = Helper(train, test, songs, dataset)
            m2vTN, sm2vTN, csm2vTN, csm2vUK = execute_algo(train.index, test.index, songs, topN, k, helper, pwd_rec + params)
            res = results.fold_results(params, m2vTN, sm2vTN, csm2vTN, csm2vUK, fold)
            # Append without header so the file grows fold by fold.
            res.to_csv(exec_path, sep='\t', mode='a', index=None, header=None)
|
project/models/embeddings.py
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
|
3 |
+
import pickle
|
4 |
+
import pandas as pd
|
5 |
+
import numpy as np
|
6 |
+
from os import makedirs
|
7 |
+
from os.path import exists
|
8 |
+
from gensim.models import Word2Vec, Doc2Vec
|
9 |
+
from gensim.models.doc2vec import TaggedDocument
|
10 |
+
from datetime import datetime
|
11 |
+
from glove import Glove, Corpus
|
12 |
+
from project.models.rnn import rnn
|
13 |
+
from project.models.setups import Setups
|
14 |
+
from project.models.seq2seq import start as rnn_start
|
15 |
+
|
16 |
+
def data_prep(model, df):
    """Group the songs of ``df`` into the sequences an embedding model needs.

    Parameters
    ----------
    model : str
        ``'user'`` or ``'session'`` for plain song lists grouped by that
        column; ``'user_doc'`` or ``'session_doc'`` for ``TaggedDocument``
        objects tagged with the group key.
    df : pandas.DataFrame
        Must contain ``song`` plus the grouping column.

    Returns
    -------
    list or None
        One entry per group; ``None`` for an unrecognised ``model``.
    """
    if model in ('user', 'session'):
        grouped = df.groupby(by=model)['song']
        return grouped.apply(list).values.tolist()

    if model in ('user_doc', 'session_doc'):
        key = 'user' if model == 'user_doc' else 'session'

        def _as_document(songs):
            # songs.name is the group key (the user or session id).
            return TaggedDocument(words=songs.tolist(), tags=[songs.name])

        return df.groupby(by=key)['song'].apply(_as_document).values.tolist()

    return None
|
25 |
+
|
26 |
+
def music2vec(data, w2v_type, dim, lr, window, down, neg_sample, epochs):
    """Train a Word2Vec model over song sequences.

    Parameters
    ----------
    data : pandas.DataFrame
        Listening data passed on to ``data_prep``.
    w2v_type : str
        Grouping understood by ``data_prep`` (e.g. ``'user'``, ``'session'``).
    dim, lr, window, down, neg_sample, epochs
        Forwarded to ``Word2Vec`` as ``size``, ``alpha``, ``window``,
        ``sample``, ``negative`` and ``iter`` respectively.

    Returns
    -------
    The ``Word2Vec`` model returned by the constructor.
    """
    # Fixed choices: sg=1, hs=0, min_count=1 (no song is dropped).
    w2v_options = dict(
        size=dim,
        alpha=lr,
        window=window,
        sample=down,
        sg=1,
        hs=0,
        negative=neg_sample,
        iter=epochs,
        min_count=1,
        compute_loss=True,
    )
    song_sequences = data_prep(w2v_type, data)
    return Word2Vec(song_sequences, **w2v_options)
|
30 |
+
|
31 |
+
def doc2vec(data, d2v_type, dim, lr, window, down, neg_sample, epochs):
    """Train a Doc2Vec model over tagged song sequences.

    Parameters
    ----------
    data : pandas.DataFrame
        Listening data passed on to ``data_prep``.
    d2v_type : str
        Grouping understood by ``data_prep`` (e.g. ``'user_doc'``,
        ``'session_doc'``).
    dim, lr, window, down, neg_sample, epochs
        Forwarded to ``Doc2Vec`` as ``vector_size``, ``alpha``, ``window``,
        ``sample``, ``negative`` and ``epochs`` respectively.

    Returns
    -------
    The ``Doc2Vec`` model returned by the constructor.
    """
    # Fixed choices: dm=1, min_count=1 (no song is dropped).
    d2v_options = dict(
        dm=1,
        vector_size=dim,
        alpha=lr,
        window=window,
        sample=down,
        negative=neg_sample,
        epochs=epochs,
        min_count=1,
        compute_loss=True,
    )
    tagged_sequences = data_prep(d2v_type, data)
    return Doc2Vec(tagged_sequences, **d2v_options)
|
35 |
+
|
36 |
+
def glove(data, glove_type, window, dim, lr, epochs):
    """Train a GloVe model over song sequences.

    Parameters
    ----------
    data : pandas.DataFrame
        Listening data passed on to ``data_prep``.
    glove_type : str
        Grouping understood by ``data_prep`` (e.g. ``'user'``, ``'session'``).
    window : int
        Co-occurrence window used when fitting the corpus.
    dim : int
        Number of components of the model.
    lr : float
        Learning rate of the model.
    epochs : int
        Number of training epochs.

    Returns
    -------
    The fitted ``Glove`` model with the corpus dictionary attached.
    """
    # Build the co-occurrence matrix first, then fit the model on it.
    cooccurrences = Corpus()
    cooccurrences.fit(data_prep(glove_type, data), window=window)

    model = Glove(no_components=dim, learning_rate=lr)
    model.fit(cooccurrences.matrix, epochs=epochs, no_threads=4, verbose=True)
    model.add_dictionary(cooccurrences.dictionary)
    return model
|
44 |
+
|
45 |
+
def embeddings(df, conf):
    """Train (or reuse) every configured embedding model and index them.

    For each setup produced by ``Setups(conf).get_generators()`` the model
    is trained only when its output file does not exist yet. The list of
    ``[id, description, path]`` entries is finally saved with ``np.save``
    under ``tmp/<dataset>/models/ids``.

    Parameters
    ----------
    df : pandas.DataFrame
        Listening data handed to the individual trainers.
    conf : dict
        Configuration; ``conf['evaluation']['dataset']`` names the dataset.
    """
    ds = conf['evaluation']['dataset']
    cwd = 'tmp/{}/models'.format(ds)

    if not exists(cwd):
        makedirs(cwd)

    setups = Setups(conf)
    generators = setups.get_generators()

    # Pre-computed genre embeddings, relative to tmp/<dataset>/experiments/.
    # The session-level file sits next to each one with an 's' prefix.
    genre_pickles = {
        'add-all': 'all_genres/add/all_add.pickle',
        'mul-all': 'all_genres/mul/all_mul.pickle',
        'avg-all': 'all_genres/avg/all_avg.pickle',
        'add-ran': 'random_genres/add/ran_add.pickle',
        'mul-ran': 'random_genres/mul/ran_mul.pickle',
        'avg-ran': 'random_genres/avg/ran_avg.pickle',
    }

    c_id = 0
    setups_id = []
    for method, generator in generators:
        if method == 'rnn':
            for s in generator:
                to_str = setups.setup_to_string(c_id, s, method)
                print(to_str)

                path = '{}/{}__{}.pickle'.format(cwd, method, c_id)
                path_s = '{}/s{}__{}.pickle'.format(cwd, method, c_id)

                if not exists(path):
                    user, session = rnn(df, ds, s['model'], s['window'], s['epochs'],
                                        s['batch'], s['dim'], s['num_units'], s['bidi'])
                    # Context managers close the files even if dumping fails.
                    with open(path, 'wb') as fu:
                        pickle.dump(user, fu, protocol=pickle.HIGHEST_PROTOCOL)
                    with open(path_s, 'wb') as fs:
                        pickle.dump(session, fs, protocol=pickle.HIGHEST_PROTOCOL)

                setups_id.append([c_id, to_str, path])
                c_id += 1
        if method == 'music2vec':
            for s in generator:
                to_str = setups.setup_to_string(c_id, s, method)
                print(to_str)

                path = '{}/{}__{}.model'.format(cwd, method, c_id)
                path_s = '{}/s{}__{}.model'.format(cwd, method, c_id)

                if not exists(path):
                    m2v = music2vec(df, 'user', s['dim'], s['lr'], s['window'], s['down'], s['neg_sample'], s['epochs'])
                    sm2v = music2vec(df, 'session', s['dim'], s['lr'], s['window'], s['down'], s['neg_sample'], s['epochs'])

                    m2v.save(path)
                    sm2v.save(path_s)

                setups_id.append([c_id, to_str, path])
                c_id += 1
        if method == 'doc2vec':
            for s in generator:
                to_str = setups.setup_to_string(c_id, s, method)
                path = '{}/{}__{}.model'.format(cwd, method, c_id)
                path_s = '{}/s{}__{}.model'.format(cwd, method, c_id)
                print(to_str)

                if not exists(path):
                    d2v = doc2vec(df, 'user_doc', s['dim'], s['lr'], s['window'], s['down'], s['neg_sample'], s['epochs'])
                    sd2v = doc2vec(df, 'session_doc', s['dim'], s['lr'], s['window'], s['down'], s['neg_sample'], s['epochs'])

                    d2v.save(path)
                    sd2v.save(path_s)

                setups_id.append([c_id, to_str, path])
                c_id += 1
        if method == 'glove':
            for s in generator:
                to_str = setups.setup_to_string(c_id, s, method)
                path = '{}/{}__{}.model'.format(cwd, method, c_id)
                path_s = '{}/s{}__{}.model'.format(cwd, method, c_id)
                print(to_str)

                if not exists(path):
                    glv = glove(df, 'user', s['window'], s['dim'], s['lr'], s['epochs'])
                    sglv = glove(df, 'session', s['window'], s['dim'], s['lr'], s['epochs'])

                    glv.save(path)
                    sglv.save(path_s)

                # NOTE(review): unlike the other methods, glove setups are
                # not added to setups_id - confirm this is intentional.
                c_id += 1
        if method == 'genres':
            for s in generator:
                to_str = s
                print(to_str)
                # Unknown names keep the bare experiments directory as path.
                path = 'tmp/{}/experiments/'.format(ds) + genre_pickles.get(s, '')

                setups_id.append([c_id, to_str, path])
                c_id += 1

    setups_id = np.stack(setups_id, axis=0)

    np.save('{}/ids'.format(cwd), setups_id)
|
166 |
+
|
project/models/rnn.py
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from os.path import exists
|
2 |
+
from keras.utils import to_categorical
|
3 |
+
from keras.models import Model
|
4 |
+
from keras.layers import Embedding, LSTM, Dense, CuDNNGRU, LSTM, Input, Bidirectional, Dropout, Concatenate, Bidirectional
|
5 |
+
from keras.models import Sequential, load_model
|
6 |
+
from keras.callbacks import EarlyStopping, ModelCheckpoint
|
7 |
+
from keras.preprocessing.sequence import TimeseriesGenerator
|
8 |
+
import concurrent.futures as fut
|
9 |
+
import os
|
10 |
+
import gc
|
11 |
+
import keras
|
12 |
+
import pickle
|
13 |
+
import time
|
14 |
+
import numpy as np
|
15 |
+
import pickle as pk
|
16 |
+
import pandas as pd
|
17 |
+
import tensorflow as tf
|
18 |
+
import matplotlib.pyplot as plt
|
19 |
+
from math import floor
|
20 |
+
|
21 |
+
|
22 |
+
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
|
23 |
+
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '4'
|
24 |
+
|
25 |
+
def get_window(playlist, ix, window):
    """Return the songs surrounding position ``ix``, padded to a fixed size.

    The result holds ``window`` items before and ``window`` items after
    ``playlist[ix]`` (the item itself is excluded). Where the playlist is
    too short on either side, the gap is filled with the string ``'0'``.

    Parameters
    ----------
    playlist : list
        Ordered songs.
    ix : int
        Position of the centre song.
    window : int
        Number of neighbours to take on each side.

    Returns
    -------
    list
        ``2 * window`` items: left context followed by right context.
    """
    playlist[ix]  # raises IndexError early for an out-of-range position

    size = len(playlist)
    lo = ix - window
    hi = ix + window + 1
    following = playlist[ix + 1:hi]

    # Enough neighbours on both sides: no padding required.
    if lo >= 0 and hi < size:
        return playlist[lo:ix] + following

    # Left side: pad in front when the window starts before the playlist.
    if lo < 0:
        preceding = (abs(lo) * ['0']) + playlist[0:ix]
    else:
        preceding = playlist[lo:ix]

    # Right side: pad at the back when the window runs past the end.
    if hi > size:
        following = following + ((hi - size) * ['0'])

    return preceding + following
|
50 |
+
|
51 |
+
|
52 |
+
def window_seqs(sequence, w_size):
    """Slice a sequence into (window, next item) training pairs.

    Parameters
    ----------
    sequence : sequence
        The ordered items.
    w_size : int
        Number of items in every input window.

    Returns
    -------
    tuple of (list, list)
        ``x`` holds the windows ``sequence[i:i + w_size]`` and ``y`` the
        matching one-element lists ``[sequence[i + w_size]]``. Both are
        empty when the sequence has no item following a full window.
    """
    # Every start position whose target index (i + w_size) is still inside
    # the sequence is valid, including the one whose target is the last item.
    starts = range(len(sequence) - w_size)
    x = [sequence[i:i + w_size] for i in starts]
    y = [[sequence[i + w_size]] for i in starts]
    return x, y
|
62 |
+
|
63 |
+
def rnn(df, DS, MODEL, W_SIZE, EPOCHS, BATCH_SIZE, EMBEDDING_DIM, NUM_UNITS, BIDIRECTIONAL):
    """Train a two-branch LSTM on song contexts and return its embeddings.

    Every song occurrence becomes one training sample: its user-level and
    session-level context windows are the inputs and the song itself is the
    target. The samples are cached in
    ``dataset/<DS>/song_context_<W_SIZE>.txt`` and reused when that file
    already exists.

    Parameters
    ----------
    df : pandas.DataFrame
        Listening data with ``user``, ``session`` and ``song`` columns. It is
        not modified.
    DS : str
        Dataset name; used to build the file paths.
    MODEL, BIDIRECTIONAL
        Accepted for interface compatibility; not used by this function.
    W_SIZE : int
        Neighbours taken on each side of a song.
    EPOCHS, BATCH_SIZE : int
        Training schedule.
    EMBEDDING_DIM : int
        Size of the learned song vectors.
    NUM_UNITS : int
        Units of each LSTM branch.

    Returns
    -------
    tuple of (dict, dict)
        ``(u_emb, s_emb)`` mapping each original song id to its row of the
        ``User_Embeddings`` / ``Session_Embeddings`` layer.
    """
    pwd = 'dataset/{}/'.format(DS)
    WINDOW = W_SIZE * 2
    context_path = pwd + 'song_context_{}.txt'.format(W_SIZE)
    checkpoint_path = '{}_model_checkpoint.h5'.format(DS)

    # Index 0 is reserved for padding, so real songs start at 1.
    vocab = sorted(set(df.song.unique().tolist()))
    vocab_size = len(vocab) + 1
    song2ix = {u: i for i, u in enumerate(vocab, 1)}
    with open('{}_song2ix.pickle'.format(DS), 'wb') as ix_file:
        pickle.dump(song2ix, ix_file, pickle.HIGHEST_PROTOCOL)

    if not exists(context_path):
        # Work on a copy: overwriting df['song'] would leak the integer ids
        # into the caller's frame and into every later call.
        indexed = df.assign(song=df.song.apply(lambda song: song2ix[song]))
        u_playlists = indexed[['user', 'song']].groupby('user').agg(tuple)['song'].values
        u_playlists = [list(p) for p in u_playlists]
        s_playlists = indexed[['session', 'song']].groupby('session').agg(tuple)['song'].values
        s_playlists = [list(p) for p in s_playlists]

        nou_playlists = len(u_playlists)
        nos_playlists = len(s_playlists)

        user_windows = {song2ix[song]: [] for song in vocab}
        session_windows = {song2ix[song]: [] for song in vocab}

        for k4, pl in enumerate(u_playlists, 1):
            print('[{}/{}] [USER] Playlist'.format(k4, nou_playlists), flush=False, end='\r')
            for ix in range(len(pl)):
                user_windows[pl[ix]].append(get_window(pl, ix, W_SIZE))
        print()
        for k4, pl in enumerate(s_playlists, 1):
            print('[{}/{}] [SESSION] Playlist'.format(k4, nos_playlists), flush=False, end='\r')
            for ix in range(len(pl)):
                session_windows[pl[ix]].append(get_window(pl, ix, W_SIZE))
        print()

        # One line per occurrence: user window, session window, target.
        with open(context_path, 'w') as f:
            for song in vocab:
                u_occurrences = user_windows[song2ix[song]]
                s_occurrences = session_windows[song2ix[song]]
                for u_o, s_o in zip(u_occurrences, s_occurrences):
                    print('{}\t{}\t{}'.format(','.join([str(i) for i in u_o]), ','.join([str(i) for i in s_o]), str(song2ix[song])), file=f)

    data = []
    with open(context_path, mode='r') as f:
        for line in f:
            line = line.replace('\n', '')
            raw_user, raw_session, target = line.split('\t')
            line = [np.array([int(x) for x in raw_user.split(',')]), np.array([int(x) for x in raw_session.split(',')]), int(target)]
            data.append(line)

    data = np.vstack(data)

    np.random.shuffle(data)

    def batch(data, bs):
        # Endless generator, as fit_generator expects.
        while True:
            for ix in range(0, len(data), bs):
                u_input = data[ix:ix+bs,0]
                s_input = data[ix:ix+bs,1]
                target = data[ix:ix+bs,2]
                # The order must match Model(inputs=[input_session, input_user])
                # below, otherwise each branch is trained on the other's windows.
                yield [np.vstack(s_input), np.vstack(u_input)], to_categorical(target, num_classes=vocab_size)

    # The first 20% of the shuffled samples is held out for validation.
    train, test = data[int(len(data) *.20):], data[:int(len(data) *.20)]

    input_session = Input(batch_shape=(None, WINDOW))
    embedding_session = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, name='Session_Embeddings', mask_zero=True)(input_session)
    drop_session = Dropout(0.2)(embedding_session)
    rec_session = LSTM(NUM_UNITS, name='Session_LSTM')(drop_session)
    drop_session = Dropout(0.2)(rec_session)

    input_user = Input(batch_shape=(None, WINDOW))
    embedding_user = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, name='User_Embeddings', mask_zero=True)(input_user)
    drop_user = Dropout(0.2)(embedding_user)
    rec_user = LSTM(NUM_UNITS, name='User_LSTM')(drop_user)
    drop_user = Dropout(0.2)(rec_user)
    combination = Concatenate()([drop_session, drop_user])
    dense = Dense(vocab_size, activation='softmax', name='Densa')(combination)
    model = Model(inputs=[input_session, input_user], outputs=dense)
    checkpoint = ModelCheckpoint(checkpoint_path, monitor='loss', verbose=0, save_best_only=False, period=1)
    es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=5)

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    # Resume from the last checkpoint of this dataset when there is one.
    if exists(checkpoint_path):
        model = load_model(checkpoint_path)

    model.fit_generator(generator=batch(train, BATCH_SIZE), steps_per_epoch=len(train) // BATCH_SIZE, epochs=EPOCHS,
                        validation_data=batch(test, BATCH_SIZE), validation_steps=len(test) // BATCH_SIZE, callbacks=[es, checkpoint])

    session_embeddings = model.get_layer('Session_Embeddings').get_weights()[0]
    user_embeddings = model.get_layer('User_Embeddings').get_weights()[0]

    u_emb = {}
    s_emb = {}

    for song in vocab:
        u_emb[song] = user_embeddings[song2ix[song]]
        s_emb[song] = session_embeddings[song2ix[song]]

    # Drop the model explicitly to release its memory before the next setup.
    del model
    gc.collect()
    return u_emb, s_emb
|
project/models/seq2seq.py
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
from project.data.preprocess import gen_seq_files
|
5 |
+
from os.path import exists
|
6 |
+
from keras.models import Model
|
7 |
+
from keras.callbacks import EarlyStopping
|
8 |
+
from keras.layers import Dense, CuDNNLSTM, CuDNNGRU, Embedding, Input, SimpleRNN
|
9 |
+
|
10 |
+
def read_input_targets(path, win_size, t):
    """Read song sequences and build the decoder targets for seq2seq.

    Each line of the input file is ``<key>\\t<sequence>``, where the sequence
    is a comma-separated list optionally wrapped in square brackets.

    Parameters
    ----------
    path : str
        Directory prefix (including the trailing separator) of the files.
    win_size : int
        Accepted for interface compatibility; not used by this function.
    t : str
        ``'session'`` reads ``c_seqs.csv``; ``'listening'`` reads
        ``u_seqs.csv``.

    Returns
    -------
    tuple or None
        ``(s_i, s_t, d)``: the space-joined input sequences, the same
        sequences wrapped in ``START_`` / ``_END`` markers, and a dict from
        each key to its sequences. ``None`` for any other ``t``.
    """
    file_names = {'session': 'c_seqs.csv', 'listening': 'u_seqs.csv'}
    if t not in file_names:
        return None

    d = {}
    s_i = []
    # The context manager closes the file even when a line is malformed.
    with open(path + file_names[t]) as f:
        for line in f:
            l = line.rstrip('\n').split('\t')
            x = ' '.join(l[1].replace('[', '').replace(']', '').split(','))
            s_i.append(x)
            d.setdefault(l[0], []).append(x)
    s_t = ['START_ ' + session + ' _END' for session in s_i]
    return s_i, s_t, d
|
40 |
+
|
41 |
+
def get_unique_songs(s_i, s_t):
    """Collect the sorted vocabularies of the input and target sequences.

    Parameters
    ----------
    s_i : iterable of str
        Input sequences; tokens are separated by whitespace.
    s_t : iterable of str
        Target sequences; tokens are separated by whitespace.

    Returns
    -------
    tuple of (list, list)
        Sorted distinct tokens of ``s_i`` and of ``s_t``.
    """
    input_tokens = {token for sequence in s_i for token in sequence.split()}
    target_tokens = {token for sequence in s_t for token in sequence.split()}
    return sorted(input_tokens), sorted(target_tokens)
|
53 |
+
|
54 |
+
def get_max_length(s_i, s_t):
    """Return the token count of the longest input and target sequence.

    Parameters
    ----------
    s_i : iterable of str
        Input sequences; tokens are separated by whitespace.
    s_t : iterable of str
        Target sequences; tokens are separated by whitespace.

    Returns
    -------
    tuple
        ``(max_i, max_t)`` as computed by ``np.max``.
    """
    def _longest(sequences):
        token_counts = [len(sequence.split()) for sequence in sequences]
        return np.max(token_counts)

    return _longest(s_i), _longest(s_t)
|
58 |
+
|
59 |
+
def get_dicts(i_songs, t_songs):
    """Build the token <-> index lookup tables for inputs and targets.

    Indices start at 1, following the order of the given vocabularies.

    Parameters
    ----------
    i_songs : iterable
        Input vocabulary.
    t_songs : iterable
        Target vocabulary.

    Returns
    -------
    tuple of four dicts
        ``(song_ix_i, song_ix_t, ix_song_i, ix_song_t)``: token-to-index for
        inputs and targets, then the two inverse mappings.
    """
    song_ix_i = {song: position for position, song in enumerate(i_songs, 1)}
    song_ix_t = {word: position for position, word in enumerate(t_songs, 1)}
    ix_song_i = {position: song for song, position in song_ix_i.items()}
    ix_song_t = {position: song for song, position in song_ix_t.items()}
    return song_ix_i, song_ix_t, ix_song_i, ix_song_t
|
65 |
+
|
66 |
+
def __run_s2s(sessions_i, sessions_t, num_songs, song_ix, max_l, NUM_DIM=128, BATCH_SIZE= 128, EPOCHS=50, MODEL='RNN', WINDOW_SIZE=5):
    """Train an encoder/decoder network and return its encoder half.

    Args:
        sessions_i: list of whitespace-separated input sequences.
        sessions_t: list of target sequences, aligned index-by-index with
            ``sessions_i``.
        num_songs: ``(num_encoder_songs, num_decoder_songs)`` vocabulary sizes.
        song_ix: ``(song_ix_i, song_ix_t)`` token -> index dictionaries.
        max_l: ``(max_length_i, max_length_t)`` padded sequence lengths.
        NUM_DIM: size of the embeddings and of the recurrent state.
        BATCH_SIZE: batch size used for training and validation.
        EPOCHS: maximum number of epochs (early stopping may end sooner).
        MODEL: one of ``'LSTM'``, ``'GRU'`` or ``'RNN'``.
        WINDOW_SIZE: accepted for interface compatibility; not used here.

    Returns:
        ``(encoder, generate_batch)``: a model mapping an encoded input
        sequence to the encoder state(s), and the batch generator used for
        training so callers can encode further sequences the same way.

    Raises:
        ValueError: if ``MODEL`` is not a supported recurrent layer name.
    """
    if MODEL not in ('LSTM', 'GRU', 'RNN'):
        raise ValueError("MODEL must be 'LSTM', 'GRU' or 'RNN', got {!r}".format(MODEL))

    num_encoder_songs, num_decoder_songs = num_songs
    song_ix_i, song_ix_t = song_ix
    max_length_i, max_length_t = max_l

    def generate_batch(X, y, batch_size= 128):
        """Endlessly yield ``([encoder_in, decoder_in], decoder_target)`` batches."""
        while True:
            for j in range(0, len(X), batch_size):
                encoder_input_data = np.zeros((batch_size, max_length_i), dtype='float32')
                decoder_input_data = np.zeros((batch_size, max_length_t), dtype='float32')
                decoder_target_data = np.zeros((batch_size, max_length_t, num_decoder_songs), dtype='float32')
                for i, (input_sequence, target_sequence) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                    for t, word in enumerate(input_sequence.split()):
                        # '-' is encoded as 0, the same value as the zero padding.
                        encoder_input_data[i, t] = song_ix_i[word] if word != '-' else 0
                    target_tokens = target_sequence.split()
                    last = len(target_tokens) - 1
                    for t, word in enumerate(target_tokens):
                        token_ix = song_ix_t[word] if word != '-' else 0
                        if t < last:
                            # Decoder input is the target without its final token.
                            decoder_input_data[i, t] = token_ix
                        if t > 0:
                            # Decoder target is the same sequence shifted one step ahead.
                            decoder_target_data[i, t - 1, token_ix] = 1
                yield([encoder_input_data, decoder_input_data], decoder_target_data)

    # Shuffle inputs and targets with ONE shared permutation: shuffling the two
    # lists independently would pair every input with an unrelated target.
    order = np.random.permutation(len(sessions_i))
    X = [sessions_i[k] for k in order]
    y = [sessions_t[k] for k in order]

    # The first 10% is held out for validation.
    split = int(len(X) * .1)
    X_train, X_test = X[split:], X[:split]
    y_train, y_test = y[split:], y[:split]

    TRAIN_SAMPLES = len(X_train)
    VAL_SAMPLES = len(X_test)

    ENCODER_INPUT = Input(shape=(None,))
    ENCODER_EMBEDDING = Embedding(num_encoder_songs, NUM_DIM)(ENCODER_INPUT)
    if MODEL == 'LSTM':
        ENCODER_NN = CuDNNLSTM(NUM_DIM, return_state=True)
        _, state_h, state_c = ENCODER_NN(ENCODER_EMBEDDING)
        ENCODER_STATE = [state_h, state_c]
    elif MODEL == 'GRU':
        ENCODER_NN = CuDNNGRU(NUM_DIM, return_state=True)
        _, ENCODER_STATE = ENCODER_NN(ENCODER_EMBEDDING)
    else:
        ENCODER_NN = SimpleRNN(NUM_DIM, return_state=True)
        _, ENCODER_STATE = ENCODER_NN(ENCODER_EMBEDDING)

    DECODER_INPUT = Input(shape=(None,))
    DECODER_EMBEDDING = Embedding(num_decoder_songs, NUM_DIM)(DECODER_INPUT)
    if MODEL == 'LSTM':
        DECODER_NN = CuDNNLSTM(NUM_DIM, return_sequences=True, return_state=True)
        DECODER_OUTPUT,_,_ = DECODER_NN(DECODER_EMBEDDING, initial_state=ENCODER_STATE)
    elif MODEL == 'GRU':
        DECODER_NN = CuDNNGRU(NUM_DIM, return_sequences=True, return_state=True)
        DECODER_OUTPUT,_ = DECODER_NN(DECODER_EMBEDDING, initial_state=ENCODER_STATE)
    else:
        DECODER_NN = SimpleRNN(NUM_DIM, return_sequences=True, return_state=True)
        DECODER_OUTPUT,_ = DECODER_NN(DECODER_EMBEDDING, initial_state=ENCODER_STATE)
    DENSE_DECODER = Dense(num_decoder_songs, activation='softmax')
    DECODER_OUTPUT = DENSE_DECODER(DECODER_OUTPUT)

    es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=5)

    model = Model([ENCODER_INPUT, DECODER_INPUT], DECODER_OUTPUT)
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
    model.summary()
    model.fit_generator(generator= generate_batch(X_train, y_train, batch_size= BATCH_SIZE),
                        steps_per_epoch= TRAIN_SAMPLES // BATCH_SIZE,
                        epochs=EPOCHS,
                        validation_data= generate_batch(X_test, y_test, batch_size= BATCH_SIZE),
                        validation_steps= VAL_SAMPLES // BATCH_SIZE, callbacks=[es])

    # Only the encoder is needed afterwards: its final state is the embedding.
    return Model(ENCODER_INPUT, ENCODER_STATE), generate_batch
|
137 |
+
|
138 |
+
def _mean_encoder_states(model, gen, songs, song_seqs, is_lstm):
    """Return, for each song, the mean encoder state over that song's sequences."""
    from itertools import islice

    embeddings = []
    for song in songs:
        seqs = song_seqs[song]
        targets = ['START_ ' + seq + ' _END' for seq in seqs]
        states = []
        # gen() never terminates on its own; with batch_size=1 its first
        # len(seqs) batches contain each sequence exactly once.
        for (input_seq, _), _ in islice(gen(seqs, targets, batch_size=1), len(seqs)):
            if is_lstm:
                # The LSTM encoder outputs two states; only the first is kept.
                state, _ = model.predict(input_seq)
            else:
                state = model.predict(input_seq)
            states.append(state[0])
        embeddings.append(np.mean(np.array(states), 0))
    return embeddings


def start(df, conf, id, ds):
    """Train the seq2seq models for dataset ``ds`` and save song embeddings.

    Two encoders are trained, one on the 'listening' sequences and one on the
    'session' sequences. For each, every song in ``df.song`` is embedded as
    the mean encoder state of the sequences recorded for that song, and the
    result is saved under ``tmp/<ds>/models/`` as ``<id>`` (listening) and
    ``s<id>`` (session).

    Args:
        df: data frame with a ``song`` column; also passed to
            ``gen_seq_files`` when the sequence files do not exist yet.
        conf: dict providing ``window_size``, ``vector_dim``, ``batch_size``,
            ``epochs`` and ``model``.
        id: identifier used in the output file names.
        ds: dataset name used to build the ``dataset/`` and ``tmp/`` paths.
    """
    s2s = conf
    dataset_dir = 'dataset/{}/'.format(ds)
    if not exists('dataset/{}/u_seqs.csv'.format(ds)):
        print('Files %s and %s are going to be at "%s"' % ('u_seqs.csv', 'c_seqs.csv', dataset_dir))
        gen_seq_files(df, dataset_dir, conf['window_size'])
    songs = df.song.unique()
    del df  # drop the local reference before the memory-heavy training

    sessions_i, sessions_t, song_seqs_ses = read_input_targets(dataset_dir, s2s['window_size'], 'session')
    listening_i, listening_t, song_seqs_list = read_input_targets(dataset_dir, s2s['window_size'], 'listening')

    # Vocabulary and padded lengths come from the listening sequences and are
    # reused unchanged for the session model.
    input_songs, target_songs = get_unique_songs(listening_i, listening_t)
    max_length_i, max_length_t = get_max_length(listening_i, listening_t)
    num_encoder_songs, num_decoder_songs = len(input_songs) + 1, len(target_songs) + 1
    song_ix_i, song_ix_t, _, _ = get_dicts(input_songs, target_songs)

    is_lstm = s2s['model'] == 'LSTM'
    runs = (
        (listening_i, listening_t, song_seqs_list, 'tmp/{}/models/{}'),
        (sessions_i, sessions_t, song_seqs_ses, 'tmp/{}/models/s{}'),
    )
    for inputs, targets, song_seqs, out_pattern in runs:
        model, gen = __run_s2s(inputs, targets, (num_encoder_songs, num_decoder_songs), (song_ix_i, song_ix_t),
                               (max_length_i, max_length_t), NUM_DIM=s2s['vector_dim'], BATCH_SIZE=s2s['batch_size'],
                               EPOCHS=s2s['epochs'], MODEL=s2s['model'], WINDOW_SIZE=s2s['window_size'])

        embeddings = _mean_encoder_states(model, gen, songs, song_seqs, is_lstm)
        # NOTE(review): ``songs`` and ``embeddings`` have different inner
        # shapes, so this builds a ragged array; recent NumPy releases reject
        # that without dtype=object. Left unchanged because the on-disk layout
        # is read elsewhere -- confirm against the loader before altering it.
        emb_values = np.array([songs, embeddings])
        np.save(out_pattern.format(ds, id), emb_values)
|
project/models/setups.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class Setups():
    """Expands the hyper-parameter lists of a configuration into concrete setups.

    ``config['models']`` holds, per model family, lists of candidate values;
    the ``*_setups`` generators yield one entry per combination.
    ``config['embeddings']`` says which families are enabled.
    """

    def __init__(self, config):
        self.__config = config
        self.models_config = config['models']

    def get_config(self):
        """Return the configuration object given at construction."""
        return self.__config

    def rnn_setups(self):
        """Yield one dict per combination of the 'rnn' hyper-parameters."""
        grid = self.models_config['rnn']
        yield from (
            {'window': int(window), 'model': kind, 'dim': int(emb_dim), 'batch': int(grid['batch']),
             'epochs': int(n_epochs), 'num_units': int(units), 'bidi': bidirectional}
            for kind in grid['model']
            for window in grid['window']
            for units in grid['num_units']
            for emb_dim in grid['embedding_dim']
            for n_epochs in grid['epochs']
            for bidirectional in grid['bi']
        )

    def d2v_m2v_setups(self, model):
        """Yield one dict per hyper-parameter combination of ``model``'s section."""
        grid = self.models_config[model]
        yield from (
            {'window': window, 'dim': int(emb_dim), 'lr': float(rate), 'down': float(down_sample),
             'epochs': int(n_epochs), 'neg_sample': float(negative)}
            for window in grid['window']
            for negative in grid['negative_sample']
            for down_sample in grid['down_sample']
            for rate in grid['learning_rate']
            for n_epochs in grid['epochs']
            for emb_dim in grid['embedding_dim']
        )

    def glove_setups(self):
        """Yield one dict per combination of the 'glove' hyper-parameters."""
        grid = self.models_config['glove']
        yield from (
            {'window': int(window), 'dim': int(emb_dim), 'lr': float(rate), 'epochs': int(n_epochs)}
            for window in grid['window']
            for emb_dim in grid['embedding_dim']
            for rate in grid['learning_rate']
            for n_epochs in grid['epochs']
        )

    def genre_setups(self):
        """Yield '<value>-all' for every 'all' entry, then '<value>-ran' for every 'ran' entry."""
        grid = self.models_config['genres']
        for suffix in ('all', 'ran'):
            for value in grid[suffix]:
                yield '{}-{}'.format(value, suffix)

    def __return_gen(self, model):
        """Return the setup generator for ``model``, or None for an unknown name."""
        if model == 'rnn':
            return self.rnn_setups()
        elif model in ('music2vec', 'doc2vec'):
            return self.d2v_m2v_setups(model)
        elif model == 'glove':
            return self.glove_setups()
        elif model == 'genres':
            return self.genre_setups()
        return None

    def get_generators(self):
        """Return ``(name, generator)`` pairs for every embedding whose 'usage' is True."""
        return [
            (name, self.__return_gen(name))
            for name, options in self.__config['embeddings'].items()
            if options['usage'] == True
        ]

    def setup_to_string(self, id, setup_obj, model_type):
        """Serialise a setup dict as '<model_type>--<id>--key:value--key:value...'."""
        pairs = [key + ':' + str(value) for key, value in setup_obj.items()]
        return '{}--{}--{}'.format(model_type, id, '--'.join(pairs))
|
project/recsys/algorithms.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import time
|
4 |
+
import yaml
|
5 |
+
import pickle
|
6 |
+
import multiprocessing as mp
|
7 |
+
import numpy as np
|
8 |
+
from project.evaluation.metrics import get_metrics
|
9 |
+
from datetime import datetime
|
10 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
11 |
+
|
12 |
+
|
13 |
+
def write_rec(pwd, sessions):
    """Pickle ``sessions`` into the file at path ``pwd``, replacing its contents.

    The file is opened in a ``with`` block so the handle is closed even when
    pickling fails.
    """
    with open(pwd, 'wb') as handle:
        pickle.dump(sessions, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
17 |
+
|
18 |
+
def recs(session, original, mtn_rec, smtn_rec, csmtn_rec, csmuk_rec):
    """Bundle the recommendations produced for one session into a plain dict.

    Args:
        session: session identifier.
        original: the songs the recommendations are compared against.
        mtn_rec, smtn_rec, csmtn_rec, csmuk_rec: recommendation arrays of the
            four algorithms; each must provide ``tolist()``.

    Returns:
        A dict with keys 'session', 'original', 'mtn_rec', 'smtn_rec',
        'csmtn_rec' and 'csmuk_rec', the arrays converted to lists.
    """
    return {
        'session': session,
        'original': original,
        'mtn_rec': mtn_rec.tolist(),
        'smtn_rec': smtn_rec.tolist(),
        'csmtn_rec': csmtn_rec.tolist(),
        # Previously filled from csmtn_rec, so the CSM-UK results were never stored.
        'csmuk_rec': csmuk_rec.tolist(),
    }
|
20 |
+
|
21 |
+
def execute_algo(train, test, songs, topN, k_sim, data, pwd):
    """Run the four recommenders for every test user and average their metrics.

    For each user and each of the user's sessions with a non-empty first half:
      * M-TN   -- top-N songs by cosine similarity to the user preference.
      * SM-TN  -- top-N songs by cosine similarity to the session preference.
      * CSM-TN -- top-N by the sum of the two similarities above.
      * CSM-UK -- top-N by the sum of the session similarity and a score
                  derived from the ``k_sim`` most similar users.
    The recommendations of each user are pickled to ``pwd/<user>`` ('/' in
    the user id replaced by '_').

    Args:
        train: unused; kept for interface compatibility.
        test: iterable of user ids (its ``len`` is used for progress output).
        songs: object whose ``index.values`` lists the candidate song ids.
        topN: number of songs to recommend.
        k_sim: number of similar users used by CSM-UK.
        data: helper exposing ``us_matrix``, ``uu_matrix``, ``u_pref``,
            ``c_pref``, ``ix_user``, ``song_ix``, ``user_sessions``,
            ``get_n_largest``, ``m2v_songs`` and ``sm2v_songs``.
        pwd: directory receiving the per-user recommendation files.

    Returns:
        ``(m_m2vTN, m_sm2vTN, m_csm2vTN, m_csm2vUK)``: the metrics of each
        algorithm averaged over all evaluated sessions.
    """
    m2vTN = []
    sm2vTN = []
    csm2vTN = []
    csm2vUK = []

    u_songs = data.us_matrix()
    users = data.uu_matrix()

    num_users = len(test)

    def rep(ix_user, user_id, algo):
        return '[{}/{}] Running algorithm {} for user {}!'.format(ix_user, num_users, algo, user_id)

    def pref(u, k_similar, song):
        """Average similarity between user ``u`` and the neighbours that listened to ``song``."""
        song_col = data.song_ix(song)
        listeners = [k for k in k_similar if u_songs[k, song_col] == 1]
        # Counted once instead of on every loop iteration.
        n_listeners = len(listeners)
        sum_sims = 0
        for u_k in listeners:
            sum_sims += users[u][u_k] / n_listeners
        return sum_sims

    for u, user in enumerate(test, 1):
        out_path = pwd + '/' + user.replace('/', '_')
        # Placeholder file written up front; it is overwritten with the real
        # recommendations once the user has been processed.
        with open(out_path, 'wb') as f:
            pickle.dump({}, f, protocol=pickle.HIGHEST_PROTOCOL)

        print(rep(u, user, 'M-TN'), flush=False, end='\r')
        user_cos = cosine_similarity(data.u_pref(user).reshape(1, -1), data.m2v_songs)[0]
        user_tn = data.get_n_largest(user_cos, topN)

        # Looked up once per user rather than once per candidate song.
        user_ix = data.ix_user(user)
        sim_ix = np.argpartition(users[user_ix], -k_sim)[-k_sim:]
        song_sim = np.array([pref(user_ix, sim_ix, song) for song in songs.index.values])
        to_write = []
        s = 1

        for (train_songs, test_songs) in data.user_sessions(user):
            if len(train_songs) > 0:
                m2vTN.append(get_metrics(user_tn, test_songs))
                c_pref = data.c_pref(train_songs)

                print(rep(u, user, 'SM-TN'), flush=False, end='\r')
                con_cos = cosine_similarity(c_pref.reshape(1, -1), data.sm2v_songs)[0]
                cos_tn = data.get_n_largest(con_cos, topN)
                sm2vTN.append(get_metrics(cos_tn, test_songs))

                print(rep(u, user, 'CSM-TN'), flush=False, end='\r')
                f_cos = np.sum([user_cos, con_cos], axis=0)
                both_tn = data.get_n_largest(f_cos, topN)
                csm2vTN.append(get_metrics(both_tn, test_songs))

                print(rep(u, user, 'CSM-UK'), flush=False, end='\r')
                UK_cos = np.sum([song_sim, con_cos], axis=0)
                uk_tn = data.get_n_largest(UK_cos, topN)
                csm2vUK.append(get_metrics(uk_tn, test_songs))
                to_write.append(recs(s, test_songs, user_tn, cos_tn, both_tn, uk_tn))
            # NOTE(review): the original indentation of this increment was not
            # recoverable; it is placed at session-loop level so ``s`` is the
            # 1-based position of the session -- confirm this is intended.
            s += 1
        write_rec(out_path, to_write)

    m_m2vTN = np.mean(m2vTN, axis=0).tolist()
    m_sm2vTN = np.mean(sm2vTN, axis=0).tolist()
    m_csm2vTN = np.mean(csm2vTN, axis=0).tolist()
    m_csm2vUK = np.mean(csm2vUK, axis=0).tolist()
    return (m_m2vTN, m_sm2vTN, m_csm2vTN, m_csm2vUK)
|
project/recsys/helper.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import math
|
4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
5 |
+
import warnings
|
6 |
+
|
7 |
+
class Helper():
    """Lookup tables and similarity matrices shared by the recommenders.

    ``train`` and ``test`` are indexed by user id and carry a 'history'
    column (a list of sessions, each a list of song ids). ``songs`` is indexed
    by song id and carries the 'm2v' and 'sm2v' embedding columns.
    """

    def __init__(self, train, test, songs, ds):
        self.ds = ds
        self.train = train
        self.test = test
        self.songs = songs
        self.m2v_songs = self.songs.m2v.tolist()
        self.sm2v_songs = self.songs.sm2v.tolist()
        # song id -> column index, and the inverse mapping.
        self.songs_ix = {v: k for k, v in enumerate(songs.index, 0)}
        self.ix_songs = {k: v for k, v in enumerate(songs.index, 0)}
        # Despite its name this maps user id -> row index (train users first).
        self.ix_users = {v: k for k, v in enumerate(np.concatenate([train.index.values, test.index.values]).tolist(), 0)}
        self.num_users = len(self.ix_users)
        self.num_songs = len(songs.index)
        # Both tables are keyed by the user's row index.
        self.ix_pref = {v: self.u_pref(k) for (k, v) in self.ix_users.items()}
        self.ix_u_songs = {v: self.unique_songs(k) for (k, v) in self.ix_users.items()}

    def _user_history(self, user):
        """Return the session history of ``user``; the test set wins over train.

        Raises:
            KeyError: if the user is in neither the train nor the test set.
        """
        found = False
        history = None
        if user in self.train.index:
            history = self.train[self.train.index == user]['history'].values[0]
            found = True
        if user in self.test.index:
            history = self.test[self.test.index == user]['history'].values[0]
            found = True
        if not found:
            raise KeyError(user)
        return history

    def user_sessions(self, user):
        """Split each test session of ``user`` into (first half, second half)."""
        history = self.test.loc[user, 'history']
        return [(s[:len(s)//2], s[len(s)//2:]) for s in history]

    def song_ix(self, song):
        """Return the column index of ``song``."""
        return self.songs_ix[song]

    def ix_user(self, ix):
        """Return the row index of the user id ``ix``."""
        return self.ix_users[ix]

    def unique_songs(self, user):
        """Return the distinct songs in the user's history (arbitrary order)."""
        history = self._user_history(user)
        flat_history = [song for session in history for song in session]
        return list(set(flat_history))

    def u_pref(self, user):
        """Return the mean 'm2v' vector over the first half of each user session."""
        history = [s[:len(s)//2] for s in self._user_history(user)]
        flat_history = [song for session in history for song in session]
        vectors = [self.songs.loc[song, 'm2v'] for song in flat_history]
        return np.mean(vectors, axis=0)

    def c_pref(self, songs):
        """Return the mean 'sm2v' vector of the given songs."""
        flat_vecs = self.songs.loc[songs, 'sm2v'].tolist()
        return np.mean(np.array(flat_vecs), axis=0)

    def get_n_largest(self, cos, n):
        """Return the ids of the ``n`` songs with the highest scores in ``cos``.

        ``n`` is clamped to ``[0, len(cos)]``: a non-positive ``n`` yields an
        empty selection (``[-0:]`` would otherwise select everything) and an
        oversized ``n`` yields every song instead of raising.
        NOTE: np.argpartition does not order the selected entries, so the
        result is the top-n set, not a ranking by score.
        """
        songs = self.songs.index.values
        n = min(n, len(cos))
        if n <= 0:
            return songs[:0]
        index = np.argpartition(cos, -n)[-n:]
        return songs[index]

    def uu_matrix(self):
        """Return the user-by-user matrix, loading it from ``tmp/<ds>/`` when cached.

        Entry (i, j) is ``1 / (cosine(pref_i, pref_j) + sqrt(|songs_i| + |songs_j|))``.
        """
        if os.path.isfile('tmp/{}/matrix_users.npy'.format(self.ds)):
            return np.load('tmp/{}/matrix_users.npy'.format(self.ds))

        matrix_users = np.zeros((self.num_users, self.num_users))
        # Loop-invariant inputs, built once instead of once per row.
        u_array = np.array([self.ix_pref[i] for i in range(self.num_users)])
        n_unique = np.array([len(self.ix_u_songs[i]) for i in range(self.num_users)])

        for ix in range(self.num_users):
            y_array = np.sqrt(n_unique[ix] + n_unique)
            # cosine_similarity returns shape (1, num_users); take the row so
            # both operands are 1-D (summing a ragged list fails on recent NumPy).
            cos = cosine_similarity(self.ix_pref[ix].reshape(1, -1), u_array)[0]
            matrix_users[ix] = 1.0 / (cos + y_array)
        np.save('tmp/{}/matrix_users'.format(self.ds), matrix_users)
        return matrix_users

    def us_matrix(self):
        """Return the binary user-by-song matrix, loading it from ``tmp/<ds>/`` when cached.

        Entry (u, s) is 1 when song ``s`` occurs in the history of user ``u``.
        """
        if os.path.isfile('tmp/{}/matrix_user_songs.npy'.format(self.ds)):
            return np.load('tmp/{}/matrix_user_songs.npy'.format(self.ds))

        matrix_u_songs = np.zeros((self.num_users, self.num_songs))
        for u, songs in self.ix_u_songs.items():
            songs_ids = [self.songs_ix[s] for s in songs]
            y_array = np.zeros(self.num_songs)
            y_array[songs_ids] = 1
            matrix_u_songs[u] = y_array
        np.save('tmp/{}/matrix_user_songs'.format(self.ds), matrix_u_songs)
        return matrix_u_songs
|
92 |
+
|