Spaces:
Runtime error
Runtime error
refactor
Browse files- .gitignore +0 -3
- README.md +31 -0
- data/{model_track_interactions.csv → model.csv} +0 -0
- {recommendation-api → data}/model.pkl +0 -0
- recommendation-api/recommender.py +24 -27
- recommendation-api/server.py +4 -2
- recommendation-api/tracks.py +43 -43
.gitignore
CHANGED
@@ -1,4 +1 @@
|
|
1 |
-
data/*
|
2 |
-
!data/music_info.csv
|
3 |
-
!data/model_track_interactions.csv
|
4 |
recommendation-api/__pycache__
|
|
|
|
|
|
|
|
|
1 |
recommendation-api/__pycache__
|
README.md
CHANGED
@@ -9,3 +9,34 @@ license: mit
|
|
9 |
---
|
10 |
|
11 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
---
|
10 |
|
11 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
12 |
+
|
13 |
+
# ai-academy-2024-group8
|
14 |
+
|
15 |
+
A lightweight backend API for song recommender.
|
16 |
+
|
17 |
+
The dataset used in this project is public and available [online](https://www.kaggle.com/datasets/undefinenull/million-song-dataset-spotify-lastfm)
|
18 |
+
|
19 |
+
## What's in here
|
20 |
+
|
21 |
+
- `data/`: Contains the trained `model.pkl` and related `model.csv` that has the training set in csv format
|
22 |
+
- `notebooks/`: Contains any jupyter notebooks used in the project
|
23 |
+
- `recommendation-api/`: A FastAPI app to serve user recommendations
|
24 |
+
|
25 |
+
## Running service locally
|
26 |
+
|
27 |
+
1. (Optional) Create and activate a python venv
|
28 |
+
2. Install the requirements `pip install -r requirements.txt`
|
29 |
+
3. Start the service `python recommendation-api/server.py`
|
30 |
+
|
31 |
+
Then
|
32 |
+
|
33 |
+
- `curl http://localhost:7860/users` to fetch list of supported users
|
34 |
+
- `curl http://localhost:7860/users/<id>` to fetch track history for individual user
|
35 |
+
- `curl http://localhost:7860/recommend/<id>` to recommend tracks for the specific user
|
36 |
+
|
37 |
+
## Running in Huggingface
|
38 |
+
|
39 |
+
Application is built and started on push to master.
|
40 |
+
|
41 |
+
Application is available from [here](https://schibsted-ai-academy-2024-gr8-recommender-api.hf.space/docs)
|
42 |
+
|
data/{model_track_interactions.csv → model.csv}
RENAMED
File without changes
|
{recommendation-api → data}/model.pkl
RENAMED
File without changes
|
recommendation-api/recommender.py
CHANGED
@@ -1,28 +1,25 @@
|
|
1 |
-
from fastai.learner import Learner
|
2 |
-
import pandas as pd
|
3 |
-
|
4 |
-
from tracks import get_unlistened_tracks_for_user, predictions_to_tracks
|
5 |
-
|
6 |
-
def get_recommendations_for_user(learn: Learner, user_id: str, limit: int = 5):
|
7 |
-
not_listened_tracks = get_unlistened_tracks_for_user(user_id)
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
tracks_with_predictions
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
"user_id": user_id,
|
26 |
-
"limit": limit,
|
27 |
-
"recommendations": recommendations
|
28 |
}
|
|
|
from fastai.learner import Learner
import pandas as pd

from tracks import get_unlistened_tracks_for_user, predictions_to_tracks


def get_recommendations_for_user(learn: Learner, user_id: str, limit: int = 5):
    """Recommend up to `limit` tracks the user has not listened to yet.

    Scores every unheard track with the trained Learner and returns the
    top-scoring ones as full track records.
    """
    candidates = get_unlistened_tracks_for_user(user_id)

    # Score every candidate track for this user with the model.
    frame = pd.DataFrame({'user_id': [user_id] * len(candidates), 'entry': candidates})
    preds = learn.get_preds(dl=learn.dls.test_dl(frame))

    # Pair each entry with its prediction score, best first.
    scored = sorted(zip(candidates, preds[0].numpy()),
                    key=lambda pair: pair[1], reverse=True)

    # Keep the top n and expand them back into full track dicts.
    return {
        "user_id": user_id,
        "limit": limit,
        "recommendations": predictions_to_tracks(scored[:limit])
    }
|
recommendation-api/server.py
CHANGED
@@ -5,10 +5,12 @@ import os
|
|
5 |
|
6 |
from tracks import get_top_tracks_for_user, get_users_with_track_interactions
|
7 |
from recommender import get_recommendations_for_user
|
8 |
-
|
|
|
|
|
9 |
|
10 |
app = FastAPI()
|
11 |
-
model_filename =
|
12 |
learn = None
|
13 |
|
14 |
@app.on_event("startup")
|
|
|
5 |
|
6 |
from tracks import get_top_tracks_for_user, get_users_with_track_interactions
|
7 |
from recommender import get_recommendations_for_user
|
8 |
+
|
9 |
+
# custom_accuracy needs to be imported to the global namespace for Learner to load
|
10 |
+
from learner import setup_learner, custom_accuracy
|
11 |
|
12 |
app = FastAPI()
|
13 |
+
model_filename = 'data/model.pkl'
|
14 |
learn = None
|
15 |
|
16 |
@app.on_event("startup")
|
recommendation-api/tracks.py
CHANGED
@@ -1,44 +1,44 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
|
3 |
-
# Read the CSV files
|
4 |
-
tracks_df = pd.read_csv('data/music_info.csv')
|
5 |
-
tracks_df.fillna('', inplace=True)
|
6 |
-
tracks_df["entry"] = tracks_df["name"] + ", " + tracks_df["artist"] + ", " + tracks_df["year"].astype(str)
|
7 |
-
track_interactions_df = pd.read_csv('data/
|
8 |
-
|
9 |
-
# Merge data on those two csvs
|
10 |
-
dataframe = pd.merge(tracks_df, track_interactions_df, on='track_id', how='left')
|
11 |
-
# Convert all columns to string type
|
12 |
-
dataframe = dataframe.astype(str)
|
13 |
-
# Create a history lookup dictionary by 'user_id'
|
14 |
-
user_to_track_history_dict = {user_id: group.drop('user_id', axis=1).to_dict('records')
|
15 |
-
for user_id, group in dataframe.groupby('user_id')}
|
16 |
-
|
17 |
-
def get_users_with_track_interactions(ascending=False, limit=10):
|
18 |
-
playcount_summary = track_interactions_df.groupby('user_id').size().reset_index(name='track_interactions')
|
19 |
-
playcount_summary.sort_values(by='track_interactions', ascending=ascending, inplace=True)
|
20 |
-
if limit is not None:
|
21 |
-
playcount_summary = playcount_summary.head(limit)
|
22 |
-
return playcount_summary.to_dict(orient='records')
|
23 |
-
|
24 |
-
def get_top_tracks_for_user(user_id: str, limit=10):
|
25 |
-
track_list = user_to_track_history_dict.get(user_id, [])
|
26 |
-
sorted_tracks = sorted(track_list, key=lambda x: int(x['playcount']) if 'playcount' in x and x['playcount'].isdigit() else 0, reverse=True)
|
27 |
-
if limit is not None:
|
28 |
-
sorted_tracks = sorted_tracks[:limit]
|
29 |
-
return sorted_tracks
|
30 |
-
|
31 |
-
def get_unlistened_tracks_for_user(user_id:str):
|
32 |
-
all_tracks = tracks_df['entry'].tolist()
|
33 |
-
listened_tracks = [track['entry'] for track in user_to_track_history_dict.get(user_id, [])]
|
34 |
-
return list(set(all_tracks) - set(listened_tracks))
|
35 |
-
|
36 |
-
def predictions_to_tracks(entries_and_predictions):
|
37 |
-
tracks = []
|
38 |
-
for entry, score in entries_and_predictions:
|
39 |
-
track_info = tracks_df[tracks_df['entry'] == entry]
|
40 |
-
if not track_info.empty:
|
41 |
-
track_dict = track_info.to_dict('records')[0]
|
42 |
-
track_dict['score'] = score.astype(str)
|
43 |
-
tracks.append(track_dict)
|
44 |
return tracks
|
|
|
import pandas as pd

# Read the CSV files
tracks_df = pd.read_csv('data/music_info.csv')
tracks_df.fillna('', inplace=True)
# Human-readable key used to match model predictions back to tracks.
tracks_df["entry"] = tracks_df["name"] + ", " + tracks_df["artist"] + ", " + tracks_df["year"].astype(str)

# Keep 'playcount' (when present) alongside the id columns:
# get_top_tracks_for_user sorts listening history by it, so selecting only
# ['user_id', 'track_id'] silently degrades that ranking to a constant 0.
_interactions = pd.read_csv('data/model.csv')
_wanted = [c for c in ('user_id', 'track_id', 'playcount') if c in _interactions.columns]
track_interactions_df = _interactions[_wanted]

# Merge data on those two csvs
dataframe = pd.merge(tracks_df, track_interactions_df, on='track_id', how='left')
# Convert all columns to string type
dataframe = dataframe.astype(str)
# Create a history lookup dictionary by 'user_id'
user_to_track_history_dict = {user_id: group.drop('user_id', axis=1).to_dict('records')
                              for user_id, group in dataframe.groupby('user_id')}
def get_users_with_track_interactions(ascending=False, limit=10):
    """Return users ranked by their number of track interactions.

    Each record has 'user_id' and 'track_interactions'; `limit=None`
    returns all users.
    """
    counts = (
        track_interactions_df
        .groupby('user_id')
        .size()
        .reset_index(name='track_interactions')
        .sort_values(by='track_interactions', ascending=ascending)
    )
    if limit is not None:
        counts = counts.head(limit)
    return counts.to_dict(orient='records')
def get_top_tracks_for_user(user_id: str, limit=10):
    """Return the user's listened tracks, most-played first.

    Unknown users yield an empty list; `limit=None` returns the full history.
    """
    def play_count(track):
        # Values are stringified; non-numeric or missing counts rank as 0.
        value = track.get('playcount', '')
        return int(value) if value.isdigit() else 0

    history = sorted(user_to_track_history_dict.get(user_id, []),
                     key=play_count, reverse=True)
    return history if limit is None else history[:limit]
def get_unlistened_tracks_for_user(user_id: str):
    """Return catalogue entries the given user has not listened to.

    Order is unspecified (set difference over the 'entry' strings).
    """
    heard = {record['entry'] for record in user_to_track_history_dict.get(user_id, [])}
    return list(set(tracks_df['entry'].tolist()) - heard)
def predictions_to_tracks(entries_and_predictions):
    """Expand (entry, score) pairs into full track records with a 'score' field.

    Entries that cannot be matched in the catalogue are skipped.
    """
    tracks = []
    for entry, score in entries_and_predictions:
        matches = tracks_df[tracks_df['entry'] == entry]
        if matches.empty:
            continue
        record = matches.to_dict('records')[0]
        # score is a numpy scalar — stored as its string form, like the rest of the record
        record['score'] = score.astype(str)
        tracks.append(record)
    return tracks