Spaces:
Runtime error
Runtime error
refactor
Browse files- .gitignore +0 -3
- README.md +31 -0
- data/{model_track_interactions.csv → model.csv} +0 -0
- {recommendation-api → data}/model.pkl +0 -0
- recommendation-api/recommender.py +24 -27
- recommendation-api/server.py +4 -2
- recommendation-api/tracks.py +43 -43
.gitignore
CHANGED
@@ -1,4 +1 @@
|
|
1 |
-
data/*
|
2 |
-
!data/music_info.csv
|
3 |
-
!data/model_track_interactions.csv
|
4 |
recommendation-api/__pycache__
|
|
|
|
|
|
|
|
|
1 |
recommendation-api/__pycache__
|
README.md
CHANGED
@@ -9,3 +9,34 @@ license: mit
|
|
9 |
---
|
10 |
|
11 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
---
|
10 |
|
11 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
12 |
+
|
13 |
+
# ai-academy-2024-group8
|
14 |
+
|
15 |
+
A lightweight backend API for song recommender.
|
16 |
+
|
17 |
+
The dataset used in this project is public and available [online](https://www.kaggle.com/datasets/undefinenull/million-song-dataset-spotify-lastfm)
|
18 |
+
|
19 |
+
## What's in here
|
20 |
+
|
21 |
+
- `data/`: Contains the trained `model.pkl` and related `model.csv` that has the training set in csv format
|
22 |
+
- `notebooks/`: Contains any jupyter notebooks used in the project
|
23 |
+
- `recommendation-api/`: A FastAPI app to serve user recommendations
|
24 |
+
|
25 |
+
## Running service locally
|
26 |
+
|
27 |
+
1. (Optional) Create and activate a python venv
|
28 |
+
2. Install the requirements `pip install -r requirements.txt`
|
29 |
+
3. Start the service `python recommendation-api/server.py`
|
30 |
+
|
31 |
+
Then
|
32 |
+
|
33 |
+
- `curl http://localhost:7860/users` to fetch list of supported users
|
34 |
+
- `curl http://localhost:7860/users/<id>` to fetch track history for individual user
|
35 |
+
- `curl http://localhost:7860/recommend/<id>` to recommend tracks for the specific user
|
36 |
+
|
37 |
+
## Running in Huggingface
|
38 |
+
|
39 |
+
Application is built and started on push to master.
|
40 |
+
|
41 |
+
Application is available from [here](https://schibsted-ai-academy-2024-gr8-recommender-api.hf.space/docs)
|
42 |
+
|
data/{model_track_interactions.csv → model.csv}
RENAMED
File without changes
|
{recommendation-api → data}/model.pkl
RENAMED
File without changes
|
recommendation-api/recommender.py
CHANGED
@@ -1,28 +1,25 @@
|
|
1 |
-
from fastai.learner import Learner
|
2 |
-
import pandas as pd
|
3 |
-
|
4 |
-
from tracks import get_unlistened_tracks_for_user, predictions_to_tracks
|
5 |
-
|
6 |
-
def get_recommendations_for_user(learn: Learner, user_id: str, limit: int = 5):
|
7 |
-
not_listened_tracks = get_unlistened_tracks_for_user(user_id)
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
tracks_with_predictions
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
"user_id": user_id,
|
26 |
-
"limit": limit,
|
27 |
-
"recommendations": recommendations
|
28 |
}
|
|
|
from fastai.learner import Learner
import pandas as pd

from tracks import get_unlistened_tracks_for_user, predictions_to_tracks


def get_recommendations_for_user(learn: Learner, user_id: str, limit: int = 5):
    """Recommend up to `limit` tracks the user has not listened to yet.

    Scores every unheard track with the trained Learner and returns the
    top-scoring ones as full track records.
    """
    candidates = get_unlistened_tracks_for_user(user_id)

    # Score every candidate track for this user with the model.
    frame = pd.DataFrame({'user_id': [user_id] * len(candidates), 'entry': candidates})
    preds = learn.get_preds(dl=learn.dls.test_dl(frame))

    # Pair each entry with its prediction score, best first.
    scored = sorted(zip(candidates, preds[0].numpy()),
                    key=lambda pair: pair[1], reverse=True)

    # Keep the top n and expand them back into full track dicts.
    return {
        "user_id": user_id,
        "limit": limit,
        "recommendations": predictions_to_tracks(scored[:limit])
    }
|
recommendation-api/server.py
CHANGED
@@ -5,10 +5,12 @@ import os
|
|
5 |
|
6 |
from tracks import get_top_tracks_for_user, get_users_with_track_interactions
|
7 |
from recommender import get_recommendations_for_user
|
8 |
-
|
|
|
|
|
9 |
|
10 |
app = FastAPI()
|
11 |
-
model_filename =
|
12 |
learn = None
|
13 |
|
14 |
@app.on_event("startup")
|
|
|
5 |
|
6 |
from tracks import get_top_tracks_for_user, get_users_with_track_interactions
|
7 |
from recommender import get_recommendations_for_user
|
8 |
+
|
9 |
+
# custom_accuracy needs to be imported to the global namespace for Learner to load
|
10 |
+
from learner import setup_learner, custom_accuracy
|
11 |
|
12 |
app = FastAPI()
|
13 |
+
model_filename = 'data/model.pkl'
|
14 |
learn = None
|
15 |
|
16 |
@app.on_event("startup")
|
recommendation-api/tracks.py
CHANGED
@@ -1,44 +1,44 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
|
3 |
-
# Read the CSV files
|
4 |
-
tracks_df = pd.read_csv('data/music_info.csv')
|
5 |
-
tracks_df.fillna('', inplace=True)
|
6 |
-
tracks_df["entry"] = tracks_df["name"] + ", " + tracks_df["artist"] + ", " + tracks_df["year"].astype(str)
|
7 |
-
track_interactions_df = pd.read_csv('data/
|
8 |
-
|
9 |
-
# Merge data on those two csvs
|
10 |
-
dataframe = pd.merge(tracks_df, track_interactions_df, on='track_id', how='left')
|
11 |
-
# Convert all columns to string type
|
12 |
-
dataframe = dataframe.astype(str)
|
13 |
-
# Create a history lookup dictionary by 'user_id'
|
14 |
-
user_to_track_history_dict = {user_id: group.drop('user_id', axis=1).to_dict('records')
|
15 |
-
for user_id, group in dataframe.groupby('user_id')}
|
16 |
-
|
17 |
-
def get_users_with_track_interactions(ascending=False, limit=10):
|
18 |
-
playcount_summary = track_interactions_df.groupby('user_id').size().reset_index(name='track_interactions')
|
19 |
-
playcount_summary.sort_values(by='track_interactions', ascending=ascending, inplace=True)
|
20 |
-
if limit is not None:
|
21 |
-
playcount_summary = playcount_summary.head(limit)
|
22 |
-
return playcount_summary.to_dict(orient='records')
|
23 |
-
|
24 |
-
def get_top_tracks_for_user(user_id: str, limit=10):
|
25 |
-
track_list = user_to_track_history_dict.get(user_id, [])
|
26 |
-
sorted_tracks = sorted(track_list, key=lambda x: int(x['playcount']) if 'playcount' in x and x['playcount'].isdigit() else 0, reverse=True)
|
27 |
-
if limit is not None:
|
28 |
-
sorted_tracks = sorted_tracks[:limit]
|
29 |
-
return sorted_tracks
|
30 |
-
|
31 |
-
def get_unlistened_tracks_for_user(user_id:str):
|
32 |
-
all_tracks = tracks_df['entry'].tolist()
|
33 |
-
listened_tracks = [track['entry'] for track in user_to_track_history_dict.get(user_id, [])]
|
34 |
-
return list(set(all_tracks) - set(listened_tracks))
|
35 |
-
|
36 |
-
def predictions_to_tracks(entries_and_predictions):
|
37 |
-
tracks = []
|
38 |
-
for entry, score in entries_and_predictions:
|
39 |
-
track_info = tracks_df[tracks_df['entry'] == entry]
|
40 |
-
if not track_info.empty:
|
41 |
-
track_dict = track_info.to_dict('records')[0]
|
42 |
-
track_dict['score'] = score.astype(str)
|
43 |
-
tracks.append(track_dict)
|
44 |
return tracks
|
|
|
import pandas as pd

# Read the CSV files
tracks_df = pd.read_csv('data/music_info.csv')
tracks_df.fillna('', inplace=True)
# Human-readable key used to match model predictions back to tracks.
tracks_df["entry"] = tracks_df["name"] + ", " + tracks_df["artist"] + ", " + tracks_df["year"].astype(str)

# Keep 'playcount' (when present) alongside the id columns:
# get_top_tracks_for_user sorts listening history by it, so selecting only
# ['user_id', 'track_id'] silently degrades that ranking to a constant 0.
_interactions = pd.read_csv('data/model.csv')
_wanted = [c for c in ('user_id', 'track_id', 'playcount') if c in _interactions.columns]
track_interactions_df = _interactions[_wanted]

# Merge data on those two csvs
dataframe = pd.merge(tracks_df, track_interactions_df, on='track_id', how='left')
# Convert all columns to string type
dataframe = dataframe.astype(str)
# Create a history lookup dictionary by 'user_id'
user_to_track_history_dict = {user_id: group.drop('user_id', axis=1).to_dict('records')
                              for user_id, group in dataframe.groupby('user_id')}
def get_users_with_track_interactions(ascending=False, limit=10):
    """Return users ranked by their number of track interactions.

    Each record has 'user_id' and 'track_interactions'; `limit=None`
    returns all users.
    """
    counts = (
        track_interactions_df
        .groupby('user_id')
        .size()
        .reset_index(name='track_interactions')
        .sort_values(by='track_interactions', ascending=ascending)
    )
    if limit is not None:
        counts = counts.head(limit)
    return counts.to_dict(orient='records')
def get_top_tracks_for_user(user_id: str, limit=10):
    """Return the user's listened tracks, most-played first.

    Unknown users yield an empty list; `limit=None` returns the full history.
    """
    def play_count(track):
        # Values are stringified; non-numeric or missing counts rank as 0.
        value = track.get('playcount', '')
        return int(value) if value.isdigit() else 0

    history = sorted(user_to_track_history_dict.get(user_id, []),
                     key=play_count, reverse=True)
    return history if limit is None else history[:limit]
def get_unlistened_tracks_for_user(user_id: str):
    """Return catalogue entries the given user has not listened to.

    Order is unspecified (set difference over the 'entry' strings).
    """
    heard = {record['entry'] for record in user_to_track_history_dict.get(user_id, [])}
    return list(set(tracks_df['entry'].tolist()) - heard)
def predictions_to_tracks(entries_and_predictions):
    """Expand (entry, score) pairs into full track records with a 'score' field.

    Entries that cannot be matched in the catalogue are skipped.
    """
    tracks = []
    for entry, score in entries_and_predictions:
        matches = tracks_df[tracks_df['entry'] == entry]
        if matches.empty:
            continue
        record = matches.to_dict('records')[0]
        # score is a numpy scalar — stored as its string form, like the rest of the record
        record['score'] = score.astype(str)
        tracks.append(record)
    return tracks