# Source: Hugging Face repo "hsKNN", file matrixRec.py (uploaded by hscrown, commit 617cd6a, 3.88 kB)
# -*- coding: utf-8 -*-
# Notebook shell magic ("!pip install datasets") is not valid Python syntax in a
# .py file, so it is kept here as a comment. Install the dependency from a shell:
#     pip install datasets
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from datasets import load_dataset
"""## ๋ฐ์ดํ„ฐ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ
"""
# df: place records (libraries, museums, parks) read from a CSV file; per the
# original note the columns are place_id, place_name, gu_name, type.
# read_csv is a pandas function, so it must be called through `pd`. It also has
# no `index` keyword (that belongs to DataFrame.to_csv), so only the encoding
# is passed.
df = pd.read_csv('places.csv', encoding='utf-8')
"""# ์‚ฌ์šฉ์ž ํ‰์ ๋ฐ์ดํ„ฐ """
# user_rating: per-user rating records read from a CSV file; later code reads
# its user_id, place_id and rating columns.
# read_csv is a pandas function, so it must be called through `pd`. It also has
# no `index` keyword (that belongs to DataFrame.to_csv), so only the encoding
# is passed.
user_rating = pd.read_csv('user_rating_1000.csv', encoding='utf-8')
"""์•„์ดํ…œ-ํŠน์„ฑ ๋ฐ์ดํ„ฐ ๋งŒ๋“ค๊ธฐ"""
# Item-feature table: keep only the place_id, type and place_name columns.
item_feature = df.loc[:, ['place_id', 'type', 'place_name']]
item_feature.head()  # first-rows preview; only displayed when run in a notebook
"""์ถ”์ฒœ์‹œ์Šคํ…œ๊ตฌํ˜„
"""
# Build the user x place rating pivot: one row per user_id, one column per
# place_id, each cell holding the rating (pivot_table aggregates duplicate
# user/place pairs with its default mean).
# The ratings are read from `user_rating`, the ratings frame loaded earlier in
# this file; the original referenced `user_place_data`, which is never defined
# and raised NameError.
df_user_place_ratings = user_rating.pivot_table(
    index='user_id', columns='place_id', values='rating'
)
df_user_place_ratings.head()  # first-rows preview; only displayed in a notebook
"""
์ดํ›„ ํ• ์ผ
1)pivot table์„ matrix๋กœ ๋ณ€ํ™˜
2)np.mean(axis = 1)์„ ํ†ตํ•ด ์žฅ์†Œ๋ณ„ ๊ฐ ์‚ฌ์šฉ์ž๋“ค์ด ๋งค๊ธฐ๋Š” ํ‰์  ํ‰๊ท ์„ ๊ตฌํ•จ
1์—์„œ ๊ตฌํ•œ ๊ฐ’๊ณผ 2์—์„œ ๊ตฌํ•œ ๊ฐ’์„ ๋นผ์„œ ์‚ฌ์šฉ์ž-ํ‰๊ท  ๋ฐ์ดํ„ฐ ๊ฐ’์„ ๋ณ€๊ฒฝ
"""
# Cast the column labels (place_id values) to str and strip surrounding
# whitespace.
df_user_place_ratings.columns = df_user_place_ratings.columns.astype(str).str.strip()
# matrix: the pivot-table values as a float NumPy array.
# User/place pairs with no rating are NaN after pivoting. They are filled with 0
# (the value this script treats as "not visited yet") so that the row means and
# the SVD input stay finite instead of turning into NaN.
matrix = df_user_place_ratings.fillna(0).to_numpy(dtype=float)
# user_ratings_mean: each user's mean rating (mean over the columns of a row).
user_ratings_mean = matrix.mean(axis=1)
# matrix_user_mean: the rating matrix with each user's mean rating subtracted.
matrix_user_mean = matrix - user_ratings_mean.reshape(-1, 1)
# First-rows preview of the centred matrix; only displayed in a notebook.
pd.DataFrame(matrix_user_mean, columns=df_user_place_ratings.columns).head()
# Truncated SVD provided by SciPy: returns U, the singular values, and V
# transposed.
# svds (default solver) requires 1 <= k < min(matrix.shape), so the 12 requested
# latent factors are capped to what the matrix size allows instead of raising
# on small data sets.
n_factors = min(12, min(matrix_user_mean.shape) - 1)
U, sigma, Vt = svds(matrix_user_mean, k=n_factors)
# svds returns the singular values as a 1-D array; np.diag expands them into the
# square diagonal matrix needed for the matrix products below.
sigma = np.diag(sigma)
sigma.shape  # notebook preview
# U @ sigma @ Vt is the rank-k approximation of the mean-centred matrix (not an
# exact reconstruction); adding each user's mean rating back puts the
# predictions on the original rating scale.
svd_user_predicted_ratings = U @ sigma @ Vt + user_ratings_mean.reshape(-1, 1)
# Keep the pivot's row index (user_id) and column labels so predictions can be
# looked up by user as well as by row position.
df_svd_preds = pd.DataFrame(
    svd_user_predicted_ratings,
    index=df_user_place_ratings.index,
    columns=df_user_place_ratings.columns,
)
df_svd_preds.head()  # notebook preview
df_svd_preds.shape  # notebook preview
# Sort one user's predicted ratings from highest to lowest.
user_id = 0  # predict ratings for member 0
# Look the user's row position up in the pivot index instead of assuming that
# user_id equals the row number, which only holds for contiguous ids starting
# at 0. Raises KeyError if the user has no row in the pivot.
user_row_number = df_user_place_ratings.index.get_loc(user_id)
sorted_user_predictions = df_svd_preds.iloc[user_row_number].sort_values(ascending=False)
sorted_user_predictions = pd.DataFrame(sorted_user_predictions.reset_index())
sorted_user_predictions.columns = ['place_id', 'predict_rating']
# The column labels were cast to str earlier in this script. Parse them
# numerically before the integer cast so that labels such as "12.0" (float
# place ids) convert cleanly instead of raising ValueError.
sorted_user_predictions['place_id'] = pd.to_numeric(
    sorted_user_predictions['place_id']
).astype('int64')
# From the original rating data, keep this user's rows, dropping ratings of 0
# (a rating of 0 marks a place the user has not visited yet).
user_data = user_rating.loc[
    (user_rating['user_id'] == user_id) & (user_rating['rating'] != 0.0)
]
# Visited places: join the user's ratings with the place features and order
# them by rating, highest first.
user_history = pd.merge(user_data, item_feature, on='place_id').sort_values(
    by='rating', ascending=False
)
# Recommendation candidates: places that do not appear in the user's history.
recommendations = item_feature.loc[
    ~item_feature['place_id'].isin(user_history['place_id'])
]