Upload matrixRec.py
Browse files- matrixRec.py +105 -0
matrixRec.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# NOTE: "!pip" is IPython/Colab shell syntax and a SyntaxError in a plain .py
# file, so the install step is kept as a comment. Run it from a shell instead.
# !pip install datasets
|
3 |
+
|
4 |
+
from sklearn.decomposition import TruncatedSVD
|
5 |
+
from scipy.sparse.linalg import svds
|
6 |
+
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
import seaborn as sns
|
9 |
+
import pandas as pd
|
10 |
+
import numpy as np
|
11 |
+
import warnings
|
12 |
+
warnings.filterwarnings("ignore")
|
13 |
+
from datasets import load_dataset
|
14 |
+
|
15 |
+
"""## Data loading and preprocessing
|
16 |
+
|
17 |
+
"""
|
18 |
+
|
19 |
+
# df: library, museum and park data with columns place_id, place_name, gu_name, type; read from a CSV file
|
20 |
+
# pandas is imported as `pd`, so read_csv must be qualified; `index` is a
# DataFrame.to_csv option that read_csv does not accept.
df = pd.read_csv('places.csv', encoding='utf-8')
|
21 |
+
|
22 |
+
"""# User rating data """
|
23 |
+
# pandas is imported as `pd`, so read_csv must be qualified; `index` is a
# DataFrame.to_csv option that read_csv does not accept.
user_rating = pd.read_csv('user_rating_1000.csv', encoding='utf-8')
|
24 |
+
|
25 |
+
"""Build the item-feature data"""
|
26 |
+
# Extract only place_id, type and place_name
|
27 |
+
item_feature = df[['place_id', 'type', 'place_name']]
|
28 |
+
item_feature.head()
|
29 |
+
|
30 |
+
"""Recommender system implementation
|
31 |
+
"""
|
32 |
+
|
33 |
+
# Build the user x place rating pivot table
|
34 |
+
# `user_place_data` is never defined in this script; the ratings table loaded
# above is `user_rating`. Missing user/place pairs become NaN in the pivot,
# which would turn the row means and the SVD input into NaN, so fill them with
# 0 -- the value this script already treats as "not rated".
df_user_place_ratings = user_rating.pivot_table(index='user_id', columns='place_id', values='rating').fillna(0)
|
35 |
+
df_user_place_ratings.head()
|
36 |
+
|
37 |
+
"""
|
38 |
+
Next steps
|
39 |
+
1) Convert the pivot table to a matrix
|
40 |
+
2) Use np.mean(axis = 1) to compute each user's mean rating across places
|
41 |
+
Subtract the result of (2) from the matrix in (1) to mean-center each user's ratings
|
42 |
+
"""
|
43 |
+
|
44 |
+
# Convert float-typed column labels to str
|
45 |
+
df_user_place_ratings.columns = df_user_place_ratings.columns.astype(str)
|
46 |
+
|
47 |
+
# df_user_place_ratings: the pivot_table whose values are turned into a numpy matrix
|
48 |
+
df_user_place_ratings.columns = df_user_place_ratings.columns.str.strip() # strip whitespace from the labels
|
49 |
+
matrix = df_user_place_ratings.values # as_matrix() is deprecated, so .values is used instead
|
50 |
+
|
51 |
+
# user_ratings_mean: each user's mean rating
|
52 |
+
user_ratings_mean = np.mean(matrix, axis = 1)
|
53 |
+
|
54 |
+
# matrix_user_mean: the user-item matrix with each user's mean rating subtracted
|
55 |
+
matrix_user_mean = matrix - user_ratings_mean.reshape(-1, 1)
|
56 |
+
|
57 |
+
pd.DataFrame(matrix_user_mean, columns = df_user_place_ratings.columns).head()
|
58 |
+
|
59 |
+
# SVD as provided by scipy.
|
60 |
+
# Returns the U matrix, the sigma values, and the transposed V matrix.
|
61 |
+
U, sigma, Vt = svds(matrix_user_mean, k = 12)
|
62 |
+
|
63 |
+
# At this point sigma holds only the nonzero singular values, as a 1-D array.
|
64 |
+
# So, to expand it into a square diagonal matrix (zeros off the diagonal), use numpy's diag.
|
65 |
+
|
66 |
+
sigma = np.diag(sigma)
|
67 |
+
|
68 |
+
sigma.shape
|
69 |
+
|
70 |
+
# Multiplying U, Sigma and Vt reconstructs (a rank-k approximation of) the original matrix.
|
71 |
+
# Then add each user's mean rating back on top of that.
|
72 |
+
# Rebuild the mean-centered matrix from its factors, then restore the per-user offset.
svd_user_predicted_ratings = (U @ sigma) @ Vt + user_ratings_mean.reshape(-1, 1)
|
73 |
+
|
74 |
+
df_svd_preds = pd.DataFrame(svd_user_predicted_ratings, columns = df_user_place_ratings.columns)
|
75 |
+
df_svd_preds.head()
|
76 |
+
|
77 |
+
df_svd_preds.shape
|
78 |
+
|
79 |
+
# Code that sorts and returns the prediction results
|
80 |
+
|
81 |
+
# The user's predicted ratings, sorted from highest to lowest
|
82 |
+
# user_id starts at 0, so it is used directly as user_row_number. If ids started at 1, use user_id - 1.
|
83 |
+
user_id = 0 # predict ratings for user number 0
|
84 |
+
user_row_number = user_id
|
85 |
+
sorted_user_predictions = df_svd_preds.iloc[user_row_number].sort_values(ascending=False)
|
86 |
+
|
87 |
+
sorted_user_predictions = pd.DataFrame(sorted_user_predictions.reset_index())
|
88 |
+
sorted_user_predictions.columns = ['place_id', 'predict_rating']
|
89 |
+
|
90 |
+
# The column labels were stringified earlier; if place_id was float-typed they
# look like "12.0", which a direct int64 cast rejects. Going through float
# accepts both "12" and "12.0".
sorted_user_predictions['place_id'] = sorted_user_predictions['place_id'].astype('float64').astype('int64')
|
91 |
+
|
92 |
+
# Extract the rows for this user id from the original rating data
|
93 |
+
user_data = user_rating[user_rating['user_id'] == user_id]
|
94 |
+
|
95 |
+
# Rows in user_data with a rating of 0 are places not yet visited, so drop them
|
96 |
+
user_data = user_data[user_data['rating'] != 0.0]
|
97 |
+
|
98 |
+
# (Visited places) merge the extracted rows with the original place data
|
99 |
+
user_history = user_data.merge(item_feature, on='place_id').sort_values(['rating'], ascending=False)
|
100 |
+
|
101 |
+
# Use the places the user has not rated as the recommendation candidates
|
102 |
+
recommendations = item_feature[~item_feature['place_id'].isin(user_history['place_id'])]
|
103 |
+
|
104 |
+
|
105 |
+
|