hscrown committed on
Commit
617cd6a
•
1 Parent(s): 178bcef

Upload matrixRec.py

Browse files
Files changed (1) hide show
  1. matrixRec.py +105 -0
matrixRec.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# -*- coding: utf-8 -*-
# Dependency note: the original line here was the IPython shell magic
# `!pip install datasets`, which is a SyntaxError in a plain .py file.
# Install the package from a shell instead:  pip install datasets

from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
# Silence library warnings for everything imported or executed after this point.
warnings.filterwarnings("ignore")
from datasets import load_dataset
# ---------------------------------------------------------------------------
# SVD-based place recommender.
#
# Inputs (read in the __main__ section below):
#   * places.csv            - place catalogue; the columns used here are
#                             place_id, type and place_name.
#   * user_rating_1000.csv  - long-format ratings; the columns used here are
#                             user_id, place_id and rating.
#
# Pipeline:
#   1. pivot the ratings into a user x place matrix (unrated cells -> 0),
#   2. subtract each user's mean rating,
#   3. take a truncated SVD and rebuild a low-rank approximation,
#   4. add the user means back to obtain predicted ratings,
#   5. rank the places a user has not rated by predicted rating.
# ---------------------------------------------------------------------------


def build_user_place_matrix(user_rating):
    """Pivot long-format ratings into a dense user x place rating table.

    Args:
        user_rating: DataFrame with ``user_id``, ``place_id`` and ``rating``
            columns.

    Returns:
        DataFrame indexed by ``user_id`` with one column per ``place_id``.
        Cells without a rating are filled with 0 so the matrix contains no
        NaN (the rest of the script treats a rating of 0 as "not visited").
        Duplicate (user, place) pairs are averaged, which is the
        ``pivot_table`` default.
    """
    pivot = user_rating.pivot_table(
        index='user_id', columns='place_id', values='rating'
    )
    # Without this fill the row means below would be NaN and svds would fail.
    return pivot.fillna(0)


def predict_ratings(df_user_place_ratings, k=12):
    """Predict every user/place rating from a rank-``k`` truncated SVD.

    Args:
        df_user_place_ratings: NaN-free user x place rating table, e.g. the
            output of ``build_user_place_matrix``.
        k: Requested number of singular values. It is clamped to
            ``min(n_users, n_places) - 1``, the largest value the default
            ``svds`` solver accepts.

    Returns:
        DataFrame of predicted ratings with the same index (user ids) and
        columns (place ids) as the input.

    Raises:
        ValueError: If ``k`` is smaller than 1 or the table has fewer than
            two users or two places.
    """
    matrix = df_user_place_ratings.to_numpy(dtype=float)

    max_rank = min(matrix.shape) - 1
    if k < 1 or max_rank < 1:
        raise ValueError(
            "truncated SVD needs k >= 1 and at least 2 users and 2 places; "
            f"got k={k}, matrix shape={matrix.shape}"
        )
    k = min(k, max_rank)

    # Per-user mean rating, kept as a column vector so it broadcasts by row.
    user_ratings_mean = matrix.mean(axis=1, keepdims=True)
    matrix_user_mean = matrix - user_ratings_mean

    # svds returns the singular values as a 1-D array; np.diag expands them
    # into the diagonal matrix needed for the matrix product.
    U, sigma, Vt = svds(matrix_user_mean, k=k)
    svd_user_predicted_ratings = U @ np.diag(sigma) @ Vt + user_ratings_mean

    # Keep the user_id index so rows can be looked up by id, not by position.
    return pd.DataFrame(
        svd_user_predicted_ratings,
        index=df_user_place_ratings.index,
        columns=df_user_place_ratings.columns,
    )


def recommend_places(df_svd_preds, user_id, item_feature, user_rating,
                     num_recommendations=None):
    """Rank the places a user has not rated by predicted rating.

    Args:
        df_svd_preds: Predicted ratings indexed by user id, one column per
            place id (output of ``predict_ratings``).
        user_id: Id of the user to recommend for; must be in the index of
            ``df_svd_preds``.
        item_feature: Place catalogue containing a ``place_id`` column.
        user_rating: Long-format ratings with ``user_id``, ``place_id`` and
            ``rating`` columns.
        num_recommendations: Optional cap on the number of rows returned;
            ``None`` returns every candidate.

    Returns:
        Tuple ``(user_history, recommendations)``. ``user_history`` holds the
        user's non-zero ratings joined with ``item_feature``, best rated
        first. ``recommendations`` holds the remaining places with a
        ``predict_rating`` column, highest prediction first; places that have
        no prediction get NaN and sort last.

    Raises:
        KeyError: If ``user_id`` is not in ``df_svd_preds.index``.
    """
    sorted_user_predictions = (
        df_svd_preds.loc[user_id]
        .sort_values(ascending=False)
        .rename('predict_rating')
        .rename_axis('place_id')
        .reset_index()
    )

    # A rating of 0 means the place has not been visited yet, so it is not
    # part of the user's history.
    rated_by_user = (user_rating['user_id'] == user_id) & (user_rating['rating'] != 0.0)
    user_history = (
        user_rating[rated_by_user]
        .merge(item_feature, on='place_id')
        .sort_values(['rating'], ascending=False)
    )

    # Candidates are the places the user has not rated; attach the predicted
    # rating and rank by it. A left merge keeps places lacking a prediction.
    not_rated = ~item_feature['place_id'].isin(user_history['place_id'])
    recommendations = (
        item_feature[not_rated]
        .merge(sorted_user_predictions, on='place_id', how='left')
        .sort_values('predict_rating', ascending=False)
    )
    if num_recommendations is not None:
        recommendations = recommendations.head(num_recommendations)

    return user_history, recommendations


if __name__ == "__main__":
    # --- Data loading and preprocessing ------------------------------------
    # Place catalogue (libraries, museums, parks).
    df = pd.read_csv('places.csv', encoding='utf-8')

    # User rating data.
    user_rating = pd.read_csv('user_rating_1000.csv', encoding='utf-8')

    # Item-feature table: keep only the id, type and name of each place.
    item_feature = df[['place_id', 'type', 'place_name']]

    # --- Recommender ---------------------------------------------------------
    # User x place rating pivot.
    df_user_place_ratings = build_user_place_matrix(user_rating)

    # Low-rank reconstruction plus user means = predicted ratings.
    df_svd_preds = predict_ratings(df_user_place_ratings, k=12)

    # Predict for user 0.
    user_id = 0
    user_history, recommendations = recommend_places(
        df_svd_preds, user_id, item_feature, user_rating
    )

    print(recommendations.head(10))