-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathRecommenders.py
216 lines (164 loc) · 9.12 KB
/
Recommenders.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
import numpy as np
import pandas
class popularity_recommender():
    """Popularity based recommender system model.

    Every item is scored by the number of (non-null) user entries recorded
    for it in the training data.  The ten highest-scoring items are cached by
    ``create`` and handed out, unchanged, to whichever user asks for
    recommendations.
    """

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        # Top-10 table built by create(); columns: <item_id>, 'score', 'Rank'.
        self.popularity_recommendations = None

    def create(self, train_data, user_id, item_id):
        """Build the popularity table from the training data.

        Args:
            train_data: pandas DataFrame with one row per (user, item) event.
            user_id: name of the column in ``train_data`` identifying users.
            item_id: name of the column in ``train_data`` identifying items.
        """
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

        # Count user entries per item and name that count 'score'.  The count
        # column is derived from self.user_id (not the literal 'user_id'), so
        # the model works whatever the caller's user column is called.
        item_scores = (
            train_data.groupby(self.item_id)[self.user_id]
            .count()
            .reset_index(name='score')
        )

        # Highest score first; ties are broken by item id, ascending.
        ranked = item_scores.sort_values(
            ['score', self.item_id], ascending=[False, True]
        )

        # method='first' hands out distinct ranks to tied scores.
        ranked['Rank'] = ranked['score'].rank(ascending=False, method='first')

        # Keep only the top 10 recommendations.
        self.popularity_recommendations = ranked.head(10)

    def recommend(self, user_id):
        """Return the cached top-10 table tagged with ``user_id``.

        Args:
            user_id: value written into the leading 'user_id' column of the
                returned frame.

        Returns:
            A new DataFrame with columns 'user_id', <item_id>, 'score', 'Rank'.
            ``create`` must have been called first.
        """
        # Work on a copy so the cached table is never modified by a request.
        user_recommendations = self.popularity_recommendations.copy()
        # Put the user_id column at the front.
        user_recommendations.insert(0, 'user_id', user_id)
        return user_recommendations
class item_similarity_recommender():
    """Item similarity based recommender system model.

    Two items are considered similar when they share users: the similarity of
    items a and b is the Jaccard index |U(a) & U(b)| / |U(a) | U(b)|, where
    U(x) is the set of users recorded for item x in the training data.  A
    candidate item's score is the mean of its similarities to the query items
    (a user's own items, or an explicit item list).
    """

    # Maximum number of rows returned by generate_top_recommendations().
    _TOP_N = 10

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        # The attributes below are initialised here but not assigned anywhere
        # else in this class; they are kept for callers that may read them.
        self.cooccurence_matrix = None
        self.songs_dict = None
        self.rev_songs_dict = None
        self.item_similarity_recommendations = None

    def get_user_items(self, user):
        """Return the unique items (songs) of ``user``, in first-seen order."""
        user_rows = self.train_data[self.train_data[self.user_id] == user]
        return list(user_rows[self.item_id].unique())

    def get_item_users(self, item):
        """Return the set of unique users recorded for ``item`` (song)."""
        item_rows = self.train_data[self.train_data[self.item_id] == item]
        return set(item_rows[self.user_id].unique())

    def get_all_items_train_data(self):
        """Return every unique item (song) of the training data, in first-seen order."""
        return list(self.train_data[self.item_id].unique())

    def _users_by_item(self):
        """Map each item of the training data to its set of unique users."""
        # One grouped pass replaces a full-frame boolean filter per item.
        grouped = self.train_data.groupby(self.item_id, sort=False)[self.user_id].unique()
        return {item: set(users) for item, users in grouped.items()}

    def construct_cooccurence_matrix(self, user_songs, all_songs):
        """Build the similarity matrix between query songs and all songs.

        Args:
            user_songs: list of query items (one matrix row each).
            all_songs: list of candidate items (one matrix column each).

        Returns:
            ``np.matrix`` of shape (len(user_songs), len(all_songs)); entry
            [j, i] is the Jaccard index of the user sets of ``user_songs[j]``
            and ``all_songs[i]`` (0 when they share no user).
        """
        users_by_item = self._users_by_item()
        # Items absent from the training data simply have no users.
        no_users = frozenset()
        user_songs_users = [users_by_item.get(song, no_users) for song in user_songs]

        scores = np.zeros((len(user_songs), len(all_songs)), dtype=float)
        for i, song in enumerate(all_songs):
            users_i = users_by_item.get(song, no_users)
            if not users_i:
                # No users at all: every intersection is empty, column stays 0.
                continue
            for j, users_j in enumerate(user_songs_users):
                shared = len(users_i & users_j)
                if shared:
                    # |A | B| = |A| + |B| - |A & B|; avoids building the union.
                    scores[j, i] = shared / float(len(users_i) + len(users_j) - shared)

        # np.matrix is kept as the return type for backward compatibility.
        return np.matrix(scores)

    def generate_top_recommendations(self, user, cooccurence_matrix, all_songs, user_songs):
        """Turn a similarity matrix into a table of at most ten recommendations.

        Args:
            user: value written into the 'user_id' column of the result.
            cooccurence_matrix: 2-D matrix/array as produced by
                ``construct_cooccurence_matrix`` (rows: ``user_songs``,
                columns: ``all_songs``).
            all_songs: list of candidate items matching the matrix columns.
            user_songs: query items; these are never recommended back.

        Returns:
            DataFrame with columns 'user_id', 'song', 'score', 'rank' ordered
            by descending score, or the integer -1 when nothing can be
            recommended (kept as a sentinel for existing callers).
        """
        similarities = np.asarray(cooccurence_matrix, dtype=float)
        print("Non zero values in cooccurence_matrix :%d" % np.count_nonzero(similarities))

        candidates = []
        # With no query songs there is nothing to average; skipping the
        # division avoids a divide-by-zero that would only yield NaN scores.
        if similarities.shape[0] > 0:
            # Plain mean of the similarity scores over all query songs.
            mean_scores = (similarities.sum(axis=0) / float(similarities.shape[0])).tolist()
            # (score, column index) pairs, best first; equal scores are
            # ordered by descending column index.
            candidates = sorted(
                ((score, idx) for idx, score in enumerate(mean_scores)),
                reverse=True,
            )

        already_owned = set(user_songs)
        rows = []
        for score, idx in candidates:
            if len(rows) >= self._TOP_N:
                break
            song = all_songs[idx]
            if np.isnan(score) or song in already_owned:
                continue
            rows.append([user, song, score, len(rows) + 1])

        # Handle the case where there are no recommendations.
        if not rows:
            print("The current user has no songs for training the item similarity based recommendation model.")
            return -1

        return pandas.DataFrame(rows, columns=['user_id', 'song', 'score', 'rank'])

    def create(self, train_data, user_id, item_id):
        """Store the training data and the names of its user and item columns.

        Args:
            train_data: pandas DataFrame with one row per (user, item) event.
            user_id: name of the column in ``train_data`` identifying users.
            item_id: name of the column in ``train_data`` identifying items.
        """
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

    def _recommend_from_items(self, user, user_songs):
        """Score every training item against ``user_songs`` and rank the result."""
        all_songs = self.get_all_items_train_data()
        print("no. of unique songs in the training set: %d" % len(all_songs))

        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)
        return self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs)

    def recommend(self, user):
        """Recommend items to ``user`` based on the items they already have.

        Returns:
            The table described in ``generate_top_recommendations``, or -1
            when no recommendation can be made.
        """
        user_songs = self.get_user_items(user)
        print("No. of unique songs for the user: %d" % len(user_songs))
        return self._recommend_from_items(user, user_songs)

    def get_similar_items(self, item_list):
        """Return the items most similar to the items in ``item_list``.

        Returns:
            The table described in ``generate_top_recommendations`` with an
            empty string in 'user_id', or -1 when no item qualifies.
        """
        return self._recommend_from_items("", item_list)