-
Notifications
You must be signed in to change notification settings - Fork 11
/
helpers.py
205 lines (168 loc) · 6.17 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import scipy.sparse as sp
import numpy as np
def threshold_interactions_df(df, row_name, col_name, row_min, col_min):
    """Limit interactions df to minimum row and column interactions.

    Parameters
    ----------
    df : DataFrame
        DataFrame which contains a single row for each interaction between
        two entities. Typically, the two entities are a user and an item.
    row_name : str
        Name of column in df which corresponds to the eventual row in the
        interactions matrix.
    col_name : str
        Name of column in df which corresponds to the eventual column in the
        interactions matrix.
    row_min : int
        Minimum number of interactions that the row entity has had with
        distinct column entities.
    col_min : int
        Minimum number of interactions that the column entity has had with
        distinct row entities.

    Returns
    -------
    df : DataFrame
        Thresholded version of the input df. Order of rows is not preserved.

    Examples
    --------
    df looks like:

    user_id | item_id
    =================
      1001  |  2002
      1001  |  2004
      1002  |  2002

    thus, row_name = 'user_id', and col_name = 'item_id'

    If we were to set row_min = 2 and col_min = 1, then the returned df would
    look like

    user_id | item_id
    =================
      1001  |  2002
      1001  |  2004
    """
    n_rows = df[row_name].unique().shape[0]
    n_cols = df[col_name].unique().shape[0]
    sparsity = float(df.shape[0]) / float(n_rows*n_cols) * 100
    print('Starting interactions info')
    print('Number of rows: {}'.format(n_rows))
    print('Number of cols: {}'.format(n_cols))
    print('Sparsity: {:4.3f}%'.format(sparsity))

    done = False
    while not done:
        starting_shape = df.shape[0]
        # Interactions per *row* entity; drop row entities below row_min.
        # BUG FIX: the original filtered these counts against col_min (and
        # the per-column counts against row_min), contradicting both the
        # docstring and the worked example above.
        row_counts = df.groupby(row_name)[col_name].count()
        df = df[~df[row_name].isin(row_counts[row_counts < row_min].index.tolist())]
        # Interactions per *column* entity; drop column entities below col_min.
        col_counts = df.groupby(col_name)[row_name].count()
        df = df[~df[col_name].isin(col_counts[col_counts < col_min].index.tolist())]
        ending_shape = df.shape[0]
        # Dropping rows can push columns below threshold (and vice versa),
        # so repeat until a full pass removes nothing.
        if starting_shape == ending_shape:
            done = True

    n_rows = df[row_name].unique().shape[0]
    n_cols = df[col_name].unique().shape[0]
    sparsity = float(df.shape[0]) / float(n_rows*n_cols) * 100
    print('Ending interactions info')
    print('Number of rows: {}'.format(n_rows))
    print('Number of columns: {}'.format(n_cols))
    print('Sparsity: {:4.3f}%'.format(sparsity))
    return df
def get_df_matrix_mappings(df, row_name, col_name):
    """Map entities in interactions df to row and column indices

    Parameters
    ----------
    df : DataFrame
        Interactions DataFrame.
    row_name : str
        Name of column in df which contains row entities.
    col_name : str
        Name of column in df which contains column entities.

    Returns
    -------
    rid_to_idx : dict
        Maps row ID's to the row index in the eventual interactions matrix.
    idx_to_rid : dict
        Reverse of rid_to_idx. Maps row index to row ID.
    cid_to_idx : dict
        Same as rid_to_idx but for column ID's
    idx_to_cid : dict
        Reverse of cid_to_idx. Maps column index to column ID.
    """
    # Indices are assigned in first-appearance order of each unique ID.
    unique_rids = df[row_name].unique().tolist()
    unique_cids = df[col_name].unique().tolist()

    rid_to_idx = {rid: pos for pos, rid in enumerate(unique_rids)}
    idx_to_rid = {pos: rid for pos, rid in enumerate(unique_rids)}

    cid_to_idx = {cid: pos for pos, cid in enumerate(unique_cids)}
    idx_to_cid = {pos: cid for pos, cid in enumerate(unique_cids)}

    return rid_to_idx, idx_to_rid, cid_to_idx, idx_to_cid
def df_to_matrix(df, row_name, col_name):
    """Take interactions dataframe and convert to a sparse matrix

    Parameters
    ----------
    df : DataFrame
        Interactions DataFrame with one row per interaction.
    row_name : str
        Name of column in df which contains row entities.
    col_name : str
        Name of column in df which contains column entities.

    Returns
    -------
    interactions : sparse csr matrix
        Matrix of ones, one per (row entity, column entity) interaction.
    rid_to_idx : dict
        Maps row ID's to matrix row indices.
    idx_to_rid : dict
        Reverse of rid_to_idx.
    cid_to_idx : dict
        Maps column ID's to matrix column indices.
    idx_to_cid : dict
        Reverse of cid_to_idx.
    """
    rid_to_idx, idx_to_rid,\
        cid_to_idx, idx_to_cid = get_df_matrix_mappings(df,
                                                        row_name,
                                                        col_name)

    # BUG FIX: Series.as_matrix() was removed in pandas 1.0; .values is the
    # compatible way to get the underlying ndarray.
    I = df[row_name].map(rid_to_idx).values
    J = df[col_name].map(cid_to_idx).values
    V = np.ones(I.shape[0])
    # Pass the shape explicitly so the matrix covers every mapped entity
    # rather than relying on the maximum index present.
    interactions = sp.coo_matrix((V, (I, J)),
                                 shape=(len(rid_to_idx), len(cid_to_idx)),
                                 dtype=np.float64)
    interactions = interactions.tocsr()
    return interactions, rid_to_idx, idx_to_rid, cid_to_idx, idx_to_cid
def train_test_split(interactions, split_count, fraction=None):
    """
    Split recommendation data into train and test sets

    Params
    ------
    interactions : scipy.sparse matrix
        Interactions between users and items.
    split_count : int
        Number of user-item-interactions per user to move
        from training to test set.
    fraction : float
        Fraction of users to split off some of their
        interactions into test set. If None, then all
        users are considered.

    Returns
    -------
    train : scipy.sparse csr matrix
        Training interactions with the held-out entries zeroed out.
    test : scipy.sparse csr matrix
        Held-out interactions for the selected users.
    user_index : list or range
        Row indices of the users whose interactions were split.
    """
    # Note: likely not the fastest way to do things below.
    train = interactions.copy().tocoo()
    test = sp.lil_matrix(train.shape)

    if fraction:
        try:
            # Only sample users with enough interactions to leave at least
            # split_count of them in the training set after the split.
            user_index = np.random.choice(
                np.where(np.bincount(train.row) >= split_count * 2)[0],
                replace=False,
                size=np.int64(np.floor(fraction * train.shape[0]))
            ).tolist()
        except ValueError:
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. np.random.choice raises
            # ValueError when the eligible population is smaller than size.
            print(('Not enough users with > {} '
                   'interactions for fraction of {}')\
                  .format(2*split_count, fraction))
            raise
    else:
        user_index = range(train.shape[0])

    train = train.tolil()

    for user in user_index:
        test_interactions = np.random.choice(interactions.getrow(user).indices,
                                             size=split_count,
                                             replace=False)
        train[user, test_interactions] = 0.
        # These are just 1.0 right now
        test[user, test_interactions] = interactions[user, test_interactions]

    # Test and training are truly disjoint
    assert(train.multiply(test).nnz == 0)
    return train.tocsr(), test.tocsr(), user_index