# MIT License
#
# Copyright (c) 2019 Mohamed-Achref MAIZA
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import multiprocessing
import os
import shutil
import urllib.error
import urllib.request
from time import time

import matplotlib.pyplot as plt
import matplotlib.style as style
import numpy as np
import pandas as pd
import tensorflow as tf
from joblib import Parallel, delayed
from tqdm import tqdm


def download_parallel(movies, image_dir):
    """Downloads images from the Internet in parallel.

    Args:
        movies (dataframe): must contain 'imdbId' and 'Poster' url columns
        image_dir (string): path to destination directory

    Returns:
        movies (dataframe): input dataframe without the posters that failed to download
    """
    # Create the list of destination filenames
    filenames = movies['imdbId'].apply(lambda imdbId: os.path.join(image_dir, str(imdbId) + '.jpg'))
    # Create the list of image urls
    urls = movies['Poster']
    # Create the destination directory (remove it first if it already exists)
    if os.path.exists(image_dir):
        print("Directory '{}' already exists and will be deleted.".format(image_dir))
        shutil.rmtree(image_dir)
    print("Created new directory '{}'".format(image_dir))
    os.makedirs(image_dir)

    # Define the function that downloads one single image
    def download_image(url, filename):
        try:
            urllib.request.urlretrieve(url, filename)
            return 0
        except (urllib.error.URLError, ValueError):
            # Narrowed from a bare except; URLError covers HTTP and connection
            # failures, ValueError covers malformed urls. Return the imdbId of
            # the failed download, cast to int so it matches the (assumed
            # numeric) 'imdbId' column in the isin() lookup below.
            return int(os.path.basename(filename).split('.')[0])

    # Download images in parallel
    start = time()
    print("\nDownloading...")
    num_cores = multiprocessing.cpu_count()
    ko_list = Parallel(n_jobs=num_cores)(delayed(download_image)(u, f) for f, u in zip(filenames, urls))
    print("\nDownload in parallel mode took %d seconds." % (time() - start))
    print("Success:", len([i for i in ko_list if i == 0]))
    print("Errors:", len([i for i in ko_list if i != 0]))
    # Remove the posters that failed to download from the dataframe
    ko_index = movies[movies['imdbId'].isin(ko_list)].index
    movies = movies.drop(ko_index)
    return movies


def download_sequential(movies, image_dir):
    """Downloads images from the Internet sequentially.

    Args:
        movies (dataframe): must contain 'imdbId' and 'Poster' url columns
        image_dir (string): path to destination directory

    Returns:
        movies (dataframe): input dataframe without the posters that failed to download
    """
    # Create the list of destination filenames
    filenames = movies['imdbId'].apply(lambda imdbId: os.path.join(image_dir, str(imdbId) + '.jpg'))
    # Create the list of image urls
    urls = movies['Poster']
    # Create the destination directory (remove it first if it already exists)
    if os.path.exists(image_dir):
        print("Directory '{}' already exists and will be deleted.".format(image_dir))
        shutil.rmtree(image_dir)
    print("Created new directory '{}'".format(image_dir))
    os.makedirs(image_dir)

    # Define the function that downloads one single image
    def download_image(url, filename):
        urllib.request.urlretrieve(url, filename)

    # Download images sequentially
    start = time()
    print("\nDownloading...")
    ko_list = []
    for i in tqdm(range(len(filenames))):
        filename = filenames.iloc[i]
        url = urls.iloc[i]
        try:
            download_image(url, filename)
        except (urllib.error.URLError, ValueError):
            # Record the imdbId of the failed download, cast to int so it
            # matches the (assumed numeric) 'imdbId' column in isin() below
            ko_list.append(int(os.path.basename(filename).split('.')[0]))
    print("\nDownload in sequential mode took %d seconds." % (time() - start))
    print("Success:", len(filenames) - len(ko_list))
    print("Errors:", len(ko_list))
    # Remove the posters that failed to download from the dataframe
    ko_index = movies[movies['imdbId'].isin(ko_list)].index
    movies = movies.drop(ko_index)
    return movies


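# Example usage (illustrative sketch): the CSV path and directory name below are
# assumptions, not part of this module; the dataframe must expose the 'imdbId'
# and 'Poster' columns that both downloaders expect.
#
#     movies = pd.read_csv('movies.csv')
#     movies = download_parallel(movies, 'posters')   # or download_sequential(movies, 'posters')

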
def micro_bce(y, y_hat):
    """Compute the micro binary cross-entropy on a batch of observations.

    Args:
        y (int32 Tensor): targets array of shape (BATCH_SIZE, N_LABELS)
        y_hat (float32 Tensor): probability matrix output from forward propagation of shape (BATCH_SIZE, N_LABELS)

    Returns:
        cost (scalar Tensor): value of the cost function for the batch
    """
    # Convert the target array to float32
    y = tf.cast(y, tf.float32)
    # Implement the cross-entropy loss for each observation and label
    cross_entropy = - y * tf.math.log(tf.maximum(y_hat, 1e-16)) - (1 - y) * tf.math.log(tf.maximum(1 - y_hat, 1e-16))
    # Average the binary cross-entropy across all batch observations and labels at once
    cost = tf.reduce_mean(cross_entropy)
    return cost


def macro_bce(y, y_hat):
    """Compute the macro binary cross-entropy on a batch of observations (average across all labels).

    Args:
        y (int32 Tensor): targets array of shape (BATCH_SIZE, N_LABELS)
        y_hat (float32 Tensor): probability matrix from forward propagation of shape (BATCH_SIZE, N_LABELS)

    Returns:
        cost (scalar Tensor): value of the cost function for the batch
    """
    # Convert the target array to float32
    y = tf.cast(y, tf.float32)
    # Implement the cross-entropy loss for each observation and label
    cross_entropy = - y * tf.math.log(tf.maximum(y_hat, 1e-16)) - (1 - y) * tf.math.log(tf.maximum(1 - y_hat, 1e-16))
    # Average the binary cross-entropy losses over the whole batch for each label
    cost = tf.reduce_mean(cross_entropy, axis=0)
    # Average the per-label losses across labels
    cost = tf.reduce_mean(cost)
    return cost


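# Note (added for clarity): both losses above implement the element-wise binary
# cross-entropy BCE(y, y_hat) = -y*log(y_hat) - (1 - y)*log(1 - y_hat), with
# probabilities floored at 1e-16 for numerical stability. "Micro" averages over
# all (observation, label) elements in one step, while "macro" first averages
# per label and then across labels; for a full rectangular batch the two
# reductions yield the same value.

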
def macro_double_soft_f1(y, y_hat):
    """Compute the macro soft F1-score as a cost (average 1 - soft-F1 across all labels).

    Uses probability values instead of binary predictions.
    This version computes the soft-F1 for both the positive and the negative class of each label.

    Args:
        y (int32 Tensor): targets array of shape (BATCH_SIZE, N_LABELS)
        y_hat (float32 Tensor): probability matrix from forward propagation of shape (BATCH_SIZE, N_LABELS)

    Returns:
        cost (scalar Tensor): value of the cost function for the batch
    """
    y = tf.cast(y, tf.float32)
    y_hat = tf.cast(y_hat, tf.float32)
    # Soft counts: probabilities stand in for hard 0/1 predictions
    tp = tf.reduce_sum(y_hat * y, axis=0)
    fp = tf.reduce_sum(y_hat * (1 - y), axis=0)
    fn = tf.reduce_sum((1 - y_hat) * y, axis=0)
    tn = tf.reduce_sum((1 - y_hat) * (1 - y), axis=0)
    soft_f1_class1 = 2 * tp / (2 * tp + fn + fp + 1e-16)
    soft_f1_class0 = 2 * tn / (2 * tn + fn + fp + 1e-16)
    cost_class1 = 1 - soft_f1_class1  # reduce 1 - soft-f1_class1 in order to increase soft-f1 on class 1
    cost_class0 = 1 - soft_f1_class0  # reduce 1 - soft-f1_class0 in order to increase soft-f1 on class 0
    cost = 0.5 * (cost_class1 + cost_class0)  # take into account both class 1 and class 0
    macro_cost = tf.reduce_mean(cost)  # average on all labels
    return macro_cost


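# Example usage (illustrative sketch, not part of the original module): because
# the soft-F1 cost is differentiable, it can be passed to Keras directly as a
# loss; 'model' is assumed to be a tf.keras model with N_LABELS sigmoid outputs.
#
#     model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
#                   loss=macro_double_soft_f1)

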
def learning_curves(history):
    """Plot the learning curves of loss and macro F1-score
    for the training and validation datasets.

    Args:
        history: history callback of fitting a tensorflow keras model

    Returns:
        loss, val_loss, macro_f1, val_macro_f1 (lists): metric values per epoch
    """
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    macro_f1 = history.history['macro_f1']
    val_macro_f1 = history.history['val_macro_f1']
    epochs = len(loss)

    style.use("bmh")
    plt.figure(figsize=(8, 8))

    plt.subplot(2, 1, 1)
    plt.plot(range(1, epochs + 1), loss, label='Training Loss')
    plt.plot(range(1, epochs + 1), val_loss, label='Validation Loss')
    plt.legend(loc='upper right')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')

    plt.subplot(2, 1, 2)
    plt.plot(range(1, epochs + 1), macro_f1, label='Training Macro F1-score')
    plt.plot(range(1, epochs + 1), val_macro_f1, label='Validation Macro F1-score')
    plt.legend(loc='lower right')
    plt.ylabel('Macro F1-score')
    plt.title('Training and Validation Macro F1-score')
    plt.xlabel('epoch')

    plt.show()
    return loss, val_loss, macro_f1, val_macro_f1


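# Note: the 'macro_f1' and 'val_macro_f1' history keys above assume the model
# was compiled with a metric registered under the name 'macro_f1'; if it was
# not, history.history will not contain those keys and learning_curves will
# raise a KeyError.

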
def perf_grid(ds, target, label_names, model, n_thresh=100):
    """Computes the performance table containing target, label names,
    label frequencies, thresholds between 0 and 1, number of tp, fp, fn,
    and precision, recall and f1-score metrics for each label.

    Args:
        ds (tf.data.Dataset): contains the features array
        target (numpy array): target matrix of shape (BATCH_SIZE, N_LABELS)
        label_names (list of strings): column names in target matrix
        model (tensorflow keras model): model to use for prediction
        n_thresh (int): number of threshold steps between 0 and 1 (n_thresh + 1 values are tried)

    Returns:
        grid (Pandas dataframe): performance table
    """
    # Get predictions
    y_hat_val = model.predict(ds)
    # Define the target matrix
    y_val = target
    # Find label frequencies in the validation set
    label_freq = target.sum(axis=0)
    # Get label indexes
    label_index = list(range(len(label_names)))
    # Define the thresholds
    thresholds = np.linspace(0, 1, n_thresh + 1).astype(np.float32)

    # Compute all metrics for all labels
    ids, labels, freqs, tps, fps, fns, precisions, recalls, f1s = [], [], [], [], [], [], [], [], []
    for l in label_index:
        for thresh in thresholds:
            ids.append(l)
            labels.append(label_names[l])
            freqs.append(round(label_freq[l] / len(y_val), 2))
            y_hat = y_hat_val[:, l]
            y = y_val[:, l]
            y_pred = y_hat > thresh
            tp = np.count_nonzero(y_pred * y)
            fp = np.count_nonzero(y_pred * (1 - y))
            fn = np.count_nonzero((1 - y_pred) * y)
            precision = tp / (tp + fp + 1e-16)
            recall = tp / (tp + fn + 1e-16)
            f1 = 2 * tp / (2 * tp + fn + fp + 1e-16)
            tps.append(tp)
            fps.append(fp)
            fns.append(fn)
            precisions.append(precision)
            recalls.append(recall)
            f1s.append(f1)

    # Create the performance dataframe
    grid = pd.DataFrame({
        'id': ids,
        'label': labels,
        'freq': freqs,
        'threshold': list(thresholds) * len(label_index),
        'tp': tps,
        'fp': fps,
        'fn': fns,
        'precision': precisions,
        'recall': recalls,
        'f1': f1s})
    grid = grid[['id', 'label', 'freq', 'threshold',
                 'tp', 'fn', 'fp', 'precision', 'recall', 'f1']]
    return grid


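# Example usage (illustrative sketch): select, for each label, the threshold
# that maximizes the f1 column of the grid returned by perf_grid.
#
#     best = grid.loc[grid.groupby('label')['f1'].idxmax()]
#     best_thresholds = dict(zip(best['label'], best['threshold']))

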
def print_time(t):
    """Converts a duration in seconds into a h:m:s formatted string.

    Args:
        t (int): duration in seconds

    Returns:
        s (string): formatted duration
    """
    h = t // 3600
    m = (t % 3600) // 60
    s = (t % 3600) % 60
    return '%dh:%dm:%ds' % (h, m, s)


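if __name__ == '__main__':
    # Minimal smoke test (added for illustration): evaluate the losses on a tiny
    # random batch and format a duration. Values below are arbitrary.
    rng = np.random.default_rng(0)
    y = tf.constant(rng.integers(0, 2, size=(4, 3)), dtype=tf.int32)
    y_hat = tf.constant(rng.uniform(size=(4, 3)), dtype=tf.float32)
    print('micro BCE :', float(micro_bce(y, y_hat)))
    print('macro BCE :', float(macro_bce(y, y_hat)))
    print('soft-F1   :', float(macro_double_soft_f1(y, y_hat)))
    print('duration  :', print_time(3725))  # expected: 1h:2m:5s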