# utils.py
import gc
import json
import re
import string
import typing
from collections import Counter

import numpy as np
import spacy
import torch
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch import nn
from torch.autograd import Variable
from tqdm import tqdm

import model
import parsing
import squad


def evaluate(predictions):
    '''
    Takes a dictionary of predictions with question_id as key and the
    predicted answer as value. The validation dataset has multiple
    answers for a single question, so each prediction is compared
    against all of the answers and the one that gives the maximum
    metric (em or f1) is kept.

    This method first parses the JSON file, gathers all the answers
    for a given id, and then passes the list of answers and the
    prediction to calculate em and f1.

    :param dict predictions: mapping of question_id to predicted answer text
    Returns
        : exact_match: percentage of questions whose prediction matches a
                       ground-truth answer exactly after normalization.
        : f1_score: average token-level F1 (as a percentage) against the
                    best-matching ground truth.
    '''
    with open('./data/squad_dev.json', 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    dataset = dataset['data']
    f1 = exact_match = total = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                if qa['id'] not in predictions:
                    continue
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = predictions[qa['id']]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return exact_match, f1
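
# A minimal usage sketch (the ids and answer strings below are hypothetical,
# assuming the model's inference loop produces a plain dict of id -> answer):
#
#     predictions = {'question_id_1': 'Denver Broncos',
#                    'question_id_2': '1867'}
#     em, f1 = evaluate(predictions)
#     print(f'EM: {em:.2f}  F1: {f1:.2f}')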


def normalize_answer(s):
    '''
    Performs a series of cleaning steps on the ground truth and
    predicted answer: lower-casing, punctuation removal, article
    removal, and whitespace normalization.
    '''
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
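
# For example (a hypothetical input, not taken from the dataset):
#     normalize_answer('The Eiffel Tower!')  ->  'eiffel tower'
# (lower-cased, punctuation stripped, the article "The" removed, and
# whitespace collapsed).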


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    '''
    Returns the maximum value of a metric for the model's prediction
    against multiple ground truths.

    :param func metric_fn: can be 'exact_match_score' or 'f1_score'
    :param str prediction: predicted answer span by the model
    :param list ground_truths: list of ground truths against which the
                               metric is calculated. The maximum value
                               of the metric is chosen.
    '''
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)
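
# Sketch of how this is used during evaluation (hypothetical strings):
#     metric_max_over_ground_truths(
#         exact_match_score, 'Denver Broncos',
#         ['Denver Broncos', 'The Denver Broncos', 'Broncos'])
# returns True (counted as 1), because at least one ground truth matches
# the prediction exactly after normalization.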


def f1_score(prediction, ground_truth):
    '''
    Returns the token-level f1 score of two strings.
    '''
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
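
# Worked example (hypothetical strings):
#     f1_score('the cat sat', 'a cat sat down')
# After normalization the tokens are ['cat', 'sat'] and ['cat', 'sat', 'down'],
# so precision = 2/2, recall = 2/3 and F1 = 0.8.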


def exact_match_score(prediction, ground_truth):
    '''
    Returns the exact_match score of two strings.
    '''
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def epoch_time(start_time, end_time):
    '''
    Helper function to record epoch time.
    '''
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs
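

# A minimal self-check sketch, assuming no data files are needed; it only
# exercises the metric helpers with made-up strings and a fake timing span.
if __name__ == '__main__':
    pred = 'the Denver Broncos'          # hypothetical predicted answer
    golds = ['Denver Broncos', 'Broncos']  # hypothetical ground truths
    em = int(metric_max_over_ground_truths(exact_match_score, pred, golds))
    f1 = metric_max_over_ground_truths(f1_score, pred, golds)
    print(f'EM: {em}  F1: {f1:.3f}')

    mins, secs = epoch_time(0.0, 125.0)  # 125 seconds -> 2m 5s
    print(f'Epoch time: {mins}m {secs}s')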