forked from microsoft/MLHyperparameterTuning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_utilities.py
86 lines (67 loc) · 2.51 KB
/
text_utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Copyright (C) Microsoft Corporation. All rights reserved.
import pandas as pd
import re
import math
import gzip
import requests
import json
def read_csv_gz(url, **kwargs):
    """Download a gzipped, tab-separated file and load it into a data frame.

    Args:
        url: Location of the ``.tsv.gz`` file; fetched with ``requests.get``.
        **kwargs: Extra keyword arguments forwarded to ``pandas.read_csv``
            (``sep`` and ``encoding`` are already set here).

    Returns:
        The parsed data frame, indexed by its ``Id`` column.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Context managers make sure the HTTP connection and the gzip wrapper are
    # released even when parsing fails part-way through.
    with requests.get(url, stream=True) as response:
        # Fail early on 4xx/5xx instead of trying to gunzip an error page.
        response.raise_for_status()
        with gzip.open(response.raw, mode='rb') as stream:
            df = pd.read_csv(stream, sep='\t', encoding='utf8', **kwargs)
    return df.set_index('Id')
def clean_text(text):
    """Remove embedded code chunks, HTML tags and links/URLs.

    Args:
        text: The HTML fragment to clean. Non-string values are returned
            unchanged.

    Returns:
        The text with ``<pre><code>`` chunks removed, each ``<a>`` element
        replaced by its link text (or by nothing when that text is itself a
        URL), and all remaining tags stripped.
    """
    if not isinstance(text, str):
        return text
    # DOTALL lets '.' cross newlines so multi-line code chunks are removed too.
    text = re.sub('<pre><code>.*?</code></pre>', '', text, flags=re.DOTALL)
    # Non-greedy capture: every link is handled on its own instead of one
    # match swallowing everything between the first '<a ...>' and last '</a>'.
    text = re.sub('<a[^>]+>(.*?)</a>', replace_link, text)
    return re.sub('<[^>]+>', '', text)


def replace_link(match):
    """Return the replacement for one ``<a>`` element matched by clean_text.

    Args:
        match: A regex match whose group 1 is the link text.

    Returns:
        An empty string when the link text starts with a URL scheme
        (``[a-z]+://``), otherwise the link text itself.
    """
    link_text = match.group(1)
    if re.match('[a-z]+://', link_text):
        return ''
    return link_text
def round_sample(X, frac=0.1, min=1):
    """Randomly sample a fraction of X, selecting at least ``min`` rows.

    Args:
        X: Object supporting ``len()`` and ``.sample(n)`` (e.g. a pandas
            data frame or series).
        frac: Fraction of the rows to draw; the count is rounded down.
        min: Lower bound on the number of rows drawn.

    Returns:
        ``X.sample(n)`` where ``n`` is ``max(min, floor(len(X) * frac))``,
        capped at ``len(X)`` so that small inputs return all their rows
        instead of raising.
    """
    # The ``min`` parameter shadows the builtin inside this function, so the
    # upper clamp is written as an explicit comparison.
    available = len(X)
    n = max(min, math.floor(available * frac))
    if n > available:
        n = available
    return X.sample(n)
def round_sample_strat(X, strat, **kwargs):
    """Apply round_sample separately to each group of X defined by strat.

    Any extra keyword arguments are forwarded to round_sample for every group.
    """
    grouped = X.groupby(strat)
    return grouped.apply(round_sample, **kwargs)
def random_merge(A, B, N=20, on='AnswerId', key='key', n='n'):
    """Pair every row of A with its matching rows of B and N-1 random others.

    Args:
        A: Left data frame; must contain column ``on``.
        B: Right data frame; must contain column ``on``.
        N: Each row of A receives ``N - 1`` randomly sampled rows of B whose
            ``on`` value differs from its own. With ``N <= 1`` only the
            matching pairs are returned.
        on: Column used to decide which rows of A and B match.
        key: Name of a temporary join column; must not exist in A or B.
        n: Name of the output column holding the pair rank: 0 for matching
            pairs, 1..N-1 for the random pairs.

    Returns:
        A data frame (fresh 0-based index) with the matching pairs first,
        followed by the random pairs for each row of A in order. Columns
        present in both inputs carry the merge's default suffixes.

    Raises:
        KeyError: If ``key`` is already a column of A or B.
        ValueError: If B has fewer than ``N - 1`` non-matching rows for some
            row of A (raised by ``DataFrame.sample``).
    """
    if key in A or key in B:
        raise KeyError('key {} is either in A or in B'.format(key))

    # Matching pairs: join on a copy of ``on`` so that both original ``on``
    # columns survive in the result.
    left = A.copy()
    left[key] = A[on]
    right = B.copy()
    right[key] = B[on]
    match = left.merge(right, on=key).drop(key, axis=1)
    match[n] = 0
    df_list = [match]

    n_random = N - 1
    if n_random > 0:
        # Iterate by position so duplicate index labels in A still yield
        # exactly one row per iteration.
        for pos in range(len(A)):
            # Explicit copy: adding the join column must not write into (or
            # warn about) a slice of A.
            row = A.iloc[[pos]].copy()
            others = B[B[on] != row[on].iloc[0]].sample(n_random)
            # A constant join column turns the merge into a cross product of
            # the single row with the sampled rows.
            row[key] = 1
            others[key] = 1
            pairs = row.merge(others, how='outer', on=key).drop(key, axis=1)
            pairs[n] = range(1, N)
            df_list.append(pairs)

    return pd.concat(df_list, ignore_index=True)
def text_to_json(text):
    """Serialize text as a JSON object string with a single 'input' field.

    The value is formatted to a string before being encoded.
    """
    payload = {'input': '{0}'.format(text)}
    return json.dumps(payload)
def write_json_to_file(json_dict, filename, mode='w'):
    """Write json_dict to filename as indented, key-sorted JSON.

    The file is opened with the given mode, and the JSON text is followed by
    two newline characters.
    """
    with open(filename, mode) as outfile:
        outfile.write(json.dumps(json_dict, indent=4, sort_keys=True))
        outfile.write('\n\n')
def read_questions(path, id, answerid):
    """Load a tab-separated questions file, indexed and sorted by ``id``.

    The columns named by ``id`` and ``answerid`` are converted to strings.
    The ``id`` column becomes the index and is also kept as a regular column.
    """
    frame = pd.read_csv(path, sep='\t', encoding='latin1')
    for column in (id, answerid):
        frame[column] = frame[column].astype(str)
    return frame.set_index(id, drop=False).sort_index()