# -*- coding: utf-8 -*-
import re
import string

import nltk

from utils import stopwords

# A wildcard token of the form '*~N' stands for N consecutive replaced
# tokens (e.g. stopwords); runs of wildcards are merged by add_wildcards().
WILDCARD_DEFAULT_VALUE = 1
WILDCARD_MASK = u'*~{}'
WILDCARD = WILDCARD_MASK.format(WILDCARD_DEFAULT_VALUE)


def wildcard_if_token_not_in(token, words):
    """Returns the wildcard if the token is NOT in the list of words.

    Args:
        token (str): Word to be analyzed.
        words (list): List of words the token must belong to in order
            not to be replaced by the wildcard.

    Returns:
        str: The token itself, or the wildcard if the token is not in words.
    """
    return token if token in words else WILDCARD
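
# A minimal sketch of wildcard_if_token_not_in; the token and word list
# are hypothetical:
#
#     >>> wildcard_if_token_not_in('capital', ['capital', 'city'])
#     'capital'
#     >>> wildcard_if_token_not_in('of', ['capital', 'city'])
#     '*~1'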


def remove_punctation(tokens):
    """Removes punctuation tokens from the token list.

    Args:
        tokens (list): List of tokens to remove punctuation from.

    Returns:
        list: List of tokens without punctuation.
    """
    non_punctuation = []
    for token in tokens:
        if token not in string.punctuation:
            non_punctuation.append(token)
    return non_punctuation
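
# A minimal sketch; nltk.word_tokenize emits punctuation marks as
# separate tokens, which this filter drops:
#
#     >>> remove_punctation(['hello', ',', 'world', '!'])
#     ['hello', 'world']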


def remove_stopwords(text):
    """Removes stopwords from the text.

    Args:
        text (str): Text to remove stopwords from.

    Returns:
        str: Text without stopwords.
    """
    tokens = nltk.word_tokenize(text)
    non_stopwords = [tk for tk in tokens if tk not in stopwords.stopwords]
    return ' '.join(non_stopwords)
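
# A minimal sketch, assuming 'is' and 'the' appear in the project's
# stopword list (utils.stopwords) while 'what' and 'capital' do not:
#
#     >>> remove_stopwords('what is the capital')
#     'what capital'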


def replace_stopwords(tokens):
    """Replaces stopword tokens with the wildcard.

    Args:
        tokens (list): List of tokens to process.

    Returns:
        list: List of tokens with each stopword replaced by the wildcard.
    """
    replaced = []
    for token in tokens:
        new_token = token if token not in stopwords.stopwords else WILDCARD
        replaced.append(new_token)
    return replaced
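
# A minimal sketch, under the same stopword-list assumption as above:
# unlike remove_stopwords, each stopword leaves a wildcard behind, so
# positional information is preserved:
#
#     >>> replace_stopwords(['what', 'is', 'the', 'capital'])
#     ['what', '*~1', '*~1', 'capital']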


def strip_wildcards(text):
    """Removes wildcards at the start and end of the text. This improves
    the generalization power of the resulting rule.

    Args:
        text (str): Text.

    Returns:
        str: Text without leading and trailing wildcards.
    """
    removed_start = re.sub(r'^(\*~\d+ ?)+', '', text.strip())
    removed_end = re.sub(r'(\*~\d+ ?)+$', '', removed_start)
    return removed_end.strip()
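
# A minimal sketch of strip_wildcards on a hand-made input; the exact
# string is hypothetical, but the regexes above yield this result:
#
#     >>> strip_wildcards('*~1 capital *~2 brazil *~1')
#     'capital *~2 brazil'
#
# Interior wildcards are kept; only leading and trailing ones go.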


def add_wildcards(text):
    """Sums the values of consecutive wildcards and replaces each run
    with a single wildcard holding the total.

    Args:
        text (str): Text.

    Returns:
        str: Text with consecutive wildcards merged.
    """
    followed_wd = re.search(r'(\*~\d+ ?){2,}', text)
    while followed_wd:
        followed_wd_text = followed_wd.group().strip()
        wd_values = re.findall(r'\*~(\d+)', followed_wd_text)
        wd_added = sum(int(wd_value) for wd_value in wd_values)
        text = text.replace(followed_wd_text, WILDCARD_MASK.format(wd_added))
        followed_wd = re.search(r'(\*~\d+ ?){2,}', text)
    return text
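
# A minimal sketch of how add_wildcards merges runs of wildcards; the
# input string is hypothetical:
#
#     >>> add_wildcards('capital *~1 *~1 brazil')
#     'capital *~2 brazil'
#
# Two adjacent '*~1' tokens collapse into a single '*~2', so the merged
# wildcard still stands for the same number of skipped tokens.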


def replace_context_entities(ctx_entities, text):
    """Searches for context entities in the text and joins their words
    with underscores.

    Args:
        ctx_entities (list): Context entities to look for.
        text (str): Text.

    Returns:
        str: Text with context entities underscored.
    """
    for entity in ctx_entities:
        text = text.replace(entity, '_'.join(entity.split()))
    return text
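
# A minimal sketch; 'new york' is a hypothetical context entity:
#
#     >>> replace_context_entities(['new york'], 'weather in new york')
#     'weather in new_york'
#
# Joining a multi-word entity keeps it as a single token when the text
# is later passed to nltk.word_tokenize.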


def preprocess(question, ctx_entities):
    """Runs all steps of question preprocessing.

    Args:
        question (str): Original question in natural language.
        ctx_entities (list): Context entities.

    Returns:
        str: Lowercased question without punctuation and with stopwords
            replaced by wildcards.
    """
    rplcd_ctx_entities = replace_context_entities(
        ctx_entities, question.lower()
    )
    tokens = nltk.word_tokenize(rplcd_ctx_entities)
    no_punctuation = remove_punctation(tokens)
    no_stopwords = replace_stopwords(no_punctuation)
    no_stopwords = ' '.join(no_stopwords)
    stripped_wildcards = strip_wildcards(no_stopwords)
    added_wildcards = add_wildcards(stripped_wildcards)
    return added_wildcards
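

if __name__ == '__main__':
    # A minimal usage sketch of the full pipeline. The exact output
    # depends on the project's stopword list (utils.stopwords) and on
    # the NLTK tokenizer models being installed; the question and the
    # context entity below are hypothetical.
    print(preprocess('What is the capital of New Brazil?', ['new brazil']))
    # With 'what', 'is', 'the' and 'of' as stopwords, this would print
    # something like: 'capital *~1 new_brazil'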