# token_selectors.py
from collections import Counter

from separadorSilabas import silabas

def get_syllables(word, middle, end):
    '''Uses separadorSilabas to split a word into a list of syllables.

    Raises:
        TypeError: if the word is not separable, i.e. if it is not a
            Spanish word.

    Args:
        word: string with the word to be separated.
        middle: char appended to the end of every syllable but the last.
        end: char appended to the end of the last syllable.

    Returns:
        List of syllables with their marker chars appended.
    '''
    word_syllables = silabas(word).split('-')
    word_syllables = [syll + middle for syll in word_syllables]
    # Swap the middle marker of the last syllable for the end marker.
    word_syllables[-1] = word_syllables[-1][:-1] + end
    return word_syllables
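
# A minimal usage sketch (assuming separadorSilabas splits 'palabra' into
# pa-la-bra):
#   >>> get_syllables('palabra', '-', ':')
#   ['pa-', 'la-', 'bra:']
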
def get_most_frequent(list_tokens, quantity):
    '''Selects the most frequent tokens.

    Args:
        list_tokens: list of tokens to select from.
        quantity: number of elements to select from the list; if it is in
            [0, 1] it is used as a fraction of the distinct tokens.

    Returns:
        Set of length at most quantity (or quantity * number of distinct
        tokens) containing the most frequent tokens.
    '''
    freq_dict = Counter(list_tokens).most_common()
    max_tokens = int(len(freq_dict) * quantity) if quantity <= 1 else int(quantity)
    return {token for token, freq in freq_dict[:max_tokens]}
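
# A minimal usage sketch (token values are illustrative only):
#   >>> get_most_frequent(['la', 'la', 'de', 'la', 'de', 'un'], 2)
#   {'la', 'de'}    # the two most frequent tokens
#   >>> get_most_frequent(['la', 'la', 'de', 'la', 'de', 'un'], 0.5)
#   {'la'}          # 0.5 * 3 distinct tokens -> top 1 token
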
def get_next_word(string, beginning_at):
    '''Returns the word that starts at index beginning_at in string.'''
    j = string[beginning_at:].find(' ')
    return string[beginning_at:beginning_at + j] if j != -1 else string[beginning_at:]
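
# A minimal usage sketch:
#   >>> get_next_word('hola mundo', 5)
#   'mundo'
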
def get_list_words(corpus, to_ignore):
    '''Returns the list of words in the corpus, with every character in
    to_ignore removed from each word.'''
    new_corpus = []
    for line in corpus.split('\n'):
        new_corpus += line.split(' ')
    for i, word in enumerate(new_corpus):
        for sign in to_ignore:
            word = word.replace(sign, '')
        new_corpus[i] = word
    return new_corpus
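
# A minimal usage sketch:
#   >>> get_list_words('hola, mundo\ncruel', ',')
#   ['hola', 'mundo', 'cruel']
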
class TokenSelector():

    def __init__(self, final_char=':', inter_char='-'):
        self.final_char = final_char
        self.inter_char = inter_char

    def calculate_most_frequent(self, corpus, quantity):
        '''Selects the most frequent tokens and stores them on the instance.
        If quantity is 1, selects all of them.

        Args:
            corpus: string to select the tokens from.
            quantity: number of tokens to select; if it is in [0, 1] it is
                treated as a fraction of the distinct tokens.

        Returns:
            Nothing.
        '''
        pass

    def select(self, corpus, i, tokens_selected):
        '''Appends to tokens_selected the next token starting at index i
        and returns the updated i.'''
        pass
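
# Contract shared by the selectors below: select() looks at corpus[i]; if it
# can consume a token there, it appends that token to tokens_selected and
# returns the index just past the consumed characters, otherwise it returns i
# unchanged. A runnable sketch of a driving loop is at the bottom of this
# module.
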
# Implementations
class PuntuactionSelector(TokenSelector):
    '''Maps punctuation characters to special tokens such as <pt>.'''

    def __init__(self, final_char=':', inter_char='-'):
        super().__init__(final_char, inter_char)
        self.map_punctuation = {'¿': '<ai>', '?': '<ci>', '.': '<pt>',
                                '\n': '<nl>', ',': '<cm>'}
        self.frequent = set(self.map_punctuation)

    def select(self, corpus, i, tokens_selected):
        if corpus[i] in self.frequent:
            tokens_selected.append(self.map_punctuation[corpus[i]])
            i += 1
        return i
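
# A minimal usage sketch:
#   >>> sel = PuntuactionSelector()
#   >>> tokens = []
#   >>> sel.select('¿cómo?', 0, tokens)
#   1
#   >>> tokens
#   ['<ai>']
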
class WordSelector(TokenSelector):
    '''Selects whole words that are among the most frequent ones.'''

    def __init__(self, final_char=':', inter_char='-', to_ignore=''):
        '''
        Args:
            to_ignore: string containing characters to be ignored in the
                corpus.'''
        super().__init__(final_char, inter_char)
        self.to_ignore = to_ignore

    def calculate_most_frequent(self, corpus, quantity):
        corpus = get_list_words(corpus, self.to_ignore)
        # Assumes a ' ' still separates the words even after the ignored
        # characters are removed.
        self.frequent = get_most_frequent(corpus, quantity)

    def select(self, corpus, i, tokens_selected):
        ignored = set(self.to_ignore)
        # Index i refers to the raw corpus; shift it left by the number of
        # ignored characters that occur before position i.
        total_deleted = sum(1 for c in corpus[:i] if c in ignored)
        corpus = ''.join(c for c in corpus if c not in ignored)
        word = get_next_word(corpus, i - total_deleted)
        if word in self.frequent:
            tokens_selected.append(word + self.final_char)
            i += len(word)
        return i
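
# A minimal usage sketch:
#   >>> sel = WordSelector()
#   >>> sel.calculate_most_frequent('el mar el sol', 1)   # keep all words
#   >>> tokens = []
#   >>> sel.select('el mar el sol', 3, tokens)            # word at index 3
#   6
#   >>> tokens
#   ['mar:']
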
class SyllableSelector(TokenSelector):
    '''Selects runs of syllables that are among the most frequent ones.'''

    def __init__(self, final_char=':', inter_char='-', to_ignore=''):
        super().__init__(final_char, inter_char)
        self.to_ignore = to_ignore

    def calculate_most_frequent(self, corpus, quantity):
        new_corpus = get_list_words(corpus, self.to_ignore)
        list_syll = []
        for word in new_corpus:
            try:
                list_syll += get_syllables(word, self.inter_char, self.final_char)
            except TypeError:
                print("word not considered for syllables: '{}'".format(word))
        self.frequent = get_most_frequent(list_syll, quantity)

    def select(self, corpus, i, tokens_selected):
        ignored = set(self.to_ignore)
        # Index i refers to the raw corpus; shift it left by the number of
        # ignored characters that occur before position i.
        total_deleted = sum(1 for c in corpus[:i] if c in ignored)
        corpus = ''.join(c for c in corpus if c not in ignored)
        word = get_next_word(corpus, i - total_deleted)
        try:
            syllables = get_syllables(word, self.inter_char, self.final_char)
        except TypeError:
            # Not a separable (Spanish) word; consume nothing.
            return i
        to_add = []
        len_corpus_added = 0
        for syll in syllables:
            if syll in self.frequent:
                # len(syll) - 1 because the marker char is not in the corpus.
                len_corpus_added += len(syll) - 1
                to_add.append(syll)
            else:
                break
        tokens_selected += to_add
        return i + len_corpus_added
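
# A minimal usage sketch (again assuming 'palabra' splits as pa-la-bra):
#   >>> sel = SyllableSelector()
#   >>> sel.calculate_most_frequent('palabra palabra', 1)  # keep all syllables
#   >>> tokens = []
#   >>> sel.select('palabra palabra', 0, tokens)
#   7
#   >>> tokens
#   ['pa-', 'la-', 'bra:']
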
class CharacterSelector(TokenSelector):  # letter
    '''Selects single letters, marking whether another letter follows.'''

    def __init__(self, final_char=':', inter_char='-'):
        super().__init__(final_char, inter_char)
        letters = 'aáeéoóíúiuübcdfghjklmnñopqrstvwxyz'
        letters += letters.upper()
        self.accepted_chars = set(letters)

    def select(self, corpus, i, tokens_selected):
        if corpus[i] in self.accepted_chars:
            if i + 1 < len(corpus) and corpus[i + 1] in self.accepted_chars:
                tokens_selected.append(corpus[i] + self.inter_char)
            else:
                tokens_selected.append(corpus[i] + self.final_char)
            i += 1
        return i
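
# A minimal end-to-end sketch combining two selectors. The driving loop below
# is an assumption about how these classes are meant to be composed, not part
# of the module's API:
if __name__ == '__main__':
    selectors = [PuntuactionSelector(), CharacterSelector()]
    corpus = 'ola.\n'
    i, tokens = 0, []
    while i < len(corpus):
        start = i
        for sel in selectors:
            i = sel.select(corpus, i, tokens)
            if i > start:
                break  # this selector consumed the character at start
        if i == start:
            i += 1  # no selector matched; skip the character
    print(tokens)  # expected: ['o-', 'l-', 'a:', '<pt>', '<nl>']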