-
Notifications
You must be signed in to change notification settings - Fork 44
/
Copy pathpreprocess.py
70 lines (50 loc) · 1.65 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from collections import Counter
from math import log

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
def ShannonEntropyAndNomalize(name):
    """Compute per-string character entropy and upper/lower-case ratios.

    Args:
        name: Iterable of strings to analyse.

    Returns:
        A tuple ``(entropy, upper_list, lower_list)`` of three lists, each
        holding one value per input string, in input order:

        * ``entropy`` -- Shannon entropy (base 2) of the string's character
          distribution.
        * ``upper_list`` -- fraction of characters for which
          ``str.isupper()`` is true.
        * ``lower_list`` -- fraction of characters for which
          ``str.islower()`` is true.

        An empty string contributes ``0.0`` to all three lists.
    """
    entropy = []
    upper_list = []
    lower_list = []
    for s in name:
        length = len(s)
        if length == 0:
            # No characters: nothing to count and nothing to divide by.
            entropy.append(0.0)
            upper_list.append(0.0)
            lower_list.append(0.0)
            continue

        # Count first and divide once; adding 1/len(s) per character
        # accumulates rounding error (e.g. ten additions of 0.1 give
        # 0.9999999999999999 instead of 1.0).
        upper_count = sum(1 for c in s if c.isupper())
        lower_count = sum(1 for c in s if c.islower())
        upper_list.append(upper_count / length)
        lower_list.append(lower_count / length)

        # Subtract term by term from 0.0 so a single-symbol string yields
        # 0.0 rather than -0.0.
        shannon_ent = 0.0
        for count in Counter(s).values():
            prob = count / length
            shannon_ent -= prob * log(prob, 2)
        entropy.append(shannon_ent)
    return entropy, upper_list, lower_list
def gene_bigram(string):
    """Return the overlapping two-element windows of ``string``.

    Inputs shorter than two elements have no bigram, so the input itself is
    returned as the sole list element (``''`` gives ``['']``).
    """
    size = len(string)
    if size >= 2:
        # One window per start offset; the last window starts at size - 2.
        return [string[start:start + 2] for start in range(size - 1)]
    return [string]
def TFIDF(name):
    """Return TF-IDF weights over the character bigrams of each string.

    Each string in ``name`` is split into bigrams with ``gene_bigram`` and
    the bigrams are joined with spaces to form one document per string. The
    documents are counted with ``CountVectorizer`` and reweighted with
    ``TfidfTransformer``, both fitted on this input with default settings.

    Returns:
        A list of lists of floats, one inner list per input string.
    """
    # NOTE(review): CountVectorizer is used with its defaults, which per the
    # scikit-learn docs lowercase the text and keep only tokens of two or
    # more word characters -- confirm that is the intended tokenisation.
    documents = [' '.join(gene_bigram(entry)) for entry in name]
    counts = CountVectorizer().fit_transform(documents)
    weights = TfidfTransformer().fit_transform(counts)
    return weights.toarray().tolist()
if __name__ == '__main__':
    # Small demo: print the TF-IDF matrix for a handful of sample names.
    sample_names = [
        'dasfasdfas',
        'Aabbcc',
        'mazihan880_aaaaa',
        'aa88',
    ]
    print(TFIDF(sample_names))