-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocess.py
68 lines (46 loc) · 1.14 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import math
import string
def _load_lexicon(path):
    """Return the non-empty, whitespace-stripped lines of *path* as a set."""
    # Read-only mode: the lexicon is never modified, so write access to the
    # file is not required. Blank lines are skipped so that empty tokens
    # (produced by consecutive spaces) can never count as lexicon hits.
    with open(path, 'r') as handle:
        return {entry for entry in (raw.strip() for raw in handle) if entry}


def word2vec(f, choice, fll):
    """Write one comma-separated feature row to *fll* per line of *f*.

    Each row has the columns, in order:

    1. number of tokens found in 'positive-words.csv'
    2. number of tokens found in 'negative-words.csv'
    3. 1 if any token is a negation word, else 0
    4. 1 minus column 3 (dummy coding of the same categorical variable)
    5. number of tokens that are first/second-person pronouns
    6. natural log of the number of space-separated tokens
    7. the label *choice*

    Args:
        f: iterable of text lines (e.g. an open text file).
        choice: label value appended to every row.
        fll: writable text stream; receives one '\\n'-terminated row per line.

    The two lexicon files are read from the current working directory.
    """
    pronouns = {"i", "me", "mine", "my", "you", "your", "yours", "we", "us", "ours"}
    negation = {"no", "but", "nothing", "never", "nope"}
    # Sets give O(1) membership tests; the lexicons can hold thousands of words.
    poswords = _load_lexicon('positive-words.csv')
    negwords = _load_lexicon('negative-words.csv')
    # Build the punctuation-stripping table once instead of once per line.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    # NOTE(review): matching is case-sensitive, so a capitalised "I" or "No"
    # is not counted -- confirm the input text is already lower-cased.
    for line in f:
        # Drop the line terminator first; otherwise the last word of every
        # line carries a trailing "\n" and never matches any word list.
        new_line = line.rstrip('\r\n').translate(strip_punctuation)
        # split(" ") always yields at least one token, so log() is safe.
        temp = new_line.split(" ")
        x1 = sum(1 for token in temp if token in poswords)
        x2 = sum(1 for token in temp if token in negwords)
        x31 = 1 if any(token in negation for token in temp) else 0
        x4 = sum(1 for token in temp if token in pronouns)
        x5 = math.log(len(temp))
        x32 = 1 - x31  # dummy coded categorical variable
        row = (x1, x2, x31, x32, x4, x5, choice)
        fll.write(','.join(str(value) for value in row) + '\n')
def main():
    """Build 'word2vec.txt' from 'positive.txt' and 'negative.txt'.

    Lines of 'positive.txt' are written with label 1, followed by lines of
    'negative.txt' with label 0. All paths are relative to the current
    working directory; the output file is overwritten.
    """
    labelled_inputs = (('positive.txt', 1), ('negative.txt', 0))
    # Context managers close every file even if word2vec raises part-way.
    with open('word2vec.txt', 'w') as fll:
        for path, label in labelled_inputs:
            # The inputs are only read, so open them read-only.
            with open(path, 'r') as source:
                word2vec(source, label, fll)
# Run the pipeline only when executed as a script, so that importing this
# module (e.g. to reuse word2vec) does not read or write any files.
if __name__ == "__main__":
    main()