-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean.py
54 lines (37 loc) · 1.08 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import sys
# Number of leading input rows that the split loop writes to ./data/train.csv.
TRAIN_N = 5000
# Number of rows following the training rows that are written to ./data/test.csv.
TEST_N = 1000
# Single-pass character table: double quotes, square brackets and newlines are
# deleted; commas become spaces (rows are later written comma-separated).
_CLEAN_TABLE = str.maketrans({'"': None, "[": None, "]": None, "\n": None, ",": " "})


def cleanContent(content):
    """Return *content* without list/quote punctuation and with whitespace collapsed.

    Double quotes, ``[``, ``]`` and newline characters are removed, each comma
    is replaced by a space, and every run of whitespace is then reduced to a
    single space with no leading or trailing whitespace.

    Args:
        content: Raw text of one record.

    Returns:
        The cleaned, single-line string (``""`` for empty or blank input).
    """
    # split() with no arguments already discards leading/trailing whitespace,
    # so no separate strip step is needed before re-joining.
    return " ".join(content.translate(_CLEAN_TABLE).split())
class Data:
    """One record parsed from a dash-delimited input row.

    The first six ``-`` characters of a row act as delimiters: everything
    before the third dash is ignored, the non-dash characters between the
    third and sixth dash form the sentiment label, and everything after the
    sixth dash is the text content.

    Attributes are plain strings:
        sentiment -- the label exactly as it appears in the row.
        content   -- the text after it has been passed through cleanContent.
    """

    def __init__(self, row):
        """Parse *row* (one line of input) into ``sentiment`` and ``content``.

        A row with fewer than six dashes yields an empty ``content``; a row
        with fewer than three dashes also yields an empty ``sentiment``.
        """
        row = row.rstrip()
        label_chars = []
        content_start = len(row)  # no sixth dash found -> empty content
        dashes = 0
        for pos, c in enumerate(row):
            if c == "-":
                dashes += 1
                if dashes == 6:
                    # Only the first six dashes are delimiters.  The rest of
                    # the row is taken verbatim so that hyphens inside the
                    # text (e.g. "well-known") are kept instead of dropped.
                    content_start = pos + 1
                    break
            elif dashes >= 3:
                label_chars.append(c)
        self.sentiment = "".join(label_chars)
        self.content = cleanContent(row[content_start:])

    def write(self, fp):
        """Write the record to *fp* as one ``content,sentiment`` line."""
        fp.write(self.content + "," + self.sentiment + "\n")
# Split the first TRAIN_N + TEST_N rows of the raw file into train/test CSVs.
# The `with` statement closes all three files even if a row fails to parse.
with open("./data/isear.txt") as dirty, \
        open("./data/train.csv", "w") as train, \
        open("./data/test.csv", "w") as test:
    rows_written = 0
    for i, line in enumerate(dirty):
        if i == TRAIN_N + TEST_N:
            break
        # The first TRAIN_N rows go to the training file, the rest to the test file.
        out = train if i < TRAIN_N else test
        Data(line).write(out)
        rows_written += 1

# Report the rows actually processed: the input may hold fewer than requested.
print("Cleaned " + str(rows_written) + " rows of data.")