-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain5.py
172 lines (148 loc) · 4.58 KB
/
main5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import random
def tocrfformat(data):
    """Convert tagged, whitespace-separated lines into per-character CRF rows.

    Each input line is split on whitespace and its first token is skipped
    (presumably a sentence/article id -- confirm against the corpus).  Every
    remaining token is expected to look like ``text/tag``; only the part
    before the first ``/`` is emitted, one character per row.

    * Tokens without ``"/t"`` produce ``"<char> O\\n"`` for every character.
    * Tokens containing ``"/t"`` (time words) whose text has at least two
      characters produce ``"<char> B-PER\\n"`` for the first character and
      ``"<char> I-PER\\n"`` for the rest.
    * A line that contributed tokens is terminated by a bare ``"\\n"`` row;
      lines with nothing after the first token produce no rows at all.

    Args:
        data: iterable of strings, one tagged sentence per item.

    Returns:
        A list of strings, each already terminated by a newline.
    """
    crf_rows = []
    for line in data:
        tokens = line.split()[1:]
        if not tokens:
            continue
        for token in tokens:
            text = token.split('/')[0]
            if "/t" not in token:
                crf_rows.extend(char + " O\n" for char in text)
            elif len(text) >= 2:
                # Time word: first character opens the entity, the rest continue it.
                crf_rows.append(text[0] + " B-PER\n")
                crf_rows.extend(char + " I-PER\n" for char in text[1:])
            # NOTE(review): a one-character time word emits no row at all (it
            # is not even labelled "O").  Kept as in the original -- confirm
            # that dropping it from the sequence is intended.
        crf_rows.append("\n")
    return crf_rows
def readFile(filename, data):
    """Append every line of a UTF-8 text file that contains ``"/t"`` to ``data``.

    Matching lines are appended unchanged, including their trailing newline.
    Lines without ``"/t"`` (which covers blank lines) are skipped.

    Args:
        filename: path of the text file to read.
        data: list that is extended in place.

    Returns:
        None; the result is delivered through ``data``.

    Raises:
        OSError: if the file cannot be opened.
    """
    # The context manager closes the file even when reading or decoding fails.
    with open(filename, encoding='utf-8') as corpus:
        for line in corpus:
            # A line containing "/t" is never blank, so no separate
            # strip() check is needed.
            if "/t" in line:
                data.append(line)
# Load FullData/199801.txt through FullData/199806.txt into a single list.
# readFile keeps only the lines that contain a "/t" tag.
data = []
for file_id in range(199801, 199807):
    readFile("FullData/" + str(file_id) + ".txt", data)
# Accumulates the time words found by the passes below.
timewords = set()

# prefix.txt holds one prefix word per line; surrounding whitespace is removed.
with open("prefix.txt", 'r', encoding='utf-8') as prefix_file:
    prefix = {entry.strip() for entry in prefix_file}
# Pass 1: merge "<prefix word> <time word>" pairs.
# When the text of a token (the part before the first "/") is listed in
# `prefix` and the next token carries a "/t" tag, the two are fused into
# "<prefix text><next token>" and recorded in `timewords`.  Every other token
# is copied through unchanged.  Each output line is its tokens, each followed
# by a single space.
newdata2 = []
for line in data:
    # The trailing " " sentinel lets the last real token look one ahead safely.
    tokens = line.split() + [" "]
    merged = []
    pos = 0
    while pos < len(tokens) - 1:
        token = tokens[pos]
        head = token.split("/")[0]
        follower = tokens[pos + 1]
        if head in prefix and "/t" in follower:
            timeword = head + follower
            timewords.add(timeword)
            merged.append(timeword)
            pos += 2  # the following time word has been consumed as well
        else:
            merged.append(token)
            pos += 1
    newdata2.append("".join(piece + " " for piece in merged))
# suffix.txt holds one suffix word per line; surrounding whitespace is removed.
with open("suffix.txt", 'r', encoding='utf-8') as suffix_file:
    suffix = {entry.strip() for entry in suffix_file}
# Pass 2: merge "<time word> <suffix word>" pairs.
# When a token carries a "/t" tag and the text of the next token (the part
# before the first "/") is listed in `suffix`, the two are fused into
# "<time word text><suffix text>/t".  Every other token is copied through
# unchanged.  Each output line is its tokens, each followed by a single space.
newdata3 = []
for line in newdata2:
    # The trailing " " sentinel lets the last real token look one ahead safely.
    tokens = line.split() + [" "]
    merged = []
    pos = 0
    while pos < len(tokens) - 1:
        token = tokens[pos]
        tail = tokens[pos + 1].split("/")[0]
        if "/t" in token and tail in suffix:
            merged.append(token.replace("/t", "") + tail + "/t")
            pos += 2  # the suffix token has been consumed as well
        else:
            merged.append(token)
            pos += 1
    newdata3.append("".join(piece + " " for piece in merged))
# Pass 3: fuse runs of consecutive "/t" tokens into one time word.
# The texts of adjacent "/t" tokens are concatenated and the result carries a
# single trailing "/t".  Each output line is its tokens, each followed by a
# single space.
newdata4 = []
for line in newdata3:
    pieces = []
    pending = ""  # concatenation of the "/t" tokens in the current run
    for token in line.split():
        if "/t" in token:
            pending += token
        elif pending:
            # A non-time token ends the run: emit the fused word, then the token.
            pieces.append(pending.replace("/t", "") + "/t")
            pieces.append(token)
            pending = ""
        else:
            pieces.append(token)
    if pending:
        # Fix: a run that reaches the end of the line used to be discarded,
        # because nothing flushed it after the last token.
        pieces.append(pending.replace("/t", "") + "/t")
    newdata4.append("".join(piece + " " for piece in pieces))
# Record every token of the merged corpus that still carries a "/t" tag.
for merged_line in newdata4:
    timewords.update(token for token in merged_line.split() if "/t" in token)
# Dump the merged corpus (still in its original order) one line per sentence.
# The context manager closes the file even if a write fails.
with open("newdata.txt", 'w', encoding='utf-8') as f:
    f.writelines(line + "\n" for line in newdata4)
# Shuffle in place, then cut into test (first tenth), dev (second tenth)
# and train (everything else).
random.shuffle(newdata4)
tenth = len(newdata4) // 10
testSet = newdata4[:tenth]
devSet = newdata4[tenth:tenth * 2]
trainSet = newdata4[tenth * 2:]
# Keep a plain-text copy of the test split, one sentence per line.
# The context manager closes the file even if a write fails.
with open("TestSet.txt", 'w', encoding='utf-8', newline='\n') as f:
    f.writelines(sentence + '\n' for sentence in testSet)
# Convert each split to per-character CRF rows and write it out.
# The rows returned by tocrfformat already end in "\n", so they are written
# verbatim.
crfTestSet = tocrfformat(testSet)
crfTrainSet = tocrfformat(trainSet)
crfDevSet = tocrfformat(devSet)
for crf_path, crf_rows in (
    ("example.test", crfTestSet),
    ("example.train", crfTrainSet),
    ("example.dev", crfDevSet),
):
    # The context manager closes each file even if a write fails.
    with open(crf_path, 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(crf_rows)
# Write the collected time words, stripped of everything from the first "/"
# onward, one per line.  Sorting makes the file reproducible: iterating the
# set directly gives an order that can differ between runs.
with open("timewords.txt", 'w', encoding='utf-8', newline='\n') as out:
    for word in sorted(entry.split("/")[0] for entry in timewords):
        out.write(word + '\n')
print("end")