# preprocess.py

from kogpt2.data import sentencePieceTokenizer, toString
import sys, os
import re
import kss
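
# Preprocessing pipeline for building KoGPT2 training data:
#   1. divideSentence()           - split raw text into one cleaned sentence per line
#   2. makeDataFile()             - merge and clean every .txt file into dataset/sin.txt
#   3. makeDataUnderMaxTokenLen() - pack cleaned lines into samples that fit the model's token limit
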
def divideSentence():
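    """Split every line of data/<file_name>.txt into individual sentences with kss,
    clean each sentence with _clean_str(), and write one sentence per line
    to data/<file_name>_1.txt."""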
    file_name = 'twitter_ai'
    load_path = '/home/user/shame/dataset/data/' + file_name + '.txt'
    save_file = open('/home/user/shame/dataset/data/' + file_name + '_1.txt', 'w', encoding='utf-8')
    file = open(load_path, 'r', encoding='utf-8')

    while True:
        line = file.readline()
        if not line:
            file.close()
            break

        for sent in kss.split_sentences(line):
            sent = _clean_str(sent)
            save_file.write(sent + '\n')

    save_file.close()


def _clean_str(text):
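    """Remove e-mail addresses, URLs, stray Korean jamo, HTML tags,
    and other special characters from a piece of text."""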
    pattern = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'  # remove e-mail addresses
    text = re.sub(pattern=pattern, repl='', string=text)

    pattern = r'(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'  # remove URLs
    text = re.sub(pattern=pattern, repl='', string=text)

    pattern = '([ㄱ-ㅎㅏ-ㅣ]+)'  # remove stray Korean consonants and vowels (jamo)
    text = re.sub(pattern=pattern, repl='', string=text)

    pattern = '<[^>]*>'  # remove HTML tags
    text = re.sub(pattern=pattern, repl='', string=text)

    pattern = r'[^\w\s]'  # remove special characters
    text = re.sub(pattern=pattern, repl='', string=text)

    return text


def makeDataFile():
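    """Merge every .txt file in the data directory into a single cleaned
    corpus file, dataset/sin.txt, skipping blank lines."""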
    load_path = '/home/user/shame/dataset/data/'
    file_list = os.listdir(load_path)
    data_files = [file for file in file_list if file.endswith(".txt")]
    save_file = open('/home/user/shame/dataset/sin.txt', 'w', encoding='utf-8')

    for data_file in data_files:
        file = open(load_path + data_file, 'r', encoding='utf-8')
        while True:
            line = file.readline()
            if not line:
                file.close()
                break
            if line == '\n':
                continue
            line = _clean_str(line)
            save_file.write(line)

    save_file.close()


def makeDataUnderMaxTokenLen(max_token_len):
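    """Pack the cleaned lines of dataset/sin.txt into training samples whose
    tokenized length, including BOS and EOS tokens, stays within max_token_len,
    writing untokenized and tokenized versions to separate files."""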
    # SentencePiece tokenizer from kogpt2.data
    sentencepieceTokenizer = sentencePieceTokenizer()

    # Files to read from and write to
    file_name = 'sin.txt'
    file = open('./dataset/' + file_name, 'r', encoding='utf-8')
    untokenized_file = open('./dataset/untokenized_' + file_name, 'w', encoding='utf-8')
    tokenized_file = open('./dataset/tokenized_' + file_name, 'w', encoding='utf-8')

    # Buffers for the training sample currently being built
    untokenized = ""
    tokenized = ""
    data_length = 0

    # Preprocess the data
    while True:
        line = file.readline()
        if not line:
            break

        tokenized_line = sentencepieceTokenizer(line)

        # Each written sample must stay under max_token_len: the model input
        # holds 1024 tokens, but 2 of them are reserved for BOS and EOS.
        if data_length + len(tokenized_line) + 2 >= max_token_len:
            untokenized_file.write(untokenized + '\n')
            tokenized_file.write(tokenized + '\n')
            untokenized = ""
            tokenized = ""
            data_length = 0

        untokenized = untokenized + "<s>" + line[:-1] + "</s>"
        tokenized = tokenized + "<s>" + toString(tokenized_line) + "</s>"
        data_length = data_length + len(tokenized_line) + 2  # +2 for the BOS and EOS tokens

    file.close()
    untokenized_file.close()
    tokenized_file.close()


if __name__ == "__main__":
    # execute only if run as a script
    # divideSentence()
    makeDataFile()
    makeDataUnderMaxTokenLen(1024)