-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathDoc_import.py
117 lines (89 loc) · 3.66 KB
/
Doc_import.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def doc_import(docs=None):
    """Return the contents of a text file as one string with newlines removed.

    Args:
        docs: Path of the file to read. When it is not a non-empty ``str`` or
            ``os.PathLike`` (for example ``None`` or ``""``), the filename is
            requested interactively via ``input()`` until one is entered.

    Returns:
        The complete file content with every newline character removed.
        An empty file yields an empty string.

    Raises:
        OSError: If the file cannot be opened.
    """
    import os

    # Previously the argument was always overwritten by the prompt; only fall
    # back to prompting when no usable path was supplied.
    if not isinstance(docs, (str, os.PathLike)):
        docs = ""
    while not docs:
        docs = input("Input filename: ")

    with open(docs, 'r') as f:
        # Read everything in one go. Mixing ``for line in f`` with ``f.read()``
        # silently dropped the first line and left the result undefined for an
        # empty file.
        return f.read().replace('\n', '')
def import_pan21(base_path, returns=1000):
    """Load feature texts and JSON labels from a two-split dataset directory.

    ``base_path`` is expected to contain one sub-directory per split. The
    sub-directories are taken in sorted name order: the first is used as the
    training split, the second as the test split; any further ones are
    ignored. Entries of ``base_path`` that are not directories are skipped.

    Inside a split directory the file names are sorted; the first half of the
    files is read as text (one list of lines per file) and the second half is
    parsed as JSON. Sorting keeps the pairing between the two halves stable,
    because ``os.listdir`` returns names in arbitrary order.
    NOTE(review): this assumes the text files sort before the JSON files and
    that both halves sort in the same relative order -- confirm against the
    dataset's naming scheme.

    Args:
        base_path: Directory holding the split sub-directories.
        returns: Upper bound passed to list slicing, i.e. at most this many
            items are returned per list.

    Returns:
        A tuple ``(train_features, train_labels, test_features,
        test_labels)``. Feature entries are lists of lines as produced by
        ``readlines()``; label entries are the decoded JSON documents.
        A missing split yields empty lists.
    """
    import os
    import json

    def load_split(split_dir):
        """Read one split directory into ``(features, labels)``."""
        names = sorted(os.listdir(split_dir))
        half = len(names) // 2

        features = []
        # Slicing the names first gives the same result as slicing the loaded
        # list, without reading files that would be thrown away.
        for name in names[:half][:returns]:
            with open(os.path.join(split_dir, name), 'r', encoding='utf8') as handle:
                features.append(handle.readlines())

        labels = []
        for name in names[half:][:returns]:
            # ``with`` closes the handle; the original leaked one per file.
            with open(os.path.join(split_dir, name), 'r', encoding='utf8') as handle:
                labels.append(json.load(handle))
        return features, labels

    split_dirs = []
    for name in sorted(os.listdir(base_path)):
        candidate = os.path.join(base_path, name)
        if os.path.isdir(candidate):
            split_dirs.append(candidate)

    train_features, train_labels = [], []
    test_features, test_labels = [], []
    if split_dirs:
        train_features, train_labels = load_split(split_dirs[0])
    if len(split_dirs) > 1:
        # The labels of the second split belong in test_labels; they used to
        # be appended to train_labels by mistake.
        test_features, test_labels = load_split(split_dirs[1])

    return (train_features[:returns], train_labels[:returns],
            test_features[:returns], test_labels[:returns])
def import_verification(base_path):
    """Load up to four files of concatenated JSON objects from ``base_path``.

    Each file is read whole, every ``}``-whitespace-``{`` boundary is turned
    into ``},{`` and the result is wrapped in brackets so that the file
    parses as a single JSON array of records.

    The role of a file is decided by its position in ``os.listdir(base_path)``:

    * position 0 -> ``train_labels`` (records added individually, flat list)
    * position 1 -> ``train_features`` (the record list added as one element)
    * position 2 -> ``test_features`` (the record list added as one element)
    * position 3 -> ``test_labels`` (the record list added as one element)

    Further entries are ignored; with fewer than four entries the remaining
    lists stay empty.

    NOTE(review): ``os.listdir`` returns names in arbitrary order, so which
    file ends up in which role is platform dependent. The intended file names
    are not visible here -- consider selecting the files by name.

    Args:
        base_path: Directory containing the data files.

    Returns:
        A tuple ``(train_features, train_labels, test_features,
        test_labels)``.
    """
    import os
    import json
    import re

    def load_records(file_path):
        """Parse a file of whitespace-separated JSON objects into a list."""
        with open(file_path, 'r', encoding='utf8') as handle:
            raw = handle.read()
        return json.loads('[' + re.sub(r'\}\s\{', '},{', raw) + ']')

    train_features = []
    train_labels = []
    test_features = []
    test_labels = []

    for position, entry in enumerate(os.listdir(base_path)[:4]):
        # listdir yields bare names; they must be resolved against base_path,
        # otherwise the files are only found when base_path is the current
        # working directory.
        records = load_records(os.path.join(base_path, entry))
        if position == 0:
            train_labels.extend(records)
        elif position == 1:
            train_features.append(records)
        elif position == 2:
            test_features.append(records)
        else:
            test_labels.append(records)

    return train_features, train_labels, test_features, test_labels
def text_preprocessing(fname_labels, fname_features):
    """Read line-delimited JSON label and feature files.

    Every non-blank line of ``fname_labels`` must be a JSON object with the
    keys ``'id'`` and ``'same'``; every non-blank line of ``fname_features``
    must be a JSON object with the keys ``'id'`` and ``'pair'``.

    Args:
        fname_labels: Path of the labels file.
        fname_features: Path of the features file.

    Returns:
        A tuple ``(same_author, texts)`` where ``same_author`` maps each
        label ``'id'`` to ``int(record['same'])`` and ``texts`` is the
        concatenation, in file order, of the ``'pair'`` values of every
        feature record whose ``'id'`` occurs in ``same_author``.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the keys named above.
    """
    import json

    same_author = {}
    # ``with`` guarantees the handle is closed; iterating over a bare
    # ``open(...)`` left that to the garbage collector.
    with open(fname_labels, encoding='utf8') as labels_file:
        for raw_line in labels_file:
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            record = json.loads(stripped)
            same_author[record['id']] = int(record['same'])

    texts = []
    with open(fname_features, encoding='utf8') as features_file:
        for raw_line in features_file:
            stripped = raw_line.strip()
            if not stripped:
                continue
            record = json.loads(stripped)
            if record['id'] in same_author:
                texts.extend(record['pair'])

    return same_author, texts