Doc_import.py
import os
import json


def doc_import():
    # Ask for a filename until a non-empty name is given, then return the
    # file's contents as a single string with the newlines stripped out.
    docs = input("Input filename: ")
    while len(docs) < 1:
        docs = input("Input filename: ")
    with open(docs, 'r') as f:
        text = f.read().replace('\n', '')
    return text
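
# Usage sketch (an assumption, not part of the original file): doc_import is
# meant for loading one document interactively, e.g.
#   text = doc_import()   # prompts "Input filename: "
#   tokens = text.split()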


def import_pan21(base_path):
    # Expects base_path to hold two sub-directories (train first, test second,
    # in os.listdir order). Inside each, the first half of the files are the
    # raw text documents and the second half the JSON truth files.
    test_train = os.listdir(base_path)
    train_features = []
    train_labels = []
    test_features = []
    test_labels = []
    for dic in test_train:
        path = base_path + "/" + dic
        file_names = os.listdir(path)
        half = len(file_names) // 2
        if dic == test_train[0]:
            for file_name in file_names[:half]:
                file_path = path + "/" + file_name
                with open(file_path, 'r', encoding='utf8') as f:
                    train_features.append(f.readlines())
            for file_name in file_names[half:]:
                file_path = path + "/" + file_name
                with open(file_path, 'r', encoding='utf8') as f:
                    train_labels.append(json.load(f))
        if dic == test_train[1]:
            for file_name in file_names[:half]:
                file_path = path + "/" + file_name
                with open(file_path, 'r', encoding='utf8') as f:
                    test_features.append(f.readlines())
            for file_name in file_names[half:]:
                file_path = path + "/" + file_name
                with open(file_path, 'r', encoding='utf8') as f:
                    test_labels.append(json.load(f))
    return train_features, train_labels, test_features, test_labels


def import_verification(base_path):
    # Same layout as import_pan21, but the feature files are JSON documents
    # from the authorship-verification task. The original code was left
    # incomplete here; the key names "pair" and "same" are assumptions based
    # on the PAN verification format (the original also referenced "fandoms").
    test_train = os.listdir(base_path)
    train_features = []
    train_labels = []
    test_features = []
    test_labels = []
    for dic in test_train:
        path = base_path + "/" + dic
        file_names = os.listdir(path)
        half = len(file_names) // 2
        if dic == test_train[0]:
            for file_name in file_names[:half]:
                file_path = path + "/" + file_name
                with open(file_path, 'r', encoding='utf8') as f:
                    data = json.load(f)
                if "pair" in data:
                    train_features.append(data["pair"])
            for file_name in file_names[half:]:
                file_path = path + "/" + file_name
                with open(file_path, 'r', encoding='utf8') as f:
                    data = json.load(f)
                if "same" in data:
                    train_labels.append(data["same"])
        if dic == test_train[1]:
            for file_name in file_names[:half]:
                file_path = path + "/" + file_name
                with open(file_path, 'r', encoding='utf8') as f:
                    data = json.load(f)
                if "pair" in data:
                    test_features.append(data["pair"])
            for file_name in file_names[half:]:
                file_path = path + "/" + file_name
                with open(file_path, 'r', encoding='utf8') as f:
                    data = json.load(f)
                if "same" in data:
                    test_labels.append(data["same"])
    return train_features, train_labels, test_features, test_labels
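

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. The directory
    # name below is a hypothetical example; point it at a folder containing
    # the train and test sub-directories of the PAN-21 data.
    example_base = "data/pan21"
    train_x, train_y, test_x, test_y = import_pan21(example_base)
    print(len(train_x), "training documents,", len(test_x), "test documents")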