jsonhandler.py
# META_FNAME - name of the meta-file.json
# GT_FNAME - name of the ground-truth.json
# OUT_FNAME - file to write the output in (answers.json)
# encoding - encoding of the texts (from json)
# language - language of the texts (from json)
# upath - path of the 'unknown' dir in the corpus (from json)
# candidates - list of candidate author names (from json)
# unknowns - list of unknown filenames (from json)
# trainings - dictionary with a list of training-text filenames for each author, e.g.
#   {"candidate1": ["file1.txt", "file2.txt", ...], "candidate2": ["file1.txt", ...], ...}
# trueAuthors - list of the true authors of the texts (from the GT_FNAME json),
#   corresponding to 'unknowns'
'''
EXAMPLE:
import jsonhandler
jsonhandler.loadJson("testcorpus")
candidates = jsonhandler.candidates
unknowns = jsonhandler.unknowns
# If you want to do training:
jsonhandler.loadTraining()
for cand in candidates:
    for fname in jsonhandler.trainings[cand]:
        # Get the content of training file 'fname' of candidate 'cand' as a string with:
        # jsonhandler.getTrainingText(cand, fname)
# Create lists for your answers (and scores)
authors = []
scores = []
# Get parameters from the meta json file:
l = jsonhandler.language
e = jsonhandler.encoding
for fname in unknowns:
    # Get the content of unknown file 'fname' as a string with:
    # jsonhandler.getUnknownText(fname)
    # Determine the author of the file, and a score (optional)
    author = "oneAuthor"
    score = 0.5
    authors.append(author)
    scores.append(score)
# Save the results to answers.json in the corpus directory (passing 'scores' is optional)
jsonhandler.storeJson(jsonhandler.corpusdir, unknowns, authors, scores)
# If you want to evaluate against the ground-truth file:
jsonhandler.loadGroundTruth()
# The true author of document unknowns[i] is trueAuthors[i].
'''
import os
import json
import codecs
META_FNAME = "meta-file.json"
OUT_FNAME = "answers.json"
GT_FNAME = "ground-truth.json"
# always run this method first to evaluate the meta json file. Pass the
# directory of the corpus (where meta-file.json is situated)
def loadJson(corpus):
    global corpusdir, upath, candidates, unknowns, encoding, language
    corpusdir += corpus
    mfile = open(os.path.join(corpusdir, META_FNAME), "r")
    metajson = json.load(mfile)
    mfile.close()
    upath += os.path.join(corpusdir, metajson["folder"])
    encoding += metajson["encoding"]
    language += metajson["language"]
    candidates += [author["author-name"]
                   for author in metajson["candidate-authors"]]
    unknowns += [text["unknown-text"] for text in metajson["unknown-texts"]]
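# A sketch of the meta file that loadJson() expects, derived from the keys accessed
# above; the values shown are illustrative only:
#
#   {
#       "folder": "unknown",
#       "encoding": "UTF-8",
#       "language": "EN",
#       "candidate-authors": [{"author-name": "candidate00001"}, ...],
#       "unknown-texts": [{"unknown-text": "unknown00001.txt"}, ...]
#   }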
# run this method next, if you want to do training (read training files etc)
def loadTraining():
    for cand in candidates:
        trainings[cand] = []
        for subdir, dirs, files in os.walk(os.path.join(corpusdir, cand)):
            for doc in files:
                trainings[cand].append(doc)
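# After loadTraining(), 'trainings' maps each candidate name to the file names found
# in that candidate's subdirectory of the corpus, e.g. (illustrative values only):
#
#   {"candidate00001": ["known01.txt", "known02.txt"], "candidate00002": [...], ...}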
# get training text 'fname' from candidate 'cand' (obtain values from
# 'trainings', see example above)
def getTrainingText(cand, fname):
    dfile = codecs.open(os.path.join(corpusdir, cand, fname), "r", "utf-8")
    s = dfile.read()
    dfile.close()
    return s
# get training file as bytearray
def getTrainingBytes(cand, fname):
    dfile = open(os.path.join(corpusdir, cand, fname), "rb")
    b = bytearray(dfile.read())
    dfile.close()
    return b
# get unknown text 'fname' (obtain values from 'unknowns', see example above)
def getUnknownText(fname):
    dfile = codecs.open(os.path.join(upath, fname), "r", "utf-8")
    s = dfile.read()
    dfile.close()
    return s
# get unknown file as bytearray
def getUnknownBytes(fname):
    dfile = open(os.path.join(upath, fname), "rb")
    b = bytearray(dfile.read())
    dfile.close()
    return b
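# Sketch: the getters above are typically combined with loadTraining() to build an
# in-memory training corpus ('texts_by_author' is a hypothetical local name, not part
# of this module):
#
#   texts_by_author = {}
#   for cand in candidates:
#       texts_by_author[cand] = [getTrainingText(cand, f) for f in trainings[cand]]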
# run this method in the end to store the output in the 'path' directory as OUT_FNAME
# pass a list of filenames (you can use 'unknowns'), a list of your
# predicted authors and optionally a list of the scores (both must of
# course be in the same order as the 'texts' list)
def storeJson(path, texts, cands, scores=None):
    answers = []
    if scores is None:
        scores = [1 for text in texts]
    for i in range(len(texts)):
        answers.append(
            {"unknown_text": texts[i], "author": cands[i], "score": scores[i]})
    f = open(os.path.join(path, OUT_FNAME), "w")
    json.dump({"answers": answers}, f, indent=2)
    f.close()
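# The answers file written above has the following shape, with one entry per element
# of 'texts' (values illustrative only):
#
#   {"answers": [{"unknown_text": "unknown00001.txt",
#                 "author": "candidate00002",
#                 "score": 0.5}, ...]}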
# if you want to evaluate your answers using the ground-truth.json, load
# the true authors in 'trueAuthors' using this function
def loadGroundTruth():
    global trueAuthors
    tfile = open(os.path.join(corpusdir, GT_FNAME), "r")
    tjson = json.load(tfile)
    tfile.close()
    for i in range(len(tjson["ground-truth"])):
        trueAuthors.append(tjson["ground-truth"][i]["true-author"])
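# ground-truth.json is expected to contain a "ground-truth" list in the same order as
# 'unknowns'; the only key this module reads is "true-author". A sketch (the values and
# the "unknown-text" key are illustrative):
#
#   {"ground-truth": [{"unknown-text": "unknown00001.txt",
#                      "true-author": "candidate00002"}, ...]}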
# initialization of global variables
encoding = ""
language = ""
corpusdir = ""
upath = ""
candidates = []
unknowns = []
trainings = {}
trueAuthors = []
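# Minimal end-to-end sketch of how this module is typically driven. It assumes a
# corpus directory "testcorpus" (containing meta-file.json and ground-truth.json)
# next to this script, and it uses a trivial "always guess the first candidate"
# baseline purely for illustration - it is not part of the module's API.
if __name__ == "__main__":
    loadJson("testcorpus")
    loadTraining()
    authors = []
    scores = []
    for fname in unknowns:
        # A real attribution method would inspect getUnknownText(fname) here;
        # this baseline simply predicts the first candidate for every document.
        authors.append(candidates[0])
        scores.append(1.0)
    storeJson(corpusdir, unknowns, authors, scores)
    loadGroundTruth()
    correct = sum(1 for a, t in zip(authors, trueAuthors) if a == t)
    print("correctly attributed %d of %d unknown texts" % (correct, len(unknowns)))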