-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path4_srm_import.py
64 lines (59 loc) · 2.51 KB
/
4_srm_import.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from pathlib import Path
import json
import io
#import simplejson as json
def _read_paper_text(base_name):
    """Return the text stored in ``data/txts/<base_name>.txt``.

    Reading is best-effort: if the file cannot be opened or decoded, a
    ``<... not found>`` placeholder string is returned instead, so a single
    missing text file does not abort the whole import.
    """
    try:
        # Explicit utf8, matching how the .jl files are read, so the result
        # does not depend on the platform's default encoding.
        with io.open('data/txts/{}.txt'.format(base_name), 'r',
                     encoding='utf8') as txt_file:
            return txt_file.read()
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: undecodable bytes
        # (UnicodeDecodeError) or a path string that open() rejects.
        return '<data/txts/{}.txt not found>'.format(base_name)


def _paper_from_record(record):
    """Map one parsed ``.jl`` record onto the flat dict used in the output.

    Raises:
        KeyError: if ``name``, ``paperType``, ``reference`` or ``web`` is
            missing, or if ``mainFile`` is present without ``fileName``.
    """
    if 'mainFile' in record:
        # NOTE(review): [:-4] assumes the file name ends in a dot plus a
        # three-character extension (e.g. ".pdf") -- confirm against the data.
        content = _read_paper_text(record['mainFile']['fileName'][:-4])
    else:
        content = "<empty>"

    url = record['web']
    # Progress output: one line per imported record.
    print("url ", url)

    return {
        # The record's own body/resolution fields are not used; the output
        # carries a fixed placeholder and None respectively.
        "body": "Leipzig",
        "content": content,
        "name": record['name'],
        "resolution": None,
        "originator": record.get('leipzig:originator', 'Unbekannt'),
        "paper_type": record['paperType'],
        # Prefer 'date', then 'created', then a fixed epoch fallback.
        "published_at": record.get('date',
                                   record.get('created', '1970-01-01')),
        "reference": record['reference'],
        "url": url,
    }


def read_jls_and_txts_into_json():
    """Collect paper records from every ``*.jl`` file below ``data/``.

    Each non-blank line of a ``.jl`` file is parsed as one JSON object and
    converted into a flat dict. When the record has a ``mainFile`` entry, the
    matching text from ``data/txts/`` is attached as ``content``. Every line
    of every file is imported; there is no per-file limit.

    Returns:
        list: one dict per record with the keys ``body``, ``content``,
        ``name``, ``resolution``, ``originator``, ``paper_type``,
        ``published_at``, ``reference`` and ``url``.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the required fields.
    """
    papers = []
    # Path.glob yields files in an OS-dependent order; sorting makes the
    # resulting list (and the JSON written from it) reproducible.
    for jl_path in sorted(Path("data/").glob("**/*.jl")):
        with io.open(jl_path, 'r', encoding='utf8') as jl_file:
            for line in jl_file:
                # A blank line (e.g. a trailing newline) is not a record.
                if not line.strip():
                    continue
                papers.append(_paper_from_record(json.loads(line)))
    return papers
def write_to_json():
    """Write all collected paper records to ``input.json`` as a JSON array.

    The records are gathered before the output file is opened. Opening with
    mode ``'w'`` truncates the file immediately, so collecting first means a
    failure while reading the source data leaves an existing ``input.json``
    untouched instead of emptied.
    """
    papers = read_jls_and_txts_into_json()
    with io.open('input.json', 'w', encoding='utf8') as json_file:
        json.dump(papers, json_file)
# Script entry point: executing this file runs the import and writes input.json.
# There is no `__main__` guard, so importing the module triggers the same work.
write_to_json()