Skip to content

Commit 74fa94e

Browse files
author
7048730
committed
upd harvester
- minimal demo - minor fixes
1 parent c7426fb commit 74fa94e

File tree

5 files changed

+71
-65
lines changed

5 files changed

+71
-65
lines changed

README.md

+12-1
Original file line numberDiff line numberDiff line change
@@ -177,5 +177,16 @@ Explanation of the switches:
177177
# (4) Harvester tool:
178178

179179
Collects OAI-PMH metadata from catalogues, data registries and repositories and parses it to assist in completing a metadata set such as the one used in o2r.
180+
_Note that this tool is currently only a demo._
180181

181-
_TBD_
182+
183+
python o2rmeta.py harvest -e <ELEMENT> -q <QUERY>
184+
185+
Example call:
186+
187+
python o2rmeta.py -debug harvest -e"doi" -q"10.14457/CU.THE.1989.1"
188+
189+
Explanation of the switches:
190+
191+
+ `-e` <ELEMENT> : metadata element type to search by, e.g. _doi_ or _creator_
192+
+ `-q` <QUERY> : query string, i.e. the metadata value to search for, e.g. _10.14457/CU.THE.1989.1_

broker/mappings/eudat_b2share-map.json

+2-1
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,9 @@
3333
},
3434
"open_access": {
3535
"translatesTo": "open_access",
36+
"type": "boolean",
3637
"hasParent": "root",
3738
"needsParent": "root"
3839
}
3940
}
40-
}
41+
}

harvest/bases.json

+14
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,20 @@
5555
"isSubscriberOnly": false,
5656
"schema": ""
5757
},
58+
"oadoi": {
59+
"url": "\\url{https://api.oadoi.org/}",
60+
"default_parameter": "",
61+
"isActive": true,
62+
"isSubscriberOnly": false,
63+
"schema": ""
64+
},
65+
"Orcid": {
66+
"url": "\\url{https://pub.sandbox.orcid.org/v2.0/search?q=}",
67+
"default_parameter": "",
68+
"isActive": true,
69+
"isSubscriberOnly": false,
70+
"schema": ""
71+
},
5872
"OpenAIRE": {
5973
"url": "\\url{http://api.openaire.eu/oai_pmh}",
6074
"default_parameter": "",

harvest/metaharvest.py

+37-59
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,20 @@
1-
'''
2-
Copyright (c) 2016 - o2r project
3-
4-
Licensed under the Apache License, Version 2.0 (the "License");
5-
you may not use this file except in compliance with the License.
6-
You may obtain a copy of the License at
7-
8-
http://www.apache.org/licenses/LICENSE-2.0
9-
10-
Unless required by applicable law or agreed to in writing, software
11-
distributed under the License is distributed on an "AS IS" BASIS,
12-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13-
See the License for the specific language governing permissions and
14-
limitations under the License.
15-
16-
'''
17-
18-
import argparse
191
import base64
202
import json
21-
import sys
223
import urllib.request
234

245
from lxml import etree
256

267

278
def parse_from_unicode(unicode_str):
    """Parse an XML document held in a str and return its lxml root element.

    The text is encoded to UTF-8 bytes and handed to lxml together with a
    parser that is explicitly configured for UTF-8, so the parser does not
    rely on whatever encoding the document itself declares.

    Args:
        unicode_str: the XML document as text.

    Returns:
        The root element produced by ``etree.fromstring``.
    """
    encoded = unicode_str.encode('utf-8')
    return etree.fromstring(encoded, parser=etree.XMLParser(encoding='utf-8'))
3012

3113
def qu(q_type,q_string,q_base):
3214
# check if json url has been parsed correctly
3315
if q_base.startswith('\\url{'):
3416
q_base = q_base[5:].replace('}', '')
35-
accepted = ['doi','creator'] #<-- generify
17+
accepted = ['doi', 'creator']
3618
if q_type in accepted:
3719
# datacite solr query encoding ideolect:
3820
my_query = str.encode('q='+q_type+'%3A'+q_string)
@@ -47,43 +29,39 @@ def qu(q_type,q_string,q_base):
4729
print('query type not available')
4830
return None
4931

32+
33+
def status_note(msg):
    """Print *msg* to stdout behind the harvester's log tag.

    Args:
        msg: anything; it is converted with ``str()`` before printing.
    """
    prefix = '[o2rmeta][harvest] '
    print(prefix + str(msg))
35+
36+
5037
# main:
51-
if __name__ == "__main__":
52-
if sys.version_info[0] < 3:
53-
# py2
54-
print('requires py3k or later')
55-
sys.exit()
56-
else:
57-
parser = argparse.ArgumentParser(description='description')
58-
parser.add_argument('-i', '--input', help='type of provided metadata element for input, e.g. doi or creator', required=True)
59-
parser.add_argument('-q', '--query', help='query string', required=True)
60-
args = parser.parse_args()
61-
argsdict = vars(args)
62-
i = argsdict['input']
63-
q = argsdict['query']
64-
try:
65-
with open('bases.json', encoding='utf-8') as data_file:
66-
bases = json.load(data_file)
67-
settings_data = bases['Settings']
68-
baseurl_data = bases['BaseURLs']
69-
except:
70-
raise
71-
print('[metaharvest] starting request')
72-
#test datacite
73-
my_base = 'DataCite' #<-- make this parsed arg
74-
try:
75-
result = qu(i.lower(), q, baseurl_data[my_base]['url'] + baseurl_data[my_base]['default_parameter'])
76-
print('[metaharvest] !debug using ' + result[:128] + ' ...')
77-
utf8_parser = etree.XMLParser(encoding='utf-8')
78-
tree = parse_from_unicode(result)
79-
# e.g. return author from datacite creatorName
80-
output = {}
81-
for node in tree.xpath('//ns:creatorName', namespaces={'ns': 'http://datacite.org/schema/kernel-3'}):
82-
output['authorName'] = node.text
83-
json_output = json.dumps(output)
84-
#output if not empty
85-
if str(json_output) != '{}':
86-
print(str(json_output))
87-
except:
88-
raise
89-
#pass
38+
def start(**kwargs):
    """Run the demo harvest: query DataCite and print the creator name found.

    The endpoint table is read from ``harvest/bases.json``. That path is
    relative, so it is resolved against the current working directory. The
    hard-coded ``'DataCite'`` entry is then queried through ``qu``. If the
    response contains ``creatorName`` elements in the DataCite kernel-3
    namespace, a JSON object ``{"authorName": ...}`` is printed to stdout.

    Keyword Args:
        e: metadata element type to search by; it is lower-cased and passed
            to ``qu`` as the query type.
        q: query string passed to ``qu``.

    Returns:
        None. Results are only printed.

    Raises:
        Errors from reading or decoding ``harvest/bases.json``, from missing
        keys in it, from ``qu`` and from XML parsing propagate unchanged.
    """
    # Both values are still published as module-level globals, as before.
    # NOTE(review): nothing visible here reads them -- confirm before removing.
    global e, q
    e = kwargs.get('e', None)
    q = kwargs.get('q', None)

    with open('harvest/bases.json', encoding='utf-8') as data_file:
        bases = json.load(data_file)
    baseurl_data = bases['BaseURLs']

    status_note('new request')
    # demo datacite
    my_base = 'DataCite'
    # todo: argparse base endpoint
    endpoint = baseurl_data[my_base]
    result = qu(e.lower(), q, endpoint['url'] + endpoint['default_parameter'])
    if result is None:
        # qu() can return None (e.g. after reporting an unavailable query
        # type); slicing or parsing None would only raise a TypeError.
        status_note('no result for query')
        return

    status_note('!debug using ' + result[:256] + ' ...')
    tree = parse_from_unicode(result)
    # e.g. return author from datacite creatorName
    output = {}
    creator_nodes = tree.xpath(
        '//ns:creatorName',
        namespaces={'ns': 'http://datacite.org/schema/kernel-3'})
    for node in creator_nodes:
        # Each match overwrites the previous one, so the last creatorName
        # in document order is the one reported.
        output['authorName'] = node.text
    # output if not empty
    if output:
        print(json.dumps(output))

o2rmeta.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -74,15 +74,17 @@ def status_note(msg, **kwargs):
7474
group.add_argument('-s', '--outputtostdout', help='output the result of the brokering to stdout',
7575
action='store_true', default=False)
7676
# - - - - - - - - - - - - - - - - - -
77+
harvester = subparsers.add_parser("harvest")
78+
harvester.add_argument('-e', '--element', type=str, help='element type, e.g. doi or creator',required=True)
79+
harvester.add_argument('-q', '--query', type=str, help='query string', required=True)
80+
# - - - - - - - - - - - - - - - - - -
7781
validator = subparsers.add_parser("validate")
7882
validator.add_argument("-s", "--schema", type=str, required=True)
7983
validator.add_argument("-c", "--candidate", type=str, required=True)
8084
# - - - - - - - - - - - - - - - - - -
81-
# harvester
82-
# - - - - - - - - - - - - - - - - - -
8385
args = parser.parse_args()
8486
argsd = vars(args)
85-
my_version = 23 # update me!
87+
my_version = 24 # update me!
8688
my_mod = ''
8789
try:
8890
my_mod = datetime.datetime.fromtimestamp(os.stat(__file__).st_mtime)
@@ -102,7 +104,7 @@ def status_note(msg, **kwargs):
102104
metavalidate.start(s=argsd['schema'], c=argsd['candidate'])
103105
elif argsd['tool'] == "harvest":
104106
status_note('launching harvester')
105-
print('TBD') # todo
107+
metaharvest.start(e=argsd['element'], q=argsd['query'])
106108
else:
107109
pass
108110
except:

0 commit comments

Comments
 (0)