Skip to content

Commit 7e1b08b

Browse files
author
7048730
committed
upd extract
- add -b | --basedir flag to set the starting-point directory for relative path output - update readme - update exception handling for doi lookup
1 parent 3f5d7fe commit 7e1b08b

File tree

3 files changed

+27
-15
lines changed

3 files changed

+27
-15
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ Explanation of the switches:
9797
+ `-m` : option to additionally enable individual output of all processed files.
9898
+ `-xml` : option to change output format from json (default) to xml.
9999
+ `-ercid` <ERC_ID>: option to provide an ERC identifier.
100+
+ `-b` <BASE_DIR>: option to provide the base directory that paths in the output are made relative to.
100101

101102
### Supported files and formats for the metadata extraction process:
102103

extract/metaextract.py

+23-13
Original file line numberDiff line numberDiff line change
@@ -62,20 +62,23 @@ def get_doi_http(md_title, md_author):
6262
status_note('requesting doi via crossref.org ...')
6363
my_params = {'query.title': md_title, 'query.author': md_author}
6464
r = requests.get('https://api.crossref.org/works', params=my_params, timeout=20)
65-
#status_note('debug: <get_doi_http> GET ' + r.url)
6665
status_note(' '.join((str(r.status_code), r.reason)))
67-
if 'message' in r.json():
68-
if 'items' in r.json()['message']:
69-
if type(r.json()['message']['items']) is list:
70-
# take first hit, best match
71-
if 'DOI' in r.json()['message']['items'][0]:
72-
return r.json()['message']['items'][0]['DOI']
66+
if r is not None:
67+
status_note('debug: <get_doi_http> GET ' + r.url)
68+
if 'message' in r.json():
69+
if 'items' in r.json()['message']:
70+
if type(r.json()['message']['items']) is list:
71+
# take first hit, best match
72+
if 'DOI' in r.json()['message']['items'][0]:
73+
return r.json()['message']['items'][0]['DOI']
7374
except requests.exceptions.Timeout:
7475
status_note('http doi request: timeout')
7576
except requests.exceptions.TooManyRedirects:
7677
status_note('http doi request: too many redirects')
7778
except requests.exceptions.RequestException as e:
7879
status_note('http doi request: ' + str(e))
80+
except:
81+
status_note('! error while requesting doi')
7982

8083

8184
def get_orcid_http(txt_input, bln_sandbox):
@@ -150,6 +153,11 @@ def get_r_package_class(package):
150153
#raise
151154
status_note(''.join(('! error while classifying r package:', str(exc.problem_mark), str(exc.problem))))
152155

156+
def get_rel_path(input_path):
    """Return ``input_path`` relative to the base directory, for output and display.

    The base directory is the module global ``basedir``, which ``start()``
    assigns from its ``b`` keyword argument. When that global has not been
    set yet, or is ``None``/empty, the current directory is used instead.

    :param input_path: path of a file as found during extraction
    :return: the relative path, always written with forward slashes; if no
        relative path can be computed, ``input_path`` itself with forward
        slashes
    """
    # 'basedir' only exists after start() has run; look it up defensively so
    # that this helper does not raise NameError when it is used earlier.
    start_dir = globals().get('basedir') or os.curdir
    try:
        output_path = os.path.relpath(input_path, start_dir)
    except ValueError:
        # relpath cannot relate the two paths (e.g. different drives on
        # Windows); this value is only used for display, so keep the input.
        output_path = input_path
    return output_path.replace('\\', '/')
160+
153161

154162
def parse_bagitfile(file_path):
155163
txt_dict = {'bagittxt_file': file_path}
@@ -213,7 +221,7 @@ def parse_spatial(file_id, filepath, fformat):
213221
if 'files' not in CANDIDATES_MD_DICT[file_id]['spatial']:
214222
key_files = {'files': []}
215223
CANDIDATES_MD_DICT[file_id]['spatial'] = key_files
216-
new_file_key['source_file'] = filepath
224+
new_file_key['source_file'] = get_rel_path(filepath)
217225
new_file_key['geojson'] = {}
218226
if coords is not None:
219227
new_file_key['geojson']['bbox'] = coords.bounds
@@ -423,7 +431,7 @@ def extract_from_candidate(file_id, path_file, out_format, out_mode, multiline,
423431
if s:
424432
md_filepath = s.group(1)
425433
else:
426-
md_filepath = path_file
434+
md_filepath = get_rel_path(path_file)
427435
md_record_date = datetime.datetime.today().strftime('%Y-%m-%d')
428436
data_dict = {'file': {'filename': md_file, 'filepath': md_filepath, 'mimetype': md_mime_type},
429437
'ercIdentifier': md_erc_id,
@@ -569,6 +577,8 @@ def start(**kwargs):
569577
input_dir = kwargs.get('i', None)
570578
global md_erc_id
571579
md_erc_id = kwargs.get('e', None)
580+
global basedir
581+
basedir = kwargs.get('b', None)
572582
global stay_offline
573583
stay_offline = kwargs.get('xo', None)
574584
global metafiles_all
@@ -693,7 +703,7 @@ def start(**kwargs):
693703
# give it a number
694704
new_id = str(uuid.uuid4())
695705
if os.path.isfile(full_file_path) and full_file_path not in file_list_input_candidates:
696-
file_list_input_candidates.append(full_file_path)
706+
file_list_input_candidates.append(get_rel_path(full_file_path))
697707
if nr < 50:
698708
# use buffering to prevent performance issues when parsing very large numbers of files
699709
log_buffer = False
@@ -718,12 +728,12 @@ def start(**kwargs):
718728
CANDIDATES_MD_DICT[new_id][bagit_txt_file] = (parse_bagitfile(full_file_path))
719729
elif file_extension == '.r':
720730
extract_from_candidate(new_id, full_file_path, output_format, output_mode, False, rule_set_r)
721-
MASTER_MD_DICT['codefiles'].append(full_file_path)
731+
MASTER_MD_DICT['codefiles'].append(get_rel_path(full_file_path))
722732
elif file_extension == '.rmd':
723733
extract_from_candidate(new_id, full_file_path, output_format, output_mode, True, rule_set_rmd_multiline)
724734
parse_temporal(new_id, full_file_path, None, None)
725735
elif file_extension == '.html':
726-
MASTER_MD_DICT['viewfile'].append(full_file_path)
736+
MASTER_MD_DICT['viewfile'].append(get_rel_path(full_file_path))
727737
else:
728738
parse_spatial(new_id, full_file_path, file_extension)
729739
status_note(''.join((str(nr), ' files processed')))
@@ -788,4 +798,4 @@ def start(**kwargs):
788798
output_extraction(MASTER_MD_DICT, output_format, output_mode, os.path.join(output_dir, main_metadata_filename))
789799
get_ercspec_http(output_dir)
790800
# Write erc.yml according to ERC spec:
791-
ercyml_write(output_dir)
801+
#ercyml_write(output_dir)

o2rmeta.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ def status_note(msg, **kwargs):
5555
group.add_argument('-s', '--outputtostdout', help='output the result of the extraction to stdout',
5656
action='store_true', default=False)
5757
extractor.add_argument('-e', '--ercid', type=str, help='erc identifier', required=False)
58+
extractor.add_argument('-b', '--basedir', type=str, help='base directory for relative paths', required=False)
5859
extractor.add_argument('-xml', '--modexml', help='output xml', action='store_true', default=False, required=False)
5960
extractor.add_argument('-xo', '--stayoffline', help='skip all http requests', action='store_true', default=False,
6061
required=False)
@@ -84,7 +85,7 @@ def status_note(msg, **kwargs):
8485
# - - - - - - - - - - - - - - - - - -
8586
args = parser.parse_args()
8687
argsd = vars(args)
87-
my_version = 31 # update me!
88+
my_version = 32 # update me!
8889
my_mod = ''
8990
try:
9091
my_mod = datetime.datetime.fromtimestamp(os.stat(__file__).st_mtime)
@@ -95,7 +96,7 @@ def status_note(msg, **kwargs):
9596
try:
9697
if argsd['tool'] == "extract":
9798
status_note('launching extractor')
98-
metaextract.start(i=argsd['inputdir'], o=argsd['outputdir'], s=argsd['outputtostdout'], xo=argsd['stayoffline'], e=argsd['ercid'], m=argsd['metafiles'], xml=argsd['modexml'])
99+
metaextract.start(i=argsd['inputdir'], o=argsd['outputdir'], s=argsd['outputtostdout'], xo=argsd['stayoffline'], e=argsd['ercid'], b=argsd['basedir'], m=argsd['metafiles'], xml=argsd['modexml'])
99100
elif argsd['tool'] == "broker":
100101
status_note('launching broker')
101102
metabroker.start(c=argsd['check'], m=argsd['map'], i=argsd['inputfile'], o=argsd['outputdir'], s=argsd['outputtostdout'])

0 commit comments

Comments
 (0)