Skip to content

Commit de9c512

Browse files
author
7048730
committed
upd extract
- add rdata processing
1 parent f98e3af commit de9c512

File tree

2 files changed

+49
-27
lines changed

2 files changed

+49
-27
lines changed

extract/metaextract.py

+49-20
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import sys
2424
import urllib.request
2525
import uuid
26+
from subprocess import Popen, PIPE, STDOUT
2627
from xml.dom import minidom
2728

2829
import dicttoxml
@@ -154,12 +155,55 @@ def get_r_package_class(package):
154155
#raise
155156
status_note(''.join(('! error while classifying r package:', str(exc.problem_mark), str(exc.problem))))
156157

158+
157159
def get_rel_path(input_path):
    """Return *input_path* relative to ``basedir``, written with forward slashes.

    The result is the path used for output and display; ``basedir`` is the
    module-level value that corresponds to the --basedir flag.
    """
    joined_path = os.path.join(input_path)
    relative_path = os.path.relpath(joined_path, basedir)
    # normalise Windows separators so the displayed path is platform independent
    return relative_path.replace('\\', '/')
161163

162164

165+
def _find_r_executable(r_home):
    """Return the path of the R executable belonging to *r_home*, or None.

    *r_home* may name the executable itself or a directory; for a directory
    the executable is looked up directly inside it and in its ``bin`` folder.
    """
    r_home = r_home.replace('\\', '/')
    if os.path.isfile(r_home):
        # the variable already points at the executable
        return r_home
    for candidate in ('R.exe', 'R', 'bin/R.exe', 'bin/R'):
        candidate_path = os.path.join(r_home, candidate)
        if os.path.isfile(candidate_path):
            return candidate_path
    return None


def get_rdata(filepath):
    """Return a text preview of the objects stored in an RData file.

    R is located through the ``R_HOME`` environment variable, started with
    *filepath* as argument and asked for ``ls.str()``; the text printed after
    the echoed command is returned, truncated to 40000 characters.

    Returns None (after a status note) when the file is larger than 200 MiB,
    when ``R_HOME`` is unset or empty, when no R executable can be found, or
    when the output of R does not contain the echoed ``ls.str()`` command.

    Raises OSError if *filepath* cannot be stat'ed or R cannot be started.
    """
    # skip large files, unsuitable for text preview
    if os.stat(filepath).st_size / 1024 ** 2 > 200:
        status_note('[debug] skipping large RData file...')
        return None
    rhome_name = 'R_HOME'
    r_home = os.environ.get(rhome_name)
    if r_home is None:
        status_note(''.join(('[debug] ', rhome_name, ' env is not set...')))
        return None
    if not r_home:
        # variable exists but is empty
        status_note(''.join(('[debug] ', rhome_name, ' NULL')))
        return None
    rpath = _find_r_executable(r_home)
    if rpath is None:
        if os.path.exists(r_home):
            status_note('[debug] invalid path to R executable')
        else:
            status_note('[debug] invalid path to R installation')
        return None
    status_note('processing RData')
    p = Popen([rpath, '--vanilla', os.path.abspath(filepath)], stdout=PIPE, stdin=PIPE, stderr=STDOUT)
    raw_output = p.communicate(input=b'ls.str()')[0]
    # drop the last four characters (presumably the trailing prompt - TODO confirm)
    session_text = raw_output.decode('ISO-8859-1')[:-4]
    # the preview is whatever R printed right after echoing the command
    segments = session_text.split("> ls.str()")
    if len(segments) < 2:
        status_note('[debug] no ls.str() output found for RData file')
        return None
    return segments[1][:40000]
206+
163207
def parse_bagitfile(file_path):
164208
txt_dict = {'bagittxt_file': file_path}
165209
with open(file_path) as f:
@@ -199,23 +243,6 @@ def parse_r(input_text, parser_dict):
199243
#status_note(''.join(('! error while parsing R input: ', str(exc.args[0]))))
200244

201245

202-
def parse_rdata(filepath):
    """Record the names of the R objects stored in an RData file.

    The file is loaded through rpy2 and the string form of every loaded key
    is collected. The resulting entry (relative file path plus object names)
    is appended to ``MASTER_MD_DICT['r_rdata']`` when that key exists.
    Any error is reported with a status note instead of being raised.
    """
    try:
        # set test user:
        os.environ['R_USER'] = 'test'
        # imported lazily, so a missing rpy2 is reported like any other failure
        import rpy2.robjects as robjects
        # walk r objects stored in binary rdata file:
        my_robjs = [str(key) for key in robjects.r['load'](filepath)]
        md_rdata = {'file': get_rel_path(filepath), 'rdata': my_robjs}
        if 'r_rdata' in MASTER_MD_DICT:
            MASTER_MD_DICT['r_rdata'].append(md_rdata)
    except Exception:
        # best effort: KeyboardInterrupt and SystemExit are no longer swallowed
        status_note('debug: <parse_rdata> errored')
217-
218-
219246
def parse_spatial(filepath, fformat):
220247
try:
221248
# <side_key> is an dict key in candidates to store all spatial files as list, other than finding the best candidate of spatial file
@@ -747,15 +774,17 @@ def start(**kwargs):
747774
if file_extension == '.txt':
748775
if file.lower() == 'bagit.txt':
749776
CANDIDATES_MD_DICT[new_id] = {}
750-
CANDIDATES_MD_DICT[new_id][bagit_txt_file] = (parse_bagitfile(full_file_path))
777+
CANDIDATES_MD_DICT[new_id][bagit_txt_file] = parse_bagitfile(full_file_path)
751778
elif file_extension == '.r':
752779
extract_from_candidate(new_id, full_file_path, output_format, output_mode, False, rule_set_r)
753780
MASTER_MD_DICT['codefiles'].append(get_rel_path(full_file_path))
754781
elif file_extension == '.rmd':
755782
extract_from_candidate(new_id, full_file_path, output_format, output_mode, True, rule_set_rmd_multiline)
756783
parse_temporal(new_id, full_file_path, None, None)
757-
#elif file_extension == '.rdata':
758-
# parse_rdata(full_file_path)
784+
elif file_extension == '.rdata':
785+
MASTER_MD_DICT['r_rdata'].append({'file': file,
786+
'filepath': get_rel_path(full_file_path),
787+
'rdata_preview': get_rdata(full_file_path)})
759788
elif file_extension == '.html':
760789
MASTER_MD_DICT['viewfile'].append(get_rel_path(full_file_path))
761790
else:

o2rmeta.py

-7
Original file line numberDiff line numberDiff line change
@@ -85,13 +85,6 @@ def status_note(msg, **kwargs):
8585
# - - - - - - - - - - - - - - - - - -
8686
args = parser.parse_args()
8787
argsd = vars(args)
88-
#my_version = 32 # update me!
89-
#my_mod = ''
90-
#try:
91-
# my_mod = datetime.datetime.fromtimestamp(os.stat(__file__).st_mtime)
92-
#except OSError:
93-
# pass
94-
#status_note(''.join(('v', str(my_version), ' - ', str(my_mod))), debug=argsd['debug'])
9588
status_note(''.join(('received arguments: ', str(argsd))), debug=argsd['debug'])
9689
try:
9790
if argsd['tool'] == "extract":

0 commit comments

Comments
 (0)