-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbrowse_utilities.py
352 lines (327 loc) · 13.8 KB
/
browse_utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
import datetime
import re
import urllib2
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
def cdscat2browse_request(bibcode, browse_request_file, browse_table_name='', observatory='', master_catalog='',
                          table_type='', freq_regime='', server='cdsarc.u-strasbg.fr'):
    """
    From a CDS data table, use the CDS ReadMe file to create a request for creation of a Browse table.

    Sample CDS ReadMe file: ftp://cdsarc.u-strasbg.fr/pub/cats/J/ApJS/208/17/ReadMe

    The ReadMe is downloaded from ``server``, every table that has a
    "Byte-by-byte Description of file:" section is listed, and the user is prompted on stdin
    for the number of the table to use.  The request is printed and written to
    ``browse_request_file``.  If the ReadMe cannot be retrieved (or contains no such section)
    a message is printed and the function returns without writing anything.

    :param bibcode: string of the form YYYYJournal..volume...page, for example '2013ApJS..208...17A'
    :param browse_request_file: name of the output browse request file
    :param browse_table_name: written as the 'New Database Name' entry
    :param observatory: written as the 'Observatory' entry
    :param master_catalog: written as the 'Master Catalog' entry
    :param table_type: written as the 'Table Type' entry
    :param freq_regime: written as the 'Frequency Regime' entry
    :param server: host name of the FTP server holding the CDS catalogs
    :return: None
    """
    # '2013ApJS..208...17A'.split('..') -> ['2013ApJS', '208', '.17A']
    b = bibcode.split('..')
    yr = b[0][0:4]
    journal = b[0][4:]
    vol = b[1]
    pg = b[2][1:-1]  # drop the leftover leading '.' and the trailing suffix letter
    pgsuffix = b[2][-1]
    rurl = 'ftp://{3}/pub/cats/J/{0}/{1}/{2}/ReadMe'.format(journal, vol, pg, server)
    print('Retrieving {0}'.format(rurl))
    try:
        freadme = urllib2.urlopen(rurl)
    except Exception as errmsg:
        print('Error reading {0}'.format(rurl))
        print('Error message: "{0}"'.format(errmsg))
        print('Returning')
        return
    try:
        readme = freadme.readlines()
    finally:
        # release the connection even if the transfer fails part way through
        freadme.close()
    if not readme:
        print('Error reading {0}'.format(rurl))
        print('Error message: "{0}"'.format('ReadMe file is empty'))
        print('Returning')
        return
    #
    # find the tables in the CDS ReadMe file
    # they can be found in the lines beginning with "Byte-by-byte Description of file:"
    #
    catindx = []
    catname = []
    for i, r in enumerate(readme):
        if 'Byte-by-byte Description of file:' in r:
            catindx.append(i)
            catname.append(r.split('file:')[1].strip())
    if not catname:
        print('No "Byte-by-byte Description of file:" sections found in {0}'.format(rurl))
        print('Returning')
        return
    # each section is taken to end on the line before the next section starts
    catendindx = [x - 1 for x in catindx[1:]]
    catendindx.append(len(readme) - 1)
    # generate table author name from the last column of the first ReadMe line;
    # columns are split on runs of 2+ spaces so that a value such as "(Abdo+, 2013)"
    # is not cut at its internal single space
    columns = re.compile(r'\s{2,}')
    auth = columns.split(readme[0].strip())[-1].strip('(').strip(')').replace('+', ' et al.')
    #
    # print available catalogs with description and let user select one
    #
    catdes = []
    print("Available catalogs")
    for i, c in enumerate(catname):
        # the description is the last column of the first line that starts with the file
        # name; exactly one entry is stored per catalog so catdes stays aligned with catname
        d = ''
        for r in readme:
            if r.startswith(c):
                d = columns.split(r.strip())[-1]
                break
        catdes.append(d)
        print("#{0} {1} {2}".format(i, c.replace('.dat', ''), d))
    catselindx = int(raw_input('Enter number of selected catalog >'))
    delim = '--------------------------------------------------------------------------------'
    # byte column: either "start- end" or a single byte number for one-character fields
    p1 = re.compile(r'\s*(?:\d+\s*-\s*)?\d+\s*')
    dbytes = []
    dformat = []
    dunit = []
    dname = []
    dhname = []
    dcomment = []
    last_line_parsed = False
    # the column descriptions start 4 lines after the section title (rule, header, rule)
    for r in readme[catindx[catselindx] + 4:catendindx[catselindx] - 1]:
        if delim in r:
            break  # end of data reached
        if r[0:10].strip():  # if first 10 characters are not spaces process line
            m = p1.match(r)
            fields = r[m.end():].split() if m else []
            if len(fields) < 3:
                # not a column description; also ignore its continuation lines so they
                # are not attached to the previous column
                last_line_parsed = False
                continue
            name = fields[2]
            dbytes.append(m.group().strip())
            dformat.append(fields[0])
            dunit.append(fields[1])
            dname.append(name)
            if name[:2].lower() == 'e_':
                # if name begins with an "e_" or "E_", assume it's an error value
                name = "{0}_error".format(name[2:])
            if name == 'RAdeg':
                name = 'RA'  # assume this is right ascension
            if name == 'DEdeg':
                name = 'Dec'  # assume this is Dec
            dhname.append(name)
            dcomment.append(' '.join(fields[3:]))
            last_line_parsed = True
        elif last_line_parsed:
            # first 10 characters are blank so append as continuation of the last comment
            dcomment[-1] = "{0} {1}".format(dcomment[-1], r.strip()).strip()
    #
    # Build output
    #
    # NOTE(review): 'Catalog Bibcode' is rebuilt with single dots (e.g. '2013ApJS.208.17A'),
    # which differs from the bibcode passed in - confirm this is the intended form.
    metadata = [['Request to Create New Database', str(datetime.datetime.now())],
                ['New Database Name', browse_table_name],
                ['db description', catdes[catselindx]],
                ['w3b description', catdes[catselindx]],
                ['Observatory', observatory],
                ['Master Catalog', master_catalog],
                ['Table Type', table_type],
                ['Table Author', auth],
                ['Row Type', ' '],
                ['Frequency Regime', freq_regime],
                ['Default Search Radius', 1],
                ['Line Summary Display', ''],
                ['Unique Key', ''],
                ['Catalog Bibcode', "{0}{1}.{2}.{3}{4}".format(yr, journal, vol, pg, pgsuffix)],
                ['Add Class Field', ' '],
                ['Directory', ' '],
                ['Data File', 'ReadMe'],
                ['This File', 'README']
                ]
    row_format = "{0:>10s} {1:>5s} {2:>9s} {3:>10s} {4:>15s} {5:<}"
    title = row_format.format('Bytes', 'Format', 'Units', 'Label', 'HEASARC Name', 'Comment')
    header_lines = ["{0:<30s}: {1:<}".format(m[0], m[1]) for m in metadata]
    table_lines = ["Byte-by-byte Description of file: {0}".format(catname[catselindx]), delim, title, delim]
    for row in zip(dbytes, dformat, dunit, dname, dhname, dcomment):
        table_lines.append(row_format.format(*row))
    #
    # Write output to the screen
    #
    print("\n")
    for line in header_lines:
        print(line)
    print(delim)
    print("\n")
    for line in table_lines:
        print(line)
    #
    # write to output request file
    #
    print("\n Writing to {0}".format(browse_request_file))
    with open(browse_request_file, 'w') as fout:
        fout.write("\n")
        fout.write("\n".join(header_lines) + "\n")
        fout.write(delim + "\n")
        fout.write("\n".join(table_lines) + "\n")
    return
def cds_get_ucds(vizier_url, table_index=7):
    """
    Get the UCD values listed on a vizier page for the parameters of a table

    :param vizier_url: url of the vizier page for the table, for example
        'http://vizier.cfa.harvard.edu/viz-bin/VizieR?-source=J/A+A/588/A103'
    :param table_index: position, among the <table> elements of the page, of the table
        listing the parameters (parameter table seems to be table 7 - not sure how general this is)
    :return: dictionary {cds parameter name: ucd}.  Rows with fewer than 3 cells are stored
        under the key "Not Found"; parameters for which no UCD could be extracted get the
        value 'Not Found in <parameter name>'
    """
    req = requests.get(vizier_url)
    soup = BeautifulSoup(req.text, 'lxml')
    tables = soup.find_all('table')
    table = tables[table_index]
    rows = table.find_all('tr')
    # this pattern matches an open parenthesis, combinations of letters, periods, semicolons,
    # numbers and dashes and a closing parenthesis
    pat = re.compile(r'\([a-z,A-Z]*\.{1}[a-z,;,.,A-Z,-]*[0-9]?\)')
    ucd_dict = dict()
    for r in rows:
        col = r.find_all('td')
        try:
            cdspar = col[2].text.strip()
        except IndexError:
            # row has fewer than 3 cells
            cdspar = "Not Found"
        try:
            cdsucd = pat.search(col[4].text).group().split(') (')[0].strip('(').strip(')')
        except (IndexError, AttributeError):
            # IndexError: row has fewer than 5 cells; AttributeError: no UCD pattern in the cell
            cdsucd = 'Not Found in {0}'.format(cdspar)
        ucd_dict[cdspar] = cdsucd
    return ucd_dict
def README_to_dict(readmefile, header=0, footer=0):
    """
    Reads the browse README file, then returns a dictionary of the browse parameter names
    and the corresponding cds parameter names, i.e.
    readme_dict={'BROWSE_PARNAME1':'CDS_PARNAME1', ...}

    Each line is split on whitespace after removing a leading "<integer>-" (the start of a
    byte range), if present.  Column 3 (0-based) is taken as the cds parameter name and
    column 4 as the browse parameter name.  A line with too few columns contributes the
    placeholder 'cds_par_smissing' and/or 'browse_par_missing'.

    :param readmefile: BROWSE readme file
    :param header: number of header lines to skip
    :param footer: number of footer lines to skip
    :return: dictionary keyed by browse parameter name
    """
    with open(readmefile, 'r') as f:
        lines = f.readlines()
    numlines = len(lines)
    # the integer and dash which begin some (not all) of the lines; anchored at the start of
    # the line so that a digit-dash later on (e.g. "2-10 keV" in a comment) is left alone
    byte_start = re.compile(r'^\s*\d+-')
    whitespace = re.compile(r'\s+')
    readme_dict = dict()
    for l in lines[header:numlines - footer]:
        x = byte_start.sub('', l, count=1)
        # elements 3 and 4 are the parameter names
        cols = whitespace.split(x.strip())
        cdspar = cols[3].strip() if len(cols) > 3 else 'cds_par_smissing'
        browsepar = cols[4].strip() if len(cols) > 4 else 'browse_par_missing'
        readme_dict[browsepar] = cdspar
    return readme_dict
def strip(text):
    """
    Return text with leading and trailing whitespace removed; a value without a
    strip() method is returned unchanged
    """
    try:
        cleaned = text.strip()
    except AttributeError:
        # not string-like, hand it back as is
        return text
    else:
        return cleaned
def update_ucdfile(vizier_url, UCDfile, README, header=0, footer=0, UpdateAll=False):
    """
    Updates the initial UCD file produced when ingesting a table into BROWSE based on the
    UCDs assigned by the CDS

    :param vizier_url: the vizier URL of the original table
    :param UCDfile: the name of the initial ucd file ("|"-separated columns: browse table,
        browse parameter, ucd)
    :param README: the name of the BROWSE README file used to request the table ingest
    :param header: header lines in BROWSE README file to skip
    :param footer: footer lines in BROWSE README file to skip
    :param UpdateAll: if False, updates UCDs only for the parameters that are missing UCDs;
        if True, every parameter found in the README is overwritten
    :return: dataframe with columns 'browse_table', 'browse_parameter' and 'ucd' holding the
        updated ucds (pass it to write_ucdfile to create the new ucd file)
    """
    # read the initial ucd file as a pandas dataframe
    df_ucd = pd.read_csv(UCDfile, sep="|", names=['browse_table', 'browse_parameter', 'ucd'],
                         converters={'browse_parameter': strip, 'ucd': strip})
    # get the dictionary of browse parameters and their cds names from the README file
    rdict = README_to_dict(README, header=header, footer=footer)
    # get ucds from the vizier table
    ucd_dict = cds_get_ucds(vizier_url)
    # update ucds in the dataframe; cells are addressed by position with .iat because
    # assigning through df.iloc[i]['ucd'] may only modify a temporary copy of the row
    ucd_col = df_ucd.columns.get_loc('ucd')
    for i, bp in enumerate(df_ucd['browse_parameter']):
        # get cds parameter corresponding to browse parameter
        try:
            cdspar = rdict[bp]
        except KeyError:
            print("Browse parameter {0} not found in README dictionary".format(bp))
            continue
        try:
            ucd = ucd_dict[cdspar]
        except KeyError:
            print("CDS parameter {0} not found in UCD dictionary".format(cdspar))
            ucd = ' '
        # unless UpdateAll is set, only update ucds that are missing
        if UpdateAll or not df_ucd.iat[i, ucd_col].strip():
            df_ucd.iat[i, ucd_col] = ucd.strip()
    return df_ucd
def write_ucdfile(ucdfile, ucd_dataframe):
    """
    Writes out the ucd file, one "table|parameter |ucd" line per dataframe row, with the
    parameter name padded to 24 characters

    :param ucdfile: path and name of the output ucd file
    :param ucd_dataframe: dataframe containing the ucd for each parameter, with columns
        'browse_table', 'browse_parameter' and 'ucd'
    :return:
    """
    rows = zip(ucd_dataframe['browse_table'], ucd_dataframe['browse_parameter'], ucd_dataframe['ucd'])
    # the with block closes the file even if formatting a row fails
    with open(ucdfile, mode='w') as fout:
        for tn, parameter, ucd in rows:
            fout.write("{0}|{1:24s} |{2}\n".format(tn, parameter, ucd))
    return
def test_cdscat2browse():
    """
    Manual check of cdscat2browse_request using the bibcode '2013ApJS..208...17A';
    needs network access and prompts for the catalog number
    """
    # alternative bibcode tried previously: '2015MNRAS..448...3766K'
    request_file = ('/Users/corcoran/program/HEASARC/missions/Fermi/Browse_Tables/'
                    '2PC_auxiliary_files_v04/CDS/browse_request_test.txt')
    cdscat2browse_request('2013ApJS..208...17A',
                          request_file,
                          browse_table_name='LAT2PSC',
                          observatory='Fermi',
                          master_catalog='X-ray',
                          table_type='Object',
                          freq_regime='gamma-ray')
def test_cds_get_ucds():
    """
    Manual check of cds_get_ucds: print the parameter names found on a sample vizier page
    """
    found = cds_get_ucds('http://vizier.cfa.harvard.edu/viz-bin/VizieR?-source=J/A+A/588/A103')
    print(found.keys())
def test_readme_to_dict():
    """
    Manual check of README_to_dict: print the browse parameter names read from the
    sample README, skipping 27 header and 12 footer lines
    """
    parameters = README_to_dict('sample_data/README', header=27, footer=12)
    print(parameters.keys())
def test_update_ucdfile():
    """
    Manual check of update_ucdfile and write_ucdfile: update the sample ucd file from the
    sample vizier page and write the result next to it with an '.update' suffix
    """
    source_ucd = 'sample_data/rass2rxs.ucd'
    # NOTE(review): test_readme_to_dict reads 'sample_data/README' while this uses
    # 'sample_data/Readme' - confirm which spelling exists on case-sensitive file systems
    updated = update_ucdfile('http://vizier.cfa.harvard.edu/viz-bin/VizieR?-source=J/A+A/588/A103',
                             source_ucd,
                             'sample_data/Readme',
                             header=27,
                             footer=12)
    write_ucdfile('sample_data/rass2rxs.ucd.update', updated)
    return
if __name__ == "__main__":
    # manual checks; uncomment the one to run
    # test_cdscat2browse()
    # test_cds_get_ucds()
    # test_readme_to_dict()
    test_update_ucdfile()