From 48a18fc6dd9719035a0e2d20d662bf1804bce3c9 Mon Sep 17 00:00:00 2001 From: michaelarthuranderson Date: Sun, 25 Feb 2018 15:20:12 -0500 Subject: [PATCH 1/2] CLIMATE-316 Add ESGF Download Script to repository --- examples/esgf_integration_example.py | 58 ++++++++++------- ocw/esgf/constants.py | 2 +- ocw/esgf/download.py | 53 +++++++++------ ocw/esgf/logon.py | 16 ++--- ocw/esgf/main.py | 96 ++++++++++++++-------------- ocw/esgf/search.py | 22 ++++--- 6 files changed, 138 insertions(+), 109 deletions(-) diff --git a/examples/esgf_integration_example.py b/examples/esgf_integration_example.py index e939927d..e5412731 100644 --- a/examples/esgf_integration_example.py +++ b/examples/esgf_integration_example.py @@ -30,36 +30,46 @@ """ -import ocw.data_source.esgf as esgf -from getpass import getpass +from __future__ import print_function + import ssl import sys +from getpass import getpass + +import ocw.data_source.esgf as esgf + + +def main(): + """ + An example of using the OCW ESGF library. Connects to an ESGF + server and downloads a dataset. + """ + if hasattr(ssl, '_create_unverified_context'): + ssl._create_default_https_context = ssl._create_unverified_context + + dataset_id = 'obs4mips.CNES.AVISO.zos.mon.v20110829|esgf-data.jpl.nasa.gov' + variable = 'zosStderr' -if hasattr(ssl, '_create_unverified_context'): - ssl._create_default_https_context = ssl._create_unverified_context + if sys.version_info[0] >= 3: + username = input('Enter your ESGF OpenID:\n') + else: + username = raw_input('Enter your ESGF OpenID:\n') -dataset_id = 'obs4mips.CNES.AVISO.zos.mon.v20110829|esgf-data.jpl.nasa.gov' -variable = 'zosStderr' + password = getpass(prompt='Enter your ESGF Password:\n') -if sys.version_info[0] >= 3: - username = input('Enter your ESGF OpenID:\n') -else: - username = raw_input('Enter your ESGF OpenID:\n') + # Multiple datasets are returned in a list if the ESGF dataset is + # divided into multiple files. + datasets = esgf.load_dataset(dataset_id, variable, username, password) -password = getpass(prompt='Enter your ESGF Password:\n') + # For this example, our dataset is only stored in a single file so + # we only need to look at the 0-th value in the returned list. + dataset = datasets[0] -# Multiple datasets are returned in a list if the ESGF dataset is -# divided into multiple files. -datasets = esgf.load_dataset(dataset_id, - variable, - username, - password) + print('\n--------\n') + print('Variable: ', dataset.variable) + print('Shape: ', dataset.values.shape) + print('A Value: ', dataset.values[100][100][100]) -# For this example, our dataset is only stored in a single file so -# we only need to look at the 0-th value in the returned list. -ds = datasets[0] -print('\n--------\n') -print('Variable: ', ds.variable) -print('Shape: ', ds.values.shape) -print('A Value: ', ds.values[100][100][100]) +if __name__ == '__main__': + main() diff --git a/ocw/esgf/constants.py b/ocw/esgf/constants.py index 8d30848d..90218fd8 100644 --- a/ocw/esgf/constants.py +++ b/ocw/esgf/constants.py @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. # -'''Module containing constant parameters for ESGF RCMES integration.''' +"""Module containing constant parameters for ESGF RCMES integration.""" # default location of ESGF user credentials ESGF_CREDENTIALS = "~/.esg/credentials.pem" diff --git a/ocw/esgf/download.py b/ocw/esgf/download.py index 690915c5..951a341f 100644 --- a/ocw/esgf/download.py +++ b/ocw/esgf/download.py @@ -16,12 +16,18 @@ # specific language governing permissions and limitations # under the License. # -''' +""" OCW module to download a file from ESGF. -''' +""" + +from __future__ import print_function import sys +from os.path import expanduser, join + +from ocw.esgf.constants import ESGF_CREDENTIALS + if sys.version_info[0] >= 3: from http.client import HTTPSConnection from urllib.request import build_opener @@ -35,15 +41,12 @@ from urllib2 import build_opener from urllib2 import HTTPCookieProcessor from urllib2 import HTTPSHandler -from os.path import expanduser, join - -from ocw.esgf.constants import ESGF_CREDENTIALS class HTTPSClientAuthHandler(HTTPSHandler): - ''' + """ HTTP handler that transmits an X509 certificate as part of the request - ''' + """ def __init__(self, key, cert): HTTPSHandler.__init__(self) @@ -51,34 +54,44 @@ def __init__(self, key, cert): self.cert = cert def https_open(self, req): + """ + Opens the https connection. + :param req: The https request object. + :return: An addinfourl object for the request. + """ return self.do_open(self.getConnection, req) def getConnection(self, host, timeout=300): - return HTTPSConnection(host, key_file=self.key, cert_file=self.cert) + """ + Create an HTTPSConnection object. + :param host: The ESGF server to connect to. + :param timeout: Connection timeout in seconds. + :return: + """ + return HTTPSConnection(host, key_file=self.key, cert_file=self.cert, timeout=timeout) def download(url, toDirectory="/tmp"): - ''' + """ Function to download a single file from ESGF. - :param url: the URL of the file to download :param toDirectory: target directory where the file will be written - ''' + """ # setup HTTP handler - certFile = expanduser(ESGF_CREDENTIALS) - opener = build_opener(HTTPSClientAuthHandler(certFile, certFile)) + cert_file = expanduser(ESGF_CREDENTIALS) + opener = build_opener(HTTPSClientAuthHandler(cert_file, cert_file)) opener.add_handler(HTTPCookieProcessor()) # download file - localFilePath = join(toDirectory, url.split('/')[-1]) - print("\nDownloading url: %s to local path: %s ..." % (url, localFilePath)) - localFile = open(localFilePath, 'w') - webFile = opener.open(url) - localFile.write(webFile.read()) + local_file_path = join(toDirectory, url.split('/')[-1]) + print("\nDownloading url: %s to local path: %s ..." % (url, local_file_path)) + local_file = open(local_file_path, 'w') + web_file = opener.open(url) + local_file.write(web_file.read()) # cleanup - localFile.close() - webFile.close() + local_file.close() + web_file.close() opener.close() print("... done") diff --git a/ocw/esgf/logon.py b/ocw/esgf/logon.py index b792cfa8..a49335d8 100644 --- a/ocw/esgf/logon.py +++ b/ocw/esgf/logon.py @@ -16,28 +16,28 @@ # specific language governing permissions and limitations # under the License. # -''' +""" RCMES module to logon onto the ESGF. -''' +""" import os from pyesgf.logon import LogonManager -from ocw.esgf.constants import JPL_MYPROXY_SERVER_DN, JPL_HOSTNAME +from ocw.esgf.constants import JPL_HOSTNAME, JPL_MYPROXY_SERVER_DN def logon(openid, password): - ''' + """ Function to retrieve a short-term X.509 certificate that can be used to authenticate with ESGF. The certificate is written in the location ~/.esg/credentials.pem. The trusted CA certificates are written in the directory ~/.esg/certificates. - ''' + """ # Must configure the DN of the JPL MyProxy server if using a JPL openid if JPL_HOSTNAME in openid: os.environ['MYPROXY_SERVER_DN'] = JPL_MYPROXY_SERVER_DN - lm = LogonManager() + logon_manager = LogonManager() - lm.logon_with_openid(openid, password, bootstrap=True) + logon_manager.logon_with_openid(openid, password, bootstrap=True) - return lm.is_logged_on() + return logon_manager.is_logged_on() diff --git a/ocw/esgf/main.py b/ocw/esgf/main.py index 5c900420..0fb46562 100644 --- a/ocw/esgf/main.py +++ b/ocw/esgf/main.py @@ -16,21 +16,23 @@ # specific language governing permissions and limitations # under the License. # -''' +""" Example main program for ESGF-RCMES integration. - -''' -# constant parameters -DATA_DIRECTORY = "/tmp" +""" + +from __future__ import print_function +from ocw.esgf.download import download from ocw.esgf.logon import logon from ocw.esgf.search import SearchClient -from ocw.esgf.download import download + +# constant parameters +DATA_DIRECTORY = "/tmp" def main(): - '''Example driver program''' + """Example driver program""" username = raw_input('Enter your ESGF Username:\n') password = raw_input('Enter your ESGF Password:\n') @@ -42,8 +44,8 @@ def main(): print("...done.") # step 2: execute faceted search for files - urls = main_obs4mips() - #urls = main_cmip5() + # urls = main_obs4mips() + urls = main_cmip5() # step 3: download file(s) for i, url in enumerate(urls): @@ -53,66 +55,66 @@ def main(): def main_cmip5(): - ''' + """ Example workflow to search for CMIP5 files - ''' + """ - searchClient = SearchClient( + search_client = SearchClient( searchServiceUrl="http://pcmdi9.llnl.gov/esg-search/search", distrib=False) - print('\nAvailable projects=%s' % searchClient.getFacets('project')) - searchClient.setConstraint(project='CMIP5') - print("Number of Datasets=%d" % searchClient.getNumberOfDatasets()) + print('\nAvailable projects=%s' % search_client.getFacets('project')) + search_client.setConstraint(project='CMIP5') + print("Number of Datasets=%d" % search_client.getNumberOfDatasets()) + + print('\nAvailable models=%s' % search_client.getFacets('model')) + search_client.setConstraint(model='INM-CM4') + print("Number of Datasets=%d" % search_client.getNumberOfDatasets()) - print('\nAvailable models=%s' % searchClient.getFacets('model')) - searchClient.setConstraint(model='INM-CM4') - print("Number of Datasets=%d" % searchClient.getNumberOfDatasets()) + print('\nAvailable experiments=%s' % search_client.getFacets('experiment')) + search_client.setConstraint(experiment='historical') + print("Number of Datasets=%d" % search_client.getNumberOfDatasets()) - print('\nAvailable experiments=%s' % searchClient.getFacets('experiment')) - searchClient.setConstraint(experiment='historical') - print("Number of Datasets=%d" % searchClient.getNumberOfDatasets()) + print('\nAvailable time frequencies=%s' % search_client.getFacets('time_frequency')) + search_client.setConstraint(time_frequency='mon') + print("Number of Datasets=%d" % search_client.getNumberOfDatasets()) - print('\nAvailable time frequencies=%s' % - searchClient.getFacets('time_frequency')) - searchClient.setConstraint(time_frequency='mon') - print("Number of Datasets=%d" % searchClient.getNumberOfDatasets()) + print('\nAvailable CF standard names=%s' % search_client.getFacets('cf_standard_name')) + search_client.setConstraint(cf_standard_name='air_temperature') + print("Number of Datasets=%d" % search_client.getNumberOfDatasets()) - print('\nAvailable CF standard names=%s' % - searchClient.getFacets('cf_standard_name')) - searchClient.setConstraint(cf_standard_name='air_temperature') - print("Number of Datasets=%d" % searchClient.getNumberOfDatasets()) + urls = search_client.getFiles() - urls = searchClient.getFiles() return urls def main_obs4mips(): - ''' + """ Example workflow to search for obs4MIPs files. - ''' + """ - searchClient = SearchClient(distrib=False) + search_client = SearchClient(distrib=False) # obs4MIPs - print('\nAvailable projects=%s' % searchClient.getFacets('project')) - searchClient.setConstraint(project='obs4MIPs') - print("Number of Datasets=%d" % searchClient.getNumberOfDatasets()) + print('\nAvailable projects=%s' % search_client.getFacets('project')) + search_client.setConstraint(project='obs4MIPs') + print("Number of Datasets=%d" % search_client.getNumberOfDatasets()) - print('\nAvailable variables=%s' % searchClient.getFacets('variable')) - searchClient.setConstraint(variable='hus') - print("Number of Datasets=%d" % searchClient.getNumberOfDatasets()) + print('\nAvailable variables=%s' % search_client.getFacets('variable')) + search_client.setConstraint(variable='hus') + print("Number of Datasets=%d" % search_client.getNumberOfDatasets()) - print('\nAvailable time frequencies=%s' % - searchClient.getFacets('time_frequency')) - searchClient.setConstraint(time_frequency='mon') - print("Number of Datasets=%d" % searchClient.getNumberOfDatasets()) + print('\nAvailable time frequencies=%s' % search_client.getFacets('time_frequency')) + search_client.setConstraint(time_frequency='mon') + print("Number of Datasets=%d" % search_client.getNumberOfDatasets()) - print('\nAvailable models=%s' % searchClient.getFacets('model')) - searchClient.setConstraint(model='Obs-MLS') - print("Number of Datasets=%d" % searchClient.getNumberOfDatasetsi()) + print('\nAvailable models=%s' % search_client.getFacets('model')) + search_client.setConstraint(model='Obs-MLS') + print("Number of Datasets=%d" % search_client.getNumberOfDatasets()) + + urls = search_client.getFiles() - urls = searchClient.getFiles() return urls + if __name__ == '__main__': main() diff --git a/ocw/esgf/search.py b/ocw/esgf/search.py index c2f4e12b..a807c420 100644 --- a/ocw/esgf/search.py +++ b/ocw/esgf/search.py @@ -16,17 +16,19 @@ # specific language governing permissions and limitations # under the License. # -''' +""" RCMES module to execute a faceted search for ESGF files. -''' +""" + +from __future__ import print_function from pyesgf.search import SearchConnection from ocw.esgf.constants import JPL_SEARCH_SERVICE_URL -class SearchClient(): +class SearchClient(object): """ Simple ESGF search client for RCMES. This class is a thin layer on top of the esgfpy-client package. @@ -36,7 +38,7 @@ class SearchClient(): def __init__(self, searchServiceUrl=JPL_SEARCH_SERVICE_URL, distrib=True): """ :param searchServiceUrl: URL of ESGF search service to query - :param distrib: True to execute a federation-wide search, + :param distrib: True to execute a federation-wide search, False to search only the specified search service """ connection = SearchConnection(searchServiceUrl, distrib=distrib) @@ -66,8 +68,10 @@ def getNumberOfDatasets(self): def getFacets(self, facet): """ - :return: a dictionary of (facet value, facet count) for the specified facet and current constraints. - Example (for facet='project'): {u'COUND': 4, u'CMIP5': 2657, u'obs4MIPs': 7} + :return: a dictionary of (facet value, facet count) for the specified facet + and current constraints. + + Example (for facet='project'): {u'COUND': 4, u'CMIP5': 2657, u'obs4MIPs': 7} """ return self.context.facet_counts[facet] @@ -82,7 +86,7 @@ def getFiles(self): print("\nSearching files for dataset=%s with constraints: %s" % (dataset.dataset_id, self.constraints)) files = dataset.file_context().search(**self.constraints) - for file in files: - print('Found file=%s' % file.download_url) - urls.append(file.download_url) + for current_file in files: + print('Found file=%s' % current_file.download_url) + urls.append(current_file.download_url) return urls From 848cdb692774cba27fe435e2730b08118cb9daf3 Mon Sep 17 00:00:00 2001 From: michaelarthuranderson Date: Sun, 25 Feb 2018 16:08:01 -0500 Subject: [PATCH 2/2] CLIMATE-316 Add ESGF Download Script to repository --- ocw/data_source/esgf.py | 66 ++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/ocw/data_source/esgf.py b/ocw/data_source/esgf.py index 0dcc2e05..6b2f042f 100644 --- a/ocw/data_source/esgf.py +++ b/ocw/data_source/esgf.py @@ -16,9 +16,26 @@ # specific language governing permissions and limitations # under the License. # +""" +A set of functions to wrap downloading ESGF datasets into an OCW dataset object. +*** Note *** The ESGF data source requires that the user have certain credentials downloaded from +the ESG. The current version of the module should download these automatically. Older versions of +the library will not download them. The solution is to use the WGET script from the EGS to download +a test dataset to get the credentials. The data source should work as expected then. + +""" import os import sys + +import requests +from bs4 import BeautifulSoup + +import ocw.data_source.local as local +from ocw.esgf.constants import DEFAULT_ESGF_SEARCH +from ocw.esgf.download import download +from ocw.esgf.logon import logon + if sys.version_info[0] >= 3: from urllib.error import HTTPError else: @@ -27,15 +44,6 @@ # might be around one day from urllib2 import HTTPError -from ocw.esgf.constants import DEFAULT_ESGF_SEARCH -from ocw.esgf.download import download -from ocw.esgf.logon import logon -from ocw.esgf.search import SearchClient -import ocw.data_source.local as local - -from bs4 import BeautifulSoup -import requests - def load_dataset(dataset_id, variable_name, @@ -44,9 +52,8 @@ def load_dataset(dataset_id, search_url=DEFAULT_ESGF_SEARCH, elevation_index=0, name='', - save_path='/tmp', - **additional_constraints): - ''' Load an ESGF dataset. + save_path='/tmp'): + """ Load an ESGF dataset. :param dataset_id: The ESGF ID of the dataset to load. :type dataset_id: :mod:`string` @@ -74,32 +81,24 @@ def load_dataset(dataset_id, :param save_path: (Optional) Path to where downloaded files should be saved. :type save_path: :mod:`string` - :param additional_constraints: (Optional) Additional key,value pairs to - pass as constraints to the search wrapper. These can be anything found - on the ESGF metadata page for a dataset. - :returns: A :class:`list` of :class:`dataset.Dataset` contained the requested dataset. If the dataset is stored in multiple files each will be loaded into a separate :class:`dataset.Dataset`. :raises ValueError: If no dataset can be found for the supplied ID and variable, or if the requested dataset is a multi-file dataset. - ''' - download_data = _get_file_download_data(url=search_url, - dataset_id=dataset_id, - variable=variable_name) + """ + download_data = \ + _get_file_download_data(url=search_url, dataset_id=dataset_id, variable=variable_name) datasets = [] + for url, var in download_data: - _download_files([url], - esgf_username, - esgf_password, - download_directory=save_path) + _download_files([url], esgf_username, esgf_password, download_directory=save_path) file_save_path = os.path.join(save_path, url.split('/')[-1]) - datasets.append(local.load_file(file_save_path, - var, - name=name, + + datasets.append(local.load_file(file_save_path, var, name=name, elevation_index=elevation_index)) origin = { @@ -107,19 +106,20 @@ def load_dataset(dataset_id, 'dataset_id': dataset_id, 'variable': variable_name } - for ds in datasets: - ds.origin = origin + + for dataset in datasets: + dataset.origin = origin return datasets def _get_file_download_data(dataset_id, variable, url=DEFAULT_ESGF_SEARCH): - '''''' + """""" url += '?type=File&dataset_id={}&variable={}' url = url.format(dataset_id, variable) - r = requests.get(url) - xml = BeautifulSoup(r.content, "html.parser") + raw_data = requests.get(url) + xml = BeautifulSoup(raw_data.content, "html.parser") dont_have_results = not bool(xml.response.result['numfound']) @@ -141,7 +141,7 @@ def _get_file_download_data(dataset_id, variable, url=DEFAULT_ESGF_SEARCH): def _download_files(file_urls, username, password, download_directory='/tmp'): - '''''' + """""" try: logon(username, password) except HTTPError: