From 09127d28444af672394c63f662131e689d75397c Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 11 Mar 2024 11:14:15 -0500 Subject: [PATCH 01/81] point to correct openapi definition --- flask_app/broker/routes.py | 2 +- flask_app/templates/{swagger_ui.html => swagger_ui.broker.html} | 2 +- sphinx/flask/structure.rst | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename flask_app/templates/{swagger_ui.html => swagger_ui.broker.html} (90%) create mode 100644 sphinx/flask/structure.rst diff --git a/flask_app/broker/routes.py b/flask_app/broker/routes.py index cbaabfcf..e2d8db11 100644 --- a/flask_app/broker/routes.py +++ b/flask_app/broker/routes.py @@ -68,7 +68,7 @@ def swagger_ui(): Returns: a webpage UI of the Specify Network schema. """ - return render_template("swagger_ui.html") + return render_template("swagger_ui.broker.html") # ..................................................................................... diff --git a/flask_app/templates/swagger_ui.html b/flask_app/templates/swagger_ui.broker.html similarity index 90% rename from flask_app/templates/swagger_ui.html rename to flask_app/templates/swagger_ui.broker.html index a6f4a5f3..1beb4832 100644 --- a/flask_app/templates/swagger_ui.html +++ b/flask_app/templates/swagger_ui.broker.html @@ -18,7 +18,7 @@ diff --git a/sphinx/flask/structure.rst b/sphinx/flask/structure.rst new file mode 100644 index 00000000..e69de29b From 0458b26fc5a6749e25f75f75a8aae74c925d14a7 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 11 Mar 2024 11:14:54 -0500 Subject: [PATCH 02/81] doc for SpNetwork structure --- sphinx/flask/structure.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sphinx/flask/structure.rst b/sphinx/flask/structure.rst index e69de29b..dbab00fa 100644 --- a/sphinx/flask/structure.rst +++ b/sphinx/flask/structure.rst @@ -0,0 +1,14 @@ +Structure +###################################### + +Specify Network consists of four Docker containers running on a single EC2 instance. + +The nginx and front-end containers support both the Analyst and Broker. Two flask +containers, one for Analyst, and one for Broker, expose the APIs of each to different +subdomains of the same domain. Code for each is in the flask_app.analyst and +flask_app.broker directories. In each, the routes.py file defines the different +endpoints. + + + + From ef20344d5916beb85cb326541bd2c94451a0050c Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 11 Mar 2024 17:29:07 -0500 Subject: [PATCH 03/81] separate schema files --- flask_app/broker/routes.py | 6 +++--- flask_app/common/constants.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/flask_app/broker/routes.py b/flask_app/broker/routes.py index e2d8db11..347e93f4 100644 --- a/flask_app/broker/routes.py +++ b/flask_app/broker/routes.py @@ -1,10 +1,10 @@ """URL Routes for the Specify Network API services.""" -import os from flask import Blueprint, Flask, render_template, request +import os # from flask_app.application import create_app from flask_app.common.constants import ( - TEMPLATE_DIR, STATIC_DIR, SCHEMA_DIR, SCHEMA_FNAME + TEMPLATE_DIR, STATIC_DIR, SCHEMA_DIR, SCHEMA_BROKER_FNAME ) from flask_app.common.s2n_type import APIEndpoint @@ -54,7 +54,7 @@ def display_raw_schema(): Returns: schema: the schema for the Specify Network. 
""" - fname = os.path.join(SCHEMA_DIR, SCHEMA_FNAME) + fname = os.path.join(SCHEMA_DIR, SCHEMA_BROKER_FNAME) with open(fname, "r") as f: schema = f.read() return schema diff --git a/flask_app/common/constants.py b/flask_app/common/constants.py index 3e50c197..1f782357 100644 --- a/flask_app/common/constants.py +++ b/flask_app/common/constants.py @@ -8,4 +8,5 @@ SCHEMA_DIR = f"{STATIC_DIR}/schema" TEMPLATE_DIR = "../templates" -SCHEMA_FNAME = "open_api.yaml" +SCHEMA_ANALYST_FNAME = "open_api.analyst.yaml" +SCHEMA_BROKER_FNAME = "open_api.broker.yaml" From f3e54f4755f317824e9956b76dfda785e072ecb8 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 11 Mar 2024 17:30:22 -0500 Subject: [PATCH 04/81] add GBIF datasetkey resolver --- flask_app/analyst/count.py | 74 ++++++++++++++++++++++++------------- flask_app/analyst/routes.py | 72 +++++++++++++++++++----------------- sppy/tools/provider/gbif.py | 68 ++++++++++++++++++++++++++++++++++ 3 files changed, 154 insertions(+), 60 deletions(-) diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 65ad6291..375a2694 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -1,10 +1,12 @@ """Class for the Specify Network Name API service.""" +import boto3 from http import HTTPStatus from flask_app.common.s2n_type import APIService, AnalystOutput from flask_app.common.util import print_analyst_output from flask_app.analyst.base import _AnalystService +from sppy.aws.aws_tools import query_s3_table from sppy.tools.s2n.utils import get_traceback @@ -16,9 +18,21 @@ class CountSvc(_AnalystService): # ............................................... @classmethod - def get_counts(cls, collection_id, organization_id): + def get_counts(cls, dataset_key): + """Get counts for datasetKey. + + Args: + dataset_key: Unique identifier for GBIF datasets. + + Returns: + a flask_app.broker.s2n_type.BrokerOutput object with optional records as a + list of dictionaries of records corresponding to specimen occurrences in + the provider database. + + Todo: Consider adding publishing organization queries with pub_org_key + """ try: - output = cls._get_records(collection_id, organization_id) + output = cls._get_records(dataset_key, ) except Exception: traceback = get_traceback() output = AnalystOutput( @@ -29,15 +43,15 @@ def get_counts(cls, collection_id, organization_id): # ............................................... @classmethod - def _get_organization_counts(cls, organization_id): + def _get_organization_counts(cls, pub_org_key): return { "Organization Raw Counts": { - organization_id: 1, + pub_org_key: 1, "org_id_2": 2, "org_id_3": 3 }, - f"{organization_id} to other orgs": + f"{pub_org_key} to other orgs": { "to total": "0.5", "org_id_2": "1.2", @@ -47,33 +61,41 @@ def _get_organization_counts(cls, organization_id): # ............................................... 
@classmethod - def _get_collection_counts(cls, collection_id): - return { - "Collection Raw Counts": - { - collection_id: 1, - "coll_id_2": 2, - "coll_id_3": 3 - }, - f"{collection_id} Ratios": - { - collection_id: "0.5", - "coll_id_2": "0.5", - "coll_id_3": "0.5", - "to total": "0.5" - } - } + def _get_dataset_counts(cls, dataset_key): + s3 = boto3.client('s3') + + resp = s3.select_object_content( + Bucket=PROJ_, + Key='sample_data.csv', + ExpressionType='SQL', + Expression="SELECT * FROM s3object s where s.\"Name\" = 'Jane'", + InputSerialization={'CSV': {"FileHeaderInfo": "Use"}, 'CompressionType': 'NONE'}, + OutputSerialization={'CSV': {}}, + ) + + for event in resp['Payload']: + if 'Records' in event: + records = event['Records']['Payload'].decode('utf-8') + print(records) + elif 'Stats' in event: + statsDetails = event['Stats']['Details'] + print("Stats details bytesScanned: ") + print(statsDetails['BytesScanned']) + print("Stats details bytesProcessed: ") + print(statsDetails['BytesProcessed']) + print("Stats details bytesReturned: ") + print(statsDetails['BytesReturned']) # ............................................... @classmethod - def _get_records(cls, collection_id, organization_id): + def _get_records(cls, dataset_key, pub_org_key): allrecs = [] # for response metadata - if collection_id is not None: - coll_data = cls._get_collection_counts(collection_id) + if dataset_key is not None: + coll_data = cls._get_collection_counts(dataset_key) allrecs.append(coll_data) - if organization_id is not None: - org_data = cls._get_organization_counts(organization_id) + if pub_org_key is not None: + org_data = cls._get_organization_counts(pub_org_key) allrecs.append(org_data) # Assemble diff --git a/flask_app/analyst/routes.py b/flask_app/analyst/routes.py index f242e289..f53ad06c 100644 --- a/flask_app/analyst/routes.py +++ b/flask_app/analyst/routes.py @@ -1,8 +1,11 @@ """URL Routes for the Specify Network API services.""" from flask import Blueprint, Flask, render_template, request +import os from flask_app.analyst.count import CountSvc -from flask_app.common.constants import (STATIC_DIR, TEMPLATE_DIR) +from flask_app.common.constants import ( + STATIC_DIR, TEMPLATE_DIR, SCHEMA_DIR, SCHEMA_ANALYST_FNAME) +from flask_app.common.s2n_type import APIEndpoint analyst_blueprint = Blueprint( "analyst", __name__, template_folder=TEMPLATE_DIR, static_folder=STATIC_DIR, @@ -18,37 +21,37 @@ def index(): return render_template("analyst.index.html") -# # ..................................................................................... -# @app.route("/api/v1/", methods=["GET"]) -# def analyst_status(): -# """Get services available from broker. -# -# Returns: -# dict: A dictionary of status information for the server. -# """ -# endpoints = APIEndpoint.get_analyst_endpoints() -# system_status = "In Development" -# return { -# "num_services": len(endpoints), -# "endpoints": endpoints, -# "status": system_status -# } -# +# ..................................................................................... +@app.route("/api/v1/", methods=["GET"]) +def analyst_status(): + """Get services available from broker. + + Returns: + dict: A dictionary of status information for the server. + """ + endpoints = APIEndpoint.get_analyst_endpoints() + system_status = "In Development" + return { + "num_services": len(endpoints), + "endpoints": endpoints, + "status": system_status + } + + +# .......................... +@app.route("/api/v1/schema") +def display_raw_schema(): + """Show the schema XML. 
+ + Returns: + schema: the schema for the Specify Network. + """ + fname = os.path.join(SCHEMA_DIR, SCHEMA_ANALYST_FNAME) + with open(fname, "r") as f: + schema = f.read() + return schema + -# # .......................... -# @app.route("/api/v1/schema") -# def display_raw_schema(): -# """Show the schema XML. -# -# Returns: -# schema: the schema for the Specify Network. -# """ -# fname = os.path.join(SCHEMA_DIR, SCHEMA_FNAME) -# with open(fname, "r") as f: -# schema = f.read() -# return schema -# -# # # .......................... # @app.route("/api/v1/swaggerui") # def swagger_ui(): @@ -70,11 +73,12 @@ def count_endpoint(): API response. """ coll_arg = request.args.get("collection_id", default=None, type=str) - org_arg = request.args.get("organization_id", default=None, type=str) - if coll_arg is None and org_arg is None: + # org_arg = request.args.get("organization_id", default=None, type=str) + # if coll_arg is None and org_arg is None: + if coll_arg is None: response = CountSvc.get_endpoint() else: - response = CountSvc.get_counts(coll_arg, org_arg) + response = CountSvc.get_counts(coll_arg) return response diff --git a/sppy/tools/provider/gbif.py b/sppy/tools/provider/gbif.py index 2a54e98a..7854bc7f 100644 --- a/sppy/tools/provider/gbif.py +++ b/sppy/tools/provider/gbif.py @@ -69,6 +69,19 @@ def _get_output_val(cls, out_dict, name): return None return val + # ............................................... + @classmethod + def _get_nested_output_val(cls, output, key_list): + while key_list: + key = key_list[0] + key_list = key_list[1:] + try: + output = output[key] + if not key_list: + return str(output).encode(ENCODING) + except Exception: + return None + # # ............................................... # @classmethod # def get_taxonomy(cls, taxon_key, logger=None): @@ -646,13 +659,68 @@ def get_publishing_org(cls, pub_org_key, logger=None): raise return pub_org_name + # ............................................... + @classmethod + def get_dataset(cls, dataset_key, logger=None): + """Return title from one dataset record with this key. + + Args: + dataset_key: GBIF identifier for this dataset + logger: object for logging messages and errors. + + Returns: + dataset_name: the name of the dataset. + citation: the preferred citation for the dataset. + + Raises: + Exception: on query failure. + """ + ds_api = GbifAPI( + service=GBIF.DATASET_SERVICE, key=dataset_key, logger=logger) + try: + ds_api.query() + dataset_name = ds_api._get_output_val(ds_api.output, "title") + except Exception as e: + logit(logger, str(e), refname=cls.__name__) + raise + try: + citation = ds_api._get_nested_output_val( + ds_api.output, ["citation", "text"]) + except Exception as e: + citation = None + return dataset_name, citation + # ............................................... def query(self): """Query the API and set "output" attribute to a ElementTree object.""" APIQuery.query_by_get(self, output_type="json", verify=False) + + + # ............................................................................. 
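# A minimal caller-side sketch for the new get_dataset() resolver above, assuming the
# sppy package is importable. The dataset key is the example key from the test notes
# at the end of this module; get_dataset() returns the GBIF dataset title and its
# preferred citation (None when the record carries no citation).
from sppy.tools.provider.gbif import GbifAPI

def show_dataset_citation(dataset_key="e9d1c589-5df6-4bd8-aead-c09e2d8630e4"):
    # Queries the GBIF dataset API once and prints the resolved metadata.
    title, citation = GbifAPI.get_dataset(dataset_key)
    print(f"title:    {title}")
    print(f"citation: {citation}")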
if __name__ == "__main__": # test pass + +""" +from sppy.tools.provider.gbif import GbifAPI + +dataset_key = 'e9d1c589-5df6-4bd8-aead-c09e2d8630e4' +ds_api = GbifAPI(service='dataset', key=dataset_key) +try: + ds_api.query() + dataset_name = ds_api._get_output_val(ds_api.output, "title") +except Exception as e: + logit(logger, str(e), refname=cls.__name__) + raise +try: + citation = ds_api._get_nested_output_val( + ds_api.output, ["citation", "text"]) +except Exception as e: + logit(logger, str(e), refname=cls.__name__) + raise +return dataset_name, citation + +""" \ No newline at end of file From 91a66a05f0868f322e15e9a34c8936a4e829b464 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 11 Mar 2024 17:31:53 -0500 Subject: [PATCH 05/81] S3 Select access with upgraded dependency --- requirements.txt | 2 +- sphinx/aws/aws-setup.rst | 7 +++ sppy/aws/aws_tools.py | 111 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 117 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 1af53c6e..553b4dc4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ pykew>=0.1.3 gunicorn==20.1.0 rtree>=1.0.0 awscli -boto3 +boto3>=1.34.60 pandas pyarrow s3fs diff --git a/sphinx/aws/aws-setup.rst b/sphinx/aws/aws-setup.rst index e657a105..828da48d 100644 --- a/sphinx/aws/aws-setup.rst +++ b/sphinx/aws/aws-setup.rst @@ -9,6 +9,13 @@ Configure AWS credentials either through * AWS CLI configuration (for command line tools), or * using an IAM role attached to your instance if running on AWS infrastructure. +The AWS cli depends on boto3, so both must be up to date. In my testing, awscli +1.27.118 (with requirement botocore==1.29.118) and boto3 1.28.1, failed on +S3 Select access. + +I upgraded awscli (sudo apt install awscli), then upgraded boto3 +(pip install --upgrade boto3) , which installed 1.34.60. Success + Redshift =========================================================== diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index b9ff1a89..a055ab6b 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -13,8 +13,9 @@ import os from sppy.aws.aws_constants import ( - INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, PROJ_NAME, - REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, USER_DATA_TOKEN) + INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, + PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, + USER_DATA_TOKEN) # -------------------------------------------------------------------------------------- @@ -672,3 +673,109 @@ def create_dataframe_from_s3obj( # s3_fs = s3fs.S3FileSystem df = pd.read_parquet(s3_uri) return df + + +# ............................................................................. +class S3Query(): + """Specify Network API service for retrieving taxonomic information.""" + + # ............................................... + @classmethod + def __init__( + self, bucket, region=REGION, encoding="utf-8"): + """Object to query tabular data in S3. + + Args: + bucket: S3 bucket containing data. + s3_path: S3 folder(s) containing data objects. + datatype: type of tabular data, 'CSV', 'JSON', and 'Parquet' are allowed. + region: AWS region containing the data. + encoding: encoding of the data. 
+ """ + self.s3 = boto3.client('s3') + self.bucket = bucket + self.region = region + self.encoding = encoding + self._current_datestr = get_current_datadate_str() + self.exp_type = 'SQL' + + # ---------------------------------------------------- + def query_s3_table(self, s3_path, query_str): + """Query the S3 resource defined for this class. + + Args: + query_str: a SQL query for S3 select. + + Returns: + list of records matching the query + """ + recs = [] + resp = self.s3.select_object_content( + Bucket=self.bucket, + Key= self.s3_path, + ExpressionType='SQL', + Expression=query_str, + InputSerialization={"Parquet": {}}, + OutputSerialization={"JSON": {}} + ) + for event in resp["Payload"]: + if "Records" in event: + records = event["Records"]["Payload"].decode(self.encoding) + recs.append(records) + return recs + + # ---------------------------------------------------- + def get_dataset_counts(self, dataset_key): + """Query the S3 resource for occurrence and species counts for this dataset. + + Args: + dataset_key: unique GBIF identifier for dataset of interest. + + Returns: + records + """ + datestr = get_current_datadate_str() + datestr = "2024_02_01" + s3_path = f"summary/dataset_counts_{datestr}_000.parquet" + query_str = (f"SELECT occ_count, species_count " + f"FROM s3object s " + f"WHERE s.datasetkey = {dataset_key}") + records = self.query_s3_table(s3_path, query_str) + return records + + +""" +import boto3 + +from sppy.aws.aws_constants import ( + INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, + PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, + USER_DATA_TOKEN) + +ctable = "dataset_counts_2024_02_01_000.parquet" +ltable = "dataset_lists_2024_02_01_000.parquet" +s3_path = f"summary/{ctable}" +dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" + +s3 = boto3.client('s3') +query_str = (f"SELECT occ_count, species_count " + f"FROM s3object s " + f"WHERE s.datasetkey = '{dataset_key}'") + +SELECT * FROM s3object WHERE datasetkey = '0000e36f-d0e9-46b0-aa23-cc1980f00515' + +resp = s3.select_object_content( + Bucket=PROJ_BUCKET, + Key=s3_path, + ExpressionType='SQL', + Expression=query_str, + InputSerialization={"Parquet": {}}, + OutputSerialization={"CSV": {}} + ) + +for event in resp["Payload"]: + if "Records" in event: + records = event["Records"]["Payload"].decode('utf-8') + print(records) + +""" From d2c682102d2ed3e921b52b193ed5d7e595cb806c Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 11 Mar 2024 17:32:11 -0500 Subject: [PATCH 06/81] testing notes --- sphinx/misc/debugging.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sphinx/misc/debugging.rst b/sphinx/misc/debugging.rst index 29543b7e..8cfd0aa3 100644 --- a/sphinx/misc/debugging.rst +++ b/sphinx/misc/debugging.rst @@ -10,6 +10,7 @@ IDE debugging of functions Local debugging of flask app ============================================= +* Choose to run the Analyst or Broker with FLASK_APP environment variable * Run flask at command prompt ```zsh @@ -17,7 +18,11 @@ export FLASK_ENV=development export FLASK_APP=flask_app.broker.routes flask run ``` +* With either Analyst or Broker, the development port will be 5000 + + * Connect to http://127.0.0.1:5000 in browser, + i.e. http://127.0.0.1:5000/api/v1/name/Acer%20opalus%20Miller?is_accepted=True&gbif_count=False& -* Connect to localhost in browser. * Flask will auto-update on file save. 
* Refresh browser after changes + From 250225bab1f96b014862102b1c6197030cabdd6e Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 12 Mar 2024 14:24:00 -0500 Subject: [PATCH 07/81] move common Analyst/BrokerService methods to new superclass --- flask_app/analyst/base.py | 218 ++++------------------------------ flask_app/broker/base.py | 217 +--------------------------------- flask_app/common/base.py | 241 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 270 insertions(+), 406 deletions(-) create mode 100644 flask_app/common/base.py diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index fc31c364..c4568345 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -1,71 +1,20 @@ """Parent Class for the Specify Network API services.""" from flask import Flask +from werkzeug.exceptions import (BadRequest, InternalServerError) -import sppy.tools.s2n.utils as lmutil +from flask_app.common.base import _SpecifyNetworkService +from sppy.tools.s2n.utils import add_errinfo, get_traceback from flask_app.common.s2n_type import AnalystOutput, APIEndpoint, APIService app = Flask(__name__) # ............................................................................. -class _AnalystService: +class _AnalystService(_SpecifyNetworkService): """Base S-to-the-N service, handles parameter names and acceptable values.""" # overridden by subclasses SERVICE_TYPE = APIService.AnalystRoot - # ............................................... - @classmethod - def _get_valid_requested_params(cls, user_params_string, valid_params): - """Return valid and invalid options for parameters that accept >1 values. - - Args: - user_params_string: user-requested parameters as a string. - valid_params: valid parameter values - - Returns: - valid_requested_params: list of valid params from the provided query string - invalid_params: list of invalid params from the provided query string - - Note: - For the badge service, exactly one provider is required. For all other - services, multiple providers are accepted, and None indicates to query all - valid providers. - """ - valid_requested_params = invalid_params = [] - - if user_params_string: - tmplst = user_params_string.split(",") - user_params = {tp.lower().strip() for tp in tmplst} - - valid_requested_params = set() - invalid_params = set() - # valid_requested_providers, invalid_providers = - # cls.get_multivalue_options(user_provs, valid_providers) - for param in user_params: - if param in valid_params: - valid_requested_params.add(param) - else: - invalid_params.add(param) - - invalid_params = list(invalid_params) - if valid_requested_params: - valid_requested_params = list(valid_requested_params) - else: - valid_requested_params = [] - - return valid_requested_params, invalid_params - - # ............................................................................. - @classmethod - def endpoint(cls): - """Return the URL endpoint for this class. - - Returns: - URL endpoint for the service - """ - endpoint = f"{APIEndpoint.Root}/{cls.SERVICE_TYPE['endpoint']}" - return endpoint - # ............................................... @classmethod def get_endpoint(cls, **kwargs): @@ -75,7 +24,7 @@ def get_endpoint(cls, **kwargs): **kwargs: keyword arguments are accepted but ignored Returns: - flask_app.broker.s2n_type.S2nOutput object + flask_app.analyst.s2n_type.S2nOutput object Raises: Exception: on unknown error. @@ -106,155 +55,40 @@ def _show_online(cls): # ............................................... 
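# The local-debugging notes above can be exercised from Python as well as a browser.
# This illustrative helper assumes the Broker dev server is already running on
# port 5000 as described in those notes; the URL is the example given there.
import json
from urllib.request import urlopen

def check_local_broker():
    url = ("http://127.0.0.1:5000/api/v1/name/Acer%20opalus%20Miller"
           "?is_accepted=True&gbif_count=False")
    with urlopen(url) as response:
        # The broker endpoints return JSON; print it to confirm the service is up.
        print(json.load(response))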
@classmethod - def _fix_type_new(cls, key, provided_val): - """Modify a parameter value to a valid type and value. - - Args: - key: parameter key - provided_val: user-provided parameter value - - Returns: - usr_val: a valid value for the parameter - valid_options: list of valid options (for error message) - - Note: - Corrections: - * cast to correct type - * validate with any options - * if value is invalid (type or value), return the default. - """ - valid_options = None - if provided_val is None: - return None - # all strings are lower case - try: - provided_val = provided_val.lower() - except Exception: - pass - - # First see if restricted to options - default_val = cls.SERVICE_TYPE["params"][key]["default"] - type_val = cls.SERVICE_TYPE["params"][key]["type"] - # If restricted options, check - try: - options = cls.SERVICE_TYPE["params"][key]["options"] - except KeyError: - options = None - else: - # Invalid option returns default value - if provided_val in options: - usr_val = provided_val - else: - valid_options = options - usr_val = default_val - - # If not restricted to options - if options is None: - # Cast values to correct type. Failed conversions return default value - if isinstance(type_val, str) and not options: - usr_val = str(provided_val) - - elif isinstance(type_val, float): - try: - usr_val = float(provided_val) - except ValueError: - usr_val = default_val - - # Boolean also tests as int, so try boolean first - elif isinstance(type_val, bool): - if provided_val in (0, "0", "n", "no", "f", "false"): - usr_val = False - elif provided_val in (1, "1", "y", "yes", "t", "true"): - usr_val = True - else: - valid_options = (True, False) - usr_val = default_val - - elif isinstance(type_val, int): - try: - usr_val = int(provided_val) - except ValueError: - usr_val = default_val - - else: - usr_val = provided_val - - return usr_val, valid_options - - # ............................................... - @classmethod - def _process_params(cls, user_kwargs=None): - """Modify all user provided keys to lowercase and values to correct types. - - Args: - user_kwargs: dictionary of keywords and values sent by the user for - the current service. - - Returns: - good_params: dictionary of valid parameters and values - errinfo: dictionary of errors for different error levels. - - Note: - A list of valid values for a keyword can include None as a default - if user-provided value is invalid - Todo: - Do we need not_in_valid_options for error message? - """ - good_params = {} - errinfo = {} - - # Correct all parameter keys/values present - for key in cls.SERVICE_TYPE["params"]: - val = user_kwargs[key] - # Done in calling function - if val is not None: - usr_val, valid_options = cls._fix_type_new(key, val) - if valid_options is not None and val not in valid_options: - errinfo = lmutil.add_errinfo( - errinfo, "error", - f"Value {val} for parameter {key} is not in valid options " - f"{cls.SERVICE_TYPE['params'][key]['options']}") - good_params[key] = None - else: - good_params[key] = usr_val - - # Fill in defaults for missing parameters - for key in cls.SERVICE_TYPE["params"]: - param_meta = cls.SERVICE_TYPE["params"][key] - try: - _ = good_params[key] - except KeyError: - good_params[key] = param_meta["default"] - - return good_params, errinfo - - # ............................................... 
- @classmethod - def _standardize_params(cls, collection_id=None, organization_id=None): + def _standardize_params( + cls, dataset_key=None, pub_org_key=None, order="descending", limit=10): """Standardize query parameters to send to appropriate service. Args: - collection_id: collection identifier for comparisons - organization_id: organization identifier for comparisons + dataset_key: unique GBIF dataset identifier for comparisons + pub_org_key: unique publishing organization identifier for comparisons Returns: a dictionary containing keys and properly formatted values for the user specified parameters. """ user_kwargs = { - "collection_id": collection_id, - "organization_id": organization_id + "collection_id": dataset_key, + "organization_id": pub_org_key, + "order": order, + "limit": limit } - usr_params, errinfo = cls._process_params(user_kwargs) + try: + usr_params, errinfo = cls._process_params(user_kwargs) - return usr_params, errinfo + # errinfo indicates bad parameters + try: + error_description = "; ".join(errinfo["error"]) + raise BadRequest(error_description) + except KeyError: + pass - # .......................... - @staticmethod - def OPTIONS(): - """Common options request for all services (needed for CORS).""" - return + except Exception: + error_description = get_traceback() + raise BadRequest(error_description) + + return usr_params, errinfo # ............................................................................. diff --git a/flask_app/broker/base.py b/flask_app/broker/base.py index 5b5afb0c..200ed43e 100644 --- a/flask_app/broker/base.py +++ b/flask_app/broker/base.py @@ -3,6 +3,7 @@ from werkzeug.exceptions import BadRequest, InternalServerError import sppy.tools.s2n.utils as lmutil +from flask_app.common.base import _SpecifyNetworkService from flask_app.common.s2n_type import ( APIEndpoint, APIService, BrokerOutput, get_host_url, S2nKey, ServiceProvider) from sppy.tools.provider.gbif import GbifAPI @@ -21,7 +22,7 @@ def handle_bad_response(e): return f"Internal Server Error: {e}" # ............................................................................. -class _BrokerService: +class _BrokerService(_SpecifyNetworkService): """Base S-to-the-N service, handles parameter names and acceptable values.""" # overridden by subclasses SERVICE_TYPE = APIService.BrokerRoot @@ -84,59 +85,6 @@ def get_providers(cls, filter_params=None): provnames = cls._order_providers(provnames) return provnames - # ............................................................................. - @classmethod - def _get_valid_requested_params(cls, user_params_string, valid_params): - """Return valid and invalid options for parameters that accept >1 values. - - Args: - user_params_string: user-requested parameters as a string. - valid_params: valid parameter values - - Returns: - valid_requested_params: list of valid params from the provided query string - invalid_params: list of invalid params from the provided query string - - Note: - For the badge service, exactly one provider is required. For all other - services, multiple providers are accepted, and None indicates to query all - valid providers. 
- """ - valid_requested_params = invalid_params = [] - - if user_params_string: - tmplst = user_params_string.split(",") - user_params = {tp.lower().strip() for tp in tmplst} - - valid_requested_params = set() - invalid_params = set() - # valid_requested_providers, invalid_providers = - # cls.get_multivalue_options(user_provs, valid_providers) - for param in user_params: - if param in valid_params: - valid_requested_params.add(param) - else: - invalid_params.add(param) - - invalid_params = list(invalid_params) - if valid_requested_params: - valid_requested_params = list(valid_requested_params) - else: - valid_requested_params = [] - - return valid_requested_params, invalid_params - - # ............................................................................. - @classmethod - def endpoint(cls): - """Return the URL endpoint for this class. - - Returns: - URL endpoint for the service - """ - endpoint = f"{APIEndpoint.Root}/{cls.SERVICE_TYPE['endpoint']}" - return endpoint - # ............................................... @classmethod def get_endpoint(cls, **kwargs): @@ -236,154 +184,6 @@ def match_name_with_itis(self, namestr): pass return namestr - # ............................................... - @classmethod - def _fix_type_new(cls, key, provided_val): - """Modify a parameter value to a valid type and value. - - Args: - key: parameter key - provided_val: user-provided parameter value - - Returns: - usr_val: a valid value for the parameter - valid_options: list of valid options (for error message) - - Note: - Corrections: - * cast to correct type - * validate with any options - * if value is invalid (type or value), return the default. - """ - valid_options = None - if provided_val is None: - return None - # all strings are lower case - try: - provided_val = provided_val.lower() - except Exception: - pass - - param_meta = cls.SERVICE_TYPE["params"][key] - # First see if restricted to options - default_val = param_meta["default"] - type_val = param_meta["type"] - # If restricted options, check - try: - options = param_meta["options"] - except KeyError: - options = None - else: - # Invalid option returns default value - if provided_val in options: - usr_val = provided_val - else: - valid_options = options - usr_val = default_val - - # If not restricted to options - if options is None: - # Cast values to correct type. Failed conversions return default value - if isinstance(type_val, str) and not options: - usr_val = str(provided_val) - - elif isinstance(type_val, float): - try: - usr_val = float(provided_val) - except ValueError: - usr_val = default_val - - # Boolean also tests as int, so try boolean first - elif isinstance(type_val, bool): - if provided_val in (0, "0", "n", "no", "f", "false"): - usr_val = False - elif provided_val in (1, "1", "y", "yes", "t", "true"): - usr_val = True - else: - valid_options = (True, False) - usr_val = default_val - - elif isinstance(type_val, int): - try: - usr_val = int(provided_val) - except ValueError: - usr_val = default_val - - else: - usr_val = provided_val - - return usr_val, valid_options - - # ............................................... - @classmethod - def _process_params(cls, user_kwargs=None): - """Modify all user provided keys to lowercase and values to correct types. - - Args: - user_kwargs: dictionary of keywords and values sent by the user for - the current service. - - Returns: - good_params: dictionary of valid parameters and values - errinfo: dictionary of errors for different error levels. 
- - Note: - A list of valid values for a keyword can include None as a default - if user-provided value is invalid - Todo: - Do we need not_in_valid_options for error message? - """ - good_params = {} - errinfo = {} - - # Correct all parameter keys/values present - for key, param_meta in cls.SERVICE_TYPE["params"].items(): - val = user_kwargs[key] - # Done in calling function - if key == "provider": - pass - - # Do not edit namestr, maintain capitalization - elif key == "namestr": - good_params["namestr"] = val - - # Require one valid icon_status - elif key == "icon_status": - valid_stat = param_meta["options"] - if val is None: - errinfo = lmutil.add_errinfo( - errinfo, "error", - f"Parameter {key} containing one of {valid_stat} options is " - f"required") - elif val not in valid_stat: - errinfo = lmutil.add_errinfo( - errinfo, "error", - f"Value {val} for parameter {key} not in valid options " - f"{valid_stat}") - else: - good_params[key] = val - - elif val is not None: - usr_val, valid_options = cls._fix_type_new(key, val) - if valid_options is not None and val not in valid_options: - errinfo = lmutil.add_errinfo( - errinfo, "error", - f"Value {val} for parameter {key} is not in valid options " - f"{param_meta['options']}") - good_params[key] = None - else: - good_params[key] = usr_val - - # Fill in defaults for missing parameters - for key in cls.SERVICE_TYPE["params"]: - param_meta = cls.SERVICE_TYPE["params"][key] - try: - _ = good_params[key] - except KeyError: - good_params[key] = param_meta["default"] - - return good_params, errinfo - # ............................................... @classmethod def _get_providers_from_string(cls, usr_req_providers, filter_params=None): @@ -498,18 +298,7 @@ def _standardize_params( return usr_params, errinfo - # .......................... - @staticmethod - def OPTIONS(): - """Common options request for all services (needed for CORS).""" - return - # ............................................................................. if __name__ == "__main__": - kwarg_defaults = { - "count_only": False, - "width": 600, - "height": 300, - "type": [], - } + pass diff --git a/flask_app/common/base.py b/flask_app/common/base.py new file mode 100644 index 00000000..bae23db9 --- /dev/null +++ b/flask_app/common/base.py @@ -0,0 +1,241 @@ +"""Parent Class for the Specify Network API services.""" +from flask import Flask +from werkzeug.exceptions import BadRequest, InternalServerError + +import sppy.tools.s2n.utils as lmutil +from flask_app.common.s2n_type import ( + APIEndpoint, APIService, BrokerOutput, get_host_url, S2nKey, ServiceProvider) +from sppy.tools.provider.gbif import GbifAPI +from sppy.tools.provider.itis import ItisAPI + +app = Flask(__name__) + + +# ............................................................................. +@app.errorhandler(BadRequest) +def handle_bad_request(e): + return f"Bad request: {e}" + +@app.errorhandler(InternalServerError) +def handle_bad_response(e): + return f"Internal Server Error: {e}" + +# ............................................................................. +class _SpecifyNetworkService: + """Base S-to-the-N service, handles parameter names and acceptable values.""" + # overridden by subclasses + SERVICE_TYPE = None + + + # ............................................................................. + @classmethod + def _get_valid_requested_params(cls, user_params_string, valid_params): + """Return valid and invalid options for parameters that accept >1 values. 
+ + Args: + user_params_string: user-requested parameters as a string. + valid_params: valid parameter values + + Returns: + valid_requested_params: list of valid params from the provided query string + invalid_params: list of invalid params from the provided query string + + Note: + For the badge service, exactly one provider is required. For all other + services, multiple providers are accepted, and None indicates to query all + valid providers. + """ + valid_requested_params = invalid_params = [] + + if user_params_string: + tmplst = user_params_string.split(",") + user_params = {tp.lower().strip() for tp in tmplst} + + valid_requested_params = set() + invalid_params = set() + # valid_requested_providers, invalid_providers = + # cls.get_multivalue_options(user_provs, valid_providers) + for param in user_params: + if param in valid_params: + valid_requested_params.add(param) + else: + invalid_params.add(param) + + invalid_params = list(invalid_params) + if valid_requested_params: + valid_requested_params = list(valid_requested_params) + else: + valid_requested_params = [] + + return valid_requested_params, invalid_params + + # ............................................................................. + @classmethod + def endpoint(cls): + """Return the URL endpoint for this class. + + Returns: + URL endpoint for the service + """ + endpoint = f"{APIEndpoint.Root}/{cls.SERVICE_TYPE['endpoint']}" + return endpoint + + # ............................................... + @classmethod + def _fix_type_new(cls, key, provided_val): + """Modify a parameter value to a valid type and value. + + Args: + key: parameter key + provided_val: user-provided parameter value + + Returns: + usr_val: a valid value for the parameter + valid_options: list of valid options (for error message) + + Note: + Corrections: + * cast to correct type + * validate with any options + * if value is invalid (type or value), return the default. + """ + valid_options = None + if provided_val is None: + return None + # all strings are lower case + try: + provided_val = provided_val.lower() + except Exception: + pass + + param_meta = cls.SERVICE_TYPE["params"][key] + # First see if restricted to options + default_val = param_meta["default"] + type_val = param_meta["type"] + # If restricted options, check + try: + options = param_meta["options"] + except KeyError: + options = None + else: + # Invalid option returns default value + if provided_val in options: + usr_val = provided_val + else: + valid_options = options + usr_val = default_val + + # If not restricted to options + if options is None: + # Cast values to correct type. Failed conversions return default value + if isinstance(type_val, str) and not options: + usr_val = str(provided_val) + + elif isinstance(type_val, float): + try: + usr_val = float(provided_val) + except ValueError: + usr_val = default_val + + # Boolean also tests as int, so try boolean first + elif isinstance(type_val, bool): + if provided_val in (0, "0", "n", "no", "f", "false"): + usr_val = False + elif provided_val in (1, "1", "y", "yes", "t", "true"): + usr_val = True + else: + valid_options = (True, False) + usr_val = default_val + + elif isinstance(type_val, int): + try: + usr_val = int(provided_val) + except ValueError: + usr_val = default_val + + else: + usr_val = provided_val + + return usr_val, valid_options + + # ............................................... 
+ @classmethod + def _process_params(cls, user_kwargs=None): + """Modify all user provided keys to lowercase and values to correct types. + + Args: + user_kwargs: dictionary of keywords and values sent by the user for + the current service. + + Returns: + good_params: dictionary of valid parameters and values + errinfo: dictionary of errors for different error levels. + + Note: + A list of valid values for a keyword can include None as a default + if user-provided value is invalid + Todo: + Do we need not_in_valid_options for error message? + """ + good_params = {} + errinfo = {} + + # Correct all parameter keys/values present + for key, param_meta in cls.SERVICE_TYPE["params"].items(): + val = user_kwargs[key] + # Done in calling function + if key == "provider": + pass + + # Do not edit namestr, maintain capitalization + elif key == "namestr": + good_params["namestr"] = val + + # Require one valid icon_status + elif key == "icon_status": + valid_stat = param_meta["options"] + if val is None: + errinfo = lmutil.add_errinfo( + errinfo, "error", + f"Parameter {key} containing one of {valid_stat} options is " + f"required") + elif val not in valid_stat: + errinfo = lmutil.add_errinfo( + errinfo, "error", + f"Value {val} for parameter {key} not in valid options " + f"{valid_stat}") + else: + good_params[key] = val + + elif val is not None: + usr_val, valid_options = cls._fix_type_new(key, val) + if valid_options is not None and val not in valid_options: + errinfo = lmutil.add_errinfo( + errinfo, "error", + f"Value {val} for parameter {key} is not in valid options " + f"{param_meta['options']}") + good_params[key] = None + else: + good_params[key] = usr_val + + # Fill in defaults for missing parameters + for key in cls.SERVICE_TYPE["params"]: + param_meta = cls.SERVICE_TYPE["params"][key] + try: + _ = good_params[key] + except KeyError: + good_params[key] = param_meta["default"] + + return good_params, errinfo + + + # .......................... + @staticmethod + def OPTIONS(): + """Common options request for all services (needed for CORS).""" + return + + +# ............................................................................. 
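# A small sketch of how a service subclass is expected to drive _process_params(),
# using the _SpecifyNetworkService class defined above. The toy SERVICE_TYPE metadata
# below is illustrative only; the real parameter metadata lives in
# flask_app.common.s2n_type.APIService.
class _ToyService(_SpecifyNetworkService):
    SERVICE_TYPE = {
        "endpoint": "toy",
        "params": {
            "count_only": {"type": False, "default": False},
            "limit": {"type": 2, "default": 10},
        },
    }

def _demo_process_params():
    good_params, errinfo = _ToyService._process_params(
        user_kwargs={"count_only": "yes", "limit": "25"})
    # "yes" is coerced to True and "25" to the integer 25; values that fail type
    # conversion fall back to the declared defaults, and values outside a declared
    # "options" list are additionally reported in errinfo.
    print(good_params, errinfo)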
+if __name__ == "__main__": + pass From ea6bdbb4257492cfd1081e1130fbe5366b40b901 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 12 Mar 2024 14:25:46 -0500 Subject: [PATCH 08/81] doc, description --- flask_app/broker/occ.py | 3 +-- flask_app/common/s2n_type.py | 12 +++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/flask_app/broker/occ.py b/flask_app/broker/occ.py index 02c5fc20..007de72c 100644 --- a/flask_app/broker/occ.py +++ b/flask_app/broker/occ.py @@ -157,12 +157,11 @@ def get_occurrence_records( if occid is None and gbif_dataset_key is None: return cls.get_endpoint() else: - # No filter_params defined for Name service yet try: good_params, errinfo = cls._standardize_params( occid=occid, provider=provider, gbif_dataset_key=gbif_dataset_key, count_only=count_only) - # Bad parameters + # errinfo indicates bad parameters try: error_description = "; ".join(errinfo["error"]) raise BadRequest(error_description) diff --git a/flask_app/common/s2n_type.py b/flask_app/common/s2n_type.py index 2648ba3e..b281c16c 100644 --- a/flask_app/common/s2n_type.py +++ b/flask_app/common/s2n_type.py @@ -163,18 +163,20 @@ class APIService: "name": APIEndpoint.Count, "endpoint": f"{APIEndpoint.Root}/{APIEndpoint.Count}", "params": { - "collection_id": { + "dataset_key": { "type": "", - "description": "Collection identifier", + "description": "GBIF Dataset Key", "default": None }, - "organization_id": { + "pub_org_key": { "type": "", - "description": "Organization identifier", + "description": "GBIF Publishing Organization Key", "default": None } }, - "description": "Return record count for the given collection or organization.", + "description": + "Return occurrence and species counts for the given dataset or " + "publishing organization.", S2nKey.RECORD_FORMAT: "" } # Taxonomic Resolution From b6939d08f9840544ee6487f60028eeed19a9f4a2 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 12 Mar 2024 14:28:07 -0500 Subject: [PATCH 09/81] add SpNetwork S3 resources as a provider --- sppy/aws/aws_constants.py | 1 + sppy/aws/aws_tools.py | 106 +--------------------- sppy/tools/provider/awss3.py | 169 +++++++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+), 105 deletions(-) create mode 100644 sppy/tools/provider/awss3.py diff --git a/sppy/aws/aws_constants.py b/sppy/aws/aws_constants.py index 027aab91..266c3f1d 100644 --- a/sppy/aws/aws_constants.py +++ b/sppy/aws/aws_constants.py @@ -2,6 +2,7 @@ PROJ_NAME = "specnet" REGION = "us-east-1" PROJ_BUCKET = f"{PROJ_NAME}-{REGION}" +ENCODING = "utf-8" INPUT_PATH = "summary" LOG_PATH = "log" diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index a055ab6b..c01fba23 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -14,7 +14,7 @@ from sppy.aws.aws_constants import ( INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, - PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, + PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, USER_DATA_TOKEN) @@ -675,107 +675,3 @@ def create_dataframe_from_s3obj( return df -# ............................................................................. -class S3Query(): - """Specify Network API service for retrieving taxonomic information.""" - - # ............................................... - @classmethod - def __init__( - self, bucket, region=REGION, encoding="utf-8"): - """Object to query tabular data in S3. - - Args: - bucket: S3 bucket containing data. - s3_path: S3 folder(s) containing data objects. 
- datatype: type of tabular data, 'CSV', 'JSON', and 'Parquet' are allowed. - region: AWS region containing the data. - encoding: encoding of the data. - """ - self.s3 = boto3.client('s3') - self.bucket = bucket - self.region = region - self.encoding = encoding - self._current_datestr = get_current_datadate_str() - self.exp_type = 'SQL' - - # ---------------------------------------------------- - def query_s3_table(self, s3_path, query_str): - """Query the S3 resource defined for this class. - - Args: - query_str: a SQL query for S3 select. - - Returns: - list of records matching the query - """ - recs = [] - resp = self.s3.select_object_content( - Bucket=self.bucket, - Key= self.s3_path, - ExpressionType='SQL', - Expression=query_str, - InputSerialization={"Parquet": {}}, - OutputSerialization={"JSON": {}} - ) - for event in resp["Payload"]: - if "Records" in event: - records = event["Records"]["Payload"].decode(self.encoding) - recs.append(records) - return recs - - # ---------------------------------------------------- - def get_dataset_counts(self, dataset_key): - """Query the S3 resource for occurrence and species counts for this dataset. - - Args: - dataset_key: unique GBIF identifier for dataset of interest. - - Returns: - records - """ - datestr = get_current_datadate_str() - datestr = "2024_02_01" - s3_path = f"summary/dataset_counts_{datestr}_000.parquet" - query_str = (f"SELECT occ_count, species_count " - f"FROM s3object s " - f"WHERE s.datasetkey = {dataset_key}") - records = self.query_s3_table(s3_path, query_str) - return records - - -""" -import boto3 - -from sppy.aws.aws_constants import ( - INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, - PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, - USER_DATA_TOKEN) - -ctable = "dataset_counts_2024_02_01_000.parquet" -ltable = "dataset_lists_2024_02_01_000.parquet" -s3_path = f"summary/{ctable}" -dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" - -s3 = boto3.client('s3') -query_str = (f"SELECT occ_count, species_count " - f"FROM s3object s " - f"WHERE s.datasetkey = '{dataset_key}'") - -SELECT * FROM s3object WHERE datasetkey = '0000e36f-d0e9-46b0-aa23-cc1980f00515' - -resp = s3.select_object_content( - Bucket=PROJ_BUCKET, - Key=s3_path, - ExpressionType='SQL', - Expression=query_str, - InputSerialization={"Parquet": {}}, - OutputSerialization={"CSV": {}} - ) - -for event in resp["Payload"]: - if "Records" in event: - records = event["Records"]["Payload"].decode('utf-8') - print(records) - -""" diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py new file mode 100644 index 00000000..18f1d35c --- /dev/null +++ b/sppy/tools/provider/awss3.py @@ -0,0 +1,169 @@ +"""Class to query tabular summary Specify Network data in S3""" +import base64 +import boto3 +from botocore.exceptions import ClientError +import csv +import datetime +import logging +from logging.handlers import RotatingFileHandler +import pandas as pd +import os + +from sppy.aws.aws_tools import get_current_datadate_str + +from sppy.aws.aws_constants import ( + INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, + PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, + USER_DATA_TOKEN) + + + +# ............................................................................. +class S3Query(): + """Specify Network API service for retrieving taxonomic information.""" + + # ............................................... 
+ @classmethod + def __init__( + self, bucket, region=REGION, encoding="utf-8"): + """Object to query tabular data in S3. + + Args: + bucket: S3 bucket containing data. + s3_path: S3 folder(s) containing data objects. + datatype: type of tabular data, 'CSV', 'JSON', and 'Parquet' are allowed. + region: AWS region containing the data. + encoding: encoding of the data. + """ + self.s3 = boto3.client('s3') + self.bucket = bucket + self.region = region + self.encoding = encoding + self._current_datestr = get_current_datadate_str() + self.exp_type = 'SQL' + + # ---------------------------------------------------- + def query_s3_table(self, s3_path, query_str): + """Query the S3 resource defined for this class. + + Args: + query_str: a SQL query for S3 select. + + Returns: + list of records matching the query + """ + recs = [] + resp = self.s3.select_object_content( + Bucket=self.bucket, + Key=self.s3_path, + ExpressionType='SQL', + Expression=query_str, + InputSerialization={"Parquet": {}}, + OutputSerialization={"JSON": {}} + ) + for event in resp["Payload"]: + if "Records" in event: + records = event["Records"]["Payload"].decode(self.encoding) + recs.append(records) + return recs + + # ---------------------------------------------------- + def get_dataset_counts(self, dataset_key): + """Query the S3 resource for occurrence and species counts for this dataset. + + Args: + dataset_key: unique GBIF identifier for dataset of interest. + + Returns: + records: empty list or list of 1 record containing occ_count, species_count + """ + (occ_count, species_count) = (0,0) + datestr = get_current_datadate_str() + datestr = "2024_02_01" + s3_path = f"summary/dataset_counts_{datestr}_000.parquet" + query_str = (f"SELECT occ_count, species_count " + f"FROM s3object s " + f"WHERE s.datasetkey = {dataset_key}") + # Returns empty list or list of 1 record with [(occ_count, species_count)] + records = self.query_s3_table(s3_path, query_str) + if records: + (occ_count, species_count) = records[0] + return (occ_count, species_count) + + # ---------------------------------------------------- + def get_org_counts(self, pub_org_key): + """Query S3 for occurrence and species counts for this organization. + + Args: + pub_org_key: unique GBIF identifier for organization of interest. + + Returns: + records: empty list or list of 1 record containing occ_count, species_count + + TODO: implement this? + """ + (occ_count, species_count) = (0,0) + return (occ_count, species_count) + + # ---------------------------------------------------- + def rank_datasets_by_species(self, order="descending", limit=10): + """Return the top or bottom datasets, with counts, ranked by number of species. + + Args: + order: ascending (bottom up) or descending (top down). + descending = return top X datasets in descending order + ascending = return bottom X datasets in ascending order + limit: number of datasets to return, no more than 300. 
+ + Returns: + records: empty list or list of 1 record containing occ_count, species_count + """ + (occ_count, species_count) = (0,0) + datestr = get_current_datadate_str() + datestr = "2024_02_01" + s3_path = f"summary/dataset_counts_{datestr}_000.parquet" + query_str = (f"SELECT occ_count, species_count " + f"FROM s3object s " + f"WHERE s.datasetkey = {dataset_key}") + # Returns empty list or list of 1 record with [(occ_count, species_count)] + records = self.query_s3_table(s3_path, query_str) + if records: + (occ_count, species_count) = records[0] + return (occ_count, species_count) + + +""" +import boto3 + +from sppy.aws.aws_constants import ( + INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, + PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, + USER_DATA_TOKEN) + +ctable = "dataset_counts_2024_02_01_000.parquet" +ltable = "dataset_lists_2024_02_01_000.parquet" +s3_path = f"summary/{ctable}" +dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" + +s3 = boto3.client('s3') +query_str = (f"SELECT occ_count, species_count " + f"FROM s3object s " + f"WHERE s.datasetkey = '{dataset_key}'") + +SELECT * FROM s3object WHERE datasetkey = '0000e36f-d0e9-46b0-aa23-cc1980f00515' + +resp = s3.select_object_content( + Bucket=PROJ_BUCKET, + Key=s3_path, + ExpressionType='SQL', + Expression=query_str, + InputSerialization={"Parquet": {}}, + OutputSerialization={"CSV": {}} + ) + +for event in resp["Payload"]: + if "Records" in event: + records = event["Records"]["Payload"].decode('utf-8') + print(records) + +""" From 61b94c62a511e4efb3cf21229189653e695d08d4 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 12 Mar 2024 16:17:50 -0500 Subject: [PATCH 10/81] support min/max for numeric params --- flask_app/common/base.py | 57 ++++++++++++++++++++++++++++-------- flask_app/common/s2n_type.py | 4 ++- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/flask_app/common/base.py b/flask_app/common/base.py index bae23db9..5144d0d5 100644 --- a/flask_app/common/base.py +++ b/flask_app/common/base.py @@ -131,12 +131,6 @@ def _fix_type_new(cls, key, provided_val): if isinstance(type_val, str) and not options: usr_val = str(provided_val) - elif isinstance(type_val, float): - try: - usr_val = float(provided_val) - except ValueError: - usr_val = default_val - # Boolean also tests as int, so try boolean first elif isinstance(type_val, bool): if provided_val in (0, "0", "n", "no", "f", "false"): @@ -146,17 +140,54 @@ def _fix_type_new(cls, key, provided_val): else: valid_options = (True, False) usr_val = default_val + else: + usr_val = cls._test_numbers(provided_val, param_meta) + + return usr_val, valid_options - elif isinstance(type_val, int): - try: - usr_val = int(provided_val) - except ValueError: - usr_val = default_val + # ............................................... 
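# A condensed version of the S3 Select test notes above, querying one dataset's
# occurrence and species counts straight from the summary Parquet object. It assumes
# AWS credentials with read access to the project bucket and the upgraded boto3
# described in the aws-setup notes; the object key and dataset key are the examples
# used in those notes. Note that datasetkey is a string column, so the literal must
# be quoted in the S3 Select WHERE clause.
import boto3
from sppy.aws.aws_constants import PROJ_BUCKET

def select_dataset_counts(dataset_key="0000e36f-d0e9-46b0-aa23-cc1980f00515"):
    s3 = boto3.client("s3")
    query = ("SELECT occ_count, species_count FROM s3object s "
             f"WHERE s.datasetkey = '{dataset_key}'")
    resp = s3.select_object_content(
        Bucket=PROJ_BUCKET,
        Key="summary/dataset_counts_2024_02_01_000.parquet",
        ExpressionType="SQL",
        Expression=query,
        InputSerialization={"Parquet": {}},
        OutputSerialization={"CSV": {}},
    )
    # Stream the matching rows back as CSV text.
    for event in resp["Payload"]:
        if "Records" in event:
            print(event["Records"]["Payload"].decode("utf-8"))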
+ @classmethod + def _test_numbers(cls, provided_val, param_meta): + default_val = param_meta["default"] + type_val = param_meta["type"] + # If restricted numeric values, check + try: + min_val = param_meta["min"] + except KeyError: + min_val = None + # If restricted numeric values, check + try: + max_val = param_meta["min"] + except KeyError: + max_val = None + + if isinstance(type_val, float): + try: + usr_val = float(provided_val) + except ValueError: + usr_val = default_val else: - usr_val = provided_val + if min_val and usr_val < min_val: + usr_val = min_val + if max_val and usr_val > max_val: + usr_val = max_val - return usr_val, valid_options + elif isinstance(type_val, int): + try: + usr_val = int(provided_val) + except ValueError: + usr_val = default_val + else: + if min_val and usr_val < min_val: + usr_val = min_val + if max_val and usr_val > max_val: + usr_val = max_val + + else: + usr_val = provided_val + + return usr_val # ............................................... @classmethod diff --git a/flask_app/common/s2n_type.py b/flask_app/common/s2n_type.py index b281c16c..15831e2a 100644 --- a/flask_app/common/s2n_type.py +++ b/flask_app/common/s2n_type.py @@ -172,7 +172,9 @@ class APIService: "type": "", "description": "GBIF Publishing Organization Key", "default": None - } + }, + "descending": { "type": True, "default": True}, + "limit": {"type": 2, "default": 10, "min": 1, "max": 500}, }, "description": "Return occurrence and species counts for the given dataset or " From 6816e2bb139a7876cb6030a90c5c9dfe4db07c8a Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 12 Mar 2024 16:19:51 -0500 Subject: [PATCH 11/81] initial analyst apis --- flask_app/analyst/base.py | 9 +- flask_app/analyst/constants.py | 1 + flask_app/analyst/count.py | 215 ++++++++++++++++++++++----------- 3 files changed, 152 insertions(+), 73 deletions(-) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index c4568345..6b3e4978 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -2,6 +2,7 @@ from flask import Flask from werkzeug.exceptions import (BadRequest, InternalServerError) +from flask_app.analyst.constants import QUERY_LIMIT from flask_app.common.base import _SpecifyNetworkService from sppy.tools.s2n.utils import add_errinfo, get_traceback from flask_app.common.s2n_type import AnalystOutput, APIEndpoint, APIService @@ -56,12 +57,16 @@ def _show_online(cls): # ............................................... @classmethod def _standardize_params( - cls, dataset_key=None, pub_org_key=None, order="descending", limit=10): + cls, dataset_key=None, pub_org_key=None, descending=True, limit=10): """Standardize query parameters to send to appropriate service. Args: dataset_key: unique GBIF dataset identifier for comparisons pub_org_key: unique publishing organization identifier for comparisons + descending: boolean value indicating whether to sort records descending + (True) or ascending (False) + limit: integer indicating how many ranked records to return, value must + be less than QUERY_LIMIT. 
Returns: a dictionary containing keys and properly formatted values for the @@ -70,7 +75,7 @@ def _standardize_params( user_kwargs = { "collection_id": dataset_key, "organization_id": pub_org_key, - "order": order, + "descending": descending, "limit": limit } diff --git a/flask_app/analyst/constants.py b/flask_app/analyst/constants.py index d0a99126..8c95a510 100644 --- a/flask_app/analyst/constants.py +++ b/flask_app/analyst/constants.py @@ -1 +1,2 @@ """Constants for the Specify Network Analyst API services.""" +QUERY_LIMIT = 500 \ No newline at end of file diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 375a2694..bac143b8 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -1,12 +1,14 @@ """Class for the Specify Network Name API service.""" import boto3 from http import HTTPStatus +from werkzeug.exceptions import (BadRequest, InternalServerError) from flask_app.common.s2n_type import APIService, AnalystOutput from flask_app.common.util import print_analyst_output from flask_app.analyst.base import _AnalystService -from sppy.aws.aws_tools import query_s3_table +from sppy.aws.aws_constants import ENCODING, PROJ_BUCKET, REGION +from sppy.tools.provider.awss3 import S3Query from sppy.tools.s2n.utils import get_traceback @@ -18,92 +20,163 @@ class CountSvc(_AnalystService): # ............................................... @classmethod - def get_counts(cls, dataset_key): + def _get_params_errors(cls, *kwargs): + try: + good_params, errinfo = cls._standardize_params(cls, kwargs) + # errinfo indicates bad parameters + try: + error_description = "; ".join(errinfo["error"]) + raise BadRequest(error_description) + except KeyError: + pass + + except Exception: + error_description = get_traceback() + raise BadRequest(error_description) + + return good_params, errinfo + + # ............................................... + @classmethod + def get_counts(cls, dataset_key=None, pub_org_key=None): + if dataset_key is None and pub_org_key is None: + return cls.get_endpoint() + else: + try: + good_params, errinfo = cls._standardize_params( + cls, dataset_key=dataset_key, pub_org_key=pub_org_key) + # errinfo indicates bad parameters + try: + error_description = "; ".join(errinfo["error"]) + raise BadRequest(error_description) + except KeyError: + pass + + except Exception: + error_description = get_traceback() + raise BadRequest(error_description) + + # Do Query! + try: + allrecs = [] + errors = {} + # for response metadata + if dataset_key is not None: + records, errors = cls._get_dataset_counts(dataset_key) + allrecs.append(records) + if pub_org_key is not None: + errors["warning"] = \ + "Count by Publishing Organization is not implemented" + # records, errors = cls._get_organization_counts(pub_org_key) + # allrecs.append(records) + + # Assemble + full_out = AnalystOutput( + cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], + records=allrecs, errors=errors) + + # Add message on invalid parameters to output + try: + for err in errinfo["warning"]: + full_out.append_error("warning", err) + except KeyError: + pass + + except Exception: + error_description = get_traceback() + raise InternalServerError(error_description) + + return full_out.response + + # ............................................... 
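For orientation while reading get_counts above: once the Analyst flask app is running locally, the service is reached through the /api/v1/count/ route. A usage sketch, assuming the requests package is available and using the dataset_key parameter name the series settles on in patch 14 (the development URL comes from the debugging notes added there):

```
import requests

# Local development URL and dataset_key value taken from sphinx/misc/debugging.rst
# and the test identifiers used in these patches.
url = "http://127.0.0.1:5000/api/v1/count/"
params = {"dataset_key": "0000e36f-d0e9-46b0-aa23-cc1980f00515"}

resp = requests.get(url, params=params, timeout=30)
resp.raise_for_status()
payload = resp.json()

# AnalystOutput responses carry the records and errors assembled in get_counts.
print(payload.get("errors"))
for rec in payload.get("records", []):
    print(rec)
```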
+ @classmethod + def get_ranked_counts(cls, descending=True, limit=10): + try: + good_params, errinfo = cls._standardize_params( + cls, descending=descending, limit=limit) + # errinfo indicates bad parameters + try: + error_description = "; ".join(errinfo["error"]) + raise BadRequest(error_description) + except KeyError: + pass + + except Exception: + error_description = get_traceback() + raise BadRequest(error_description) + + # Do Query! + try: + s3 = S3Query(PROJ_BUCKET, region=REGION, encoding=ENCODING) + records = s3.rank_datasets_by_species(descending=True, limit=limit) + + # ............................................... + @classmethod + def _get_dataset_counts(cls, dataset_key): """Get counts for datasetKey. Args: dataset_key: Unique identifier for GBIF datasets. Returns: - a flask_app.broker.s2n_type.BrokerOutput object with optional records as a - list of dictionaries of records corresponding to specimen occurrences in - the provider database. - - Todo: Consider adding publishing organization queries with pub_org_key + a flask_app.analyst.s2n_type.AnalystOutput object with optional records as a + list of records corresponding to occurrence and counts for the dataset. """ + records = [] + errors = {} + s3 = S3Query(PROJ_BUCKET, region=REGION, encoding=ENCODING) try: - output = cls._get_records(dataset_key, ) + (occ_count, species_count) = s3.get_dataset_counts(dataset_key) except Exception: traceback = get_traceback() - output = AnalystOutput( - cls.SERVICE_TYPE["name"], - description=cls.SERVICE_TYPE["description"], - errors={"error": [HTTPStatus.INTERNAL_SERVER_ERROR, traceback]}) - return output.response + errors["error"] = [HTTPStatus.INTERNAL_SERVER_ERROR, traceback] + else: + records.append((occ_count, species_count)) + return records, errors # ............................................... @classmethod def _get_organization_counts(cls, pub_org_key): - return { - "Organization Raw Counts": - { - pub_org_key: 1, - "org_id_2": 2, - "org_id_3": 3 - }, - f"{pub_org_key} to other orgs": - { - "to total": "0.5", - "org_id_2": "1.2", - "org_id_3": "1.2" - } - } + """Get counts for publishingOrganizationKey. - # ............................................... - @classmethod - def _get_dataset_counts(cls, dataset_key): - s3 = boto3.client('s3') - - resp = s3.select_object_content( - Bucket=PROJ_, - Key='sample_data.csv', - ExpressionType='SQL', - Expression="SELECT * FROM s3object s where s.\"Name\" = 'Jane'", - InputSerialization={'CSV': {"FileHeaderInfo": "Use"}, 'CompressionType': 'NONE'}, - OutputSerialization={'CSV': {}}, - ) - - for event in resp['Payload']: - if 'Records' in event: - records = event['Records']['Payload'].decode('utf-8') - print(records) - elif 'Stats' in event: - statsDetails = event['Stats']['Details'] - print("Stats details bytesScanned: ") - print(statsDetails['BytesScanned']) - print("Stats details bytesProcessed: ") - print(statsDetails['BytesProcessed']) - print("Stats details bytesReturned: ") - print(statsDetails['BytesReturned']) + Args: + pub_org_key: Unique identifier for GBIF publishing organizations. - # ............................................... 
- @classmethod - def _get_records(cls, dataset_key, pub_org_key): - allrecs = [] - # for response metadata - if dataset_key is not None: - coll_data = cls._get_collection_counts(dataset_key) - allrecs.append(coll_data) - if pub_org_key is not None: - org_data = cls._get_organization_counts(pub_org_key) - allrecs.append(org_data) - - # Assemble - full_out = AnalystOutput( - cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], - records=allrecs, errors={}) - - return full_out + Returns: + a flask_app.analyst.s2n_type.AnalystOutput object with optional records as a + list of records corresponding to occurrence and counts for the organization. + """ + records = [] + errors = {} + s3 = S3Query(PROJ_BUCKET, region=REGION, encoding=ENCODING) + try: + (occ_count, species_count) = s3.get_org_counts(pub_org_key) + except Exception: + traceback = get_traceback() + errors["error"] = [HTTPStatus.INTERNAL_SERVER_ERROR, traceback] + else: + records.append((occ_count, species_count)) + return records, errors + + + # # ............................................... + # @classmethod + # def _get_records(cls, dataset_key, pub_org_key): + # allrecs = [] + # # for response metadata + # if dataset_key is not None: + # records, errors = cls._get_dataset_counts(dataset_key) + # allrecs.append(records) + # if pub_org_key is not None: + # records, errors = cls._get_organization_counts(pub_org_key) + # allrecs.append(records) + # + # # Assemble + # full_out = AnalystOutput( + # cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], + # records=allrecs, errors={}) + # + # return full_out # ............................................................................. From 2a6080bb47adaa66011bcb06ffd6f55a56846c02 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 12 Mar 2024 16:20:21 -0500 Subject: [PATCH 12/81] use pandassql for ranked records --- requirements.txt | 2 + sppy/aws/aws_constants.py | 1 + sppy/tools/provider/awss3.py | 83 ++++++++++++++++++++++-------------- 3 files changed, 55 insertions(+), 31 deletions(-) diff --git a/requirements.txt b/requirements.txt index 553b4dc4..f7a78ccb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,9 @@ gunicorn==20.1.0 rtree>=1.0.0 awscli boto3>=1.34.60 +sqlalchemy pandas +pandas-sql pyarrow s3fs ggshield \ No newline at end of file diff --git a/sppy/aws/aws_constants.py b/sppy/aws/aws_constants.py index 266c3f1d..9fd7fe79 100644 --- a/sppy/aws/aws_constants.py +++ b/sppy/aws/aws_constants.py @@ -2,6 +2,7 @@ PROJ_NAME = "specnet" REGION = "us-east-1" PROJ_BUCKET = f"{PROJ_NAME}-{REGION}" +SUMMARY_FOLDER = "summary" ENCODING = "utf-8" INPUT_PATH = "summary" diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 18f1d35c..8ceea88d 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -1,26 +1,17 @@ """Class to query tabular summary Specify Network data in S3""" -import base64 import boto3 -from botocore.exceptions import ClientError -import csv -import datetime -import logging -from logging.handlers import RotatingFileHandler import pandas as pd -import os +from pandassql import sqldf from sppy.aws.aws_tools import get_current_datadate_str -from sppy.aws.aws_constants import ( - INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, - PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, - USER_DATA_TOKEN) +from sppy.aws.aws_constants import (REGION, SUMMARY_FOLDER) # 
............................................................................. class S3Query(): - """Specify Network API service for retrieving taxonomic information.""" + """Specify Network API service for retrieving tabular parquet data from AWS S3.""" # ............................................... @classmethod @@ -43,7 +34,7 @@ def __init__( self.exp_type = 'SQL' # ---------------------------------------------------- - def query_s3_table(self, s3_path, query_str): + def _query_s3_table(self, s3_path, query_str): """Query the S3 resource defined for this class. Args: @@ -67,6 +58,42 @@ def query_s3_table(self, s3_path, query_str): recs.append(records) return recs + # ---------------------------------------------------- + def _create_dataframe_from_s3obj(self, s3_path): + """Read CSV data from S3 into a pandas DataFrame. + + Args: + s3_path: the object name with enclosing S3 bucket folders. + + Returns: + df: pandas DataFrame containing the CSV data. + """ + # import pyarrow.parquet as pq + # import s3fs + s3_uri = f"s3://{self.bucket}/{s3_path}" + # s3_fs = s3fs.S3FileSystem + df = pd.read_parquet(s3_uri) + return df + + # ---------------------------------------------------- + def _query_order_s3_table(self, s3_path, sort_field, descending, limit): + """Query the S3 resource defined for this class. + + Args: + query_str: a SQL query for S3 select. + + Returns: + list of records matching the query + """ + recs = [] + df = self._create_dataframe_from_s3obj(s3_path) + df.sort_values(by=sort_field, ascending=(not descending)) + for event in resp["Payload"]: + if "Records" in event: + records = event["Records"]["Payload"].decode(self.encoding) + recs.append(records) + return recs + # ---------------------------------------------------- def get_dataset_counts(self, dataset_key): """Query the S3 resource for occurrence and species counts for this dataset. @@ -80,12 +107,12 @@ def get_dataset_counts(self, dataset_key): (occ_count, species_count) = (0,0) datestr = get_current_datadate_str() datestr = "2024_02_01" - s3_path = f"summary/dataset_counts_{datestr}_000.parquet" + s3_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" query_str = (f"SELECT occ_count, species_count " f"FROM s3object s " f"WHERE s.datasetkey = {dataset_key}") # Returns empty list or list of 1 record with [(occ_count, species_count)] - records = self.query_s3_table(s3_path, query_str) + records = self._query_s3_table(s3_path, query_str) if records: (occ_count, species_count) = records[0] return (occ_count, species_count) @@ -106,35 +133,29 @@ def get_org_counts(self, pub_org_key): return (occ_count, species_count) # ---------------------------------------------------- - def rank_datasets_by_species(self, order="descending", limit=10): + def rank_datasets_by_species(self, descending=True, limit=10): """Return the top or bottom datasets, with counts, ranked by number of species. Args: - order: ascending (bottom up) or descending (top down). - descending = return top X datasets in descending order - ascending = return bottom X datasets in ascending order + descending: boolean value, if true return top X datasets in descending + order, if false, return bottom X datasets in ascending order limit: number of datasets to return, no more than 300. 
Returns: - records: empty list or list of 1 record containing occ_count, species_count + records: list of limit records containing dataset_key, occ_count, species_count """ - (occ_count, species_count) = (0,0) + records = [] datestr = get_current_datadate_str() datestr = "2024_02_01" - s3_path = f"summary/dataset_counts_{datestr}_000.parquet" - query_str = (f"SELECT occ_count, species_count " - f"FROM s3object s " - f"WHERE s.datasetkey = {dataset_key}") - # Returns empty list or list of 1 record with [(occ_count, species_count)] - records = self.query_s3_table(s3_path, query_str) - if records: - (occ_count, species_count) = records[0] - return (occ_count, species_count) + s3_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" + records = self._query_order_s3_table( + s3_path, "species_count", descending, limit) + return records """ import boto3 - +SELECT s.datasetkey, s.occ_count, s.species_count FROM s3object s ORDER BY s.species_count DESC LIMIT 5 from sppy.aws.aws_constants import ( INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, From 9f1a5c8b721651081fa8e3466fbc2476ef487195 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 13 Mar 2024 11:38:24 -0500 Subject: [PATCH 13/81] generalize standardize_params --- flask_app/analyst/base.py | 19 +++-- flask_app/analyst/count.py | 144 +++++++++++------------------------ flask_app/broker/base.py | 29 +++++-- sppy/tools/provider/awss3.py | 15 ++-- 4 files changed, 88 insertions(+), 119 deletions(-) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 6b3e4978..004d662f 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -68,6 +68,10 @@ def _standardize_params( limit: integer indicating how many ranked records to return, value must be less than QUERY_LIMIT. + Raises: + BadRequest: on invalid query parameters. + BadRequest: on unknown exception parsing parameters. + Returns: a dictionary containing keys and properly formatted values for the user specified parameters. @@ -81,18 +85,17 @@ def _standardize_params( try: usr_params, errinfo = cls._process_params(user_kwargs) - - # errinfo indicates bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except KeyError: - pass - except Exception: error_description = get_traceback() raise BadRequest(error_description) + # errinfo["error"] indicates bad parameters, throws exception + try: + error_description = "; ".join(errinfo["error"]) + raise BadRequest(error_description) + except KeyError: + pass + return usr_params, errinfo diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index bac143b8..ad402665 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -9,7 +9,7 @@ from sppy.aws.aws_constants import ENCODING, PROJ_BUCKET, REGION from sppy.tools.provider.awss3 import S3Query -from sppy.tools.s2n.utils import get_traceback +from sppy.tools.s2n.utils import combine_errinfo, get_traceback # ............................................................................. @@ -18,99 +18,71 @@ class CountSvc(_AnalystService): SERVICE_TYPE = APIService.Count ORDERED_FIELDNAMES = [] - # ............................................... 
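The BadRequest raised by _standardize_params above only becomes a useful client response once the flask app translates it; the broker side registers @app.errorhandler(BadRequest) handlers for exactly this. The handler bodies are not part of these diffs, so the following is only a sketch of the assumed shape, with the JSON layout illustrative rather than confirmed:

```
from flask import Flask, jsonify
from werkzeug.exceptions import BadRequest

app = Flask(__name__)


@app.errorhandler(BadRequest)
def handle_bad_request(e):
    # e.description carries the "; "-joined errinfo["error"] messages raised by
    # _standardize_params; wrap them so clients get the same errors structure
    # that successful responses use.
    return jsonify({"errors": {"error": [e.description]}}), 400
```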
- @classmethod - def _get_params_errors(cls, *kwargs): - try: - good_params, errinfo = cls._standardize_params(cls, kwargs) - # errinfo indicates bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except KeyError: - pass - - except Exception: - error_description = get_traceback() - raise BadRequest(error_description) - - return good_params, errinfo - # ............................................... @classmethod def get_counts(cls, dataset_key=None, pub_org_key=None): if dataset_key is None and pub_org_key is None: return cls.get_endpoint() - else: - try: - good_params, errinfo = cls._standardize_params( - cls, dataset_key=dataset_key, pub_org_key=pub_org_key) - # errinfo indicates bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except KeyError: - pass - except Exception: - error_description = get_traceback() - raise BadRequest(error_description) + allrecs = [] + try: + good_params, errinfo = cls._standardize_params( + cls, dataset_key=dataset_key, pub_org_key=pub_org_key) - # Do Query! - try: - allrecs = [] - errors = {} - # for response metadata - if dataset_key is not None: + except BadRequest as e: + errinfo = combine_errinfo(errinfo, {"error": e.description}) + + else: + + # Query dataset counts + if dataset_key is not None: + try: records, errors = cls._get_dataset_counts(dataset_key) + except Exception: + errors = {"error": get_traceback()} + else: allrecs.append(records) - if pub_org_key is not None: - errors["warning"] = \ - "Count by Publishing Organization is not implemented" - # records, errors = cls._get_organization_counts(pub_org_key) - # allrecs.append(records) - - # Assemble - full_out = AnalystOutput( - cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], - records=allrecs, errors=errors) - - # Add message on invalid parameters to output - try: - for err in errinfo["warning"]: - full_out.append_error("warning", err) - except KeyError: - pass + # Combine errors from success or failure + errinfo = combine_errinfo(errinfo, errors) - except Exception: - error_description = get_traceback() - raise InternalServerError(error_description) + # Query organization counts + if pub_org_key is not None: + errors = {"warning": "Count by Publishing Organization is not implemented"} + errinfo = combine_errinfo(errinfo, errors) + + # Assemble + full_out = AnalystOutput( + cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], + records=allrecs, errors=errinfo) return full_out.response # ............................................... @classmethod def get_ranked_counts(cls, descending=True, limit=10): - try: - good_params, errinfo = cls._standardize_params( - cls, descending=descending, limit=limit) - # errinfo indicates bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except KeyError: - pass + allrecs = [] + try: + good_params, errinfo = cls._standardize_params( + cls, descending=descending, limit=limit) - except Exception: - error_description = get_traceback() - raise BadRequest(error_description) + except BadRequest as e: + errinfo = combine_errinfo(errinfo, {"error": e.description}) + else: # Do Query! try: s3 = S3Query(PROJ_BUCKET, region=REGION, encoding=ENCODING) - records = s3.rank_datasets_by_species(descending=True, limit=limit) - - # ............................................... 
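The rewritten methods above lean on add_errinfo and combine_errinfo from sppy.tools.s2n.utils, which never appear in this series. A minimal sketch of their behaviour as inferred from the call sites (the real implementations may differ):

```
def add_errinfo(errinfo, key, msg):
    # Append one message under a severity key ("error", "warning", ...).
    if msg:
        errinfo.setdefault(key, []).append(msg)
    return errinfo


def combine_errinfo(errinfo1, errinfo2):
    # Merge two errinfo dicts, concatenating message lists for shared keys;
    # bare strings (e.g. the "not implemented" warning above) are wrapped in lists.
    combined = {}
    for info in (errinfo1, errinfo2):
        for key, msgs in info.items():
            if not isinstance(msgs, list):
                msgs = [msgs]
            combined.setdefault(key, []).extend(msgs)
    return combined


merged = combine_errinfo(
    {"error": ["dataset_key is missing"]},
    {"warning": "Count by Publishing Organization is not implemented"})
# {'error': ['dataset_key is missing'],
#  'warning': ['Count by Publishing Organization is not implemented']}
```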
+ records, errors = s3.rank_datasets_by_species( + descending=True, limit=limit) + except Exception: + errors = {"error": get_traceback()} + else: + allrecs.append(records) + # Combine errors from success or failure + errinfo = combine_errinfo(errinfo, errors) + return allrecs, errinfo + +# ............................................... @classmethod def _get_dataset_counts(cls, dataset_key): """Get counts for datasetKey. @@ -181,27 +153,6 @@ def _get_organization_counts(cls, pub_org_key): # ............................................................................. if __name__ == "__main__": - # from flask_app.broker.constants import import TST_VALUES - # occids = TST_VALUES.GUIDS_WO_SPECIFY_ACCESS[0:3] - occids = ["84fe1494-c378-4657-be15-8c812b228bf4", - "04c05e26-4876-4114-9e1d-984f78e89c15", - "2facc7a2-dd88-44af-b95a-733cc27527d4"] - occids = ["01493b05-4310-4f28-9d81-ad20860311f3", - "01559f57-62ca-45ba-80b1-d2aafdc46f44", - "015f35b8-655a-4720-9b88-c1c09f6562cb", - "016613ba-4e65-44d5-94d1-e24605afc7e1", - "0170cead-c9cd-48ba-9819-6c5d2e59947e", - "01792c67-910f-4ad6-8912-9b1341cbd983", - "017ea8f2-fc5a-4660-92ec-c203daaaa631", - "018728bb-c376-4562-9ccb-8e3c3fd70df6", - "018a34a9-55da-4503-8aee-e728ba4be146", - "019b547a-79c7-47b3-a5ae-f11d30c2b0de"] - # This occ has 16 issues in IDB, 0 in GBIF - occids = ["2facc7a2-dd88-44af-b95a-733cc27527d4", - "2c1becd5-e641-4e83-b3f5-76a55206539a"] - occids = ["bffe655b-ea32-4838-8e80-a80e391d5b11"] - occids = ["db193603-1ed3-11e3-bfac-90b11c41863e"] - svc = CountSvc() out = svc.get_endpoint() print_analyst_output(out, do_print_rec=True) @@ -210,8 +161,3 @@ def _get_organization_counts(cls, pub_org_key): org_id = None out = svc.get_counts(coll_id, org_id) print_analyst_output(out, do_print_rec=True) - - # for occid in occids: - # out = svc.get_occurrence_records(occid=occid, provider=None, count_only=False) - # outputs = out["records"] - # print_broker_output(out, do_print_rec=True) diff --git a/flask_app/broker/base.py b/flask_app/broker/base.py index 200ed43e..c7b1c455 100644 --- a/flask_app/broker/base.py +++ b/flask_app/broker/base.py @@ -2,7 +2,7 @@ from flask import Flask from werkzeug.exceptions import BadRequest, InternalServerError -import sppy.tools.s2n.utils as lmutil +from sppy.tools.s2n.utils import add_errinfo, combine_errinfo, get_traceback from flask_app.common.base import _SpecifyNetworkService from flask_app.common.s2n_type import ( APIEndpoint, APIService, BrokerOutput, get_host_url, S2nKey, ServiceProvider) @@ -204,14 +204,14 @@ def _get_providers_from_string(cls, usr_req_providers, filter_params=None): providers = valid_requested_providers[0] else: providers = None - errinfo = lmutil.add_errinfo( + errinfo = add_errinfo( errinfo, "error", f"Parameter provider containing exactly one of {valid_providers} " f"options is required") if invalid_providers: for ip in invalid_providers: - errinfo = lmutil.add_errinfo( + errinfo = add_errinfo( errinfo, "warning", f"Value {ip} for parameter provider not in valid options " f"{valid_providers}") @@ -249,6 +249,9 @@ def _standardize_params( a dictionary containing keys and properly formatted values for the user specified parameters. + Raises: + BadRequest on invalid query parameters + Note: filter_params is present to distinguish between providers for occ service by occurrence_id or by dataset_id. 
@@ -275,12 +278,25 @@ def _standardize_params( # "width": width, "icon_status": icon_status} - providers, prov_errinfo = cls._get_providers_from_string( + providers, errinfo = cls._get_providers_from_string( provider, filter_params=filter_params) - usr_params, errinfo = cls._process_params(user_kwargs) + + try: + usr_params, param_errinfo = cls._process_params(user_kwargs) + except Exception: + error_description = get_traceback() + raise BadRequest(error_description) + # consolidate parameters and errors usr_params["provider"] = providers - errinfo = lmutil.combine_errinfo(errinfo, prov_errinfo) + errinfo = combine_errinfo(errinfo, param_errinfo) + + # errinfo["error"] indicates bad parameters, throws exception + try: + error_description = "; ".join(errinfo["error"]) + raise BadRequest(error_description) + except KeyError: + pass # Remove gbif_parse and itis_match flags gbif_parse = itis_match = False @@ -292,6 +308,7 @@ def _standardize_params( itis_match = usr_params.pop("itis_match") except Exception: pass + # Replace namestr with GBIF-parsed namestr if namestr and (gbif_parse or itis_match): usr_params["namestr"] = cls.parse_name_with_gbif(namestr) diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 8ceea88d..5306e93a 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -1,11 +1,10 @@ """Class to query tabular summary Specify Network data in S3""" import boto3 import pandas as pd -from pandassql import sqldf from sppy.aws.aws_tools import get_current_datadate_str - from sppy.aws.aws_constants import (REGION, SUMMARY_FOLDER) +from sppy.tools.s2n.utils import get_traceback @@ -86,13 +85,14 @@ def _query_order_s3_table(self, s3_path, sort_field, descending, limit): list of records matching the query """ recs = [] + errors = {} df = self._create_dataframe_from_s3obj(s3_path) df.sort_values(by=sort_field, ascending=(not descending)) for event in resp["Payload"]: if "Records" in event: records = event["Records"]["Payload"].decode(self.encoding) recs.append(records) - return recs + return recs, errors # ---------------------------------------------------- def get_dataset_counts(self, dataset_key): @@ -148,9 +148,12 @@ def rank_datasets_by_species(self, descending=True, limit=10): datestr = get_current_datadate_str() datestr = "2024_02_01" s3_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" - records = self._query_order_s3_table( - s3_path, "species_count", descending, limit) - return records + try: + records, errors = self._query_order_s3_table( + s3_path, "species_count", descending, limit) + except Exception as e: + errors = {"error": get_traceback()} + return records, errors """ From 296c1228e02581dd038ebd776acc56a3cd093b3e Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 13 Mar 2024 17:55:49 -0500 Subject: [PATCH 14/81] debugging analyst response --- flask_app/analyst/base.py | 4 +- flask_app/analyst/count.py | 19 +++--- flask_app/analyst/routes.py | 6 +- flask_app/broker/badge.py | 43 +++++------- flask_app/broker/name.py | 39 ++++------- flask_app/broker/occ.py | 42 +++++------- flask_app/common/s2n_type.py | 13 ++++ sphinx/misc/debugging.rst | 10 ++- sppy/tools/provider/awss3.py | 123 ++++++++++++++++++++++------------- 9 files changed, 161 insertions(+), 138 deletions(-) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 004d662f..1ed8e959 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -77,8 +77,8 @@ def _standardize_params( user specified parameters. 
""" user_kwargs = { - "collection_id": dataset_key, - "organization_id": pub_org_key, + "dataset_key": dataset_key, + "pub_org_key": pub_org_key, "descending": descending, "limit": limit } diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index ad402665..cdd82375 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -27,13 +27,12 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): allrecs = [] try: good_params, errinfo = cls._standardize_params( - cls, dataset_key=dataset_key, pub_org_key=pub_org_key) + dataset_key=dataset_key, pub_org_key=pub_org_key) except BadRequest as e: - errinfo = combine_errinfo(errinfo, {"error": e.description}) + errinfo = {"error": e.description} else: - # Query dataset counts if dataset_key is not None: try: @@ -47,7 +46,8 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): # Query organization counts if pub_org_key is not None: - errors = {"warning": "Count by Publishing Organization is not implemented"} + errors = { + "warning": "Count by Publishing Organization is not implemented"} errinfo = combine_errinfo(errinfo, errors) # Assemble @@ -98,12 +98,11 @@ def _get_dataset_counts(cls, dataset_key): errors = {} s3 = S3Query(PROJ_BUCKET, region=REGION, encoding=ENCODING) try: - (occ_count, species_count) = s3.get_dataset_counts(dataset_key) + records = s3.get_dataset_counts(dataset_key) except Exception: traceback = get_traceback() errors["error"] = [HTTPStatus.INTERNAL_SERVER_ERROR, traceback] - else: - records.append((occ_count, species_count)) + return records, errors # ............................................... @@ -157,7 +156,7 @@ def _get_organization_counts(cls, pub_org_key): out = svc.get_endpoint() print_analyst_output(out, do_print_rec=True) - coll_id = "a7156437-55ec-4c6f-89de-938f9361753d" - org_id = None - out = svc.get_counts(coll_id, org_id) + ds_key = "a7156437-55ec-4c6f-89de-938f9361753d" + org_key = None + out = svc.get_counts(dataset_key=ds_key, pub_org_key=org_key) print_analyst_output(out, do_print_rec=True) diff --git a/flask_app/analyst/routes.py b/flask_app/analyst/routes.py index f53ad06c..7afc0d09 100644 --- a/flask_app/analyst/routes.py +++ b/flask_app/analyst/routes.py @@ -72,13 +72,13 @@ def count_endpoint(): response: A flask_app.analyst API response object containing the count API response. 
""" - coll_arg = request.args.get("collection_id", default=None, type=str) + ds_arg = request.args.get("dataset_key", default=None, type=str) # org_arg = request.args.get("organization_id", default=None, type=str) # if coll_arg is None and org_arg is None: - if coll_arg is None: + if ds_arg is None: response = CountSvc.get_endpoint() else: - response = CountSvc.get_counts(coll_arg) + response = CountSvc.get_counts(ds_arg) return response diff --git a/flask_app/broker/badge.py b/flask_app/broker/badge.py index 9be666d6..d95bcff5 100644 --- a/flask_app/broker/badge.py +++ b/flask_app/broker/badge.py @@ -6,7 +6,7 @@ from flask_app.broker.constants import (ICON_CONTENT, ICON_DIR) from flask_app.common.s2n_type import APIService, S2nKey, ServiceProvider -from sppy.tools.s2n.utils import get_traceback +from sppy.tools.s2n.utils import combine_errinfo, get_traceback from flask_app.broker.base import _BrokerService @@ -82,34 +82,27 @@ def get_icon( try: good_params, errinfo = cls._standardize_params( provider=provider, icon_status=icon_status) - # Bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except Exception: - pass - except Exception: - # Unknown error - error_description = get_traceback() - raise BadRequest(error_description) + except BadRequest as e: + raise - icon_basename = cls._get_icon_filename( - good_params["provider"][0], good_params["icon_status"]) - icon_fname = os.path.join(app_path, ICON_DIR, icon_basename) + else: + icon_basename = cls._get_icon_filename( + good_params["provider"][0], good_params["icon_status"]) + icon_fname = os.path.join(app_path, ICON_DIR, icon_basename) + + if icon_fname is not None: + if stream: + return send_file( + icon_fname, mimetype=ICON_CONTENT, as_attachment=False) + else: + return send_file( + icon_fname, mimetype=ICON_CONTENT, as_attachment=True, + attachment_filename=icon_fname) - if icon_fname is not None: - if stream: - return send_file( - icon_fname, mimetype=ICON_CONTENT, as_attachment=False) else: - return send_file( - icon_fname, mimetype=ICON_CONTENT, as_attachment=True, - attachment_filename=icon_fname) - - else: - raise NotImplementedError( - f"Badge {icon_status} not implemented for provider {provider}") + raise NotImplementedError( + f"Badge {icon_status} not implemented for provider {provider}") # ............................................................................. diff --git a/flask_app/broker/name.py b/flask_app/broker/name.py index 4b1e2e63..d79a944a 100644 --- a/flask_app/broker/name.py +++ b/flask_app/broker/name.py @@ -9,7 +9,7 @@ from sppy.tools.provider.gbif import GbifAPI from sppy.tools.provider.itis import ItisAPI from sppy.tools.provider.worms import WormsAPI -from sppy.tools.s2n.utils import get_traceback +from sppy.tools.s2n.utils import combine_errinfo, get_traceback # ............................................................................. @@ -156,35 +156,24 @@ def get_name_records( good_params, errinfo = cls._standardize_params( namestr=namestr, provider=provider, is_accepted=is_accepted, gbif_parse=gbif_parse, gbif_count=gbif_count, kingdom=kingdom) - # Bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except KeyError: - pass - except Exception: - error_description = get_traceback() - raise BadRequest(error_description) - try: - # Do Query! 
- output = cls._get_records( - good_params["namestr"], good_params["provider"], - good_params["is_accepted"], good_params["gbif_count"], - good_params["kingdom"]) + except BadRequest as e: + full_output = cls._get_badquery_output(e.description) - # Add message on invalid parameters to output + else: try: - for err in errinfo["warning"]: - output.append_error("warning", err) - except KeyError: - pass + # Do Query!, returns BrokerOutput + full_output = cls._get_records( + good_params["namestr"], good_params["provider"], + good_params["is_accepted"], good_params["gbif_count"], + good_params["kingdom"]) + except Exception: + full_output = cls._get_badquery_output(get_traceback()) - except Exception: - error_description = get_traceback() - raise InternalServerError(error_description) + # Combine with errors from parameters + full_output.combine_errors(errinfo) - return output.response + return full_output.response # ............................................................................. diff --git a/flask_app/broker/occ.py b/flask_app/broker/occ.py index 007de72c..47d31378 100644 --- a/flask_app/broker/occ.py +++ b/flask_app/broker/occ.py @@ -161,36 +161,24 @@ def get_occurrence_records( good_params, errinfo = cls._standardize_params( occid=occid, provider=provider, gbif_dataset_key=gbif_dataset_key, count_only=count_only) - # errinfo indicates bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except KeyError: - pass - - except Exception: - error_description = get_traceback() - raise BadRequest(error_description) - # Do Query! - try: - output = cls._get_records( - good_params["occid"], good_params["provider"], - good_params["count_only"], - gbif_dataset_key=good_params["gbif_dataset_key"]) + except BadRequest as e: + full_output = cls._get_badquery_output(e.description) - # Add message on invalid parameters to output + else: try: - for err in errinfo["warning"]: - output.append_error("warning", err) - except KeyError: - pass - - except Exception: - error_description = get_traceback() - raise InternalServerError(error_description) - - return output.response + # Do Query!, returns BrokerOutput + full_output = cls._get_records( + good_params["occid"], good_params["provider"], + good_params["count_only"], + gbif_dataset_key=good_params["gbif_dataset_key"]) + except Exception: + full_output = cls._get_badquery_output(get_traceback()) + + # Combine with errors from parameters + full_output.combine_errors(errinfo) + + return full_output.response # ............................................................................. diff --git a/flask_app/common/s2n_type.py b/flask_app/common/s2n_type.py index 15831e2a..ea4dc7e6 100644 --- a/flask_app/common/s2n_type.py +++ b/flask_app/common/s2n_type.py @@ -823,6 +823,19 @@ def append_error(self, error_type, error_desc): except KeyError: self._response[S2nKey.ERRORS][error_type] = [error_desc] + # ............................................... + def combine_errors(self, errinfo): + """Combine a dictionary of errors to the errors in a S2nOutput query response. + + Args: + errinfo: dictionary of errors, with error level, and list of descriptions. + """ + for err_type, err_desc in errinfo.items(): + try: + self._response[S2nKey.ERRORS][err_type].append(err_desc) + except KeyError: + self._response[S2nKey.ERRORS][err_type] = [err_desc] + # ............................................... 
@property def response(self): diff --git a/sphinx/misc/debugging.rst b/sphinx/misc/debugging.rst index 8cfd0aa3..0c8b7844 100644 --- a/sphinx/misc/debugging.rst +++ b/sphinx/misc/debugging.rst @@ -18,11 +18,17 @@ export FLASK_ENV=development export FLASK_APP=flask_app.broker.routes flask run ``` -* With either Analyst or Broker, the development port will be 5000 +* With either Analyst or Broker, the development port will be 5000. Connect to + http://127.0.0.1:5000 in browser, - * Connect to http://127.0.0.1:5000 in browser, + * Broker i.e. http://127.0.0.1:5000/api/v1/name/Acer%20opalus%20Miller?is_accepted=True&gbif_count=False& + or http://127.0.0.1:5000/api/v1/occ/?occid=db8cc0df-1ed3-11e3-bfac-90b11c41863e&provider=gbif + * Analyst: + http://127.0.0.1:5000/api/v1/count/?dataset_key=0000e36f-d0e9-46b0-aa23-cc1980f00515 * Flask will auto-update on file save. * Refresh browser after changes +* The frontend endpoint cannot be tested this way, as it depends on frontend + **webpack-output** and **static-files** to be mounted as docker volumes. diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 5306e93a..15d43880 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -1,5 +1,6 @@ """Class to query tabular summary Specify Network data in S3""" import boto3 +import json import pandas as pd from sppy.aws.aws_tools import get_current_datadate_str @@ -33,7 +34,7 @@ def __init__( self.exp_type = 'SQL' # ---------------------------------------------------- - def _query_s3_table(self, s3_path, query_str): + def _query_s3_table(self, s3_path, query_str, format="JSON"): """Query the S3 resource defined for this class. Args: @@ -43,18 +44,36 @@ def _query_s3_table(self, s3_path, query_str): list of records matching the query """ recs = [] + if format not in ("JSON", "CSV"): + format = "JSON" + if format == "JSON": + out_serialization = {"JSON": {}} + elif format == "CSV": + out_serialization = { + "CSV": { + "QuoteFields": "ASNEEDED", + "FieldDelimiter": ",", + "QuoteCharacter": '"'} + } resp = self.s3.select_object_content( Bucket=self.bucket, - Key=self.s3_path, + Key=s3_path, ExpressionType='SQL', Expression=query_str, InputSerialization={"Parquet": {}}, - OutputSerialization={"JSON": {}} + OutputSerialization=out_serialization ) for event in resp["Payload"]: if "Records" in event: - records = event["Records"]["Payload"].decode(self.encoding) - recs.append(records) + recs_str = event["Records"]["Payload"].decode(ENCODING) + rec_strings = recs_str.split("\n") + for rs in rec_strings: + if rs: + if format == "JSON": + rec = json.loads(rs) + else: + rec = rs.split(",") + recs.append(rec) return recs # ---------------------------------------------------- @@ -108,14 +127,13 @@ def get_dataset_counts(self, dataset_key): datestr = get_current_datadate_str() datestr = "2024_02_01" s3_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" - query_str = (f"SELECT occ_count, species_count " + query_str = (f"SELECT datasetkey, occ_count, species_count " f"FROM s3object s " - f"WHERE s.datasetkey = {dataset_key}") + f"WHERE s.datasetkey = '{dataset_key}'") + print(query_str) # Returns empty list or list of 1 record with [(occ_count, species_count)] - records = self._query_s3_table(s3_path, query_str) - if records: - (occ_count, species_count) = records[0] - return (occ_count, species_count) + records = self._query_s3_table(s3_path, query_str, format="JSON") + return records # ---------------------------------------------------- def get_org_counts(self, 
pub_org_key): @@ -155,39 +173,56 @@ def rank_datasets_by_species(self, descending=True, limit=10): errors = {"error": get_traceback()} return records, errors - +# ............................................................................. +if __name__ == "__main__": + from sppy.aws.aws_constants import ENCODING, PROJ_BUCKET, REGION + + datestr = "2024_02_01" + dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" + s3 = boto3.client('s3') + + s3_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" + query_str = (f"SELECT datasetkey, occ_count, species_count " + f"FROM s3object s " + f"WHERE s.datasetkey = '{dataset_key}'") + query_str = f"SELECT datasetkey, occ_count, species_count FROM s3object s LIMIT 5" + + format = "CSV" + if format == "JSON": + out_serialization = {"JSON": {}} + elif format == "CSV": + out_serialization = { + "CSV": { + "QuoteFields": "ASNEEDED", + "FieldDelimiter": ",", + "QuoteCharacter": '"'} + } + resp = s3.select_object_content( + Bucket=PROJ_BUCKET, + Key=s3_path, + ExpressionType="SQL", + Expression=query_str, + InputSerialization={"Parquet": {}}, + OutputSerialization=out_serialization + ) + + for event in resp["Payload"]: + print(event) + if "Records" in event: + recs_str = event["Records"]["Payload"].decode(ENCODING) + rec_strings = recs_str.split("\n") + for rs in rec_strings: + if rs: + if format == "JSON": + rec = json.loads(rs) + else: + rec = rs.split(",") + print(rec) + + + + + # records = self._query_s3_table(s3_path, query_str) """ -import boto3 -SELECT s.datasetkey, s.occ_count, s.species_count FROM s3object s ORDER BY s.species_count DESC LIMIT 5 -from sppy.aws.aws_constants import ( - INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, - PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, - USER_DATA_TOKEN) - -ctable = "dataset_counts_2024_02_01_000.parquet" -ltable = "dataset_lists_2024_02_01_000.parquet" -s3_path = f"summary/{ctable}" -dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" - -s3 = boto3.client('s3') -query_str = (f"SELECT occ_count, species_count " - f"FROM s3object s " - f"WHERE s.datasetkey = '{dataset_key}'") - -SELECT * FROM s3object WHERE datasetkey = '0000e36f-d0e9-46b0-aa23-cc1980f00515' - -resp = s3.select_object_content( - Bucket=PROJ_BUCKET, - Key=s3_path, - ExpressionType='SQL', - Expression=query_str, - InputSerialization={"Parquet": {}}, - OutputSerialization={"CSV": {}} - ) - -for event in resp["Payload"]: - if "Records" in event: - records = event["Records"]["Payload"].decode('utf-8') - print(records) """ From 3dd75b74ef020e4bc5540136d214093850c79d36 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 14 Mar 2024 11:48:44 -0500 Subject: [PATCH 15/81] fixed circular imports --- flask_app/analyst/base.py | 10 ++- flask_app/analyst/count.py | 49 +++++++++------ flask_app/broker/base.py | 10 --- flask_app/broker/name.py | 4 +- flask_app/common/base.py | 13 ++-- sppy/tools/provider/awss3.py | 118 +++++++++++------------------------ 6 files changed, 79 insertions(+), 125 deletions(-) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 1ed8e959..0c771cb6 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -1,13 +1,11 @@ """Parent Class for the Specify Network API services.""" -from flask import Flask -from werkzeug.exceptions import (BadRequest, InternalServerError) +from werkzeug.exceptions import BadRequest -from flask_app.analyst.constants import QUERY_LIMIT from flask_app.common.base import _SpecifyNetworkService 
-from sppy.tools.s2n.utils import add_errinfo, get_traceback -from flask_app.common.s2n_type import AnalystOutput, APIEndpoint, APIService +from sppy.tools.s2n.utils import get_traceback +from flask_app.common.s2n_type import AnalystOutput, APIService -app = Flask(__name__) +# app = Flask(__name__) # ............................................................................. diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index cdd82375..006325f2 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -1,15 +1,14 @@ """Class for the Specify Network Name API service.""" -import boto3 from http import HTTPStatus -from werkzeug.exceptions import (BadRequest, InternalServerError) +from werkzeug.exceptions import BadRequest from flask_app.common.s2n_type import APIService, AnalystOutput from flask_app.common.util import print_analyst_output from flask_app.analyst.base import _AnalystService -from sppy.aws.aws_constants import ENCODING, PROJ_BUCKET, REGION +from sppy.aws.aws_constants import PROJ_BUCKET from sppy.tools.provider.awss3 import S3Query -from sppy.tools.s2n.utils import combine_errinfo, get_traceback +from sppy.tools.s2n.utils import (combine_errinfo, get_traceback) # ............................................................................. @@ -20,7 +19,16 @@ class CountSvc(_AnalystService): # ............................................... @classmethod - def get_counts(cls, dataset_key=None, pub_org_key=None): + def get_counts(cls, dataset_key=None, pub_org_key=None, format="CSV"): + """Return occurrence and species counts for dataset/organization identifiers. + + Args: + dataset_key: URL parameter for unique GBIF identifier of dataset. + pub_org_key: URL parameter for unique GBIF identifier of + publishingOrganization. + format: output format, options "CSV" or "JSON" + + """ if dataset_key is None and pub_org_key is None: return cls.get_endpoint() @@ -34,18 +42,20 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): else: # Query dataset counts - if dataset_key is not None: + if good_params["dataset_key"] is not None: try: - records, errors = cls._get_dataset_counts(dataset_key) + records, errors = cls._get_dataset_counts( + good_params["dataset_key"], format) except Exception: errors = {"error": get_traceback()} else: - allrecs.append(records) + if records: + allrecs.append(records) # Combine errors from success or failure errinfo = combine_errinfo(errinfo, errors) # Query organization counts - if pub_org_key is not None: + if good_params["pub_org_key"] is not None: errors = { "warning": "Count by Publishing Organization is not implemented"} errinfo = combine_errinfo(errinfo, errors) @@ -71,7 +81,7 @@ def get_ranked_counts(cls, descending=True, limit=10): else: # Do Query! try: - s3 = S3Query(PROJ_BUCKET, region=REGION, encoding=ENCODING) + s3 = S3Query(PROJ_BUCKET) records, errors = s3.rank_datasets_by_species( descending=True, limit=limit) except Exception: @@ -84,11 +94,12 @@ def get_ranked_counts(cls, descending=True, limit=10): # ............................................... @classmethod - def _get_dataset_counts(cls, dataset_key): + def _get_dataset_counts(cls, dataset_key, format): """Get counts for datasetKey. Args: - dataset_key: Unique identifier for GBIF datasets. + dataset_key: unique GBIF identifier for dataset of interest. 
+ format: output format, options "CSV" or "JSON" Returns: a flask_app.analyst.s2n_type.AnalystOutput object with optional records as a @@ -96,9 +107,9 @@ def _get_dataset_counts(cls, dataset_key): """ records = [] errors = {} - s3 = S3Query(PROJ_BUCKET, region=REGION, encoding=ENCODING) + s3 = S3Query(PROJ_BUCKET) try: - records = s3.get_dataset_counts(dataset_key) + records = s3.get_dataset_counts(dataset_key, format=format) except Exception: traceback = get_traceback() errors["error"] = [HTTPStatus.INTERNAL_SERVER_ERROR, traceback] @@ -119,7 +130,7 @@ def _get_organization_counts(cls, pub_org_key): """ records = [] errors = {} - s3 = S3Query(PROJ_BUCKET, region=REGION, encoding=ENCODING) + s3 = S3Query(PROJ_BUCKET) try: (occ_count, species_count) = s3.get_org_counts(pub_org_key) except Exception: @@ -155,8 +166,8 @@ def _get_organization_counts(cls, pub_org_key): svc = CountSvc() out = svc.get_endpoint() print_analyst_output(out, do_print_rec=True) - - ds_key = "a7156437-55ec-4c6f-89de-938f9361753d" - org_key = None - out = svc.get_counts(dataset_key=ds_key, pub_org_key=org_key) + format = "CSV" + dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" + out = svc.get_counts(dataset_key=dataset_key, pub_org_key=None, format="CSV") print_analyst_output(out, do_print_rec=True) + diff --git a/flask_app/broker/base.py b/flask_app/broker/base.py index c7b1c455..bba52669 100644 --- a/flask_app/broker/base.py +++ b/flask_app/broker/base.py @@ -9,8 +9,6 @@ from sppy.tools.provider.gbif import GbifAPI from sppy.tools.provider.itis import ItisAPI -app = Flask(__name__) - # ............................................................................. @app.errorhandler(BadRequest) @@ -268,14 +266,6 @@ def _standardize_params( "gbif_dataset_key": gbif_dataset_key, "count_only": count_only, "url": url, - # "bbox": bbox, - # "exceptions": exceptions, - # "height": height, - # "layers": layers, - # "request": request, - # "format": frmat, - # "srs": srs, - # "width": width, "icon_status": icon_status} providers, errinfo = cls._get_providers_from_string( diff --git a/flask_app/broker/name.py b/flask_app/broker/name.py index d79a944a..bb2343ff 100644 --- a/flask_app/broker/name.py +++ b/flask_app/broker/name.py @@ -1,5 +1,5 @@ """Class for the Specify Network Name API service.""" -from werkzeug.exceptions import (BadRequest, InternalServerError) +from werkzeug.exceptions import BadRequest from flask_app.broker.base import _BrokerService from flask_app.common.s2n_type import ( @@ -9,7 +9,7 @@ from sppy.tools.provider.gbif import GbifAPI from sppy.tools.provider.itis import ItisAPI from sppy.tools.provider.worms import WormsAPI -from sppy.tools.s2n.utils import combine_errinfo, get_traceback +from sppy.tools.s2n.utils import get_traceback # ............................................................................. 
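The Broker name service trimmed above can be exercised the same way as the Analyst count service. A sketch using the development URL given in the debugging notes (patch 14); requests is assumed to be available:

```
import requests

# URL and query string copied from sphinx/misc/debugging.rst; parameters mirror
# the __main__ test calls in name.py.
url = "http://127.0.0.1:5000/api/v1/name/Acer%20opalus%20Miller"
params = {"is_accepted": "True", "gbif_count": "False"}

resp = requests.get(url, params=params, timeout=30)
resp.raise_for_status()
payload = resp.json()

print(payload.get("errors"))
for provider_output in payload.get("records", []):
    print(provider_output)
```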
diff --git a/flask_app/common/base.py b/flask_app/common/base.py index 5144d0d5..017ea59d 100644 --- a/flask_app/common/base.py +++ b/flask_app/common/base.py @@ -2,11 +2,8 @@ from flask import Flask from werkzeug.exceptions import BadRequest, InternalServerError -import sppy.tools.s2n.utils as lmutil -from flask_app.common.s2n_type import ( - APIEndpoint, APIService, BrokerOutput, get_host_url, S2nKey, ServiceProvider) -from sppy.tools.provider.gbif import GbifAPI -from sppy.tools.provider.itis import ItisAPI +from sppy.tools.s2n.utils import add_errinfo +from flask_app.common.s2n_type import APIEndpoint app = Flask(__name__) @@ -226,12 +223,12 @@ def _process_params(cls, user_kwargs=None): elif key == "icon_status": valid_stat = param_meta["options"] if val is None: - errinfo = lmutil.add_errinfo( + errinfo = add_errinfo( errinfo, "error", f"Parameter {key} containing one of {valid_stat} options is " f"required") elif val not in valid_stat: - errinfo = lmutil.add_errinfo( + errinfo = add_errinfo( errinfo, "error", f"Value {val} for parameter {key} not in valid options " f"{valid_stat}") @@ -241,7 +238,7 @@ def _process_params(cls, user_kwargs=None): elif val is not None: usr_val, valid_options = cls._fix_type_new(key, val) if valid_options is not None and val not in valid_options: - errinfo = lmutil.add_errinfo( + errinfo = add_errinfo( errinfo, "error", f"Value {val} for parameter {key} is not in valid options " f"{param_meta['options']}") diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 15d43880..992d969e 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -3,42 +3,44 @@ import json import pandas as pd +from sppy.aws.aws_constants import ENCODING, PROJ_BUCKET, REGION, SUMMARY_FOLDER from sppy.aws.aws_tools import get_current_datadate_str -from sppy.aws.aws_constants import (REGION, SUMMARY_FOLDER) from sppy.tools.s2n.utils import get_traceback # ............................................................................. class S3Query(): - """Specify Network API service for retrieving tabular parquet data from AWS S3.""" + """Class for retrieving SpecifyNetwork summary data from AWS S3.""" # ............................................... @classmethod def __init__( - self, bucket, region=REGION, encoding="utf-8"): + self, bucket, region=REGION, encoding=ENCODING): """Object to query tabular data in S3. Args: bucket: S3 bucket containing data. - s3_path: S3 folder(s) containing data objects. - datatype: type of tabular data, 'CSV', 'JSON', and 'Parquet' are allowed. region: AWS region containing the data. encoding: encoding of the data. """ - self.s3 = boto3.client('s3') self.bucket = bucket self.region = region self.encoding = encoding - self._current_datestr = get_current_datadate_str() self.exp_type = 'SQL' + datestr = get_current_datadate_str() + datestr = "2024_02_01" + self._dataset_counts_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" + self._dataset_lists_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" # ---------------------------------------------------- - def _query_s3_table(self, s3_path, query_str, format="JSON"): + def _query_table(self, s3_path, query_str, format="CSV"): """Query the S3 resource defined for this class. Args: + s3_path: S3 folder and filename within the bucket query_str: a SQL query for S3 select. 
+ format: output format, options "CSV" or "JSON" Returns: list of records matching the query @@ -55,10 +57,11 @@ def _query_s3_table(self, s3_path, query_str, format="JSON"): "FieldDelimiter": ",", "QuoteCharacter": '"'} } - resp = self.s3.select_object_content( + s3 = boto3.client("s3", region_name=self.region) + resp = s3.select_object_content( Bucket=self.bucket, Key=s3_path, - ExpressionType='SQL', + ExpressionType="SQL", Expression=query_str, InputSerialization={"Parquet": {}}, OutputSerialization=out_serialization @@ -94,45 +97,44 @@ def _create_dataframe_from_s3obj(self, s3_path): return df # ---------------------------------------------------- - def _query_order_s3_table(self, s3_path, sort_field, descending, limit): + def _query_order_s3_table( + self, s3_path, sort_field, descending, limit, format="CSV"): """Query the S3 resource defined for this class. Args: - query_str: a SQL query for S3 select. + s3_path: S3 folder and filename within the bucket + sort_field: fieldname to sort records on + descending: boolean flag indicating to sort ascending or descending + limit: number of records to return, limit is 500 + format: output format, options "CSV" or "JSON" Returns: - list of records matching the query + ordered list of records matching the query """ - recs = [] - errors = {} - df = self._create_dataframe_from_s3obj(s3_path) - df.sort_values(by=sort_field, ascending=(not descending)) - for event in resp["Payload"]: - if "Records" in event: - records = event["Records"]["Payload"].decode(self.encoding) - recs.append(records) - return recs, errors + pass + # recs = [] + # errors = {} + # df = self._create_dataframe_from_s3obj(s3_path) + # df.sort_values(by=sort_field, ascending=(not descending)) + # return recs, errors # ---------------------------------------------------- - def get_dataset_counts(self, dataset_key): + def get_dataset_counts(self, dataset_key, format="CSV"): """Query the S3 resource for occurrence and species counts for this dataset. Args: dataset_key: unique GBIF identifier for dataset of interest. + format: output format, options "CSV" or "JSON" Returns: - records: empty list or list of 1 record containing occ_count, species_count + records: empty list or list of 1 record (list) """ - (occ_count, species_count) = (0,0) - datestr = get_current_datadate_str() - datestr = "2024_02_01" - s3_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" - query_str = (f"SELECT datasetkey, occ_count, species_count " - f"FROM s3object s " + query_str = (f"SELECT datasetkey, occ_count, species_count FROM s3object s " f"WHERE s.datasetkey = '{dataset_key}'") + query_str = "SELECT * FROM s3object s LIMIT 5" print(query_str) # Returns empty list or list of 1 record with [(occ_count, species_count)] - records = self._query_s3_table(s3_path, query_str, format="JSON") + records = self._query_table(self._dataset_counts_path, query_str, format=format) return records # ---------------------------------------------------- @@ -175,54 +177,10 @@ def rank_datasets_by_species(self, descending=True, limit=10): # ............................................................................. 
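_query_order_s3_table above is still a stub: S3 Select SQL has no ORDER BY, so the ranking has to happen client-side, which is why this series pulls the parquet object into a pandas DataFrame. A sketch of the intended behaviour, assuming the same column layout and that the pyarrow/s3fs packages from requirements.txt are installed:

```
import pandas as pd


def rank_datasets(s3_uri, sort_field="species_count", descending=True, limit=10):
    # Read the whole summary parquet object, sort it locally, and return the
    # top/bottom rows as plain lists for the AnalystOutput records.
    df = pd.read_parquet(s3_uri)
    df = df.sort_values(by=sort_field, ascending=not descending)
    return df.head(limit).values.tolist()


# PROJ_BUCKET resolves to "specnet-us-east-1" in aws_constants.py.
top10 = rank_datasets(
    "s3://specnet-us-east-1/summary/dataset_counts_2024_02_01_000.parquet")
```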
if __name__ == "__main__": - from sppy.aws.aws_constants import ENCODING, PROJ_BUCKET, REGION - - datestr = "2024_02_01" + format = "CSV" dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" - s3 = boto3.client('s3') + s3q = S3Query(PROJ_BUCKET) + recs = s3q.get_dataset_counts(dataset_key, format=format) + for r in recs: + print(r) - s3_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" - query_str = (f"SELECT datasetkey, occ_count, species_count " - f"FROM s3object s " - f"WHERE s.datasetkey = '{dataset_key}'") - query_str = f"SELECT datasetkey, occ_count, species_count FROM s3object s LIMIT 5" - - format = "CSV" - if format == "JSON": - out_serialization = {"JSON": {}} - elif format == "CSV": - out_serialization = { - "CSV": { - "QuoteFields": "ASNEEDED", - "FieldDelimiter": ",", - "QuoteCharacter": '"'} - } - resp = s3.select_object_content( - Bucket=PROJ_BUCKET, - Key=s3_path, - ExpressionType="SQL", - Expression=query_str, - InputSerialization={"Parquet": {}}, - OutputSerialization=out_serialization - ) - - for event in resp["Payload"]: - print(event) - if "Records" in event: - recs_str = event["Records"]["Payload"].decode(ENCODING) - rec_strings = recs_str.split("\n") - for rs in rec_strings: - if rs: - if format == "JSON": - rec = json.loads(rs) - else: - rec = rs.split(",") - print(rec) - - - - - # records = self._query_s3_table(s3_path, query_str) -""" - -""" From b19eda80d0fdff627460a1ed7cd0601b5d77e594 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 14 Mar 2024 12:21:49 -0500 Subject: [PATCH 16/81] move debug printing to output objects --- flask_app/analyst/count.py | 18 +++++---- flask_app/broker/name.py | 5 +-- flask_app/broker/occ.py | 11 +++--- flask_app/common/s2n_type.py | 77 +++++++++++++++++++++++++++++++++++- flask_app/common/util.py | 74 ---------------------------------- sppy/tools/provider/awss3.py | 10 ++--- 6 files changed, 97 insertions(+), 98 deletions(-) diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 006325f2..2eaf3165 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -3,7 +3,6 @@ from werkzeug.exceptions import BadRequest from flask_app.common.s2n_type import APIService, AnalystOutput -from flask_app.common.util import print_analyst_output from flask_app.analyst.base import _AnalystService from sppy.aws.aws_constants import PROJ_BUCKET @@ -19,7 +18,7 @@ class CountSvc(_AnalystService): # ............................................... @classmethod - def get_counts(cls, dataset_key=None, pub_org_key=None, format="CSV"): + def get_counts(cls, dataset_key=None, pub_org_key=None, format="JSON"): """Return occurrence and species counts for dataset/organization identifiers. Args: @@ -163,11 +162,14 @@ def _get_organization_counts(cls, pub_org_key): # ............................................................................. 
if __name__ == "__main__": - svc = CountSvc() - out = svc.get_endpoint() - print_analyst_output(out, do_print_rec=True) - format = "CSV" + format = "JSON" dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" - out = svc.get_counts(dataset_key=dataset_key, pub_org_key=None, format="CSV") - print_analyst_output(out, do_print_rec=True) + + svc = CountSvc() + response = svc.get_endpoint() + AnalystOutput.print_output(response, do_print_rec=True) + # print(response) + response = svc.get_counts(dataset_key=dataset_key, pub_org_key=None, format=format) + AnalystOutput.print_output(response, do_print_rec=True) + # print(response) diff --git a/flask_app/broker/name.py b/flask_app/broker/name.py index bb2343ff..cecde97c 100644 --- a/flask_app/broker/name.py +++ b/flask_app/broker/name.py @@ -4,7 +4,6 @@ from flask_app.broker.base import _BrokerService from flask_app.common.s2n_type import ( APIEndpoint, APIService, BrokerOutput, BrokerSchema, S2nKey, ServiceProvider) -from flask_app.common.util import print_broker_output from sppy.tools.provider.gbif import GbifAPI from sppy.tools.provider.itis import ItisAPI @@ -195,7 +194,7 @@ def get_name_records( svc = NameSvc() for namestr in test_names: - out = svc.get_name_records( + response = svc.get_name_records( namestr=namestr, provider=None, is_accepted=False, gbif_parse=True, gbif_count=True, kingdom=None) - print_broker_output(out, do_print_rec=True) + BrokerOutput.print_output(response, do_print_rec=True) diff --git a/flask_app/broker/occ.py b/flask_app/broker/occ.py index 47d31378..4f277c37 100644 --- a/flask_app/broker/occ.py +++ b/flask_app/broker/occ.py @@ -4,7 +4,6 @@ from flask_app.broker.base import _BrokerService from flask_app.common.s2n_type import ( APIEndpoint, APIService, BrokerOutput, BrokerSchema, S2nKey, ServiceProvider) -from flask_app.common.util import print_broker_output from sppy.tools.provider.gbif import GbifAPI from sppy.tools.provider.idigbio import IdigbioAPI @@ -206,11 +205,11 @@ def get_occurrence_records( svc = OccurrenceSvc() out = svc.get_endpoint() - out = svc.get_occurrence_records(occid="a7156437-55ec-4c6f-89de-938f9361753d") + response = svc.get_occurrence_records(occid="a7156437-55ec-4c6f-89de-938f9361753d") - print_broker_output(out, do_print_rec=True) + BrokerOutput.print_output(response, do_print_rec=True) # for occid in occids: - # out = svc.get_occurrence_records(occid=occid, provider=None, count_only=False) - # outputs = out["records"] - # print_broker_output(out, do_print_rec=True) + # response = svc.get_occurrence_records(occid=occid, provider=None, count_only=False) + # recs = response["records"] + # BrokerOutput.print_output(response, do_print_rec=True) diff --git a/flask_app/common/s2n_type.py b/flask_app/common/s2n_type.py index ea4dc7e6..92bcedd8 100644 --- a/flask_app/common/s2n_type.py +++ b/flask_app/common/s2n_type.py @@ -992,13 +992,65 @@ def format_records(self, ordered_fieldnames): ordered_recs.append(ordrec) self._response[S2nKey.RECORDS] = ordered_recs + # ............................................................................. 
+ @classmethod + def _print_sub_output(cls, oneelt, do_print_rec): + print("* One record of Specify Network Outputs *") + for name, attelt in oneelt.items(): + try: + if name == "records": + print(" records") + if do_print_rec is False: + print(f" {name}: {len(attelt)} returned records") + else: + for rec in attelt: + print(" record") + for k, v in rec.items(): + print(" {}: {}".format(k, v)) + else: + print(" {}: {}".format(name, attelt)) + except Exception: + pass + + # .................................... + @classmethod + def print_output(cls, response_dict, do_print_rec=False): + """Print a formatted string of the elements in an S2nOutput query response. + + Args: + response_dict: flask_app.broker.s2n_type.S2nOutput._response dictionary + do_print_rec: True to print each record in the response. + + TODO: move to a class method + """ + print("*** Broker output ***") + for name, attelt in response_dict.items(): + try: + if name == "records": + print("records: ") + for respdict in attelt: + cls._print_sub_output(respdict, do_print_rec) + else: + print(f"{name}: {attelt}") + except Exception: + pass + # outelts = set(response_dict.keys()) + # missing = S2nKey.broker_response_keys().difference(outelts) + # extras = outelts.difference(S2nKey.broker_response_keys()) + # if missing: + # print(f"Missing elements: {missing}") + # if extras: + # print(f"Extra elements: {extras}") + print("") + # ............................................................................. class AnalystOutput: """Response type for a Specify Network Analyst query.""" service: str description: str = "" - records: typing.List[dict] = [] + # records: typing.List[dict] = [] + records: typing.List = [] errors: dict = {} # ............................................... @@ -1008,7 +1060,7 @@ def __init__(self, service, description=None, records=None, errors=None): Args: service: API Service this object is responding to. description: Description of the computation in this response. - records: Records in this response. + records: Records (lists or dictionaries) in this response. errors: Errors encountered when generating this response. """ if errors is None: @@ -1035,6 +1087,27 @@ def response(self): """ return self._response + # .................................... + @classmethod + def print_output(cls, response_dict, do_print_rec=False): + """Print a formatted string of the elements in an S2nOutput query response. + + Args: + response_dict: flask_app.broker.s2n_type.S2nOutput._response dictionary + do_print_rec: True to print each record in the response. + """ + print("*** Analyst output ***") + for name, attelt in response_dict.items(): + try: + if name == "records" and do_print_rec: + print("records: ") + for rec in attelt: + print(rec) + else: + print(f"{name}: {attelt}") + except Exception: + pass + # ............................................................................. class ServiceProvider: diff --git a/flask_app/common/util.py b/flask_app/common/util.py index 3b5abd13..df3bd5ff 100644 --- a/flask_app/common/util.py +++ b/flask_app/common/util.py @@ -15,77 +15,3 @@ def get_host_url(): if host_url.endswith("/"): host_url = host_url[:-1] return host_url - - -# ............................................................................. 
-def _print_sub_output(oneelt, do_print_rec): - print("* One record of Specify Network Outputs *") - for name, attelt in oneelt.items(): - try: - if name == "records": - print(" records") - if do_print_rec is False: - print(f" {name}: {len(attelt)} returned records") - else: - for rec in attelt: - print(" record") - for k, v in rec.items(): - print(" {}: {}".format(k, v)) - else: - print(" {}: {}".format(name, attelt)) - except Exception: - pass - - -# .................................... -def print_broker_output(response_dict, do_print_rec=False): - """Print a formatted string of the elements in an S2nOutput query response. - - Args: - response_dict: flask_app.broker.s2n_type.S2nOutput object - do_print_rec: True to print each record in the response. - - TODO: move to a class method - """ - print("*** Broker output ***") - for name, attelt in response_dict.items(): - try: - if name == "records": - print(f"{name}: ") - for respdict in attelt: - _print_sub_output(respdict, do_print_rec) - else: - print(f"{name}: {attelt}") - except Exception: - pass - # outelts = set(response_dict.keys()) - # missing = S2nKey.broker_response_keys().difference(outelts) - # extras = outelts.difference(S2nKey.broker_response_keys()) - # if missing: - # print(f"Missing elements: {missing}") - # if extras: - # print(f"Extra elements: {extras}") - print("") - - -# .................................... -def print_analyst_output(response_dict, do_print_rec=False): - """Print a formatted string of the elements in an S2nOutput query response. - - Args: - response_dict: flask_app.broker.s2n_type.S2nOutput object - do_print_rec: True to print each record in the response. - - TODO: move to a class method - """ - print("*** Analyst output ***") - for name, attelt in response_dict.items(): - try: - if name == "records": - print(f"{name}: ") - for respdict in attelt: - _print_sub_output(respdict, do_print_rec) - else: - print(f"{name}: {attelt}") - except Exception: - pass diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 992d969e..1befc7fa 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -129,11 +129,11 @@ def get_dataset_counts(self, dataset_key, format="CSV"): Returns: records: empty list or list of 1 record (list) """ - query_str = (f"SELECT datasetkey, occ_count, species_count FROM s3object s " - f"WHERE s.datasetkey = '{dataset_key}'") - query_str = "SELECT * FROM s3object s LIMIT 5" - print(query_str) - # Returns empty list or list of 1 record with [(occ_count, species_count)] + query_str = ( + "SELECT datasetkey, occ_count, species_count FROM s3object s " + f"WHERE s.datasetkey = '{dataset_key}'" + ) + # Returns empty list or list of 1 record records = self._query_table(self._dataset_counts_path, query_str, format=format) return records From 527cf7e646580cb6dc9e3bad5f1f8281692b0b77 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 14 Mar 2024 14:20:38 -0500 Subject: [PATCH 17/81] remove methods that occur in superclass --- flask_app/broker/base.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/flask_app/broker/base.py b/flask_app/broker/base.py index bba52669..6c47a8ad 100644 --- a/flask_app/broker/base.py +++ b/flask_app/broker/base.py @@ -10,15 +10,6 @@ from sppy.tools.provider.itis import ItisAPI -# ............................................................................. 
-@app.errorhandler(BadRequest) -def handle_bad_request(e): - return f"Bad request: {e}" - -@app.errorhandler(InternalServerError) -def handle_bad_response(e): - return f"Internal Server Error: {e}" - # ............................................................................. class _BrokerService(_SpecifyNetworkService): """Base S-to-the-N service, handles parameter names and acceptable values.""" From e1372287e9150122f0a1e89280ecf9d224591e3d Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 14 Mar 2024 15:52:28 -0500 Subject: [PATCH 18/81] ranked counts return --- flask_app/analyst/base.py | 4 +- flask_app/analyst/count.py | 4 ++ flask_app/analyst/rank.py | 97 ++++++++++++++++++++++++++++++++++++ flask_app/broker/occ.py | 11 ++-- flask_app/common/base.py | 2 +- flask_app/common/s2n_type.py | 18 ++++++- sppy/tools/provider/awss3.py | 37 +++++++++----- 7 files changed, 150 insertions(+), 23 deletions(-) create mode 100644 flask_app/analyst/rank.py diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 0c771cb6..297eeb76 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -55,7 +55,8 @@ def _show_online(cls): # ............................................... @classmethod def _standardize_params( - cls, dataset_key=None, pub_org_key=None, descending=True, limit=10): + cls, dataset_key=None, pub_org_key=None, by_species=True, descending=True, + limit=10): """Standardize query parameters to send to appropriate service. Args: @@ -77,6 +78,7 @@ def _standardize_params( user_kwargs = { "dataset_key": dataset_key, "pub_org_key": pub_org_key, + "by_species": by_species, "descending": descending, "limit": limit } diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 2eaf3165..926451d4 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -27,6 +27,10 @@ def get_counts(cls, dataset_key=None, pub_org_key=None, format="JSON"): publishingOrganization. format: output format, options "CSV" or "JSON" + Returns: + full_output (flask_app.common.s2n_type.AnalystOutput): including records + as a list of one list (CSV) or dictionary (JSON) of a record + containing dataset_key, occurrence count, and species count. """ if dataset_key is None and pub_org_key is None: return cls.get_endpoint() diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py new file mode 100644 index 00000000..57240b1d --- /dev/null +++ b/flask_app/analyst/rank.py @@ -0,0 +1,97 @@ +"""Class for the Specify Network Name API service.""" +from http import HTTPStatus +from werkzeug.exceptions import BadRequest + +from flask_app.common.s2n_type import APIService, AnalystOutput +from flask_app.analyst.base import _AnalystService + +from sppy.aws.aws_constants import PROJ_BUCKET +from sppy.tools.provider.awss3 import S3Query +from sppy.tools.s2n.utils import (combine_errinfo, get_traceback) + + +# ............................................................................. +class RankSvc(_AnalystService): + """Specify Network API service for retrieving taxonomic information.""" + SERVICE_TYPE = APIService.Rank + ORDERED_FIELDNAMES = [] + + # ............................................... + @classmethod + def rank_counts(cls, by_species, descending=True, limit=1, format="JSON"): + """Return occurrence and species counts for dataset/organization identifiers. + + Args: + by_species: boolean URL parameter indicating whether to rank datasets by + species count (True) or occurrence count (False). 
+ descending: boolean URL parameter indicating whether to rank top down (True) + or bottom up (False). + limit: integer URL parameter specifying the number of ordered records to + return. + format: output format, options "CSV" or "JSON" + + full_output (flask_app.common.s2n_type.AnalystOutput): including records + as a list of lists (CSV) or dictionaries (JSON) of records + containing dataset_key, occurrence count, and species count. + """ + if by_species is None: + return cls.get_endpoint() + + records = [] + try: + good_params, errinfo = cls._standardize_params( + by_species=by_species, descending=descending, limit=limit) + + except BadRequest as e: + errinfo = {"error": e.description} + + else: + # Query for ordered dataset counts + try: + records, errors = cls._get_ordered_counts( + good_params["by_species"], good_params["descending"], + good_params["limit"], format) + except Exception: + errors = {"error": get_traceback()} + + # Combine errors from success or failure + errinfo = combine_errinfo(errinfo, errors) + + # Assemble + full_out = AnalystOutput( + cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], + records=records, errors=errinfo) + + return full_out.response + + # ............................................... + @classmethod + def _get_ordered_counts(cls, by_species, descending, limit, format): + records = [] + s3 = S3Query(PROJ_BUCKET) + try: + records, errinfo = s3.rank_datasets( + by_species, descending=descending, limit=limit) + + except Exception: + errinfo = {"error": get_traceback()} + + return records, errinfo + +# ............................................................................. +if __name__ == "__main__": + format = "CSV" + dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" + + svc = RankSvc() + response = svc.get_endpoint() + AnalystOutput.print_output(response, do_print_rec=True) + # print(response) + by_species = True + descending = True + limit = 5 + response = svc.rank_counts( + by_species, descending=descending, limit=limit, format=format) + AnalystOutput.print_output(response, do_print_rec=True) + # print(response) + diff --git a/flask_app/broker/occ.py b/flask_app/broker/occ.py index 4f277c37..3a6fb450 100644 --- a/flask_app/broker/occ.py +++ b/flask_app/broker/occ.py @@ -143,15 +143,10 @@ def get_occurrence_records( a count and records kwargs: any additional keyword arguments are ignored - Raises: - BadRequest: on invalid query parameters. - BadRequest: on unknown exception parsing parameters. - InternalServerError: on unknown exception when executing request - Returns: - a flask_app.broker.s2n_type.BrokerOutput object with optional records as a - list of dictionaries of records corresponding to specimen occurrences in - the provider database. + full_output (flask_app.common.s2n_type.BrokerOutput): including records + as a list of dictionaries of records corresponding to specimen + occurrences in the provider database. 
""" if occid is None and gbif_dataset_key is None: return cls.get_endpoint() diff --git a/flask_app/common/base.py b/flask_app/common/base.py index 017ea59d..0d599930 100644 --- a/flask_app/common/base.py +++ b/flask_app/common/base.py @@ -155,7 +155,7 @@ def _test_numbers(cls, provided_val, param_meta): min_val = None # If restricted numeric values, check try: - max_val = param_meta["min"] + max_val = param_meta["max"] except KeyError: max_val = None diff --git a/flask_app/common/s2n_type.py b/flask_app/common/s2n_type.py index 92bcedd8..95307272 100644 --- a/flask_app/common/s2n_type.py +++ b/flask_app/common/s2n_type.py @@ -81,11 +81,12 @@ class APIEndpoint: Occurrence = "occ" Frontend = "frontend" Count = "count" + Rank = "rank" @classmethod def Resources(cls): return { - cls.Analyst: [cls.Count], + cls.Analyst: [cls.Count, cls.Rank], cls.Broker: [ cls.Badge, @@ -181,6 +182,21 @@ class APIService: "publishing organization.", S2nKey.RECORD_FORMAT: "" } + # Rankings + Rank = { + "name": APIEndpoint.Rank, + "endpoint": f"{APIEndpoint.Root}/{APIEndpoint.Rank}", + "params": { + "by_species":{ "type": True, "default": True}, + "descending": { "type": True, "default": True}, + "limit": {"type": 2, "default": 10, "min": 1, "max": 500}, + }, + "description": + "Return an ordered list of datasets with occurrence and species counts " + "ranked by occurrence or species counts for the top X (descending) " + "or bottom X (ascending) datasets", + S2nKey.RECORD_FORMAT: "" + } # Taxonomic Resolution Name = { "name": APIEndpoint.Name, diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 1befc7fa..3365ba8f 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -103,7 +103,7 @@ def _query_order_s3_table( Args: s3_path: S3 folder and filename within the bucket - sort_field: fieldname to sort records on + sort_field: fieldname (column) to sort records on descending: boolean flag indicating to sort ascending or descending limit: number of records to return, limit is 500 format: output format, options "CSV" or "JSON" @@ -111,12 +111,21 @@ def _query_order_s3_table( Returns: ordered list of records matching the query """ - pass - # recs = [] - # errors = {} - # df = self._create_dataframe_from_s3obj(s3_path) - # df.sort_values(by=sort_field, ascending=(not descending)) - # return recs, errors + recs = [] + errors = {} + df = self._create_dataframe_from_s3obj(s3_path) + # Sort rows (Axis 0/index) by values in sort_field (column) + sorted_df = df.sort_values(by=sort_field, axis=0, ascending=(not descending)) + rec_df = sorted_df.head(limit) + + for row in rec_df.itertuples(): + rec = {"datasetkey": row.datasetkey, + "species_count": row.species_count, + "occ_count": row.occ_count} + recs.append(rec) + print(row) + print(rec) + return recs, errors # ---------------------------------------------------- def get_dataset_counts(self, dataset_key, format="CSV"): @@ -153,24 +162,28 @@ def get_org_counts(self, pub_org_key): return (occ_count, species_count) # ---------------------------------------------------- - def rank_datasets_by_species(self, descending=True, limit=10): + def rank_datasets(self, by_species, descending, limit, format="CSV"): """Return the top or bottom datasets, with counts, ranked by number of species. Args: + by_species: boolean flag indicating whether to rank datasets by + species count (True) or occurrence count (False). 
descending: boolean value, if true return top X datasets in descending order, if false, return bottom X datasets in ascending order limit: number of datasets to return, no more than 300. + format: output format, options "CSV" or "JSON" Returns: records: list of limit records containing dataset_key, occ_count, species_count """ records = [] - datestr = get_current_datadate_str() - datestr = "2024_02_01" - s3_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" + if by_species: + sort_field = "species_count" + else: + sort_field = "occ_count" try: records, errors = self._query_order_s3_table( - s3_path, "species_count", descending, limit) + self._dataset_counts_path, sort_field, descending, limit) except Exception as e: errors = {"error": get_traceback()} return records, errors From 6fe691bec46ed5081b3ebb24d22dc3ce6a4bc802 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Sun, 17 Mar 2024 16:30:33 -0500 Subject: [PATCH 19/81] exposed rank service --- flask_app/analyst/routes.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/flask_app/analyst/routes.py b/flask_app/analyst/routes.py index 7afc0d09..770a82f6 100644 --- a/flask_app/analyst/routes.py +++ b/flask_app/analyst/routes.py @@ -3,6 +3,7 @@ import os from flask_app.analyst.count import CountSvc +from flask_app.analyst.rank import RankSvc from flask_app.common.constants import ( STATIC_DIR, TEMPLATE_DIR, SCHEMA_DIR, SCHEMA_ANALYST_FNAME) from flask_app.common.s2n_type import APIEndpoint @@ -82,6 +83,25 @@ def count_endpoint(): return response +# ..................................................................................... +@app.route("/api/v1/rank/") +def rank_endpoint(): + """Get the available counts. + + Returns: + response: A flask_app.analyst API response object containing the count + API response. + """ + by_species_arg = request.args.get("by_species", default=None, type=bool) + descending_arg = request.args.get("descending", default=True, type=bool) + limit_arg = request.args.get("limit", default=10, type=int) + # if coll_arg is None and org_arg is None: + if by_species_arg is None: + response = RankSvc.get_endpoint() + else: + response = RankSvc.rank_counts(by_species_arg, descending_arg, limit_arg) + return response + # # ..................................................................................... # @app.route("/api/v1/collection/", methods=["GET"]) # def collection_get(): From f3440eb77fa37caf183fdac55a3dd0eb6abcbc4d Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 11:34:59 -0500 Subject: [PATCH 20/81] JSON default format --- sppy/tools/provider/awss3.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 3365ba8f..3c57bcd6 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -128,7 +128,7 @@ def _query_order_s3_table( return recs, errors # ---------------------------------------------------- - def get_dataset_counts(self, dataset_key, format="CSV"): + def get_dataset_counts(self, dataset_key, format="JSON"): """Query the S3 resource for occurrence and species counts for this dataset. 
Args: @@ -162,7 +162,7 @@ def get_org_counts(self, pub_org_key): return (occ_count, species_count) # ---------------------------------------------------- - def rank_datasets(self, by_species, descending, limit, format="CSV"): + def rank_datasets(self, by_species, descending, limit, format="JSON"): """Return the top or bottom datasets, with counts, ranked by number of species. Args: @@ -190,7 +190,7 @@ def rank_datasets(self, by_species, descending, limit, format="CSV"): # ............................................................................. if __name__ == "__main__": - format = "CSV" + format = "JSON" dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" s3q = S3Query(PROJ_BUCKET) recs = s3q.get_dataset_counts(dataset_key, format=format) From 998cbb4c3d05cac36623a38d4946a2e4e638a08e Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 11:36:25 -0500 Subject: [PATCH 21/81] fix args for s3.rank_datasets call; remove format option --- flask_app/analyst/count.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 926451d4..a51edc11 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -18,14 +18,13 @@ class CountSvc(_AnalystService): # ............................................... @classmethod - def get_counts(cls, dataset_key=None, pub_org_key=None, format="JSON"): + def get_counts(cls, dataset_key=None, pub_org_key=None): """Return occurrence and species counts for dataset/organization identifiers. Args: dataset_key: URL parameter for unique GBIF identifier of dataset. pub_org_key: URL parameter for unique GBIF identifier of publishingOrganization. - format: output format, options "CSV" or "JSON" Returns: full_output (flask_app.common.s2n_type.AnalystOutput): including records @@ -48,7 +47,7 @@ def get_counts(cls, dataset_key=None, pub_org_key=None, format="JSON"): if good_params["dataset_key"] is not None: try: records, errors = cls._get_dataset_counts( - good_params["dataset_key"], format) + good_params["dataset_key"]) except Exception: errors = {"error": get_traceback()} else: @@ -72,11 +71,11 @@ def get_counts(cls, dataset_key=None, pub_org_key=None, format="JSON"): # ............................................... @classmethod - def get_ranked_counts(cls, descending=True, limit=10): + def get_ranked_counts(cls, by_species=True, descending=True, limit=10): allrecs = [] try: good_params, errinfo = cls._standardize_params( - cls, descending=descending, limit=limit) + cls, by_species=by_species, descending=descending, limit=limit) except BadRequest as e: errinfo = combine_errinfo(errinfo, {"error": e.description}) @@ -85,8 +84,7 @@ def get_ranked_counts(cls, descending=True, limit=10): # Do Query! try: s3 = S3Query(PROJ_BUCKET) - records, errors = s3.rank_datasets_by_species( - descending=True, limit=limit) + records, errors = s3.rank_datasets(by_species, descending, limit) except Exception: errors = {"error": get_traceback()} else: @@ -97,12 +95,11 @@ def get_ranked_counts(cls, descending=True, limit=10): # ............................................... @classmethod - def _get_dataset_counts(cls, dataset_key, format): + def _get_dataset_counts(cls, dataset_key): """Get counts for datasetKey. Args: dataset_key: unique GBIF identifier for dataset of interest. 
- format: output format, options "CSV" or "JSON" Returns: a flask_app.analyst.s2n_type.AnalystOutput object with optional records as a @@ -112,7 +109,7 @@ def _get_dataset_counts(cls, dataset_key, format): errors = {} s3 = S3Query(PROJ_BUCKET) try: - records = s3.get_dataset_counts(dataset_key, format=format) + records = s3.get_dataset_counts(dataset_key) except Exception: traceback = get_traceback() errors["error"] = [HTTPStatus.INTERNAL_SERVER_ERROR, traceback] @@ -166,14 +163,13 @@ def _get_organization_counts(cls, pub_org_key): # ............................................................................. if __name__ == "__main__": - format = "JSON" dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" svc = CountSvc() response = svc.get_endpoint() AnalystOutput.print_output(response, do_print_rec=True) # print(response) - response = svc.get_counts(dataset_key=dataset_key, pub_org_key=None, format=format) + response = svc.get_counts(dataset_key=dataset_key, pub_org_key=None) AnalystOutput.print_output(response, do_print_rec=True) # print(response) From 2875b6b8c23f0b9ad4424ec402fb8f4fd493cb9c Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 11:36:41 -0500 Subject: [PATCH 22/81] doc --- sphinx/misc/debugging.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/sphinx/misc/debugging.rst b/sphinx/misc/debugging.rst index 0c8b7844..c9d900d8 100644 --- a/sphinx/misc/debugging.rst +++ b/sphinx/misc/debugging.rst @@ -27,6 +27,7 @@ flask run * Analyst: http://127.0.0.1:5000/api/v1/count/?dataset_key=0000e36f-d0e9-46b0-aa23-cc1980f00515 + http://127.0.0.1:5000/api/v1/rank/?by_species=true * Flask will auto-update on file save. * Refresh browser after changes * The frontend endpoint cannot be tested this way, as it depends on frontend From f313b32a59103c6963ec8359953523f00f66d011 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 13:17:55 -0500 Subject: [PATCH 23/81] only encode value to str if returned as bytes --- sppy/tools/provider/gbif.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sppy/tools/provider/gbif.py b/sppy/tools/provider/gbif.py index 7854bc7f..890a527e 100644 --- a/sppy/tools/provider/gbif.py +++ b/sppy/tools/provider/gbif.py @@ -63,10 +63,11 @@ def _assemble_filter_string(self, filter_string=None): @classmethod def _get_output_val(cls, out_dict, name): try: - tmp = out_dict[name] - val = str(tmp).encode(ENCODING) + val = out_dict[name] except Exception: return None + if type(val) is bytes: + val = str(val).encode(ENCODING) return val # ............................................... 
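For reference, the bytes handling that the change above is concerned with can be seen in a standalone sketch (the helper name and the encoding value are illustrative assumptions, not part of the patch); turning a bytes payload into text is a decode, while encode goes the other direction::

    ENCODING = "utf-8"   # assumed to match the project's ENCODING constant

    def normalize_value(val):
        # Illustrative only: pass str (and other types) through unchanged,
        # and decode values that arrive from a provider response as bytes.
        if isinstance(val, bytes):
            return val.decode(ENCODING)
        return val

    assert normalize_value(b"Poa annua") == "Poa annua"
    assert normalize_value("Poa annua") == "Poa annua"
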
From f94bb2b139b4cc8e540d764a9ae466e4b2991890 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 13:19:50 -0500 Subject: [PATCH 24/81] return recs as json or csv --- sppy/tools/provider/awss3.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 3c57bcd6..9b88401c 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -69,14 +69,13 @@ def _query_table(self, s3_path, query_str, format="CSV"): for event in resp["Payload"]: if "Records" in event: recs_str = event["Records"]["Payload"].decode(ENCODING) - rec_strings = recs_str.split("\n") + rec_strings = recs_str.strip().split("\n") for rs in rec_strings: - if rs: - if format == "JSON": - rec = json.loads(rs) - else: - rec = rs.split(",") - recs.append(rec) + if format == "JSON": + rec = json.loads(rs) + else: + rec = rs.split(",") + recs.append(rec) return recs # ---------------------------------------------------- @@ -90,9 +89,7 @@ def _create_dataframe_from_s3obj(self, s3_path): df: pandas DataFrame containing the CSV data. """ # import pyarrow.parquet as pq - # import s3fs s3_uri = f"s3://{self.bucket}/{s3_path}" - # s3_fs = s3fs.S3FileSystem df = pd.read_parquet(s3_uri) return df @@ -123,8 +120,6 @@ def _query_order_s3_table( "species_count": row.species_count, "occ_count": row.occ_count} recs.append(rec) - print(row) - print(rec) return recs, errors # ---------------------------------------------------- From a50599a0ceab12bd5f11154af40e645fcf1fab30 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 14:13:59 -0500 Subject: [PATCH 25/81] remove obsolete --- flask_app/analyst/count.py | 67 -------------------------------------- 1 file changed, 67 deletions(-) diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index a51edc11..7d14e2b5 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -69,30 +69,6 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): return full_out.response - # ............................................... - @classmethod - def get_ranked_counts(cls, by_species=True, descending=True, limit=10): - allrecs = [] - try: - good_params, errinfo = cls._standardize_params( - cls, by_species=by_species, descending=descending, limit=limit) - - except BadRequest as e: - errinfo = combine_errinfo(errinfo, {"error": e.description}) - - else: - # Do Query! - try: - s3 = S3Query(PROJ_BUCKET) - records, errors = s3.rank_datasets(by_species, descending, limit) - except Exception: - errors = {"error": get_traceback()} - else: - allrecs.append(records) - # Combine errors from success or failure - errinfo = combine_errinfo(errinfo, errors) - return allrecs, errinfo - # ............................................... @classmethod def _get_dataset_counts(cls, dataset_key): @@ -116,49 +92,6 @@ def _get_dataset_counts(cls, dataset_key): return records, errors - # ............................................... - @classmethod - def _get_organization_counts(cls, pub_org_key): - """Get counts for publishingOrganizationKey. - - Args: - pub_org_key: Unique identifier for GBIF publishing organizations. - - Returns: - a flask_app.analyst.s2n_type.AnalystOutput object with optional records as a - list of records corresponding to occurrence and counts for the organization. 
- """ - records = [] - errors = {} - s3 = S3Query(PROJ_BUCKET) - try: - (occ_count, species_count) = s3.get_org_counts(pub_org_key) - except Exception: - traceback = get_traceback() - errors["error"] = [HTTPStatus.INTERNAL_SERVER_ERROR, traceback] - else: - records.append((occ_count, species_count)) - return records, errors - - - # # ............................................... - # @classmethod - # def _get_records(cls, dataset_key, pub_org_key): - # allrecs = [] - # # for response metadata - # if dataset_key is not None: - # records, errors = cls._get_dataset_counts(dataset_key) - # allrecs.append(records) - # if pub_org_key is not None: - # records, errors = cls._get_organization_counts(pub_org_key) - # allrecs.append(records) - # - # # Assemble - # full_out = AnalystOutput( - # cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], - # records=allrecs, errors={}) - # - # return full_out # ............................................................................. From b6447d38307d94dc45f2c1fad588a6b9c52c8ca4 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 14:16:07 -0500 Subject: [PATCH 26/81] change rank URL keys and types --- flask_app/analyst/base.py | 10 +++++----- flask_app/analyst/rank.py | 27 +++++++++++++-------------- flask_app/analyst/routes.py | 12 ++++++++---- flask_app/common/s2n_type.py | 14 ++++++++++---- sppy/tools/provider/awss3.py | 21 +++++++++++---------- 5 files changed, 47 insertions(+), 37 deletions(-) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 297eeb76..8408d702 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -55,15 +55,15 @@ def _show_online(cls): # ............................................... @classmethod def _standardize_params( - cls, dataset_key=None, pub_org_key=None, by_species=True, descending=True, + cls, dataset_key=None, pub_org_key=None, count_by=None, order=None, limit=10): """Standardize query parameters to send to appropriate service. Args: dataset_key: unique GBIF dataset identifier for comparisons pub_org_key: unique publishing organization identifier for comparisons - descending: boolean value indicating whether to sort records descending - (True) or ascending (False) + count_by: counts of "occurrence" or "species" + order: sort records "descending" or "ascending" limit: integer indicating how many ranked records to return, value must be less than QUERY_LIMIT. @@ -78,8 +78,8 @@ def _standardize_params( user_kwargs = { "dataset_key": dataset_key, "pub_org_key": pub_org_key, - "by_species": by_species, - "descending": descending, + "count_by": count_by, + "order": order, "limit": limit } diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index 57240b1d..f51ba086 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -18,14 +18,14 @@ class RankSvc(_AnalystService): # ............................................... @classmethod - def rank_counts(cls, by_species, descending=True, limit=1, format="JSON"): + def rank_counts(cls, count_by, order=None, limit=1, format="JSON"): """Return occurrence and species counts for dataset/organization identifiers. Args: - by_species: boolean URL parameter indicating whether to rank datasets by - species count (True) or occurrence count (False). - descending: boolean URL parameter indicating whether to rank top down (True) - or bottom up (False). + count_by: URL parameter indicating rank datasets by counts of "species" or + "occurrence" . 
+ order: URL parameter indicating whether to rank in "descending" or + "ascending" order. limit: integer URL parameter specifying the number of ordered records to return. format: output format, options "CSV" or "JSON" @@ -34,13 +34,13 @@ def rank_counts(cls, by_species, descending=True, limit=1, format="JSON"): as a list of lists (CSV) or dictionaries (JSON) of records containing dataset_key, occurrence count, and species count. """ - if by_species is None: + if count_by is None: return cls.get_endpoint() records = [] try: good_params, errinfo = cls._standardize_params( - by_species=by_species, descending=descending, limit=limit) + count_by=count_by, order=order, limit=limit) except BadRequest as e: errinfo = {"error": e.description} @@ -49,7 +49,7 @@ def rank_counts(cls, by_species, descending=True, limit=1, format="JSON"): # Query for ordered dataset counts try: records, errors = cls._get_ordered_counts( - good_params["by_species"], good_params["descending"], + good_params["count_by"], good_params["order"], good_params["limit"], format) except Exception: errors = {"error": get_traceback()} @@ -66,12 +66,11 @@ def rank_counts(cls, by_species, descending=True, limit=1, format="JSON"): # ............................................... @classmethod - def _get_ordered_counts(cls, by_species, descending, limit, format): + def _get_ordered_counts(cls, count_by, order, limit, format): records = [] s3 = S3Query(PROJ_BUCKET) try: - records, errinfo = s3.rank_datasets( - by_species, descending=descending, limit=limit) + records, errinfo = s3.rank_datasets(count_by, order, limit) except Exception: errinfo = {"error": get_traceback()} @@ -87,11 +86,11 @@ def _get_ordered_counts(cls, by_species, descending, limit, format): response = svc.get_endpoint() AnalystOutput.print_output(response, do_print_rec=True) # print(response) - by_species = True - descending = True + count_by = "species" + order = "ascending" limit = 5 response = svc.rank_counts( - by_species, descending=descending, limit=limit, format=format) + count_by, order=order, limit=limit, format=format) AnalystOutput.print_output(response, do_print_rec=True) # print(response) diff --git a/flask_app/analyst/routes.py b/flask_app/analyst/routes.py index 770a82f6..d314c372 100644 --- a/flask_app/analyst/routes.py +++ b/flask_app/analyst/routes.py @@ -92,14 +92,18 @@ def rank_endpoint(): response: A flask_app.analyst API response object containing the count API response. """ - by_species_arg = request.args.get("by_species", default=None, type=bool) - descending_arg = request.args.get("descending", default=True, type=bool) + count_by_arg = request.args.get("count_by", default=None, type=str) + order_arg = request.args.get("order", default=None, type=str) limit_arg = request.args.get("limit", default=10, type=int) + print( + f"*** count_by_arg={count_by_arg}, order_arg={order_arg}, " + f"limit_arg={limit_arg} ***") # if coll_arg is None and org_arg is None: - if by_species_arg is None: + if count_by_arg is None: response = RankSvc.get_endpoint() else: - response = RankSvc.rank_counts(by_species_arg, descending_arg, limit_arg) + response = RankSvc.rank_counts( + count_by_arg, order=order_arg, limit=limit_arg) return response # # ..................................................................................... 
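As a usage illustration for the renamed keys (values are examples only; host and port assume the local flask run setup described in sphinx/misc/debugging.rst)::

    curl "http://127.0.0.1:5000/api/v1/rank/?count_by=species&order=descending&limit=5"
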
diff --git a/flask_app/common/s2n_type.py b/flask_app/common/s2n_type.py index 95307272..b11efa56 100644 --- a/flask_app/common/s2n_type.py +++ b/flask_app/common/s2n_type.py @@ -174,8 +174,6 @@ class APIService: "description": "GBIF Publishing Organization Key", "default": None }, - "descending": { "type": True, "default": True}, - "limit": {"type": 2, "default": 10, "min": 1, "max": 500}, }, "description": "Return occurrence and species counts for the given dataset or " @@ -187,8 +185,16 @@ class APIService: "name": APIEndpoint.Rank, "endpoint": f"{APIEndpoint.Root}/{APIEndpoint.Rank}", "params": { - "by_species":{ "type": True, "default": True}, - "descending": { "type": True, "default": True}, + "count_by": { + "type": "", + "options": ["occurrence", "species"], + "default": None + }, + "order": { + "type": "", + "options": ["ascending", "descending"], + "default": None + }, "limit": {"type": 2, "default": 10, "min": 1, "max": 500}, }, "description": diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 9b88401c..da511b1a 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -95,13 +95,13 @@ def _create_dataframe_from_s3obj(self, s3_path): # ---------------------------------------------------- def _query_order_s3_table( - self, s3_path, sort_field, descending, limit, format="CSV"): + self, s3_path, sort_field, order, limit, format="CSV"): """Query the S3 resource defined for this class. Args: s3_path: S3 folder and filename within the bucket sort_field: fieldname (column) to sort records on - descending: boolean flag indicating to sort ascending or descending + order: boolean flag indicating to sort ascending or descending limit: number of records to return, limit is 500 format: output format, options "CSV" or "JSON" @@ -112,7 +112,8 @@ def _query_order_s3_table( errors = {} df = self._create_dataframe_from_s3obj(s3_path) # Sort rows (Axis 0/index) by values in sort_field (column) - sorted_df = df.sort_values(by=sort_field, axis=0, ascending=(not descending)) + sorted_df = df.sort_values( + by=sort_field, axis=0, ascending=(order == "ascending")) rec_df = sorted_df.head(limit) for row in rec_df.itertuples(): @@ -157,14 +158,14 @@ def get_org_counts(self, pub_org_key): return (occ_count, species_count) # ---------------------------------------------------- - def rank_datasets(self, by_species, descending, limit, format="JSON"): + def rank_datasets(self, count_by, order, limit, format="JSON"): """Return the top or bottom datasets, with counts, ranked by number of species. Args: - by_species: boolean flag indicating whether to rank datasets by - species count (True) or occurrence count (False). - descending: boolean value, if true return top X datasets in descending - order, if false, return bottom X datasets in ascending order + count_by: string indicating rank datasets by counts of "species" or + "occurrence" . + order: string indicating whether to rank in "descending" or + "ascending" order. limit: number of datasets to return, no more than 300. 
format: output format, options "CSV" or "JSON" @@ -172,13 +173,13 @@ def rank_datasets(self, by_species, descending, limit, format="JSON"): records: list of limit records containing dataset_key, occ_count, species_count """ records = [] - if by_species: + if count_by == "species": sort_field = "species_count" else: sort_field = "occ_count" try: records, errors = self._query_order_s3_table( - self._dataset_counts_path, sort_field, descending, limit) + self._dataset_counts_path, sort_field, order, limit) except Exception as e: errors = {"error": get_traceback()} return records, errors From 8342e2eff03669266e85df5f924a24067d826723 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 14:20:50 -0500 Subject: [PATCH 27/81] debug examples, rm old code --- flask_app/analyst/routes.py | 2 -- sphinx/misc/debugging.rst | 6 ++++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/flask_app/analyst/routes.py b/flask_app/analyst/routes.py index d314c372..9bd338a7 100644 --- a/flask_app/analyst/routes.py +++ b/flask_app/analyst/routes.py @@ -74,8 +74,6 @@ def count_endpoint(): API response. """ ds_arg = request.args.get("dataset_key", default=None, type=str) - # org_arg = request.args.get("organization_id", default=None, type=str) - # if coll_arg is None and org_arg is None: if ds_arg is None: response = CountSvc.get_endpoint() else: diff --git a/sphinx/misc/debugging.rst b/sphinx/misc/debugging.rst index c9d900d8..24659dd4 100644 --- a/sphinx/misc/debugging.rst +++ b/sphinx/misc/debugging.rst @@ -22,12 +22,14 @@ flask run http://127.0.0.1:5000 in browser, * Broker - i.e. http://127.0.0.1:5000/api/v1/name/Acer%20opalus%20Miller?is_accepted=True&gbif_count=False& - or http://127.0.0.1:5000/api/v1/occ/?occid=db8cc0df-1ed3-11e3-bfac-90b11c41863e&provider=gbif + * http://127.0.0.1:5000/api/v1/name/Acer%20opalus%20Miller?is_accepted=True&gbif_count=False& + * http://127.0.0.1:5000/api/v1/occ/?occid=db8cc0df-1ed3-11e3-bfac-90b11c41863e&provider=gbif + * http://127.0.0.1:5000/api/v1/badge/?provider=mopho * Analyst: http://127.0.0.1:5000/api/v1/count/?dataset_key=0000e36f-d0e9-46b0-aa23-cc1980f00515 http://127.0.0.1:5000/api/v1/rank/?by_species=true + * Flask will auto-update on file save. * Refresh browser after changes * The frontend endpoint cannot be tested this way, as it depends on frontend From b6cc4234d67e44f2cdf9d19176f3ff384838a307 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 14:22:42 -0500 Subject: [PATCH 28/81] rm format option from API --- flask_app/analyst/rank.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index f51ba086..6bb57825 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -18,7 +18,7 @@ class RankSvc(_AnalystService): # ............................................... @classmethod - def rank_counts(cls, count_by, order=None, limit=1, format="JSON"): + def rank_counts(cls, count_by, order=None, limit=1): """Return occurrence and species counts for dataset/organization identifiers. Args: @@ -28,7 +28,6 @@ def rank_counts(cls, count_by, order=None, limit=1, format="JSON"): "ascending" order. limit: integer URL parameter specifying the number of ordered records to return. 
- format: output format, options "CSV" or "JSON" full_output (flask_app.common.s2n_type.AnalystOutput): including records as a list of lists (CSV) or dictionaries (JSON) of records @@ -50,7 +49,7 @@ def rank_counts(cls, count_by, order=None, limit=1, format="JSON"): try: records, errors = cls._get_ordered_counts( good_params["count_by"], good_params["order"], - good_params["limit"], format) + good_params["limit"]) except Exception: errors = {"error": get_traceback()} @@ -66,7 +65,7 @@ def rank_counts(cls, count_by, order=None, limit=1, format="JSON"): # ............................................... @classmethod - def _get_ordered_counts(cls, count_by, order, limit, format): + def _get_ordered_counts(cls, count_by, order, limit): records = [] s3 = S3Query(PROJ_BUCKET) try: @@ -79,7 +78,6 @@ def _get_ordered_counts(cls, count_by, order, limit, format): # ............................................................................. if __name__ == "__main__": - format = "CSV" dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" svc = RankSvc() @@ -90,7 +88,7 @@ def _get_ordered_counts(cls, count_by, order, limit, format): order = "ascending" limit = 5 response = svc.rank_counts( - count_by, order=order, limit=limit, format=format) + count_by, order=order, limit=limit) AnalystOutput.print_output(response, do_print_rec=True) # print(response) From 0ebfb237fad98a3c531e43e912cbd19a7549d0af Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 15:44:49 -0500 Subject: [PATCH 29/81] add dataset_name to returned records --- flask_app/analyst/base.py | 15 +++++++++++++-- flask_app/analyst/count.py | 3 +++ flask_app/analyst/rank.py | 4 ++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 8408d702..b0b3a685 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -2,10 +2,9 @@ from werkzeug.exceptions import BadRequest from flask_app.common.base import _SpecifyNetworkService -from sppy.tools.s2n.utils import get_traceback from flask_app.common.s2n_type import AnalystOutput, APIService -# app = Flask(__name__) +from sppy.tools.s2n.utils import get_traceback # ............................................................................. @@ -98,6 +97,18 @@ def _standardize_params( return usr_params, errinfo + # ............................................... + @classmethod + def _add_dataset_names_to_records( + cls, records, dataset_key_field="datasetkey", + dataset_name_field="dataset_name"): + # if import is at top level, causes recursion error in awss3.count_datasets + from sppy.tools.provider.gbif import GbifAPI + gbif = GbifAPI(service="dataset") + for rec in records: + dataset_name, _ = gbif.get_dataset(rec[dataset_key_field]) + rec[dataset_name_field] = dataset_name + # ............................................................................. 
if __name__ == "__main__": diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 7d14e2b5..12d24a7e 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -51,6 +51,9 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): except Exception: errors = {"error": get_traceback()} else: + cls._add_dataset_names_to_records( + records, dataset_key_field="datasetkey", + dataset_name_field="dataset_name") if records: allrecs.append(records) # Combine errors from success or failure diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index 6bb57825..c6dc1021 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -74,6 +74,10 @@ def _get_ordered_counts(cls, count_by, order, limit): except Exception: errinfo = {"error": get_traceback()} + cls._add_dataset_names_to_records( + records, dataset_key_field="datasetkey", + dataset_name_field="dataset_name") + return records, errinfo # ............................................................................. From 849ff4cbf2eaad438cc7558e058483908aabaf4b Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 15:45:13 -0500 Subject: [PATCH 30/81] remove obsolete imports --- sppy/tools/provider/gbif.py | 2 +- sppy/tools/s2n/utils.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/sppy/tools/provider/gbif.py b/sppy/tools/provider/gbif.py index 890a527e..a2c521dd 100644 --- a/sppy/tools/provider/gbif.py +++ b/sppy/tools/provider/gbif.py @@ -12,7 +12,7 @@ from sppy.tools.util.logtools import logit from sppy.tools.provider.api import APIQuery -from sppy.tools.s2n.utils import get_traceback, add_errinfo +from sppy.tools.s2n.utils import add_errinfo # ............................................................................. diff --git a/sppy/tools/s2n/utils.py b/sppy/tools/s2n/utils.py index 3b6895ea..bc3f3241 100644 --- a/sppy/tools/s2n/utils.py +++ b/sppy/tools/s2n/utils.py @@ -3,9 +3,6 @@ import traceback from uuid import UUID -# from flask_app.broker.constants import ICON_API, ServiceProvider -# from flask_app.common.s2n_type import APIEndpoint - # ...................................................... 
def is_valid_uuid(uuid_to_test, version=4): From 13e33b8e921ce3fe35fb74101fd084779d3c3c0d Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 15:47:22 -0500 Subject: [PATCH 31/81] add todo doc --- flask_app/analyst/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index b0b3a685..3c29035a 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -102,6 +102,7 @@ def _standardize_params( def _add_dataset_names_to_records( cls, records, dataset_key_field="datasetkey", dataset_name_field="dataset_name"): + # TODO: change this to a call to an S3 table with all dataset keys/names # if import is at top level, causes recursion error in awss3.count_datasets from sppy.tools.provider.gbif import GbifAPI gbif = GbifAPI(service="dataset") From 73e6a50e5ec26a59da9b5e9eb47f6cb2dbb5f3f9 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 17:02:37 -0500 Subject: [PATCH 32/81] documentation --- .env.broker.conf | 2 +- sphinx/about/install_run_notes.rst | 340 ++++++++++++++--------------- sphinx/aws/aws_workflow.rst | 8 +- sphinx/misc/debugging.rst | 42 ++++ 4 files changed, 205 insertions(+), 187 deletions(-) diff --git a/.env.broker.conf b/.env.broker.conf index 95dab4b7..7cfc46b7 100644 --- a/.env.broker.conf +++ b/.env.broker.conf @@ -1,5 +1,5 @@ SECRET_KEY=dev WORKING_DIRECTORY=/scratch-path -FQDN=analyst.localhost +FQDN=broker.localhost PYTHONPATH=/home/specify/flask_app diff --git a/sphinx/about/install_run_notes.rst b/sphinx/about/install_run_notes.rst index e5d5bb00..5b64decc 100644 --- a/sphinx/about/install_run_notes.rst +++ b/sphinx/about/install_run_notes.rst @@ -6,19 +6,19 @@ Contains * Specify Network API services - * Tools/classes for broker, including + * Tools/classes for broker, including - * Flask application for individual API endpoints and frontend - * classes for Provider API connectors - * standardized API service output (s2n) + * Flask application for individual API endpoints and frontend + * classes for Provider API connectors + * standardized API service output (s2n) - * Tools/classes for analyst, including + * Tools/classes for analyst, including - * AWS scripts and - * Classes for use on EC2 or other AWS resources - * geotools for geospatial intersection/annotations - * aggregation, summary tools for writing tabular summaries - * + * AWS scripts and + * Classes for use on EC2 or other AWS resources + + * geotools for geospatial intersection/annotations + * aggregation, summary tools for writing tabular summaries Deployment =================================== @@ -32,20 +32,18 @@ To run the containers, generate `fullchain.pem` and `privkey.pem` (certificate and the private key) using Let's Encrypt and link these files in `./sp_network/config/`. 
While in development, generate self-signed certificates then link them in -~/git/sp_network/config/ directory for this project: +~/git/sp_network/config/ directory for this project:: -```zsh -$ mkdir ~/certificates + $ mkdir ~/certificates -openssl req \ + openssl req \ -x509 -sha256 -nodes -newkey rsa:2048 -days 365 \ -keyout ~/certificates/privkey.pem \ -out ~/certificates/fullchain.pem -$ cd ~/git/sp_network/config -$ ln -s ~/certificates/privkey.pem -$ ln -s ~/certificates/fullchain.pem -``` + $ cd ~/git/sp_network/config + $ ln -s ~/certificates/privkey.pem + $ ln -s ~/certificates/fullchain.pem To run either the production or the development containers with HTTPS support, generate `fullchain.pem` and `privkey.pem` (certificate and the private @@ -61,54 +59,55 @@ TLS/SSL using Certificate Authority (CA) * Stop apache service * request a certificate for the domain -```commandline -ubuntu@ip-172-31-86-62:~$ sudo systemctl stop apache2 -ubuntu@ip-172-31-86-62:~$ sudo certbot certonly -v -Saving debug log to /var/log/letsencrypt/letsencrypt.log - -How would you like to authenticate with the ACME CA? -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -1: Spin up a temporary webserver (standalone) -2: Place files in webroot directory (webroot) -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Select the appropriate number [1-2] then [enter] (press 'c' to cancel): 1 -Plugins selected: Authenticator standalone, Installer None -Please enter the domain name(s) you would like on your certificate (comma and/or -space separated) (Enter 'c' to cancel): broker-dev.spcoco.org analyst-dev.spcoco.org -Requesting a certificate for broker-dev.spcoco.org and analyst-dev.spcoco.org -Performing the following challenges: -http-01 challenge for broker-dev.spcoco.org -Waiting for verification... -Cleaning up challenges - -Successfully received certificate. -Certificate is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/fullchain.pem -Key is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/privkey.pem -This certificate expires on 2023-10-18. -These files will be updated when the certificate renews. -Certbot has set up a scheduled task to automatically renew this certificate in the background. - -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -If you like Certbot, please consider supporting our work by: - * Donating to ISRG / Let's Encrypt: https://letsencrypt.org/donate - * Donating to EFF: https://eff.org/donate-le -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -ubuntu@ip-172-31-86-62:~$ -``` +:: + + ubuntu@ip-172-31-86-62:~$ sudo systemctl stop apache2 + ubuntu@ip-172-31-86-62:~$ sudo certbot certonly -v + Saving debug log to /var/log/letsencrypt/letsencrypt.log + + How would you like to authenticate with the ACME CA? 
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + 1: Spin up a temporary webserver (standalone) + 2: Place files in webroot directory (webroot) + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + Select the appropriate number [1-2] then [enter] (press 'c' to cancel): 1 + Plugins selected: Authenticator standalone, Installer None + Please enter the domain name(s) you would like on your certificate (comma and/or + space separated) (Enter 'c' to cancel): broker-dev.spcoco.org analyst-dev.spcoco.org + Requesting a certificate for broker-dev.spcoco.org and analyst-dev.spcoco.org + Performing the following challenges: + http-01 challenge for broker-dev.spcoco.org + Waiting for verification... + Cleaning up challenges + + Successfully received certificate. + Certificate is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/fullchain.pem + Key is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/privkey.pem + This certificate expires on 2023-10-18. + These files will be updated when the certificate renews. + Certbot has set up a scheduled task to automatically renew this certificate in the background. + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + If you like Certbot, please consider supporting our work by: + * Donating to ISRG / Let's Encrypt: https://letsencrypt.org/donate + * Donating to EFF: https://eff.org/donate-le + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + ubuntu@ip-172-31-86-62:~$ + * as superuser, link the newly created fullchain.pem and privkey.pem files from the letsencrypt live to the project/config directory * change the owner so that they can be used in Docker containers -```commandline -$ sudo su - -# cp -p /etc/letsencrypt/live/dev.spcoco.org/* /home/ubuntu/certificates/ -# chown ubuntu:ubuntu /home/ubuntu/certificates/* -# exit -$ cd ~/git/sp_network/config -$ ln -s ~/certificates/fullchain.pem -$ ln -s ~/certificates/privkey.pem -``` +:: + + $ sudo su - + # cp -p /etc/letsencrypt/live/dev.spcoco.org/* /home/ubuntu/certificates/ + # chown ubuntu:ubuntu /home/ubuntu/certificates/* + # exit + $ cd ~/git/sp_network/config + $ ln -s ~/certificates/fullchain.pem + $ ln -s ~/certificates/privkey.pem Renew Certbot SSL certificates ......................................... @@ -122,19 +121,19 @@ Amazon EC2 containers do not need apache running, certbot runs its own temp web Test with https://broker.spcoco.org/api/v1/frontend/?occid=01493b05-4310-4f28-9d81-ad20860311f3 -```commandline -$ sudo certbot certificates -$ sudo docker compose stop -$ sudo su - -# certbot renew -# cp -p /etc/letsencrypt/live/dev.spcoco.org/* /home/ubuntu/certificates/ -# chown ubuntu:ubuntu /home/ubuntu/certificates/* -# exit -$ ls -lahtr ~/git/sp_network/config - -$ sudo docker system prune --all --volumes -$ sudo docker compose up -d -``` +:: + + $ sudo certbot certificates + $ sudo docker compose stop + $ sudo su - + # certbot renew + # cp -p /etc/letsencrypt/live/dev.spcoco.org/* /home/ubuntu/certificates/ + # chown ubuntu:ubuntu /home/ubuntu/certificates/* + # exit + $ ls -lahtr ~/git/sp_network/config + + $ sudo docker system prune --all --volumes + $ sudo docker compose up -d TODO: SSL through Amazon ......................................... 
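One common route on Ubuntu is sketched below; it is only an example, and the official Docker Engine documentation remains the supported procedure::

    $ curl -fsSL https://get.docker.com -o get-docker.sh
    $ sudo sh get-docker.sh
    # allow the login user to run docker without sudo
    $ sudo usermod -aG docker $USER
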
@@ -151,12 +150,11 @@ Install Install dependencies --------------------------------------- -Certbot: +Certbot:: + + $ sudo apt update + $ sudo apt install certbot -```commandline -$ sudo apt update -$ sudo apt install certbot -``` Install Docker --------------------------------------- @@ -170,34 +168,33 @@ Install repo from Github * generate an SSH key for communicating with Github * Add SSH key to agent on local machine -```commandline -$ ssh-keygen -t rsa -b 4096 -C "aimee.stewart@ku.edu" -$ eval "$(ssh-agent -s)" -$ ssh-add ~/.ssh/id_rsa -$ cat .ssh/id_rsa.pub -``` +:: + + $ ssh-keygen -t rsa -b 4096 -C "aimee.stewart@ku.edu" + $ eval "$(ssh-agent -s)" + $ ssh-add ~/.ssh/id_rsa + $ cat .ssh/id_rsa.pub + * Add the SSH to Github by printing to console, copying, adding in Github profile * clone the repository -```commandline -$ cat .ssh/id_rsa.pub -$ # -$ cd ~/git -$ git clone git@github.com:specifysystems/sp_network.git -$ git checkout -``` +:: + $ cat .ssh/id_rsa.pub + $ # + $ cd ~/git + $ git clone git@github.com:specifysystems/sp_network.git + $ git checkout Install certificates into config directory ------------------------------------------------------- * Link the certificates in the repo config directory -```commandline -$ cd ~/git/sp_network -$ cd config -$ ln -s ~/certificates/fullchain1.pem -$ ln -s ~/certificates/privkey1.pem -``` +:: + $ cd ~/git/sp_network + $ cd config + $ ln -s ~/certificates/fullchain1.pem + $ ln -s ~/certificates/privkey1.pem Testing --------------------------------------- @@ -206,7 +203,9 @@ On a development server, check the following URL endpoints: * Index page: https://localhost * Broker: + * https://localhost/api/v1/ + * https://localhost/api/v1/badge/ * https://localhost/api/v1/name/ * https://localhost/api/v1/occ/ @@ -224,12 +223,11 @@ Environment variables set in the Docker containers from the .env.broker.conf and .env.broker.conf files are necessary to inform the host machine/container of its FQDN. **Temp solution:** Export these variables to the local environment in the python -virtual environment activation script (bin/activate) script. +virtual environment activation script (bin/activate) script:: + + export SECRET_KEY="dev" + export WORKING_DIRECTORY="scratch-path" -```zsh -export SECRET_KEY="dev" -export WORKING_DIRECTORY="scratch-path" -``` **Specify Network** homepage is now available at https://localhost/ and http://localhost. @@ -249,20 +247,17 @@ Troubleshooting For webserver errors, check logs of nginx container:: -```commandline -$ sudo docker logs --tail 1000 sp_network-nginx-1 -$ sudo docker logs --tail 1000 sp_network-broker-1 -``` + $ sudo docker logs --tail 1000 sp_network-nginx-1 + $ sudo docker logs --tail 1000 sp_network-broker-1 + Error: "... cannot import name 'url_quote' from 'werkzeug.urls'" in broker container Fix: Add Werkzeug==2.2.2 to requirements.txt to ensure it does not use 3.0+ -Then stop/rebuild/start: +Then stop/rebuild/start:: -```commandline -$ sudo docker compose stop -$ sudo docker system prune --all --volumes -$ sudo docker compose up -d -``` + $ sudo docker compose stop + $ sudo docker system prune --all --volumes + $ sudo docker compose up -d Docker manipulation ================================= @@ -272,6 +267,7 @@ Edit the docker environment files * Add the container domain name to the files .env.broker.conf and .env.analyst.conf * Change the FQDN value to the fully qualified domain name of the server. 
+ * If this is a local testing deployment, it will be "localhost" * For a development or production server it will be the FQDN with correct subdomain for each container, i.e FQDN=broker.spcoco.org in .env.broker.conf and @@ -280,9 +276,9 @@ Edit the docker environment files Run the containers (production) ------------------------------------------- -```zsh -sudo docker compose -f docker-compose.yml up -d -``` +Start the containers with the Docker composition file:: + + sudo docker compose -f docker-compose.yml up -d Specify Network is now available at [https://localhost/](https://localhost:443) @@ -292,11 +288,9 @@ Run the containers (development) Note that the development compose file, docker-compose.development.yml, is referenced first on the command line. It has elements that override those defined in the -general compose file, docker-compose.yml. +general compose file, docker-compose.yml:: -```zsh -sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up -``` + sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up Flask has hot-reload enabled. @@ -305,32 +299,25 @@ Rebuild/restart ------------------------------------------- To delete all containers, images, networks and volumes, stop any running -containers: +containers:: + + sudo docker compose stop -```zsh -sudo docker compose stop -``` -And run this command (which ignores running container): +And run this command (which ignores running container):: -```zsh -sudo docker system prune --all --volumes -``` + sudo docker system prune --all --volumes -Then rebuild/restart: +Then rebuild/restart:: -```zsh -sudo docker compose up -d -``` + sudo docker compose up -d Examine container ------------------------------------------- -To examine containers at a shell prompt: +To examine containers at a shell prompt:: -```zsh -sudo docker exec -it sp_network-nginx-1 /bin/sh -``` + sudo docker exec -it sp_network-nginx-1 /bin/sh Error port in use: "Error starting userland proxy: listen tcp4 0.0.0.0:443: bind: address already in use" @@ -338,24 +325,22 @@ Error port in use: See what else is using the port. In my case apache was started on reboot. Bring down all docker containers, shut down httpd, bring up docker. -```zsh -lsof -i -P -n | grep 443 -sudo docker compose down -sudo systemctl stop httpd -sudo docker compose up -d -``` +:: + lsof -i -P -n | grep 443 + sudo docker compose down + sudo systemctl stop httpd + sudo docker compose up -d + Dev Environment ========================== -* Create a virtual environment and install python libs there +* Create a virtual environment and install python libs there:: -```commandline -$ cd ~/git/sp_network -$ python3 -m venv venv -$ . venv/bin/activate -$ pip install -r requirements.txt -``` + $ cd ~/git/sp_network + $ python3 -m venv venv + $ . venv/bin/activate + $ pip install -r requirements.txt Configure Debugger in local IDE @@ -370,19 +355,17 @@ Debug To run flask in debug mode, first set up Flask environment, then start the flask application (in this case, main in flask_app.broker.routes.py). Only one resource (aka broker or analyst) at a time can be tested in this way. -Reset the FLASK_APP variable to test an alternate resource. 
- -** the broker frontend can NOT be tested this way, as it depends on a docker volume +Reset the FLASK_APP variable to test an alternate resource:: -```zsh -export FLASK_ENV=development -export FLASK_APP=flask_app.broker.routes:app -# or -# export FLASK_APP=flask_app.analyst.routes:app -flask run -``` + export FLASK_ENV=development + export FLASK_APP=flask_app.broker.routes:app + # or + # export FLASK_APP=flask_app.analyst.routes:app + flask run * `broker` container is running `debugpy` on localhost, port `5000` +* ** the broker frontend can NOT be tested this way, as it depends on a docker volume + * Test with http, no https!! http://localhost:5000/api/v1/name?namestr=Notemigonus%20crysoleucas%20(Mitchill,%201814) @@ -394,40 +377,33 @@ Troubleshooting pip errors with SSL ------------------------------------------- - * add trusted-host option at command line +* add trusted-host option at command line:: + + pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org ~/git/lmpy -```commandline -pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org ~/git/lmpy -``` - * for processes that call pip, create a pip configuration file , then export as - PIP_CONFIG_FILE environment variable in .bashrc +* for processes that call pip, create a pip configuration file , then export as + PIP_CONFIG_FILE environment variable in .bashrc:: -```commandline -# ~/pip.conf -[install] -trusted-host = pypi.python.org - pypi.org - files.pythonhosted.org + # ~/pip.conf + [install] + trusted-host = pypi.python.org + pypi.org + files.pythonhosted.org -# ~/.bashrc -export PIP_CONFIG_FILE ~/pip.conf -``` + # ~/.bashrc + export PIP_CONFIG_FILE ~/pip.conf pre-commit errors with self-signed certificate --------------------------------------------------------- - * turn off verification (but this leaves you open to man-in-the-middle attacks) +* turn off verification (but this leaves you open to man-in-the-middle attacks):: -```commandline -git config --global http.sslVerify false -``` + git config --global http.sslVerify false - * turn on again with + * turn on again with:: -```commandline -git config --global http.sslVerify true + git config --global http.sslVerify true -``` pre-commit build errors -------------------------------------- diff --git a/sphinx/aws/aws_workflow.rst b/sphinx/aws/aws_workflow.rst index 6ed902f9..33100d8a 100644 --- a/sphinx/aws/aws_workflow.rst +++ b/sphinx/aws/aws_workflow.rst @@ -4,7 +4,7 @@ AWS Workflow Reference =========================================================== - * Stored procedures in rs_stored_procedures.sql +* Stored procedures in rs_stored_procedures.sql Steps @@ -14,9 +14,9 @@ Steps *********************************************************** * Redshift: Subset GBIF data from Amazon Registry of Open Data (AWS ODR) for processing - * First run rs_create_stored_procedures.sql to create procedures for the subset script. - * Next run rs_subset_gbif.sql to subset the data - * + + * First run rs_create_stored_procedures.sql to create procedures for the subset script. 
+ * Next run rs_subset_gbif.sql to subset the data 1.5 TODO *********************************************************** diff --git a/sphinx/misc/debugging.rst b/sphinx/misc/debugging.rst index 24659dd4..92c20cf6 100644 --- a/sphinx/misc/debugging.rst +++ b/sphinx/misc/debugging.rst @@ -16,6 +16,7 @@ Local debugging of flask app ```zsh export FLASK_ENV=development export FLASK_APP=flask_app.broker.routes +export FLASK_APP=flask_app.analyst.routes flask run ``` * With either Analyst or Broker, the development port will be 5000. Connect to @@ -35,3 +36,44 @@ flask run * The frontend endpoint cannot be tested this way, as it depends on frontend **webpack-output** and **static-files** to be mounted as docker volumes. + +Local debugging of Docker +============================================= + +More info in about/install_run_notes + + +Run Docker containers (development) +------------------------------------------- + +Note that the development compose file, docker-compose.development.yml, is referenced +first on the command line. It has elements that override those defined in the +general compose file, docker-compose.yml:: + + sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up + +Flask has hot-reload enabled. + +Rebuild/restart +------------------------------------------- + +To delete all containers, images, networks and volumes, stop any running +containers:: + + sudo docker compose stop + + +And run this command (which ignores running container):: + + sudo docker system prune --all --volumes + +Then rebuild/restart:: + + sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up + +Examine container +------------------------------------------- + +To examine containers at a shell prompt:: + + sudo docker exec -it sp_network-nginx-1 /bin/sh From 1b7087344eeb9ad33311eaeb3668c4d5f114284c Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 19 Mar 2024 16:01:31 -0500 Subject: [PATCH 33/81] debug notes --- sphinx/misc/docker.rst | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 sphinx/misc/docker.rst diff --git a/sphinx/misc/docker.rst b/sphinx/misc/docker.rst new file mode 100644 index 00000000..e69de29b From b10071aca41b8de510ec65ed1dad10e6be05311d Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 19 Mar 2024 16:20:56 -0500 Subject: [PATCH 34/81] debug notes --- sphinx/about/install_run_notes.rst | 6 ++ sphinx/misc/docker.rst | 107 +++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/sphinx/about/install_run_notes.rst b/sphinx/about/install_run_notes.rst index 5b64decc..be249905 100644 --- a/sphinx/about/install_run_notes.rst +++ b/sphinx/about/install_run_notes.rst @@ -196,6 +196,12 @@ Install certificates into config directory $ ln -s ~/certificates/fullchain1.pem $ ln -s ~/certificates/privkey1.pem +Direct Docker to correct FQDN +------------------------------------ + +Edit FQDN value in env.conf (or .env.analyst.conf and .env.broker.conf) to actual FQDN + + Testing --------------------------------------- On a development server, check the following URL endpoints: diff --git a/sphinx/misc/docker.rst b/sphinx/misc/docker.rst index e69de29b..0f43b3f6 100644 --- a/sphinx/misc/docker.rst +++ b/sphinx/misc/docker.rst @@ -0,0 +1,107 @@ +Docker Troubleshooting +############################## + +Out of space error +************************ + +Problem +------------------ + +Running `certbot certificates` failed because the EC2 instance running Docker +containers for Specify Network development shows 
disk full:: + + root@ip-172-31-86-62:~# df -h + Filesystem Size Used Avail Use% Mounted on + /dev/root 7.6G 7.6G 0 100% / + tmpfs 483M 0 483M 0% /dev/shm + tmpfs 194M 21M 173M 11% /run + tmpfs 5.0M 0 5.0M 0% /run/lock + /dev/xvda15 105M 6.1M 99M 6% /boot/efi + overlay 7.6G 7.6G 0 100% /var/lib/docker/overlay2/82d82cc5eb13260207b94443934c7318af651ea96a5fcd88c579f23224ba099d/merged + overlay 7.6G 7.6G 0 100% /var/lib/docker/overlay2/cb0d78289131b3925e21d7eff2d03c79fe432eeba2d69a33c6134db40dc3caf3/merged + overlay 7.6G 7.6G 0 100% /var/lib/docker/overlay2/3bd6d12b36e746f9c74227b6ac9d928a3179d8b604a9dea4fd88625eab84be1f/merged + tmpfs 97M 4.0K 97M 1% /run/user/1000 + + +Research +------------------ + +The disk is small, but the culprit is /var/lib/docker/overlay2 + +Some strategies at: +https://forums.docker.com/t/some-way-to-clean-up-identify-contents-of-var-lib-docker-overlay/30604/19 + +Actual disk usage is correctly reported here (unlike some of the use cases above), so +for now, clean it all out by stopping, pruning, removing images, killing the overlay2 +directory, recreating the overlay2 directory, changing permissions, then rebuilding +and restarting the docker image:: + + $ sudo docker compose stop + $ sudo docker system prune --all --volumes + $ sudo docker image ls + REPOSITORY TAG IMAGE ID CREATED SIZE + e6bf776fc762 2 months ago 1.43GB + 0ece9b23b9b3 2 months ago 108MB + 23e4dc1f7809 2 months ago 108MB + 529b5644c430 4 months ago 42.6MB + + $ sudo docker image rm + $ sudo du -skh /var/lib/docker/overlay2 + 1.2G /var/lib/docker/overlay2 + + $ sudo rm -rf /var/lib/docker/overlay2 + $ df -h + Filesystem Size Used Avail Use% Mounted on + /dev/root 7.6G 4.9G 2.8G 65% / + tmpfs 483M 0 483M 0% /dev/shm + tmpfs 194M 884K 193M 1% /run + tmpfs 5.0M 0 5.0M 0% /run/lock + /dev/xvda15 105M 6.1M 99M 6% /boot/efi + tmpfs 97M 4.0K 97M 1% /run/user/1000 + + $ sudo mkdir /var/lib/docker/overlay2 + $ sudo ls -lahtr /var/lib/docker/overlay2 + total 8.0K + drwx--x--- 12 root root 4.0K Mar 19 20:20 .. + drwxr-xr-x 2 root root 4.0K Mar 19 20:20 . + + $ sudo chmod 710 /var/lib/docker/overlay2 + total 8.0K + drwx--x--- 12 root root 4.0K Mar 19 20:20 .. + drwx--x--- 2 root root 4.0K Mar 19 20:20 . + + + + +Then uninstall docker (previously installed from docker repository noted in +about/install_run_notes), update apt repositories, re-install, reboot:: + + $ sudo apt list docker --installed + Listing... Done + docker/jammy 1.5-2 all + $ sudo apt-get update + $ sudo apt remove docker + ... + $ sudo apt install docker + ... + $ sudo shutdown -r now + +Apparently, ubuntu comes with a docker install, not removed by apt:: + + $ dpkg -l | grep -i docker + ii docker-buildx-plugin 0.10.4-1~ubuntu.22.04~jammy amd64 Docker Buildx cli plugin. + ii docker-ce 5:23.0.4-1~ubuntu.22.04~jammy amd64 Docker: the open-source application container engine + ii docker-ce-cli 5:23.0.4-1~ubuntu.22.04~jammy amd64 Docker CLI: the open-source application container engine + ii docker-ce-rootless-extras 5:23.0.4-1~ubuntu.22.04~jammy amd64 Rootless support for Docker. + ii docker-compose-plugin 2.17.2-1~ubuntu.22.04~jammy amd64 Docker Compose (V2) plugin for the Docker CLI. 
+ ii wmdocker 1.5-2 amd64 System tray for KDE3/GNOME2 docklet applications + + $ sudo sudo apt-get purge -y docker-buildx-plugin docker-ce docker-ce-cli docker-compose-plugin docker-ce-rootless-extras + $ sudo apt-get autoremove -y --purge docker-buildx-plugin docker-ce docker-ce-cli docker-compose-plugin docker-ce-rootless-extras + $ sudo rm -rf /var/lib/docker + $ sudo groupdel docker + $ sudo rm -rf /var/run/docker.sock + +Then rebuild/restart docker:: + + $ sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up From c5f8ba7aef8aa3382d5566c6f8df1f4d0e5a5d3d Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 20 Mar 2024 14:10:28 -0500 Subject: [PATCH 35/81] extension for easy internal links bw docs --- sphinx/conf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sphinx/conf.py b/sphinx/conf.py index 96297158..1f14af2c 100644 --- a/sphinx/conf.py +++ b/sphinx/conf.py @@ -28,8 +28,9 @@ 'sphinx_rtd_theme', # 'autoapi.extension', 'myst_parser', # For MD support - ] - + # for internal links + 'sphinx.ext.autosectionlabel', +] templates_path = ['_templates'] exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] From a8c328c69ae3f4c01aef1c36e5e5b7362a3bb125 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 20 Mar 2024 14:50:28 -0500 Subject: [PATCH 36/81] docs --- sphinx/about/install_run_notes.rst | 204 +------------------------ sphinx/misc/docker.rst | 229 ++++++++++++++++++++--------- sphinx/misc/ssl_certificates.rst | 176 +++++++++++++++------- 3 files changed, 287 insertions(+), 322 deletions(-) diff --git a/sphinx/about/install_run_notes.rst b/sphinx/about/install_run_notes.rst index be249905..bf891443 100644 --- a/sphinx/about/install_run_notes.rst +++ b/sphinx/about/install_run_notes.rst @@ -23,127 +23,6 @@ Contains Deployment =================================== -SSL ------------------------------------ - -Local self-signed certificates -......................................... -To run the containers, generate `fullchain.pem` and `privkey.pem` (certificate -and the private key) using Let's Encrypt and link these files in `./sp_network/config/`. - -While in development, generate self-signed certificates then link them in -~/git/sp_network/config/ directory for this project:: - - $ mkdir ~/certificates - - openssl req \ - -x509 -sha256 -nodes -newkey rsa:2048 -days 365 \ - -keyout ~/certificates/privkey.pem \ - -out ~/certificates/fullchain.pem - - $ cd ~/git/sp_network/config - $ ln -s ~/certificates/privkey.pem - $ ln -s ~/certificates/fullchain.pem - -To run either the production or the development containers with HTTPS -support, generate `fullchain.pem` and `privkey.pem` (certificate and the private -key) using Let's Encrypt, link these files in the `./config/` directory. -Full instructions in the docs/aws-steps.rst page, under `Set up TLS/SSL` - -Modify the `FQDN` environment variable in `.env.conf` as needed. - -TLS/SSL using Certificate Authority (CA) -.................................................. - -* Make sure that DNS has propogated for domain for SSL -* Stop apache service -* request a certificate for the domain - -:: - - ubuntu@ip-172-31-86-62:~$ sudo systemctl stop apache2 - ubuntu@ip-172-31-86-62:~$ sudo certbot certonly -v - Saving debug log to /var/log/letsencrypt/letsencrypt.log - - How would you like to authenticate with the ACME CA? 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 1: Spin up a temporary webserver (standalone) - 2: Place files in webroot directory (webroot) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Select the appropriate number [1-2] then [enter] (press 'c' to cancel): 1 - Plugins selected: Authenticator standalone, Installer None - Please enter the domain name(s) you would like on your certificate (comma and/or - space separated) (Enter 'c' to cancel): broker-dev.spcoco.org analyst-dev.spcoco.org - Requesting a certificate for broker-dev.spcoco.org and analyst-dev.spcoco.org - Performing the following challenges: - http-01 challenge for broker-dev.spcoco.org - Waiting for verification... - Cleaning up challenges - - Successfully received certificate. - Certificate is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/fullchain.pem - Key is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/privkey.pem - This certificate expires on 2023-10-18. - These files will be updated when the certificate renews. - Certbot has set up a scheduled task to automatically renew this certificate in the background. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - If you like Certbot, please consider supporting our work by: - * Donating to ISRG / Let's Encrypt: https://letsencrypt.org/donate - * Donating to EFF: https://eff.org/donate-le - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ubuntu@ip-172-31-86-62:~$ - - -* as superuser, link the newly created fullchain.pem and privkey.pem files from the - letsencrypt live to the project/config directory -* change the owner so that they can be used in Docker containers - -:: - - $ sudo su - - # cp -p /etc/letsencrypt/live/dev.spcoco.org/* /home/ubuntu/certificates/ - # chown ubuntu:ubuntu /home/ubuntu/certificates/* - # exit - $ cd ~/git/sp_network/config - $ ln -s ~/certificates/fullchain.pem - $ ln -s ~/certificates/privkey.pem - -Renew Certbot SSL certificates -......................................... - -SSL certificates are served from the instance (AWS EC2), and need port 80 to be renewed. -These are administered by Letsencrypt using Certbot and are only valid for 90 days at -a time. When it is time for a renewal (approx every 60 days), bring the docker -containers down. Renew the certificates, then bring the containers up again. - -Amazon EC2 containers do not need apache running, certbot runs its own temp web server. - -Test with https://broker.spcoco.org/api/v1/frontend/?occid=01493b05-4310-4f28-9d81-ad20860311f3 - -:: - - $ sudo certbot certificates - $ sudo docker compose stop - $ sudo su - - # certbot renew - # cp -p /etc/letsencrypt/live/dev.spcoco.org/* /home/ubuntu/certificates/ - # chown ubuntu:ubuntu /home/ubuntu/certificates/* - # exit - $ ls -lahtr ~/git/sp_network/config - - $ sudo docker system prune --all --volumes - $ sudo docker compose up -d - -TODO: SSL through Amazon -......................................... 
- -* Create Elastic IP address for EC2 instance -* Request a public certificate through Certificate Manager (ACM) - * Choose DNS validation - * Add tags sp_network, dev or prod, others - - Install ====================================== @@ -185,16 +64,10 @@ Install repo from Github $ git clone git@github.com:specifysystems/sp_network.git $ git checkout -Install certificates into config directory -------------------------------------------------------- - -* Link the certificates in the repo config directory +SSL +----------------------------------- +:ref:`Specify Network SSL certificates` -:: - $ cd ~/git/sp_network - $ cd config - $ ln -s ~/certificates/fullchain1.pem - $ ln -s ~/certificates/privkey1.pem Direct Docker to correct FQDN ------------------------------------ @@ -265,77 +138,10 @@ Then stop/rebuild/start:: $ sudo docker system prune --all --volumes $ sudo docker compose up -d -Docker manipulation +Docker ================================= -Edit the docker environment files -------------------------------------------- - -* Add the container domain name to the files .env.broker.conf and .env.analyst.conf -* Change the FQDN value to the fully qualified domain name of the server. - - * If this is a local testing deployment, it will be "localhost" - * For a development or production server it will be the FQDN with correct subdomain - for each container, i.e FQDN=broker.spcoco.org in .env.broker.conf and - FQDN=analyst.spcoco.org in .env.analyst.conf - -Run the containers (production) -------------------------------------------- - -Start the containers with the Docker composition file:: - - sudo docker compose -f docker-compose.yml up -d - -Specify Network is now available at [https://localhost/](https://localhost:443) - - -Run the containers (development) -------------------------------------------- - -Note that the development compose file, docker-compose.development.yml, is referenced -first on the command line. It has elements that override those defined in the -general compose file, docker-compose.yml:: - - sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up - -Flask has hot-reload enabled. - - -Rebuild/restart -------------------------------------------- - -To delete all containers, images, networks and volumes, stop any running -containers:: - - sudo docker compose stop - - -And run this command (which ignores running container):: - - sudo docker system prune --all --volumes - -Then rebuild/restart:: - - sudo docker compose up -d - -Examine container -------------------------------------------- - -To examine containers at a shell prompt:: - - sudo docker exec -it sp_network-nginx-1 /bin/sh - -Error port in use: -"Error starting userland proxy: listen tcp4 0.0.0.0:443: bind: address already in use" - -See what else is using the port. In my case apache was started on reboot. Bring down -all docker containers, shut down httpd, bring up docker. 
- -:: - lsof -i -P -n | grep 443 - sudo docker compose down - sudo systemctl stop httpd - sudo docker compose up -d +More info at :ref:`Docker` Dev Environment diff --git a/sphinx/misc/docker.rst b/sphinx/misc/docker.rst index 0f43b3f6..074bb1e4 100644 --- a/sphinx/misc/docker.rst +++ b/sphinx/misc/docker.rst @@ -1,10 +1,85 @@ -Docker Troubleshooting +Docker ############################## -Out of space error -************************ +Standard manipulation +================================= -Problem +Edit the docker environment files +------------------------------------------- + +* Add the container domain name to the files .env.broker.conf and .env.analyst.conf +* Change the FQDN value to the fully qualified domain name of the server. + + * If this is a local testing deployment, it will be "localhost" + * For a development or production server it will be the FQDN with correct subdomain + for each container, i.e FQDN=broker.spcoco.org in .env.broker.conf and + FQDN=analyst.spcoco.org in .env.analyst.conf + +Run the containers (production) +------------------------------------------- + +Start the containers with the Docker composition file:: + + sudo docker compose -f docker-compose.yml up -d + +Specify Network is now available at [https://localhost/](https://localhost:443) + + +Run the containers (development) +------------------------------------------- + +Note that the development compose file, docker-compose.development.yml, is referenced +first on the command line. It has elements that override those defined in the +general compose file, docker-compose.yml:: + + sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up + +Flask has hot-reload enabled. + + +Rebuild/restart +------------------------------------------- + +To delete all containers, images, networks and volumes, stop any running +containers:: + + sudo docker compose stop + + +And run this command (which ignores running container):: + + sudo docker system prune --all --volumes + +Then rebuild/restart:: + + sudo docker compose up -d + # or + sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up + +Examine container +------------------------------------------- + +To examine containers at a shell prompt:: + + sudo docker exec -it sp_network-nginx-1 /bin/sh + +Error port in use: +"Error starting userland proxy: listen tcp4 0.0.0.0:443: bind: address already in use" + +See what else is using the port. In my case apache was started on reboot. Bring down +all docker containers, shut down httpd, bring up docker. 
+ +:: + lsof -i -P -n | grep 443 + sudo docker compose down + sudo systemctl stop httpd + sudo docker compose up -d + + +Troubleshooting +================================= + +Out of Space Problem ------------------ Running `certbot certificates` failed because the EC2 instance running Docker @@ -22,86 +97,98 @@ containers for Specify Network development shows disk full:: overlay 7.6G 7.6G 0 100% /var/lib/docker/overlay2/3bd6d12b36e746f9c74227b6ac9d928a3179d8b604a9dea4fd88625eab84be1f/merged tmpfs 97M 4.0K 97M 1% /run/user/1000 - -Research ------------------- - The disk is small, but the culprit is /var/lib/docker/overlay2 Some strategies at: https://forums.docker.com/t/some-way-to-clean-up-identify-contents-of-var-lib-docker-overlay/30604/19 -Actual disk usage is correctly reported here (unlike some of the use cases above), so -for now, clean it all out by stopping, pruning, removing images, killing the overlay2 -directory, recreating the overlay2 directory, changing permissions, then rebuilding -and restarting the docker image:: +Solution: +------------------- - $ sudo docker compose stop - $ sudo docker system prune --all --volumes - $ sudo docker image ls - REPOSITORY TAG IMAGE ID CREATED SIZE - e6bf776fc762 2 months ago 1.43GB - 0ece9b23b9b3 2 months ago 108MB - 23e4dc1f7809 2 months ago 108MB - 529b5644c430 4 months ago 42.6MB - - $ sudo docker image rm - $ sudo du -skh /var/lib/docker/overlay2 - 1.2G /var/lib/docker/overlay2 - - $ sudo rm -rf /var/lib/docker/overlay2 - $ df -h +* The instance was created with a volume of an 8gb default size. +* Stop the instance +* Modify the volume. +* Restart the EC2 instance - ok while the volume is in the optimizing state. +* If the instance does not recognize the extended volume immediately:: + + ubuntu@ip-172-31-91-57:~$ df -h Filesystem Size Used Avail Use% Mounted on - /dev/root 7.6G 4.9G 2.8G 65% / - tmpfs 483M 0 483M 0% /dev/shm - tmpfs 194M 884K 193M 1% /run + /dev/root 7.6G 7.6G 0 100% / + tmpfs 475M 0 475M 0% /dev/shm + tmpfs 190M 11M 180M 6% /run tmpfs 5.0M 0 5.0M 0% /run/lock /dev/xvda15 105M 6.1M 99M 6% /boot/efi - tmpfs 97M 4.0K 97M 1% /run/user/1000 + tmpfs 95M 4.0K 95M 1% /run/user/1000 + ubuntu@ip-172-31-91-57:~$ sudo lsblk + sudo: unable to resolve host ip-172-31-91-57: Temporary failure in name resolution + NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINTS + loop0 7:0 0 24.9M 1 loop /snap/amazon-ssm-agent/7628 + loop1 7:1 0 25.2M 1 loop /snap/amazon-ssm-agent/7983 + loop2 7:2 0 55.7M 1 loop /snap/core18/2796 + loop3 7:3 0 55.7M 1 loop /snap/core18/2812 + loop4 7:4 0 63.9M 1 loop /snap/core20/2105 + loop5 7:5 0 63.9M 1 loop /snap/core20/2182 + loop6 7:6 0 87M 1 loop /snap/lxd/27037 + loop7 7:7 0 87M 1 loop /snap/lxd/27428 + loop8 7:8 0 40.4M 1 loop /snap/snapd/20671 + loop9 7:9 0 39.1M 1 loop /snap/snapd/21184 + xvda 202:0 0 30G 0 disk + ├─xvda1 202:1 0 7.9G 0 part / + ├─xvda14 202:14 0 4M 0 part + └─xvda15 202:15 0 106M 0 part /boot/efi + +* extend the filesystem: + https://docs.aws.amazon.com/ebs/latest/userguide/recognize-expanded-volume-linux.html +* In this case we want to extend xvda1, so:: + + $ sudo growpart /dev/xvda 1 + sudo: unable to resolve host ip-172-31-91-57: Temporary failure in name resolution + mkdir: cannot create directory ‘/tmp/growpart.1496’: No space left on device + FAILED: failed to make temp dir + +* We must free up space to allow extension:: - $ sudo mkdir /var/lib/docker/overlay2 - $ sudo ls -lahtr /var/lib/docker/overlay2 - total 8.0K - drwx--x--- 12 root root 4.0K Mar 19 20:20 .. 
- drwxr-xr-x 2 root root 4.0K Mar 19 20:20 . - - $ sudo chmod 710 /var/lib/docker/overlay2 - total 8.0K - drwx--x--- 12 root root 4.0K Mar 19 20:20 .. - drwx--x--- 2 root root 4.0K Mar 19 20:20 . - - - - -Then uninstall docker (previously installed from docker repository noted in -about/install_run_notes), update apt repositories, re-install, reboot:: - - $ sudo apt list docker --installed - Listing... Done - docker/jammy 1.5-2 all - $ sudo apt-get update - $ sudo apt remove docker - ... - $ sudo apt install docker + $ sudo docker system prune --all --volumes + sudo: unable to resolve host ip-172-31-91-57: Temporary failure in name resolution + WARNING! This will remove: + - all stopped containers + - all networks not used by at least one container + - all volumes not used by at least one container + - all images without at least one container associated to them + - all build cache + + Are you sure you want to continue? [y/N] y + Deleted Containers: + 24768ca767d37f248eff173f13556007468330298329200d533dfa9ca011e409 + 809709d6f8bfa8575009a0d07df16ee78852e9ab3735aa19561ac0dbc0313123 + 64591ed14ecae60721ea367af650683f738636167162f6ed577063582c210aa9 + + Deleted Networks: + sp_network_nginx + + Deleted Images: + untagged: nginx:alpine + untagged: nginx@sha256:a59278fd22a9d411121e190b8cec8aa57b306aa3332459197777583beb728f59 + deleted: sha256:529b5644c430c06553d2e8082c6713fe19a4169c9dc2369cbb960081f52924ff ... - $ sudo shutdown -r now + deleted: sha256:e74dab46dbca98b4be75dfbda3608cd857914b750ecd251c4f1bdbb4ef623c8c -Apparently, ubuntu comes with a docker install, not removed by apt:: + Total reclaimed space: 1.536GB - $ dpkg -l | grep -i docker - ii docker-buildx-plugin 0.10.4-1~ubuntu.22.04~jammy amd64 Docker Buildx cli plugin. - ii docker-ce 5:23.0.4-1~ubuntu.22.04~jammy amd64 Docker: the open-source application container engine - ii docker-ce-cli 5:23.0.4-1~ubuntu.22.04~jammy amd64 Docker CLI: the open-source application container engine - ii docker-ce-rootless-extras 5:23.0.4-1~ubuntu.22.04~jammy amd64 Rootless support for Docker. - ii docker-compose-plugin 2.17.2-1~ubuntu.22.04~jammy amd64 Docker Compose (V2) plugin for the Docker CLI. - ii wmdocker 1.5-2 amd64 System tray for KDE3/GNOME2 docklet applications +* Extend filesystem:: - $ sudo sudo apt-get purge -y docker-buildx-plugin docker-ce docker-ce-cli docker-compose-plugin docker-ce-rootless-extras - $ sudo apt-get autoremove -y --purge docker-buildx-plugin docker-ce docker-ce-cli docker-compose-plugin docker-ce-rootless-extras - $ sudo rm -rf /var/lib/docker - $ sudo groupdel docker - $ sudo rm -rf /var/run/docker.sock + $ sudo growpart /dev/xvda 1 + sudo: unable to resolve host ip-172-31-91-57: Temporary failure in name resolution + CHANGED: partition=1 start=227328 old: size=16549855 end=16777183 new: size=62687199 end=62914527 + $ df -h + Filesystem Size Used Avail Use% Mounted on + /dev/root 7.6G 5.7G 2.0G 75% / + tmpfs 475M 0 475M 0% /dev/shm + tmpfs 190M 18M 173M 10% /run + tmpfs 5.0M 0 5.0M 0% /run/lock + /dev/xvda15 105M 6.1M 99M 6% /boot/efi + tmpfs 95M 4.0K 95M 1% /run/user/1000 -Then rebuild/restart docker:: - $ sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up +* Stop apache2 if running +* Rebuild the docker containers diff --git a/sphinx/misc/ssl_certificates.rst b/sphinx/misc/ssl_certificates.rst index 20e9121d..1ee1878b 100644 --- a/sphinx/misc/ssl_certificates.rst +++ b/sphinx/misc/ssl_certificates.rst @@ -7,55 +7,127 @@ Letsencrypt using Certbot. They are only valid for 90 days at a time. 
TODO: move administration to AWS, and script renewal if necessary - -Renewal procedure -============================================= - -* Change to superuser, then check the validity of your certificates:: - - sudo su - - certbot certificates - -* When it is time for a renewal (approx every 60 days), move to the Specify Network - project directory where Docker was started, and stop the Docker containers:: - - cd /home/ubuntu/git/sp_network - docker compose stop - -* Renew the certificates:: - - certbot renew - -* Move to /etc/letsencrypt/archive/ and find the most recent - certificate names in the directory (certX.pem, chainX.pem, fullchainX.pem, - privkeyX.pem, where X is an integer):: - - cd /etc/letsencrypt/archive/spcoco.org/ - ls -lahtr - -* Copy the new certificates to /home/ubuntu/certificates, changing - the name to cert.pem, chain.pem, fullchain.pem, privkey.pem (no X integer). Then - change the owner from root, to the username (ubuntu):: - - cp cert4.pem /home/ubuntu/certificates/cert.pem - cp chain4.pem /home/ubuntu/certificates/chain.pem - cp fullchain4.pem /home/ubuntu/certificates/fullchain.pem - cp privkey4.pem /home/ubuntu/certificates/privkey.pem - -* Move to the directory with the certificates and change the - owner to ubuntu, then exit superuser:: - - cd /home/ubuntu/certificates - chown ubuntu:ubuntu * - exit - -* Move to the config directory and create symbolic links to the new fullchain.pem - and privkey.pem files:: - - cd /home/ubuntu/git/sp_network/config - ln -s /home/ubuntu/certificates/fullchain.pem - ln -s /home/ubuntu/certificates/privkey.pem - -* Then restart the containers:: - - sudo docker compose up -d +Local self-signed certificates +......................................... +To run the containers, generate `fullchain.pem` and `privkey.pem` (certificate +and the private key) using Let's Encrypt and link these files in `./sp_network/config/`. + +While in development, generate self-signed certificates then link them in +~/git/sp_network/config/ directory for this project:: + + $ mkdir ~/certificates + + openssl req \ + -x509 -sha256 -nodes -newkey rsa:2048 -days 365 \ + -keyout ~/certificates/privkey.pem \ + -out ~/certificates/fullchain.pem + + $ cd ~/git/sp_network/config + $ ln -s ~/certificates/privkey.pem + $ ln -s ~/certificates/fullchain.pem + +To run either the production or the development containers with HTTPS +support, generate `fullchain.pem` and `privkey.pem` (certificate and the private +key) using Let's Encrypt, link these files in the `./config/` directory. +Full instructions in the docs/aws-steps.rst page, under `Set up TLS/SSL` + +Modify the `FQDN` environment variable in `.env.conf` as needed. + +TLS/SSL using Certificate Authority (CA) +.................................................. + +* Make sure that DNS has propogated for domain for SSL +* Stop apache service +* request a certificate for the domain + +:: + + ubuntu@ip-172-31-86-62:~$ sudo systemctl stop apache2 + ubuntu@ip-172-31-86-62:~$ sudo certbot certonly -v + Saving debug log to /var/log/letsencrypt/letsencrypt.log + + How would you like to authenticate with the ACME CA? 
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + 1: Spin up a temporary webserver (standalone) + 2: Place files in webroot directory (webroot) + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + Select the appropriate number [1-2] then [enter] (press 'c' to cancel): 1 + Plugins selected: Authenticator standalone, Installer None + Please enter the domain name(s) you would like on your certificate (comma and/or + space separated) (Enter 'c' to cancel): broker-dev.spcoco.org analyst-dev.spcoco.org + Requesting a certificate for broker-dev.spcoco.org and analyst-dev.spcoco.org + Performing the following challenges: + http-01 challenge for broker-dev.spcoco.org + Waiting for verification... + Cleaning up challenges + + Successfully received certificate. + Certificate is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/fullchain.pem + Key is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/privkey.pem + This certificate expires on 2023-10-18. + These files will be updated when the certificate renews. + Certbot has set up a scheduled task to automatically renew this certificate in the background. + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + If you like Certbot, please consider supporting our work by: + * Donating to ISRG / Let's Encrypt: https://letsencrypt.org/donate + * Donating to EFF: https://eff.org/donate-le + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + ubuntu@ip-172-31-86-62:~$ + + +Install certificates into config directory +------------------------------------------------------- + +* Create a ~/certificates directory to hold certificate files +* as superuser, copy the newly created fullchain.pem and privkey.pem files from the + letsencrypt live +* change the owner so that they can be used in Docker containers +* Link the certificates in the repo config directory + +:: + + $ cd + $ mkdir certificates + $ sudo su - + # cp -p /etc/letsencrypt/live//* /home/ubuntu/certificates/ + # chown ubuntu:ubuntu /home/ubuntu/certificates/* + # exit + $ cd ~/git/sp_network/config + $ ln -s ~/certificates/fullchain.pem + $ ln -s ~/certificates/privkey.pem + +Renew Certbot SSL certificates +......................................... + +SSL certificates are served from the instance (AWS EC2), and need port 80 to be renewed. +These are administered by Letsencrypt using Certbot and are only valid for 90 days at +a time. When it is time for a renewal (approx every 60 days), bring the docker +containers down. Prune the volumes so the new containers and volumes will be created +with the updated certificates. Renew the certificates, then bring the containers up. + +Amazon EC2 containers do not need apache running, certbot runs its own temp web server. + +Test with https://broker.spcoco.org/api/v1/frontend/?occid=01493b05-4310-4f28-9d81-ad20860311f3 + +:: + + $ sudo certbot certificates + $ sudo docker compose stop + $ sudo su - + # certbot renew + # cp -p /etc/letsencrypt/live/spcoco.org/* /home/ubuntu/certificates/ + # chown ubuntu:ubuntu /home/ubuntu/certificates/* + # exit + $ ls -lahtr ~/git/sp_network/config + + $ sudo docker system prune --all --volumes + $ sudo docker compose up -d + +TODO: SSL through Amazon +......................................... 
+ +* Create Elastic IP address for EC2 instance +* Request a public certificate through Certificate Manager (ACM) + * Choose DNS validation + * Add tags sp_network, dev or prod, others From fbb28630724c8c27bcbd33e7db2bcdff19788168 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 20 Mar 2024 15:41:16 -0500 Subject: [PATCH 37/81] remove unused dependencies --- requirements.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index f7a78ccb..4b7aa412 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,4 @@ boto3>=1.34.60 sqlalchemy pandas pandas-sql -pyarrow -s3fs -ggshield \ No newline at end of file +s3fs \ No newline at end of file From ca5ab2aed1e96a4dd8009e67023e57d6d6fd200a Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 20 Mar 2024 15:41:58 -0500 Subject: [PATCH 38/81] call installed python executable --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index a8c33950..9c17826d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,7 +20,7 @@ USER specify COPY --chown=specify:specify ./requirements.txt . -RUN python -m venv venv \ +RUN python3 -m venv venv \ && venv/bin/pip install --no-cache-dir -r ./requirements.txt COPY --chown=specify:specify ./sppy ./sppy From 29c05caf81925352de48441b3f67c9298a74e60b Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 20 Mar 2024 17:09:10 -0500 Subject: [PATCH 39/81] rm duplicate entries for location / --- config/nginx.conf | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/config/nginx.conf b/config/nginx.conf index cab6d732..aeea668d 100644 --- a/config/nginx.conf +++ b/config/nginx.conf @@ -7,7 +7,7 @@ server { server { listen 443 ssl; index index.html; - server_name broker-dev.spcoco.org; + # server_name broker.spcoco.org; ssl_certificate /etc/letsencrypt/fullchain.pem; ssl_certificate_key /etc/letsencrypt/privkey.pem; @@ -31,11 +31,11 @@ server { proxy_set_header Origin "${scheme}://${http_host}"; } - location / { - root /var/www/; - try_files $uri $uri/ = 404; - gzip_static on; - } +# http_host location / { +# root /var/www/; +# try_files $uri $uri/ = 404; +# gzip_static on; +# } location /static/js { root /volumes/webpack-output; @@ -48,11 +48,12 @@ server { rewrite ^/static/(.*)$ /$1 break; gzip_static on; } +} server { listen 443 ssl; index index.html; - server_name analyst-dev.spcoco.org; + # server_name analyst.spcoco.org; ssl_certificate /etc/letsencrypt/fullchain.pem; ssl_certificate_key /etc/letsencrypt/privkey.pem; @@ -76,11 +77,11 @@ server { proxy_set_header Origin "${scheme}://${http_host}"; } - location / { - root /var/www/; - try_files $uri $uri/ = 404; - gzip_static on; - } +# location / { +# root /var/www/; +# try_files $uri $uri/ = 404; +# gzip_static on; +# } location /static/js { root /volumes/webpack-output; From 6f70afe071c1c3e309b8b56945b799312f51889c Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 20 Mar 2024 17:09:48 -0500 Subject: [PATCH 40/81] add missing routes module to application --- docker-compose.development.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.development.yml b/docker-compose.development.yml index 026c7eb6..50f8c4af 100644 --- a/docker-compose.development.yml +++ b/docker-compose.development.yml @@ -21,7 +21,7 @@ services: ports: - "5002:5002" environment: - - FLASK_APP=flask_app.analyst:app + - FLASK_APP=flask_app.analyst.routes:app - FLASK_MANAGE=flask_app.analyst.manage - 
DEBUG_PORT=5002 volumes: From 0b4f61853533ca6fa0f617cb30c14e608488acb4 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 20 Mar 2024 17:37:00 -0500 Subject: [PATCH 41/81] update server_name for each instance --- config/nginx.conf | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/config/nginx.conf b/config/nginx.conf index aeea668d..ef94be1c 100644 --- a/config/nginx.conf +++ b/config/nginx.conf @@ -4,10 +4,11 @@ server { return 301 https://$host$request_uri; } +# Broker server { listen 443 ssl; index index.html; - # server_name broker.spcoco.org; + server_name broker.localhost; ssl_certificate /etc/letsencrypt/fullchain.pem; ssl_certificate_key /etc/letsencrypt/privkey.pem; @@ -31,12 +32,6 @@ server { proxy_set_header Origin "${scheme}://${http_host}"; } -# http_host location / { -# root /var/www/; -# try_files $uri $uri/ = 404; -# gzip_static on; -# } - location /static/js { root /volumes/webpack-output; rewrite ^/static/js/(.*)$ /$1 break; @@ -50,10 +45,11 @@ server { } } +# Analyst server { listen 443 ssl; index index.html; - # server_name analyst.spcoco.org; + server_name analyst.localhost; ssl_certificate /etc/letsencrypt/fullchain.pem; ssl_certificate_key /etc/letsencrypt/privkey.pem; @@ -77,12 +73,6 @@ server { proxy_set_header Origin "${scheme}://${http_host}"; } -# location / { -# root /var/www/; -# try_files $uri $uri/ = 404; -# gzip_static on; -# } - location /static/js { root /volumes/webpack-output; rewrite ^/static/js/(.*)$ /$1 break; From a637192b2d57ab980d59e18a0ef2b87fa77cf396 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 21 Mar 2024 16:05:31 -0500 Subject: [PATCH 42/81] doc --- sphinx/about/install_run_notes.rst | 5 +++-- sphinx/aws/aws-setup.rst | 9 ++++++++ sphinx/misc/ssl_certificates.rst | 36 +++++++++++++++++++++++------- 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/sphinx/about/install_run_notes.rst b/sphinx/about/install_run_notes.rst index bf891443..484b1917 100644 --- a/sphinx/about/install_run_notes.rst +++ b/sphinx/about/install_run_notes.rst @@ -41,7 +41,7 @@ Install Docker Add docker repository, then use apt to install Docker: https://docs.docker.com/engine/install/ubuntu/ -Install repo from Github +Install/Update repo from Github --------------------------------------- * generate an SSH key for communicating with Github @@ -72,7 +72,8 @@ SSL Direct Docker to correct FQDN ------------------------------------ -Edit FQDN value in env.conf (or .env.analyst.conf and .env.broker.conf) to actual FQDN +Edit FQDN value in .env.analyst.conf and .env.broker.conf (referenced by the docker +compose file) and server_name in config/nginx.conf to actual FQDN. Testing diff --git a/sphinx/aws/aws-setup.rst b/sphinx/aws/aws-setup.rst index 828da48d..b367bd92 100644 --- a/sphinx/aws/aws-setup.rst +++ b/sphinx/aws/aws-setup.rst @@ -1,6 +1,15 @@ Authentication #################### +EC2 instance creation +=========================================================== + +* Instance type t3.small (2gb RAM). 
+* Ubuntu Server 22.04 LTS, SSD Volume Type (free tier eligible), x86 architecture +* Security Group: launch-wizard-1 +* 30 Gb General Purpose SSD (gp2) +* For dev, Spot instance (in Advanced options) + For programmatic access to S3 =========================================================== Configure AWS credentials either through diff --git a/sphinx/misc/ssl_certificates.rst b/sphinx/misc/ssl_certificates.rst index 1ee1878b..cc83af66 100644 --- a/sphinx/misc/ssl_certificates.rst +++ b/sphinx/misc/ssl_certificates.rst @@ -42,8 +42,8 @@ TLS/SSL using Certificate Authority (CA) :: - ubuntu@ip-172-31-86-62:~$ sudo systemctl stop apache2 - ubuntu@ip-172-31-86-62:~$ sudo certbot certonly -v + $ sudo systemctl stop apache2 + $ sudo certbot certonly -v Saving debug log to /var/log/letsencrypt/letsencrypt.log How would you like to authenticate with the ACME CA? @@ -53,18 +53,39 @@ TLS/SSL using Certificate Authority (CA) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Select the appropriate number [1-2] then [enter] (press 'c' to cancel): 1 Plugins selected: Authenticator standalone, Installer None + Enter email address (used for urgent renewal and security notices) + (Enter 'c' to cancel): aimee.stewart@ku.edu + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + Please read the Terms of Service at + https://letsencrypt.org/documents/LE-SA-v1.3-September-21-2022.pdf. You must + agree in order to register with the ACME server. Do you agree? + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + (Y)es/(N)o: Y + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + Would you be willing, once your first certificate is successfully issued, to + share your email address with the Electronic Frontier Foundation, a founding + partner of the Let's Encrypt project and the non-profit organization that + develops Certbot? We'd like to send you email about our work encrypting the web, + EFF news, campaigns, and ways to support digital freedom. + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + (Y)es/(N)o: N + Account registered. Please enter the domain name(s) you would like on your certificate (comma and/or - space separated) (Enter 'c' to cancel): broker-dev.spcoco.org analyst-dev.spcoco.org - Requesting a certificate for broker-dev.spcoco.org and analyst-dev.spcoco.org + space separated) (Enter 'c' to cancel): dev.spcoco.org, analyst-dev.spcoco.org, broker-dev.spcoco.org + Requesting a certificate for dev.spcoco.org and 2 more domains Performing the following challenges: + http-01 challenge for analyst-dev.spcoco.org http-01 challenge for broker-dev.spcoco.org + http-01 challenge for dev.spcoco.org Waiting for verification... Cleaning up challenges Successfully received certificate. - Certificate is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/fullchain.pem - Key is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/privkey.pem - This certificate expires on 2023-10-18. + Certificate is saved at: /etc/letsencrypt/live/dev.spcoco.org/fullchain.pem + Key is saved at: /etc/letsencrypt/live/dev.spcoco.org/privkey.pem + This certificate expires on 2024-06-19. These files will be updated when the certificate renews. Certbot has set up a scheduled task to automatically renew this certificate in the background. 
@@ -73,7 +94,6 @@ TLS/SSL using Certificate Authority (CA) * Donating to ISRG / Let's Encrypt: https://letsencrypt.org/donate * Donating to EFF: https://eff.org/donate-le - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ubuntu@ip-172-31-86-62:~$ Install certificates into config directory From c2422f98113251907859813caeafb62bd2c03dab Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 10:51:10 -0500 Subject: [PATCH 43/81] doc --- sphinx/about/install_run_notes.rst | 114 +++++++++++++++++------------ 1 file changed, 68 insertions(+), 46 deletions(-) diff --git a/sphinx/about/install_run_notes.rst b/sphinx/about/install_run_notes.rst index 484b1917..7a7f9d0e 100644 --- a/sphinx/about/install_run_notes.rst +++ b/sphinx/about/install_run_notes.rst @@ -64,6 +64,18 @@ Install/Update repo from Github $ git clone git@github.com:specifysystems/sp_network.git $ git checkout +DNS +---------------------- + +If this is a development or production server with an actual domain, first point the +DNS record (through whatever service is managing the domain, GoDaddy in the case of +spcoco.org) to the static IP address for the server. + +For AWS, create (or modify) an Elastic IP address to point to the EC2 instance. + +If replacing an EC2 instance, disassociate the Elastic IP address from the old EC2 +instance, and associate it with the new instance. + SSL ----------------------------------- :ref:`Specify Network SSL certificates` @@ -76,25 +88,41 @@ Edit FQDN value in .env.analyst.conf and .env.broker.conf (referenced by the doc compose file) and server_name in config/nginx.conf to actual FQDN. -Testing ---------------------------------------- -On a development server, check the following URL endpoints: +Docker +================================= -* Index page: https://localhost +More info at :ref:`Docker` + + +Test +=========================== +On a development server, check the following URL endpoints: * Broker: - * https://localhost/api/v1/ + * https://localhost.broker + * https://localhost.broker/api/v1/ + + * https://localhost.broker/api/v1/badge/ + * https://localhost.broker/api/v1/name/ + * https://localhost.broker/api/v1/occ/ + * https://localhost.broker/api/v1/frontend/ + + * https://localhost.broker/api/v1/badge/gbif?icon_status=active + * https://localhost.broker/api/v1/occ/?occid=a7156437-55ec-4c6f-89de-938f9361753d + * https://localhost.broker/api/v1/name/Harengula%20jaguana + * https://localhost.broker/api/v1/frontend/?occid=a7156437-55ec-4c6f-89de-938f9361753d + +* Analyst: - * https://localhost/api/v1/badge/ - * https://localhost/api/v1/name/ - * https://localhost/api/v1/occ/ - * https://localhost/api/v1/frontend/ + * https://localhost.analyst + * https://localhost.analyst/api/v1/ - * https://localhost/api/v1/badge/gbif?icon_status=active - * https://localhost/api/v1/occ/?occid=a7156437-55ec-4c6f-89de-938f9361753d - * https://localhost/api/v1/name/Harengula%20jaguana - * https://localhost/api/v1/frontend/?occid=a7156437-55ec-4c6f-89de-938f9361753d + * https://localhost.analyst/api/v1/count/ + * https://localhost.analyst/api/v1/rank/ + + * http://localhost.analyst/api/v1/count/?dataset_key=0000e36f-d0e9-46b0-aa23-cc1980f00515 + * http://localhost.analyst/api/v1/rank/?by_species=true For local testing in a development environment, tests in the tests directory require the lmtest module available at https://github.com/lifemapper/lmtest. 
@@ -109,7 +137,7 @@ virtual environment activation script (bin/activate) script:: export WORKING_DIRECTORY="scratch-path" -**Specify Network** homepage is now available at https://localhost/ and http://localhost. +**Specify Network** homepage is now available at https://localhost/ **Broker** (aka back-end): @@ -122,29 +150,6 @@ needed. **Flask** is watching for back-end file changes and restarts the server when needed. -Troubleshooting -=========================================== - -For webserver errors, check logs of nginx container:: - - $ sudo docker logs --tail 1000 sp_network-nginx-1 - $ sudo docker logs --tail 1000 sp_network-broker-1 - - -Error: "... cannot import name 'url_quote' from 'werkzeug.urls'" in broker container -Fix: Add Werkzeug==2.2.2 to requirements.txt to ensure it does not use 3.0+ -Then stop/rebuild/start:: - - $ sudo docker compose stop - $ sudo docker system prune --all --volumes - $ sudo docker compose up -d - -Docker -================================= - -More info at :ref:`Docker` - - Dev Environment ========================== @@ -156,13 +161,15 @@ Dev Environment $ pip install -r requirements.txt -Configure Debugger in local IDE +Configure Debugger ======================================== +Pycharm +------------------ [Instructions for PyCharm] (https://kartoza.com/en/blog/using-docker-compose-based-python-interpreter-in-pycharm/) -Debug +Flask ------------------------------------------- To run flask in debug mode, first set up Flask environment, then start the flask @@ -187,6 +194,28 @@ Reset the FLASK_APP variable to test an alternate resource:: Troubleshooting ====================================== + +For webserver errors +----------------------- + +Check logs of nginx container:: + + $ sudo docker logs --tail 1000 sp_network-nginx-1 + $ sudo docker logs --tail 1000 sp_network-broker-1 + + +Import error from werkzeug.urls +-------------------------------------- + +Error: "... 
cannot import name 'url_quote' from 'werkzeug.urls'" in broker container +Fix: Add Werkzeug==2.2.2 to requirements.txt to ensure it does not use 3.0+ +Then stop/rebuild/start:: + + $ sudo docker compose stop + $ sudo docker system prune --all --volumes + $ sudo docker compose up -d + + pip errors with SSL ------------------------------------------- @@ -225,13 +254,6 @@ pre-commit build errors * Updated .pre-commit-config.yaml isort version to latest, https://github.com/PyCQA/isort, fixed build -AWS setup -=================================== - -* Add raw GBIF data to S3 - - - Dependencies: ============== From a22f266d7615a19cc3c6265c3642e2bd6fa1a1a9 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 10:51:25 -0500 Subject: [PATCH 44/81] remove unused dependency --- requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4b7aa412..ab01b10e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,5 +8,4 @@ awscli boto3>=1.34.60 sqlalchemy pandas -pandas-sql -s3fs \ No newline at end of file +pandas-sql \ No newline at end of file From 07e673d6fcce8e3da91e254334837b043cc096cb Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 10:52:01 -0500 Subject: [PATCH 45/81] disable dataset name resolution with GBIF API --- flask_app/analyst/base.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 3c29035a..8859d8ad 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -102,13 +102,14 @@ def _standardize_params( def _add_dataset_names_to_records( cls, records, dataset_key_field="datasetkey", dataset_name_field="dataset_name"): - # TODO: change this to a call to an S3 table with all dataset keys/names - # if import is at top level, causes recursion error in awss3.count_datasets - from sppy.tools.provider.gbif import GbifAPI - gbif = GbifAPI(service="dataset") - for rec in records: - dataset_name, _ = gbif.get_dataset(rec[dataset_key_field]) - rec[dataset_name_field] = dataset_name + pass + # # TODO: change this to a call to an S3 table with all dataset keys/names + # # if import is at top level, causes recursion error in awss3.count_datasets + # from sppy.tools.provider.gbif import GbifAPI + # gbif = GbifAPI(service="dataset") + # for rec in records: + # dataset_name, _ = gbif.get_dataset(rec[dataset_key_field]) + # rec[dataset_name_field] = dataset_name # ............................................................................. 
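The resolution disabled above made one GBIF Dataset API call per record to fill in the dataset name, which is the behavior the TODO proposes replacing with a pre-built S3 lookup table. A minimal sketch of that per-record call, assuming only the public GBIF v1 dataset endpoint (the helper name and timeout are illustrative, not code from this repository):

    import requests

    def resolve_dataset_title(dataset_key, timeout=10):
        """Return the title for a GBIF datasetKey, or None if the lookup fails."""
        url = f"https://api.gbif.org/v1/dataset/{dataset_key}"
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException:
            return None
        # The same GBIF record also carries the citation under ["citation"]["text"].
        return response.json().get("title")

    # Per-record use, e.g.: rec["dataset_name"] = resolve_dataset_title(rec["datasetkey"])

One HTTP round trip per returned record is the cost that the lookup-table patches later in this series avoid.
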
From 5c4444bf43fade145fbd302d2b97627444d762ea Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 15:59:23 -0500 Subject: [PATCH 46/81] enclose error message in list --- flask_app/analyst/count.py | 4 ++-- flask_app/analyst/rank.py | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 12d24a7e..280360a0 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -40,7 +40,7 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): dataset_key=dataset_key, pub_org_key=pub_org_key) except BadRequest as e: - errinfo = {"error": e.description} + errinfo = {"error": [e.description]} else: # Query dataset counts @@ -49,7 +49,7 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): records, errors = cls._get_dataset_counts( good_params["dataset_key"]) except Exception: - errors = {"error": get_traceback()} + errors = {"error": [get_traceback()]} else: cls._add_dataset_names_to_records( records, dataset_key_field="datasetkey", diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index c6dc1021..61868953 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -42,7 +42,7 @@ def rank_counts(cls, count_by, order=None, limit=1): count_by=count_by, order=order, limit=limit) except BadRequest as e: - errinfo = {"error": e.description} + errinfo = {"error": [e.description]} else: # Query for ordered dataset counts @@ -51,7 +51,7 @@ def rank_counts(cls, count_by, order=None, limit=1): good_params["count_by"], good_params["order"], good_params["limit"]) except Exception: - errors = {"error": get_traceback()} + errors = {"error": [get_traceback()]} # Combine errors from success or failure errinfo = combine_errinfo(errinfo, errors) @@ -72,7 +72,7 @@ def _get_ordered_counts(cls, count_by, order, limit): records, errinfo = s3.rank_datasets(count_by, order, limit) except Exception: - errinfo = {"error": get_traceback()} + errinfo = {"error": [get_traceback()]} cls._add_dataset_names_to_records( records, dataset_key_field="datasetkey", @@ -89,10 +89,9 @@ def _get_ordered_counts(cls, count_by, order, limit): AnalystOutput.print_output(response, do_print_rec=True) # print(response) count_by = "species" - order = "ascending" + order = "descending" limit = 5 - response = svc.rank_counts( - count_by, order=order, limit=limit) + response = svc.rank_counts(count_by) AnalystOutput.print_output(response, do_print_rec=True) # print(response) From d552b675eefadbcbec096966b0c485f39de525f2 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 15:59:38 -0500 Subject: [PATCH 47/81] doc --- sphinx/about/install_run_notes.rst | 15 ++++++++++-- sphinx/aws/aws-setup.rst | 37 +++++++++++++++++------------- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/sphinx/about/install_run_notes.rst b/sphinx/about/install_run_notes.rst index 7a7f9d0e..cdff1d87 100644 --- a/sphinx/about/install_run_notes.rst +++ b/sphinx/about/install_run_notes.rst @@ -29,10 +29,10 @@ Install Install dependencies --------------------------------------- -Certbot:: +AWS Client, Certbot:: $ sudo apt update - $ sudo apt install certbot + $ sudo apt install awscli, certbot Install Docker @@ -93,6 +93,12 @@ Docker More info at :ref:`Docker` +AWS Config +================ + +Boto3 getting Error "botocore.exceptions.NoCredentialsError + +Create credentials file on host EC2 instance Test =========================== @@ -153,6 +159,11 @@ needed. 
Dev Environment ========================== +* Base system libraries:: + + sudo apt get update + sudo apt install awscli, certbot, apt install python3.10-venv + * Create a virtual environment and install python libs there:: $ cd ~/git/sp_network diff --git a/sphinx/aws/aws-setup.rst b/sphinx/aws/aws-setup.rst index b367bd92..adf883b7 100644 --- a/sphinx/aws/aws-setup.rst +++ b/sphinx/aws/aws-setup.rst @@ -1,14 +1,34 @@ Authentication #################### +Create an IAM role for the EC2/Redshift/S3 interaction +*********************************************************** + +* Create a Role (Redshift-S3) for service Redshift to read/write to S3 + + * Add a policy allowing read and write access to the specnet S3 bucket + * Step 1: Trusted entity type = AWS service, Use Case = Redshift - Customizable. + + * TODO: change to Redshift - Scheduler when we automate the workflow + + * Step 2: Add permissions + + * AmazonRedshiftAllCommandsFullAccess (AWS managed) + * AmazonS3FullAccess (AWS managed) + EC2 instance creation =========================================================== -* Instance type t3.small (2gb RAM). +* Instance type t3.small + + * Build fails with t2.micro or t3.micro with 1gb RAM + * t3.small is 2gb RAM + * Ubuntu Server 22.04 LTS, SSD Volume Type (free tier eligible), x86 architecture * Security Group: launch-wizard-1 * 30 Gb General Purpose SSD (gp2) * For dev, Spot instance (in Advanced options) +* Modify IAM role - for role created above (i.e. specnet_ec2_role) For programmatic access to S3 =========================================================== @@ -47,21 +67,6 @@ Overview or preserved specimen. This brings the full dataset from about 2.6 billion down to 2.3 billion. -Create an IAM role for the Redshift/S3 interaction -*********************************************************** - -* Create a Role (Redshift-S3) for service Redshift to read/write to S3 - - * Add a policy allowing read and write access to the specnet S3 bucket - * Step 1: Trusted entity type = AWS service, Use Case = Redshift - Customizable. - - * TODO: change to Redshift - Scheduler when we automate the workflow - - * Step 2: Add permissions - - * AmazonRedshiftAllCommandsFullAccess (AWS managed) - * AmazonS3FullAccess (AWS managed) - Create a new workgroup (and namespace) *********************************************************** From 7c5ce78686be1e435c211ac1f53fd1f57dad5c2d Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 16:00:34 -0500 Subject: [PATCH 48/81] enclose error message in list --- sppy/tools/provider/awss3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index da511b1a..d7bfde71 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -181,7 +181,7 @@ def rank_datasets(self, count_by, order, limit, format="JSON"): records, errors = self._query_order_s3_table( self._dataset_counts_path, sort_field, order, limit) except Exception as e: - errors = {"error": get_traceback()} + errors = {"error": [get_traceback()]} return records, errors # ............................................................................. 
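Patches 46 and 48 change each error value from a bare string to a single-element list so that messages collected at different steps can be merged under one severity key without overwriting each other. A minimal sketch of that convention (merge_errinfo below is an illustration only, not the repository's combine_errinfo implementation):

    def merge_errinfo(first, second):
        """Merge two {severity: [messages]} dicts, keeping all messages."""
        merged = {severity: list(msgs) for severity, msgs in first.items()}
        for severity, msgs in second.items():
            merged.setdefault(severity, []).extend(msgs)
        return merged

    # merge_errinfo({"error": ["bad parameter"]}, {"error": ["query failed"]})
    # -> {"error": ["bad parameter", "query failed"]}
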
From c930e622d6ee5269a4eba94593f5eeeed1220fea Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 16:01:15 -0500 Subject: [PATCH 49/81] add dataset name, citation to table; unfinished --- sppy/aws/aws_tools.py | 117 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 1 deletion(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index c01fba23..c775a4f1 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -7,13 +7,16 @@ from botocore.exceptions import ClientError import csv import datetime +from http import HTTPStatus import logging from logging.handlers import RotatingFileHandler import pandas as pd import os +import requests +import xml.etree.ElementTree as ET from sppy.aws.aws_constants import ( - INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, + ENCODING, INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, USER_DATA_TOKEN) @@ -674,4 +677,116 @@ def create_dataframe_from_s3obj( df = pd.read_parquet(s3_uri) return df +# ............................................... +def _get_nested_output_val(output, key_list): + while key_list: + key = key_list[0] + key_list = key_list[1:] + try: + output = output[key] + if not key_list: + val = output + if type(val) is bytes: + val = str(val).encode(ENCODING) + return str(output).encode(ENCODING) + except Exception: + return None + +# ............................................... +def _get_values_for_keys(output, keys): + values = [] + # Get values from JSON response + for key in keys: + if type(key) is list or type(key) is tuple: + val = _get_nested_output_val(output, key) + else: + try: + val = output[key] + except Exception: + val = None + if type(val) is bytes: + val = str(val).encode(ENCODING) + values.append(val) + return values + + +# ............................................... +def _get_api_response_vals(url, keys): + values = [] + output = {} + try: + response = requests.get(url) + except Exception as e: + errmsg = str(e) + else: + try: + status_code = response.status_code + reason = response.reason + except Exception: + status_code = HTTPStatus.INTERNAL_SERVER_ERROR + reason = "Unknown API status_code/reason" + if status_code == HTTPStatus.OK: + # Parse response + try: + output = response.json() + except Exception: + output = response.content + if type(output) is bytes: + output = ET.fromstring(str(output)) + try: + output = ET.parse(output) + except Exception as e: + errmsg = f"Provider error: Invalid JSON response ({output})" + # Get values from JSON response + _get_values_for_keys(output, keys) + return values + +# ............................................... +def get_dataset(dataset_key): + """Return title from one dataset record with this key. + + Args: + dataset_key: GBIF identifier for this dataset + + Returns: + dataset_name: the name of the dataset. + citation: the preferred citation for the dataset. + + Raises: + Exception: on query failure. + """ + url = f"https://api.gbif.org/v1/dataset/{dataset_key}" + title, citation = _get_api_response_vals(url, ["title", ["citation", "text"]]) + return title, citation + +# ---------------------------------------------------- +def create_dataset_name_lookup( + bucket, s3_folders, s3_fname, ds_key_fieldname, datatype="parquet", region=REGION, encoding="utf-8"): + """Read CSV data from S3 into a pandas DataFrame. + Args: + bucket: name of the bucket containing the CSV data. 
+ s3_path: the object name with enclosing S3 bucket folders. + ds_key_fieldname: fieldname of the column with GBIF datasetKey + region: AWS region to query. + datatype: tabular datatype, options are "csv", "parquet" + + Returns: + df: pandas DataFrame containing the CSV data. + """ + lookup_name = "dataset_name_citation" + input_path = f"{s3_folders}/{s3_fname}" + output_path = f"{s3_folders}/{lookup_name}" + ds_table = create_dataframe_from_s3obj( + bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) + ds_names = [] + ds_citations = [] + for rec in ds_table.itertuples(): + title, citation = get_dataset(rec.datasetkey) + ds_names.append(title) + ds_citations.append(citation) + # dataset_name and dataset_citation are the new fieldnames to be assigned + ds_table.assign(dataset_name=ds_names, dataset_citation=ds_citations) + tmp_filename = f"/tmp/{lookup_name}" + ds_table.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=ENCODING) + upload_to_s3(tmp_filename, bucket, output_path, region=region) \ No newline at end of file From 32aa000c32e7e1e9fa9a09d3200146786b326498 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 16:28:07 -0500 Subject: [PATCH 50/81] add pyarrow for parquet support --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ab01b10e..284c4368 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ awscli boto3>=1.34.60 sqlalchemy pandas -pandas-sql \ No newline at end of file +pandas-sql +pyarrow From efd163ca2105f155726679776dcbfdd507bd80a9 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 17:18:37 -0500 Subject: [PATCH 51/81] error doc --- sphinx/aws/aws-setup.rst | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/sphinx/aws/aws-setup.rst b/sphinx/aws/aws-setup.rst index adf883b7..0e1896d9 100644 --- a/sphinx/aws/aws-setup.rst +++ b/sphinx/aws/aws-setup.rst @@ -272,6 +272,8 @@ Enable S3 access from local machine and EC2 Error: SSL *************************************** +First time: + Error message :: SSL validation failed for https://ec2.us-east-1.amazonaws.com/ @@ -285,6 +287,38 @@ Test with:: Fix: Set up to work with Secret containing security key +Second time (in python code): +>>> response = requests.get(url) +Traceback (most recent call last): + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/connectionpool.py", line 703, in urlopen + httplib_response = self._make_request( + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/connectionpool.py", line 386, in _make_request + self._validate_conn(conn) + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/connectionpool.py", line 1042, in _validate_conn + conn.connect() + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/connection.py", line 419, in connect + self.sock = ssl_wrap_socket( + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/util/ssl_.py", line 449, in ssl_wrap_socket + ssl_sock = _ssl_wrap_socket_impl( + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/util/ssl_.py", line 493, in _ssl_wrap_socket_impl + return ssl_context.wrap_socket(sock, server_hostname=server_hostname) + File "/usr/lib/python3.8/ssl.py", line 500, in wrap_socket + return self.sslsocket_class._create( + File "/usr/lib/python3.8/ssl.py", line 1069, in _create + self.do_handshake() + File 
"/usr/lib/python3.8/ssl.py", line 1338, in do_handshake + self._sslobj.do_handshake() +ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1131) + + +https://stackoverflow.com/questions/51925384/unable-to-get-local-issuer-certificate-when-using-requests + +pip install certifi + +import certifi +certifi.where() + + Workflow for Specify Network Analyst pre-computations =========================================================== From 67a30e14621d7fcfc825a504416dabcdc9e75709 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 17:18:50 -0500 Subject: [PATCH 52/81] debugging --- sppy/aws/aws_tools.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index c775a4f1..c8b52ae6 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -789,4 +789,35 @@ def create_dataset_name_lookup( ds_table.assign(dataset_name=ds_names, dataset_citation=ds_citations) tmp_filename = f"/tmp/{lookup_name}" ds_table.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=ENCODING) - upload_to_s3(tmp_filename, bucket, output_path, region=region) \ No newline at end of file + upload_to_s3(tmp_filename, bucket, output_path, region=region) + + +# ............................................................................. +if __name__ == "__main__": + from sppy.aws.aws_tools import * + from sppy.aws.aws_constants import * + + bucket=PROJ_BUCKET + s3_folders="summary" + s3_fname="dataset_counts_2024_02_01_000.parquet" + lookup_name = "dataset_name_citation" + input_path = f"{s3_folders}/{s3_fname}" + output_path = f"{s3_folders}/{lookup_name}" + + ds_table: object = create_dataframe_from_s3obj( + bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) + + i = 0 + for rec in ds_table.itertuples(): + print(i) + print(rec) + i = i + 1 + if i == 5: + break + + dataset_key = rec.datasetkey + + url = f"https://api.gbif.org/v1/dataset/{dataset_key}" + response = requests.get(url) + + From 271cd137db1fca2e4371ada00ad8d45715134afb Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 25 Mar 2024 16:45:51 -0500 Subject: [PATCH 53/81] rm unused pandas-sql --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 284c4368..5b7074f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,5 +8,4 @@ awscli boto3>=1.34.60 sqlalchemy pandas -pandas-sql pyarrow From 8161ef52aaa9236343122a244b42cb222fe4c899 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 25 Mar 2024 16:46:55 -0500 Subject: [PATCH 54/81] create lookup tables for GBIF datasets and publishingOrganizations; untested --- sppy/aws/aws_tools.py | 214 +++++++++++++++++++++++++++++++++++------- 1 file changed, 179 insertions(+), 35 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index c8b52ae6..8e4944a8 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -686,9 +686,9 @@ def _get_nested_output_val(output, key_list): output = output[key] if not key_list: val = output - if type(val) is bytes: - val = str(val).encode(ENCODING) - return str(output).encode(ENCODING) + # if type(val) is bytes: + # val = val.decode(ENCODING) + return val except Exception: return None @@ -704,8 +704,8 @@ def _get_values_for_keys(output, keys): val = output[key] except Exception: val = None - if type(val) is bytes: - val = str(val).encode(ENCODING) + # if type(val) is bytes: + 
# val = val.decode(ENCODING) values.append(val) return values @@ -713,7 +713,6 @@ def _get_values_for_keys(output, keys): # ............................................... def _get_api_response_vals(url, keys): values = [] - output = {} try: response = requests.get(url) except Exception as e: @@ -738,11 +737,11 @@ def _get_api_response_vals(url, keys): except Exception as e: errmsg = f"Provider error: Invalid JSON response ({output})" # Get values from JSON response - _get_values_for_keys(output, keys) + values = _get_values_for_keys(output, keys) return values # ............................................... -def get_dataset(dataset_key): +def get_dataset_name_citation(dataset_key): """Return title from one dataset record with this key. Args: @@ -756,55 +755,168 @@ def get_dataset(dataset_key): Exception: on query failure. """ url = f"https://api.gbif.org/v1/dataset/{dataset_key}" - title, citation = _get_api_response_vals(url, ["title", ["citation", "text"]]) - return title, citation + name, citation = _get_api_response_vals(url, ["title", ["citation", "text"]]) + return name, citation + + +# ............................................... +def _parse_records(ret_records, keys): + small_recs = [] + for rec in ret_records: + values = _get_values_for_keys(rec, keys) + small_recs.append(values) + return small_recs + +# ............................................... +def _get_records(url, keys): + small_recs = [] + is_end = True + try: + response = requests.get(url) + except Exception as e: + errmsg = str(e) + else: + try: + status_code = response.status_code + reason = response.reason + except Exception as e: + status_code = HTTPStatus.INTERNAL_SERVER_ERROR + reason = str(e) + if status_code == HTTPStatus.OK: + # Parse response + try: + output = response.json() + except Exception: + output = response.content + if type(output) is bytes: + output = ET.fromstring(str(output)) + try: + output = ET.parse(output) + except Exception: + reason = f"Provider error: Invalid JSON response ({output})" + # Last query? + try: + is_end = output["endOfRecords"] + except KeyError: + print("Missing endOfRecords flag") + # Get values from JSON response + try: + ret_records = output["results"] + except KeyError: + reason = "No results returned" + else: + small_recs = _parse_records(ret_records, keys) + return small_recs, is_end + + +# ............................................... +def create_dataset_lookup(): + """Return title from one dataset record with this key. + + Returns: + dataframe of records containing GBIF dataset key, title, and citation + + Raises: + Exception: on query failure. + """ + all_recs = [] + is_end = False + keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] + offset = 0 + limit = 100 + while is_end is False: + url = f"https://api.gbif.org/v1/dataset?offset={offset}&limit={limit}" + small_recs, is_end = _get_records(url, keys) + all_recs.append(small_recs) + lookup_df = pd.DataFrame( + all_recs, + columns=["datasetKey", "publishingOrganizationKey", "title", "citation"]) + return lookup_df # ---------------------------------------------------- -def create_dataset_name_lookup( - bucket, s3_folders, s3_fname, ds_key_fieldname, datatype="parquet", region=REGION, encoding="utf-8"): +def create_dataset_lookup( + bucket, s3_folders, lookup_fname, region=REGION, encoding="utf-8"): """Read CSV data from S3 into a pandas DataFrame. Args: bucket: name of the bucket containing the CSV data. - s3_path: the object name with enclosing S3 bucket folders. 
- ds_key_fieldname: fieldname of the column with GBIF datasetKey + s3_folders: S3 bucket folders for output lookup table + lookup_fname: output table for looking up dataset name and citation region: AWS region to query. - datatype: tabular datatype, options are "csv", "parquet" + encoding: encoding of the input data Returns: df: pandas DataFrame containing the CSV data. """ - lookup_name = "dataset_name_citation" - input_path = f"{s3_folders}/{s3_fname}" - output_path = f"{s3_folders}/{lookup_name}" - ds_table = create_dataframe_from_s3obj( - bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) - ds_names = [] - ds_citations = [] - for rec in ds_table.itertuples(): - title, citation = get_dataset(rec.datasetkey) - ds_names.append(title) - ds_citations.append(citation) - # dataset_name and dataset_citation are the new fieldnames to be assigned - ds_table.assign(dataset_name=ds_names, dataset_citation=ds_citations) - tmp_filename = f"/tmp/{lookup_name}" - ds_table.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=ENCODING) + all_recs = [] + is_end = False + keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] + offset = 0 + limit = 100 + while is_end is False: + url = f"https://api.gbif.org/v1/dataset?offset={offset}&limit={limit}" + small_recs, is_end = _get_records(url, keys) + all_recs.append(small_recs) + lookup_df = pd.DataFrame( + all_recs, + columns=["datasetKey", "publishingOrganizationKey", "title", "citation"]) + + output_path = f"{s3_folders}/{lookup_fname}" + tmp_filename = f"/tmp/{lookup_fname}" + # Output data written as CSV + lookup_df.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) upload_to_s3(tmp_filename, bucket, output_path, region=region) - - + + +# ---------------------------------------------------- +def create_puborg_lookup( + bucket, s3_folders, lookup_fname, region=REGION, encoding="utf-8"): + """Read CSV data from S3 into a pandas DataFrame. + + Args: + bucket: name of the bucket containing the CSV data. + s3_folders: S3 bucket folders for output lookup table + lookup_fname: output table for looking up dataset name and citation + region: AWS region to query. + encoding: encoding of the input data + + Returns: + df: pandas DataFrame containing the CSV data. + """ + all_recs = [] + is_end = False + keys = ["key", "title"] + offset = 0 + limit = 100 + while is_end is False: + url = f"https://api.gbif.org/v1/organization?offset={offset}&limit={limit}" + small_recs, is_end = _get_records(url, keys) + all_recs.append(small_recs) + lookup_df = pd.DataFrame(all_recs, columns=["publishingOrganizationKey", "title"]) + + output_path = f"{s3_folders}/{lookup_fname}" + tmp_filename = f"/tmp/{lookup_fname}" + # Output data written as CSV + lookup_df.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) + upload_to_s3(tmp_filename, bucket, output_path, region=region) + # ............................................................................. 
if __name__ == "__main__": from sppy.aws.aws_tools import * from sppy.aws.aws_constants import * + import certifi + + cert = certifi.where() + bucket=PROJ_BUCKET s3_folders="summary" s3_fname="dataset_counts_2024_02_01_000.parquet" - lookup_name = "dataset_name_citation" + lookup_name = "dataset_name_2024_02_01_" input_path = f"{s3_folders}/{s3_fname}" output_path = f"{s3_folders}/{lookup_name}" - ds_table: object = create_dataframe_from_s3obj( + ds_table = create_dataframe_from_s3obj( bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) i = 0 @@ -818,6 +930,38 @@ def create_dataset_name_lookup( dataset_key = rec.datasetkey url = f"https://api.gbif.org/v1/dataset/{dataset_key}" - response = requests.get(url) + # response = requests.get(url) + r = requests.get(url, cert=cert) + +""" +from sppy.aws.aws_tools import * +from sppy.aws.aws_constants import * + +import certifi +cert = certifi.where() + +bucket=PROJ_BUCKET +s3_folders="summary" +s3_fname="dataset_counts_2024_02_01_000.parquet" +lookup_name = "dataset_name_2024_02_01_" +input_path = f"{s3_folders}/{s3_fname}" +output_path = f"{s3_folders}/{lookup_name}" + +ds_table = create_dataframe_from_s3obj( + bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) + +i = 0 +for rec in ds_table.itertuples(): + print(i) + print(rec) + i = i + 1 + if i == 5: + break + +dataset_key = rec.datasetkey +url = f"https://api.gbif.org/v1/dataset/{dataset_key}" +# response = requests.get(url) +r = requests.get(url, cert=cert) +""" From 61178203692737f093d1b150ebf046ae9be8ae4d Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 26 Mar 2024 14:30:45 -0500 Subject: [PATCH 55/81] increment offset in paging loop; cleanup --- sppy/aws/aws_tools.py | 96 +++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 53 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 8e4944a8..3338e041 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -17,7 +17,7 @@ from sppy.aws.aws_constants import ( ENCODING, INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, - PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, + PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, USER_DATA_TOKEN) @@ -814,7 +814,8 @@ def create_dataset_lookup(): """Return title from one dataset record with this key. Returns: - dataframe of records containing GBIF dataset key, title, and citation + dataframe of records containing GBIF dataset key, GBIF publishingOrg key, + dataset title, and dataset citation Raises: Exception: on query failure. @@ -828,13 +829,14 @@ def create_dataset_lookup(): url = f"https://api.gbif.org/v1/dataset?offset={offset}&limit={limit}" small_recs, is_end = _get_records(url, keys) all_recs.append(small_recs) + offset += limit lookup_df = pd.DataFrame( all_recs, columns=["datasetKey", "publishingOrganizationKey", "title", "citation"]) return lookup_df # ---------------------------------------------------- -def create_dataset_lookup( +def create_s3_dataset_lookup( bucket, s3_folders, lookup_fname, region=REGION, encoding="utf-8"): """Read CSV data from S3 into a pandas DataFrame. @@ -845,21 +847,11 @@ def create_dataset_lookup( region: AWS region to query. encoding: encoding of the input data - Returns: - df: pandas DataFrame containing the CSV data. 
+ Postcondition: + CSV table with dataset key, pubOrgKey, dataset name, dataset citation written + to the named S3 object in bucket and folders """ - all_recs = [] - is_end = False - keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] - offset = 0 - limit = 100 - while is_end is False: - url = f"https://api.gbif.org/v1/dataset?offset={offset}&limit={limit}" - small_recs, is_end = _get_records(url, keys) - all_recs.append(small_recs) - lookup_df = pd.DataFrame( - all_recs, - columns=["datasetKey", "publishingOrganizationKey", "title", "citation"]) + lookup_df = create_dataset_lookup() output_path = f"{s3_folders}/{lookup_fname}" tmp_filename = f"/tmp/{lookup_fname}" @@ -869,19 +861,20 @@ def create_dataset_lookup( # ---------------------------------------------------- -def create_puborg_lookup( +def create_s3_puborg_lookup( bucket, s3_folders, lookup_fname, region=REGION, encoding="utf-8"): """Read CSV data from S3 into a pandas DataFrame. Args: bucket: name of the bucket containing the CSV data. s3_folders: S3 bucket folders for output lookup table - lookup_fname: output table for looking up dataset name and citation + lookup_fname: output table for looking up organization name region: AWS region to query. encoding: encoding of the input data - Returns: - df: pandas DataFrame containing the CSV data. + Postcondition: + CSV table with pubOrgKey, pubOrg name written to the named S3 object in + bucket and folders """ all_recs = [] is_end = False @@ -892,6 +885,7 @@ def create_puborg_lookup( url = f"https://api.gbif.org/v1/organization?offset={offset}&limit={limit}" small_recs, is_end = _get_records(url, keys) all_recs.append(small_recs) + offset += limit lookup_df = pd.DataFrame(all_recs, columns=["publishingOrganizationKey", "title"]) output_path = f"{s3_folders}/{lookup_fname}" @@ -900,15 +894,9 @@ def create_puborg_lookup( lookup_df.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) upload_to_s3(tmp_filename, bucket, output_path, region=region) + # ............................................................................. 
if __name__ == "__main__": - from sppy.aws.aws_tools import * - from sppy.aws.aws_constants import * - - import certifi - - cert = certifi.where() - bucket=PROJ_BUCKET s3_folders="summary" s3_fname="dataset_counts_2024_02_01_000.parquet" @@ -930,38 +918,40 @@ def create_puborg_lookup( dataset_key = rec.datasetkey url = f"https://api.gbif.org/v1/dataset/{dataset_key}" - # response = requests.get(url) - r = requests.get(url, cert=cert) + response = requests.get(url) + # import certifi + # cert = certifi.where() + # r = requests.get(url, cert=cert) """ from sppy.aws.aws_tools import * from sppy.aws.aws_constants import * -import certifi -cert = certifi.where() - bucket=PROJ_BUCKET s3_folders="summary" -s3_fname="dataset_counts_2024_02_01_000.parquet" -lookup_name = "dataset_name_2024_02_01_" -input_path = f"{s3_folders}/{s3_fname}" -output_path = f"{s3_folders}/{lookup_name}" - -ds_table = create_dataframe_from_s3obj( - bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) +lookup_fname = "dataset_name_2024_02_01_" + +create_s3_dataset_lookup( + bucket, s3_folders, lookup_fname, region=REGION, encoding="utf-8") + +# ds_table = create_dataframe_from_s3obj( +# bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) +# +# i = 0 +# for rec in ds_table.itertuples(): +# print(i) +# print(rec) +# i = i + 1 +# if i == 5: +# break +# +# dataset_key = rec.datasetkey +# +# url = f"https://api.gbif.org/v1/dataset/{dataset_key}" + +# import certifi +# cert = certifi.where() +# r = requests.get(url, cert=cert) -i = 0 -for rec in ds_table.itertuples(): - print(i) - print(rec) - i = i + 1 - if i == 5: - break - -dataset_key = rec.datasetkey - -url = f"https://api.gbif.org/v1/dataset/{dataset_key}" # response = requests.get(url) -r = requests.get(url, cert=cert) - """ From 4fbfec35708c59419e98861ea3cc6f93fa1b79e9 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 26 Mar 2024 15:41:03 -0500 Subject: [PATCH 56/81] generalize to create S3 lookup table from any API query --- sppy/aws/aws_tools.py | 134 ++++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 70 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 3338e041..3d05fde6 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -809,66 +809,86 @@ def _get_records(url, keys): return small_recs, is_end -# ............................................... -def create_dataset_lookup(): - """Return title from one dataset record with this key. +# ---------------------------------------------------- +def create_s3_lookup( + bucket, s3_folders, base_url, response_keys, output_fname, output_columns, + region=REGION, encoding="utf-8"): + """Query an API, read the data and write a subset to a table in S3. - Returns: - dataframe of records containing GBIF dataset key, GBIF publishingOrg key, - dataset title, and dataset citation + Args: + bucket: name of the bucket containing the CSV data. + s3_folders: S3 bucket folders for output lookup table + base_url: API URL without any key value pairs for the data service + response_keys: list of keys within the API response to pull values from. A key + can be an ordered list of keys nested within several elements of the tree, + from outermost to innermost. + output_columns: list of column headings for output lookup table + output_fname: output table for looking up dataset name and citation + region: AWS region containing the destination bucket. + encoding: encoding of the input data - Raises: - Exception: on query failure. 
+ Postcondition: + CSV table with output_columns and values for each written to the named S3 object + in bucket and folders """ all_recs = [] is_end = False - keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] offset = 0 limit = 100 + while is_end is False: - url = f"https://api.gbif.org/v1/dataset?offset={offset}&limit={limit}" - small_recs, is_end = _get_records(url, keys) + url = f"{base_url}?offset={offset}&limit={limit}" + small_recs, is_end = _get_records(url, response_keys) all_recs.append(small_recs) offset += limit + if offset % 1000 == 0: + print(f"Offset = {offset}") + lookup_df = pd.DataFrame( all_recs, - columns=["datasetKey", "publishingOrganizationKey", "title", "citation"]) - return lookup_df + columns=output_columns) + print(f"Lookup table contains {lookup_df.shape[0]} rows") + + output_path = f"{s3_folders}/{output_fname}" + tmp_filename = f"/tmp/{output_fname}" + # Output data written as CSV + lookup_df.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) + print(f"Wrote {tmp_filename}") + upload_to_s3(tmp_filename, bucket, output_path, region=region) + print(f"Uploaded to s3://{bucket}/{output_path}") + # ---------------------------------------------------- -def create_s3_dataset_lookup( - bucket, s3_folders, lookup_fname, region=REGION, encoding="utf-8"): - """Read CSV data from S3 into a pandas DataFrame. +def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8"): + """Query the GBIF Dataset API, write a subset of the response to a table in S3. Args: bucket: name of the bucket containing the CSV data. s3_folders: S3 bucket folders for output lookup table - lookup_fname: output table for looking up dataset name and citation - region: AWS region to query. + region: AWS region containing the destination bucket. encoding: encoding of the input data Postcondition: CSV table with dataset key, pubOrgKey, dataset name, dataset citation written to the named S3 object in bucket and folders """ - lookup_df = create_dataset_lookup() - - output_path = f"{s3_folders}/{lookup_fname}" - tmp_filename = f"/tmp/{lookup_fname}" - # Output data written as CSV - lookup_df.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) - upload_to_s3(tmp_filename, bucket, output_path, region=region) - + base_url = "https://api.gbif.org/v1/dataset" + response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] + data_date = get_current_datadate_str() + output_fname = f"dataset_name_{data_date}_" + output_fname = "dataset_name_2024_02_01_" + output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] + create_s3_lookup( + bucket, s3_folders, base_url, response_keys, output_fname, output_columns, + region=region, encoding=encoding) # ---------------------------------------------------- -def create_s3_puborg_lookup( - bucket, s3_folders, lookup_fname, region=REGION, encoding="utf-8"): - """Read CSV data from S3 into a pandas DataFrame. +def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="utf-8"): + """Query the GBIF Organization API, write a subset of the response to a table in S3. Args: bucket: name of the bucket containing the CSV data. s3_folders: S3 bucket folders for output lookup table - lookup_fname: output table for looking up organization name region: AWS region to query. 
encoding: encoding of the input data @@ -876,52 +896,27 @@ def create_s3_puborg_lookup( CSV table with pubOrgKey, pubOrg name written to the named S3 object in bucket and folders """ - all_recs = [] - is_end = False - keys = ["key", "title"] - offset = 0 - limit = 100 - while is_end is False: - url = f"https://api.gbif.org/v1/organization?offset={offset}&limit={limit}" - small_recs, is_end = _get_records(url, keys) - all_recs.append(small_recs) - offset += limit - lookup_df = pd.DataFrame(all_recs, columns=["publishingOrganizationKey", "title"]) - - output_path = f"{s3_folders}/{lookup_fname}" - tmp_filename = f"/tmp/{lookup_fname}" - # Output data written as CSV - lookup_df.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) - upload_to_s3(tmp_filename, bucket, output_path, region=region) + base_url = "https://api.gbif.org/v1/dataset" + response_keys = ["key", "title"] + data_date = get_current_datadate_str() + output_fname = f"organization_name_{data_date}_" + output_fname = "organization_name_2024_02_01_" + output_columns = ["publishingOrganizationKey", "title"] + create_s3_lookup( + bucket, s3_folders, base_url, response_keys, output_fname, output_columns, + region=region, encoding=encoding) # ............................................................................. if __name__ == "__main__": bucket=PROJ_BUCKET s3_folders="summary" - s3_fname="dataset_counts_2024_02_01_000.parquet" - lookup_name = "dataset_name_2024_02_01_" - input_path = f"{s3_folders}/{s3_fname}" - output_path = f"{s3_folders}/{lookup_name}" - - ds_table = create_dataframe_from_s3obj( - bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) - i = 0 - for rec in ds_table.itertuples(): - print(i) - print(rec) - i = i + 1 - if i == 5: - break + create_s3_dataset_lookup( + bucket, s3_folders, region=REGION, encoding="utf-8") + create_s3_organization_lookup( + bucket, s3_folders, region=REGION, encoding="utf-8") - dataset_key = rec.datasetkey - - url = f"https://api.gbif.org/v1/dataset/{dataset_key}" - response = requests.get(url) - # import certifi - # cert = certifi.where() - # r = requests.get(url, cert=cert) """ from sppy.aws.aws_tools import * @@ -929,10 +924,9 @@ def create_s3_puborg_lookup( bucket=PROJ_BUCKET s3_folders="summary" -lookup_fname = "dataset_name_2024_02_01_" create_s3_dataset_lookup( - bucket, s3_folders, lookup_fname, region=REGION, encoding="utf-8") + bucket, s3_folders, region=REGION, encoding="utf-8") # ds_table = create_dataframe_from_s3obj( # bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) From 87297e7e8ac4bfdb91241f5c633421009c829aa7 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 26 Mar 2024 15:59:12 -0500 Subject: [PATCH 57/81] add new tables for s3 query --- sppy/tools/provider/awss3.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index d7bfde71..93444a65 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -31,7 +31,8 @@ def __init__( datestr = get_current_datadate_str() datestr = "2024_02_01" self._dataset_counts_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" - self._dataset_lists_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" + self._dataset_lists_path = f"{SUMMARY_FOLDER}/dataset_lists_{datestr}_000.parquet" + self._dataset_names_path = f"{SUMMARY_FOLDER}/dataset_names_{datestr}_000.csv" # ---------------------------------------------------- def _query_table(self, 
s3_path, query_str, format="CSV"): From cc6b1bb21e5e1130118bbee0c507199680f8f8a1 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 13:41:23 -0500 Subject: [PATCH 58/81] rename S3Query class to SpNetAnalyses --- flask_app/analyst/base.py | 15 +-------------- flask_app/analyst/count.py | 4 ++-- flask_app/analyst/rank.py | 4 ++-- 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 8859d8ad..263441d7 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -5,6 +5,7 @@ from flask_app.common.s2n_type import AnalystOutput, APIService from sppy.tools.s2n.utils import get_traceback +from sppy.tools.provider.awss3 import SpNetAnalyses # ............................................................................. @@ -97,20 +98,6 @@ def _standardize_params( return usr_params, errinfo - # ............................................... - @classmethod - def _add_dataset_names_to_records( - cls, records, dataset_key_field="datasetkey", - dataset_name_field="dataset_name"): - pass - # # TODO: change this to a call to an S3 table with all dataset keys/names - # # if import is at top level, causes recursion error in awss3.count_datasets - # from sppy.tools.provider.gbif import GbifAPI - # gbif = GbifAPI(service="dataset") - # for rec in records: - # dataset_name, _ = gbif.get_dataset(rec[dataset_key_field]) - # rec[dataset_name_field] = dataset_name - # ............................................................................. if __name__ == "__main__": diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 280360a0..4dabf0f3 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -6,7 +6,7 @@ from flask_app.analyst.base import _AnalystService from sppy.aws.aws_constants import PROJ_BUCKET -from sppy.tools.provider.awss3 import S3Query +from sppy.tools.provider.awss3 import SpNetAnalyses from sppy.tools.s2n.utils import (combine_errinfo, get_traceback) @@ -86,7 +86,7 @@ def _get_dataset_counts(cls, dataset_key): """ records = [] errors = {} - s3 = S3Query(PROJ_BUCKET) + s3 = SpNetAnalyses(PROJ_BUCKET) try: records = s3.get_dataset_counts(dataset_key) except Exception: diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index 61868953..f0429031 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -6,7 +6,7 @@ from flask_app.analyst.base import _AnalystService from sppy.aws.aws_constants import PROJ_BUCKET -from sppy.tools.provider.awss3 import S3Query +from sppy.tools.provider.awss3 import SpNetAnalyses from sppy.tools.s2n.utils import (combine_errinfo, get_traceback) @@ -67,7 +67,7 @@ def rank_counts(cls, count_by, order=None, limit=1): @classmethod def _get_ordered_counts(cls, count_by, order, limit): records = [] - s3 = S3Query(PROJ_BUCKET) + s3 = SpNetAnalyses(PROJ_BUCKET) try: records, errinfo = s3.rank_datasets(count_by, order, limit) From 8e14e1b21bbe2a889559750098d878b152afaf99 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 13:55:36 -0500 Subject: [PATCH 59/81] structure S3 table metadata into a dictionary in class --- flask_app/analyst/rank.py | 2 +- sppy/tools/provider/awss3.py | 107 +++++++++++++++++++++++++++++------ 2 files changed, 90 insertions(+), 19 deletions(-) diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index f0429031..4aa55b26 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -69,7 +69,7 @@ def _get_ordered_counts(cls, count_by, order, 
limit): records = [] s3 = SpNetAnalyses(PROJ_BUCKET) try: - records, errinfo = s3.rank_datasets(count_by, order, limit) + records, errinfo = s3.rank_dataset_counts(count_by, order, limit) except Exception: errinfo = {"error": [get_traceback()]} diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 93444a65..444345d5 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -10,7 +10,7 @@ # ............................................................................. -class S3Query(): +class SpNetAnalyses(): """Class for retrieving SpecifyNetwork summary data from AWS S3.""" # ............................................... @@ -30,9 +30,30 @@ def __init__( self.exp_type = 'SQL' datestr = get_current_datadate_str() datestr = "2024_02_01" - self._dataset_counts_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" - self._dataset_lists_path = f"{SUMMARY_FOLDER}/dataset_lists_{datestr}_000.parquet" - self._dataset_names_path = f"{SUMMARY_FOLDER}/dataset_names_{datestr}_000.csv" + self._summary_path = "summary" + self._summary_tables = { + "dataset_counts": { + "fname": f"dataset_counts_{datestr}_000.parquet", + "fields": ["datasetkey", "occ_count", "species_count"], + "key": "datasetkey" + }, + "dataset_species_lists": { + "fname": f"dataset_lists_{datestr}_000.parquet", + "fields": ["datasetkey", "taxonkey", "species", "occ_count"], + "key": "datasetkey" + }, + "dataset_meta": { + "fname": f"dataset_names_{datestr}_000.csv", + "fields": [ + "datasetKey", "publishingOrganizationKey", "title", "citation"], + "key": "datasetKey" + }, + "organization_meta": { + "fname": f"organization_names_{datestr}_000.csv", + "fields": ["publishingOrganizationKey", "title"], + "key": "publishingOrganizationKey" + } + } # ---------------------------------------------------- def _query_table(self, s3_path, query_str, format="CSV"): @@ -135,31 +156,75 @@ def get_dataset_counts(self, dataset_key, format="JSON"): Returns: records: empty list or list of 1 record (list) """ + fields = self._summary_tables["dataset_counts"]["fields"] + key_idx = fields.index(self._summary_tables["dataset_counts"]["key"]) + + table_path = \ + f"{self._summary_path}/{self._summary_tables['dataset_counts']['fname']}" query_str = ( - "SELECT datasetkey, occ_count, species_count FROM s3object s " - f"WHERE s.datasetkey = '{dataset_key}'" + f"SELECT * FROM s3object s WHERE s.datasetkey = '{dataset_key}'" ) # Returns empty list or list of 1 record - records = self._query_table(self._dataset_counts_path, query_str, format=format) + records = self._query_table(table_path, query_str, format=format) + self.add_dataset_lookup_vals(records, key_idx=key_idx) return records # ---------------------------------------------------- - def get_org_counts(self, pub_org_key): - """Query S3 for occurrence and species counts for this organization. + def add_dataset_lookup_vals(self, records, key_idx=0, format="JSON"): + """Query the S3 resource for occurrence and species counts for this dataset. Args: - pub_org_key: unique GBIF identifier for organization of interest. + key: unique GBIF identifier for object of interest. + format: output format, options "CSV" or "JSON" Returns: - records: empty list or list of 1 record containing occ_count, species_count - - TODO: implement this? 
+ records: empty list or list of 1 record (list) """ - (occ_count, species_count) = (0,0) - return (occ_count, species_count) + table_path = \ + f"{self._summary_path}/{self._summary_tables['dataset_meta']['fname']}" + fields = self._summary_tables["dataset_meta"]["fields"] + key_fld = fields[0] + new_flds = fields[1:] + qry_flds = " ".join(new_flds) + + for rec in records: + query_str = ( + f"SELECT {qry_flds} FROM s3object s WHERE s.{key_fld} = " + f"'{rec[key_idx]}'" + ) + # Returns empty list or list of 1 record + meta_recs = self._query_table(table_path, query_str, format=format) + try: + meta = meta_recs[0] + except IndexError: + if format == "CSV": + # Add placeholders for empty values + rec.extend(["" for f in new_flds]) + else: + for fld in new_flds: + if format == "JSON": + rec.update(meta) + else: + rec.extend(meta) + return records + + # # ---------------------------------------------------- + # def get_org_counts(self, pub_org_key): + # """Query S3 for occurrence and species counts for this organization. + # + # Args: + # pub_org_key: unique GBIF identifier for organization of interest. + # + # Returns: + # records: empty list or list of 1 record containing occ_count, species_count + # + # TODO: implement this? + # """ + # (occ_count, species_count) = (0,0) + # return (occ_count, species_count) # ---------------------------------------------------- - def rank_datasets(self, count_by, order, limit, format="JSON"): + def rank_dataset_counts(self, count_by, order, limit, format="JSON"): """Return the top or bottom datasets, with counts, ranked by number of species. Args: @@ -174,22 +239,28 @@ def rank_datasets(self, count_by, order, limit, format="JSON"): records: list of limit records containing dataset_key, occ_count, species_count """ records = [] + table_path = \ + f"{self._summary_path}/{self._summary_tables['dataset_counts']['fname']}" + fields = self._summary_tables["dataset_counts"]["fields"] + key_idx = fields.index(self._summary_tables["dataset_counts"]["key"]) if count_by == "species": sort_field = "species_count" else: sort_field = "occ_count" try: records, errors = self._query_order_s3_table( - self._dataset_counts_path, sort_field, order, limit) + table_path, sort_field, order, limit) except Exception as e: errors = {"error": [get_traceback()]} + + self.add_dataset_lookup_vals(records, key_idx=key_idx) return records, errors # ............................................................................. if __name__ == "__main__": format = "JSON" dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" - s3q = S3Query(PROJ_BUCKET) + s3q = SpNetAnalyses(PROJ_BUCKET) recs = s3q.get_dataset_counts(dataset_key, format=format) for r in recs: print(r) From 14bab92a404e77bb942d319824f1b3dfa5ca900f Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 14:39:26 -0500 Subject: [PATCH 60/81] separate create and write dataframe --- sppy/aws/aws_tools.py | 109 +++++++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 43 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 3d05fde6..34c43d85 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -677,20 +677,22 @@ def create_dataframe_from_s3obj( df = pd.read_parquet(s3_uri) return df -# ............................................... 
-def _get_nested_output_val(output, key_list): - while key_list: - key = key_list[0] - key_list = key_list[1:] - try: - output = output[key] - if not key_list: - val = output - # if type(val) is bytes: - # val = val.decode(ENCODING) - return val - except Exception: - return None + +# # ............................................... +# def _get_nested_output_val(output, key_list): +# while key_list: +# key = key_list[0] +# key_list = key_list[1:] +# try: +# output = output[key] +# if not key_list: +# val = output +# # if type(val) is bytes: +# # val = val.decode(ENCODING) +# return val +# except Exception: +# return None + # ............................................... def _get_values_for_keys(output, keys): @@ -698,14 +700,21 @@ def _get_values_for_keys(output, keys): # Get values from JSON response for key in keys: if type(key) is list or type(key) is tuple: - val = _get_nested_output_val(output, key) + key_list = key + while key_list: + key = key_list[0] + key_list = key_list[1:] + try: + output = output[key] + if not key_list: + val = output + except Exception: + val = None else: try: val = output[key] except Exception: val = None - # if type(val) is bytes: - # val = val.decode(ENCODING) values.append(val) return values @@ -810,32 +819,23 @@ def _get_records(url, keys): # ---------------------------------------------------- -def create_s3_lookup( - bucket, s3_folders, base_url, response_keys, output_fname, output_columns, - region=REGION, encoding="utf-8"): +def create_dataframe_from_api(base_url, response_keys, output_columns): """Query an API, read the data and write a subset to a table in S3. Args: - bucket: name of the bucket containing the CSV data. - s3_folders: S3 bucket folders for output lookup table base_url: API URL without any key value pairs for the data service response_keys: list of keys within the API response to pull values from. A key can be an ordered list of keys nested within several elements of the tree, from outermost to innermost. output_columns: list of column headings for output lookup table - output_fname: output table for looking up dataset name and citation - region: AWS region containing the destination bucket. - encoding: encoding of the input data - Postcondition: - CSV table with output_columns and values for each written to the named S3 object - in bucket and folders + Returns: + dataframe: Pandas dataframe with rows of data for the output_columns """ all_recs = [] is_end = False offset = 0 limit = 100 - while is_end is False: url = f"{base_url}?offset={offset}&limit={limit}" small_recs, is_end = _get_records(url, response_keys) @@ -843,16 +843,32 @@ def create_s3_lookup( offset += limit if offset % 1000 == 0: print(f"Offset = {offset}") + dataframe = pd.DataFrame(all_recs, columns=output_columns) + print(f"Lookup table contains {dataframe.shape[0]} rows") + return dataframe + + +# ---------------------------------------------------- +def write_dataframe_to_s3( + dataframe, bucket, s3_folders, output_fname, region=REGION, encoding="utf-8"): + """Query an API, read the data and write a subset to a table in S3. - lookup_df = pd.DataFrame( - all_recs, - columns=output_columns) - print(f"Lookup table contains {lookup_df.shape[0]} rows") + Args: + dataframe: Pandas dataframe with rows of data + bucket: name of the bucket containing the CSV data. + s3_folders: S3 bucket folders for output lookup table + output_fname: output table for looking up dataset name and citation + region: AWS region containing the destination bucket. 
+ encoding: encoding of the input data + Postcondition: + CSV table with output_columns and values for each written to the named S3 object + in bucket and folders + """ output_path = f"{s3_folders}/{output_fname}" tmp_filename = f"/tmp/{output_fname}" # Output data written as CSV - lookup_df.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) + dataframe.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) print(f"Wrote {tmp_filename}") upload_to_s3(tmp_filename, bucket, output_path, region=region) print(f"Uploaded to s3://{bucket}/{output_path}") @@ -878,9 +894,10 @@ def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8" output_fname = f"dataset_name_{data_date}_" output_fname = "dataset_name_2024_02_01_" output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] - create_s3_lookup( - bucket, s3_folders, base_url, response_keys, output_fname, output_columns, - region=region, encoding=encoding) + lookup_df = create_dataframe_from_api(base_url, response_keys, output_columns) + write_dataframe_to_s3( + lookup_df, bucket, s3_folders, output_fname, region=region, encoding=encoding) + # ---------------------------------------------------- def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="utf-8"): @@ -902,25 +919,31 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="u output_fname = f"organization_name_{data_date}_" output_fname = "organization_name_2024_02_01_" output_columns = ["publishingOrganizationKey", "title"] - create_s3_lookup( - bucket, s3_folders, base_url, response_keys, output_fname, output_columns, - region=region, encoding=encoding) + lookup_df = create_dataframe_from_api(base_url, response_keys, output_columns) + write_dataframe_to_s3( + lookup_df, bucket, s3_folders, output_fname, region=region, encoding=encoding) # ............................................................................. 
if __name__ == "__main__": bucket=PROJ_BUCKET + region=REGION + encoding=ENCODING s3_folders="summary" + base_url = "https://api.gbif.org/v1/dataset" + response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] + output_fname = "dataset_name_2024_02_01_" + output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] + create_s3_dataset_lookup( bucket, s3_folders, region=REGION, encoding="utf-8") - create_s3_organization_lookup( - bucket, s3_folders, region=REGION, encoding="utf-8") + # create_s3_organization_lookup( + # bucket, s3_folders, region=REGION, encoding="utf-8") """ from sppy.aws.aws_tools import * -from sppy.aws.aws_constants import * bucket=PROJ_BUCKET s3_folders="summary" From e71bf5abbdcd075051b07fd5629e715eee994739 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 15:01:38 -0500 Subject: [PATCH 61/81] add metadata implemented in SpNetAnalyses class --- flask_app/analyst/count.py | 17 +++-------------- flask_app/analyst/rank.py | 4 ---- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 4dabf0f3..a9c1715f 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -34,7 +34,7 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): if dataset_key is None and pub_org_key is None: return cls.get_endpoint() - allrecs = [] + records = [] try: good_params, errinfo = cls._standardize_params( dataset_key=dataset_key, pub_org_key=pub_org_key) @@ -50,25 +50,14 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): good_params["dataset_key"]) except Exception: errors = {"error": [get_traceback()]} - else: - cls._add_dataset_names_to_records( - records, dataset_key_field="datasetkey", - dataset_name_field="dataset_name") - if records: - allrecs.append(records) - # Combine errors from success or failure - errinfo = combine_errinfo(errinfo, errors) - # Query organization counts - if good_params["pub_org_key"] is not None: - errors = { - "warning": "Count by Publishing Organization is not implemented"} + # Combine errors from success or failure errinfo = combine_errinfo(errinfo, errors) # Assemble full_out = AnalystOutput( cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], - records=allrecs, errors=errinfo) + records=records, errors=errinfo) return full_out.response diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index 4aa55b26..02f5ae94 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -74,10 +74,6 @@ def _get_ordered_counts(cls, count_by, order, limit): except Exception: errinfo = {"error": [get_traceback()]} - cls._add_dataset_names_to_records( - records, dataset_key_field="datasetkey", - dataset_name_field="dataset_name") - return records, errinfo # ............................................................................. 
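The aws_tools.py hunks above split the GBIF lookup-table build into a paginated API
read (create_dataframe_from_api) and a separate S3 write (write_dataframe_to_s3).
A minimal sketch of that offset/limit paging pattern, assuming a GBIF-style paged
response carrying "offset", "limit", "endOfRecords", and "results", with a citation
object holding a "text" field; the helper name fetch_dataset_lookup, the max_pages
guard, and the column subset are illustrative, not part of the patch::

    import pandas as pd
    import requests

    def fetch_dataset_lookup(base_url="https://api.gbif.org/v1/dataset",
                             limit=100, max_pages=None):
        """Page through a GBIF-style API until endOfRecords is True."""
        records, offset, pages = [], 0, 0
        while True:
            resp = requests.get(base_url, params={"offset": offset, "limit": limit})
            resp.raise_for_status()
            payload = resp.json()
            for rec in payload.get("results", []):
                # Keep only the fields needed for the lookup table.
                records.append({
                    "datasetKey": rec.get("key"),
                    "publishingOrganizationKey": rec.get("publishingOrganizationKey"),
                    "title": rec.get("title"),
                    "citation": (rec.get("citation") or {}).get("text"),
                })
            offset += limit
            pages += 1
            if payload.get("endOfRecords", True) or (max_pages and pages >= max_pages):
                break
        return pd.DataFrame(records)

    # Example: a small two-page test pull before running a full harvest.
    # lookup_df = fetch_dataset_lookup(max_pages=2)

Reading everything into a single dataframe is fine for small tests; the later patches
in this series switch to chunked CSV writes precisely because the full dataset listing
does not fit comfortably in memory on the EC2 instance.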
From e664e437d61b1250313cb67589c0880c5e9bcff7 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 15:02:06 -0500 Subject: [PATCH 62/81] testing --- sppy/aws/aws_tools.py | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 34c43d85..80dc3070 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -946,29 +946,15 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="u from sppy.aws.aws_tools import * bucket=PROJ_BUCKET +region=REGION +encoding=ENCODING s3_folders="summary" -create_s3_dataset_lookup( - bucket, s3_folders, region=REGION, encoding="utf-8") +base_url = "https://api.gbif.org/v1/dataset" +response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] +output_fname = "dataset_name_2024_02_01_" +output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] -# ds_table = create_dataframe_from_s3obj( -# bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) -# -# i = 0 -# for rec in ds_table.itertuples(): -# print(i) -# print(rec) -# i = i + 1 -# if i == 5: -# break -# -# dataset_key = rec.datasetkey -# -# url = f"https://api.gbif.org/v1/dataset/{dataset_key}" - -# import certifi -# cert = certifi.where() -# r = requests.get(url, cert=cert) - -# response = requests.get(url) +create_s3_dataset_lookup( + bucket, s3_folders, region=REGION, encoding="utf-8") """ From 27436750c08f5364fea7eb8c45490798585ec8f8 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 15:17:23 -0500 Subject: [PATCH 63/81] rename awss3 module to reflect provider of SpNetwork data --- flask_app/analyst/base.py | 2 +- flask_app/analyst/count.py | 2 +- flask_app/analyst/rank.py | 2 +- sppy/tools/provider/{awss3.py => spnet.py} | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename sppy/tools/provider/{awss3.py => spnet.py} (100%) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 263441d7..7a606862 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -5,7 +5,7 @@ from flask_app.common.s2n_type import AnalystOutput, APIService from sppy.tools.s2n.utils import get_traceback -from sppy.tools.provider.awss3 import SpNetAnalyses +from sppy.tools.provider.spnet import SpNetAnalyses # ............................................................................. 
diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index a9c1715f..ab09d232 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -6,7 +6,7 @@ from flask_app.analyst.base import _AnalystService from sppy.aws.aws_constants import PROJ_BUCKET -from sppy.tools.provider.awss3 import SpNetAnalyses +from sppy.tools.provider.spnet import SpNetAnalyses from sppy.tools.s2n.utils import (combine_errinfo, get_traceback) diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index 02f5ae94..83571fef 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -6,7 +6,7 @@ from flask_app.analyst.base import _AnalystService from sppy.aws.aws_constants import PROJ_BUCKET -from sppy.tools.provider.awss3 import SpNetAnalyses +from sppy.tools.provider.spnet import SpNetAnalyses from sppy.tools.s2n.utils import (combine_errinfo, get_traceback) diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/spnet.py similarity index 100% rename from sppy/tools/provider/awss3.py rename to sppy/tools/provider/spnet.py From 897efd769995cb4534cfb6076ce0cd6475ecdf6c Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 16:19:19 -0500 Subject: [PATCH 64/81] chg defaults --- flask_app/analyst/rank.py | 2 +- flask_app/common/s2n_type.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index 83571fef..2eeafa92 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -18,7 +18,7 @@ class RankSvc(_AnalystService): # ............................................... @classmethod - def rank_counts(cls, count_by, order=None, limit=1): + def rank_counts(cls, count_by, order=None, limit=10): """Return occurrence and species counts for dataset/organization identifiers. Args: diff --git a/flask_app/common/s2n_type.py b/flask_app/common/s2n_type.py index b11efa56..f5e6721e 100644 --- a/flask_app/common/s2n_type.py +++ b/flask_app/common/s2n_type.py @@ -193,7 +193,7 @@ class APIService: "order": { "type": "", "options": ["ascending", "descending"], - "default": None + "default": "descending" }, "limit": {"type": 2, "default": 10, "min": 1, "max": 500}, }, From c308bca73982058075c0a0e129dae82e0f9497c7 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 16:20:38 -0500 Subject: [PATCH 65/81] write csv files locally to reduce memory requirements, before combining and writing to S3 --- sppy/aws/aws_tools.py | 102 +++++++++++++++++++++++++++++++++++------- 1 file changed, 87 insertions(+), 15 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 80dc3070..3511552f 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -835,19 +835,95 @@ def create_dataframe_from_api(base_url, response_keys, output_columns): all_recs = [] is_end = False offset = 0 - limit = 100 + limit = 1000 while is_end is False: url = f"{base_url}?offset={offset}&limit={limit}" small_recs, is_end = _get_records(url, response_keys) all_recs.append(small_recs) offset += limit - if offset % 1000 == 0: + if offset % 5000 == 0: print(f"Offset = {offset}") dataframe = pd.DataFrame(all_recs, columns=output_columns) print(f"Lookup table contains {dataframe.shape[0]} rows") return dataframe +# ---------------------------------------------------- +def create_csvfiles_from_api(base_url, response_keys, output_columns, output_fname): + """Query an API, read the data and write a subset to a table in S3. 
+ + Args: + base_url: API URL without any key value pairs for the data service + response_keys: list of keys within the API response to pull values from. A key + can be an ordered list of keys nested within several elements of the tree, + from outermost to innermost. + output_columns: list of column headings for output lookup table + output_fname: base output filename for temporary CSV files + + Returns: + csv_files: Local CSV files with records. The first file in the list will have + a header, the rest will not. + """ + csv_files = [] + records = [] + is_end = False + offset = 0 + read_limit = 1000 + write_limit = 5000 + write_header = True + while is_end is False: + url = f"{base_url}?offset={offset}&limit={read_limit}" + small_recs, is_end = _get_records(url, response_keys) + records.append(small_recs) + offset += read_limit + # Write to tempfile every 5000 + if offset % write_limit == 0: + print(f"Offset = {offset}") + dataframe = pd.DataFrame(records, columns=output_columns) + tmp_filename = f"/tmp/{output_fname}_{offset}_" + dataframe.to_csv( + path_or_buf=tmp_filename, sep='\t', header=write_header, + encoding=encoding) + # Only write header to first file, others will be appended + write_header = False + csv_files.append(tmp_filename) + print(f"Wrote {tmp_filename} with {dataframe.shape[0]} rows") + # reset records in memory + records = [] + return csv_files + + +# ---------------------------------------------------- +def write_csvfiles_to_s3( + csv_fnames, bucket, s3_folders, output_fname, region=REGION, encoding="utf-8"): + """Query an API, read the data and write a subset to a table in S3. + + Args: + csvfiles: input CSV files for S3 table. The first file in the list will have + a header, the rest will not. + bucket: name of the bucket containing the CSV data. + s3_folders: S3 bucket folders for output lookup table + output_fname: output table for looking up dataset name and citation + region: AWS region containing the destination bucket. + encoding: encoding of the input data + + Postcondition: + CSV table with output_columns and values for each written to the named S3 object + in bucket and folders + """ + output_path = f"{s3_folders}/{output_fname}" + combined_fname = f"/tmp/{output_fname}" + with open(combined_fname, "a") as outf: + # Output data written as CSV + for fname in csv_fnames: + with open(fname, "r") as inf: + data = inf.read() + outf.write(data) + print(f"Wrote {combined_fname}") + upload_to_s3(combined_fname, bucket, output_path, region=region) + print(f"Uploaded to s3://{bucket}/{output_path}") + + # ---------------------------------------------------- def write_dataframe_to_s3( dataframe, bucket, s3_folders, output_fname, region=REGION, encoding="utf-8"): @@ -884,6 +960,11 @@ def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8" region: AWS region containing the destination bucket. encoding: encoding of the input data + Note: + There are >100k records for datasets and limited memory on this EC2 instance, + so we write them as temporary CSV files, then combine them, then create a + dataframe and upload. 
+ Postcondition: CSV table with dataset key, pubOrgKey, dataset name, dataset citation written to the named S3 object in bucket and folders @@ -894,9 +975,10 @@ def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8" output_fname = f"dataset_name_{data_date}_" output_fname = "dataset_name_2024_02_01_" output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] - lookup_df = create_dataframe_from_api(base_url, response_keys, output_columns) - write_dataframe_to_s3( - lookup_df, bucket, s3_folders, output_fname, region=region, encoding=encoding) + csv_fnames = create_csvfiles_from_api( + base_url, response_keys, output_columns, output_fname) + write_csvfiles_to_s3( + csv_fnames, bucket, s3_folders, output_fname, region=region, encoding=encoding) # ---------------------------------------------------- @@ -931,11 +1013,6 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="u encoding=ENCODING s3_folders="summary" - base_url = "https://api.gbif.org/v1/dataset" - response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] - output_fname = "dataset_name_2024_02_01_" - output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] - create_s3_dataset_lookup( bucket, s3_folders, region=REGION, encoding="utf-8") # create_s3_organization_lookup( @@ -950,11 +1027,6 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="u encoding=ENCODING s3_folders="summary" -base_url = "https://api.gbif.org/v1/dataset" -response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] -output_fname = "dataset_name_2024_02_01_" -output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] - create_s3_dataset_lookup( bucket, s3_folders, region=REGION, encoding="utf-8") """ From f1d0dcfcb597ff0fac808c09045a95f03fb070d9 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 16:21:19 -0500 Subject: [PATCH 66/81] comment out metadata add until tables are populated --- sppy/tools/provider/spnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sppy/tools/provider/spnet.py b/sppy/tools/provider/spnet.py index 444345d5..4a213153 100644 --- a/sppy/tools/provider/spnet.py +++ b/sppy/tools/provider/spnet.py @@ -166,7 +166,7 @@ def get_dataset_counts(self, dataset_key, format="JSON"): ) # Returns empty list or list of 1 record records = self._query_table(table_path, query_str, format=format) - self.add_dataset_lookup_vals(records, key_idx=key_idx) + # self.add_dataset_lookup_vals(records, key_idx=key_idx) return records # ---------------------------------------------------- @@ -253,7 +253,7 @@ def rank_dataset_counts(self, count_by, order, limit, format="JSON"): except Exception as e: errors = {"error": [get_traceback()]} - self.add_dataset_lookup_vals(records, key_idx=key_idx) + # self.add_dataset_lookup_vals(records, key_idx=key_idx) return records, errors # ............................................................................. 
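The create_csvfiles_from_api / write_csvfiles_to_s3 additions above write paged
results to local CSV chunks (only the first chunk carries a header) and then
concatenate them for a single S3 upload. A minimal sketch of that combine-and-upload
step, assuming boto3 credentials are already configured; the function name, bucket,
key, and region defaults below are illustrative, and upload_to_s3 in the patch is
assumed to wrap a call like boto3's upload_file::

    import boto3

    def combine_and_upload(chunk_paths, bucket, key, region="us-east-1"):
        """Concatenate CSV chunks in order and upload the result to S3 once."""
        combined = "/tmp/combined_lookup.csv"
        with open(combined, "w", encoding="utf-8") as out:
            for path in chunk_paths:
                with open(path, "r", encoding="utf-8") as chunk:
                    out.write(chunk.read())
        # Upload the combined file once; the individual chunks never go to S3.
        s3_client = boto3.client("s3", region_name=region)
        s3_client.upload_file(combined, bucket, key)
        return f"s3://{bucket}/{key}"

    # Example (illustrative names):
    # combine_and_upload(
    #     ["/tmp/dataset_meta_5000.csv", "/tmp/dataset_meta_10000.csv"],
    #     "my-bucket", "summary/dataset_meta_2024_02_01.csv")

Writing chunks to disk and uploading one combined object keeps memory use bounded and
leaves a single lookup table per date in the summary folder.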
From d677746583d011f258646b16b4dbc9d0e8763e58 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 16:26:11 -0500 Subject: [PATCH 67/81] extend record list --- sppy/aws/aws_tools.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 3511552f..5338cf15 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -839,7 +839,7 @@ def create_dataframe_from_api(base_url, response_keys, output_columns): while is_end is False: url = f"{base_url}?offset={offset}&limit={limit}" small_recs, is_end = _get_records(url, response_keys) - all_recs.append(small_recs) + all_recs.extend(small_recs) offset += limit if offset % 5000 == 0: print(f"Offset = {offset}") @@ -874,7 +874,7 @@ def create_csvfiles_from_api(base_url, response_keys, output_columns, output_fna while is_end is False: url = f"{base_url}?offset={offset}&limit={read_limit}" small_recs, is_end = _get_records(url, response_keys) - records.append(small_recs) + records.extend(small_recs) offset += read_limit # Write to tempfile every 5000 if offset % write_limit == 0: From d6073dd8f4c9527207dcc8a91b592491c1166bd9 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 17:06:53 -0500 Subject: [PATCH 68/81] bugfix --- sppy/aws/aws_tools.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 5338cf15..2d3c7873 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -849,7 +849,8 @@ def create_dataframe_from_api(base_url, response_keys, output_columns): # ---------------------------------------------------- -def create_csvfiles_from_api(base_url, response_keys, output_columns, output_fname): +def create_csvfiles_from_api( + base_url, response_keys, output_columns, output_fname, encoding="utf-8"): """Query an API, read the data and write a subset to a table in S3. Args: @@ -859,6 +860,7 @@ def create_csvfiles_from_api(base_url, response_keys, output_columns, output_fna from outermost to innermost. output_columns: list of column headings for output lookup table output_fname: base output filename for temporary CSV files + encoding: encoding of the input data Returns: csv_files: Local CSV files with records. 
The first file in the list will have @@ -880,7 +882,7 @@ def create_csvfiles_from_api(base_url, response_keys, output_columns, output_fna if offset % write_limit == 0: print(f"Offset = {offset}") dataframe = pd.DataFrame(records, columns=output_columns) - tmp_filename = f"/tmp/{output_fname}_{offset}_" + tmp_filename = f"/tmp/{output_fname}{offset}.csv" dataframe.to_csv( path_or_buf=tmp_filename, sep='\t', header=write_header, encoding=encoding) From e54833bc1b6455d99665e07dd5ab21e2469986d0 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 17:07:07 -0500 Subject: [PATCH 69/81] rm unused pyarrow --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5b7074f5..6687eff1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,3 @@ awscli boto3>=1.34.60 sqlalchemy pandas -pyarrow From b3c53e984fd1217c70a26bcf9daecff0df667a24 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 28 Mar 2024 12:07:19 -0500 Subject: [PATCH 70/81] testing for proper csv to s3 formatting --- sppy/aws/aws_tools.py | 111 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 108 insertions(+), 3 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 2d3c7873..5508cefa 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -776,10 +776,44 @@ def _parse_records(ret_records, keys): small_recs.append(values) return small_recs +# ............................................... +def _get_single_record(url, keys): + rec = None + try: + response = requests.get(url) + except Exception as e: + errmsg = str(e) + else: + try: + status_code = response.status_code + reason = response.reason + except Exception as e: + status_code = HTTPStatus.INTERNAL_SERVER_ERROR + reason = str(e) + if status_code == HTTPStatus.OK: + # Parse response + try: + output = response.json() + except Exception: + output = response.content + if type(output) is bytes: + output = ET.fromstring(str(output)) + try: + output = ET.parse(output) + except Exception: + reason = f"Provider error: Invalid JSON response ({output})" + else: + # Output is only one record + small_recs = _parse_records([output], keys) + try: + rec = small_recs[0] + except Exception as e: + print(f"Error: no output record ({e})") + return rec + + # ............................................... def _get_records(url, keys): - small_recs = [] - is_end = True try: response = requests.get(url) except Exception as e: @@ -817,7 +851,6 @@ def _get_records(url, keys): small_recs = _parse_records(ret_records, keys) return small_recs, is_end - # ---------------------------------------------------- def create_dataframe_from_api(base_url, response_keys, output_columns): """Query an API, read the data and write a subset to a table in S3. @@ -885,6 +918,7 @@ def create_csvfiles_from_api( tmp_filename = f"/tmp/{output_fname}{offset}.csv" dataframe.to_csv( path_or_buf=tmp_filename, sep='\t', header=write_header, + columns=output_columns, doublequote=False, escapechar="\\", encoding=encoding) # Only write header to first file, others will be appended write_header = False @@ -1008,12 +1042,81 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="u lookup_df, bucket, s3_folders, output_fname, region=region, encoding=encoding) +# ---------------------------------------------------- +def create_csvfile_from_api( + base_url, keys, response_keys, output_columns, output_fname, encoding="utf-8"): + """Query an API, read the data and write a subset to a table in S3. 
+ + Args: + base_url: API URL without any key value pairs for the data service + keys: unique identifiers to query the API for + response_keys: list of keys within the API response to pull values from. A key + can be an ordered list of keys nested within several elements of the tree, + from outermost to innermost. + output_columns: list of column headings for output lookup table + output_fname: base output filename for temporary CSV files + encoding: encoding of the input data + + Returns: + csv_files: Local CSV files with records. The first file in the list will have + a header, the rest will not. + """ + records = [] + for key in keys: + url = f"{base_url}/{key}" + rec = _get_single_record(url, response_keys) + records.append(rec) + dataframe = pd.DataFrame(records, columns=output_columns) + tmp_filename = f"/tmp/{output_fname}.csv" + dataframe.to_csv( + path_or_buf=tmp_filename, sep='\t', header=True, + columns=output_columns, doublequote=False, escapechar="\\", + encoding=encoding) + print(f"Wrote {tmp_filename} with {dataframe.shape[0]} rows") + return tmp_filename + +# ---------------------------------------------------- +def create_test_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8"): + """Query the GBIF Dataset API, write a subset of the response to a table in S3. + + Args: + bucket: name of the bucket containing the CSV data. + s3_folders: S3 bucket folders for output lookup table + region: AWS region containing the destination bucket. + encoding: encoding of the input data + + Note: + There are >100k records for datasets and limited memory on this EC2 instance, + so we write them as temporary CSV files, then combine them, then create a + dataframe and upload. + + Postcondition: + CSV table with dataset key, pubOrgKey, dataset name, dataset citation written + to the named S3 object in bucket and folders + """ + base_url = "https://api.gbif.org/v1/dataset" + response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] + data_date = get_current_datadate_str() + output_fname = f"dataset_name_{data_date}_" + output_fname = "dataset_name_test_2024_02_01_" + output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] + csv_fname = create_csvfile_from_api( + base_url, keys, response_keys, output_columns, output_fname, encoding="utf-8") + write_csvfiles_to_s3( + [csv_fname], bucket, s3_folders, output_fname, region=region, encoding=encoding) + # ............................................................................. 
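# ----------------------------------------------------
# Illustrative sketch only: the helper below mirrors the to_csv settings used in the
# functions above (tab delimiter, doublequote=False, escapechar="\\"). The function
# name and the sample record are made up; the sketch simply shows what a title with
# embedded quotes looks like after writing with those settings.
def _illustrate_csv_quote_settings(tmp_path="/tmp/quote_check.csv", encoding="utf-8"):
    import pandas as pd  # already imported at module level as pd
    df = pd.DataFrame(
        [{"datasetKey": "k1", "title": 'Herbarium "test" records'}])
    df.to_csv(
        path_or_buf=tmp_path, sep="\t", header=True, doublequote=False,
        escapechar="\\", encoding=encoding)
    with open(tmp_path, "r", encoding=encoding) as f:
        return f.read()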
if __name__ == "__main__": bucket=PROJ_BUCKET region=REGION encoding=ENCODING s3_folders="summary" + keys = [ + "5a95fa0a-5ef3-432a-b95b-816cd85b2f9b", + "ee789ae4-ef51-4ff2-931b-bc61b2dbe40e", + "c8fded56-3ddb-4e26-8863-ba8d55862689", + "3c83d5da-822a-439c-897a-7569e82c4ebc" + ] create_s3_dataset_lookup( bucket, s3_folders, region=REGION, encoding="utf-8") @@ -1022,6 +1125,8 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="u """ +# Note: Test with quoted data such as: +# http://api.gbif.org/v1/dataset/3c83d5da-822a-439c-897a-7569e82c4ebc from sppy.aws.aws_tools import * bucket=PROJ_BUCKET From 00d4cef09e99a6afee491cb9c919c204ace9ba47 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 28 Mar 2024 12:07:27 -0500 Subject: [PATCH 71/81] doc --- sphinx/about/install_run_notes.rst | 4 ++-- sphinx/misc/docker.rst | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/sphinx/about/install_run_notes.rst b/sphinx/about/install_run_notes.rst index cdff1d87..e8106037 100644 --- a/sphinx/about/install_run_notes.rst +++ b/sphinx/about/install_run_notes.rst @@ -199,8 +199,8 @@ Reset the FLASK_APP variable to test an alternate resource:: * Test with http, no https!! - http://localhost:5000/api/v1/name?namestr=Notemigonus%20crysoleucas%20(Mitchill,%201814) - http://localhost:5000/api/v1/occ?occid=01493b05-4310-4f28-9d81-ad20860311f3 + http://broker.localhost:5000/api/v1/name?namestr=Notemigonus%20crysoleucas%20(Mitchill,%201814) + http://broker.localhost:5000/api/v1/occ?occid=01493b05-4310-4f28-9d81-ad20860311f3 Troubleshooting ====================================== diff --git a/sphinx/misc/docker.rst b/sphinx/misc/docker.rst index 074bb1e4..fd91fc8f 100644 --- a/sphinx/misc/docker.rst +++ b/sphinx/misc/docker.rst @@ -75,6 +75,19 @@ all docker containers, shut down httpd, bring up docker. sudo systemctl stop httpd sudo docker compose up -d +Run Docker on OSX +================================= + +Need to bind server to 0.0.0.0 instead of 127.0.0.1 + +Test by getting internal IP, using ifconfig, then command to see if connects successfully:: + + nc -v x.x.x.x 443 + +Then can use same IP in browser, i.e. https://x.x.x.x/api/v1/name/ +This only exposes the broker, not the analyst services. 
+ + Troubleshooting ================================= From 4af2758e9141a9400690e69ba8f5d74363908f74 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 28 Mar 2024 14:56:18 -0500 Subject: [PATCH 72/81] testing for csv quoting, escapechars --- sppy/aws/aws_tools.py | 99 +++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 46 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 5508cefa..ab65ebba 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -614,7 +614,7 @@ def get_logger(log_name, log_dir=None, log_level=logging.INFO): # create file handler handler = RotatingFileHandler( filename, mode="w", maxBytes=LOGFILE_MAX_BYTES, backupCount=10, - encoding="utf-8" + encoding=ENCODING ) formatter = logging.Formatter(LOG_FORMAT, LOG_DATE_FORMAT) handler.setLevel(log_level) @@ -643,14 +643,14 @@ def create_dataframe_from_gbifcsv_s3_bucket(bucket, csv_path, region=REGION): s3_client = boto3.client("s3", region_name=region) s3_obj = s3_client.get_object(Bucket=bucket, Key=csv_path) df = pd.read_csv( - s3_obj["Body"], delimiter="\t", encoding="utf-8", low_memory=False, + s3_obj["Body"], delimiter="\t", encoding=ENCODING, low_memory=False, quoting=csv.QUOTE_NONE) return df # ---------------------------------------------------- def create_dataframe_from_s3obj( - bucket, s3_path, datatype="parquet", region=REGION, encoding="utf-8"): + bucket, s3_path, datatype="parquet", region=REGION, encoding=ENCODING): """Read CSV data from S3 into a pandas DataFrame. Args: @@ -678,22 +678,6 @@ def create_dataframe_from_s3obj( return df -# # ............................................... -# def _get_nested_output_val(output, key_list): -# while key_list: -# key = key_list[0] -# key_list = key_list[1:] -# try: -# output = output[key] -# if not key_list: -# val = output -# # if type(val) is bytes: -# # val = val.decode(ENCODING) -# return val -# except Exception: -# return None - - # ............................................... def _get_values_for_keys(output, keys): values = [] @@ -801,14 +785,15 @@ def _get_single_record(url, keys): try: output = ET.parse(output) except Exception: + output = None reason = f"Provider error: Invalid JSON response ({output})" - else: - # Output is only one record - small_recs = _parse_records([output], keys) - try: - rec = small_recs[0] - except Exception as e: - print(f"Error: no output record ({e})") + if output: + # Output is only one record + small_recs = _parse_records([output], keys) + try: + rec = small_recs[0] + except Exception as e: + print(f"Error: no output record ({e})") return rec @@ -883,7 +868,7 @@ def create_dataframe_from_api(base_url, response_keys, output_columns): # ---------------------------------------------------- def create_csvfiles_from_api( - base_url, response_keys, output_columns, output_fname, encoding="utf-8"): + base_url, response_keys, output_columns, output_fname, encoding=ENCODING): """Query an API, read the data and write a subset to a table in S3. 
Args: @@ -915,7 +900,7 @@ def create_csvfiles_from_api( if offset % write_limit == 0: print(f"Offset = {offset}") dataframe = pd.DataFrame(records, columns=output_columns) - tmp_filename = f"/tmp/{output_fname}{offset}.csv" + tmp_filename = f"/tmp/{output_fname}_{offset}.csv" dataframe.to_csv( path_or_buf=tmp_filename, sep='\t', header=write_header, columns=output_columns, doublequote=False, escapechar="\\", @@ -931,7 +916,7 @@ def create_csvfiles_from_api( # ---------------------------------------------------- def write_csvfiles_to_s3( - csv_fnames, bucket, s3_folders, output_fname, region=REGION, encoding="utf-8"): + csv_fnames, bucket, s3_folders, output_fname, region=REGION, encoding=ENCODING): """Query an API, read the data and write a subset to a table in S3. Args: @@ -948,7 +933,7 @@ def write_csvfiles_to_s3( in bucket and folders """ output_path = f"{s3_folders}/{output_fname}" - combined_fname = f"/tmp/{output_fname}" + combined_fname = f"/tmp/{output_fname}.csv" with open(combined_fname, "a") as outf: # Output data written as CSV for fname in csv_fnames: @@ -962,7 +947,7 @@ def write_csvfiles_to_s3( # ---------------------------------------------------- def write_dataframe_to_s3( - dataframe, bucket, s3_folders, output_fname, region=REGION, encoding="utf-8"): + dataframe, bucket, s3_folders, output_fname, region=REGION, encoding=ENCODING): """Query an API, read the data and write a subset to a table in S3. Args: @@ -980,14 +965,16 @@ def write_dataframe_to_s3( output_path = f"{s3_folders}/{output_fname}" tmp_filename = f"/tmp/{output_fname}" # Output data written as CSV - dataframe.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) + dataframe.to_csv( + path_or_buf=tmp_filename, sep='\t', header=True, doublequote=False, + escapechar="\\", encoding=encoding) print(f"Wrote {tmp_filename}") upload_to_s3(tmp_filename, bucket, output_path, region=region) print(f"Uploaded to s3://{bucket}/{output_path}") # ---------------------------------------------------- -def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8"): +def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding=ENCODING): """Query the GBIF Dataset API, write a subset of the response to a table in S3. Args: @@ -1008,8 +995,8 @@ def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8" base_url = "https://api.gbif.org/v1/dataset" response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] data_date = get_current_datadate_str() - output_fname = f"dataset_name_{data_date}_" - output_fname = "dataset_name_2024_02_01_" + output_fname = f"dataset_name_{data_date}" + output_fname = "dataset_name_2024_02_01" output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] csv_fnames = create_csvfiles_from_api( base_url, response_keys, output_columns, output_fname) @@ -1018,7 +1005,7 @@ def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8" # ---------------------------------------------------- -def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="utf-8"): +def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding=ENCODING): """Query the GBIF Organization API, write a subset of the response to a table in S3. 
Args: @@ -1034,8 +1021,8 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="u base_url = "https://api.gbif.org/v1/dataset" response_keys = ["key", "title"] data_date = get_current_datadate_str() - output_fname = f"organization_name_{data_date}_" - output_fname = "organization_name_2024_02_01_" + output_fname = f"organization_name_{data_date}" + output_fname = "organization_name_2024_02_01" output_columns = ["publishingOrganizationKey", "title"] lookup_df = create_dataframe_from_api(base_url, response_keys, output_columns) write_dataframe_to_s3( @@ -1044,7 +1031,7 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="u # ---------------------------------------------------- def create_csvfile_from_api( - base_url, keys, response_keys, output_columns, output_fname, encoding="utf-8"): + base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING): """Query an API, read the data and write a subset to a table in S3. Args: @@ -1076,12 +1063,13 @@ def create_csvfile_from_api( return tmp_filename # ---------------------------------------------------- -def create_test_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8"): +def create_test_s3_dataset_lookup(bucket, s3_folders, keys, region=REGION, encoding=ENCODING): """Query the GBIF Dataset API, write a subset of the response to a table in S3. Args: bucket: name of the bucket containing the CSV data. s3_folders: S3 bucket folders for output lookup table + keys: unique identifiers to query the API for region: AWS region containing the destination bucket. encoding: encoding of the input data @@ -1101,7 +1089,7 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="u output_fname = "dataset_name_test_2024_02_01_" output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] csv_fname = create_csvfile_from_api( - base_url, keys, response_keys, output_columns, output_fname, encoding="utf-8") + base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING) write_csvfiles_to_s3( [csv_fname], bucket, s3_folders, output_fname, region=region, encoding=encoding) @@ -1118,10 +1106,10 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="u "3c83d5da-822a-439c-897a-7569e82c4ebc" ] - create_s3_dataset_lookup( - bucket, s3_folders, region=REGION, encoding="utf-8") + + create_test_s3_dataset_lookup(bucket, s3_folders, keys) # create_s3_organization_lookup( - # bucket, s3_folders, region=REGION, encoding="utf-8") + # bucket, s3_folders, region=REGION, encoding=ENCODING) """ @@ -1133,7 +1121,26 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="u region=REGION encoding=ENCODING s3_folders="summary" +keys = [ + "5a95fa0a-5ef3-432a-b95b-816cd85b2f9b", + "ee789ae4-ef51-4ff2-931b-bc61b2dbe40e", + "c8fded56-3ddb-4e26-8863-ba8d55862689", + "3c83d5da-822a-439c-897a-7569e82c4ebc" + ] -create_s3_dataset_lookup( - bucket, s3_folders, region=REGION, encoding="utf-8") +base_url = "https://api.gbif.org/v1/dataset" +response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] +data_date = get_current_datadate_str() +output_fname = f"dataset_name_{data_date}_" +output_fname = "dataset_name_test_2024_02_01_" +output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] + + +csv_fname = create_csvfile_from_api( + base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING) +write_csvfiles_to_s3( + 
[csv_fname], bucket, s3_folders, output_fname, region=region, encoding=encoding) + + +create_test_s3_dataset_lookup(bucket, s3_folders, keys) """ From ce5893b5e6906a9d2285e8fd460e7bf70b4fd708 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 28 Mar 2024 16:11:10 -0500 Subject: [PATCH 73/81] add certificate for local GBIF api query --- sppy/aws/aws_tools.py | 77 +++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index ab65ebba..90283fdf 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -6,6 +6,7 @@ import boto3 from botocore.exceptions import ClientError import csv +import certifi import datetime from http import HTTPStatus import logging @@ -704,10 +705,10 @@ def _get_values_for_keys(output, keys): # ............................................... -def _get_api_response_vals(url, keys): +def _get_api_response_vals(url, keys, certificate=None): values = [] try: - response = requests.get(url) + response = requests.get(url, verify=certificate) except Exception as e: errmsg = str(e) else: @@ -734,7 +735,7 @@ def _get_api_response_vals(url, keys): return values # ............................................... -def get_dataset_name_citation(dataset_key): +def get_dataset_name_citation(dataset_key, certificate=None): """Return title from one dataset record with this key. Args: @@ -748,7 +749,8 @@ def get_dataset_name_citation(dataset_key): Exception: on query failure. """ url = f"https://api.gbif.org/v1/dataset/{dataset_key}" - name, citation = _get_api_response_vals(url, ["title", ["citation", "text"]]) + name, citation = _get_api_response_vals( + url, ["title", ["citation", "text"]], certificate=certificate) return name, citation @@ -761,10 +763,13 @@ def _parse_records(ret_records, keys): return small_recs # ............................................... -def _get_single_record(url, keys): +def _get_single_record(url, keys, certificate=None): rec = None try: - response = requests.get(url) + if certificate: + response = requests.get(url, verify=certificate) + else: + response = requests.get(url) except Exception as e: errmsg = str(e) else: @@ -798,11 +803,16 @@ def _get_single_record(url, keys): # ............................................... 
-def _get_records(url, keys): +def _get_records(url, keys, certificate=None): + small_recs = [] + status_code = 0 try: - response = requests.get(url) + if certificate: + response = requests.get(url, verify=certificate) + else: + response = requests.get(url) except Exception as e: - errmsg = str(e) + reason = str(e) else: try: status_code = response.status_code @@ -834,6 +844,8 @@ def _get_records(url, keys): reason = "No results returned" else: small_recs = _parse_records(ret_records, keys) + if not small_recs: + print(f"No records returned, status {status_code}, reason {reason}") return small_recs, is_end # ---------------------------------------------------- @@ -854,9 +866,10 @@ def create_dataframe_from_api(base_url, response_keys, output_columns): is_end = False offset = 0 limit = 1000 + certificate = certifi.where() while is_end is False: url = f"{base_url}?offset={offset}&limit={limit}" - small_recs, is_end = _get_records(url, response_keys) + small_recs, is_end = _get_records(url, response_keys, certificate=certificate) all_recs.extend(small_recs) offset += limit if offset % 5000 == 0: @@ -888,17 +901,19 @@ def create_csvfiles_from_api( records = [] is_end = False offset = 0 - read_limit = 1000 + read_limit = 500 write_limit = 5000 write_header = True + certificate = certifi.where() while is_end is False: url = f"{base_url}?offset={offset}&limit={read_limit}" - small_recs, is_end = _get_records(url, response_keys) - records.extend(small_recs) + print(url) + small_recs, is_end = _get_records(url, response_keys, certificate=certificate) + if small_recs: + records.extend(small_recs) offset += read_limit # Write to tempfile every 5000 if offset % write_limit == 0: - print(f"Offset = {offset}") dataframe = pd.DataFrame(records, columns=output_columns) tmp_filename = f"/tmp/{output_fname}_{offset}.csv" dataframe.to_csv( @@ -1031,7 +1046,7 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding=EN # ---------------------------------------------------- def create_csvfile_from_api( - base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING): + base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING): """Query an API, read the data and write a subset to a table in S3. Args: @@ -1049,9 +1064,10 @@ def create_csvfile_from_api( a header, the rest will not. 
""" records = [] + certificate = certifi.where() for key in keys: url = f"{base_url}/{key}" - rec = _get_single_record(url, response_keys) + rec = _get_single_record(url, response_keys, certificate=certificate) records.append(rec) dataframe = pd.DataFrame(records, columns=output_columns) tmp_filename = f"/tmp/{output_fname}.csv" @@ -1088,8 +1104,10 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, keys, region=REGION, encod output_fname = f"dataset_name_{data_date}_" output_fname = "dataset_name_test_2024_02_01_" output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] + certificate = certifi.where() csv_fname = create_csvfile_from_api( - base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING) + base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING, + certificate=certificate) write_csvfiles_to_s3( [csv_fname], bucket, s3_folders, output_fname, region=region, encoding=encoding) @@ -1106,8 +1124,8 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, keys, region=REGION, encod "3c83d5da-822a-439c-897a-7569e82c4ebc" ] - - create_test_s3_dataset_lookup(bucket, s3_folders, keys) + create_s3_dataset_lookup(bucket, s3_folders) + # create_test_s3_dataset_lookup(bucket, s3_folders, keys) # create_s3_organization_lookup( # bucket, s3_folders, region=REGION, encoding=ENCODING) @@ -1121,26 +1139,7 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, keys, region=REGION, encod region=REGION encoding=ENCODING s3_folders="summary" -keys = [ - "5a95fa0a-5ef3-432a-b95b-816cd85b2f9b", - "ee789ae4-ef51-4ff2-931b-bc61b2dbe40e", - "c8fded56-3ddb-4e26-8863-ba8d55862689", - "3c83d5da-822a-439c-897a-7569e82c4ebc" - ] -base_url = "https://api.gbif.org/v1/dataset" -response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] -data_date = get_current_datadate_str() -output_fname = f"dataset_name_{data_date}_" -output_fname = "dataset_name_test_2024_02_01_" -output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] - -csv_fname = create_csvfile_from_api( - base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING) -write_csvfiles_to_s3( - [csv_fname], bucket, s3_folders, output_fname, region=region, encoding=encoding) - - -create_test_s3_dataset_lookup(bucket, s3_folders, keys) +create_s3_dataset_lookup(bucket, s3_folders) """ From c1401bddc16bf7f9a4138a152b7a8d85f28e69c2 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 29 Mar 2024 17:09:00 -0500 Subject: [PATCH 74/81] modified to query only datasetkeys encountered bc pulling all at once causes JSON parsing errors of GBIF response --- sppy/aws/aws_tools.py | 214 ++++++++++++++++++++++++++++-------------- 1 file changed, 143 insertions(+), 71 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 90283fdf..176148a6 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -7,8 +7,9 @@ from botocore.exceptions import ClientError import csv import certifi -import datetime +import datetime as DT from http import HTTPStatus +import json import logging from logging.handlers import RotatingFileHandler import pandas as pd @@ -241,7 +242,7 @@ def create_token(type=None): """ if type is None: type = PROJ_NAME - token = f"{type}_{datetime.datetime.now().timestamp()}" + token = f"{type}_{DT.datetime.now().timestamp()}" return token @@ -252,7 +253,7 @@ def get_today_str(): Returns: date_str(str): string representing date in YYYY-MM-DD format. 
""" - n = datetime.datetime.now() + n = DT.datetime.now() date_str = f"{n.year}_{n.month:02d}_{n.day:02d}" return date_str @@ -264,7 +265,7 @@ def get_current_datadate_str(): Returns: date_str(str): string representing date in YYYY-MM-DD format. """ - n = datetime.datetime.now() + n = DT.datetime.now() date_str = f"{n.year}_{n.month:02d}_01" return date_str @@ -276,7 +277,7 @@ def get_previous_datadate_str(): Returns: date_str(str): string representing date in YYYY-MM-DD format. """ - n = datetime.datetime.now() + n = DT.datetime.now() yr = n.year mo = n.month - 1 if n.month == 0: @@ -754,6 +755,54 @@ def get_dataset_name_citation(dataset_key, certificate=None): return name, citation +# ---------------------------------------------------- +def _query_table(bucket, s3_path, query_str, region=REGION, format="CSV"): + """Query the S3 resource defined for this class. + + Args: + bucket: + s3_path: S3 folder and filename within the bucket + query_str: a SQL query for S3 select. + region: + format: output format, options "CSV" or "JSON" + + Returns: + list of records matching the query + """ + recs = [] + if format not in ("JSON", "CSV"): + format = "JSON" + if format == "JSON": + out_serialization = {"JSON": {}} + elif format == "CSV": + out_serialization = { + "CSV": { + "QuoteFields": "ASNEEDED", + "FieldDelimiter": ",", + "QuoteCharacter": '"'} + } + s3 = boto3.client("s3", region_name=region) + resp = s3.select_object_content( + Bucket=bucket, + Key=s3_path, + ExpressionType="SQL", + Expression=query_str, + InputSerialization={"Parquet": {}}, + OutputSerialization=out_serialization + ) + for event in resp["Payload"]: + if "Records" in event: + recs_str = event["Records"]["Payload"].decode(ENCODING) + rec_strings = recs_str.strip().split("\n") + for rs in rec_strings: + if format == "JSON": + rec = json.loads(rs) + else: + rec = rs.split(",") + recs.append(rec) + return recs + + # ............................................... def _parse_records(ret_records, keys): small_recs = [] @@ -805,7 +854,8 @@ def _get_single_record(url, keys, certificate=None): # ............................................... 
def _get_records(url, keys, certificate=None): small_recs = [] - status_code = 0 + status_code = None + is_end = count = None try: if certificate: response = requests.get(url, verify=certificate) @@ -837,6 +887,11 @@ def _get_records(url, keys, certificate=None): is_end = output["endOfRecords"] except KeyError: print("Missing endOfRecords flag") + # Expected count + try: + is_end = output["count"] + except KeyError: + print("Missing count") # Get values from JSON response try: ret_records = output["results"] @@ -846,7 +901,7 @@ def _get_records(url, keys, certificate=None): small_recs = _parse_records(ret_records, keys) if not small_recs: print(f"No records returned, status {status_code}, reason {reason}") - return small_recs, is_end + return small_recs, is_end, count # ---------------------------------------------------- def create_dataframe_from_api(base_url, response_keys, output_columns): @@ -869,7 +924,8 @@ def create_dataframe_from_api(base_url, response_keys, output_columns): certificate = certifi.where() while is_end is False: url = f"{base_url}?offset={offset}&limit={limit}" - small_recs, is_end = _get_records(url, response_keys, certificate=certificate) + small_recs, is_end, count = _get_records( + url, response_keys, certificate=certificate) all_recs.extend(small_recs) offset += limit if offset % 5000 == 0: @@ -900,15 +956,15 @@ def create_csvfiles_from_api( csv_files = [] records = [] is_end = False - offset = 0 + offset = 7000 read_limit = 500 write_limit = 5000 - write_header = True certificate = certifi.where() while is_end is False: url = f"{base_url}?offset={offset}&limit={read_limit}" print(url) - small_recs, is_end = _get_records(url, response_keys, certificate=certificate) + small_recs, is_end, count = _get_records( + url, response_keys, certificate=certificate) if small_recs: records.extend(small_recs) offset += read_limit @@ -916,12 +972,11 @@ def create_csvfiles_from_api( if offset % write_limit == 0: dataframe = pd.DataFrame(records, columns=output_columns) tmp_filename = f"/tmp/{output_fname}_{offset}.csv" + # Only write header to first file (offset == 0), others will be appended dataframe.to_csv( - path_or_buf=tmp_filename, sep='\t', header=write_header, + path_or_buf=tmp_filename, sep='\t', header=(offset == 0), columns=output_columns, doublequote=False, escapechar="\\", encoding=encoding) - # Only write header to first file, others will be appended - write_header = False csv_files.append(tmp_filename) print(f"Wrote {tmp_filename} with {dataframe.shape[0]} rows") # reset records in memory @@ -988,35 +1043,35 @@ def write_dataframe_to_s3( print(f"Uploaded to s3://{bucket}/{output_path}") -# ---------------------------------------------------- -def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding=ENCODING): - """Query the GBIF Dataset API, write a subset of the response to a table in S3. - - Args: - bucket: name of the bucket containing the CSV data. - s3_folders: S3 bucket folders for output lookup table - region: AWS region containing the destination bucket. - encoding: encoding of the input data - - Note: - There are >100k records for datasets and limited memory on this EC2 instance, - so we write them as temporary CSV files, then combine them, then create a - dataframe and upload. 
- - Postcondition: - CSV table with dataset key, pubOrgKey, dataset name, dataset citation written - to the named S3 object in bucket and folders - """ - base_url = "https://api.gbif.org/v1/dataset" - response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] - data_date = get_current_datadate_str() - output_fname = f"dataset_name_{data_date}" - output_fname = "dataset_name_2024_02_01" - output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] - csv_fnames = create_csvfiles_from_api( - base_url, response_keys, output_columns, output_fname) - write_csvfiles_to_s3( - csv_fnames, bucket, s3_folders, output_fname, region=region, encoding=encoding) +# # ---------------------------------------------------- +# def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding=ENCODING): +# """Query the GBIF Dataset API, write a subset of the response to a table in S3. +# +# Args: +# bucket: name of the bucket containing the CSV data. +# s3_folders: S3 bucket folders for output lookup table +# region: AWS region containing the destination bucket. +# encoding: encoding of the input data +# +# Note: +# There are >100k records for datasets and limited memory on this EC2 instance, +# so we write them as temporary CSV files, then combine them, then create a +# dataframe and upload. +# +# Postcondition: +# CSV table with dataset key, pubOrgKey, dataset name, dataset citation written +# to the named S3 object in bucket and folders +# """ +# base_url = "https://api.gbif.org/v1/dataset" +# response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] +# data_date = get_current_datadate_str() +# output_fname = f"dataset_name_{data_date}" +# output_fname = "dataset_name_2024_02_01" +# output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] +# csv_fnames = create_csvfiles_from_api( +# base_url, response_keys, output_columns, output_fname) +# write_csvfiles_to_s3( +# csv_fnames, bucket, s3_folders, output_fname, region=region, encoding=encoding) # ---------------------------------------------------- @@ -1045,8 +1100,9 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding=EN # ---------------------------------------------------- -def create_csvfile_from_api( - base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING): +def create_csvfiles_from_apiqueries( + base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING, + certificate=None): """Query an API, read the data and write a subset to a table in S3. Args: @@ -1058,28 +1114,37 @@ def create_csvfile_from_api( output_columns: list of column headings for output lookup table output_fname: base output filename for temporary CSV files encoding: encoding of the input data + certificate: local SSL certificate required by some APIs Returns: csv_files: Local CSV files with records. The first file in the list will have a header, the rest will not. 
""" + tmp_filenames = [] records = [] - certificate = certifi.where() - for key in keys: - url = f"{base_url}/{key}" + write_chunk = 1000 + for i in range(len(keys)): + url = f"{base_url}/{keys[i]}" rec = _get_single_record(url, response_keys, certificate=certificate) records.append(rec) - dataframe = pd.DataFrame(records, columns=output_columns) - tmp_filename = f"/tmp/{output_fname}.csv" - dataframe.to_csv( - path_or_buf=tmp_filename, sep='\t', header=True, - columns=output_columns, doublequote=False, escapechar="\\", - encoding=encoding) - print(f"Wrote {tmp_filename} with {dataframe.shape[0]} rows") - return tmp_filename + if i % 100 == 0: + print(f"{DT.datetime.now().isoformat()} Queried key {i} of {len(keys)}") + if i % write_chunk == 0 and i > 0: + print(f"{DT.datetime.now().isoformat()} Queried key {i} of {len(keys)}") + dataframe = pd.DataFrame(records, columns=output_columns) + tmp_filename = f"/tmp/{output_fname}_{i}.csv" + dataframe.to_csv( + path_or_buf=tmp_filename, sep='\t', header=True, + columns=output_columns, doublequote=False, escapechar="\\", + encoding=encoding) + print(f"Wrote {tmp_filename} with {dataframe.shape[0]} rows") + records = [] + tmp_filenames.append(tmp_filename) + return tmp_filenames # ---------------------------------------------------- -def create_test_s3_dataset_lookup(bucket, s3_folders, keys, region=REGION, encoding=ENCODING): +def create_s3_dataset_lookup_by_keys( + bucket, s3_folders, region=REGION, encoding=ENCODING): """Query the GBIF Dataset API, write a subset of the response to a table in S3. Args: @@ -1098,18 +1163,24 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, keys, region=REGION, encod CSV table with dataset key, pubOrgKey, dataset name, dataset citation written to the named S3 object in bucket and folders """ + input_fname = "dataset_counts_2024_02_01_000.parquet" + s3_path = f"{s3_folders}/{input_fname}" + query_str = "SELECT datasetkey from s3object s" + key_records = _query_table(bucket, s3_path, query_str, format="CSV") + keys = [r[0] for r in key_records] + base_url = "https://api.gbif.org/v1/dataset" response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] data_date = get_current_datadate_str() - output_fname = f"dataset_name_{data_date}_" - output_fname = "dataset_name_test_2024_02_01_" + output_fname = f"dataset_name_{data_date}" + output_fname = "dataset_name_test_2024_02_01" output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] certificate = certifi.where() - csv_fname = create_csvfile_from_api( + csv_fnames = create_csvfiles_from_apiqueries( base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING, certificate=certificate) write_csvfiles_to_s3( - [csv_fname], bucket, s3_folders, output_fname, region=region, encoding=encoding) + csv_fnames, bucket, s3_folders, output_fname, region=region, encoding=encoding) # ............................................................................. 
if __name__ == "__main__": @@ -1117,14 +1188,15 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, keys, region=REGION, encod region=REGION encoding=ENCODING s3_folders="summary" - keys = [ - "5a95fa0a-5ef3-432a-b95b-816cd85b2f9b", - "ee789ae4-ef51-4ff2-931b-bc61b2dbe40e", - "c8fded56-3ddb-4e26-8863-ba8d55862689", - "3c83d5da-822a-439c-897a-7569e82c4ebc" - ] - - create_s3_dataset_lookup(bucket, s3_folders) + # keys = [ + # "5a95fa0a-5ef3-432a-b95b-816cd85b2f9b", + # "ee789ae4-ef51-4ff2-931b-bc61b2dbe40e", + # "c8fded56-3ddb-4e26-8863-ba8d55862689", + # "3c83d5da-822a-439c-897a-7569e82c4ebc" + # ] + create_s3_dataset_lookup_by_keys( + bucket, s3_folders, region=REGION, encoding=ENCODING) + # create_s3_dataset_lookup(bucket, s3_folders) # create_test_s3_dataset_lookup(bucket, s3_folders, keys) # create_s3_organization_lookup( # bucket, s3_folders, region=REGION, encoding=ENCODING) @@ -1140,6 +1212,6 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, keys, region=REGION, encod encoding=ENCODING s3_folders="summary" - -create_s3_dataset_lookup(bucket, s3_folders) +create_s3_dataset_lookup_by_keys( + bucket, s3_folders, region=REGION, encoding=ENCODING) """ From 36e59dde6b04061b7bc261015d8639afee282723 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 1 Apr 2024 10:09:55 -0500 Subject: [PATCH 75/81] make sure there are records before creating dataframe --- sppy/aws/aws_tools.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 176148a6..4c9b9506 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -1126,20 +1126,24 @@ def create_csvfiles_from_apiqueries( for i in range(len(keys)): url = f"{base_url}/{keys[i]}" rec = _get_single_record(url, response_keys, certificate=certificate) - records.append(rec) - if i % 100 == 0: - print(f"{DT.datetime.now().isoformat()} Queried key {i} of {len(keys)}") + if rec: + records.append(rec) if i % write_chunk == 0 and i > 0: - print(f"{DT.datetime.now().isoformat()} Queried key {i} of {len(keys)}") - dataframe = pd.DataFrame(records, columns=output_columns) - tmp_filename = f"/tmp/{output_fname}_{i}.csv" - dataframe.to_csv( - path_or_buf=tmp_filename, sep='\t', header=True, - columns=output_columns, doublequote=False, escapechar="\\", - encoding=encoding) - print(f"Wrote {tmp_filename} with {dataframe.shape[0]} rows") - records = [] - tmp_filenames.append(tmp_filename) + print( + f"{DT.datetime.now().isoformat()} Create dataframe for {len(records)} " + f"records; key {i} of {len(keys)}") + if records: + dataframe = pd.DataFrame(records, columns=output_columns) + tmp_filename = f"/tmp/{output_fname}_{i}.csv" + dataframe.to_csv( + path_or_buf=tmp_filename, sep='\t', header=(i < write_chunk+1), + columns=output_columns, doublequote=False, escapechar="\\", + encoding=encoding) + print( + f"Wrote {tmp_filename} with {len(records)} records and " + f"{dataframe.shape[0]} rows") + records = [] + tmp_filenames.append(tmp_filename) return tmp_filenames # ---------------------------------------------------- From b569dd04b7abcd16035fd6ae08c8f51a0df60883 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 1 Apr 2024 11:40:38 -0500 Subject: [PATCH 76/81] check for existence of dataset metadata before trying to use --- sppy/aws/aws_tools.py | 8 +++---- sppy/tools/provider/spnet.py | 43 ++++++++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 
4c9b9506..ae85c23e 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -1065,8 +1065,8 @@ def write_dataframe_to_s3( # base_url = "https://api.gbif.org/v1/dataset" # response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] # data_date = get_current_datadate_str() -# output_fname = f"dataset_name_{data_date}" -# output_fname = "dataset_name_2024_02_01" +# output_fname = f"dataset_meta_{data_date}" +# output_fname = "dataset_meta_2024_02_01" # output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] # csv_fnames = create_csvfiles_from_api( # base_url, response_keys, output_columns, output_fname) @@ -1176,8 +1176,8 @@ def create_s3_dataset_lookup_by_keys( base_url = "https://api.gbif.org/v1/dataset" response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] data_date = get_current_datadate_str() - output_fname = f"dataset_name_{data_date}" - output_fname = "dataset_name_test_2024_02_01" + output_fname = f"dataset_meta_{data_date}" + output_fname = "dataset_meta_test_2024_02_01" output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] certificate = certifi.where() csv_fnames = create_csvfiles_from_apiqueries( diff --git a/sppy/tools/provider/spnet.py b/sppy/tools/provider/spnet.py index 4a213153..fde4a41c 100644 --- a/sppy/tools/provider/spnet.py +++ b/sppy/tools/provider/spnet.py @@ -28,33 +28,57 @@ def __init__( self.region = region self.encoding = encoding self.exp_type = 'SQL' - datestr = get_current_datadate_str() - datestr = "2024_02_01" + self.datestr = get_current_datadate_str() + self.datestr = "2024_02_01" self._summary_path = "summary" self._summary_tables = { "dataset_counts": { - "fname": f"dataset_counts_{datestr}_000.parquet", + "fname": f"dataset_counts_{self.datestr}_000.parquet", "fields": ["datasetkey", "occ_count", "species_count"], "key": "datasetkey" }, "dataset_species_lists": { - "fname": f"dataset_lists_{datestr}_000.parquet", + "fname": f"dataset_lists_{self.datestr}_000.parquet", "fields": ["datasetkey", "taxonkey", "species", "occ_count"], "key": "datasetkey" }, "dataset_meta": { - "fname": f"dataset_names_{datestr}_000.csv", + "fname": f"dataset_meta_{self.datestr}.csv", "fields": [ "datasetKey", "publishingOrganizationKey", "title", "citation"], "key": "datasetKey" }, "organization_meta": { - "fname": f"organization_names_{datestr}_000.csv", + "fname": f"organization_meta_{self.datestr}.csv", "fields": ["publishingOrganizationKey", "title"], "key": "publishingOrganizationKey" } } + # ---------------------------------------------------- + def _list_summaries(self): + summary_objs = [] + s3 = boto3.client("s3", region_name=self.region) + summ_objs = s3.list_objects_v2(Bucket=self.bucket, Prefix=self._summary_path) + prefix = f"{self._summary_path}/" + try: + contents = summ_objs["Contents"] + except KeyError: + pass + else: + for item in contents: + fname = item["Key"].strip(prefix) + if len(fname) > 1: + summary_objs.append(fname) + return summary_objs + + # ---------------------------------------------------- + def _dataset_metadata_exists(self): + fnames = self._list_summaries() + if self._summary_tables["dataset_meta"]["fname"] in fnames: + return True + return False + # ---------------------------------------------------- def _query_table(self, s3_path, query_str, format="CSV"): """Query the S3 resource defined for this class. 
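# A minimal sketch of the metadata-existence check added above, assuming only
# boto3 and read access to the bucket: list the keys under the summary prefix
# and test whether the expected dataset_meta file is among them. The bucket,
# prefix, and filename below are illustrative, not the project's configuration.
import boto3

def summary_object_exists(bucket, prefix, filename, region="us-east-1"):
    s3 = boto3.client("s3", region_name=region)
    # list_objects_v2 returns up to 1,000 keys per call; the "Contents" key is
    # absent when nothing matches the prefix.
    response = s3.list_objects_v2(Bucket=bucket, Prefix=f"{prefix}/")
    keys = [item["Key"] for item in response.get("Contents", [])]
    return f"{prefix}/{filename}" in keys

# Example: summary_object_exists(
#     "specnet-bucket", "summary", "dataset_meta_2024_02_01.csv")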
@@ -166,7 +190,8 @@ def get_dataset_counts(self, dataset_key, format="JSON"): ) # Returns empty list or list of 1 record records = self._query_table(table_path, query_str, format=format) - # self.add_dataset_lookup_vals(records, key_idx=key_idx) + if self._dataset_metadata_exists(): + self.add_dataset_lookup_vals(records, key_idx=key_idx) return records # ---------------------------------------------------- @@ -206,7 +231,6 @@ def add_dataset_lookup_vals(self, records, key_idx=0, format="JSON"): rec.update(meta) else: rec.extend(meta) - return records # # ---------------------------------------------------- # def get_org_counts(self, pub_org_key): @@ -253,7 +277,8 @@ def rank_dataset_counts(self, count_by, order, limit, format="JSON"): except Exception as e: errors = {"error": [get_traceback()]} - # self.add_dataset_lookup_vals(records, key_idx=key_idx) + if self._dataset_metadata_exists(): + self.add_dataset_lookup_vals(records, key_idx=key_idx) return records, errors # ............................................................................. From a1ff9943a758e542fa864e87474cf703f75c97f8 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 1 Apr 2024 11:48:37 -0500 Subject: [PATCH 77/81] pandas.read_parquet requires pyarrow (or fastparquet) --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 6687eff1..a63db974 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ awscli boto3>=1.34.60 sqlalchemy pandas +pyarrow \ No newline at end of file From 5e814da2db1d16b6095325788bffa2aa02648846 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 1 Apr 2024 12:39:25 -0500 Subject: [PATCH 78/81] upgrade pip to fix failing pyarrow install --- Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile b/Dockerfile index 9c17826d..7c40e2b7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,7 +20,10 @@ USER specify COPY --chown=specify:specify ./requirements.txt . +RUN pip3 install --upgrade pip + RUN python3 -m venv venv \ + && venv/bin/pip install --upgrade pip \ && venv/bin/pip install --no-cache-dir -r ./requirements.txt COPY --chown=specify:specify ./sppy ./sppy From 7fe5b1cfc847ffde6815df96a285095bd27f0cfc Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 1 Apr 2024 14:42:08 -0500 Subject: [PATCH 79/81] replace failing pyarrow dependency build with fastparquet; upgrade python docker image --- Dockerfile | 4 +--- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7c40e2b7..8fe67c1d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ # syntax=docker/dockerfile:1 # ........................................................ # Backend base image -FROM python:3.10.0rc2-alpine3.14 as base +FROM python:3.12.2-alpine3.19 as base LABEL maintainer="Specify Collections Consortium " @@ -20,8 +20,6 @@ USER specify COPY --chown=specify:specify ./requirements.txt . 
-RUN pip3 install --upgrade pip - RUN python3 -m venv venv \ && venv/bin/pip install --upgrade pip \ && venv/bin/pip install --no-cache-dir -r ./requirements.txt diff --git a/requirements.txt b/requirements.txt index a63db974..e9944230 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,6 @@ gunicorn==20.1.0 rtree>=1.0.0 awscli boto3>=1.34.60 +fastparquet sqlalchemy pandas -pyarrow \ No newline at end of file From cf550a1e8cd71fd78afc804d8878a9131d85664d Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 1 Apr 2024 14:42:37 -0500 Subject: [PATCH 80/81] add testing flag --- sppy/aws/aws_tools.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index ae85c23e..8380b173 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -1148,13 +1148,12 @@ def create_csvfiles_from_apiqueries( # ---------------------------------------------------- def create_s3_dataset_lookup_by_keys( - bucket, s3_folders, region=REGION, encoding=ENCODING): + bucket, s3_folders, region=REGION, encoding=ENCODING, is_test=False): """Query the GBIF Dataset API, write a subset of the response to a table in S3. Args: bucket: name of the bucket containing the CSV data. s3_folders: S3 bucket folders for output lookup table - keys: unique identifiers to query the API for region: AWS region containing the destination bucket. encoding: encoding of the input data @@ -1167,22 +1166,33 @@ def create_s3_dataset_lookup_by_keys( CSV table with dataset key, pubOrgKey, dataset name, dataset citation written to the named S3 object in bucket and folders """ - input_fname = "dataset_counts_2024_02_01_000.parquet" - s3_path = f"{s3_folders}/{input_fname}" - query_str = "SELECT datasetkey from s3object s" - key_records = _query_table(bucket, s3_path, query_str, format="CSV") - keys = [r[0] for r in key_records] + # Current filenames + data_date = get_current_datadate_str() + data_date = "2024_02_01" + input_fname = f"dataset_counts_{data_date}_000.parquet" + output_fname = f"dataset_meta_{data_date}" + # Data and query parameters base_url = "https://api.gbif.org/v1/dataset" response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] - data_date = get_current_datadate_str() - output_fname = f"dataset_meta_{data_date}" - output_fname = "dataset_meta_test_2024_02_01" output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] certificate = certifi.where() + + # Get keys for dataset resolution + s3_path = f"{s3_folders}/{input_fname}" + query_str = "SELECT datasetkey from s3object s" + key_records = _query_table(bucket, s3_path, query_str, format="CSV") + keys = [r[0] for r in key_records] + if is_test: + keys = keys[:2100] + output_fname = f"dataset_meta_test_{data_date}" + + # Write tempfiles locally csv_fnames = create_csvfiles_from_apiqueries( base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING, certificate=certificate) + + # Aggregate and write all records to S3 write_csvfiles_to_s3( csv_fnames, bucket, s3_folders, output_fname, region=region, encoding=encoding) @@ -1210,6 +1220,7 @@ def create_s3_dataset_lookup_by_keys( # Note: Test with quoted data such as: # http://api.gbif.org/v1/dataset/3c83d5da-822a-439c-897a-7569e82c4ebc from sppy.aws.aws_tools import * +from sppy.aws.aws_tools import _query_table bucket=PROJ_BUCKET region=REGION @@ -1217,5 +1228,7 @@ def create_s3_dataset_lookup_by_keys( s3_folders="summary" 
create_s3_dataset_lookup_by_keys( - bucket, s3_folders, region=REGION, encoding=ENCODING) + bucket, s3_folders, region=REGION, encoding=ENCODING, is_test=False) + + """ From 759f21ed1cbeb2e21b8aa789c3d0a245d65f44d5 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 1 Apr 2024 14:43:55 -0500 Subject: [PATCH 81/81] pandas/fastparquet requires s3fs --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index e9944230..8a96d76e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ gunicorn==20.1.0 rtree>=1.0.0 awscli boto3>=1.34.60 +s3fs fastparquet sqlalchemy pandas
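Taken together, the last few dependency patches describe the pandas read path for the S3 summary tables: pandas.read_parquet needs a parquet engine (the pyarrow build failed in the Docker image, so fastparquet replaces it), and opening an s3:// URL directly needs the s3fs filesystem layer. Below is a minimal sketch of that read path, assuming AWS credentials come from the environment and using an illustrative bucket name rather than the project's configuration.

import pandas as pd

# fastparquet supplies the parquet engine; s3fs lets pandas open "s3://" URLs
# directly, so no explicit boto3 download is needed.
df = pd.read_parquet(
    "s3://specnet-bucket/summary/dataset_counts_2024_02_01_000.parquet",
    engine="fastparquet",
)
# Columns listed for this table earlier in the series:
# ["datasetkey", "occ_count", "species_count"]
print(df.head())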