diff --git a/.env.broker.conf b/.env.broker.conf index 95dab4b7..7cfc46b7 100644 --- a/.env.broker.conf +++ b/.env.broker.conf @@ -1,5 +1,5 @@ SECRET_KEY=dev WORKING_DIRECTORY=/scratch-path -FQDN=analyst.localhost +FQDN=broker.localhost PYTHONPATH=/home/specify/flask_app diff --git a/Dockerfile b/Dockerfile index a8c33950..8fe67c1d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ # syntax=docker/dockerfile:1 # ........................................................ # Backend base image -FROM python:3.10.0rc2-alpine3.14 as base +FROM python:3.12.2-alpine3.19 as base LABEL maintainer="Specify Collections Consortium " @@ -20,7 +20,8 @@ USER specify COPY --chown=specify:specify ./requirements.txt . -RUN python -m venv venv \ +RUN python3 -m venv venv \ + && venv/bin/pip install --upgrade pip \ && venv/bin/pip install --no-cache-dir -r ./requirements.txt COPY --chown=specify:specify ./sppy ./sppy diff --git a/config/nginx.conf b/config/nginx.conf index cab6d732..ef94be1c 100644 --- a/config/nginx.conf +++ b/config/nginx.conf @@ -4,10 +4,11 @@ server { return 301 https://$host$request_uri; } +# Broker server { listen 443 ssl; index index.html; - server_name broker-dev.spcoco.org; + server_name broker.localhost; ssl_certificate /etc/letsencrypt/fullchain.pem; ssl_certificate_key /etc/letsencrypt/privkey.pem; @@ -31,12 +32,6 @@ server { proxy_set_header Origin "${scheme}://${http_host}"; } - location / { - root /var/www/; - try_files $uri $uri/ = 404; - gzip_static on; - } - location /static/js { root /volumes/webpack-output; rewrite ^/static/js/(.*)$ /$1 break; @@ -48,11 +43,13 @@ server { rewrite ^/static/(.*)$ /$1 break; gzip_static on; } +} +# Analyst server { listen 443 ssl; index index.html; - server_name analyst-dev.spcoco.org; + server_name analyst.localhost; ssl_certificate /etc/letsencrypt/fullchain.pem; ssl_certificate_key /etc/letsencrypt/privkey.pem; @@ -76,12 +73,6 @@ server { proxy_set_header Origin "${scheme}://${http_host}"; } - location / { - root /var/www/; - try_files $uri $uri/ = 404; - gzip_static on; - } - location /static/js { root /volumes/webpack-output; rewrite ^/static/js/(.*)$ /$1 break; diff --git a/docker-compose.development.yml b/docker-compose.development.yml index 026c7eb6..50f8c4af 100644 --- a/docker-compose.development.yml +++ b/docker-compose.development.yml @@ -21,7 +21,7 @@ services: ports: - "5002:5002" environment: - - FLASK_APP=flask_app.analyst:app + - FLASK_APP=flask_app.analyst.routes:app - FLASK_MANAGE=flask_app.analyst.manage - DEBUG_PORT=5002 volumes: diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index fc31c364..7a606862 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -1,71 +1,19 @@ """Parent Class for the Specify Network API services.""" -from flask import Flask +from werkzeug.exceptions import BadRequest -import sppy.tools.s2n.utils as lmutil -from flask_app.common.s2n_type import AnalystOutput, APIEndpoint, APIService +from flask_app.common.base import _SpecifyNetworkService +from flask_app.common.s2n_type import AnalystOutput, APIService -app = Flask(__name__) +from sppy.tools.s2n.utils import get_traceback +from sppy.tools.provider.spnet import SpNetAnalyses # ............................................................................. 
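The host-name changes earlier in this diff (FQDN in .env.broker.conf, server_name in config/nginx.conf, and the analyst FLASK_APP target in docker-compose.development.yml) point the development stack at broker.localhost and analyst.localhost. A minimal smoke test of both hosts could look like the sketch below; the 127.0.0.1 hosts-file mapping, a running compose stack, and self-signed development certificates are assumptions about the local setup, not part of this change.

"""Hypothetical local smoke test; not part of this repository."""
import requests
import urllib3

# The development stack uses self-signed certificates, so certificate
# verification is disabled here; do this only against a local instance.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Assumes broker.localhost and analyst.localhost resolve to 127.0.0.1
# (e.g. via /etc/hosts) and the docker-compose services are up.
for host in ("broker.localhost", "analyst.localhost"):
    resp = requests.get(f"https://{host}/api/v1/", verify=False, timeout=10)
    print(host, resp.status_code)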
-class _AnalystService: +class _AnalystService(_SpecifyNetworkService): """Base S-to-the-N service, handles parameter names and acceptable values.""" # overridden by subclasses SERVICE_TYPE = APIService.AnalystRoot - # ............................................... - @classmethod - def _get_valid_requested_params(cls, user_params_string, valid_params): - """Return valid and invalid options for parameters that accept >1 values. - - Args: - user_params_string: user-requested parameters as a string. - valid_params: valid parameter values - - Returns: - valid_requested_params: list of valid params from the provided query string - invalid_params: list of invalid params from the provided query string - - Note: - For the badge service, exactly one provider is required. For all other - services, multiple providers are accepted, and None indicates to query all - valid providers. - """ - valid_requested_params = invalid_params = [] - - if user_params_string: - tmplst = user_params_string.split(",") - user_params = {tp.lower().strip() for tp in tmplst} - - valid_requested_params = set() - invalid_params = set() - # valid_requested_providers, invalid_providers = - # cls.get_multivalue_options(user_provs, valid_providers) - for param in user_params: - if param in valid_params: - valid_requested_params.add(param) - else: - invalid_params.add(param) - - invalid_params = list(invalid_params) - if valid_requested_params: - valid_requested_params = list(valid_requested_params) - else: - valid_requested_params = [] - - return valid_requested_params, invalid_params - - # ............................................................................. - @classmethod - def endpoint(cls): - """Return the URL endpoint for this class. - - Returns: - URL endpoint for the service - """ - endpoint = f"{APIEndpoint.Root}/{cls.SERVICE_TYPE['endpoint']}" - return endpoint - # ............................................... @classmethod def get_endpoint(cls, **kwargs): @@ -75,7 +23,7 @@ def get_endpoint(cls, **kwargs): **kwargs: keyword arguments are accepted but ignored Returns: - flask_app.broker.s2n_type.S2nOutput object + flask_app.analyst.s2n_type.S2nOutput object Raises: Exception: on unknown error. @@ -106,155 +54,49 @@ def _show_online(cls): # ............................................... @classmethod - def _fix_type_new(cls, key, provided_val): - """Modify a parameter value to a valid type and value. - - Args: - key: parameter key - provided_val: user-provided parameter value - - Returns: - usr_val: a valid value for the parameter - valid_options: list of valid options (for error message) - - Note: - Corrections: - * cast to correct type - * validate with any options - * if value is invalid (type or value), return the default. - """ - valid_options = None - if provided_val is None: - return None - # all strings are lower case - try: - provided_val = provided_val.lower() - except Exception: - pass - - # First see if restricted to options - default_val = cls.SERVICE_TYPE["params"][key]["default"] - type_val = cls.SERVICE_TYPE["params"][key]["type"] - # If restricted options, check - try: - options = cls.SERVICE_TYPE["params"][key]["options"] - except KeyError: - options = None - else: - # Invalid option returns default value - if provided_val in options: - usr_val = provided_val - else: - valid_options = options - usr_val = default_val - - # If not restricted to options - if options is None: - # Cast values to correct type. 
Failed conversions return default value - if isinstance(type_val, str) and not options: - usr_val = str(provided_val) - - elif isinstance(type_val, float): - try: - usr_val = float(provided_val) - except ValueError: - usr_val = default_val - - # Boolean also tests as int, so try boolean first - elif isinstance(type_val, bool): - if provided_val in (0, "0", "n", "no", "f", "false"): - usr_val = False - elif provided_val in (1, "1", "y", "yes", "t", "true"): - usr_val = True - else: - valid_options = (True, False) - usr_val = default_val - - elif isinstance(type_val, int): - try: - usr_val = int(provided_val) - except ValueError: - usr_val = default_val - - else: - usr_val = provided_val - - return usr_val, valid_options - - # ............................................... - @classmethod - def _process_params(cls, user_kwargs=None): - """Modify all user provided keys to lowercase and values to correct types. - - Args: - user_kwargs: dictionary of keywords and values sent by the user for - the current service. - - Returns: - good_params: dictionary of valid parameters and values - errinfo: dictionary of errors for different error levels. - - Note: - A list of valid values for a keyword can include None as a default - if user-provided value is invalid - Todo: - Do we need not_in_valid_options for error message? - """ - good_params = {} - errinfo = {} - - # Correct all parameter keys/values present - for key in cls.SERVICE_TYPE["params"]: - val = user_kwargs[key] - # Done in calling function - if val is not None: - usr_val, valid_options = cls._fix_type_new(key, val) - if valid_options is not None and val not in valid_options: - errinfo = lmutil.add_errinfo( - errinfo, "error", - f"Value {val} for parameter {key} is not in valid options " - f"{cls.SERVICE_TYPE['params'][key]['options']}") - good_params[key] = None - else: - good_params[key] = usr_val - - # Fill in defaults for missing parameters - for key in cls.SERVICE_TYPE["params"]: - param_meta = cls.SERVICE_TYPE["params"][key] - try: - _ = good_params[key] - except KeyError: - good_params[key] = param_meta["default"] - - return good_params, errinfo - - # ............................................... - @classmethod - def _standardize_params(cls, collection_id=None, organization_id=None): + def _standardize_params( + cls, dataset_key=None, pub_org_key=None, count_by=None, order=None, + limit=10): """Standardize query parameters to send to appropriate service. Args: - collection_id: collection identifier for comparisons - organization_id: organization identifier for comparisons + dataset_key: unique GBIF dataset identifier for comparisons + pub_org_key: unique publishing organization identifier for comparisons + count_by: counts of "occurrence" or "species" + order: sort records "descending" or "ascending" + limit: integer indicating how many ranked records to return, value must + be less than QUERY_LIMIT. + + Raises: + BadRequest: on invalid query parameters. + BadRequest: on unknown exception parsing parameters. Returns: a dictionary containing keys and properly formatted values for the user specified parameters. 
""" user_kwargs = { - "collection_id": collection_id, - "organization_id": organization_id + "dataset_key": dataset_key, + "pub_org_key": pub_org_key, + "count_by": count_by, + "order": order, + "limit": limit } - usr_params, errinfo = cls._process_params(user_kwargs) + try: + usr_params, errinfo = cls._process_params(user_kwargs) + except Exception: + error_description = get_traceback() + raise BadRequest(error_description) - return usr_params, errinfo + # errinfo["error"] indicates bad parameters, throws exception + try: + error_description = "; ".join(errinfo["error"]) + raise BadRequest(error_description) + except KeyError: + pass - # .......................... - @staticmethod - def OPTIONS(): - """Common options request for all services (needed for CORS).""" - return + return usr_params, errinfo # ............................................................................. diff --git a/flask_app/analyst/constants.py b/flask_app/analyst/constants.py index d0a99126..8c95a510 100644 --- a/flask_app/analyst/constants.py +++ b/flask_app/analyst/constants.py @@ -1 +1,2 @@ """Constants for the Specify Network Analyst API services.""" +QUERY_LIMIT = 500 \ No newline at end of file diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 65ad6291..ab09d232 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -1,11 +1,13 @@ """Class for the Specify Network Name API service.""" from http import HTTPStatus +from werkzeug.exceptions import BadRequest from flask_app.common.s2n_type import APIService, AnalystOutput -from flask_app.common.util import print_analyst_output from flask_app.analyst.base import _AnalystService -from sppy.tools.s2n.utils import get_traceback +from sppy.aws.aws_constants import PROJ_BUCKET +from sppy.tools.provider.spnet import SpNetAnalyses +from sppy.tools.s2n.utils import (combine_errinfo, get_traceback) # ............................................................................. @@ -16,107 +18,83 @@ class CountSvc(_AnalystService): # ............................................... @classmethod - def get_counts(cls, collection_id, organization_id): + def get_counts(cls, dataset_key=None, pub_org_key=None): + """Return occurrence and species counts for dataset/organization identifiers. + + Args: + dataset_key: URL parameter for unique GBIF identifier of dataset. + pub_org_key: URL parameter for unique GBIF identifier of + publishingOrganization. + + Returns: + full_output (flask_app.common.s2n_type.AnalystOutput): including records + as a list of one list (CSV) or dictionary (JSON) of a record + containing dataset_key, occurrence count, and species count. + """ + if dataset_key is None and pub_org_key is None: + return cls.get_endpoint() + + records = [] try: - output = cls._get_records(collection_id, organization_id) - except Exception: - traceback = get_traceback() - output = AnalystOutput( - cls.SERVICE_TYPE["name"], - description=cls.SERVICE_TYPE["description"], - errors={"error": [HTTPStatus.INTERNAL_SERVER_ERROR, traceback]}) - return output.response + good_params, errinfo = cls._standardize_params( + dataset_key=dataset_key, pub_org_key=pub_org_key) - # ............................................... 
- @classmethod - def _get_organization_counts(cls, organization_id): - return { - "Organization Raw Counts": - { - organization_id: 1, - "org_id_2": 2, - "org_id_3": 3 - }, - f"{organization_id} to other orgs": - { - "to total": "0.5", - "org_id_2": "1.2", - "org_id_3": "1.2" - } - } + except BadRequest as e: + errinfo = {"error": [e.description]} - # ............................................... - @classmethod - def _get_collection_counts(cls, collection_id): - return { - "Collection Raw Counts": - { - collection_id: 1, - "coll_id_2": 2, - "coll_id_3": 3 - }, - f"{collection_id} Ratios": - { - collection_id: "0.5", - "coll_id_2": "0.5", - "coll_id_3": "0.5", - "to total": "0.5" - } - } + else: + # Query dataset counts + if good_params["dataset_key"] is not None: + try: + records, errors = cls._get_dataset_counts( + good_params["dataset_key"]) + except Exception: + errors = {"error": [get_traceback()]} - # ............................................... - @classmethod - def _get_records(cls, collection_id, organization_id): - allrecs = [] - # for response metadata - if collection_id is not None: - coll_data = cls._get_collection_counts(collection_id) - allrecs.append(coll_data) - if organization_id is not None: - org_data = cls._get_organization_counts(organization_id) - allrecs.append(org_data) + # Combine errors from success or failure + errinfo = combine_errinfo(errinfo, errors) # Assemble full_out = AnalystOutput( cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], - records=allrecs, errors={}) + records=records, errors=errinfo) + + return full_out.response + +# ............................................... + @classmethod + def _get_dataset_counts(cls, dataset_key): + """Get counts for datasetKey. + + Args: + dataset_key: unique GBIF identifier for dataset of interest. + + Returns: + a flask_app.analyst.s2n_type.AnalystOutput object with optional records as a + list of records corresponding to occurrence and counts for the dataset. + """ + records = [] + errors = {} + s3 = SpNetAnalyses(PROJ_BUCKET) + try: + records = s3.get_dataset_counts(dataset_key) + except Exception: + traceback = get_traceback() + errors["error"] = [HTTPStatus.INTERNAL_SERVER_ERROR, traceback] + + return records, errors - return full_out # ............................................................................. 
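As a usage illustration of the reworked count service: the /api/v1/count/ path and the dataset_key parameter come from this diff, and the example key is the one used in the __main__ block below; the analyst.localhost host, the TLS handling, and the exact response keys are assumptions about a local deployment.

# Hypothetical client call to the analyst count service (local stack assumed).
import requests

resp = requests.get(
    "https://analyst.localhost/api/v1/count/",
    params={"dataset_key": "0000e36f-d0e9-46b0-aa23-cc1980f00515"},
    verify=False,  # self-signed development certificates assumed
    timeout=30,
)
payload = resp.json()
# An AnalystOutput response carries service/description/records/errors-style
# fields; "records" is assumed here to hold the per-dataset counts.
print(payload.get("records"))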
if __name__ == "__main__": - # from flask_app.broker.constants import import TST_VALUES - # occids = TST_VALUES.GUIDS_WO_SPECIFY_ACCESS[0:3] - occids = ["84fe1494-c378-4657-be15-8c812b228bf4", - "04c05e26-4876-4114-9e1d-984f78e89c15", - "2facc7a2-dd88-44af-b95a-733cc27527d4"] - occids = ["01493b05-4310-4f28-9d81-ad20860311f3", - "01559f57-62ca-45ba-80b1-d2aafdc46f44", - "015f35b8-655a-4720-9b88-c1c09f6562cb", - "016613ba-4e65-44d5-94d1-e24605afc7e1", - "0170cead-c9cd-48ba-9819-6c5d2e59947e", - "01792c67-910f-4ad6-8912-9b1341cbd983", - "017ea8f2-fc5a-4660-92ec-c203daaaa631", - "018728bb-c376-4562-9ccb-8e3c3fd70df6", - "018a34a9-55da-4503-8aee-e728ba4be146", - "019b547a-79c7-47b3-a5ae-f11d30c2b0de"] - # This occ has 16 issues in IDB, 0 in GBIF - occids = ["2facc7a2-dd88-44af-b95a-733cc27527d4", - "2c1becd5-e641-4e83-b3f5-76a55206539a"] - occids = ["bffe655b-ea32-4838-8e80-a80e391d5b11"] - occids = ["db193603-1ed3-11e3-bfac-90b11c41863e"] + dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" svc = CountSvc() - out = svc.get_endpoint() - print_analyst_output(out, do_print_rec=True) - - coll_id = "a7156437-55ec-4c6f-89de-938f9361753d" - org_id = None - out = svc.get_counts(coll_id, org_id) - print_analyst_output(out, do_print_rec=True) - - # for occid in occids: - # out = svc.get_occurrence_records(occid=occid, provider=None, count_only=False) - # outputs = out["records"] - # print_broker_output(out, do_print_rec=True) + response = svc.get_endpoint() + AnalystOutput.print_output(response, do_print_rec=True) + # print(response) + response = svc.get_counts(dataset_key=dataset_key, pub_org_key=None) + AnalystOutput.print_output(response, do_print_rec=True) + # print(response) + diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py new file mode 100644 index 00000000..2eeafa92 --- /dev/null +++ b/flask_app/analyst/rank.py @@ -0,0 +1,93 @@ +"""Class for the Specify Network Name API service.""" +from http import HTTPStatus +from werkzeug.exceptions import BadRequest + +from flask_app.common.s2n_type import APIService, AnalystOutput +from flask_app.analyst.base import _AnalystService + +from sppy.aws.aws_constants import PROJ_BUCKET +from sppy.tools.provider.spnet import SpNetAnalyses +from sppy.tools.s2n.utils import (combine_errinfo, get_traceback) + + +# ............................................................................. +class RankSvc(_AnalystService): + """Specify Network API service for retrieving taxonomic information.""" + SERVICE_TYPE = APIService.Rank + ORDERED_FIELDNAMES = [] + + # ............................................... + @classmethod + def rank_counts(cls, count_by, order=None, limit=10): + """Return occurrence and species counts for dataset/organization identifiers. + + Args: + count_by: URL parameter indicating rank datasets by counts of "species" or + "occurrence" . + order: URL parameter indicating whether to rank in "descending" or + "ascending" order. + limit: integer URL parameter specifying the number of ordered records to + return. + + full_output (flask_app.common.s2n_type.AnalystOutput): including records + as a list of lists (CSV) or dictionaries (JSON) of records + containing dataset_key, occurrence count, and species count. 
+ """ + if count_by is None: + return cls.get_endpoint() + + records = [] + try: + good_params, errinfo = cls._standardize_params( + count_by=count_by, order=order, limit=limit) + + except BadRequest as e: + errinfo = {"error": [e.description]} + + else: + # Query for ordered dataset counts + try: + records, errors = cls._get_ordered_counts( + good_params["count_by"], good_params["order"], + good_params["limit"]) + except Exception: + errors = {"error": [get_traceback()]} + + # Combine errors from success or failure + errinfo = combine_errinfo(errinfo, errors) + + # Assemble + full_out = AnalystOutput( + cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], + records=records, errors=errinfo) + + return full_out.response + + # ............................................... + @classmethod + def _get_ordered_counts(cls, count_by, order, limit): + records = [] + s3 = SpNetAnalyses(PROJ_BUCKET) + try: + records, errinfo = s3.rank_dataset_counts(count_by, order, limit) + + except Exception: + errinfo = {"error": [get_traceback()]} + + return records, errinfo + +# ............................................................................. +if __name__ == "__main__": + dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" + + svc = RankSvc() + response = svc.get_endpoint() + AnalystOutput.print_output(response, do_print_rec=True) + # print(response) + count_by = "species" + order = "descending" + limit = 5 + response = svc.rank_counts(count_by) + AnalystOutput.print_output(response, do_print_rec=True) + # print(response) + diff --git a/flask_app/analyst/routes.py b/flask_app/analyst/routes.py index f242e289..9bd338a7 100644 --- a/flask_app/analyst/routes.py +++ b/flask_app/analyst/routes.py @@ -1,8 +1,12 @@ """URL Routes for the Specify Network API services.""" from flask import Blueprint, Flask, render_template, request +import os from flask_app.analyst.count import CountSvc -from flask_app.common.constants import (STATIC_DIR, TEMPLATE_DIR) +from flask_app.analyst.rank import RankSvc +from flask_app.common.constants import ( + STATIC_DIR, TEMPLATE_DIR, SCHEMA_DIR, SCHEMA_ANALYST_FNAME) +from flask_app.common.s2n_type import APIEndpoint analyst_blueprint = Blueprint( "analyst", __name__, template_folder=TEMPLATE_DIR, static_folder=STATIC_DIR, @@ -18,37 +22,37 @@ def index(): return render_template("analyst.index.html") -# # ..................................................................................... -# @app.route("/api/v1/", methods=["GET"]) -# def analyst_status(): -# """Get services available from broker. -# -# Returns: -# dict: A dictionary of status information for the server. -# """ -# endpoints = APIEndpoint.get_analyst_endpoints() -# system_status = "In Development" -# return { -# "num_services": len(endpoints), -# "endpoints": endpoints, -# "status": system_status -# } -# +# ..................................................................................... +@app.route("/api/v1/", methods=["GET"]) +def analyst_status(): + """Get services available from broker. + + Returns: + dict: A dictionary of status information for the server. + """ + endpoints = APIEndpoint.get_analyst_endpoints() + system_status = "In Development" + return { + "num_services": len(endpoints), + "endpoints": endpoints, + "status": system_status + } + + +# .......................... +@app.route("/api/v1/schema") +def display_raw_schema(): + """Show the schema XML. + + Returns: + schema: the schema for the Specify Network. 
+ """ + fname = os.path.join(SCHEMA_DIR, SCHEMA_ANALYST_FNAME) + with open(fname, "r") as f: + schema = f.read() + return schema + -# # .......................... -# @app.route("/api/v1/schema") -# def display_raw_schema(): -# """Show the schema XML. -# -# Returns: -# schema: the schema for the Specify Network. -# """ -# fname = os.path.join(SCHEMA_DIR, SCHEMA_FNAME) -# with open(fname, "r") as f: -# schema = f.read() -# return schema -# -# # # .......................... # @app.route("/api/v1/swaggerui") # def swagger_ui(): @@ -69,15 +73,37 @@ def count_endpoint(): response: A flask_app.analyst API response object containing the count API response. """ - coll_arg = request.args.get("collection_id", default=None, type=str) - org_arg = request.args.get("organization_id", default=None, type=str) - if coll_arg is None and org_arg is None: + ds_arg = request.args.get("dataset_key", default=None, type=str) + if ds_arg is None: response = CountSvc.get_endpoint() else: - response = CountSvc.get_counts(coll_arg, org_arg) + response = CountSvc.get_counts(ds_arg) return response +# ..................................................................................... +@app.route("/api/v1/rank/") +def rank_endpoint(): + """Get the available counts. + + Returns: + response: A flask_app.analyst API response object containing the count + API response. + """ + count_by_arg = request.args.get("count_by", default=None, type=str) + order_arg = request.args.get("order", default=None, type=str) + limit_arg = request.args.get("limit", default=10, type=int) + print( + f"*** count_by_arg={count_by_arg}, order_arg={order_arg}, " + f"limit_arg={limit_arg} ***") + # if coll_arg is None and org_arg is None: + if count_by_arg is None: + response = RankSvc.get_endpoint() + else: + response = RankSvc.rank_counts( + count_by_arg, order=order_arg, limit=limit_arg) + return response + # # ..................................................................................... 
# @app.route("/api/v1/collection/", methods=["GET"]) # def collection_get(): diff --git a/flask_app/broker/badge.py b/flask_app/broker/badge.py index 9be666d6..d95bcff5 100644 --- a/flask_app/broker/badge.py +++ b/flask_app/broker/badge.py @@ -6,7 +6,7 @@ from flask_app.broker.constants import (ICON_CONTENT, ICON_DIR) from flask_app.common.s2n_type import APIService, S2nKey, ServiceProvider -from sppy.tools.s2n.utils import get_traceback +from sppy.tools.s2n.utils import combine_errinfo, get_traceback from flask_app.broker.base import _BrokerService @@ -82,34 +82,27 @@ def get_icon( try: good_params, errinfo = cls._standardize_params( provider=provider, icon_status=icon_status) - # Bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except Exception: - pass - except Exception: - # Unknown error - error_description = get_traceback() - raise BadRequest(error_description) + except BadRequest as e: + raise - icon_basename = cls._get_icon_filename( - good_params["provider"][0], good_params["icon_status"]) - icon_fname = os.path.join(app_path, ICON_DIR, icon_basename) + else: + icon_basename = cls._get_icon_filename( + good_params["provider"][0], good_params["icon_status"]) + icon_fname = os.path.join(app_path, ICON_DIR, icon_basename) + + if icon_fname is not None: + if stream: + return send_file( + icon_fname, mimetype=ICON_CONTENT, as_attachment=False) + else: + return send_file( + icon_fname, mimetype=ICON_CONTENT, as_attachment=True, + attachment_filename=icon_fname) - if icon_fname is not None: - if stream: - return send_file( - icon_fname, mimetype=ICON_CONTENT, as_attachment=False) else: - return send_file( - icon_fname, mimetype=ICON_CONTENT, as_attachment=True, - attachment_filename=icon_fname) - - else: - raise NotImplementedError( - f"Badge {icon_status} not implemented for provider {provider}") + raise NotImplementedError( + f"Badge {icon_status} not implemented for provider {provider}") # ............................................................................. diff --git a/flask_app/broker/base.py b/flask_app/broker/base.py index 5b5afb0c..6c47a8ad 100644 --- a/flask_app/broker/base.py +++ b/flask_app/broker/base.py @@ -2,26 +2,16 @@ from flask import Flask from werkzeug.exceptions import BadRequest, InternalServerError -import sppy.tools.s2n.utils as lmutil +from sppy.tools.s2n.utils import add_errinfo, combine_errinfo, get_traceback +from flask_app.common.base import _SpecifyNetworkService from flask_app.common.s2n_type import ( APIEndpoint, APIService, BrokerOutput, get_host_url, S2nKey, ServiceProvider) from sppy.tools.provider.gbif import GbifAPI from sppy.tools.provider.itis import ItisAPI -app = Flask(__name__) - # ............................................................................. -@app.errorhandler(BadRequest) -def handle_bad_request(e): - return f"Bad request: {e}" - -@app.errorhandler(InternalServerError) -def handle_bad_response(e): - return f"Internal Server Error: {e}" - -# ............................................................................. -class _BrokerService: +class _BrokerService(_SpecifyNetworkService): """Base S-to-the-N service, handles parameter names and acceptable values.""" # overridden by subclasses SERVICE_TYPE = APIService.BrokerRoot @@ -84,59 +74,6 @@ def get_providers(cls, filter_params=None): provnames = cls._order_providers(provnames) return provnames - # ............................................................................. 
- @classmethod - def _get_valid_requested_params(cls, user_params_string, valid_params): - """Return valid and invalid options for parameters that accept >1 values. - - Args: - user_params_string: user-requested parameters as a string. - valid_params: valid parameter values - - Returns: - valid_requested_params: list of valid params from the provided query string - invalid_params: list of invalid params from the provided query string - - Note: - For the badge service, exactly one provider is required. For all other - services, multiple providers are accepted, and None indicates to query all - valid providers. - """ - valid_requested_params = invalid_params = [] - - if user_params_string: - tmplst = user_params_string.split(",") - user_params = {tp.lower().strip() for tp in tmplst} - - valid_requested_params = set() - invalid_params = set() - # valid_requested_providers, invalid_providers = - # cls.get_multivalue_options(user_provs, valid_providers) - for param in user_params: - if param in valid_params: - valid_requested_params.add(param) - else: - invalid_params.add(param) - - invalid_params = list(invalid_params) - if valid_requested_params: - valid_requested_params = list(valid_requested_params) - else: - valid_requested_params = [] - - return valid_requested_params, invalid_params - - # ............................................................................. - @classmethod - def endpoint(cls): - """Return the URL endpoint for this class. - - Returns: - URL endpoint for the service - """ - endpoint = f"{APIEndpoint.Root}/{cls.SERVICE_TYPE['endpoint']}" - return endpoint - # ............................................... @classmethod def get_endpoint(cls, **kwargs): @@ -236,154 +173,6 @@ def match_name_with_itis(self, namestr): pass return namestr - # ............................................... - @classmethod - def _fix_type_new(cls, key, provided_val): - """Modify a parameter value to a valid type and value. - - Args: - key: parameter key - provided_val: user-provided parameter value - - Returns: - usr_val: a valid value for the parameter - valid_options: list of valid options (for error message) - - Note: - Corrections: - * cast to correct type - * validate with any options - * if value is invalid (type or value), return the default. - """ - valid_options = None - if provided_val is None: - return None - # all strings are lower case - try: - provided_val = provided_val.lower() - except Exception: - pass - - param_meta = cls.SERVICE_TYPE["params"][key] - # First see if restricted to options - default_val = param_meta["default"] - type_val = param_meta["type"] - # If restricted options, check - try: - options = param_meta["options"] - except KeyError: - options = None - else: - # Invalid option returns default value - if provided_val in options: - usr_val = provided_val - else: - valid_options = options - usr_val = default_val - - # If not restricted to options - if options is None: - # Cast values to correct type. 
Failed conversions return default value - if isinstance(type_val, str) and not options: - usr_val = str(provided_val) - - elif isinstance(type_val, float): - try: - usr_val = float(provided_val) - except ValueError: - usr_val = default_val - - # Boolean also tests as int, so try boolean first - elif isinstance(type_val, bool): - if provided_val in (0, "0", "n", "no", "f", "false"): - usr_val = False - elif provided_val in (1, "1", "y", "yes", "t", "true"): - usr_val = True - else: - valid_options = (True, False) - usr_val = default_val - - elif isinstance(type_val, int): - try: - usr_val = int(provided_val) - except ValueError: - usr_val = default_val - - else: - usr_val = provided_val - - return usr_val, valid_options - - # ............................................... - @classmethod - def _process_params(cls, user_kwargs=None): - """Modify all user provided keys to lowercase and values to correct types. - - Args: - user_kwargs: dictionary of keywords and values sent by the user for - the current service. - - Returns: - good_params: dictionary of valid parameters and values - errinfo: dictionary of errors for different error levels. - - Note: - A list of valid values for a keyword can include None as a default - if user-provided value is invalid - Todo: - Do we need not_in_valid_options for error message? - """ - good_params = {} - errinfo = {} - - # Correct all parameter keys/values present - for key, param_meta in cls.SERVICE_TYPE["params"].items(): - val = user_kwargs[key] - # Done in calling function - if key == "provider": - pass - - # Do not edit namestr, maintain capitalization - elif key == "namestr": - good_params["namestr"] = val - - # Require one valid icon_status - elif key == "icon_status": - valid_stat = param_meta["options"] - if val is None: - errinfo = lmutil.add_errinfo( - errinfo, "error", - f"Parameter {key} containing one of {valid_stat} options is " - f"required") - elif val not in valid_stat: - errinfo = lmutil.add_errinfo( - errinfo, "error", - f"Value {val} for parameter {key} not in valid options " - f"{valid_stat}") - else: - good_params[key] = val - - elif val is not None: - usr_val, valid_options = cls._fix_type_new(key, val) - if valid_options is not None and val not in valid_options: - errinfo = lmutil.add_errinfo( - errinfo, "error", - f"Value {val} for parameter {key} is not in valid options " - f"{param_meta['options']}") - good_params[key] = None - else: - good_params[key] = usr_val - - # Fill in defaults for missing parameters - for key in cls.SERVICE_TYPE["params"]: - param_meta = cls.SERVICE_TYPE["params"][key] - try: - _ = good_params[key] - except KeyError: - good_params[key] = param_meta["default"] - - return good_params, errinfo - # ............................................... @classmethod def _get_providers_from_string(cls, usr_req_providers, filter_params=None): @@ -404,14 +193,14 @@ def _get_providers_from_string(cls, usr_req_providers, filter_params=None): providers = valid_requested_providers[0] else: providers = None - errinfo = lmutil.add_errinfo( + errinfo = add_errinfo( errinfo, "error", f"Parameter provider containing exactly one of {valid_providers} " f"options is required") if invalid_providers: for ip in invalid_providers: - errinfo = lmutil.add_errinfo( + errinfo = add_errinfo( errinfo, "warning", f"Value {ip} for parameter provider not in valid options " f"{valid_providers}") @@ -449,6 +238,9 @@ def _standardize_params( a dictionary containing keys and properly formatted values for the user specified parameters. 
+ Raises: + BadRequest on invalid query parameters + Note: filter_params is present to distinguish between providers for occ service by occurrence_id or by dataset_id. @@ -465,22 +257,27 @@ def _standardize_params( "gbif_dataset_key": gbif_dataset_key, "count_only": count_only, "url": url, - # "bbox": bbox, - # "exceptions": exceptions, - # "height": height, - # "layers": layers, - # "request": request, - # "format": frmat, - # "srs": srs, - # "width": width, "icon_status": icon_status} - providers, prov_errinfo = cls._get_providers_from_string( + providers, errinfo = cls._get_providers_from_string( provider, filter_params=filter_params) - usr_params, errinfo = cls._process_params(user_kwargs) + + try: + usr_params, param_errinfo = cls._process_params(user_kwargs) + except Exception: + error_description = get_traceback() + raise BadRequest(error_description) + # consolidate parameters and errors usr_params["provider"] = providers - errinfo = lmutil.combine_errinfo(errinfo, prov_errinfo) + errinfo = combine_errinfo(errinfo, param_errinfo) + + # errinfo["error"] indicates bad parameters, throws exception + try: + error_description = "; ".join(errinfo["error"]) + raise BadRequest(error_description) + except KeyError: + pass # Remove gbif_parse and itis_match flags gbif_parse = itis_match = False @@ -492,24 +289,14 @@ def _standardize_params( itis_match = usr_params.pop("itis_match") except Exception: pass + # Replace namestr with GBIF-parsed namestr if namestr and (gbif_parse or itis_match): usr_params["namestr"] = cls.parse_name_with_gbif(namestr) return usr_params, errinfo - # .......................... - @staticmethod - def OPTIONS(): - """Common options request for all services (needed for CORS).""" - return - # ............................................................................. if __name__ == "__main__": - kwarg_defaults = { - "count_only": False, - "width": 600, - "height": 300, - "type": [], - } + pass diff --git a/flask_app/broker/name.py b/flask_app/broker/name.py index 4b1e2e63..cecde97c 100644 --- a/flask_app/broker/name.py +++ b/flask_app/broker/name.py @@ -1,10 +1,9 @@ """Class for the Specify Network Name API service.""" -from werkzeug.exceptions import (BadRequest, InternalServerError) +from werkzeug.exceptions import BadRequest from flask_app.broker.base import _BrokerService from flask_app.common.s2n_type import ( APIEndpoint, APIService, BrokerOutput, BrokerSchema, S2nKey, ServiceProvider) -from flask_app.common.util import print_broker_output from sppy.tools.provider.gbif import GbifAPI from sppy.tools.provider.itis import ItisAPI @@ -156,35 +155,24 @@ def get_name_records( good_params, errinfo = cls._standardize_params( namestr=namestr, provider=provider, is_accepted=is_accepted, gbif_parse=gbif_parse, gbif_count=gbif_count, kingdom=kingdom) - # Bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except KeyError: - pass - except Exception: - error_description = get_traceback() - raise BadRequest(error_description) - try: - # Do Query! 
- output = cls._get_records( - good_params["namestr"], good_params["provider"], - good_params["is_accepted"], good_params["gbif_count"], - good_params["kingdom"]) + except BadRequest as e: + full_output = cls._get_badquery_output(e.description) - # Add message on invalid parameters to output + else: try: - for err in errinfo["warning"]: - output.append_error("warning", err) - except KeyError: - pass + # Do Query!, returns BrokerOutput + full_output = cls._get_records( + good_params["namestr"], good_params["provider"], + good_params["is_accepted"], good_params["gbif_count"], + good_params["kingdom"]) + except Exception: + full_output = cls._get_badquery_output(get_traceback()) - except Exception: - error_description = get_traceback() - raise InternalServerError(error_description) + # Combine with errors from parameters + full_output.combine_errors(errinfo) - return output.response + return full_output.response # ............................................................................. @@ -206,7 +194,7 @@ def get_name_records( svc = NameSvc() for namestr in test_names: - out = svc.get_name_records( + response = svc.get_name_records( namestr=namestr, provider=None, is_accepted=False, gbif_parse=True, gbif_count=True, kingdom=None) - print_broker_output(out, do_print_rec=True) + BrokerOutput.print_output(response, do_print_rec=True) diff --git a/flask_app/broker/occ.py b/flask_app/broker/occ.py index 02c5fc20..3a6fb450 100644 --- a/flask_app/broker/occ.py +++ b/flask_app/broker/occ.py @@ -4,7 +4,6 @@ from flask_app.broker.base import _BrokerService from flask_app.common.s2n_type import ( APIEndpoint, APIService, BrokerOutput, BrokerSchema, S2nKey, ServiceProvider) -from flask_app.common.util import print_broker_output from sppy.tools.provider.gbif import GbifAPI from sppy.tools.provider.idigbio import IdigbioAPI @@ -144,54 +143,36 @@ def get_occurrence_records( a count and records kwargs: any additional keyword arguments are ignored - Raises: - BadRequest: on invalid query parameters. - BadRequest: on unknown exception parsing parameters. - InternalServerError: on unknown exception when executing request - Returns: - a flask_app.broker.s2n_type.BrokerOutput object with optional records as a - list of dictionaries of records corresponding to specimen occurrences in - the provider database. + full_output (flask_app.common.s2n_type.BrokerOutput): including records + as a list of dictionaries of records corresponding to specimen + occurrences in the provider database. """ if occid is None and gbif_dataset_key is None: return cls.get_endpoint() else: - # No filter_params defined for Name service yet try: good_params, errinfo = cls._standardize_params( occid=occid, provider=provider, gbif_dataset_key=gbif_dataset_key, count_only=count_only) - # Bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except KeyError: - pass - - except Exception: - error_description = get_traceback() - raise BadRequest(error_description) - # Do Query! 
- try: - output = cls._get_records( - good_params["occid"], good_params["provider"], - good_params["count_only"], - gbif_dataset_key=good_params["gbif_dataset_key"]) + except BadRequest as e: + full_output = cls._get_badquery_output(e.description) - # Add message on invalid parameters to output + else: try: - for err in errinfo["warning"]: - output.append_error("warning", err) - except KeyError: - pass + # Do Query!, returns BrokerOutput + full_output = cls._get_records( + good_params["occid"], good_params["provider"], + good_params["count_only"], + gbif_dataset_key=good_params["gbif_dataset_key"]) + except Exception: + full_output = cls._get_badquery_output(get_traceback()) - except Exception: - error_description = get_traceback() - raise InternalServerError(error_description) + # Combine with errors from parameters + full_output.combine_errors(errinfo) - return output.response + return full_output.response # ............................................................................. @@ -219,11 +200,11 @@ def get_occurrence_records( svc = OccurrenceSvc() out = svc.get_endpoint() - out = svc.get_occurrence_records(occid="a7156437-55ec-4c6f-89de-938f9361753d") + response = svc.get_occurrence_records(occid="a7156437-55ec-4c6f-89de-938f9361753d") - print_broker_output(out, do_print_rec=True) + BrokerOutput.print_output(response, do_print_rec=True) # for occid in occids: - # out = svc.get_occurrence_records(occid=occid, provider=None, count_only=False) - # outputs = out["records"] - # print_broker_output(out, do_print_rec=True) + # response = svc.get_occurrence_records(occid=occid, provider=None, count_only=False) + # recs = response["records"] + # BrokerOutput.print_output(response, do_print_rec=True) diff --git a/flask_app/broker/routes.py b/flask_app/broker/routes.py index cbaabfcf..347e93f4 100644 --- a/flask_app/broker/routes.py +++ b/flask_app/broker/routes.py @@ -1,10 +1,10 @@ """URL Routes for the Specify Network API services.""" -import os from flask import Blueprint, Flask, render_template, request +import os # from flask_app.application import create_app from flask_app.common.constants import ( - TEMPLATE_DIR, STATIC_DIR, SCHEMA_DIR, SCHEMA_FNAME + TEMPLATE_DIR, STATIC_DIR, SCHEMA_DIR, SCHEMA_BROKER_FNAME ) from flask_app.common.s2n_type import APIEndpoint @@ -54,7 +54,7 @@ def display_raw_schema(): Returns: schema: the schema for the Specify Network. """ - fname = os.path.join(SCHEMA_DIR, SCHEMA_FNAME) + fname = os.path.join(SCHEMA_DIR, SCHEMA_BROKER_FNAME) with open(fname, "r") as f: schema = f.read() return schema @@ -68,7 +68,7 @@ def swagger_ui(): Returns: a webpage UI of the Specify Network schema. """ - return render_template("swagger_ui.html") + return render_template("swagger_ui.broker.html") # ..................................................................................... diff --git a/flask_app/common/base.py b/flask_app/common/base.py new file mode 100644 index 00000000..0d599930 --- /dev/null +++ b/flask_app/common/base.py @@ -0,0 +1,269 @@ +"""Parent Class for the Specify Network API services.""" +from flask import Flask +from werkzeug.exceptions import BadRequest, InternalServerError + +from sppy.tools.s2n.utils import add_errinfo +from flask_app.common.s2n_type import APIEndpoint + +app = Flask(__name__) + + +# ............................................................................. 
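For orientation, this refactor consolidates the parameter-handling code that was duplicated in flask_app/broker/base.py and flask_app/analyst/base.py into the new shared module. A minimal sketch of the resulting class relationships (class names and docstring phrasing taken from this diff; method bodies elided):

# Illustrative sketch only; the real classes live in flask_app/common/base.py,
# flask_app/broker/base.py and flask_app/analyst/base.py.
class _SpecifyNetworkService:
    """Base S-to-the-N service, handles parameter names and acceptable values."""
    SERVICE_TYPE = None  # overridden by subclasses

    @classmethod
    def _process_params(cls, user_kwargs=None):
        """Lowercase keys, cast values, and collect per-level error info."""
        ...


class _BrokerService(_SpecifyNetworkService):
    """Adds provider resolution plus GBIF/ITIS name handling."""


class _AnalystService(_SpecifyNetworkService):
    """Adds dataset / publishing-organization parameter standardization."""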
+@app.errorhandler(BadRequest) +def handle_bad_request(e): + return f"Bad request: {e}" + +@app.errorhandler(InternalServerError) +def handle_bad_response(e): + return f"Internal Server Error: {e}" + +# ............................................................................. +class _SpecifyNetworkService: + """Base S-to-the-N service, handles parameter names and acceptable values.""" + # overridden by subclasses + SERVICE_TYPE = None + + + # ............................................................................. + @classmethod + def _get_valid_requested_params(cls, user_params_string, valid_params): + """Return valid and invalid options for parameters that accept >1 values. + + Args: + user_params_string: user-requested parameters as a string. + valid_params: valid parameter values + + Returns: + valid_requested_params: list of valid params from the provided query string + invalid_params: list of invalid params from the provided query string + + Note: + For the badge service, exactly one provider is required. For all other + services, multiple providers are accepted, and None indicates to query all + valid providers. + """ + valid_requested_params = invalid_params = [] + + if user_params_string: + tmplst = user_params_string.split(",") + user_params = {tp.lower().strip() for tp in tmplst} + + valid_requested_params = set() + invalid_params = set() + # valid_requested_providers, invalid_providers = + # cls.get_multivalue_options(user_provs, valid_providers) + for param in user_params: + if param in valid_params: + valid_requested_params.add(param) + else: + invalid_params.add(param) + + invalid_params = list(invalid_params) + if valid_requested_params: + valid_requested_params = list(valid_requested_params) + else: + valid_requested_params = [] + + return valid_requested_params, invalid_params + + # ............................................................................. + @classmethod + def endpoint(cls): + """Return the URL endpoint for this class. + + Returns: + URL endpoint for the service + """ + endpoint = f"{APIEndpoint.Root}/{cls.SERVICE_TYPE['endpoint']}" + return endpoint + + # ............................................... + @classmethod + def _fix_type_new(cls, key, provided_val): + """Modify a parameter value to a valid type and value. + + Args: + key: parameter key + provided_val: user-provided parameter value + + Returns: + usr_val: a valid value for the parameter + valid_options: list of valid options (for error message) + + Note: + Corrections: + * cast to correct type + * validate with any options + * if value is invalid (type or value), return the default. + """ + valid_options = None + if provided_val is None: + return None + # all strings are lower case + try: + provided_val = provided_val.lower() + except Exception: + pass + + param_meta = cls.SERVICE_TYPE["params"][key] + # First see if restricted to options + default_val = param_meta["default"] + type_val = param_meta["type"] + # If restricted options, check + try: + options = param_meta["options"] + except KeyError: + options = None + else: + # Invalid option returns default value + if provided_val in options: + usr_val = provided_val + else: + valid_options = options + usr_val = default_val + + # If not restricted to options + if options is None: + # Cast values to correct type. 
Failed conversions return default value + if isinstance(type_val, str) and not options: + usr_val = str(provided_val) + + # Boolean also tests as int, so try boolean first + elif isinstance(type_val, bool): + if provided_val in (0, "0", "n", "no", "f", "false"): + usr_val = False + elif provided_val in (1, "1", "y", "yes", "t", "true"): + usr_val = True + else: + valid_options = (True, False) + usr_val = default_val + else: + usr_val = cls._test_numbers(provided_val, param_meta) + + return usr_val, valid_options + + + # ............................................... + @classmethod + def _test_numbers(cls, provided_val, param_meta): + default_val = param_meta["default"] + type_val = param_meta["type"] + # If restricted numeric values, check + try: + min_val = param_meta["min"] + except KeyError: + min_val = None + # If restricted numeric values, check + try: + max_val = param_meta["max"] + except KeyError: + max_val = None + + if isinstance(type_val, float): + try: + usr_val = float(provided_val) + except ValueError: + usr_val = default_val + else: + if min_val and usr_val < min_val: + usr_val = min_val + if max_val and usr_val > max_val: + usr_val = max_val + + elif isinstance(type_val, int): + try: + usr_val = int(provided_val) + except ValueError: + usr_val = default_val + else: + if min_val and usr_val < min_val: + usr_val = min_val + if max_val and usr_val > max_val: + usr_val = max_val + + else: + usr_val = provided_val + + return usr_val + + # ............................................... + @classmethod + def _process_params(cls, user_kwargs=None): + """Modify all user provided keys to lowercase and values to correct types. + + Args: + user_kwargs: dictionary of keywords and values sent by the user for + the current service. + + Returns: + good_params: dictionary of valid parameters and values + errinfo: dictionary of errors for different error levels. + + Note: + A list of valid values for a keyword can include None as a default + if user-provided value is invalid + Todo: + Do we need not_in_valid_options for error message? + """ + good_params = {} + errinfo = {} + + # Correct all parameter keys/values present + for key, param_meta in cls.SERVICE_TYPE["params"].items(): + val = user_kwargs[key] + # Done in calling function + if key == "provider": + pass + + # Do not edit namestr, maintain capitalization + elif key == "namestr": + good_params["namestr"] = val + + # Require one valid icon_status + elif key == "icon_status": + valid_stat = param_meta["options"] + if val is None: + errinfo = add_errinfo( + errinfo, "error", + f"Parameter {key} containing one of {valid_stat} options is " + f"required") + elif val not in valid_stat: + errinfo = add_errinfo( + errinfo, "error", + f"Value {val} for parameter {key} not in valid options " + f"{valid_stat}") + else: + good_params[key] = val + + elif val is not None: + usr_val, valid_options = cls._fix_type_new(key, val) + if valid_options is not None and val not in valid_options: + errinfo = add_errinfo( + errinfo, "error", + f"Value {val} for parameter {key} is not in valid options " + f"{param_meta['options']}") + good_params[key] = None + else: + good_params[key] = usr_val + + # Fill in defaults for missing parameters + for key in cls.SERVICE_TYPE["params"]: + param_meta = cls.SERVICE_TYPE["params"][key] + try: + _ = good_params[key] + except KeyError: + good_params[key] = param_meta["default"] + + return good_params, errinfo + + + # .......................... 
+ @staticmethod + def OPTIONS(): + """Common options request for all services (needed for CORS).""" + return + + +# ............................................................................. +if __name__ == "__main__": + pass diff --git a/flask_app/common/constants.py b/flask_app/common/constants.py index 3e50c197..1f782357 100644 --- a/flask_app/common/constants.py +++ b/flask_app/common/constants.py @@ -8,4 +8,5 @@ SCHEMA_DIR = f"{STATIC_DIR}/schema" TEMPLATE_DIR = "../templates" -SCHEMA_FNAME = "open_api.yaml" +SCHEMA_ANALYST_FNAME = "open_api.analyst.yaml" +SCHEMA_BROKER_FNAME = "open_api.broker.yaml" diff --git a/flask_app/common/s2n_type.py b/flask_app/common/s2n_type.py index 2648ba3e..f5e6721e 100644 --- a/flask_app/common/s2n_type.py +++ b/flask_app/common/s2n_type.py @@ -81,11 +81,12 @@ class APIEndpoint: Occurrence = "occ" Frontend = "frontend" Count = "count" + Rank = "rank" @classmethod def Resources(cls): return { - cls.Analyst: [cls.Count], + cls.Analyst: [cls.Count, cls.Rank], cls.Broker: [ cls.Badge, @@ -163,18 +164,43 @@ class APIService: "name": APIEndpoint.Count, "endpoint": f"{APIEndpoint.Root}/{APIEndpoint.Count}", "params": { - "collection_id": { + "dataset_key": { "type": "", - "description": "Collection identifier", + "description": "GBIF Dataset Key", "default": None }, - "organization_id": { + "pub_org_key": { "type": "", - "description": "Organization identifier", + "description": "GBIF Publishing Organization Key", "default": None - } + }, }, - "description": "Return record count for the given collection or organization.", + "description": + "Return occurrence and species counts for the given dataset or " + "publishing organization.", + S2nKey.RECORD_FORMAT: "" + } + # Rankings + Rank = { + "name": APIEndpoint.Rank, + "endpoint": f"{APIEndpoint.Root}/{APIEndpoint.Rank}", + "params": { + "count_by": { + "type": "", + "options": ["occurrence", "species"], + "default": None + }, + "order": { + "type": "", + "options": ["ascending", "descending"], + "default": "descending" + }, + "limit": {"type": 2, "default": 10, "min": 1, "max": 500}, + }, + "description": + "Return an ordered list of datasets with occurrence and species counts " + "ranked by occurrence or species counts for the top X (descending) " + "or bottom X (ascending) datasets", S2nKey.RECORD_FORMAT: "" } # Taxonomic Resolution @@ -819,6 +845,19 @@ def append_error(self, error_type, error_desc): except KeyError: self._response[S2nKey.ERRORS][error_type] = [error_desc] + # ............................................... + def combine_errors(self, errinfo): + """Combine a dictionary of errors to the errors in a S2nOutput query response. + + Args: + errinfo: dictionary of errors, with error level, and list of descriptions. + """ + for err_type, err_desc in errinfo.items(): + try: + self._response[S2nKey.ERRORS][err_type].append(err_desc) + except KeyError: + self._response[S2nKey.ERRORS][err_type] = [err_desc] + # ............................................... @property def response(self): @@ -975,13 +1014,65 @@ def format_records(self, ordered_fieldnames): ordered_recs.append(ordrec) self._response[S2nKey.RECORDS] = ordered_recs + # ............................................................................. 
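To make the new Rank parameter metadata above concrete, the walk-through below shows how one raw query would be normalized by _process_params/_fix_type_new/_test_numbers; the raw values are invented for illustration, and the expected results follow from the options, default, min, and max entries in this diff.

# Hypothetical normalization walk-through (illustrative only).
raw_query = {"count_by": "SPECIES", "order": "sideways", "limit": "900"}
# Expected result after parameter processing with the Rank metadata:
#   count_by -> "species"   (lower-cased, member of the "options" list)
#   order    -> None        (invalid option is reported in errinfo and nulled)
#   limit    -> 500         ("900" cast to int, then clamped to "max")
expected = {"count_by": "species", "order": None, "limit": 500}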
+ @classmethod + def _print_sub_output(cls, oneelt, do_print_rec): + print("* One record of Specify Network Outputs *") + for name, attelt in oneelt.items(): + try: + if name == "records": + print(" records") + if do_print_rec is False: + print(f" {name}: {len(attelt)} returned records") + else: + for rec in attelt: + print(" record") + for k, v in rec.items(): + print(" {}: {}".format(k, v)) + else: + print(" {}: {}".format(name, attelt)) + except Exception: + pass + + # .................................... + @classmethod + def print_output(cls, response_dict, do_print_rec=False): + """Print a formatted string of the elements in an S2nOutput query response. + + Args: + response_dict: flask_app.broker.s2n_type.S2nOutput._response dictionary + do_print_rec: True to print each record in the response. + + TODO: move to a class method + """ + print("*** Broker output ***") + for name, attelt in response_dict.items(): + try: + if name == "records": + print("records: ") + for respdict in attelt: + cls._print_sub_output(respdict, do_print_rec) + else: + print(f"{name}: {attelt}") + except Exception: + pass + # outelts = set(response_dict.keys()) + # missing = S2nKey.broker_response_keys().difference(outelts) + # extras = outelts.difference(S2nKey.broker_response_keys()) + # if missing: + # print(f"Missing elements: {missing}") + # if extras: + # print(f"Extra elements: {extras}") + print("") + # ............................................................................. class AnalystOutput: """Response type for a Specify Network Analyst query.""" service: str description: str = "" - records: typing.List[dict] = [] + # records: typing.List[dict] = [] + records: typing.List = [] errors: dict = {} # ............................................... @@ -991,7 +1082,7 @@ def __init__(self, service, description=None, records=None, errors=None): Args: service: API Service this object is responding to. description: Description of the computation in this response. - records: Records in this response. + records: Records (lists or dictionaries) in this response. errors: Errors encountered when generating this response. """ if errors is None: @@ -1018,6 +1109,27 @@ def response(self): """ return self._response + # .................................... + @classmethod + def print_output(cls, response_dict, do_print_rec=False): + """Print a formatted string of the elements in an S2nOutput query response. + + Args: + response_dict: flask_app.broker.s2n_type.S2nOutput._response dictionary + do_print_rec: True to print each record in the response. + """ + print("*** Analyst output ***") + for name, attelt in response_dict.items(): + try: + if name == "records" and do_print_rec: + print("records: ") + for rec in attelt: + print(rec) + else: + print(f"{name}: {attelt}") + except Exception: + pass + # ............................................................................. class ServiceProvider: diff --git a/flask_app/common/util.py b/flask_app/common/util.py index 3b5abd13..df3bd5ff 100644 --- a/flask_app/common/util.py +++ b/flask_app/common/util.py @@ -15,77 +15,3 @@ def get_host_url(): if host_url.endswith("/"): host_url = host_url[:-1] return host_url - - -# ............................................................................. 
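Since the module-level print helpers below are removed in favor of the class methods added above, a small usage sketch of the class-based form may help; the constructor and print_output signatures come from this diff, while the record field names are invented for illustration.

# Hypothetical example of assembling and printing an analyst response.
out = AnalystOutput(
    "count",
    description="Occurrence and species counts for one GBIF dataset",
    records=[{"datasetkey": "0000e36f-d0e9-46b0-aa23-cc1980f00515",
              "occ_count": 12345, "species_count": 678}],  # field names assumed
    errors={},
)
AnalystOutput.print_output(out.response, do_print_rec=True)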
-def _print_sub_output(oneelt, do_print_rec): - print("* One record of Specify Network Outputs *") - for name, attelt in oneelt.items(): - try: - if name == "records": - print(" records") - if do_print_rec is False: - print(f" {name}: {len(attelt)} returned records") - else: - for rec in attelt: - print(" record") - for k, v in rec.items(): - print(" {}: {}".format(k, v)) - else: - print(" {}: {}".format(name, attelt)) - except Exception: - pass - - -# .................................... -def print_broker_output(response_dict, do_print_rec=False): - """Print a formatted string of the elements in an S2nOutput query response. - - Args: - response_dict: flask_app.broker.s2n_type.S2nOutput object - do_print_rec: True to print each record in the response. - - TODO: move to a class method - """ - print("*** Broker output ***") - for name, attelt in response_dict.items(): - try: - if name == "records": - print(f"{name}: ") - for respdict in attelt: - _print_sub_output(respdict, do_print_rec) - else: - print(f"{name}: {attelt}") - except Exception: - pass - # outelts = set(response_dict.keys()) - # missing = S2nKey.broker_response_keys().difference(outelts) - # extras = outelts.difference(S2nKey.broker_response_keys()) - # if missing: - # print(f"Missing elements: {missing}") - # if extras: - # print(f"Extra elements: {extras}") - print("") - - -# .................................... -def print_analyst_output(response_dict, do_print_rec=False): - """Print a formatted string of the elements in an S2nOutput query response. - - Args: - response_dict: flask_app.broker.s2n_type.S2nOutput object - do_print_rec: True to print each record in the response. - - TODO: move to a class method - """ - print("*** Analyst output ***") - for name, attelt in response_dict.items(): - try: - if name == "records": - print(f"{name}: ") - for respdict in attelt: - _print_sub_output(respdict, do_print_rec) - else: - print(f"{name}: {attelt}") - except Exception: - pass diff --git a/flask_app/templates/swagger_ui.html b/flask_app/templates/swagger_ui.broker.html similarity index 90% rename from flask_app/templates/swagger_ui.html rename to flask_app/templates/swagger_ui.broker.html index a6f4a5f3..1beb4832 100644 --- a/flask_app/templates/swagger_ui.html +++ b/flask_app/templates/swagger_ui.broker.html @@ -18,7 +18,7 @@ diff --git a/requirements.txt b/requirements.txt index 1af53c6e..8a96d76e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,8 +5,8 @@ pykew>=0.1.3 gunicorn==20.1.0 rtree>=1.0.0 awscli -boto3 -pandas -pyarrow +boto3>=1.34.60 s3fs -ggshield \ No newline at end of file +fastparquet +sqlalchemy +pandas diff --git a/sphinx/about/install_run_notes.rst b/sphinx/about/install_run_notes.rst index e5d5bb00..e8106037 100644 --- a/sphinx/about/install_run_notes.rst +++ b/sphinx/about/install_run_notes.rst @@ -6,157 +6,34 @@ Contains * Specify Network API services - * Tools/classes for broker, including + * Tools/classes for broker, including - * Flask application for individual API endpoints and frontend - * classes for Provider API connectors - * standardized API service output (s2n) + * Flask application for individual API endpoints and frontend + * classes for Provider API connectors + * standardized API service output (s2n) - * Tools/classes for analyst, including + * Tools/classes for analyst, including - * AWS scripts and - * Classes for use on EC2 or other AWS resources - * geotools for geospatial intersection/annotations - * aggregation, summary tools for writing tabular summaries - * + * AWS scripts 
and + * Classes for use on EC2 or other AWS resources + + * geotools for geospatial intersection/annotations + * aggregation, summary tools for writing tabular summaries Deployment =================================== -SSL ------------------------------------ - -Local self-signed certificates -......................................... -To run the containers, generate `fullchain.pem` and `privkey.pem` (certificate -and the private key) using Let's Encrypt and link these files in `./sp_network/config/`. - -While in development, generate self-signed certificates then link them in -~/git/sp_network/config/ directory for this project: - -```zsh -$ mkdir ~/certificates - -openssl req \ - -x509 -sha256 -nodes -newkey rsa:2048 -days 365 \ - -keyout ~/certificates/privkey.pem \ - -out ~/certificates/fullchain.pem - -$ cd ~/git/sp_network/config -$ ln -s ~/certificates/privkey.pem -$ ln -s ~/certificates/fullchain.pem -``` - -To run either the production or the development containers with HTTPS -support, generate `fullchain.pem` and `privkey.pem` (certificate and the private -key) using Let's Encrypt, link these files in the `./config/` directory. -Full instructions in the docs/aws-steps.rst page, under `Set up TLS/SSL` - -Modify the `FQDN` environment variable in `.env.conf` as needed. - -TLS/SSL using Certificate Authority (CA) -.................................................. - -* Make sure that DNS has propogated for domain for SSL -* Stop apache service -* request a certificate for the domain - -```commandline -ubuntu@ip-172-31-86-62:~$ sudo systemctl stop apache2 -ubuntu@ip-172-31-86-62:~$ sudo certbot certonly -v -Saving debug log to /var/log/letsencrypt/letsencrypt.log - -How would you like to authenticate with the ACME CA? -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -1: Spin up a temporary webserver (standalone) -2: Place files in webroot directory (webroot) -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Select the appropriate number [1-2] then [enter] (press 'c' to cancel): 1 -Plugins selected: Authenticator standalone, Installer None -Please enter the domain name(s) you would like on your certificate (comma and/or -space separated) (Enter 'c' to cancel): broker-dev.spcoco.org analyst-dev.spcoco.org -Requesting a certificate for broker-dev.spcoco.org and analyst-dev.spcoco.org -Performing the following challenges: -http-01 challenge for broker-dev.spcoco.org -Waiting for verification... -Cleaning up challenges - -Successfully received certificate. -Certificate is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/fullchain.pem -Key is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/privkey.pem -This certificate expires on 2023-10-18. -These files will be updated when the certificate renews. -Certbot has set up a scheduled task to automatically renew this certificate in the background. 
- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -If you like Certbot, please consider supporting our work by: - * Donating to ISRG / Let's Encrypt: https://letsencrypt.org/donate - * Donating to EFF: https://eff.org/donate-le -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -ubuntu@ip-172-31-86-62:~$ -``` - -* as superuser, link the newly created fullchain.pem and privkey.pem files from the - letsencrypt live to the project/config directory -* change the owner so that they can be used in Docker containers - -```commandline -$ sudo su - -# cp -p /etc/letsencrypt/live/dev.spcoco.org/* /home/ubuntu/certificates/ -# chown ubuntu:ubuntu /home/ubuntu/certificates/* -# exit -$ cd ~/git/sp_network/config -$ ln -s ~/certificates/fullchain.pem -$ ln -s ~/certificates/privkey.pem -``` - -Renew Certbot SSL certificates -......................................... - -SSL certificates are served from the instance (AWS EC2), and need port 80 to be renewed. -These are administered by Letsencrypt using Certbot and are only valid for 90 days at -a time. When it is time for a renewal (approx every 60 days), bring the docker -containers down. Renew the certificates, then bring the containers up again. - -Amazon EC2 containers do not need apache running, certbot runs its own temp web server. - -Test with https://broker.spcoco.org/api/v1/frontend/?occid=01493b05-4310-4f28-9d81-ad20860311f3 - -```commandline -$ sudo certbot certificates -$ sudo docker compose stop -$ sudo su - -# certbot renew -# cp -p /etc/letsencrypt/live/dev.spcoco.org/* /home/ubuntu/certificates/ -# chown ubuntu:ubuntu /home/ubuntu/certificates/* -# exit -$ ls -lahtr ~/git/sp_network/config - -$ sudo docker system prune --all --volumes -$ sudo docker compose up -d -``` - -TODO: SSL through Amazon -......................................... 
- -* Create Elastic IP address for EC2 instance -* Request a public certificate through Certificate Manager (ACM) - * Choose DNS validation - * Add tags sp_network, dev or prod, others - - Install ====================================== Install dependencies --------------------------------------- -Certbot: +AWS Client, Certbot:: + + $ sudo apt update + $ sudo apt install awscli, certbot -```commandline -$ sudo apt update -$ sudo apt install certbot -``` Install Docker --------------------------------------- @@ -164,270 +41,222 @@ Install Docker Add docker repository, then use apt to install Docker: https://docs.docker.com/engine/install/ubuntu/ -Install repo from Github +Install/Update repo from Github --------------------------------------- * generate an SSH key for communicating with Github * Add SSH key to agent on local machine -```commandline -$ ssh-keygen -t rsa -b 4096 -C "aimee.stewart@ku.edu" -$ eval "$(ssh-agent -s)" -$ ssh-add ~/.ssh/id_rsa -$ cat .ssh/id_rsa.pub -``` -* Add the SSH to Github by printing to console, copying, adding in Github profile -* clone the repository - -```commandline -$ cat .ssh/id_rsa.pub -$ # -$ cd ~/git -$ git clone git@github.com:specifysystems/sp_network.git -$ git checkout -``` - -Install certificates into config directory -------------------------------------------------------- - -* Link the certificates in the repo config directory - -```commandline -$ cd ~/git/sp_network -$ cd config -$ ln -s ~/certificates/fullchain1.pem -$ ln -s ~/certificates/privkey1.pem -``` - -Testing ---------------------------------------- -On a development server, check the following URL endpoints: - -* Index page: https://localhost - -* Broker: - * https://localhost/api/v1/ - * https://localhost/api/v1/badge/ - * https://localhost/api/v1/name/ - * https://localhost/api/v1/occ/ - * https://localhost/api/v1/frontend/ - - * https://localhost/api/v1/badge/gbif?icon_status=active - * https://localhost/api/v1/occ/?occid=a7156437-55ec-4c6f-89de-938f9361753d - * https://localhost/api/v1/name/Harengula%20jaguana - * https://localhost/api/v1/frontend/?occid=a7156437-55ec-4c6f-89de-938f9361753d - -For local testing in a development environment, tests in the tests directory -require the lmtest module available at https://github.com/lifemapper/lmtest. +:: -Environment variables set in the Docker containers from the .env.broker.conf and -.env.broker.conf files are necessary to inform the host machine/container of its FQDN. + $ ssh-keygen -t rsa -b 4096 -C "aimee.stewart@ku.edu" + $ eval "$(ssh-agent -s)" + $ ssh-add ~/.ssh/id_rsa + $ cat .ssh/id_rsa.pub -**Temp solution:** Export these variables to the local environment in the python -virtual environment activation script (bin/activate) script. +* Add the SSH to Github by printing to console, copying, adding in Github profile +* clone the repository -```zsh -export SECRET_KEY="dev" -export WORKING_DIRECTORY="scratch-path" -``` +:: + $ cat .ssh/id_rsa.pub + $ # + $ cd ~/git + $ git clone git@github.com:specifysystems/sp_network.git + $ git checkout -**Specify Network** homepage is now available at https://localhost/ and http://localhost. +DNS +---------------------- -**Broker** (aka back-end): +If this is a development or production server with an actual domain, first point the +DNS record (through whatever service is managing the domain, GoDaddy in the case of +spcoco.org) to the static IP address for the server. 
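A quick way to confirm the record has propagated before requesting certificates is
to resolve the FQDN and compare it with the server's address (a minimal sketch; the
hostname and expected IP below are placeholders to substitute)::

    import socket

    fqdn = "broker.spcoco.org"       # substitute the real FQDN
    expected_ip = "203.0.113.10"     # substitute the server's public/Elastic IP
    resolved = socket.gethostbyname(fqdn)
    print(f"{fqdn} -> {resolved}")
    assert resolved == expected_ip, "DNS has not propagated yet"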
- * https://localhost/api/v1/name?namestr=Notemigonus%20crysoleucas%20(Mitchill,%201814) - * https://localhost/api/v1/occ?occid=01493b05-4310-4f28-9d81-ad20860311f3 +For AWS, create (or modify) an Elastic IP address to point to the EC2 instance. -**Webpack** is watching for front-end file changes and rebuilds the bundle when -needed. +If replacing an EC2 instance, disassociate the Elastic IP address from the old EC2 +instance, and associate it with the new instance. -**Flask** is watching for back-end file changes and restarts the server when -needed. - -Troubleshooting -=========================================== +SSL +----------------------------------- +:ref:`Specify Network SSL certificates` -For webserver errors, check logs of nginx container:: -```commandline -$ sudo docker logs --tail 1000 sp_network-nginx-1 -$ sudo docker logs --tail 1000 sp_network-broker-1 -``` +Direct Docker to correct FQDN +------------------------------------ -Error: "... cannot import name 'url_quote' from 'werkzeug.urls'" in broker container -Fix: Add Werkzeug==2.2.2 to requirements.txt to ensure it does not use 3.0+ -Then stop/rebuild/start: +Edit FQDN value in .env.analyst.conf and .env.broker.conf (referenced by the docker +compose file) and server_name in config/nginx.conf to actual FQDN. -```commandline -$ sudo docker compose stop -$ sudo docker system prune --all --volumes -$ sudo docker compose up -d -``` -Docker manipulation +Docker ================================= -Edit the docker environment files -------------------------------------------- +More info at :ref:`Docker` -* Add the container domain name to the files .env.broker.conf and .env.analyst.conf -* Change the FQDN value to the fully qualified domain name of the server. - * If this is a local testing deployment, it will be "localhost" - * For a development or production server it will be the FQDN with correct subdomain - for each container, i.e FQDN=broker.spcoco.org in .env.broker.conf and - FQDN=analyst.spcoco.org in .env.analyst.conf +AWS Config +================ -Run the containers (production) -------------------------------------------- +Boto3 getting Error "botocore.exceptions.NoCredentialsError -```zsh -sudo docker compose -f docker-compose.yml up -d -``` +Create credentials file on host EC2 instance -Specify Network is now available at [https://localhost/](https://localhost:443) +Test +=========================== +On a development server, check the following URL endpoints: +* Broker: -Run the containers (development) -------------------------------------------- + * https://localhost.broker + * https://localhost.broker/api/v1/ -Note that the development compose file, docker-compose.development.yml, is referenced -first on the command line. It has elements that override those defined in the -general compose file, docker-compose.yml. + * https://localhost.broker/api/v1/badge/ + * https://localhost.broker/api/v1/name/ + * https://localhost.broker/api/v1/occ/ + * https://localhost.broker/api/v1/frontend/ -```zsh -sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up -``` + * https://localhost.broker/api/v1/badge/gbif?icon_status=active + * https://localhost.broker/api/v1/occ/?occid=a7156437-55ec-4c6f-89de-938f9361753d + * https://localhost.broker/api/v1/name/Harengula%20jaguana + * https://localhost.broker/api/v1/frontend/?occid=a7156437-55ec-4c6f-89de-938f9361753d -Flask has hot-reload enabled. 
+* Analyst: + * https://localhost.analyst + * https://localhost.analyst/api/v1/ -Rebuild/restart -------------------------------------------- + * https://localhost.analyst/api/v1/count/ + * https://localhost.analyst/api/v1/rank/ -To delete all containers, images, networks and volumes, stop any running -containers: + * http://localhost.analyst/api/v1/count/?dataset_key=0000e36f-d0e9-46b0-aa23-cc1980f00515 + * http://localhost.analyst/api/v1/rank/?by_species=true -```zsh -sudo docker compose stop -``` +For local testing in a development environment, tests in the tests directory +require the lmtest module available at https://github.com/lifemapper/lmtest. -And run this command (which ignores running container): +Environment variables set in the Docker containers from the .env.broker.conf and +.env.broker.conf files are necessary to inform the host machine/container of its FQDN. -```zsh -sudo docker system prune --all --volumes -``` +**Temp solution:** Export these variables to the local environment in the python +virtual environment activation script (bin/activate) script:: -Then rebuild/restart: + export SECRET_KEY="dev" + export WORKING_DIRECTORY="scratch-path" -```zsh -sudo docker compose up -d -``` -Examine container -------------------------------------------- +**Specify Network** homepage is now available at https://localhost/ -To examine containers at a shell prompt: - -```zsh -sudo docker exec -it sp_network-nginx-1 /bin/sh -``` +**Broker** (aka back-end): -Error port in use: -"Error starting userland proxy: listen tcp4 0.0.0.0:443: bind: address already in use" + * https://localhost/api/v1/name?namestr=Notemigonus%20crysoleucas%20(Mitchill,%201814) + * https://localhost/api/v1/occ?occid=01493b05-4310-4f28-9d81-ad20860311f3 -See what else is using the port. In my case apache was started on reboot. Bring down -all docker containers, shut down httpd, bring up docker. +**Webpack** is watching for front-end file changes and rebuilds the bundle when +needed. -```zsh -lsof -i -P -n | grep 443 -sudo docker compose down -sudo systemctl stop httpd -sudo docker compose up -d -``` +**Flask** is watching for back-end file changes and restarts the server when +needed. Dev Environment ========================== -* Create a virtual environment and install python libs there +* Base system libraries:: + + sudo apt get update + sudo apt install awscli, certbot, apt install python3.10-venv + +* Create a virtual environment and install python libs there:: -```commandline -$ cd ~/git/sp_network -$ python3 -m venv venv -$ . venv/bin/activate -$ pip install -r requirements.txt -``` + $ cd ~/git/sp_network + $ python3 -m venv venv + $ . venv/bin/activate + $ pip install -r requirements.txt -Configure Debugger in local IDE +Configure Debugger ======================================== +Pycharm +------------------ [Instructions for PyCharm] (https://kartoza.com/en/blog/using-docker-compose-based-python-interpreter-in-pycharm/) -Debug +Flask ------------------------------------------- To run flask in debug mode, first set up Flask environment, then start the flask application (in this case, main in flask_app.broker.routes.py). Only one resource (aka broker or analyst) at a time can be tested in this way. -Reset the FLASK_APP variable to test an alternate resource. 
+Reset the FLASK_APP variable to test an alternate resource:: -** the broker frontend can NOT be tested this way, as it depends on a docker volume - -```zsh -export FLASK_ENV=development -export FLASK_APP=flask_app.broker.routes:app -# or -# export FLASK_APP=flask_app.analyst.routes:app -flask run -``` + export FLASK_ENV=development + export FLASK_APP=flask_app.broker.routes:app + # or + # export FLASK_APP=flask_app.analyst.routes:app + flask run * `broker` container is running `debugpy` on localhost, port `5000` +* ** the broker frontend can NOT be tested this way, as it depends on a docker volume + * Test with http, no https!! - http://localhost:5000/api/v1/name?namestr=Notemigonus%20crysoleucas%20(Mitchill,%201814) - http://localhost:5000/api/v1/occ?occid=01493b05-4310-4f28-9d81-ad20860311f3 + http://broker.localhost:5000/api/v1/name?namestr=Notemigonus%20crysoleucas%20(Mitchill,%201814) + http://broker.localhost:5000/api/v1/occ?occid=01493b05-4310-4f28-9d81-ad20860311f3 Troubleshooting ====================================== + +For webserver errors +----------------------- + +Check logs of nginx container:: + + $ sudo docker logs --tail 1000 sp_network-nginx-1 + $ sudo docker logs --tail 1000 sp_network-broker-1 + + +Import error from werkzeug.urls +-------------------------------------- + +Error: "... cannot import name 'url_quote' from 'werkzeug.urls'" in broker container +Fix: Add Werkzeug==2.2.2 to requirements.txt to ensure it does not use 3.0+ +Then stop/rebuild/start:: + + $ sudo docker compose stop + $ sudo docker system prune --all --volumes + $ sudo docker compose up -d + + pip errors with SSL ------------------------------------------- - * add trusted-host option at command line +* add trusted-host option at command line:: + + pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org ~/git/lmpy -```commandline -pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org ~/git/lmpy -``` - * for processes that call pip, create a pip configuration file , then export as - PIP_CONFIG_FILE environment variable in .bashrc +* for processes that call pip, create a pip configuration file , then export as + PIP_CONFIG_FILE environment variable in .bashrc:: -```commandline -# ~/pip.conf -[install] -trusted-host = pypi.python.org - pypi.org - files.pythonhosted.org + # ~/pip.conf + [install] + trusted-host = pypi.python.org + pypi.org + files.pythonhosted.org -# ~/.bashrc -export PIP_CONFIG_FILE ~/pip.conf -``` + # ~/.bashrc + export PIP_CONFIG_FILE ~/pip.conf pre-commit errors with self-signed certificate --------------------------------------------------------- - * turn off verification (but this leaves you open to man-in-the-middle attacks) +* turn off verification (but this leaves you open to man-in-the-middle attacks):: -```commandline -git config --global http.sslVerify false -``` + git config --global http.sslVerify false - * turn on again with + * turn on again with:: -```commandline -git config --global http.sslVerify true + git config --global http.sslVerify true -``` pre-commit build errors -------------------------------------- @@ -436,13 +265,6 @@ pre-commit build errors * Updated .pre-commit-config.yaml isort version to latest, https://github.com/PyCQA/isort, fixed build -AWS setup -=================================== - -* Add raw GBIF data to S3 - - - Dependencies: ============== diff --git a/sphinx/aws/aws-setup.rst b/sphinx/aws/aws-setup.rst index e657a105..0e1896d9 100644 --- 
a/sphinx/aws/aws-setup.rst +++ b/sphinx/aws/aws-setup.rst @@ -1,6 +1,35 @@ Authentication #################### +Create an IAM role for the EC2/Redshift/S3 interaction +*********************************************************** + +* Create a Role (Redshift-S3) for service Redshift to read/write to S3 + + * Add a policy allowing read and write access to the specnet S3 bucket + * Step 1: Trusted entity type = AWS service, Use Case = Redshift - Customizable. + + * TODO: change to Redshift - Scheduler when we automate the workflow + + * Step 2: Add permissions + + * AmazonRedshiftAllCommandsFullAccess (AWS managed) + * AmazonS3FullAccess (AWS managed) + +EC2 instance creation +=========================================================== + +* Instance type t3.small + + * Build fails with t2.micro or t3.micro with 1gb RAM + * t3.small is 2gb RAM + +* Ubuntu Server 22.04 LTS, SSD Volume Type (free tier eligible), x86 architecture +* Security Group: launch-wizard-1 +* 30 Gb General Purpose SSD (gp2) +* For dev, Spot instance (in Advanced options) +* Modify IAM role - for role created above (i.e. specnet_ec2_role) + For programmatic access to S3 =========================================================== Configure AWS credentials either through @@ -9,6 +38,13 @@ Configure AWS credentials either through * AWS CLI configuration (for command line tools), or * using an IAM role attached to your instance if running on AWS infrastructure. +The AWS cli depends on boto3, so both must be up to date. In my testing, awscli +1.27.118 (with requirement botocore==1.29.118) and boto3 1.28.1, failed on +S3 Select access. + +I upgraded awscli (sudo apt install awscli), then upgraded boto3 +(pip install --upgrade boto3) , which installed 1.34.60. Success + Redshift =========================================================== @@ -31,21 +67,6 @@ Overview or preserved specimen. This brings the full dataset from about 2.6 billion down to 2.3 billion. -Create an IAM role for the Redshift/S3 interaction -*********************************************************** - -* Create a Role (Redshift-S3) for service Redshift to read/write to S3 - - * Add a policy allowing read and write access to the specnet S3 bucket - * Step 1: Trusted entity type = AWS service, Use Case = Redshift - Customizable. 
- - * TODO: change to Redshift - Scheduler when we automate the workflow - - * Step 2: Add permissions - - * AmazonRedshiftAllCommandsFullAccess (AWS managed) - * AmazonS3FullAccess (AWS managed) - Create a new workgroup (and namespace) *********************************************************** @@ -251,6 +272,8 @@ Enable S3 access from local machine and EC2 Error: SSL *************************************** +First time: + Error message :: SSL validation failed for https://ec2.us-east-1.amazonaws.com/ @@ -264,6 +287,38 @@ Test with:: Fix: Set up to work with Secret containing security key +Second time (in python code): +>>> response = requests.get(url) +Traceback (most recent call last): + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/connectionpool.py", line 703, in urlopen + httplib_response = self._make_request( + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/connectionpool.py", line 386, in _make_request + self._validate_conn(conn) + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/connectionpool.py", line 1042, in _validate_conn + conn.connect() + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/connection.py", line 419, in connect + self.sock = ssl_wrap_socket( + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/util/ssl_.py", line 449, in ssl_wrap_socket + ssl_sock = _ssl_wrap_socket_impl( + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/util/ssl_.py", line 493, in _ssl_wrap_socket_impl + return ssl_context.wrap_socket(sock, server_hostname=server_hostname) + File "/usr/lib/python3.8/ssl.py", line 500, in wrap_socket + return self.sslsocket_class._create( + File "/usr/lib/python3.8/ssl.py", line 1069, in _create + self.do_handshake() + File "/usr/lib/python3.8/ssl.py", line 1338, in do_handshake + self._sslobj.do_handshake() +ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1131) + + +https://stackoverflow.com/questions/51925384/unable-to-get-local-issuer-certificate-when-using-requests + +pip install certifi + +import certifi +certifi.where() + + Workflow for Specify Network Analyst pre-computations =========================================================== diff --git a/sphinx/aws/aws_workflow.rst b/sphinx/aws/aws_workflow.rst index 6ed902f9..33100d8a 100644 --- a/sphinx/aws/aws_workflow.rst +++ b/sphinx/aws/aws_workflow.rst @@ -4,7 +4,7 @@ AWS Workflow Reference =========================================================== - * Stored procedures in rs_stored_procedures.sql +* Stored procedures in rs_stored_procedures.sql Steps @@ -14,9 +14,9 @@ Steps *********************************************************** * Redshift: Subset GBIF data from Amazon Registry of Open Data (AWS ODR) for processing - * First run rs_create_stored_procedures.sql to create procedures for the subset script. - * Next run rs_subset_gbif.sql to subset the data - * + + * First run rs_create_stored_procedures.sql to create procedures for the subset script. 
+ * Next run rs_subset_gbif.sql to subset the data 1.5 TODO *********************************************************** diff --git a/sphinx/conf.py b/sphinx/conf.py index 96297158..1f14af2c 100644 --- a/sphinx/conf.py +++ b/sphinx/conf.py @@ -28,8 +28,9 @@ 'sphinx_rtd_theme', # 'autoapi.extension', 'myst_parser', # For MD support - ] - + # for internal links + 'sphinx.ext.autosectionlabel', +] templates_path = ['_templates'] exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] diff --git a/sphinx/flask/structure.rst b/sphinx/flask/structure.rst new file mode 100644 index 00000000..dbab00fa --- /dev/null +++ b/sphinx/flask/structure.rst @@ -0,0 +1,14 @@ +Structure +###################################### + +Specify Network consists of four Docker containers running on a single EC2 instance. + +The nginx and front-end containers support both the Analyst and Broker. Two flask +containers, one for Analyst, and one for Broker, expose the APIs of each to different +subdomains of the same domain. Code for each is in the flask_app.analyst and +flask_app.broker directories. In each, the routes.py file defines the different +endpoints. + + + + diff --git a/sphinx/misc/debugging.rst b/sphinx/misc/debugging.rst index 29543b7e..92c20cf6 100644 --- a/sphinx/misc/debugging.rst +++ b/sphinx/misc/debugging.rst @@ -10,14 +10,70 @@ IDE debugging of functions Local debugging of flask app ============================================= +* Choose to run the Analyst or Broker with FLASK_APP environment variable * Run flask at command prompt ```zsh export FLASK_ENV=development export FLASK_APP=flask_app.broker.routes +export FLASK_APP=flask_app.analyst.routes flask run ``` +* With either Analyst or Broker, the development port will be 5000. Connect to + http://127.0.0.1:5000 in browser, + + * Broker + * http://127.0.0.1:5000/api/v1/name/Acer%20opalus%20Miller?is_accepted=True&gbif_count=False& + * http://127.0.0.1:5000/api/v1/occ/?occid=db8cc0df-1ed3-11e3-bfac-90b11c41863e&provider=gbif + * http://127.0.0.1:5000/api/v1/badge/?provider=mopho + + * Analyst: + http://127.0.0.1:5000/api/v1/count/?dataset_key=0000e36f-d0e9-46b0-aa23-cc1980f00515 + http://127.0.0.1:5000/api/v1/rank/?by_species=true -* Connect to localhost in browser. * Flask will auto-update on file save. * Refresh browser after changes +* The frontend endpoint cannot be tested this way, as it depends on frontend + **webpack-output** and **static-files** to be mounted as docker volumes. + + +Local debugging of Docker +============================================= + +More info in about/install_run_notes + + +Run Docker containers (development) +------------------------------------------- + +Note that the development compose file, docker-compose.development.yml, is referenced +first on the command line. It has elements that override those defined in the +general compose file, docker-compose.yml:: + + sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up + +Flask has hot-reload enabled. 
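Once the development containers are up, the endpoints can be smoke-tested from
Python as well as the browser (a minimal sketch; the hostnames assume the
`broker.localhost` / `analyst.localhost` FQDNs from the development `.env` files,
and certificate verification is disabled because the development certificates are
self-signed)::

    import requests

    urls = [
        "https://broker.localhost/api/v1/",
        "https://analyst.localhost/api/v1/count/"
        "?dataset_key=0000e36f-d0e9-46b0-aa23-cc1980f00515",
    ]
    for url in urls:
        # Self-signed development certificates, so skip verification.
        resp = requests.get(url, verify=False, timeout=30)
        print(resp.status_code, url)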
+ +Rebuild/restart +------------------------------------------- + +To delete all containers, images, networks and volumes, stop any running +containers:: + + sudo docker compose stop + + +And run this command (which ignores running container):: + + sudo docker system prune --all --volumes + +Then rebuild/restart:: + + sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up + +Examine container +------------------------------------------- + +To examine containers at a shell prompt:: + + sudo docker exec -it sp_network-nginx-1 /bin/sh diff --git a/sphinx/misc/docker.rst b/sphinx/misc/docker.rst new file mode 100644 index 00000000..fd91fc8f --- /dev/null +++ b/sphinx/misc/docker.rst @@ -0,0 +1,207 @@ +Docker +############################## + +Standard manipulation +================================= + +Edit the docker environment files +------------------------------------------- + +* Add the container domain name to the files .env.broker.conf and .env.analyst.conf +* Change the FQDN value to the fully qualified domain name of the server. + + * If this is a local testing deployment, it will be "localhost" + * For a development or production server it will be the FQDN with correct subdomain + for each container, i.e FQDN=broker.spcoco.org in .env.broker.conf and + FQDN=analyst.spcoco.org in .env.analyst.conf + +Run the containers (production) +------------------------------------------- + +Start the containers with the Docker composition file:: + + sudo docker compose -f docker-compose.yml up -d + +Specify Network is now available at [https://localhost/](https://localhost:443) + + +Run the containers (development) +------------------------------------------- + +Note that the development compose file, docker-compose.development.yml, is referenced +first on the command line. It has elements that override those defined in the +general compose file, docker-compose.yml:: + + sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up + +Flask has hot-reload enabled. + + +Rebuild/restart +------------------------------------------- + +To delete all containers, images, networks and volumes, stop any running +containers:: + + sudo docker compose stop + + +And run this command (which ignores running container):: + + sudo docker system prune --all --volumes + +Then rebuild/restart:: + + sudo docker compose up -d + # or + sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up + +Examine container +------------------------------------------- + +To examine containers at a shell prompt:: + + sudo docker exec -it sp_network-nginx-1 /bin/sh + +Error port in use: +"Error starting userland proxy: listen tcp4 0.0.0.0:443: bind: address already in use" + +See what else is using the port. In my case apache was started on reboot. Bring down +all docker containers, shut down httpd, bring up docker. + +:: + lsof -i -P -n | grep 443 + sudo docker compose down + sudo systemctl stop httpd + sudo docker compose up -d + +Run Docker on OSX +================================= + +Need to bind server to 0.0.0.0 instead of 127.0.0.1 + +Test by getting internal IP, using ifconfig, then command to see if connects successfully:: + + nc -v x.x.x.x 443 + +Then can use same IP in browser, i.e. https://x.x.x.x/api/v1/name/ +This only exposes the broker, not the analyst services. 
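The same reachability check can be scripted (a minimal sketch; the address is a
placeholder for the internal IP reported by ifconfig)::

    import socket

    # Equivalent to `nc -v x.x.x.x 443`: raises OSError unless the server is
    # bound to 0.0.0.0 and listening on port 443.
    with socket.create_connection(("192.168.1.50", 443), timeout=5):
        print("port 443 is reachable")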
+ + + +Troubleshooting +================================= + +Out of Space Problem +------------------ + +Running `certbot certificates` failed because the EC2 instance running Docker +containers for Specify Network development shows disk full:: + + root@ip-172-31-86-62:~# df -h + Filesystem Size Used Avail Use% Mounted on + /dev/root 7.6G 7.6G 0 100% / + tmpfs 483M 0 483M 0% /dev/shm + tmpfs 194M 21M 173M 11% /run + tmpfs 5.0M 0 5.0M 0% /run/lock + /dev/xvda15 105M 6.1M 99M 6% /boot/efi + overlay 7.6G 7.6G 0 100% /var/lib/docker/overlay2/82d82cc5eb13260207b94443934c7318af651ea96a5fcd88c579f23224ba099d/merged + overlay 7.6G 7.6G 0 100% /var/lib/docker/overlay2/cb0d78289131b3925e21d7eff2d03c79fe432eeba2d69a33c6134db40dc3caf3/merged + overlay 7.6G 7.6G 0 100% /var/lib/docker/overlay2/3bd6d12b36e746f9c74227b6ac9d928a3179d8b604a9dea4fd88625eab84be1f/merged + tmpfs 97M 4.0K 97M 1% /run/user/1000 + +The disk is small, but the culprit is /var/lib/docker/overlay2 + +Some strategies at: +https://forums.docker.com/t/some-way-to-clean-up-identify-contents-of-var-lib-docker-overlay/30604/19 + +Solution: +------------------- + +* The instance was created with a volume of an 8gb default size. +* Stop the instance +* Modify the volume. +* Restart the EC2 instance - ok while the volume is in the optimizing state. +* If the instance does not recognize the extended volume immediately:: + + ubuntu@ip-172-31-91-57:~$ df -h + Filesystem Size Used Avail Use% Mounted on + /dev/root 7.6G 7.6G 0 100% / + tmpfs 475M 0 475M 0% /dev/shm + tmpfs 190M 11M 180M 6% /run + tmpfs 5.0M 0 5.0M 0% /run/lock + /dev/xvda15 105M 6.1M 99M 6% /boot/efi + tmpfs 95M 4.0K 95M 1% /run/user/1000 + ubuntu@ip-172-31-91-57:~$ sudo lsblk + sudo: unable to resolve host ip-172-31-91-57: Temporary failure in name resolution + NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINTS + loop0 7:0 0 24.9M 1 loop /snap/amazon-ssm-agent/7628 + loop1 7:1 0 25.2M 1 loop /snap/amazon-ssm-agent/7983 + loop2 7:2 0 55.7M 1 loop /snap/core18/2796 + loop3 7:3 0 55.7M 1 loop /snap/core18/2812 + loop4 7:4 0 63.9M 1 loop /snap/core20/2105 + loop5 7:5 0 63.9M 1 loop /snap/core20/2182 + loop6 7:6 0 87M 1 loop /snap/lxd/27037 + loop7 7:7 0 87M 1 loop /snap/lxd/27428 + loop8 7:8 0 40.4M 1 loop /snap/snapd/20671 + loop9 7:9 0 39.1M 1 loop /snap/snapd/21184 + xvda 202:0 0 30G 0 disk + ├─xvda1 202:1 0 7.9G 0 part / + ├─xvda14 202:14 0 4M 0 part + └─xvda15 202:15 0 106M 0 part /boot/efi + +* extend the filesystem: + https://docs.aws.amazon.com/ebs/latest/userguide/recognize-expanded-volume-linux.html +* In this case we want to extend xvda1, so:: + + $ sudo growpart /dev/xvda 1 + sudo: unable to resolve host ip-172-31-91-57: Temporary failure in name resolution + mkdir: cannot create directory ‘/tmp/growpart.1496’: No space left on device + FAILED: failed to make temp dir + +* We must free up space to allow extension:: + + $ sudo docker system prune --all --volumes + sudo: unable to resolve host ip-172-31-91-57: Temporary failure in name resolution + WARNING! This will remove: + - all stopped containers + - all networks not used by at least one container + - all volumes not used by at least one container + - all images without at least one container associated to them + - all build cache + + Are you sure you want to continue? 
[y/N] y + Deleted Containers: + 24768ca767d37f248eff173f13556007468330298329200d533dfa9ca011e409 + 809709d6f8bfa8575009a0d07df16ee78852e9ab3735aa19561ac0dbc0313123 + 64591ed14ecae60721ea367af650683f738636167162f6ed577063582c210aa9 + + Deleted Networks: + sp_network_nginx + + Deleted Images: + untagged: nginx:alpine + untagged: nginx@sha256:a59278fd22a9d411121e190b8cec8aa57b306aa3332459197777583beb728f59 + deleted: sha256:529b5644c430c06553d2e8082c6713fe19a4169c9dc2369cbb960081f52924ff + ... + deleted: sha256:e74dab46dbca98b4be75dfbda3608cd857914b750ecd251c4f1bdbb4ef623c8c + + Total reclaimed space: 1.536GB + +* Extend filesystem:: + + $ sudo growpart /dev/xvda 1 + sudo: unable to resolve host ip-172-31-91-57: Temporary failure in name resolution + CHANGED: partition=1 start=227328 old: size=16549855 end=16777183 new: size=62687199 end=62914527 + $ df -h + Filesystem Size Used Avail Use% Mounted on + /dev/root 7.6G 5.7G 2.0G 75% / + tmpfs 475M 0 475M 0% /dev/shm + tmpfs 190M 18M 173M 10% /run + tmpfs 5.0M 0 5.0M 0% /run/lock + /dev/xvda15 105M 6.1M 99M 6% /boot/efi + tmpfs 95M 4.0K 95M 1% /run/user/1000 + + +* Stop apache2 if running +* Rebuild the docker containers diff --git a/sphinx/misc/ssl_certificates.rst b/sphinx/misc/ssl_certificates.rst index 20e9121d..cc83af66 100644 --- a/sphinx/misc/ssl_certificates.rst +++ b/sphinx/misc/ssl_certificates.rst @@ -7,55 +7,147 @@ Letsencrypt using Certbot. They are only valid for 90 days at a time. TODO: move administration to AWS, and script renewal if necessary - -Renewal procedure -============================================= - -* Change to superuser, then check the validity of your certificates:: - - sudo su - - certbot certificates - -* When it is time for a renewal (approx every 60 days), move to the Specify Network - project directory where Docker was started, and stop the Docker containers:: - - cd /home/ubuntu/git/sp_network - docker compose stop - -* Renew the certificates:: - - certbot renew - -* Move to /etc/letsencrypt/archive/ and find the most recent - certificate names in the directory (certX.pem, chainX.pem, fullchainX.pem, - privkeyX.pem, where X is an integer):: - - cd /etc/letsencrypt/archive/spcoco.org/ - ls -lahtr - -* Copy the new certificates to /home/ubuntu/certificates, changing - the name to cert.pem, chain.pem, fullchain.pem, privkey.pem (no X integer). Then - change the owner from root, to the username (ubuntu):: - - cp cert4.pem /home/ubuntu/certificates/cert.pem - cp chain4.pem /home/ubuntu/certificates/chain.pem - cp fullchain4.pem /home/ubuntu/certificates/fullchain.pem - cp privkey4.pem /home/ubuntu/certificates/privkey.pem - -* Move to the directory with the certificates and change the - owner to ubuntu, then exit superuser:: - - cd /home/ubuntu/certificates - chown ubuntu:ubuntu * - exit - -* Move to the config directory and create symbolic links to the new fullchain.pem - and privkey.pem files:: - - cd /home/ubuntu/git/sp_network/config - ln -s /home/ubuntu/certificates/fullchain.pem - ln -s /home/ubuntu/certificates/privkey.pem - -* Then restart the containers:: - - sudo docker compose up -d +Local self-signed certificates +......................................... +To run the containers, generate `fullchain.pem` and `privkey.pem` (certificate +and the private key) using Let's Encrypt and link these files in `./sp_network/config/`. 
+ +While in development, generate self-signed certificates then link them in +~/git/sp_network/config/ directory for this project:: + + $ mkdir ~/certificates + + openssl req \ + -x509 -sha256 -nodes -newkey rsa:2048 -days 365 \ + -keyout ~/certificates/privkey.pem \ + -out ~/certificates/fullchain.pem + + $ cd ~/git/sp_network/config + $ ln -s ~/certificates/privkey.pem + $ ln -s ~/certificates/fullchain.pem + +To run either the production or the development containers with HTTPS +support, generate `fullchain.pem` and `privkey.pem` (certificate and the private +key) using Let's Encrypt, link these files in the `./config/` directory. +Full instructions in the docs/aws-steps.rst page, under `Set up TLS/SSL` + +Modify the `FQDN` environment variable in `.env.conf` as needed. + +TLS/SSL using Certificate Authority (CA) +.................................................. + +* Make sure that DNS has propogated for domain for SSL +* Stop apache service +* request a certificate for the domain + +:: + + $ sudo systemctl stop apache2 + $ sudo certbot certonly -v + Saving debug log to /var/log/letsencrypt/letsencrypt.log + + How would you like to authenticate with the ACME CA? + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + 1: Spin up a temporary webserver (standalone) + 2: Place files in webroot directory (webroot) + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + Select the appropriate number [1-2] then [enter] (press 'c' to cancel): 1 + Plugins selected: Authenticator standalone, Installer None + Enter email address (used for urgent renewal and security notices) + (Enter 'c' to cancel): aimee.stewart@ku.edu + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + Please read the Terms of Service at + https://letsencrypt.org/documents/LE-SA-v1.3-September-21-2022.pdf. You must + agree in order to register with the ACME server. Do you agree? + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + (Y)es/(N)o: Y + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + Would you be willing, once your first certificate is successfully issued, to + share your email address with the Electronic Frontier Foundation, a founding + partner of the Let's Encrypt project and the non-profit organization that + develops Certbot? We'd like to send you email about our work encrypting the web, + EFF news, campaigns, and ways to support digital freedom. + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + (Y)es/(N)o: N + Account registered. + Please enter the domain name(s) you would like on your certificate (comma and/or + space separated) (Enter 'c' to cancel): dev.spcoco.org, analyst-dev.spcoco.org, broker-dev.spcoco.org + Requesting a certificate for dev.spcoco.org and 2 more domains + Performing the following challenges: + http-01 challenge for analyst-dev.spcoco.org + http-01 challenge for broker-dev.spcoco.org + http-01 challenge for dev.spcoco.org + Waiting for verification... + Cleaning up challenges + + Successfully received certificate. + Certificate is saved at: /etc/letsencrypt/live/dev.spcoco.org/fullchain.pem + Key is saved at: /etc/letsencrypt/live/dev.spcoco.org/privkey.pem + This certificate expires on 2024-06-19. + These files will be updated when the certificate renews. + Certbot has set up a scheduled task to automatically renew this certificate in the background. 
+ + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + If you like Certbot, please consider supporting our work by: + * Donating to ISRG / Let's Encrypt: https://letsencrypt.org/donate + * Donating to EFF: https://eff.org/donate-le + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + +Install certificates into config directory +------------------------------------------------------- + +* Create a ~/certificates directory to hold certificate files +* as superuser, copy the newly created fullchain.pem and privkey.pem files from the + letsencrypt live +* change the owner so that they can be used in Docker containers +* Link the certificates in the repo config directory + +:: + + $ cd + $ mkdir certificates + $ sudo su - + # cp -p /etc/letsencrypt/live//* /home/ubuntu/certificates/ + # chown ubuntu:ubuntu /home/ubuntu/certificates/* + # exit + $ cd ~/git/sp_network/config + $ ln -s ~/certificates/fullchain.pem + $ ln -s ~/certificates/privkey.pem + +Renew Certbot SSL certificates +......................................... + +SSL certificates are served from the instance (AWS EC2), and need port 80 to be renewed. +These are administered by Letsencrypt using Certbot and are only valid for 90 days at +a time. When it is time for a renewal (approx every 60 days), bring the docker +containers down. Prune the volumes so the new containers and volumes will be created +with the updated certificates. Renew the certificates, then bring the containers up. + +Amazon EC2 containers do not need apache running, certbot runs its own temp web server. + +Test with https://broker.spcoco.org/api/v1/frontend/?occid=01493b05-4310-4f28-9d81-ad20860311f3 + +:: + + $ sudo certbot certificates + $ sudo docker compose stop + $ sudo su - + # certbot renew + # cp -p /etc/letsencrypt/live/spcoco.org/* /home/ubuntu/certificates/ + # chown ubuntu:ubuntu /home/ubuntu/certificates/* + # exit + $ ls -lahtr ~/git/sp_network/config + + $ sudo docker system prune --all --volumes + $ sudo docker compose up -d + +TODO: SSL through Amazon +......................................... 
+ +* Create Elastic IP address for EC2 instance +* Request a public certificate through Certificate Manager (ACM) + * Choose DNS validation + * Add tags sp_network, dev or prod, others diff --git a/sppy/aws/aws_constants.py b/sppy/aws/aws_constants.py index 027aab91..9fd7fe79 100644 --- a/sppy/aws/aws_constants.py +++ b/sppy/aws/aws_constants.py @@ -2,6 +2,8 @@ PROJ_NAME = "specnet" REGION = "us-east-1" PROJ_BUCKET = f"{PROJ_NAME}-{REGION}" +SUMMARY_FOLDER = "summary" +ENCODING = "utf-8" INPUT_PATH = "summary" LOG_PATH = "log" diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index b9ff1a89..8380b173 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -6,15 +6,21 @@ import boto3 from botocore.exceptions import ClientError import csv -import datetime +import certifi +import datetime as DT +from http import HTTPStatus +import json import logging from logging.handlers import RotatingFileHandler import pandas as pd import os +import requests +import xml.etree.ElementTree as ET from sppy.aws.aws_constants import ( - INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, PROJ_NAME, - REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, USER_DATA_TOKEN) + ENCODING, INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, + PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, + USER_DATA_TOKEN) # -------------------------------------------------------------------------------------- @@ -236,7 +242,7 @@ def create_token(type=None): """ if type is None: type = PROJ_NAME - token = f"{type}_{datetime.datetime.now().timestamp()}" + token = f"{type}_{DT.datetime.now().timestamp()}" return token @@ -247,7 +253,7 @@ def get_today_str(): Returns: date_str(str): string representing date in YYYY-MM-DD format. """ - n = datetime.datetime.now() + n = DT.datetime.now() date_str = f"{n.year}_{n.month:02d}_{n.day:02d}" return date_str @@ -259,7 +265,7 @@ def get_current_datadate_str(): Returns: date_str(str): string representing date in YYYY-MM-DD format. """ - n = datetime.datetime.now() + n = DT.datetime.now() date_str = f"{n.year}_{n.month:02d}_01" return date_str @@ -271,7 +277,7 @@ def get_previous_datadate_str(): Returns: date_str(str): string representing date in YYYY-MM-DD format. """ - n = datetime.datetime.now() + n = DT.datetime.now() yr = n.year mo = n.month - 1 if n.month == 0: @@ -610,7 +616,7 @@ def get_logger(log_name, log_dir=None, log_level=logging.INFO): # create file handler handler = RotatingFileHandler( filename, mode="w", maxBytes=LOGFILE_MAX_BYTES, backupCount=10, - encoding="utf-8" + encoding=ENCODING ) formatter = logging.Formatter(LOG_FORMAT, LOG_DATE_FORMAT) handler.setLevel(log_level) @@ -639,14 +645,14 @@ def create_dataframe_from_gbifcsv_s3_bucket(bucket, csv_path, region=REGION): s3_client = boto3.client("s3", region_name=region) s3_obj = s3_client.get_object(Bucket=bucket, Key=csv_path) df = pd.read_csv( - s3_obj["Body"], delimiter="\t", encoding="utf-8", low_memory=False, + s3_obj["Body"], delimiter="\t", encoding=ENCODING, low_memory=False, quoting=csv.QUOTE_NONE) return df # ---------------------------------------------------- def create_dataframe_from_s3obj( - bucket, s3_path, datatype="parquet", region=REGION, encoding="utf-8"): + bucket, s3_path, datatype="parquet", region=REGION, encoding=ENCODING): """Read CSV data from S3 into a pandas DataFrame. 
Args: @@ -672,3 +678,557 @@ def create_dataframe_from_s3obj( # s3_fs = s3fs.S3FileSystem df = pd.read_parquet(s3_uri) return df + + +# ............................................... +def _get_values_for_keys(output, keys): + values = [] + # Get values from JSON response + for key in keys: + if type(key) is list or type(key) is tuple: + key_list = key + while key_list: + key = key_list[0] + key_list = key_list[1:] + try: + output = output[key] + if not key_list: + val = output + except Exception: + val = None + else: + try: + val = output[key] + except Exception: + val = None + values.append(val) + return values + + +# ............................................... +def _get_api_response_vals(url, keys, certificate=None): + values = [] + try: + response = requests.get(url, verify=certificate) + except Exception as e: + errmsg = str(e) + else: + try: + status_code = response.status_code + reason = response.reason + except Exception: + status_code = HTTPStatus.INTERNAL_SERVER_ERROR + reason = "Unknown API status_code/reason" + if status_code == HTTPStatus.OK: + # Parse response + try: + output = response.json() + except Exception: + output = response.content + if type(output) is bytes: + output = ET.fromstring(str(output)) + try: + output = ET.parse(output) + except Exception as e: + errmsg = f"Provider error: Invalid JSON response ({output})" + # Get values from JSON response + values = _get_values_for_keys(output, keys) + return values + +# ............................................... +def get_dataset_name_citation(dataset_key, certificate=None): + """Return title from one dataset record with this key. + + Args: + dataset_key: GBIF identifier for this dataset + + Returns: + dataset_name: the name of the dataset. + citation: the preferred citation for the dataset. + + Raises: + Exception: on query failure. + """ + url = f"https://api.gbif.org/v1/dataset/{dataset_key}" + name, citation = _get_api_response_vals( + url, ["title", ["citation", "text"]], certificate=certificate) + return name, citation + + +# ---------------------------------------------------- +def _query_table(bucket, s3_path, query_str, region=REGION, format="CSV"): + """Query the S3 resource defined for this class. + + Args: + bucket: + s3_path: S3 folder and filename within the bucket + query_str: a SQL query for S3 select. + region: + format: output format, options "CSV" or "JSON" + + Returns: + list of records matching the query + """ + recs = [] + if format not in ("JSON", "CSV"): + format = "JSON" + if format == "JSON": + out_serialization = {"JSON": {}} + elif format == "CSV": + out_serialization = { + "CSV": { + "QuoteFields": "ASNEEDED", + "FieldDelimiter": ",", + "QuoteCharacter": '"'} + } + s3 = boto3.client("s3", region_name=region) + resp = s3.select_object_content( + Bucket=bucket, + Key=s3_path, + ExpressionType="SQL", + Expression=query_str, + InputSerialization={"Parquet": {}}, + OutputSerialization=out_serialization + ) + for event in resp["Payload"]: + if "Records" in event: + recs_str = event["Records"]["Payload"].decode(ENCODING) + rec_strings = recs_str.strip().split("\n") + for rs in rec_strings: + if format == "JSON": + rec = json.loads(rs) + else: + rec = rs.split(",") + recs.append(rec) + return recs + + +# ............................................... +def _parse_records(ret_records, keys): + small_recs = [] + for rec in ret_records: + values = _get_values_for_keys(rec, keys) + small_recs.append(values) + return small_recs + +# ............................................... 
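# Illustrative usage sketch (hypothetical object key and query, not part of
# this diff): _query_table above runs S3 Select over a Parquet object and
# returns rows either as JSON dicts or as CSV field lists.
#
#   recs = _query_table(
#       PROJ_BUCKET, "summary/dataset_counts_2024_02_01_000.parquet",
#       "SELECT * FROM s3object s LIMIT 10", format="JSON")
#   for rec in recs:
#       print(rec)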
+def _get_single_record(url, keys, certificate=None): + rec = None + try: + if certificate: + response = requests.get(url, verify=certificate) + else: + response = requests.get(url) + except Exception as e: + errmsg = str(e) + else: + try: + status_code = response.status_code + reason = response.reason + except Exception as e: + status_code = HTTPStatus.INTERNAL_SERVER_ERROR + reason = str(e) + if status_code == HTTPStatus.OK: + # Parse response + try: + output = response.json() + except Exception: + output = response.content + if type(output) is bytes: + output = ET.fromstring(str(output)) + try: + output = ET.parse(output) + except Exception: + output = None + reason = f"Provider error: Invalid JSON response ({output})" + if output: + # Output is only one record + small_recs = _parse_records([output], keys) + try: + rec = small_recs[0] + except Exception as e: + print(f"Error: no output record ({e})") + return rec + + +# ............................................... +def _get_records(url, keys, certificate=None): + small_recs = [] + status_code = None + is_end = count = None + try: + if certificate: + response = requests.get(url, verify=certificate) + else: + response = requests.get(url) + except Exception as e: + reason = str(e) + else: + try: + status_code = response.status_code + reason = response.reason + except Exception as e: + status_code = HTTPStatus.INTERNAL_SERVER_ERROR + reason = str(e) + if status_code == HTTPStatus.OK: + # Parse response + try: + output = response.json() + except Exception: + output = response.content + if type(output) is bytes: + output = ET.fromstring(str(output)) + try: + output = ET.parse(output) + except Exception: + reason = f"Provider error: Invalid JSON response ({output})" + # Last query? + try: + is_end = output["endOfRecords"] + except KeyError: + print("Missing endOfRecords flag") + # Expected count + try: + is_end = output["count"] + except KeyError: + print("Missing count") + # Get values from JSON response + try: + ret_records = output["results"] + except KeyError: + reason = "No results returned" + else: + small_recs = _parse_records(ret_records, keys) + if not small_recs: + print(f"No records returned, status {status_code}, reason {reason}") + return small_recs, is_end, count + +# ---------------------------------------------------- +def create_dataframe_from_api(base_url, response_keys, output_columns): + """Query an API, read the data and write a subset to a table in S3. + + Args: + base_url: API URL without any key value pairs for the data service + response_keys: list of keys within the API response to pull values from. A key + can be an ordered list of keys nested within several elements of the tree, + from outermost to innermost. 
+ output_columns: list of column headings for output lookup table + + Returns: + dataframe: Pandas dataframe with rows of data for the output_columns + """ + all_recs = [] + is_end = False + offset = 0 + limit = 1000 + certificate = certifi.where() + while is_end is False: + url = f"{base_url}?offset={offset}&limit={limit}" + small_recs, is_end, count = _get_records( + url, response_keys, certificate=certificate) + all_recs.extend(small_recs) + offset += limit + if offset % 5000 == 0: + print(f"Offset = {offset}") + dataframe = pd.DataFrame(all_recs, columns=output_columns) + print(f"Lookup table contains {dataframe.shape[0]} rows") + return dataframe + + +# ---------------------------------------------------- +def create_csvfiles_from_api( + base_url, response_keys, output_columns, output_fname, encoding=ENCODING): + """Query an API, read the data and write a subset to a table in S3. + + Args: + base_url: API URL without any key value pairs for the data service + response_keys: list of keys within the API response to pull values from. A key + can be an ordered list of keys nested within several elements of the tree, + from outermost to innermost. + output_columns: list of column headings for output lookup table + output_fname: base output filename for temporary CSV files + encoding: encoding of the input data + + Returns: + csv_files: Local CSV files with records. The first file in the list will have + a header, the rest will not. + """ + csv_files = [] + records = [] + is_end = False + offset = 7000 + read_limit = 500 + write_limit = 5000 + certificate = certifi.where() + while is_end is False: + url = f"{base_url}?offset={offset}&limit={read_limit}" + print(url) + small_recs, is_end, count = _get_records( + url, response_keys, certificate=certificate) + if small_recs: + records.extend(small_recs) + offset += read_limit + # Write to tempfile every 5000 + if offset % write_limit == 0: + dataframe = pd.DataFrame(records, columns=output_columns) + tmp_filename = f"/tmp/{output_fname}_{offset}.csv" + # Only write header to first file (offset == 0), others will be appended + dataframe.to_csv( + path_or_buf=tmp_filename, sep='\t', header=(offset == 0), + columns=output_columns, doublequote=False, escapechar="\\", + encoding=encoding) + csv_files.append(tmp_filename) + print(f"Wrote {tmp_filename} with {dataframe.shape[0]} rows") + # reset records in memory + records = [] + return csv_files + + +# ---------------------------------------------------- +def write_csvfiles_to_s3( + csv_fnames, bucket, s3_folders, output_fname, region=REGION, encoding=ENCODING): + """Query an API, read the data and write a subset to a table in S3. + + Args: + csvfiles: input CSV files for S3 table. The first file in the list will have + a header, the rest will not. + bucket: name of the bucket containing the CSV data. + s3_folders: S3 bucket folders for output lookup table + output_fname: output table for looking up dataset name and citation + region: AWS region containing the destination bucket. 
+        encoding: encoding of the input data
+
+    Postcondition:
+        CSV table with output_columns and values for each written to the named S3 object
+        in bucket and folders
+    """
+    output_path = f"{s3_folders}/{output_fname}"
+    combined_fname = f"/tmp/{output_fname}.csv"
+    # Concatenate the CSV files into a single local file, then upload
+    with open(combined_fname, "w") as outf:
+        for fname in csv_fnames:
+            with open(fname, "r") as inf:
+                data = inf.read()
+                outf.write(data)
+    print(f"Wrote {combined_fname}")
+    upload_to_s3(combined_fname, bucket, output_path, region=region)
+    print(f"Uploaded to s3://{bucket}/{output_path}")
+
+
+# ----------------------------------------------------
+def write_dataframe_to_s3(
+        dataframe, bucket, s3_folders, output_fname, region=REGION, encoding=ENCODING):
+    """Write a Pandas dataframe to a local CSV file, then upload it to a table in S3.
+
+    Args:
+        dataframe: Pandas dataframe with rows of data
+        bucket: name of the bucket containing the CSV data.
+        s3_folders: S3 bucket folders for output lookup table
+        output_fname: output table for looking up dataset name and citation
+        region: AWS region containing the destination bucket.
+        encoding: encoding of the input data
+
+    Postcondition:
+        CSV table with output_columns and values for each written to the named S3 object
+        in bucket and folders
+    """
+    output_path = f"{s3_folders}/{output_fname}"
+    tmp_filename = f"/tmp/{output_fname}"
+    # Output data written as CSV
+    dataframe.to_csv(
+        path_or_buf=tmp_filename, sep='\t', header=True, doublequote=False,
+        escapechar="\\", encoding=encoding)
+    print(f"Wrote {tmp_filename}")
+    upload_to_s3(tmp_filename, bucket, output_path, region=region)
+    print(f"Uploaded to s3://{bucket}/{output_path}")
+
+
+# # ----------------------------------------------------
+# def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding=ENCODING):
+#     """Query the GBIF Dataset API, write a subset of the response to a table in S3.
+#
+#     Args:
+#         bucket: name of the bucket containing the CSV data.
+#         s3_folders: S3 bucket folders for output lookup table
+#         region: AWS region containing the destination bucket.
+#         encoding: encoding of the input data
+#
+#     Note:
+#         There are >100k records for datasets and limited memory on this EC2 instance,
+#         so we write them as temporary CSV files, then combine them, then create a
+#         dataframe and upload.
+#
+#     Postcondition:
+#         CSV table with dataset key, pubOrgKey, dataset name, dataset citation written
+#         to the named S3 object in bucket and folders
+#     """
+#     base_url = "https://api.gbif.org/v1/dataset"
+#     response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]]
+#     data_date = get_current_datadate_str()
+#     output_fname = f"dataset_meta_{data_date}"
+#     output_fname = "dataset_meta_2024_02_01"
+#     output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"]
+#     csv_fnames = create_csvfiles_from_api(
+#         base_url, response_keys, output_columns, output_fname)
+#     write_csvfiles_to_s3(
+#         csv_fnames, bucket, s3_folders, output_fname, region=region, encoding=encoding)
+
+
+# ----------------------------------------------------
+def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding=ENCODING):
+    """Query the GBIF Organization API, write a subset of the response to a table in S3.
+
+    Args:
+        bucket: name of the bucket containing the CSV data.
+        s3_folders: S3 bucket folders for output lookup table
+        region: AWS region containing the destination bucket.
+        encoding: encoding of the input data
+
+    Postcondition:
+        CSV table with pubOrgKey, pubOrg name written to the named S3 object in
+        bucket and folders
+    """
+    base_url = "https://api.gbif.org/v1/organization"
+    response_keys = ["key", "title"]
+    data_date = get_current_datadate_str()
+    output_fname = f"organization_name_{data_date}"
+    output_fname = "organization_name_2024_02_01"
+    output_columns = ["publishingOrganizationKey", "title"]
+    lookup_df = create_dataframe_from_api(base_url, response_keys, output_columns)
+    write_dataframe_to_s3(
+        lookup_df, bucket, s3_folders, output_fname, region=region, encoding=encoding)
+
+
+# ----------------------------------------------------
+def create_csvfiles_from_apiqueries(
+        base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING,
+        certificate=None):
+    """Query an API for each key and write the requested fields to local CSV files.
+
+    Args:
+        base_url: API URL without any key value pairs for the data service
+        keys: unique identifiers to query the API for
+        response_keys: list of keys within the API response to pull values from. A key
+            can be an ordered list of keys nested within several elements of the tree,
+            from outermost to innermost.
+        output_columns: list of column headings for output lookup table
+        output_fname: base output filename for temporary CSV files
+        encoding: encoding of the input data
+        certificate: local SSL certificate required by some APIs
+
+    Returns:
+        csv_files: Local CSV files with records. The first file in the list will have
+            a header, the rest will not.
+    """
+    tmp_filenames = []
+    records = []
+    write_chunk = 1000
+    for i in range(len(keys)):
+        url = f"{base_url}/{keys[i]}"
+        rec = _get_single_record(url, response_keys, certificate=certificate)
+        if rec:
+            records.append(rec)
+        if i % write_chunk == 0 and i > 0:
+            print(
+                f"{DT.datetime.now().isoformat()} Create dataframe for {len(records)} "
+                f"records; key {i} of {len(keys)}")
+            if records:
+                dataframe = pd.DataFrame(records, columns=output_columns)
+                tmp_filename = f"/tmp/{output_fname}_{i}.csv"
+                # Only the first file gets a header
+                dataframe.to_csv(
+                    path_or_buf=tmp_filename, sep='\t', header=(not tmp_filenames),
+                    columns=output_columns, doublequote=False, escapechar="\\",
+                    encoding=encoding)
+                print(
+                    f"Wrote {tmp_filename} with {len(records)} records and "
+                    f"{dataframe.shape[0]} rows")
+                records = []
+                tmp_filenames.append(tmp_filename)
+    # Write any records left over after the last full chunk
+    if records:
+        dataframe = pd.DataFrame(records, columns=output_columns)
+        tmp_filename = f"/tmp/{output_fname}_{len(keys)}.csv"
+        dataframe.to_csv(
+            path_or_buf=tmp_filename, sep='\t', header=(not tmp_filenames),
+            columns=output_columns, doublequote=False, escapechar="\\",
+            encoding=encoding)
+        print(f"Wrote {tmp_filename} with {dataframe.shape[0]} rows")
+        tmp_filenames.append(tmp_filename)
+    return tmp_filenames
+
+
+# ----------------------------------------------------
+def create_s3_dataset_lookup_by_keys(
+        bucket, s3_folders, region=REGION, encoding=ENCODING, is_test=False):
+    """Query the GBIF Dataset API, write a subset of the response to a table in S3.
+
+    Args:
+        bucket: name of the bucket containing the CSV data.
+        s3_folders: S3 bucket folders for output lookup table
+        region: AWS region containing the destination bucket.
+        encoding: encoding of the input data
+        is_test: True to query only a small subset of dataset keys for testing.
+
+    Note:
+        There are >100k records for datasets and limited memory on this EC2 instance,
+        so we write them as temporary CSV files, then combine them, then create a
+        dataframe and upload.
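+
+    Example:
+        A minimal invocation; the bucket name below is a placeholder and "summary"
+        matches the folder used under __main__:
+        create_s3_dataset_lookup_by_keys("my-bucket", "summary", is_test=True)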
+ + Postcondition: + CSV table with dataset key, pubOrgKey, dataset name, dataset citation written + to the named S3 object in bucket and folders + """ + # Current filenames + data_date = get_current_datadate_str() + data_date = "2024_02_01" + input_fname = f"dataset_counts_{data_date}_000.parquet" + output_fname = f"dataset_meta_{data_date}" + + # Data and query parameters + base_url = "https://api.gbif.org/v1/dataset" + response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] + output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] + certificate = certifi.where() + + # Get keys for dataset resolution + s3_path = f"{s3_folders}/{input_fname}" + query_str = "SELECT datasetkey from s3object s" + key_records = _query_table(bucket, s3_path, query_str, format="CSV") + keys = [r[0] for r in key_records] + if is_test: + keys = keys[:2100] + output_fname = f"dataset_meta_test_{data_date}" + + # Write tempfiles locally + csv_fnames = create_csvfiles_from_apiqueries( + base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING, + certificate=certificate) + + # Aggregate and write all records to S3 + write_csvfiles_to_s3( + csv_fnames, bucket, s3_folders, output_fname, region=region, encoding=encoding) + +# ............................................................................. +if __name__ == "__main__": + bucket=PROJ_BUCKET + region=REGION + encoding=ENCODING + s3_folders="summary" + # keys = [ + # "5a95fa0a-5ef3-432a-b95b-816cd85b2f9b", + # "ee789ae4-ef51-4ff2-931b-bc61b2dbe40e", + # "c8fded56-3ddb-4e26-8863-ba8d55862689", + # "3c83d5da-822a-439c-897a-7569e82c4ebc" + # ] + create_s3_dataset_lookup_by_keys( + bucket, s3_folders, region=REGION, encoding=ENCODING) + # create_s3_dataset_lookup(bucket, s3_folders) + # create_test_s3_dataset_lookup(bucket, s3_folders, keys) + # create_s3_organization_lookup( + # bucket, s3_folders, region=REGION, encoding=ENCODING) + + +""" +# Note: Test with quoted data such as: +# http://api.gbif.org/v1/dataset/3c83d5da-822a-439c-897a-7569e82c4ebc +from sppy.aws.aws_tools import * +from sppy.aws.aws_tools import _query_table + +bucket=PROJ_BUCKET +region=REGION +encoding=ENCODING +s3_folders="summary" + +create_s3_dataset_lookup_by_keys( + bucket, s3_folders, region=REGION, encoding=ENCODING, is_test=False) + + +""" diff --git a/sppy/tools/provider/gbif.py b/sppy/tools/provider/gbif.py index 2a54e98a..a2c521dd 100644 --- a/sppy/tools/provider/gbif.py +++ b/sppy/tools/provider/gbif.py @@ -12,7 +12,7 @@ from sppy.tools.util.logtools import logit from sppy.tools.provider.api import APIQuery -from sppy.tools.s2n.utils import get_traceback, add_errinfo +from sppy.tools.s2n.utils import add_errinfo # ............................................................................. @@ -63,12 +63,26 @@ def _assemble_filter_string(self, filter_string=None): @classmethod def _get_output_val(cls, out_dict, name): try: - tmp = out_dict[name] - val = str(tmp).encode(ENCODING) + val = out_dict[name] except Exception: return None + if type(val) is bytes: + val = str(val).encode(ENCODING) return val + # ............................................... + @classmethod + def _get_nested_output_val(cls, output, key_list): + while key_list: + key = key_list[0] + key_list = key_list[1:] + try: + output = output[key] + if not key_list: + return str(output).encode(ENCODING) + except Exception: + return None + # # ............................................... 
 #     @classmethod
 #     def get_taxonomy(cls, taxon_key, logger=None):
@@ -646,13 +660,68 @@ def get_publishing_org(cls, pub_org_key, logger=None):
             raise
         return pub_org_name
 
+    # ...............................................
+    @classmethod
+    def get_dataset(cls, dataset_key, logger=None):
+        """Return the title and citation from one dataset record with this key.
+
+        Args:
+            dataset_key: GBIF identifier for this dataset
+            logger: object for logging messages and errors.
+
+        Returns:
+            dataset_name: the name of the dataset.
+            citation: the preferred citation for the dataset.
+
+        Raises:
+            Exception: on query failure.
+        """
+        ds_api = GbifAPI(
+            service=GBIF.DATASET_SERVICE, key=dataset_key, logger=logger)
+        try:
+            ds_api.query()
+            dataset_name = ds_api._get_output_val(ds_api.output, "title")
+        except Exception as e:
+            logit(logger, str(e), refname=cls.__name__)
+            raise
+        try:
+            citation = ds_api._get_nested_output_val(
+                ds_api.output, ["citation", "text"])
+        except Exception:
+            citation = None
+        return dataset_name, citation
+
     # ...............................................
     def query(self):
         """Query the API and set "output" attribute to a ElementTree object."""
         APIQuery.query_by_get(self, output_type="json", verify=False)
+
+
+
 # .............................................................................
 if __name__ == "__main__":
     # test
     pass
+
+"""
+# Interactive test of the new dataset lookup (dataset_key is an example UUID)
+from sppy.tools.provider.gbif import GbifAPI
+
+dataset_key = 'e9d1c589-5df6-4bd8-aead-c09e2d8630e4'
+ds_api = GbifAPI(service='dataset', key=dataset_key)
+ds_api.query()
+dataset_name = ds_api._get_output_val(ds_api.output, "title")
+citation = ds_api._get_nested_output_val(ds_api.output, ["citation", "text"])
+print(dataset_name, citation)
+"""
\ No newline at end of file
diff --git a/sppy/tools/provider/spnet.py b/sppy/tools/provider/spnet.py
new file mode 100644
index 00000000..fde4a41c
--- /dev/null
+++ b/sppy/tools/provider/spnet.py
@@ -0,0 +1,292 @@
+"""Class to query tabular summary Specify Network data in S3."""
+import boto3
+import json
+import pandas as pd
+
+from sppy.aws.aws_constants import ENCODING, PROJ_BUCKET, REGION, SUMMARY_FOLDER
+from sppy.aws.aws_tools import get_current_datadate_str
+from sppy.tools.s2n.utils import get_traceback
+
+
+# .............................................................................
+class SpNetAnalyses():
+    """Class for retrieving SpecifyNetwork summary data from AWS S3."""
+
+    # ...............................................
+    def __init__(self, bucket, region=REGION, encoding=ENCODING):
+        """Object to query tabular data in S3.
+
+        Args:
+            bucket: S3 bucket containing data.
+            region: AWS region containing the data.
+            encoding: encoding of the data.
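+
+        Note:
+            Typical use (the dataset key below is only an example):
+            SpNetAnalyses(PROJ_BUCKET).get_dataset_counts(
+                "0000e36f-d0e9-46b0-aa23-cc1980f00515", format="JSON")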
+        """
+        self.bucket = bucket
+        self.region = region
+        self.encoding = encoding
+        self.exp_type = "SQL"
+        self.datestr = get_current_datadate_str()
+        self.datestr = "2024_02_01"
+        self._summary_path = "summary"
+        self._summary_tables = {
+            "dataset_counts": {
+                "fname": f"dataset_counts_{self.datestr}_000.parquet",
+                "fields": ["datasetkey", "occ_count", "species_count"],
+                "key": "datasetkey"
+            },
+            "dataset_species_lists": {
+                "fname": f"dataset_lists_{self.datestr}_000.parquet",
+                "fields": ["datasetkey", "taxonkey", "species", "occ_count"],
+                "key": "datasetkey"
+            },
+            "dataset_meta": {
+                "fname": f"dataset_meta_{self.datestr}.csv",
+                "fields": [
+                    "datasetKey", "publishingOrganizationKey", "title", "citation"],
+                "key": "datasetKey"
+            },
+            "organization_meta": {
+                "fname": f"organization_meta_{self.datestr}.csv",
+                "fields": ["publishingOrganizationKey", "title"],
+                "key": "publishingOrganizationKey"
+            }
+        }
+
+    # ----------------------------------------------------
+    def _list_summaries(self):
+        """List the object names in the summary folder of the bucket."""
+        summary_objs = []
+        s3 = boto3.client("s3", region_name=self.region)
+        summ_objs = s3.list_objects_v2(Bucket=self.bucket, Prefix=self._summary_path)
+        prefix = f"{self._summary_path}/"
+        try:
+            contents = summ_objs["Contents"]
+        except KeyError:
+            pass
+        else:
+            for item in contents:
+                # Remove the folder prefix, not arbitrary leading/trailing characters
+                fname = item["Key"].removeprefix(prefix)
+                if len(fname) > 1:
+                    summary_objs.append(fname)
+        return summary_objs
+
+    # ----------------------------------------------------
+    def _dataset_metadata_exists(self):
+        """Return True if the dataset metadata lookup table exists in the bucket."""
+        fnames = self._list_summaries()
+        if self._summary_tables["dataset_meta"]["fname"] in fnames:
+            return True
+        return False
+
+    # ----------------------------------------------------
+    def _query_table(self, s3_path, query_str, format="CSV"):
+        """Query a parquet or CSV object in S3 with S3 Select.
+
+        Args:
+            s3_path: S3 folder and filename within the bucket
+            query_str: a SQL query for S3 select.
+            format: output format, options "CSV" or "JSON"
+
+        Returns:
+            list of records matching the query
+        """
+        recs = []
+        if format not in ("JSON", "CSV"):
+            format = "JSON"
+        if format == "JSON":
+            out_serialization = {"JSON": {}}
+        elif format == "CSV":
+            out_serialization = {
+                "CSV": {
+                    "QuoteFields": "ASNEEDED",
+                    "FieldDelimiter": ",",
+                    "QuoteCharacter": '"'}
+            }
+        # Infer input serialization from the file extension; the CSV summary
+        # tables are written tab-delimited with a header row.
+        if s3_path.endswith(".csv"):
+            in_serialization = {
+                "CSV": {"FileHeaderInfo": "USE", "FieldDelimiter": "\t"}}
+        else:
+            in_serialization = {"Parquet": {}}
+        s3 = boto3.client("s3", region_name=self.region)
+        resp = s3.select_object_content(
+            Bucket=self.bucket,
+            Key=s3_path,
+            ExpressionType="SQL",
+            Expression=query_str,
+            InputSerialization=in_serialization,
+            OutputSerialization=out_serialization
+        )
+        for event in resp["Payload"]:
+            if "Records" in event:
+                recs_str = event["Records"]["Payload"].decode(ENCODING)
+                rec_strings = recs_str.strip().split("\n")
+                for rs in rec_strings:
+                    if format == "JSON":
+                        rec = json.loads(rs)
+                    else:
+                        rec = rs.split(",")
+                    recs.append(rec)
+        return recs
+
+    # ----------------------------------------------------
+    def _create_dataframe_from_s3obj(self, s3_path):
+        """Read a parquet object from S3 into a pandas DataFrame.
+
+        Args:
+            s3_path: the object name with enclosing S3 bucket folders.
+
+        Returns:
+            df: pandas DataFrame containing the data.
+        """
+        s3_uri = f"s3://{self.bucket}/{s3_path}"
+        df = pd.read_parquet(s3_uri)
+        return df
+
+    # ----------------------------------------------------
+    def _query_order_s3_table(
+            self, s3_path, sort_field, order, limit, format="CSV"):
+        """Return records from a summary table in S3, sorted on one field.
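+
+        Unlike _query_table, which uses S3 Select on a single object, this method
+        loads the whole object into a pandas DataFrame (via
+        _create_dataframe_from_s3obj) and sorts it locally, so it is intended for
+        the relatively small summary tables.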
+
+        Args:
+            s3_path: S3 folder and filename within the bucket
+            sort_field: fieldname (column) to sort records on
+            order: "ascending" or "descending" sort on sort_field
+            limit: number of records to return, maximum 500
+            format: output format, options "CSV" or "JSON"
+
+        Returns:
+            ordered list of records matching the query, and a dictionary of errors
+        """
+        recs = []
+        errors = {}
+        df = self._create_dataframe_from_s3obj(s3_path)
+        # Sort rows (axis 0/index) by values in sort_field (column)
+        sorted_df = df.sort_values(
+            by=sort_field, axis=0, ascending=(order == "ascending"))
+        rec_df = sorted_df.head(limit)
+
+        for row in rec_df.itertuples():
+            rec = {"datasetkey": row.datasetkey,
+                   "species_count": row.species_count,
+                   "occ_count": row.occ_count}
+            recs.append(rec)
+        return recs, errors
+
+    # ----------------------------------------------------
+    def get_dataset_counts(self, dataset_key, format="JSON"):
+        """Query the S3 resource for occurrence and species counts for this dataset.
+
+        Args:
+            dataset_key: unique GBIF identifier for dataset of interest.
+            format: output format, options "CSV" or "JSON"
+
+        Returns:
+            records: empty list or list of 1 record (list or dict, depending on format)
+        """
+        fields = self._summary_tables["dataset_counts"]["fields"]
+        key_idx = fields.index(self._summary_tables["dataset_counts"]["key"])
+
+        table_path = \
+            f"{self._summary_path}/{self._summary_tables['dataset_counts']['fname']}"
+        query_str = (
+            f"SELECT * FROM s3object s WHERE s.datasetkey = '{dataset_key}'"
+        )
+        # Returns empty list or list of 1 record
+        records = self._query_table(table_path, query_str, format=format)
+        if self._dataset_metadata_exists():
+            self.add_dataset_lookup_vals(records, key_idx=key_idx)
+        return records
+
+    # ----------------------------------------------------
+    def add_dataset_lookup_vals(self, records, key_idx=0, format="JSON"):
+        """Add dataset title and citation from the dataset_meta table to records.
+
+        Args:
+            records: list of records (dicts for JSON format, lists for CSV format),
+                each containing a dataset key.
+            key_idx: index of the dataset key in a CSV-format (list) record.
+            format: output format, options "CSV" or "JSON"
+
+        Postcondition:
+            Each record is updated in place with the dataset_meta lookup values,
+            or with empty placeholders if no metadata is found.
+        """
+        table_path = \
+            f"{self._summary_path}/{self._summary_tables['dataset_meta']['fname']}"
+        fields = self._summary_tables["dataset_meta"]["fields"]
+        key_fld = fields[0]
+        new_flds = fields[1:]
+        qry_flds = ", ".join(new_flds)
+
+        for rec in records:
+            # Records are dicts (JSON) or lists (CSV)
+            if isinstance(rec, dict):
+                rec_key = rec[self._summary_tables["dataset_counts"]["key"]]
+            else:
+                rec_key = rec[key_idx]
+            query_str = (
+                f"SELECT {qry_flds} FROM s3object s WHERE s.{key_fld} = "
+                f"'{rec_key}'"
+            )
+            # Returns empty list or list of 1 record
+            meta_recs = self._query_table(table_path, query_str, format=format)
+            try:
+                meta = meta_recs[0]
+            except IndexError:
+                # Add placeholders for missing metadata
+                if format == "CSV":
+                    rec.extend(["" for _ in new_flds])
+                else:
+                    for fld in new_flds:
+                        rec[fld] = ""
+            else:
+                if format == "JSON":
+                    rec.update(meta)
+                else:
+                    rec.extend(meta)
+
+    # # ----------------------------------------------------
+    # def get_org_counts(self, pub_org_key):
+    #     """Query S3 for occurrence and species counts for this organization.
+    #
+    #     Args:
+    #         pub_org_key: unique GBIF identifier for organization of interest.
+    #
+    #     Returns:
+    #         records: empty list or list of 1 record containing occ_count, species_count
+    #
+    #     TODO: implement this?
+    #     """
+    #     (occ_count, species_count) = (0,0)
+    #     return (occ_count, species_count)
+
+    # ----------------------------------------------------
+    def rank_dataset_counts(self, count_by, order, limit, format="JSON"):
+        """Return the top or bottom datasets, ranked by occurrence or species counts.
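+
+        Counts are read from the dataset_counts table; when the dataset_meta
+        lookup table exists in S3, the dataset title and citation are merged
+        into each record.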
+ + Args: + count_by: string indicating rank datasets by counts of "species" or + "occurrence" . + order: string indicating whether to rank in "descending" or + "ascending" order. + limit: number of datasets to return, no more than 300. + format: output format, options "CSV" or "JSON" + + Returns: + records: list of limit records containing dataset_key, occ_count, species_count + """ + records = [] + table_path = \ + f"{self._summary_path}/{self._summary_tables['dataset_counts']['fname']}" + fields = self._summary_tables["dataset_counts"]["fields"] + key_idx = fields.index(self._summary_tables["dataset_counts"]["key"]) + if count_by == "species": + sort_field = "species_count" + else: + sort_field = "occ_count" + try: + records, errors = self._query_order_s3_table( + table_path, sort_field, order, limit) + except Exception as e: + errors = {"error": [get_traceback()]} + + if self._dataset_metadata_exists(): + self.add_dataset_lookup_vals(records, key_idx=key_idx) + return records, errors + +# ............................................................................. +if __name__ == "__main__": + format = "JSON" + dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" + s3q = SpNetAnalyses(PROJ_BUCKET) + recs = s3q.get_dataset_counts(dataset_key, format=format) + for r in recs: + print(r) + diff --git a/sppy/tools/s2n/utils.py b/sppy/tools/s2n/utils.py index 3b6895ea..bc3f3241 100644 --- a/sppy/tools/s2n/utils.py +++ b/sppy/tools/s2n/utils.py @@ -3,9 +3,6 @@ import traceback from uuid import UUID -# from flask_app.broker.constants import ICON_API, ServiceProvider -# from flask_app.common.s2n_type import APIEndpoint - # ...................................................... def is_valid_uuid(uuid_to_test, version=4):