From 09127d28444af672394c63f662131e689d75397c Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 11 Mar 2024 11:14:15 -0500 Subject: [PATCH 01/81] point to correct openapi definition --- flask_app/broker/routes.py | 2 +- flask_app/templates/{swagger_ui.html => swagger_ui.broker.html} | 2 +- sphinx/flask/structure.rst | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename flask_app/templates/{swagger_ui.html => swagger_ui.broker.html} (90%) create mode 100644 sphinx/flask/structure.rst diff --git a/flask_app/broker/routes.py b/flask_app/broker/routes.py index cbaabfcf..e2d8db11 100644 --- a/flask_app/broker/routes.py +++ b/flask_app/broker/routes.py @@ -68,7 +68,7 @@ def swagger_ui(): Returns: a webpage UI of the Specify Network schema. """ - return render_template("swagger_ui.html") + return render_template("swagger_ui.broker.html") # ..................................................................................... diff --git a/flask_app/templates/swagger_ui.html b/flask_app/templates/swagger_ui.broker.html similarity index 90% rename from flask_app/templates/swagger_ui.html rename to flask_app/templates/swagger_ui.broker.html index a6f4a5f3..1beb4832 100644 --- a/flask_app/templates/swagger_ui.html +++ b/flask_app/templates/swagger_ui.broker.html @@ -18,7 +18,7 @@ diff --git a/sphinx/flask/structure.rst b/sphinx/flask/structure.rst new file mode 100644 index 00000000..e69de29b From 0458b26fc5a6749e25f75f75a8aae74c925d14a7 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 11 Mar 2024 11:14:54 -0500 Subject: [PATCH 02/81] doc for SpNetwork structure --- sphinx/flask/structure.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sphinx/flask/structure.rst b/sphinx/flask/structure.rst index e69de29b..dbab00fa 100644 --- a/sphinx/flask/structure.rst +++ b/sphinx/flask/structure.rst @@ -0,0 +1,14 @@ +Structure +###################################### + +Specify Network consists of four Docker containers running on a single EC2 instance. + +The nginx and front-end containers support both the Analyst and Broker. Two flask +containers, one for Analyst, and one for Broker, expose the APIs of each to different +subdomains of the same domain. Code for each is in the flask_app.analyst and +flask_app.broker directories. In each, the routes.py file defines the different +endpoints. + + + + From ef20344d5916beb85cb326541bd2c94451a0050c Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 11 Mar 2024 17:29:07 -0500 Subject: [PATCH 03/81] separate schema files --- flask_app/broker/routes.py | 6 +++--- flask_app/common/constants.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/flask_app/broker/routes.py b/flask_app/broker/routes.py index e2d8db11..347e93f4 100644 --- a/flask_app/broker/routes.py +++ b/flask_app/broker/routes.py @@ -1,10 +1,10 @@ """URL Routes for the Specify Network API services.""" -import os from flask import Blueprint, Flask, render_template, request +import os # from flask_app.application import create_app from flask_app.common.constants import ( - TEMPLATE_DIR, STATIC_DIR, SCHEMA_DIR, SCHEMA_FNAME + TEMPLATE_DIR, STATIC_DIR, SCHEMA_DIR, SCHEMA_BROKER_FNAME ) from flask_app.common.s2n_type import APIEndpoint @@ -54,7 +54,7 @@ def display_raw_schema(): Returns: schema: the schema for the Specify Network. 
""" - fname = os.path.join(SCHEMA_DIR, SCHEMA_FNAME) + fname = os.path.join(SCHEMA_DIR, SCHEMA_BROKER_FNAME) with open(fname, "r") as f: schema = f.read() return schema diff --git a/flask_app/common/constants.py b/flask_app/common/constants.py index 3e50c197..1f782357 100644 --- a/flask_app/common/constants.py +++ b/flask_app/common/constants.py @@ -8,4 +8,5 @@ SCHEMA_DIR = f"{STATIC_DIR}/schema" TEMPLATE_DIR = "../templates" -SCHEMA_FNAME = "open_api.yaml" +SCHEMA_ANALYST_FNAME = "open_api.analyst.yaml" +SCHEMA_BROKER_FNAME = "open_api.broker.yaml" From f3e54f4755f317824e9956b76dfda785e072ecb8 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 11 Mar 2024 17:30:22 -0500 Subject: [PATCH 04/81] add GBIF datasetkey resolver --- flask_app/analyst/count.py | 74 ++++++++++++++++++++++++------------- flask_app/analyst/routes.py | 72 +++++++++++++++++++----------------- sppy/tools/provider/gbif.py | 68 ++++++++++++++++++++++++++++++++++ 3 files changed, 154 insertions(+), 60 deletions(-) diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 65ad6291..375a2694 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -1,10 +1,12 @@ """Class for the Specify Network Name API service.""" +import boto3 from http import HTTPStatus from flask_app.common.s2n_type import APIService, AnalystOutput from flask_app.common.util import print_analyst_output from flask_app.analyst.base import _AnalystService +from sppy.aws.aws_tools import query_s3_table from sppy.tools.s2n.utils import get_traceback @@ -16,9 +18,21 @@ class CountSvc(_AnalystService): # ............................................... @classmethod - def get_counts(cls, collection_id, organization_id): + def get_counts(cls, dataset_key): + """Get counts for datasetKey. + + Args: + dataset_key: Unique identifier for GBIF datasets. + + Returns: + a flask_app.broker.s2n_type.BrokerOutput object with optional records as a + list of dictionaries of records corresponding to specimen occurrences in + the provider database. + + Todo: Consider adding publishing organization queries with pub_org_key + """ try: - output = cls._get_records(collection_id, organization_id) + output = cls._get_records(dataset_key, ) except Exception: traceback = get_traceback() output = AnalystOutput( @@ -29,15 +43,15 @@ def get_counts(cls, collection_id, organization_id): # ............................................... @classmethod - def _get_organization_counts(cls, organization_id): + def _get_organization_counts(cls, pub_org_key): return { "Organization Raw Counts": { - organization_id: 1, + pub_org_key: 1, "org_id_2": 2, "org_id_3": 3 }, - f"{organization_id} to other orgs": + f"{pub_org_key} to other orgs": { "to total": "0.5", "org_id_2": "1.2", @@ -47,33 +61,41 @@ def _get_organization_counts(cls, organization_id): # ............................................... 
@classmethod - def _get_collection_counts(cls, collection_id): - return { - "Collection Raw Counts": - { - collection_id: 1, - "coll_id_2": 2, - "coll_id_3": 3 - }, - f"{collection_id} Ratios": - { - collection_id: "0.5", - "coll_id_2": "0.5", - "coll_id_3": "0.5", - "to total": "0.5" - } - } + def _get_dataset_counts(cls, dataset_key): + s3 = boto3.client('s3') + + resp = s3.select_object_content( + Bucket=PROJ_, + Key='sample_data.csv', + ExpressionType='SQL', + Expression="SELECT * FROM s3object s where s.\"Name\" = 'Jane'", + InputSerialization={'CSV': {"FileHeaderInfo": "Use"}, 'CompressionType': 'NONE'}, + OutputSerialization={'CSV': {}}, + ) + + for event in resp['Payload']: + if 'Records' in event: + records = event['Records']['Payload'].decode('utf-8') + print(records) + elif 'Stats' in event: + statsDetails = event['Stats']['Details'] + print("Stats details bytesScanned: ") + print(statsDetails['BytesScanned']) + print("Stats details bytesProcessed: ") + print(statsDetails['BytesProcessed']) + print("Stats details bytesReturned: ") + print(statsDetails['BytesReturned']) # ............................................... @classmethod - def _get_records(cls, collection_id, organization_id): + def _get_records(cls, dataset_key, pub_org_key): allrecs = [] # for response metadata - if collection_id is not None: - coll_data = cls._get_collection_counts(collection_id) + if dataset_key is not None: + coll_data = cls._get_collection_counts(dataset_key) allrecs.append(coll_data) - if organization_id is not None: - org_data = cls._get_organization_counts(organization_id) + if pub_org_key is not None: + org_data = cls._get_organization_counts(pub_org_key) allrecs.append(org_data) # Assemble diff --git a/flask_app/analyst/routes.py b/flask_app/analyst/routes.py index f242e289..f53ad06c 100644 --- a/flask_app/analyst/routes.py +++ b/flask_app/analyst/routes.py @@ -1,8 +1,11 @@ """URL Routes for the Specify Network API services.""" from flask import Blueprint, Flask, render_template, request +import os from flask_app.analyst.count import CountSvc -from flask_app.common.constants import (STATIC_DIR, TEMPLATE_DIR) +from flask_app.common.constants import ( + STATIC_DIR, TEMPLATE_DIR, SCHEMA_DIR, SCHEMA_ANALYST_FNAME) +from flask_app.common.s2n_type import APIEndpoint analyst_blueprint = Blueprint( "analyst", __name__, template_folder=TEMPLATE_DIR, static_folder=STATIC_DIR, @@ -18,37 +21,37 @@ def index(): return render_template("analyst.index.html") -# # ..................................................................................... -# @app.route("/api/v1/", methods=["GET"]) -# def analyst_status(): -# """Get services available from broker. -# -# Returns: -# dict: A dictionary of status information for the server. -# """ -# endpoints = APIEndpoint.get_analyst_endpoints() -# system_status = "In Development" -# return { -# "num_services": len(endpoints), -# "endpoints": endpoints, -# "status": system_status -# } -# +# ..................................................................................... +@app.route("/api/v1/", methods=["GET"]) +def analyst_status(): + """Get services available from broker. + + Returns: + dict: A dictionary of status information for the server. + """ + endpoints = APIEndpoint.get_analyst_endpoints() + system_status = "In Development" + return { + "num_services": len(endpoints), + "endpoints": endpoints, + "status": system_status + } + + +# .......................... +@app.route("/api/v1/schema") +def display_raw_schema(): + """Show the schema XML. 
+ + Returns: + schema: the schema for the Specify Network. + """ + fname = os.path.join(SCHEMA_DIR, SCHEMA_ANALYST_FNAME) + with open(fname, "r") as f: + schema = f.read() + return schema + -# # .......................... -# @app.route("/api/v1/schema") -# def display_raw_schema(): -# """Show the schema XML. -# -# Returns: -# schema: the schema for the Specify Network. -# """ -# fname = os.path.join(SCHEMA_DIR, SCHEMA_FNAME) -# with open(fname, "r") as f: -# schema = f.read() -# return schema -# -# # # .......................... # @app.route("/api/v1/swaggerui") # def swagger_ui(): @@ -70,11 +73,12 @@ def count_endpoint(): API response. """ coll_arg = request.args.get("collection_id", default=None, type=str) - org_arg = request.args.get("organization_id", default=None, type=str) - if coll_arg is None and org_arg is None: + # org_arg = request.args.get("organization_id", default=None, type=str) + # if coll_arg is None and org_arg is None: + if coll_arg is None: response = CountSvc.get_endpoint() else: - response = CountSvc.get_counts(coll_arg, org_arg) + response = CountSvc.get_counts(coll_arg) return response diff --git a/sppy/tools/provider/gbif.py b/sppy/tools/provider/gbif.py index 2a54e98a..7854bc7f 100644 --- a/sppy/tools/provider/gbif.py +++ b/sppy/tools/provider/gbif.py @@ -69,6 +69,19 @@ def _get_output_val(cls, out_dict, name): return None return val + # ............................................... + @classmethod + def _get_nested_output_val(cls, output, key_list): + while key_list: + key = key_list[0] + key_list = key_list[1:] + try: + output = output[key] + if not key_list: + return str(output).encode(ENCODING) + except Exception: + return None + # # ............................................... # @classmethod # def get_taxonomy(cls, taxon_key, logger=None): @@ -646,13 +659,68 @@ def get_publishing_org(cls, pub_org_key, logger=None): raise return pub_org_name + # ............................................... + @classmethod + def get_dataset(cls, dataset_key, logger=None): + """Return title from one dataset record with this key. + + Args: + dataset_key: GBIF identifier for this dataset + logger: object for logging messages and errors. + + Returns: + dataset_name: the name of the dataset. + citation: the preferred citation for the dataset. + + Raises: + Exception: on query failure. + """ + ds_api = GbifAPI( + service=GBIF.DATASET_SERVICE, key=dataset_key, logger=logger) + try: + ds_api.query() + dataset_name = ds_api._get_output_val(ds_api.output, "title") + except Exception as e: + logit(logger, str(e), refname=cls.__name__) + raise + try: + citation = ds_api._get_nested_output_val( + ds_api.output, ["citation", "text"]) + except Exception as e: + citation = None + return dataset_name, citation + # ............................................... def query(self): """Query the API and set "output" attribute to a ElementTree object.""" APIQuery.query_by_get(self, output_type="json", verify=False) + + + # ............................................................................. 
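# A minimal caller-side sketch for the new get_dataset() resolver above, assuming the
# sppy package is importable. The dataset key is the example key from the test notes
# at the end of this module; get_dataset() returns the GBIF dataset title and its
# preferred citation (None when the record carries no citation).
from sppy.tools.provider.gbif import GbifAPI

def show_dataset_citation(dataset_key="e9d1c589-5df6-4bd8-aead-c09e2d8630e4"):
    # Queries the GBIF dataset API once and prints the resolved metadata.
    title, citation = GbifAPI.get_dataset(dataset_key)
    print(f"title:    {title}")
    print(f"citation: {citation}")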
if __name__ == "__main__": # test pass + +""" +from sppy.tools.provider.gbif import GbifAPI + +dataset_key = 'e9d1c589-5df6-4bd8-aead-c09e2d8630e4' +ds_api = GbifAPI(service='dataset', key=dataset_key) +try: + ds_api.query() + dataset_name = ds_api._get_output_val(ds_api.output, "title") +except Exception as e: + logit(logger, str(e), refname=cls.__name__) + raise +try: + citation = ds_api._get_nested_output_val( + ds_api.output, ["citation", "text"]) +except Exception as e: + logit(logger, str(e), refname=cls.__name__) + raise +return dataset_name, citation + +""" \ No newline at end of file From 91a66a05f0868f322e15e9a34c8936a4e829b464 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 11 Mar 2024 17:31:53 -0500 Subject: [PATCH 05/81] S3 Select access with upgraded dependency --- requirements.txt | 2 +- sphinx/aws/aws-setup.rst | 7 +++ sppy/aws/aws_tools.py | 111 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 117 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 1af53c6e..553b4dc4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ pykew>=0.1.3 gunicorn==20.1.0 rtree>=1.0.0 awscli -boto3 +boto3>=1.34.60 pandas pyarrow s3fs diff --git a/sphinx/aws/aws-setup.rst b/sphinx/aws/aws-setup.rst index e657a105..828da48d 100644 --- a/sphinx/aws/aws-setup.rst +++ b/sphinx/aws/aws-setup.rst @@ -9,6 +9,13 @@ Configure AWS credentials either through * AWS CLI configuration (for command line tools), or * using an IAM role attached to your instance if running on AWS infrastructure. +The AWS cli depends on boto3, so both must be up to date. In my testing, awscli +1.27.118 (with requirement botocore==1.29.118) and boto3 1.28.1, failed on +S3 Select access. + +I upgraded awscli (sudo apt install awscli), then upgraded boto3 +(pip install --upgrade boto3) , which installed 1.34.60. Success + Redshift =========================================================== diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index b9ff1a89..a055ab6b 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -13,8 +13,9 @@ import os from sppy.aws.aws_constants import ( - INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, PROJ_NAME, - REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, USER_DATA_TOKEN) + INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, + PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, + USER_DATA_TOKEN) # -------------------------------------------------------------------------------------- @@ -672,3 +673,109 @@ def create_dataframe_from_s3obj( # s3_fs = s3fs.S3FileSystem df = pd.read_parquet(s3_uri) return df + + +# ............................................................................. +class S3Query(): + """Specify Network API service for retrieving taxonomic information.""" + + # ............................................... + @classmethod + def __init__( + self, bucket, region=REGION, encoding="utf-8"): + """Object to query tabular data in S3. + + Args: + bucket: S3 bucket containing data. + s3_path: S3 folder(s) containing data objects. + datatype: type of tabular data, 'CSV', 'JSON', and 'Parquet' are allowed. + region: AWS region containing the data. + encoding: encoding of the data. 
+ """ + self.s3 = boto3.client('s3') + self.bucket = bucket + self.region = region + self.encoding = encoding + self._current_datestr = get_current_datadate_str() + self.exp_type = 'SQL' + + # ---------------------------------------------------- + def query_s3_table(self, s3_path, query_str): + """Query the S3 resource defined for this class. + + Args: + query_str: a SQL query for S3 select. + + Returns: + list of records matching the query + """ + recs = [] + resp = self.s3.select_object_content( + Bucket=self.bucket, + Key= self.s3_path, + ExpressionType='SQL', + Expression=query_str, + InputSerialization={"Parquet": {}}, + OutputSerialization={"JSON": {}} + ) + for event in resp["Payload"]: + if "Records" in event: + records = event["Records"]["Payload"].decode(self.encoding) + recs.append(records) + return recs + + # ---------------------------------------------------- + def get_dataset_counts(self, dataset_key): + """Query the S3 resource for occurrence and species counts for this dataset. + + Args: + dataset_key: unique GBIF identifier for dataset of interest. + + Returns: + records + """ + datestr = get_current_datadate_str() + datestr = "2024_02_01" + s3_path = f"summary/dataset_counts_{datestr}_000.parquet" + query_str = (f"SELECT occ_count, species_count " + f"FROM s3object s " + f"WHERE s.datasetkey = {dataset_key}") + records = self.query_s3_table(s3_path, query_str) + return records + + +""" +import boto3 + +from sppy.aws.aws_constants import ( + INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, + PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, + USER_DATA_TOKEN) + +ctable = "dataset_counts_2024_02_01_000.parquet" +ltable = "dataset_lists_2024_02_01_000.parquet" +s3_path = f"summary/{ctable}" +dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" + +s3 = boto3.client('s3') +query_str = (f"SELECT occ_count, species_count " + f"FROM s3object s " + f"WHERE s.datasetkey = '{dataset_key}'") + +SELECT * FROM s3object WHERE datasetkey = '0000e36f-d0e9-46b0-aa23-cc1980f00515' + +resp = s3.select_object_content( + Bucket=PROJ_BUCKET, + Key=s3_path, + ExpressionType='SQL', + Expression=query_str, + InputSerialization={"Parquet": {}}, + OutputSerialization={"CSV": {}} + ) + +for event in resp["Payload"]: + if "Records" in event: + records = event["Records"]["Payload"].decode('utf-8') + print(records) + +""" From d2c682102d2ed3e921b52b193ed5d7e595cb806c Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 11 Mar 2024 17:32:11 -0500 Subject: [PATCH 06/81] testing notes --- sphinx/misc/debugging.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sphinx/misc/debugging.rst b/sphinx/misc/debugging.rst index 29543b7e..8cfd0aa3 100644 --- a/sphinx/misc/debugging.rst +++ b/sphinx/misc/debugging.rst @@ -10,6 +10,7 @@ IDE debugging of functions Local debugging of flask app ============================================= +* Choose to run the Analyst or Broker with FLASK_APP environment variable * Run flask at command prompt ```zsh @@ -17,7 +18,11 @@ export FLASK_ENV=development export FLASK_APP=flask_app.broker.routes flask run ``` +* With either Analyst or Broker, the development port will be 5000 + + * Connect to http://127.0.0.1:5000 in browser, + i.e. http://127.0.0.1:5000/api/v1/name/Acer%20opalus%20Miller?is_accepted=True&gbif_count=False& -* Connect to localhost in browser. * Flask will auto-update on file save. 
* Refresh browser after changes + From 250225bab1f96b014862102b1c6197030cabdd6e Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 12 Mar 2024 14:24:00 -0500 Subject: [PATCH 07/81] move common Analyst/BrokerService methods to new superclass --- flask_app/analyst/base.py | 218 ++++------------------------------ flask_app/broker/base.py | 217 +--------------------------------- flask_app/common/base.py | 241 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 270 insertions(+), 406 deletions(-) create mode 100644 flask_app/common/base.py diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index fc31c364..c4568345 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -1,71 +1,20 @@ """Parent Class for the Specify Network API services.""" from flask import Flask +from werkzeug.exceptions import (BadRequest, InternalServerError) -import sppy.tools.s2n.utils as lmutil +from flask_app.common.base import _SpecifyNetworkService +from sppy.tools.s2n.utils import add_errinfo, get_traceback from flask_app.common.s2n_type import AnalystOutput, APIEndpoint, APIService app = Flask(__name__) # ............................................................................. -class _AnalystService: +class _AnalystService(_SpecifyNetworkService): """Base S-to-the-N service, handles parameter names and acceptable values.""" # overridden by subclasses SERVICE_TYPE = APIService.AnalystRoot - # ............................................... - @classmethod - def _get_valid_requested_params(cls, user_params_string, valid_params): - """Return valid and invalid options for parameters that accept >1 values. - - Args: - user_params_string: user-requested parameters as a string. - valid_params: valid parameter values - - Returns: - valid_requested_params: list of valid params from the provided query string - invalid_params: list of invalid params from the provided query string - - Note: - For the badge service, exactly one provider is required. For all other - services, multiple providers are accepted, and None indicates to query all - valid providers. - """ - valid_requested_params = invalid_params = [] - - if user_params_string: - tmplst = user_params_string.split(",") - user_params = {tp.lower().strip() for tp in tmplst} - - valid_requested_params = set() - invalid_params = set() - # valid_requested_providers, invalid_providers = - # cls.get_multivalue_options(user_provs, valid_providers) - for param in user_params: - if param in valid_params: - valid_requested_params.add(param) - else: - invalid_params.add(param) - - invalid_params = list(invalid_params) - if valid_requested_params: - valid_requested_params = list(valid_requested_params) - else: - valid_requested_params = [] - - return valid_requested_params, invalid_params - - # ............................................................................. - @classmethod - def endpoint(cls): - """Return the URL endpoint for this class. - - Returns: - URL endpoint for the service - """ - endpoint = f"{APIEndpoint.Root}/{cls.SERVICE_TYPE['endpoint']}" - return endpoint - # ............................................... @classmethod def get_endpoint(cls, **kwargs): @@ -75,7 +24,7 @@ def get_endpoint(cls, **kwargs): **kwargs: keyword arguments are accepted but ignored Returns: - flask_app.broker.s2n_type.S2nOutput object + flask_app.analyst.s2n_type.S2nOutput object Raises: Exception: on unknown error. @@ -106,155 +55,40 @@ def _show_online(cls): # ............................................... 
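# The local-debugging notes above can be exercised from Python as well as a browser.
# This illustrative helper assumes the Broker dev server is already running on
# port 5000 as described in those notes; the URL is the example given there.
import json
from urllib.request import urlopen

def check_local_broker():
    url = ("http://127.0.0.1:5000/api/v1/name/Acer%20opalus%20Miller"
           "?is_accepted=True&gbif_count=False")
    with urlopen(url) as response:
        # The broker endpoints return JSON; print it to confirm the service is up.
        print(json.load(response))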
@classmethod - def _fix_type_new(cls, key, provided_val): - """Modify a parameter value to a valid type and value. - - Args: - key: parameter key - provided_val: user-provided parameter value - - Returns: - usr_val: a valid value for the parameter - valid_options: list of valid options (for error message) - - Note: - Corrections: - * cast to correct type - * validate with any options - * if value is invalid (type or value), return the default. - """ - valid_options = None - if provided_val is None: - return None - # all strings are lower case - try: - provided_val = provided_val.lower() - except Exception: - pass - - # First see if restricted to options - default_val = cls.SERVICE_TYPE["params"][key]["default"] - type_val = cls.SERVICE_TYPE["params"][key]["type"] - # If restricted options, check - try: - options = cls.SERVICE_TYPE["params"][key]["options"] - except KeyError: - options = None - else: - # Invalid option returns default value - if provided_val in options: - usr_val = provided_val - else: - valid_options = options - usr_val = default_val - - # If not restricted to options - if options is None: - # Cast values to correct type. Failed conversions return default value - if isinstance(type_val, str) and not options: - usr_val = str(provided_val) - - elif isinstance(type_val, float): - try: - usr_val = float(provided_val) - except ValueError: - usr_val = default_val - - # Boolean also tests as int, so try boolean first - elif isinstance(type_val, bool): - if provided_val in (0, "0", "n", "no", "f", "false"): - usr_val = False - elif provided_val in (1, "1", "y", "yes", "t", "true"): - usr_val = True - else: - valid_options = (True, False) - usr_val = default_val - - elif isinstance(type_val, int): - try: - usr_val = int(provided_val) - except ValueError: - usr_val = default_val - - else: - usr_val = provided_val - - return usr_val, valid_options - - # ............................................... - @classmethod - def _process_params(cls, user_kwargs=None): - """Modify all user provided keys to lowercase and values to correct types. - - Args: - user_kwargs: dictionary of keywords and values sent by the user for - the current service. - - Returns: - good_params: dictionary of valid parameters and values - errinfo: dictionary of errors for different error levels. - - Note: - A list of valid values for a keyword can include None as a default - if user-provided value is invalid - Todo: - Do we need not_in_valid_options for error message? - """ - good_params = {} - errinfo = {} - - # Correct all parameter keys/values present - for key in cls.SERVICE_TYPE["params"]: - val = user_kwargs[key] - # Done in calling function - if val is not None: - usr_val, valid_options = cls._fix_type_new(key, val) - if valid_options is not None and val not in valid_options: - errinfo = lmutil.add_errinfo( - errinfo, "error", - f"Value {val} for parameter {key} is not in valid options " - f"{cls.SERVICE_TYPE['params'][key]['options']}") - good_params[key] = None - else: - good_params[key] = usr_val - - # Fill in defaults for missing parameters - for key in cls.SERVICE_TYPE["params"]: - param_meta = cls.SERVICE_TYPE["params"][key] - try: - _ = good_params[key] - except KeyError: - good_params[key] = param_meta["default"] - - return good_params, errinfo - - # ............................................... 
- @classmethod - def _standardize_params(cls, collection_id=None, organization_id=None): + def _standardize_params( + cls, dataset_key=None, pub_org_key=None, order="descending", limit=10): """Standardize query parameters to send to appropriate service. Args: - collection_id: collection identifier for comparisons - organization_id: organization identifier for comparisons + dataset_key: unique GBIF dataset identifier for comparisons + pub_org_key: unique publishing organization identifier for comparisons Returns: a dictionary containing keys and properly formatted values for the user specified parameters. """ user_kwargs = { - "collection_id": collection_id, - "organization_id": organization_id + "collection_id": dataset_key, + "organization_id": pub_org_key, + "order": order, + "limit": limit } - usr_params, errinfo = cls._process_params(user_kwargs) + try: + usr_params, errinfo = cls._process_params(user_kwargs) - return usr_params, errinfo + # errinfo indicates bad parameters + try: + error_description = "; ".join(errinfo["error"]) + raise BadRequest(error_description) + except KeyError: + pass - # .......................... - @staticmethod - def OPTIONS(): - """Common options request for all services (needed for CORS).""" - return + except Exception: + error_description = get_traceback() + raise BadRequest(error_description) + + return usr_params, errinfo # ............................................................................. diff --git a/flask_app/broker/base.py b/flask_app/broker/base.py index 5b5afb0c..200ed43e 100644 --- a/flask_app/broker/base.py +++ b/flask_app/broker/base.py @@ -3,6 +3,7 @@ from werkzeug.exceptions import BadRequest, InternalServerError import sppy.tools.s2n.utils as lmutil +from flask_app.common.base import _SpecifyNetworkService from flask_app.common.s2n_type import ( APIEndpoint, APIService, BrokerOutput, get_host_url, S2nKey, ServiceProvider) from sppy.tools.provider.gbif import GbifAPI @@ -21,7 +22,7 @@ def handle_bad_response(e): return f"Internal Server Error: {e}" # ............................................................................. -class _BrokerService: +class _BrokerService(_SpecifyNetworkService): """Base S-to-the-N service, handles parameter names and acceptable values.""" # overridden by subclasses SERVICE_TYPE = APIService.BrokerRoot @@ -84,59 +85,6 @@ def get_providers(cls, filter_params=None): provnames = cls._order_providers(provnames) return provnames - # ............................................................................. - @classmethod - def _get_valid_requested_params(cls, user_params_string, valid_params): - """Return valid and invalid options for parameters that accept >1 values. - - Args: - user_params_string: user-requested parameters as a string. - valid_params: valid parameter values - - Returns: - valid_requested_params: list of valid params from the provided query string - invalid_params: list of invalid params from the provided query string - - Note: - For the badge service, exactly one provider is required. For all other - services, multiple providers are accepted, and None indicates to query all - valid providers. 
- """ - valid_requested_params = invalid_params = [] - - if user_params_string: - tmplst = user_params_string.split(",") - user_params = {tp.lower().strip() for tp in tmplst} - - valid_requested_params = set() - invalid_params = set() - # valid_requested_providers, invalid_providers = - # cls.get_multivalue_options(user_provs, valid_providers) - for param in user_params: - if param in valid_params: - valid_requested_params.add(param) - else: - invalid_params.add(param) - - invalid_params = list(invalid_params) - if valid_requested_params: - valid_requested_params = list(valid_requested_params) - else: - valid_requested_params = [] - - return valid_requested_params, invalid_params - - # ............................................................................. - @classmethod - def endpoint(cls): - """Return the URL endpoint for this class. - - Returns: - URL endpoint for the service - """ - endpoint = f"{APIEndpoint.Root}/{cls.SERVICE_TYPE['endpoint']}" - return endpoint - # ............................................... @classmethod def get_endpoint(cls, **kwargs): @@ -236,154 +184,6 @@ def match_name_with_itis(self, namestr): pass return namestr - # ............................................... - @classmethod - def _fix_type_new(cls, key, provided_val): - """Modify a parameter value to a valid type and value. - - Args: - key: parameter key - provided_val: user-provided parameter value - - Returns: - usr_val: a valid value for the parameter - valid_options: list of valid options (for error message) - - Note: - Corrections: - * cast to correct type - * validate with any options - * if value is invalid (type or value), return the default. - """ - valid_options = None - if provided_val is None: - return None - # all strings are lower case - try: - provided_val = provided_val.lower() - except Exception: - pass - - param_meta = cls.SERVICE_TYPE["params"][key] - # First see if restricted to options - default_val = param_meta["default"] - type_val = param_meta["type"] - # If restricted options, check - try: - options = param_meta["options"] - except KeyError: - options = None - else: - # Invalid option returns default value - if provided_val in options: - usr_val = provided_val - else: - valid_options = options - usr_val = default_val - - # If not restricted to options - if options is None: - # Cast values to correct type. Failed conversions return default value - if isinstance(type_val, str) and not options: - usr_val = str(provided_val) - - elif isinstance(type_val, float): - try: - usr_val = float(provided_val) - except ValueError: - usr_val = default_val - - # Boolean also tests as int, so try boolean first - elif isinstance(type_val, bool): - if provided_val in (0, "0", "n", "no", "f", "false"): - usr_val = False - elif provided_val in (1, "1", "y", "yes", "t", "true"): - usr_val = True - else: - valid_options = (True, False) - usr_val = default_val - - elif isinstance(type_val, int): - try: - usr_val = int(provided_val) - except ValueError: - usr_val = default_val - - else: - usr_val = provided_val - - return usr_val, valid_options - - # ............................................... - @classmethod - def _process_params(cls, user_kwargs=None): - """Modify all user provided keys to lowercase and values to correct types. - - Args: - user_kwargs: dictionary of keywords and values sent by the user for - the current service. - - Returns: - good_params: dictionary of valid parameters and values - errinfo: dictionary of errors for different error levels. 
- - Note: - A list of valid values for a keyword can include None as a default - if user-provided value is invalid - Todo: - Do we need not_in_valid_options for error message? - """ - good_params = {} - errinfo = {} - - # Correct all parameter keys/values present - for key, param_meta in cls.SERVICE_TYPE["params"].items(): - val = user_kwargs[key] - # Done in calling function - if key == "provider": - pass - - # Do not edit namestr, maintain capitalization - elif key == "namestr": - good_params["namestr"] = val - - # Require one valid icon_status - elif key == "icon_status": - valid_stat = param_meta["options"] - if val is None: - errinfo = lmutil.add_errinfo( - errinfo, "error", - f"Parameter {key} containing one of {valid_stat} options is " - f"required") - elif val not in valid_stat: - errinfo = lmutil.add_errinfo( - errinfo, "error", - f"Value {val} for parameter {key} not in valid options " - f"{valid_stat}") - else: - good_params[key] = val - - elif val is not None: - usr_val, valid_options = cls._fix_type_new(key, val) - if valid_options is not None and val not in valid_options: - errinfo = lmutil.add_errinfo( - errinfo, "error", - f"Value {val} for parameter {key} is not in valid options " - f"{param_meta['options']}") - good_params[key] = None - else: - good_params[key] = usr_val - - # Fill in defaults for missing parameters - for key in cls.SERVICE_TYPE["params"]: - param_meta = cls.SERVICE_TYPE["params"][key] - try: - _ = good_params[key] - except KeyError: - good_params[key] = param_meta["default"] - - return good_params, errinfo - # ............................................... @classmethod def _get_providers_from_string(cls, usr_req_providers, filter_params=None): @@ -498,18 +298,7 @@ def _standardize_params( return usr_params, errinfo - # .......................... - @staticmethod - def OPTIONS(): - """Common options request for all services (needed for CORS).""" - return - # ............................................................................. if __name__ == "__main__": - kwarg_defaults = { - "count_only": False, - "width": 600, - "height": 300, - "type": [], - } + pass diff --git a/flask_app/common/base.py b/flask_app/common/base.py new file mode 100644 index 00000000..bae23db9 --- /dev/null +++ b/flask_app/common/base.py @@ -0,0 +1,241 @@ +"""Parent Class for the Specify Network API services.""" +from flask import Flask +from werkzeug.exceptions import BadRequest, InternalServerError + +import sppy.tools.s2n.utils as lmutil +from flask_app.common.s2n_type import ( + APIEndpoint, APIService, BrokerOutput, get_host_url, S2nKey, ServiceProvider) +from sppy.tools.provider.gbif import GbifAPI +from sppy.tools.provider.itis import ItisAPI + +app = Flask(__name__) + + +# ............................................................................. +@app.errorhandler(BadRequest) +def handle_bad_request(e): + return f"Bad request: {e}" + +@app.errorhandler(InternalServerError) +def handle_bad_response(e): + return f"Internal Server Error: {e}" + +# ............................................................................. +class _SpecifyNetworkService: + """Base S-to-the-N service, handles parameter names and acceptable values.""" + # overridden by subclasses + SERVICE_TYPE = None + + + # ............................................................................. + @classmethod + def _get_valid_requested_params(cls, user_params_string, valid_params): + """Return valid and invalid options for parameters that accept >1 values. 
+ + Args: + user_params_string: user-requested parameters as a string. + valid_params: valid parameter values + + Returns: + valid_requested_params: list of valid params from the provided query string + invalid_params: list of invalid params from the provided query string + + Note: + For the badge service, exactly one provider is required. For all other + services, multiple providers are accepted, and None indicates to query all + valid providers. + """ + valid_requested_params = invalid_params = [] + + if user_params_string: + tmplst = user_params_string.split(",") + user_params = {tp.lower().strip() for tp in tmplst} + + valid_requested_params = set() + invalid_params = set() + # valid_requested_providers, invalid_providers = + # cls.get_multivalue_options(user_provs, valid_providers) + for param in user_params: + if param in valid_params: + valid_requested_params.add(param) + else: + invalid_params.add(param) + + invalid_params = list(invalid_params) + if valid_requested_params: + valid_requested_params = list(valid_requested_params) + else: + valid_requested_params = [] + + return valid_requested_params, invalid_params + + # ............................................................................. + @classmethod + def endpoint(cls): + """Return the URL endpoint for this class. + + Returns: + URL endpoint for the service + """ + endpoint = f"{APIEndpoint.Root}/{cls.SERVICE_TYPE['endpoint']}" + return endpoint + + # ............................................... + @classmethod + def _fix_type_new(cls, key, provided_val): + """Modify a parameter value to a valid type and value. + + Args: + key: parameter key + provided_val: user-provided parameter value + + Returns: + usr_val: a valid value for the parameter + valid_options: list of valid options (for error message) + + Note: + Corrections: + * cast to correct type + * validate with any options + * if value is invalid (type or value), return the default. + """ + valid_options = None + if provided_val is None: + return None + # all strings are lower case + try: + provided_val = provided_val.lower() + except Exception: + pass + + param_meta = cls.SERVICE_TYPE["params"][key] + # First see if restricted to options + default_val = param_meta["default"] + type_val = param_meta["type"] + # If restricted options, check + try: + options = param_meta["options"] + except KeyError: + options = None + else: + # Invalid option returns default value + if provided_val in options: + usr_val = provided_val + else: + valid_options = options + usr_val = default_val + + # If not restricted to options + if options is None: + # Cast values to correct type. Failed conversions return default value + if isinstance(type_val, str) and not options: + usr_val = str(provided_val) + + elif isinstance(type_val, float): + try: + usr_val = float(provided_val) + except ValueError: + usr_val = default_val + + # Boolean also tests as int, so try boolean first + elif isinstance(type_val, bool): + if provided_val in (0, "0", "n", "no", "f", "false"): + usr_val = False + elif provided_val in (1, "1", "y", "yes", "t", "true"): + usr_val = True + else: + valid_options = (True, False) + usr_val = default_val + + elif isinstance(type_val, int): + try: + usr_val = int(provided_val) + except ValueError: + usr_val = default_val + + else: + usr_val = provided_val + + return usr_val, valid_options + + # ............................................... 
+ @classmethod + def _process_params(cls, user_kwargs=None): + """Modify all user provided keys to lowercase and values to correct types. + + Args: + user_kwargs: dictionary of keywords and values sent by the user for + the current service. + + Returns: + good_params: dictionary of valid parameters and values + errinfo: dictionary of errors for different error levels. + + Note: + A list of valid values for a keyword can include None as a default + if user-provided value is invalid + Todo: + Do we need not_in_valid_options for error message? + """ + good_params = {} + errinfo = {} + + # Correct all parameter keys/values present + for key, param_meta in cls.SERVICE_TYPE["params"].items(): + val = user_kwargs[key] + # Done in calling function + if key == "provider": + pass + + # Do not edit namestr, maintain capitalization + elif key == "namestr": + good_params["namestr"] = val + + # Require one valid icon_status + elif key == "icon_status": + valid_stat = param_meta["options"] + if val is None: + errinfo = lmutil.add_errinfo( + errinfo, "error", + f"Parameter {key} containing one of {valid_stat} options is " + f"required") + elif val not in valid_stat: + errinfo = lmutil.add_errinfo( + errinfo, "error", + f"Value {val} for parameter {key} not in valid options " + f"{valid_stat}") + else: + good_params[key] = val + + elif val is not None: + usr_val, valid_options = cls._fix_type_new(key, val) + if valid_options is not None and val not in valid_options: + errinfo = lmutil.add_errinfo( + errinfo, "error", + f"Value {val} for parameter {key} is not in valid options " + f"{param_meta['options']}") + good_params[key] = None + else: + good_params[key] = usr_val + + # Fill in defaults for missing parameters + for key in cls.SERVICE_TYPE["params"]: + param_meta = cls.SERVICE_TYPE["params"][key] + try: + _ = good_params[key] + except KeyError: + good_params[key] = param_meta["default"] + + return good_params, errinfo + + + # .......................... + @staticmethod + def OPTIONS(): + """Common options request for all services (needed for CORS).""" + return + + +# ............................................................................. 
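# A small sketch of how a service subclass is expected to drive _process_params(),
# using the _SpecifyNetworkService class defined above. The toy SERVICE_TYPE metadata
# below is illustrative only; the real parameter metadata lives in
# flask_app.common.s2n_type.APIService.
class _ToyService(_SpecifyNetworkService):
    SERVICE_TYPE = {
        "endpoint": "toy",
        "params": {
            "count_only": {"type": False, "default": False},
            "limit": {"type": 2, "default": 10},
        },
    }

def _demo_process_params():
    good_params, errinfo = _ToyService._process_params(
        user_kwargs={"count_only": "yes", "limit": "25"})
    # "yes" is coerced to True and "25" to the integer 25; values that fail type
    # conversion fall back to the declared defaults, and values outside a declared
    # "options" list are additionally reported in errinfo.
    print(good_params, errinfo)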
+if __name__ == "__main__": + pass From ea6bdbb4257492cfd1081e1130fbe5366b40b901 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 12 Mar 2024 14:25:46 -0500 Subject: [PATCH 08/81] doc, description --- flask_app/broker/occ.py | 3 +-- flask_app/common/s2n_type.py | 12 +++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/flask_app/broker/occ.py b/flask_app/broker/occ.py index 02c5fc20..007de72c 100644 --- a/flask_app/broker/occ.py +++ b/flask_app/broker/occ.py @@ -157,12 +157,11 @@ def get_occurrence_records( if occid is None and gbif_dataset_key is None: return cls.get_endpoint() else: - # No filter_params defined for Name service yet try: good_params, errinfo = cls._standardize_params( occid=occid, provider=provider, gbif_dataset_key=gbif_dataset_key, count_only=count_only) - # Bad parameters + # errinfo indicates bad parameters try: error_description = "; ".join(errinfo["error"]) raise BadRequest(error_description) diff --git a/flask_app/common/s2n_type.py b/flask_app/common/s2n_type.py index 2648ba3e..b281c16c 100644 --- a/flask_app/common/s2n_type.py +++ b/flask_app/common/s2n_type.py @@ -163,18 +163,20 @@ class APIService: "name": APIEndpoint.Count, "endpoint": f"{APIEndpoint.Root}/{APIEndpoint.Count}", "params": { - "collection_id": { + "dataset_key": { "type": "", - "description": "Collection identifier", + "description": "GBIF Dataset Key", "default": None }, - "organization_id": { + "pub_org_key": { "type": "", - "description": "Organization identifier", + "description": "GBIF Publishing Organization Key", "default": None } }, - "description": "Return record count for the given collection or organization.", + "description": + "Return occurrence and species counts for the given dataset or " + "publishing organization.", S2nKey.RECORD_FORMAT: "" } # Taxonomic Resolution From b6939d08f9840544ee6487f60028eeed19a9f4a2 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 12 Mar 2024 14:28:07 -0500 Subject: [PATCH 09/81] add SpNetwork S3 resources as a provider --- sppy/aws/aws_constants.py | 1 + sppy/aws/aws_tools.py | 106 +--------------------- sppy/tools/provider/awss3.py | 169 +++++++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+), 105 deletions(-) create mode 100644 sppy/tools/provider/awss3.py diff --git a/sppy/aws/aws_constants.py b/sppy/aws/aws_constants.py index 027aab91..266c3f1d 100644 --- a/sppy/aws/aws_constants.py +++ b/sppy/aws/aws_constants.py @@ -2,6 +2,7 @@ PROJ_NAME = "specnet" REGION = "us-east-1" PROJ_BUCKET = f"{PROJ_NAME}-{REGION}" +ENCODING = "utf-8" INPUT_PATH = "summary" LOG_PATH = "log" diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index a055ab6b..c01fba23 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -14,7 +14,7 @@ from sppy.aws.aws_constants import ( INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, - PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, + PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, USER_DATA_TOKEN) @@ -675,107 +675,3 @@ def create_dataframe_from_s3obj( return df -# ............................................................................. -class S3Query(): - """Specify Network API service for retrieving taxonomic information.""" - - # ............................................... - @classmethod - def __init__( - self, bucket, region=REGION, encoding="utf-8"): - """Object to query tabular data in S3. - - Args: - bucket: S3 bucket containing data. - s3_path: S3 folder(s) containing data objects. 
- datatype: type of tabular data, 'CSV', 'JSON', and 'Parquet' are allowed. - region: AWS region containing the data. - encoding: encoding of the data. - """ - self.s3 = boto3.client('s3') - self.bucket = bucket - self.region = region - self.encoding = encoding - self._current_datestr = get_current_datadate_str() - self.exp_type = 'SQL' - - # ---------------------------------------------------- - def query_s3_table(self, s3_path, query_str): - """Query the S3 resource defined for this class. - - Args: - query_str: a SQL query for S3 select. - - Returns: - list of records matching the query - """ - recs = [] - resp = self.s3.select_object_content( - Bucket=self.bucket, - Key= self.s3_path, - ExpressionType='SQL', - Expression=query_str, - InputSerialization={"Parquet": {}}, - OutputSerialization={"JSON": {}} - ) - for event in resp["Payload"]: - if "Records" in event: - records = event["Records"]["Payload"].decode(self.encoding) - recs.append(records) - return recs - - # ---------------------------------------------------- - def get_dataset_counts(self, dataset_key): - """Query the S3 resource for occurrence and species counts for this dataset. - - Args: - dataset_key: unique GBIF identifier for dataset of interest. - - Returns: - records - """ - datestr = get_current_datadate_str() - datestr = "2024_02_01" - s3_path = f"summary/dataset_counts_{datestr}_000.parquet" - query_str = (f"SELECT occ_count, species_count " - f"FROM s3object s " - f"WHERE s.datasetkey = {dataset_key}") - records = self.query_s3_table(s3_path, query_str) - return records - - -""" -import boto3 - -from sppy.aws.aws_constants import ( - INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, - PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, - USER_DATA_TOKEN) - -ctable = "dataset_counts_2024_02_01_000.parquet" -ltable = "dataset_lists_2024_02_01_000.parquet" -s3_path = f"summary/{ctable}" -dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" - -s3 = boto3.client('s3') -query_str = (f"SELECT occ_count, species_count " - f"FROM s3object s " - f"WHERE s.datasetkey = '{dataset_key}'") - -SELECT * FROM s3object WHERE datasetkey = '0000e36f-d0e9-46b0-aa23-cc1980f00515' - -resp = s3.select_object_content( - Bucket=PROJ_BUCKET, - Key=s3_path, - ExpressionType='SQL', - Expression=query_str, - InputSerialization={"Parquet": {}}, - OutputSerialization={"CSV": {}} - ) - -for event in resp["Payload"]: - if "Records" in event: - records = event["Records"]["Payload"].decode('utf-8') - print(records) - -""" diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py new file mode 100644 index 00000000..18f1d35c --- /dev/null +++ b/sppy/tools/provider/awss3.py @@ -0,0 +1,169 @@ +"""Class to query tabular summary Specify Network data in S3""" +import base64 +import boto3 +from botocore.exceptions import ClientError +import csv +import datetime +import logging +from logging.handlers import RotatingFileHandler +import pandas as pd +import os + +from sppy.aws.aws_tools import get_current_datadate_str + +from sppy.aws.aws_constants import ( + INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, + PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, + USER_DATA_TOKEN) + + + +# ............................................................................. +class S3Query(): + """Specify Network API service for retrieving taxonomic information.""" + + # ............................................... 
+ @classmethod + def __init__( + self, bucket, region=REGION, encoding="utf-8"): + """Object to query tabular data in S3. + + Args: + bucket: S3 bucket containing data. + s3_path: S3 folder(s) containing data objects. + datatype: type of tabular data, 'CSV', 'JSON', and 'Parquet' are allowed. + region: AWS region containing the data. + encoding: encoding of the data. + """ + self.s3 = boto3.client('s3') + self.bucket = bucket + self.region = region + self.encoding = encoding + self._current_datestr = get_current_datadate_str() + self.exp_type = 'SQL' + + # ---------------------------------------------------- + def query_s3_table(self, s3_path, query_str): + """Query the S3 resource defined for this class. + + Args: + query_str: a SQL query for S3 select. + + Returns: + list of records matching the query + """ + recs = [] + resp = self.s3.select_object_content( + Bucket=self.bucket, + Key=self.s3_path, + ExpressionType='SQL', + Expression=query_str, + InputSerialization={"Parquet": {}}, + OutputSerialization={"JSON": {}} + ) + for event in resp["Payload"]: + if "Records" in event: + records = event["Records"]["Payload"].decode(self.encoding) + recs.append(records) + return recs + + # ---------------------------------------------------- + def get_dataset_counts(self, dataset_key): + """Query the S3 resource for occurrence and species counts for this dataset. + + Args: + dataset_key: unique GBIF identifier for dataset of interest. + + Returns: + records: empty list or list of 1 record containing occ_count, species_count + """ + (occ_count, species_count) = (0,0) + datestr = get_current_datadate_str() + datestr = "2024_02_01" + s3_path = f"summary/dataset_counts_{datestr}_000.parquet" + query_str = (f"SELECT occ_count, species_count " + f"FROM s3object s " + f"WHERE s.datasetkey = {dataset_key}") + # Returns empty list or list of 1 record with [(occ_count, species_count)] + records = self.query_s3_table(s3_path, query_str) + if records: + (occ_count, species_count) = records[0] + return (occ_count, species_count) + + # ---------------------------------------------------- + def get_org_counts(self, pub_org_key): + """Query S3 for occurrence and species counts for this organization. + + Args: + pub_org_key: unique GBIF identifier for organization of interest. + + Returns: + records: empty list or list of 1 record containing occ_count, species_count + + TODO: implement this? + """ + (occ_count, species_count) = (0,0) + return (occ_count, species_count) + + # ---------------------------------------------------- + def rank_datasets_by_species(self, order="descending", limit=10): + """Return the top or bottom datasets, with counts, ranked by number of species. + + Args: + order: ascending (bottom up) or descending (top down). + descending = return top X datasets in descending order + ascending = return bottom X datasets in ascending order + limit: number of datasets to return, no more than 300. 
+ + Returns: + records: empty list or list of 1 record containing occ_count, species_count + """ + (occ_count, species_count) = (0,0) + datestr = get_current_datadate_str() + datestr = "2024_02_01" + s3_path = f"summary/dataset_counts_{datestr}_000.parquet" + query_str = (f"SELECT occ_count, species_count " + f"FROM s3object s " + f"WHERE s.datasetkey = {dataset_key}") + # Returns empty list or list of 1 record with [(occ_count, species_count)] + records = self.query_s3_table(s3_path, query_str) + if records: + (occ_count, species_count) = records[0] + return (occ_count, species_count) + + +""" +import boto3 + +from sppy.aws.aws_constants import ( + INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, + PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, + USER_DATA_TOKEN) + +ctable = "dataset_counts_2024_02_01_000.parquet" +ltable = "dataset_lists_2024_02_01_000.parquet" +s3_path = f"summary/{ctable}" +dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" + +s3 = boto3.client('s3') +query_str = (f"SELECT occ_count, species_count " + f"FROM s3object s " + f"WHERE s.datasetkey = '{dataset_key}'") + +SELECT * FROM s3object WHERE datasetkey = '0000e36f-d0e9-46b0-aa23-cc1980f00515' + +resp = s3.select_object_content( + Bucket=PROJ_BUCKET, + Key=s3_path, + ExpressionType='SQL', + Expression=query_str, + InputSerialization={"Parquet": {}}, + OutputSerialization={"CSV": {}} + ) + +for event in resp["Payload"]: + if "Records" in event: + records = event["Records"]["Payload"].decode('utf-8') + print(records) + +""" From 61b94c62a511e4efb3cf21229189653e695d08d4 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 12 Mar 2024 16:17:50 -0500 Subject: [PATCH 10/81] support min/max for numeric params --- flask_app/common/base.py | 57 ++++++++++++++++++++++++++++-------- flask_app/common/s2n_type.py | 4 ++- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/flask_app/common/base.py b/flask_app/common/base.py index bae23db9..5144d0d5 100644 --- a/flask_app/common/base.py +++ b/flask_app/common/base.py @@ -131,12 +131,6 @@ def _fix_type_new(cls, key, provided_val): if isinstance(type_val, str) and not options: usr_val = str(provided_val) - elif isinstance(type_val, float): - try: - usr_val = float(provided_val) - except ValueError: - usr_val = default_val - # Boolean also tests as int, so try boolean first elif isinstance(type_val, bool): if provided_val in (0, "0", "n", "no", "f", "false"): @@ -146,17 +140,54 @@ def _fix_type_new(cls, key, provided_val): else: valid_options = (True, False) usr_val = default_val + else: + usr_val = cls._test_numbers(provided_val, param_meta) + + return usr_val, valid_options - elif isinstance(type_val, int): - try: - usr_val = int(provided_val) - except ValueError: - usr_val = default_val + # ............................................... 
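# A condensed version of the S3 Select test notes above, querying one dataset's
# occurrence and species counts straight from the summary Parquet object. It assumes
# AWS credentials with read access to the project bucket and the upgraded boto3
# described in the aws-setup notes; the object key and dataset key are the examples
# used in those notes. Note that datasetkey is a string column, so the literal must
# be quoted in the S3 Select WHERE clause.
import boto3
from sppy.aws.aws_constants import PROJ_BUCKET

def select_dataset_counts(dataset_key="0000e36f-d0e9-46b0-aa23-cc1980f00515"):
    s3 = boto3.client("s3")
    query = ("SELECT occ_count, species_count FROM s3object s "
             f"WHERE s.datasetkey = '{dataset_key}'")
    resp = s3.select_object_content(
        Bucket=PROJ_BUCKET,
        Key="summary/dataset_counts_2024_02_01_000.parquet",
        ExpressionType="SQL",
        Expression=query,
        InputSerialization={"Parquet": {}},
        OutputSerialization={"CSV": {}},
    )
    # Stream the matching rows back as CSV text.
    for event in resp["Payload"]:
        if "Records" in event:
            print(event["Records"]["Payload"].decode("utf-8"))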
+ @classmethod + def _test_numbers(cls, provided_val, param_meta): + default_val = param_meta["default"] + type_val = param_meta["type"] + # If restricted numeric values, check + try: + min_val = param_meta["min"] + except KeyError: + min_val = None + # If restricted numeric values, check + try: + max_val = param_meta["min"] + except KeyError: + max_val = None + + if isinstance(type_val, float): + try: + usr_val = float(provided_val) + except ValueError: + usr_val = default_val else: - usr_val = provided_val + if min_val and usr_val < min_val: + usr_val = min_val + if max_val and usr_val > max_val: + usr_val = max_val - return usr_val, valid_options + elif isinstance(type_val, int): + try: + usr_val = int(provided_val) + except ValueError: + usr_val = default_val + else: + if min_val and usr_val < min_val: + usr_val = min_val + if max_val and usr_val > max_val: + usr_val = max_val + + else: + usr_val = provided_val + + return usr_val # ............................................... @classmethod diff --git a/flask_app/common/s2n_type.py b/flask_app/common/s2n_type.py index b281c16c..15831e2a 100644 --- a/flask_app/common/s2n_type.py +++ b/flask_app/common/s2n_type.py @@ -172,7 +172,9 @@ class APIService: "type": "", "description": "GBIF Publishing Organization Key", "default": None - } + }, + "descending": { "type": True, "default": True}, + "limit": {"type": 2, "default": 10, "min": 1, "max": 500}, }, "description": "Return occurrence and species counts for the given dataset or " From 6816e2bb139a7876cb6030a90c5c9dfe4db07c8a Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 12 Mar 2024 16:19:51 -0500 Subject: [PATCH 11/81] initial analyst apis --- flask_app/analyst/base.py | 9 +- flask_app/analyst/constants.py | 1 + flask_app/analyst/count.py | 215 ++++++++++++++++++++++----------- 3 files changed, 152 insertions(+), 73 deletions(-) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index c4568345..6b3e4978 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -2,6 +2,7 @@ from flask import Flask from werkzeug.exceptions import (BadRequest, InternalServerError) +from flask_app.analyst.constants import QUERY_LIMIT from flask_app.common.base import _SpecifyNetworkService from sppy.tools.s2n.utils import add_errinfo, get_traceback from flask_app.common.s2n_type import AnalystOutput, APIEndpoint, APIService @@ -56,12 +57,16 @@ def _show_online(cls): # ............................................... @classmethod def _standardize_params( - cls, dataset_key=None, pub_org_key=None, order="descending", limit=10): + cls, dataset_key=None, pub_org_key=None, descending=True, limit=10): """Standardize query parameters to send to appropriate service. Args: dataset_key: unique GBIF dataset identifier for comparisons pub_org_key: unique publishing organization identifier for comparisons + descending: boolean value indicating whether to sort records descending + (True) or ascending (False) + limit: integer indicating how many ranked records to return, value must + be less than QUERY_LIMIT. 
Returns: a dictionary containing keys and properly formatted values for the @@ -70,7 +75,7 @@ def _standardize_params( user_kwargs = { "collection_id": dataset_key, "organization_id": pub_org_key, - "order": order, + "descending": descending, "limit": limit } diff --git a/flask_app/analyst/constants.py b/flask_app/analyst/constants.py index d0a99126..8c95a510 100644 --- a/flask_app/analyst/constants.py +++ b/flask_app/analyst/constants.py @@ -1 +1,2 @@ """Constants for the Specify Network Analyst API services.""" +QUERY_LIMIT = 500 \ No newline at end of file diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 375a2694..bac143b8 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -1,12 +1,14 @@ """Class for the Specify Network Name API service.""" import boto3 from http import HTTPStatus +from werkzeug.exceptions import (BadRequest, InternalServerError) from flask_app.common.s2n_type import APIService, AnalystOutput from flask_app.common.util import print_analyst_output from flask_app.analyst.base import _AnalystService -from sppy.aws.aws_tools import query_s3_table +from sppy.aws.aws_constants import ENCODING, PROJ_BUCKET, REGION +from sppy.tools.provider.awss3 import S3Query from sppy.tools.s2n.utils import get_traceback @@ -18,92 +20,163 @@ class CountSvc(_AnalystService): # ............................................... @classmethod - def get_counts(cls, dataset_key): + def _get_params_errors(cls, *kwargs): + try: + good_params, errinfo = cls._standardize_params(cls, kwargs) + # errinfo indicates bad parameters + try: + error_description = "; ".join(errinfo["error"]) + raise BadRequest(error_description) + except KeyError: + pass + + except Exception: + error_description = get_traceback() + raise BadRequest(error_description) + + return good_params, errinfo + + # ............................................... + @classmethod + def get_counts(cls, dataset_key=None, pub_org_key=None): + if dataset_key is None and pub_org_key is None: + return cls.get_endpoint() + else: + try: + good_params, errinfo = cls._standardize_params( + cls, dataset_key=dataset_key, pub_org_key=pub_org_key) + # errinfo indicates bad parameters + try: + error_description = "; ".join(errinfo["error"]) + raise BadRequest(error_description) + except KeyError: + pass + + except Exception: + error_description = get_traceback() + raise BadRequest(error_description) + + # Do Query! + try: + allrecs = [] + errors = {} + # for response metadata + if dataset_key is not None: + records, errors = cls._get_dataset_counts(dataset_key) + allrecs.append(records) + if pub_org_key is not None: + errors["warning"] = \ + "Count by Publishing Organization is not implemented" + # records, errors = cls._get_organization_counts(pub_org_key) + # allrecs.append(records) + + # Assemble + full_out = AnalystOutput( + cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], + records=allrecs, errors=errors) + + # Add message on invalid parameters to output + try: + for err in errinfo["warning"]: + full_out.append_error("warning", err) + except KeyError: + pass + + except Exception: + error_description = get_traceback() + raise InternalServerError(error_description) + + return full_out.response + + # ............................................... 
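For orientation while reading get_counts above: once the Analyst flask app is running locally, the service is reached through the /api/v1/count/ route. A usage sketch, assuming the requests package is available and using the dataset_key parameter name the series settles on in patch 14 (the development URL comes from the debugging notes added there):

```
import requests

# Local development URL and dataset_key value taken from sphinx/misc/debugging.rst
# and the test identifiers used in these patches.
url = "http://127.0.0.1:5000/api/v1/count/"
params = {"dataset_key": "0000e36f-d0e9-46b0-aa23-cc1980f00515"}

resp = requests.get(url, params=params, timeout=30)
resp.raise_for_status()
payload = resp.json()

# AnalystOutput responses carry the records and errors assembled in get_counts.
print(payload.get("errors"))
for rec in payload.get("records", []):
    print(rec)
```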
+ @classmethod + def get_ranked_counts(cls, descending=True, limit=10): + try: + good_params, errinfo = cls._standardize_params( + cls, descending=descending, limit=limit) + # errinfo indicates bad parameters + try: + error_description = "; ".join(errinfo["error"]) + raise BadRequest(error_description) + except KeyError: + pass + + except Exception: + error_description = get_traceback() + raise BadRequest(error_description) + + # Do Query! + try: + s3 = S3Query(PROJ_BUCKET, region=REGION, encoding=ENCODING) + records = s3.rank_datasets_by_species(descending=True, limit=limit) + + # ............................................... + @classmethod + def _get_dataset_counts(cls, dataset_key): """Get counts for datasetKey. Args: dataset_key: Unique identifier for GBIF datasets. Returns: - a flask_app.broker.s2n_type.BrokerOutput object with optional records as a - list of dictionaries of records corresponding to specimen occurrences in - the provider database. - - Todo: Consider adding publishing organization queries with pub_org_key + a flask_app.analyst.s2n_type.AnalystOutput object with optional records as a + list of records corresponding to occurrence and counts for the dataset. """ + records = [] + errors = {} + s3 = S3Query(PROJ_BUCKET, region=REGION, encoding=ENCODING) try: - output = cls._get_records(dataset_key, ) + (occ_count, species_count) = s3.get_dataset_counts(dataset_key) except Exception: traceback = get_traceback() - output = AnalystOutput( - cls.SERVICE_TYPE["name"], - description=cls.SERVICE_TYPE["description"], - errors={"error": [HTTPStatus.INTERNAL_SERVER_ERROR, traceback]}) - return output.response + errors["error"] = [HTTPStatus.INTERNAL_SERVER_ERROR, traceback] + else: + records.append((occ_count, species_count)) + return records, errors # ............................................... @classmethod def _get_organization_counts(cls, pub_org_key): - return { - "Organization Raw Counts": - { - pub_org_key: 1, - "org_id_2": 2, - "org_id_3": 3 - }, - f"{pub_org_key} to other orgs": - { - "to total": "0.5", - "org_id_2": "1.2", - "org_id_3": "1.2" - } - } + """Get counts for publishingOrganizationKey. - # ............................................... - @classmethod - def _get_dataset_counts(cls, dataset_key): - s3 = boto3.client('s3') - - resp = s3.select_object_content( - Bucket=PROJ_, - Key='sample_data.csv', - ExpressionType='SQL', - Expression="SELECT * FROM s3object s where s.\"Name\" = 'Jane'", - InputSerialization={'CSV': {"FileHeaderInfo": "Use"}, 'CompressionType': 'NONE'}, - OutputSerialization={'CSV': {}}, - ) - - for event in resp['Payload']: - if 'Records' in event: - records = event['Records']['Payload'].decode('utf-8') - print(records) - elif 'Stats' in event: - statsDetails = event['Stats']['Details'] - print("Stats details bytesScanned: ") - print(statsDetails['BytesScanned']) - print("Stats details bytesProcessed: ") - print(statsDetails['BytesProcessed']) - print("Stats details bytesReturned: ") - print(statsDetails['BytesReturned']) + Args: + pub_org_key: Unique identifier for GBIF publishing organizations. - # ............................................... 
- @classmethod - def _get_records(cls, dataset_key, pub_org_key): - allrecs = [] - # for response metadata - if dataset_key is not None: - coll_data = cls._get_collection_counts(dataset_key) - allrecs.append(coll_data) - if pub_org_key is not None: - org_data = cls._get_organization_counts(pub_org_key) - allrecs.append(org_data) - - # Assemble - full_out = AnalystOutput( - cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], - records=allrecs, errors={}) - - return full_out + Returns: + a flask_app.analyst.s2n_type.AnalystOutput object with optional records as a + list of records corresponding to occurrence and counts for the organization. + """ + records = [] + errors = {} + s3 = S3Query(PROJ_BUCKET, region=REGION, encoding=ENCODING) + try: + (occ_count, species_count) = s3.get_org_counts(pub_org_key) + except Exception: + traceback = get_traceback() + errors["error"] = [HTTPStatus.INTERNAL_SERVER_ERROR, traceback] + else: + records.append((occ_count, species_count)) + return records, errors + + + # # ............................................... + # @classmethod + # def _get_records(cls, dataset_key, pub_org_key): + # allrecs = [] + # # for response metadata + # if dataset_key is not None: + # records, errors = cls._get_dataset_counts(dataset_key) + # allrecs.append(records) + # if pub_org_key is not None: + # records, errors = cls._get_organization_counts(pub_org_key) + # allrecs.append(records) + # + # # Assemble + # full_out = AnalystOutput( + # cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], + # records=allrecs, errors={}) + # + # return full_out # ............................................................................. From 2a6080bb47adaa66011bcb06ffd6f55a56846c02 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 12 Mar 2024 16:20:21 -0500 Subject: [PATCH 12/81] use pandassql for ranked records --- requirements.txt | 2 + sppy/aws/aws_constants.py | 1 + sppy/tools/provider/awss3.py | 83 ++++++++++++++++++++++-------------- 3 files changed, 55 insertions(+), 31 deletions(-) diff --git a/requirements.txt b/requirements.txt index 553b4dc4..f7a78ccb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,9 @@ gunicorn==20.1.0 rtree>=1.0.0 awscli boto3>=1.34.60 +sqlalchemy pandas +pandas-sql pyarrow s3fs ggshield \ No newline at end of file diff --git a/sppy/aws/aws_constants.py b/sppy/aws/aws_constants.py index 266c3f1d..9fd7fe79 100644 --- a/sppy/aws/aws_constants.py +++ b/sppy/aws/aws_constants.py @@ -2,6 +2,7 @@ PROJ_NAME = "specnet" REGION = "us-east-1" PROJ_BUCKET = f"{PROJ_NAME}-{REGION}" +SUMMARY_FOLDER = "summary" ENCODING = "utf-8" INPUT_PATH = "summary" diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 18f1d35c..8ceea88d 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -1,26 +1,17 @@ """Class to query tabular summary Specify Network data in S3""" -import base64 import boto3 -from botocore.exceptions import ClientError -import csv -import datetime -import logging -from logging.handlers import RotatingFileHandler import pandas as pd -import os +from pandassql import sqldf from sppy.aws.aws_tools import get_current_datadate_str -from sppy.aws.aws_constants import ( - INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, - PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, - USER_DATA_TOKEN) +from sppy.aws.aws_constants import (REGION, SUMMARY_FOLDER) # 
............................................................................. class S3Query(): - """Specify Network API service for retrieving taxonomic information.""" + """Specify Network API service for retrieving tabular parquet data from AWS S3.""" # ............................................... @classmethod @@ -43,7 +34,7 @@ def __init__( self.exp_type = 'SQL' # ---------------------------------------------------- - def query_s3_table(self, s3_path, query_str): + def _query_s3_table(self, s3_path, query_str): """Query the S3 resource defined for this class. Args: @@ -67,6 +58,42 @@ def query_s3_table(self, s3_path, query_str): recs.append(records) return recs + # ---------------------------------------------------- + def _create_dataframe_from_s3obj(self, s3_path): + """Read CSV data from S3 into a pandas DataFrame. + + Args: + s3_path: the object name with enclosing S3 bucket folders. + + Returns: + df: pandas DataFrame containing the CSV data. + """ + # import pyarrow.parquet as pq + # import s3fs + s3_uri = f"s3://{self.bucket}/{s3_path}" + # s3_fs = s3fs.S3FileSystem + df = pd.read_parquet(s3_uri) + return df + + # ---------------------------------------------------- + def _query_order_s3_table(self, s3_path, sort_field, descending, limit): + """Query the S3 resource defined for this class. + + Args: + query_str: a SQL query for S3 select. + + Returns: + list of records matching the query + """ + recs = [] + df = self._create_dataframe_from_s3obj(s3_path) + df.sort_values(by=sort_field, ascending=(not descending)) + for event in resp["Payload"]: + if "Records" in event: + records = event["Records"]["Payload"].decode(self.encoding) + recs.append(records) + return recs + # ---------------------------------------------------- def get_dataset_counts(self, dataset_key): """Query the S3 resource for occurrence and species counts for this dataset. @@ -80,12 +107,12 @@ def get_dataset_counts(self, dataset_key): (occ_count, species_count) = (0,0) datestr = get_current_datadate_str() datestr = "2024_02_01" - s3_path = f"summary/dataset_counts_{datestr}_000.parquet" + s3_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" query_str = (f"SELECT occ_count, species_count " f"FROM s3object s " f"WHERE s.datasetkey = {dataset_key}") # Returns empty list or list of 1 record with [(occ_count, species_count)] - records = self.query_s3_table(s3_path, query_str) + records = self._query_s3_table(s3_path, query_str) if records: (occ_count, species_count) = records[0] return (occ_count, species_count) @@ -106,35 +133,29 @@ def get_org_counts(self, pub_org_key): return (occ_count, species_count) # ---------------------------------------------------- - def rank_datasets_by_species(self, order="descending", limit=10): + def rank_datasets_by_species(self, descending=True, limit=10): """Return the top or bottom datasets, with counts, ranked by number of species. Args: - order: ascending (bottom up) or descending (top down). - descending = return top X datasets in descending order - ascending = return bottom X datasets in ascending order + descending: boolean value, if true return top X datasets in descending + order, if false, return bottom X datasets in ascending order limit: number of datasets to return, no more than 300. 
Returns: - records: empty list or list of 1 record containing occ_count, species_count + records: list of limit records containing dataset_key, occ_count, species_count """ - (occ_count, species_count) = (0,0) + records = [] datestr = get_current_datadate_str() datestr = "2024_02_01" - s3_path = f"summary/dataset_counts_{datestr}_000.parquet" - query_str = (f"SELECT occ_count, species_count " - f"FROM s3object s " - f"WHERE s.datasetkey = {dataset_key}") - # Returns empty list or list of 1 record with [(occ_count, species_count)] - records = self.query_s3_table(s3_path, query_str) - if records: - (occ_count, species_count) = records[0] - return (occ_count, species_count) + s3_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" + records = self._query_order_s3_table( + s3_path, "species_count", descending, limit) + return records """ import boto3 - +SELECT s.datasetkey, s.occ_count, s.species_count FROM s3object s ORDER BY s.species_count DESC LIMIT 5 from sppy.aws.aws_constants import ( INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, From 9f1a5c8b721651081fa8e3466fbc2476ef487195 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 13 Mar 2024 11:38:24 -0500 Subject: [PATCH 13/81] generalize standardize_params --- flask_app/analyst/base.py | 19 +++-- flask_app/analyst/count.py | 144 +++++++++++------------------------ flask_app/broker/base.py | 29 +++++-- sppy/tools/provider/awss3.py | 15 ++-- 4 files changed, 88 insertions(+), 119 deletions(-) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 6b3e4978..004d662f 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -68,6 +68,10 @@ def _standardize_params( limit: integer indicating how many ranked records to return, value must be less than QUERY_LIMIT. + Raises: + BadRequest: on invalid query parameters. + BadRequest: on unknown exception parsing parameters. + Returns: a dictionary containing keys and properly formatted values for the user specified parameters. @@ -81,18 +85,17 @@ def _standardize_params( try: usr_params, errinfo = cls._process_params(user_kwargs) - - # errinfo indicates bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except KeyError: - pass - except Exception: error_description = get_traceback() raise BadRequest(error_description) + # errinfo["error"] indicates bad parameters, throws exception + try: + error_description = "; ".join(errinfo["error"]) + raise BadRequest(error_description) + except KeyError: + pass + return usr_params, errinfo diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index bac143b8..ad402665 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -9,7 +9,7 @@ from sppy.aws.aws_constants import ENCODING, PROJ_BUCKET, REGION from sppy.tools.provider.awss3 import S3Query -from sppy.tools.s2n.utils import get_traceback +from sppy.tools.s2n.utils import combine_errinfo, get_traceback # ............................................................................. @@ -18,99 +18,71 @@ class CountSvc(_AnalystService): SERVICE_TYPE = APIService.Count ORDERED_FIELDNAMES = [] - # ............................................... 
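The BadRequest raised by _standardize_params above only becomes a useful client response once the flask app translates it; the broker side registers @app.errorhandler(BadRequest) handlers for exactly this. The handler bodies are not part of these diffs, so the following is only a sketch of the assumed shape, with the JSON layout illustrative rather than confirmed:

```
from flask import Flask, jsonify
from werkzeug.exceptions import BadRequest

app = Flask(__name__)


@app.errorhandler(BadRequest)
def handle_bad_request(e):
    # e.description carries the "; "-joined errinfo["error"] messages raised by
    # _standardize_params; wrap them so clients get the same errors structure
    # that successful responses use.
    return jsonify({"errors": {"error": [e.description]}}), 400
```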
- @classmethod - def _get_params_errors(cls, *kwargs): - try: - good_params, errinfo = cls._standardize_params(cls, kwargs) - # errinfo indicates bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except KeyError: - pass - - except Exception: - error_description = get_traceback() - raise BadRequest(error_description) - - return good_params, errinfo - # ............................................... @classmethod def get_counts(cls, dataset_key=None, pub_org_key=None): if dataset_key is None and pub_org_key is None: return cls.get_endpoint() - else: - try: - good_params, errinfo = cls._standardize_params( - cls, dataset_key=dataset_key, pub_org_key=pub_org_key) - # errinfo indicates bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except KeyError: - pass - except Exception: - error_description = get_traceback() - raise BadRequest(error_description) + allrecs = [] + try: + good_params, errinfo = cls._standardize_params( + cls, dataset_key=dataset_key, pub_org_key=pub_org_key) - # Do Query! - try: - allrecs = [] - errors = {} - # for response metadata - if dataset_key is not None: + except BadRequest as e: + errinfo = combine_errinfo(errinfo, {"error": e.description}) + + else: + + # Query dataset counts + if dataset_key is not None: + try: records, errors = cls._get_dataset_counts(dataset_key) + except Exception: + errors = {"error": get_traceback()} + else: allrecs.append(records) - if pub_org_key is not None: - errors["warning"] = \ - "Count by Publishing Organization is not implemented" - # records, errors = cls._get_organization_counts(pub_org_key) - # allrecs.append(records) - - # Assemble - full_out = AnalystOutput( - cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], - records=allrecs, errors=errors) - - # Add message on invalid parameters to output - try: - for err in errinfo["warning"]: - full_out.append_error("warning", err) - except KeyError: - pass + # Combine errors from success or failure + errinfo = combine_errinfo(errinfo, errors) - except Exception: - error_description = get_traceback() - raise InternalServerError(error_description) + # Query organization counts + if pub_org_key is not None: + errors = {"warning": "Count by Publishing Organization is not implemented"} + errinfo = combine_errinfo(errinfo, errors) + + # Assemble + full_out = AnalystOutput( + cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], + records=allrecs, errors=errinfo) return full_out.response # ............................................... @classmethod def get_ranked_counts(cls, descending=True, limit=10): - try: - good_params, errinfo = cls._standardize_params( - cls, descending=descending, limit=limit) - # errinfo indicates bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except KeyError: - pass + allrecs = [] + try: + good_params, errinfo = cls._standardize_params( + cls, descending=descending, limit=limit) - except Exception: - error_description = get_traceback() - raise BadRequest(error_description) + except BadRequest as e: + errinfo = combine_errinfo(errinfo, {"error": e.description}) + else: # Do Query! try: s3 = S3Query(PROJ_BUCKET, region=REGION, encoding=ENCODING) - records = s3.rank_datasets_by_species(descending=True, limit=limit) - - # ............................................... 
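The rewritten methods above lean on add_errinfo and combine_errinfo from sppy.tools.s2n.utils, which never appear in this series. A minimal sketch of their behaviour as inferred from the call sites (the real implementations may differ):

```
def add_errinfo(errinfo, key, msg):
    # Append one message under a severity key ("error", "warning", ...).
    if msg:
        errinfo.setdefault(key, []).append(msg)
    return errinfo


def combine_errinfo(errinfo1, errinfo2):
    # Merge two errinfo dicts, concatenating message lists for shared keys;
    # bare strings (e.g. the "not implemented" warning above) are wrapped in lists.
    combined = {}
    for info in (errinfo1, errinfo2):
        for key, msgs in info.items():
            if not isinstance(msgs, list):
                msgs = [msgs]
            combined.setdefault(key, []).extend(msgs)
    return combined


merged = combine_errinfo(
    {"error": ["dataset_key is missing"]},
    {"warning": "Count by Publishing Organization is not implemented"})
# {'error': ['dataset_key is missing'],
#  'warning': ['Count by Publishing Organization is not implemented']}
```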
+ records, errors = s3.rank_datasets_by_species( + descending=True, limit=limit) + except Exception: + errors = {"error": get_traceback()} + else: + allrecs.append(records) + # Combine errors from success or failure + errinfo = combine_errinfo(errinfo, errors) + return allrecs, errinfo + +# ............................................... @classmethod def _get_dataset_counts(cls, dataset_key): """Get counts for datasetKey. @@ -181,27 +153,6 @@ def _get_organization_counts(cls, pub_org_key): # ............................................................................. if __name__ == "__main__": - # from flask_app.broker.constants import import TST_VALUES - # occids = TST_VALUES.GUIDS_WO_SPECIFY_ACCESS[0:3] - occids = ["84fe1494-c378-4657-be15-8c812b228bf4", - "04c05e26-4876-4114-9e1d-984f78e89c15", - "2facc7a2-dd88-44af-b95a-733cc27527d4"] - occids = ["01493b05-4310-4f28-9d81-ad20860311f3", - "01559f57-62ca-45ba-80b1-d2aafdc46f44", - "015f35b8-655a-4720-9b88-c1c09f6562cb", - "016613ba-4e65-44d5-94d1-e24605afc7e1", - "0170cead-c9cd-48ba-9819-6c5d2e59947e", - "01792c67-910f-4ad6-8912-9b1341cbd983", - "017ea8f2-fc5a-4660-92ec-c203daaaa631", - "018728bb-c376-4562-9ccb-8e3c3fd70df6", - "018a34a9-55da-4503-8aee-e728ba4be146", - "019b547a-79c7-47b3-a5ae-f11d30c2b0de"] - # This occ has 16 issues in IDB, 0 in GBIF - occids = ["2facc7a2-dd88-44af-b95a-733cc27527d4", - "2c1becd5-e641-4e83-b3f5-76a55206539a"] - occids = ["bffe655b-ea32-4838-8e80-a80e391d5b11"] - occids = ["db193603-1ed3-11e3-bfac-90b11c41863e"] - svc = CountSvc() out = svc.get_endpoint() print_analyst_output(out, do_print_rec=True) @@ -210,8 +161,3 @@ def _get_organization_counts(cls, pub_org_key): org_id = None out = svc.get_counts(coll_id, org_id) print_analyst_output(out, do_print_rec=True) - - # for occid in occids: - # out = svc.get_occurrence_records(occid=occid, provider=None, count_only=False) - # outputs = out["records"] - # print_broker_output(out, do_print_rec=True) diff --git a/flask_app/broker/base.py b/flask_app/broker/base.py index 200ed43e..c7b1c455 100644 --- a/flask_app/broker/base.py +++ b/flask_app/broker/base.py @@ -2,7 +2,7 @@ from flask import Flask from werkzeug.exceptions import BadRequest, InternalServerError -import sppy.tools.s2n.utils as lmutil +from sppy.tools.s2n.utils import add_errinfo, combine_errinfo, get_traceback from flask_app.common.base import _SpecifyNetworkService from flask_app.common.s2n_type import ( APIEndpoint, APIService, BrokerOutput, get_host_url, S2nKey, ServiceProvider) @@ -204,14 +204,14 @@ def _get_providers_from_string(cls, usr_req_providers, filter_params=None): providers = valid_requested_providers[0] else: providers = None - errinfo = lmutil.add_errinfo( + errinfo = add_errinfo( errinfo, "error", f"Parameter provider containing exactly one of {valid_providers} " f"options is required") if invalid_providers: for ip in invalid_providers: - errinfo = lmutil.add_errinfo( + errinfo = add_errinfo( errinfo, "warning", f"Value {ip} for parameter provider not in valid options " f"{valid_providers}") @@ -249,6 +249,9 @@ def _standardize_params( a dictionary containing keys and properly formatted values for the user specified parameters. + Raises: + BadRequest on invalid query parameters + Note: filter_params is present to distinguish between providers for occ service by occurrence_id or by dataset_id. 
@@ -275,12 +278,25 @@ def _standardize_params( # "width": width, "icon_status": icon_status} - providers, prov_errinfo = cls._get_providers_from_string( + providers, errinfo = cls._get_providers_from_string( provider, filter_params=filter_params) - usr_params, errinfo = cls._process_params(user_kwargs) + + try: + usr_params, param_errinfo = cls._process_params(user_kwargs) + except Exception: + error_description = get_traceback() + raise BadRequest(error_description) + # consolidate parameters and errors usr_params["provider"] = providers - errinfo = lmutil.combine_errinfo(errinfo, prov_errinfo) + errinfo = combine_errinfo(errinfo, param_errinfo) + + # errinfo["error"] indicates bad parameters, throws exception + try: + error_description = "; ".join(errinfo["error"]) + raise BadRequest(error_description) + except KeyError: + pass # Remove gbif_parse and itis_match flags gbif_parse = itis_match = False @@ -292,6 +308,7 @@ def _standardize_params( itis_match = usr_params.pop("itis_match") except Exception: pass + # Replace namestr with GBIF-parsed namestr if namestr and (gbif_parse or itis_match): usr_params["namestr"] = cls.parse_name_with_gbif(namestr) diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 8ceea88d..5306e93a 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -1,11 +1,10 @@ """Class to query tabular summary Specify Network data in S3""" import boto3 import pandas as pd -from pandassql import sqldf from sppy.aws.aws_tools import get_current_datadate_str - from sppy.aws.aws_constants import (REGION, SUMMARY_FOLDER) +from sppy.tools.s2n.utils import get_traceback @@ -86,13 +85,14 @@ def _query_order_s3_table(self, s3_path, sort_field, descending, limit): list of records matching the query """ recs = [] + errors = {} df = self._create_dataframe_from_s3obj(s3_path) df.sort_values(by=sort_field, ascending=(not descending)) for event in resp["Payload"]: if "Records" in event: records = event["Records"]["Payload"].decode(self.encoding) recs.append(records) - return recs + return recs, errors # ---------------------------------------------------- def get_dataset_counts(self, dataset_key): @@ -148,9 +148,12 @@ def rank_datasets_by_species(self, descending=True, limit=10): datestr = get_current_datadate_str() datestr = "2024_02_01" s3_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" - records = self._query_order_s3_table( - s3_path, "species_count", descending, limit) - return records + try: + records, errors = self._query_order_s3_table( + s3_path, "species_count", descending, limit) + except Exception as e: + errors = {"error": get_traceback()} + return records, errors """ From 296c1228e02581dd038ebd776acc56a3cd093b3e Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 13 Mar 2024 17:55:49 -0500 Subject: [PATCH 14/81] debugging analyst response --- flask_app/analyst/base.py | 4 +- flask_app/analyst/count.py | 19 +++--- flask_app/analyst/routes.py | 6 +- flask_app/broker/badge.py | 43 +++++------- flask_app/broker/name.py | 39 ++++------- flask_app/broker/occ.py | 42 +++++------- flask_app/common/s2n_type.py | 13 ++++ sphinx/misc/debugging.rst | 10 ++- sppy/tools/provider/awss3.py | 123 ++++++++++++++++++++++------------- 9 files changed, 161 insertions(+), 138 deletions(-) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 004d662f..1ed8e959 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -77,8 +77,8 @@ def _standardize_params( user specified parameters. 
""" user_kwargs = { - "collection_id": dataset_key, - "organization_id": pub_org_key, + "dataset_key": dataset_key, + "pub_org_key": pub_org_key, "descending": descending, "limit": limit } diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index ad402665..cdd82375 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -27,13 +27,12 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): allrecs = [] try: good_params, errinfo = cls._standardize_params( - cls, dataset_key=dataset_key, pub_org_key=pub_org_key) + dataset_key=dataset_key, pub_org_key=pub_org_key) except BadRequest as e: - errinfo = combine_errinfo(errinfo, {"error": e.description}) + errinfo = {"error": e.description} else: - # Query dataset counts if dataset_key is not None: try: @@ -47,7 +46,8 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): # Query organization counts if pub_org_key is not None: - errors = {"warning": "Count by Publishing Organization is not implemented"} + errors = { + "warning": "Count by Publishing Organization is not implemented"} errinfo = combine_errinfo(errinfo, errors) # Assemble @@ -98,12 +98,11 @@ def _get_dataset_counts(cls, dataset_key): errors = {} s3 = S3Query(PROJ_BUCKET, region=REGION, encoding=ENCODING) try: - (occ_count, species_count) = s3.get_dataset_counts(dataset_key) + records = s3.get_dataset_counts(dataset_key) except Exception: traceback = get_traceback() errors["error"] = [HTTPStatus.INTERNAL_SERVER_ERROR, traceback] - else: - records.append((occ_count, species_count)) + return records, errors # ............................................... @@ -157,7 +156,7 @@ def _get_organization_counts(cls, pub_org_key): out = svc.get_endpoint() print_analyst_output(out, do_print_rec=True) - coll_id = "a7156437-55ec-4c6f-89de-938f9361753d" - org_id = None - out = svc.get_counts(coll_id, org_id) + ds_key = "a7156437-55ec-4c6f-89de-938f9361753d" + org_key = None + out = svc.get_counts(dataset_key=ds_key, pub_org_key=org_key) print_analyst_output(out, do_print_rec=True) diff --git a/flask_app/analyst/routes.py b/flask_app/analyst/routes.py index f53ad06c..7afc0d09 100644 --- a/flask_app/analyst/routes.py +++ b/flask_app/analyst/routes.py @@ -72,13 +72,13 @@ def count_endpoint(): response: A flask_app.analyst API response object containing the count API response. 
""" - coll_arg = request.args.get("collection_id", default=None, type=str) + ds_arg = request.args.get("dataset_key", default=None, type=str) # org_arg = request.args.get("organization_id", default=None, type=str) # if coll_arg is None and org_arg is None: - if coll_arg is None: + if ds_arg is None: response = CountSvc.get_endpoint() else: - response = CountSvc.get_counts(coll_arg) + response = CountSvc.get_counts(ds_arg) return response diff --git a/flask_app/broker/badge.py b/flask_app/broker/badge.py index 9be666d6..d95bcff5 100644 --- a/flask_app/broker/badge.py +++ b/flask_app/broker/badge.py @@ -6,7 +6,7 @@ from flask_app.broker.constants import (ICON_CONTENT, ICON_DIR) from flask_app.common.s2n_type import APIService, S2nKey, ServiceProvider -from sppy.tools.s2n.utils import get_traceback +from sppy.tools.s2n.utils import combine_errinfo, get_traceback from flask_app.broker.base import _BrokerService @@ -82,34 +82,27 @@ def get_icon( try: good_params, errinfo = cls._standardize_params( provider=provider, icon_status=icon_status) - # Bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except Exception: - pass - except Exception: - # Unknown error - error_description = get_traceback() - raise BadRequest(error_description) + except BadRequest as e: + raise - icon_basename = cls._get_icon_filename( - good_params["provider"][0], good_params["icon_status"]) - icon_fname = os.path.join(app_path, ICON_DIR, icon_basename) + else: + icon_basename = cls._get_icon_filename( + good_params["provider"][0], good_params["icon_status"]) + icon_fname = os.path.join(app_path, ICON_DIR, icon_basename) + + if icon_fname is not None: + if stream: + return send_file( + icon_fname, mimetype=ICON_CONTENT, as_attachment=False) + else: + return send_file( + icon_fname, mimetype=ICON_CONTENT, as_attachment=True, + attachment_filename=icon_fname) - if icon_fname is not None: - if stream: - return send_file( - icon_fname, mimetype=ICON_CONTENT, as_attachment=False) else: - return send_file( - icon_fname, mimetype=ICON_CONTENT, as_attachment=True, - attachment_filename=icon_fname) - - else: - raise NotImplementedError( - f"Badge {icon_status} not implemented for provider {provider}") + raise NotImplementedError( + f"Badge {icon_status} not implemented for provider {provider}") # ............................................................................. diff --git a/flask_app/broker/name.py b/flask_app/broker/name.py index 4b1e2e63..d79a944a 100644 --- a/flask_app/broker/name.py +++ b/flask_app/broker/name.py @@ -9,7 +9,7 @@ from sppy.tools.provider.gbif import GbifAPI from sppy.tools.provider.itis import ItisAPI from sppy.tools.provider.worms import WormsAPI -from sppy.tools.s2n.utils import get_traceback +from sppy.tools.s2n.utils import combine_errinfo, get_traceback # ............................................................................. @@ -156,35 +156,24 @@ def get_name_records( good_params, errinfo = cls._standardize_params( namestr=namestr, provider=provider, is_accepted=is_accepted, gbif_parse=gbif_parse, gbif_count=gbif_count, kingdom=kingdom) - # Bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except KeyError: - pass - except Exception: - error_description = get_traceback() - raise BadRequest(error_description) - try: - # Do Query! 
- output = cls._get_records( - good_params["namestr"], good_params["provider"], - good_params["is_accepted"], good_params["gbif_count"], - good_params["kingdom"]) + except BadRequest as e: + full_output = cls._get_badquery_output(e.description) - # Add message on invalid parameters to output + else: try: - for err in errinfo["warning"]: - output.append_error("warning", err) - except KeyError: - pass + # Do Query!, returns BrokerOutput + full_output = cls._get_records( + good_params["namestr"], good_params["provider"], + good_params["is_accepted"], good_params["gbif_count"], + good_params["kingdom"]) + except Exception: + full_output = cls._get_badquery_output(get_traceback()) - except Exception: - error_description = get_traceback() - raise InternalServerError(error_description) + # Combine with errors from parameters + full_output.combine_errors(errinfo) - return output.response + return full_output.response # ............................................................................. diff --git a/flask_app/broker/occ.py b/flask_app/broker/occ.py index 007de72c..47d31378 100644 --- a/flask_app/broker/occ.py +++ b/flask_app/broker/occ.py @@ -161,36 +161,24 @@ def get_occurrence_records( good_params, errinfo = cls._standardize_params( occid=occid, provider=provider, gbif_dataset_key=gbif_dataset_key, count_only=count_only) - # errinfo indicates bad parameters - try: - error_description = "; ".join(errinfo["error"]) - raise BadRequest(error_description) - except KeyError: - pass - - except Exception: - error_description = get_traceback() - raise BadRequest(error_description) - # Do Query! - try: - output = cls._get_records( - good_params["occid"], good_params["provider"], - good_params["count_only"], - gbif_dataset_key=good_params["gbif_dataset_key"]) + except BadRequest as e: + full_output = cls._get_badquery_output(e.description) - # Add message on invalid parameters to output + else: try: - for err in errinfo["warning"]: - output.append_error("warning", err) - except KeyError: - pass - - except Exception: - error_description = get_traceback() - raise InternalServerError(error_description) - - return output.response + # Do Query!, returns BrokerOutput + full_output = cls._get_records( + good_params["occid"], good_params["provider"], + good_params["count_only"], + gbif_dataset_key=good_params["gbif_dataset_key"]) + except Exception: + full_output = cls._get_badquery_output(get_traceback()) + + # Combine with errors from parameters + full_output.combine_errors(errinfo) + + return full_output.response # ............................................................................. diff --git a/flask_app/common/s2n_type.py b/flask_app/common/s2n_type.py index 15831e2a..ea4dc7e6 100644 --- a/flask_app/common/s2n_type.py +++ b/flask_app/common/s2n_type.py @@ -823,6 +823,19 @@ def append_error(self, error_type, error_desc): except KeyError: self._response[S2nKey.ERRORS][error_type] = [error_desc] + # ............................................... + def combine_errors(self, errinfo): + """Combine a dictionary of errors to the errors in a S2nOutput query response. + + Args: + errinfo: dictionary of errors, with error level, and list of descriptions. + """ + for err_type, err_desc in errinfo.items(): + try: + self._response[S2nKey.ERRORS][err_type].append(err_desc) + except KeyError: + self._response[S2nKey.ERRORS][err_type] = [err_desc] + # ............................................... 
@property def response(self): diff --git a/sphinx/misc/debugging.rst b/sphinx/misc/debugging.rst index 8cfd0aa3..0c8b7844 100644 --- a/sphinx/misc/debugging.rst +++ b/sphinx/misc/debugging.rst @@ -18,11 +18,17 @@ export FLASK_ENV=development export FLASK_APP=flask_app.broker.routes flask run ``` -* With either Analyst or Broker, the development port will be 5000 +* With either Analyst or Broker, the development port will be 5000. Connect to + http://127.0.0.1:5000 in browser, - * Connect to http://127.0.0.1:5000 in browser, + * Broker i.e. http://127.0.0.1:5000/api/v1/name/Acer%20opalus%20Miller?is_accepted=True&gbif_count=False& + or http://127.0.0.1:5000/api/v1/occ/?occid=db8cc0df-1ed3-11e3-bfac-90b11c41863e&provider=gbif + * Analyst: + http://127.0.0.1:5000/api/v1/count/?dataset_key=0000e36f-d0e9-46b0-aa23-cc1980f00515 * Flask will auto-update on file save. * Refresh browser after changes +* The frontend endpoint cannot be tested this way, as it depends on frontend + **webpack-output** and **static-files** to be mounted as docker volumes. diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 5306e93a..15d43880 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -1,5 +1,6 @@ """Class to query tabular summary Specify Network data in S3""" import boto3 +import json import pandas as pd from sppy.aws.aws_tools import get_current_datadate_str @@ -33,7 +34,7 @@ def __init__( self.exp_type = 'SQL' # ---------------------------------------------------- - def _query_s3_table(self, s3_path, query_str): + def _query_s3_table(self, s3_path, query_str, format="JSON"): """Query the S3 resource defined for this class. Args: @@ -43,18 +44,36 @@ def _query_s3_table(self, s3_path, query_str): list of records matching the query """ recs = [] + if format not in ("JSON", "CSV"): + format = "JSON" + if format == "JSON": + out_serialization = {"JSON": {}} + elif format == "CSV": + out_serialization = { + "CSV": { + "QuoteFields": "ASNEEDED", + "FieldDelimiter": ",", + "QuoteCharacter": '"'} + } resp = self.s3.select_object_content( Bucket=self.bucket, - Key=self.s3_path, + Key=s3_path, ExpressionType='SQL', Expression=query_str, InputSerialization={"Parquet": {}}, - OutputSerialization={"JSON": {}} + OutputSerialization=out_serialization ) for event in resp["Payload"]: if "Records" in event: - records = event["Records"]["Payload"].decode(self.encoding) - recs.append(records) + recs_str = event["Records"]["Payload"].decode(ENCODING) + rec_strings = recs_str.split("\n") + for rs in rec_strings: + if rs: + if format == "JSON": + rec = json.loads(rs) + else: + rec = rs.split(",") + recs.append(rec) return recs # ---------------------------------------------------- @@ -108,14 +127,13 @@ def get_dataset_counts(self, dataset_key): datestr = get_current_datadate_str() datestr = "2024_02_01" s3_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" - query_str = (f"SELECT occ_count, species_count " + query_str = (f"SELECT datasetkey, occ_count, species_count " f"FROM s3object s " - f"WHERE s.datasetkey = {dataset_key}") + f"WHERE s.datasetkey = '{dataset_key}'") + print(query_str) # Returns empty list or list of 1 record with [(occ_count, species_count)] - records = self._query_s3_table(s3_path, query_str) - if records: - (occ_count, species_count) = records[0] - return (occ_count, species_count) + records = self._query_s3_table(s3_path, query_str, format="JSON") + return records # ---------------------------------------------------- def get_org_counts(self, 
pub_org_key): @@ -155,39 +173,56 @@ def rank_datasets_by_species(self, descending=True, limit=10): errors = {"error": get_traceback()} return records, errors - +# ............................................................................. +if __name__ == "__main__": + from sppy.aws.aws_constants import ENCODING, PROJ_BUCKET, REGION + + datestr = "2024_02_01" + dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" + s3 = boto3.client('s3') + + s3_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" + query_str = (f"SELECT datasetkey, occ_count, species_count " + f"FROM s3object s " + f"WHERE s.datasetkey = '{dataset_key}'") + query_str = f"SELECT datasetkey, occ_count, species_count FROM s3object s LIMIT 5" + + format = "CSV" + if format == "JSON": + out_serialization = {"JSON": {}} + elif format == "CSV": + out_serialization = { + "CSV": { + "QuoteFields": "ASNEEDED", + "FieldDelimiter": ",", + "QuoteCharacter": '"'} + } + resp = s3.select_object_content( + Bucket=PROJ_BUCKET, + Key=s3_path, + ExpressionType="SQL", + Expression=query_str, + InputSerialization={"Parquet": {}}, + OutputSerialization=out_serialization + ) + + for event in resp["Payload"]: + print(event) + if "Records" in event: + recs_str = event["Records"]["Payload"].decode(ENCODING) + rec_strings = recs_str.split("\n") + for rs in rec_strings: + if rs: + if format == "JSON": + rec = json.loads(rs) + else: + rec = rs.split(",") + print(rec) + + + + + # records = self._query_s3_table(s3_path, query_str) """ -import boto3 -SELECT s.datasetkey, s.occ_count, s.species_count FROM s3object s ORDER BY s.species_count DESC LIMIT 5 -from sppy.aws.aws_constants import ( - INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, - PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, - USER_DATA_TOKEN) - -ctable = "dataset_counts_2024_02_01_000.parquet" -ltable = "dataset_lists_2024_02_01_000.parquet" -s3_path = f"summary/{ctable}" -dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" - -s3 = boto3.client('s3') -query_str = (f"SELECT occ_count, species_count " - f"FROM s3object s " - f"WHERE s.datasetkey = '{dataset_key}'") - -SELECT * FROM s3object WHERE datasetkey = '0000e36f-d0e9-46b0-aa23-cc1980f00515' - -resp = s3.select_object_content( - Bucket=PROJ_BUCKET, - Key=s3_path, - ExpressionType='SQL', - Expression=query_str, - InputSerialization={"Parquet": {}}, - OutputSerialization={"CSV": {}} - ) - -for event in resp["Payload"]: - if "Records" in event: - records = event["Records"]["Payload"].decode('utf-8') - print(records) """ From 3dd75b74ef020e4bc5540136d214093850c79d36 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 14 Mar 2024 11:48:44 -0500 Subject: [PATCH 15/81] fixed circular imports --- flask_app/analyst/base.py | 10 ++- flask_app/analyst/count.py | 49 +++++++++------ flask_app/broker/base.py | 10 --- flask_app/broker/name.py | 4 +- flask_app/common/base.py | 13 ++-- sppy/tools/provider/awss3.py | 118 +++++++++++------------------------ 6 files changed, 79 insertions(+), 125 deletions(-) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 1ed8e959..0c771cb6 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -1,13 +1,11 @@ """Parent Class for the Specify Network API services.""" -from flask import Flask -from werkzeug.exceptions import (BadRequest, InternalServerError) +from werkzeug.exceptions import BadRequest -from flask_app.analyst.constants import QUERY_LIMIT from flask_app.common.base import _SpecifyNetworkService 
-from sppy.tools.s2n.utils import add_errinfo, get_traceback -from flask_app.common.s2n_type import AnalystOutput, APIEndpoint, APIService +from sppy.tools.s2n.utils import get_traceback +from flask_app.common.s2n_type import AnalystOutput, APIService -app = Flask(__name__) +# app = Flask(__name__) # ............................................................................. diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index cdd82375..006325f2 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -1,15 +1,14 @@ """Class for the Specify Network Name API service.""" -import boto3 from http import HTTPStatus -from werkzeug.exceptions import (BadRequest, InternalServerError) +from werkzeug.exceptions import BadRequest from flask_app.common.s2n_type import APIService, AnalystOutput from flask_app.common.util import print_analyst_output from flask_app.analyst.base import _AnalystService -from sppy.aws.aws_constants import ENCODING, PROJ_BUCKET, REGION +from sppy.aws.aws_constants import PROJ_BUCKET from sppy.tools.provider.awss3 import S3Query -from sppy.tools.s2n.utils import combine_errinfo, get_traceback +from sppy.tools.s2n.utils import (combine_errinfo, get_traceback) # ............................................................................. @@ -20,7 +19,16 @@ class CountSvc(_AnalystService): # ............................................... @classmethod - def get_counts(cls, dataset_key=None, pub_org_key=None): + def get_counts(cls, dataset_key=None, pub_org_key=None, format="CSV"): + """Return occurrence and species counts for dataset/organization identifiers. + + Args: + dataset_key: URL parameter for unique GBIF identifier of dataset. + pub_org_key: URL parameter for unique GBIF identifier of + publishingOrganization. + format: output format, options "CSV" or "JSON" + + """ if dataset_key is None and pub_org_key is None: return cls.get_endpoint() @@ -34,18 +42,20 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): else: # Query dataset counts - if dataset_key is not None: + if good_params["dataset_key"] is not None: try: - records, errors = cls._get_dataset_counts(dataset_key) + records, errors = cls._get_dataset_counts( + good_params["dataset_key"], format) except Exception: errors = {"error": get_traceback()} else: - allrecs.append(records) + if records: + allrecs.append(records) # Combine errors from success or failure errinfo = combine_errinfo(errinfo, errors) # Query organization counts - if pub_org_key is not None: + if good_params["pub_org_key"] is not None: errors = { "warning": "Count by Publishing Organization is not implemented"} errinfo = combine_errinfo(errinfo, errors) @@ -71,7 +81,7 @@ def get_ranked_counts(cls, descending=True, limit=10): else: # Do Query! try: - s3 = S3Query(PROJ_BUCKET, region=REGION, encoding=ENCODING) + s3 = S3Query(PROJ_BUCKET) records, errors = s3.rank_datasets_by_species( descending=True, limit=limit) except Exception: @@ -84,11 +94,12 @@ def get_ranked_counts(cls, descending=True, limit=10): # ............................................... @classmethod - def _get_dataset_counts(cls, dataset_key): + def _get_dataset_counts(cls, dataset_key, format): """Get counts for datasetKey. Args: - dataset_key: Unique identifier for GBIF datasets. + dataset_key: unique GBIF identifier for dataset of interest. 
+ format: output format, options "CSV" or "JSON" Returns: a flask_app.analyst.s2n_type.AnalystOutput object with optional records as a @@ -96,9 +107,9 @@ def _get_dataset_counts(cls, dataset_key): """ records = [] errors = {} - s3 = S3Query(PROJ_BUCKET, region=REGION, encoding=ENCODING) + s3 = S3Query(PROJ_BUCKET) try: - records = s3.get_dataset_counts(dataset_key) + records = s3.get_dataset_counts(dataset_key, format=format) except Exception: traceback = get_traceback() errors["error"] = [HTTPStatus.INTERNAL_SERVER_ERROR, traceback] @@ -119,7 +130,7 @@ def _get_organization_counts(cls, pub_org_key): """ records = [] errors = {} - s3 = S3Query(PROJ_BUCKET, region=REGION, encoding=ENCODING) + s3 = S3Query(PROJ_BUCKET) try: (occ_count, species_count) = s3.get_org_counts(pub_org_key) except Exception: @@ -155,8 +166,8 @@ def _get_organization_counts(cls, pub_org_key): svc = CountSvc() out = svc.get_endpoint() print_analyst_output(out, do_print_rec=True) - - ds_key = "a7156437-55ec-4c6f-89de-938f9361753d" - org_key = None - out = svc.get_counts(dataset_key=ds_key, pub_org_key=org_key) + format = "CSV" + dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" + out = svc.get_counts(dataset_key=dataset_key, pub_org_key=None, format="CSV") print_analyst_output(out, do_print_rec=True) + diff --git a/flask_app/broker/base.py b/flask_app/broker/base.py index c7b1c455..bba52669 100644 --- a/flask_app/broker/base.py +++ b/flask_app/broker/base.py @@ -9,8 +9,6 @@ from sppy.tools.provider.gbif import GbifAPI from sppy.tools.provider.itis import ItisAPI -app = Flask(__name__) - # ............................................................................. @app.errorhandler(BadRequest) @@ -268,14 +266,6 @@ def _standardize_params( "gbif_dataset_key": gbif_dataset_key, "count_only": count_only, "url": url, - # "bbox": bbox, - # "exceptions": exceptions, - # "height": height, - # "layers": layers, - # "request": request, - # "format": frmat, - # "srs": srs, - # "width": width, "icon_status": icon_status} providers, errinfo = cls._get_providers_from_string( diff --git a/flask_app/broker/name.py b/flask_app/broker/name.py index d79a944a..bb2343ff 100644 --- a/flask_app/broker/name.py +++ b/flask_app/broker/name.py @@ -1,5 +1,5 @@ """Class for the Specify Network Name API service.""" -from werkzeug.exceptions import (BadRequest, InternalServerError) +from werkzeug.exceptions import BadRequest from flask_app.broker.base import _BrokerService from flask_app.common.s2n_type import ( @@ -9,7 +9,7 @@ from sppy.tools.provider.gbif import GbifAPI from sppy.tools.provider.itis import ItisAPI from sppy.tools.provider.worms import WormsAPI -from sppy.tools.s2n.utils import combine_errinfo, get_traceback +from sppy.tools.s2n.utils import get_traceback # ............................................................................. 
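The Broker name service trimmed above can be exercised the same way as the Analyst count service. A sketch using the development URL given in the debugging notes (patch 14); requests is assumed to be available:

```
import requests

# URL and query string copied from sphinx/misc/debugging.rst; parameters mirror
# the __main__ test calls in name.py.
url = "http://127.0.0.1:5000/api/v1/name/Acer%20opalus%20Miller"
params = {"is_accepted": "True", "gbif_count": "False"}

resp = requests.get(url, params=params, timeout=30)
resp.raise_for_status()
payload = resp.json()

print(payload.get("errors"))
for provider_output in payload.get("records", []):
    print(provider_output)
```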
diff --git a/flask_app/common/base.py b/flask_app/common/base.py index 5144d0d5..017ea59d 100644 --- a/flask_app/common/base.py +++ b/flask_app/common/base.py @@ -2,11 +2,8 @@ from flask import Flask from werkzeug.exceptions import BadRequest, InternalServerError -import sppy.tools.s2n.utils as lmutil -from flask_app.common.s2n_type import ( - APIEndpoint, APIService, BrokerOutput, get_host_url, S2nKey, ServiceProvider) -from sppy.tools.provider.gbif import GbifAPI -from sppy.tools.provider.itis import ItisAPI +from sppy.tools.s2n.utils import add_errinfo +from flask_app.common.s2n_type import APIEndpoint app = Flask(__name__) @@ -226,12 +223,12 @@ def _process_params(cls, user_kwargs=None): elif key == "icon_status": valid_stat = param_meta["options"] if val is None: - errinfo = lmutil.add_errinfo( + errinfo = add_errinfo( errinfo, "error", f"Parameter {key} containing one of {valid_stat} options is " f"required") elif val not in valid_stat: - errinfo = lmutil.add_errinfo( + errinfo = add_errinfo( errinfo, "error", f"Value {val} for parameter {key} not in valid options " f"{valid_stat}") @@ -241,7 +238,7 @@ def _process_params(cls, user_kwargs=None): elif val is not None: usr_val, valid_options = cls._fix_type_new(key, val) if valid_options is not None and val not in valid_options: - errinfo = lmutil.add_errinfo( + errinfo = add_errinfo( errinfo, "error", f"Value {val} for parameter {key} is not in valid options " f"{param_meta['options']}") diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 15d43880..992d969e 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -3,42 +3,44 @@ import json import pandas as pd +from sppy.aws.aws_constants import ENCODING, PROJ_BUCKET, REGION, SUMMARY_FOLDER from sppy.aws.aws_tools import get_current_datadate_str -from sppy.aws.aws_constants import (REGION, SUMMARY_FOLDER) from sppy.tools.s2n.utils import get_traceback # ............................................................................. class S3Query(): - """Specify Network API service for retrieving tabular parquet data from AWS S3.""" + """Class for retrieving SpecifyNetwork summary data from AWS S3.""" # ............................................... @classmethod def __init__( - self, bucket, region=REGION, encoding="utf-8"): + self, bucket, region=REGION, encoding=ENCODING): """Object to query tabular data in S3. Args: bucket: S3 bucket containing data. - s3_path: S3 folder(s) containing data objects. - datatype: type of tabular data, 'CSV', 'JSON', and 'Parquet' are allowed. region: AWS region containing the data. encoding: encoding of the data. """ - self.s3 = boto3.client('s3') self.bucket = bucket self.region = region self.encoding = encoding - self._current_datestr = get_current_datadate_str() self.exp_type = 'SQL' + datestr = get_current_datadate_str() + datestr = "2024_02_01" + self._dataset_counts_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" + self._dataset_lists_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" # ---------------------------------------------------- - def _query_s3_table(self, s3_path, query_str, format="JSON"): + def _query_table(self, s3_path, query_str, format="CSV"): """Query the S3 resource defined for this class. Args: + s3_path: S3 folder and filename within the bucket query_str: a SQL query for S3 select. 
+ format: output format, options "CSV" or "JSON" Returns: list of records matching the query @@ -55,10 +57,11 @@ def _query_s3_table(self, s3_path, query_str, format="JSON"): "FieldDelimiter": ",", "QuoteCharacter": '"'} } - resp = self.s3.select_object_content( + s3 = boto3.client("s3", region_name=self.region) + resp = s3.select_object_content( Bucket=self.bucket, Key=s3_path, - ExpressionType='SQL', + ExpressionType="SQL", Expression=query_str, InputSerialization={"Parquet": {}}, OutputSerialization=out_serialization @@ -94,45 +97,44 @@ def _create_dataframe_from_s3obj(self, s3_path): return df # ---------------------------------------------------- - def _query_order_s3_table(self, s3_path, sort_field, descending, limit): + def _query_order_s3_table( + self, s3_path, sort_field, descending, limit, format="CSV"): """Query the S3 resource defined for this class. Args: - query_str: a SQL query for S3 select. + s3_path: S3 folder and filename within the bucket + sort_field: fieldname to sort records on + descending: boolean flag indicating to sort ascending or descending + limit: number of records to return, limit is 500 + format: output format, options "CSV" or "JSON" Returns: - list of records matching the query + ordered list of records matching the query """ - recs = [] - errors = {} - df = self._create_dataframe_from_s3obj(s3_path) - df.sort_values(by=sort_field, ascending=(not descending)) - for event in resp["Payload"]: - if "Records" in event: - records = event["Records"]["Payload"].decode(self.encoding) - recs.append(records) - return recs, errors + pass + # recs = [] + # errors = {} + # df = self._create_dataframe_from_s3obj(s3_path) + # df.sort_values(by=sort_field, ascending=(not descending)) + # return recs, errors # ---------------------------------------------------- - def get_dataset_counts(self, dataset_key): + def get_dataset_counts(self, dataset_key, format="CSV"): """Query the S3 resource for occurrence and species counts for this dataset. Args: dataset_key: unique GBIF identifier for dataset of interest. + format: output format, options "CSV" or "JSON" Returns: - records: empty list or list of 1 record containing occ_count, species_count + records: empty list or list of 1 record (list) """ - (occ_count, species_count) = (0,0) - datestr = get_current_datadate_str() - datestr = "2024_02_01" - s3_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" - query_str = (f"SELECT datasetkey, occ_count, species_count " - f"FROM s3object s " + query_str = (f"SELECT datasetkey, occ_count, species_count FROM s3object s " f"WHERE s.datasetkey = '{dataset_key}'") + query_str = "SELECT * FROM s3object s LIMIT 5" print(query_str) # Returns empty list or list of 1 record with [(occ_count, species_count)] - records = self._query_s3_table(s3_path, query_str, format="JSON") + records = self._query_table(self._dataset_counts_path, query_str, format=format) return records # ---------------------------------------------------- @@ -175,54 +177,10 @@ def rank_datasets_by_species(self, descending=True, limit=10): # ............................................................................. 
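_query_order_s3_table above is still a stub: S3 Select SQL has no ORDER BY, so the ranking has to happen client-side, which is why this series pulls the parquet object into a pandas DataFrame. A sketch of the intended behaviour, assuming the same column layout and that the pyarrow/s3fs packages from requirements.txt are installed:

```
import pandas as pd


def rank_datasets(s3_uri, sort_field="species_count", descending=True, limit=10):
    # Read the whole summary parquet object, sort it locally, and return the
    # top/bottom rows as plain lists for the AnalystOutput records.
    df = pd.read_parquet(s3_uri)
    df = df.sort_values(by=sort_field, ascending=not descending)
    return df.head(limit).values.tolist()


# PROJ_BUCKET resolves to "specnet-us-east-1" in aws_constants.py.
top10 = rank_datasets(
    "s3://specnet-us-east-1/summary/dataset_counts_2024_02_01_000.parquet")
```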
if __name__ == "__main__": - from sppy.aws.aws_constants import ENCODING, PROJ_BUCKET, REGION - - datestr = "2024_02_01" + format = "CSV" dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" - s3 = boto3.client('s3') + s3q = S3Query(PROJ_BUCKET) + recs = s3q.get_dataset_counts(dataset_key, format=format) + for r in recs: + print(r) - s3_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" - query_str = (f"SELECT datasetkey, occ_count, species_count " - f"FROM s3object s " - f"WHERE s.datasetkey = '{dataset_key}'") - query_str = f"SELECT datasetkey, occ_count, species_count FROM s3object s LIMIT 5" - - format = "CSV" - if format == "JSON": - out_serialization = {"JSON": {}} - elif format == "CSV": - out_serialization = { - "CSV": { - "QuoteFields": "ASNEEDED", - "FieldDelimiter": ",", - "QuoteCharacter": '"'} - } - resp = s3.select_object_content( - Bucket=PROJ_BUCKET, - Key=s3_path, - ExpressionType="SQL", - Expression=query_str, - InputSerialization={"Parquet": {}}, - OutputSerialization=out_serialization - ) - - for event in resp["Payload"]: - print(event) - if "Records" in event: - recs_str = event["Records"]["Payload"].decode(ENCODING) - rec_strings = recs_str.split("\n") - for rs in rec_strings: - if rs: - if format == "JSON": - rec = json.loads(rs) - else: - rec = rs.split(",") - print(rec) - - - - - # records = self._query_s3_table(s3_path, query_str) -""" - -""" From b19eda80d0fdff627460a1ed7cd0601b5d77e594 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 14 Mar 2024 12:21:49 -0500 Subject: [PATCH 16/81] move debug printing to output objects --- flask_app/analyst/count.py | 18 +++++---- flask_app/broker/name.py | 5 +-- flask_app/broker/occ.py | 11 +++--- flask_app/common/s2n_type.py | 77 +++++++++++++++++++++++++++++++++++- flask_app/common/util.py | 74 ---------------------------------- sppy/tools/provider/awss3.py | 10 ++--- 6 files changed, 97 insertions(+), 98 deletions(-) diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 006325f2..2eaf3165 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -3,7 +3,6 @@ from werkzeug.exceptions import BadRequest from flask_app.common.s2n_type import APIService, AnalystOutput -from flask_app.common.util import print_analyst_output from flask_app.analyst.base import _AnalystService from sppy.aws.aws_constants import PROJ_BUCKET @@ -19,7 +18,7 @@ class CountSvc(_AnalystService): # ............................................... @classmethod - def get_counts(cls, dataset_key=None, pub_org_key=None, format="CSV"): + def get_counts(cls, dataset_key=None, pub_org_key=None, format="JSON"): """Return occurrence and species counts for dataset/organization identifiers. Args: @@ -163,11 +162,14 @@ def _get_organization_counts(cls, pub_org_key): # ............................................................................. 
if __name__ == "__main__": - svc = CountSvc() - out = svc.get_endpoint() - print_analyst_output(out, do_print_rec=True) - format = "CSV" + format = "JSON" dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" - out = svc.get_counts(dataset_key=dataset_key, pub_org_key=None, format="CSV") - print_analyst_output(out, do_print_rec=True) + + svc = CountSvc() + response = svc.get_endpoint() + AnalystOutput.print_output(response, do_print_rec=True) + # print(response) + response = svc.get_counts(dataset_key=dataset_key, pub_org_key=None, format=format) + AnalystOutput.print_output(response, do_print_rec=True) + # print(response) diff --git a/flask_app/broker/name.py b/flask_app/broker/name.py index bb2343ff..cecde97c 100644 --- a/flask_app/broker/name.py +++ b/flask_app/broker/name.py @@ -4,7 +4,6 @@ from flask_app.broker.base import _BrokerService from flask_app.common.s2n_type import ( APIEndpoint, APIService, BrokerOutput, BrokerSchema, S2nKey, ServiceProvider) -from flask_app.common.util import print_broker_output from sppy.tools.provider.gbif import GbifAPI from sppy.tools.provider.itis import ItisAPI @@ -195,7 +194,7 @@ def get_name_records( svc = NameSvc() for namestr in test_names: - out = svc.get_name_records( + response = svc.get_name_records( namestr=namestr, provider=None, is_accepted=False, gbif_parse=True, gbif_count=True, kingdom=None) - print_broker_output(out, do_print_rec=True) + BrokerOutput.print_output(response, do_print_rec=True) diff --git a/flask_app/broker/occ.py b/flask_app/broker/occ.py index 47d31378..4f277c37 100644 --- a/flask_app/broker/occ.py +++ b/flask_app/broker/occ.py @@ -4,7 +4,6 @@ from flask_app.broker.base import _BrokerService from flask_app.common.s2n_type import ( APIEndpoint, APIService, BrokerOutput, BrokerSchema, S2nKey, ServiceProvider) -from flask_app.common.util import print_broker_output from sppy.tools.provider.gbif import GbifAPI from sppy.tools.provider.idigbio import IdigbioAPI @@ -206,11 +205,11 @@ def get_occurrence_records( svc = OccurrenceSvc() out = svc.get_endpoint() - out = svc.get_occurrence_records(occid="a7156437-55ec-4c6f-89de-938f9361753d") + response = svc.get_occurrence_records(occid="a7156437-55ec-4c6f-89de-938f9361753d") - print_broker_output(out, do_print_rec=True) + BrokerOutput.print_output(response, do_print_rec=True) # for occid in occids: - # out = svc.get_occurrence_records(occid=occid, provider=None, count_only=False) - # outputs = out["records"] - # print_broker_output(out, do_print_rec=True) + # response = svc.get_occurrence_records(occid=occid, provider=None, count_only=False) + # recs = response["records"] + # BrokerOutput.print_output(response, do_print_rec=True) diff --git a/flask_app/common/s2n_type.py b/flask_app/common/s2n_type.py index ea4dc7e6..92bcedd8 100644 --- a/flask_app/common/s2n_type.py +++ b/flask_app/common/s2n_type.py @@ -992,13 +992,65 @@ def format_records(self, ordered_fieldnames): ordered_recs.append(ordrec) self._response[S2nKey.RECORDS] = ordered_recs + # ............................................................................. 
+ @classmethod + def _print_sub_output(cls, oneelt, do_print_rec): + print("* One record of Specify Network Outputs *") + for name, attelt in oneelt.items(): + try: + if name == "records": + print(" records") + if do_print_rec is False: + print(f" {name}: {len(attelt)} returned records") + else: + for rec in attelt: + print(" record") + for k, v in rec.items(): + print(" {}: {}".format(k, v)) + else: + print(" {}: {}".format(name, attelt)) + except Exception: + pass + + # .................................... + @classmethod + def print_output(cls, response_dict, do_print_rec=False): + """Print a formatted string of the elements in an S2nOutput query response. + + Args: + response_dict: flask_app.broker.s2n_type.S2nOutput._response dictionary + do_print_rec: True to print each record in the response. + + TODO: move to a class method + """ + print("*** Broker output ***") + for name, attelt in response_dict.items(): + try: + if name == "records": + print("records: ") + for respdict in attelt: + cls._print_sub_output(respdict, do_print_rec) + else: + print(f"{name}: {attelt}") + except Exception: + pass + # outelts = set(response_dict.keys()) + # missing = S2nKey.broker_response_keys().difference(outelts) + # extras = outelts.difference(S2nKey.broker_response_keys()) + # if missing: + # print(f"Missing elements: {missing}") + # if extras: + # print(f"Extra elements: {extras}") + print("") + # ............................................................................. class AnalystOutput: """Response type for a Specify Network Analyst query.""" service: str description: str = "" - records: typing.List[dict] = [] + # records: typing.List[dict] = [] + records: typing.List = [] errors: dict = {} # ............................................... @@ -1008,7 +1060,7 @@ def __init__(self, service, description=None, records=None, errors=None): Args: service: API Service this object is responding to. description: Description of the computation in this response. - records: Records in this response. + records: Records (lists or dictionaries) in this response. errors: Errors encountered when generating this response. """ if errors is None: @@ -1035,6 +1087,27 @@ def response(self): """ return self._response + # .................................... + @classmethod + def print_output(cls, response_dict, do_print_rec=False): + """Print a formatted string of the elements in an S2nOutput query response. + + Args: + response_dict: flask_app.broker.s2n_type.S2nOutput._response dictionary + do_print_rec: True to print each record in the response. + """ + print("*** Analyst output ***") + for name, attelt in response_dict.items(): + try: + if name == "records" and do_print_rec: + print("records: ") + for rec in attelt: + print(rec) + else: + print(f"{name}: {attelt}") + except Exception: + pass + # ............................................................................. class ServiceProvider: diff --git a/flask_app/common/util.py b/flask_app/common/util.py index 3b5abd13..df3bd5ff 100644 --- a/flask_app/common/util.py +++ b/flask_app/common/util.py @@ -15,77 +15,3 @@ def get_host_url(): if host_url.endswith("/"): host_url = host_url[:-1] return host_url - - -# ............................................................................. 
-def _print_sub_output(oneelt, do_print_rec): - print("* One record of Specify Network Outputs *") - for name, attelt in oneelt.items(): - try: - if name == "records": - print(" records") - if do_print_rec is False: - print(f" {name}: {len(attelt)} returned records") - else: - for rec in attelt: - print(" record") - for k, v in rec.items(): - print(" {}: {}".format(k, v)) - else: - print(" {}: {}".format(name, attelt)) - except Exception: - pass - - -# .................................... -def print_broker_output(response_dict, do_print_rec=False): - """Print a formatted string of the elements in an S2nOutput query response. - - Args: - response_dict: flask_app.broker.s2n_type.S2nOutput object - do_print_rec: True to print each record in the response. - - TODO: move to a class method - """ - print("*** Broker output ***") - for name, attelt in response_dict.items(): - try: - if name == "records": - print(f"{name}: ") - for respdict in attelt: - _print_sub_output(respdict, do_print_rec) - else: - print(f"{name}: {attelt}") - except Exception: - pass - # outelts = set(response_dict.keys()) - # missing = S2nKey.broker_response_keys().difference(outelts) - # extras = outelts.difference(S2nKey.broker_response_keys()) - # if missing: - # print(f"Missing elements: {missing}") - # if extras: - # print(f"Extra elements: {extras}") - print("") - - -# .................................... -def print_analyst_output(response_dict, do_print_rec=False): - """Print a formatted string of the elements in an S2nOutput query response. - - Args: - response_dict: flask_app.broker.s2n_type.S2nOutput object - do_print_rec: True to print each record in the response. - - TODO: move to a class method - """ - print("*** Analyst output ***") - for name, attelt in response_dict.items(): - try: - if name == "records": - print(f"{name}: ") - for respdict in attelt: - _print_sub_output(respdict, do_print_rec) - else: - print(f"{name}: {attelt}") - except Exception: - pass diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 992d969e..1befc7fa 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -129,11 +129,11 @@ def get_dataset_counts(self, dataset_key, format="CSV"): Returns: records: empty list or list of 1 record (list) """ - query_str = (f"SELECT datasetkey, occ_count, species_count FROM s3object s " - f"WHERE s.datasetkey = '{dataset_key}'") - query_str = "SELECT * FROM s3object s LIMIT 5" - print(query_str) - # Returns empty list or list of 1 record with [(occ_count, species_count)] + query_str = ( + "SELECT datasetkey, occ_count, species_count FROM s3object s " + f"WHERE s.datasetkey = '{dataset_key}'" + ) + # Returns empty list or list of 1 record records = self._query_table(self._dataset_counts_path, query_str, format=format) return records From 527cf7e646580cb6dc9e3bad5f1f8281692b0b77 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 14 Mar 2024 14:20:38 -0500 Subject: [PATCH 17/81] remove methods that occur in superclass --- flask_app/broker/base.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/flask_app/broker/base.py b/flask_app/broker/base.py index bba52669..6c47a8ad 100644 --- a/flask_app/broker/base.py +++ b/flask_app/broker/base.py @@ -10,15 +10,6 @@ from sppy.tools.provider.itis import ItisAPI -# ............................................................................. 
-@app.errorhandler(BadRequest) -def handle_bad_request(e): - return f"Bad request: {e}" - -@app.errorhandler(InternalServerError) -def handle_bad_response(e): - return f"Internal Server Error: {e}" - # ............................................................................. class _BrokerService(_SpecifyNetworkService): """Base S-to-the-N service, handles parameter names and acceptable values.""" From e1372287e9150122f0a1e89280ecf9d224591e3d Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 14 Mar 2024 15:52:28 -0500 Subject: [PATCH 18/81] ranked counts return --- flask_app/analyst/base.py | 4 +- flask_app/analyst/count.py | 4 ++ flask_app/analyst/rank.py | 97 ++++++++++++++++++++++++++++++++++++ flask_app/broker/occ.py | 11 ++-- flask_app/common/base.py | 2 +- flask_app/common/s2n_type.py | 18 ++++++- sppy/tools/provider/awss3.py | 37 +++++++++----- 7 files changed, 150 insertions(+), 23 deletions(-) create mode 100644 flask_app/analyst/rank.py diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 0c771cb6..297eeb76 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -55,7 +55,8 @@ def _show_online(cls): # ............................................... @classmethod def _standardize_params( - cls, dataset_key=None, pub_org_key=None, descending=True, limit=10): + cls, dataset_key=None, pub_org_key=None, by_species=True, descending=True, + limit=10): """Standardize query parameters to send to appropriate service. Args: @@ -77,6 +78,7 @@ def _standardize_params( user_kwargs = { "dataset_key": dataset_key, "pub_org_key": pub_org_key, + "by_species": by_species, "descending": descending, "limit": limit } diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 2eaf3165..926451d4 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -27,6 +27,10 @@ def get_counts(cls, dataset_key=None, pub_org_key=None, format="JSON"): publishingOrganization. format: output format, options "CSV" or "JSON" + Returns: + full_output (flask_app.common.s2n_type.AnalystOutput): including records + as a list of one list (CSV) or dictionary (JSON) of a record + containing dataset_key, occurrence count, and species count. """ if dataset_key is None and pub_org_key is None: return cls.get_endpoint() diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py new file mode 100644 index 00000000..57240b1d --- /dev/null +++ b/flask_app/analyst/rank.py @@ -0,0 +1,97 @@ +"""Class for the Specify Network Name API service.""" +from http import HTTPStatus +from werkzeug.exceptions import BadRequest + +from flask_app.common.s2n_type import APIService, AnalystOutput +from flask_app.analyst.base import _AnalystService + +from sppy.aws.aws_constants import PROJ_BUCKET +from sppy.tools.provider.awss3 import S3Query +from sppy.tools.s2n.utils import (combine_errinfo, get_traceback) + + +# ............................................................................. +class RankSvc(_AnalystService): + """Specify Network API service for retrieving taxonomic information.""" + SERVICE_TYPE = APIService.Rank + ORDERED_FIELDNAMES = [] + + # ............................................... + @classmethod + def rank_counts(cls, by_species, descending=True, limit=1, format="JSON"): + """Return occurrence and species counts for dataset/organization identifiers. + + Args: + by_species: boolean URL parameter indicating whether to rank datasets by + species count (True) or occurrence count (False). 
+ descending: boolean URL parameter indicating whether to rank top down (True) + or bottom up (False). + limit: integer URL parameter specifying the number of ordered records to + return. + format: output format, options "CSV" or "JSON" + + full_output (flask_app.common.s2n_type.AnalystOutput): including records + as a list of lists (CSV) or dictionaries (JSON) of records + containing dataset_key, occurrence count, and species count. + """ + if by_species is None: + return cls.get_endpoint() + + records = [] + try: + good_params, errinfo = cls._standardize_params( + by_species=by_species, descending=descending, limit=limit) + + except BadRequest as e: + errinfo = {"error": e.description} + + else: + # Query for ordered dataset counts + try: + records, errors = cls._get_ordered_counts( + good_params["by_species"], good_params["descending"], + good_params["limit"], format) + except Exception: + errors = {"error": get_traceback()} + + # Combine errors from success or failure + errinfo = combine_errinfo(errinfo, errors) + + # Assemble + full_out = AnalystOutput( + cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], + records=records, errors=errinfo) + + return full_out.response + + # ............................................... + @classmethod + def _get_ordered_counts(cls, by_species, descending, limit, format): + records = [] + s3 = S3Query(PROJ_BUCKET) + try: + records, errinfo = s3.rank_datasets( + by_species, descending=descending, limit=limit) + + except Exception: + errinfo = {"error": get_traceback()} + + return records, errinfo + +# ............................................................................. +if __name__ == "__main__": + format = "CSV" + dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" + + svc = RankSvc() + response = svc.get_endpoint() + AnalystOutput.print_output(response, do_print_rec=True) + # print(response) + by_species = True + descending = True + limit = 5 + response = svc.rank_counts( + by_species, descending=descending, limit=limit, format=format) + AnalystOutput.print_output(response, do_print_rec=True) + # print(response) + diff --git a/flask_app/broker/occ.py b/flask_app/broker/occ.py index 4f277c37..3a6fb450 100644 --- a/flask_app/broker/occ.py +++ b/flask_app/broker/occ.py @@ -143,15 +143,10 @@ def get_occurrence_records( a count and records kwargs: any additional keyword arguments are ignored - Raises: - BadRequest: on invalid query parameters. - BadRequest: on unknown exception parsing parameters. - InternalServerError: on unknown exception when executing request - Returns: - a flask_app.broker.s2n_type.BrokerOutput object with optional records as a - list of dictionaries of records corresponding to specimen occurrences in - the provider database. + full_output (flask_app.common.s2n_type.BrokerOutput): including records + as a list of dictionaries of records corresponding to specimen + occurrences in the provider database. 
""" if occid is None and gbif_dataset_key is None: return cls.get_endpoint() diff --git a/flask_app/common/base.py b/flask_app/common/base.py index 017ea59d..0d599930 100644 --- a/flask_app/common/base.py +++ b/flask_app/common/base.py @@ -155,7 +155,7 @@ def _test_numbers(cls, provided_val, param_meta): min_val = None # If restricted numeric values, check try: - max_val = param_meta["min"] + max_val = param_meta["max"] except KeyError: max_val = None diff --git a/flask_app/common/s2n_type.py b/flask_app/common/s2n_type.py index 92bcedd8..95307272 100644 --- a/flask_app/common/s2n_type.py +++ b/flask_app/common/s2n_type.py @@ -81,11 +81,12 @@ class APIEndpoint: Occurrence = "occ" Frontend = "frontend" Count = "count" + Rank = "rank" @classmethod def Resources(cls): return { - cls.Analyst: [cls.Count], + cls.Analyst: [cls.Count, cls.Rank], cls.Broker: [ cls.Badge, @@ -181,6 +182,21 @@ class APIService: "publishing organization.", S2nKey.RECORD_FORMAT: "" } + # Rankings + Rank = { + "name": APIEndpoint.Rank, + "endpoint": f"{APIEndpoint.Root}/{APIEndpoint.Rank}", + "params": { + "by_species":{ "type": True, "default": True}, + "descending": { "type": True, "default": True}, + "limit": {"type": 2, "default": 10, "min": 1, "max": 500}, + }, + "description": + "Return an ordered list of datasets with occurrence and species counts " + "ranked by occurrence or species counts for the top X (descending) " + "or bottom X (ascending) datasets", + S2nKey.RECORD_FORMAT: "" + } # Taxonomic Resolution Name = { "name": APIEndpoint.Name, diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 1befc7fa..3365ba8f 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -103,7 +103,7 @@ def _query_order_s3_table( Args: s3_path: S3 folder and filename within the bucket - sort_field: fieldname to sort records on + sort_field: fieldname (column) to sort records on descending: boolean flag indicating to sort ascending or descending limit: number of records to return, limit is 500 format: output format, options "CSV" or "JSON" @@ -111,12 +111,21 @@ def _query_order_s3_table( Returns: ordered list of records matching the query """ - pass - # recs = [] - # errors = {} - # df = self._create_dataframe_from_s3obj(s3_path) - # df.sort_values(by=sort_field, ascending=(not descending)) - # return recs, errors + recs = [] + errors = {} + df = self._create_dataframe_from_s3obj(s3_path) + # Sort rows (Axis 0/index) by values in sort_field (column) + sorted_df = df.sort_values(by=sort_field, axis=0, ascending=(not descending)) + rec_df = sorted_df.head(limit) + + for row in rec_df.itertuples(): + rec = {"datasetkey": row.datasetkey, + "species_count": row.species_count, + "occ_count": row.occ_count} + recs.append(rec) + print(row) + print(rec) + return recs, errors # ---------------------------------------------------- def get_dataset_counts(self, dataset_key, format="CSV"): @@ -153,24 +162,28 @@ def get_org_counts(self, pub_org_key): return (occ_count, species_count) # ---------------------------------------------------- - def rank_datasets_by_species(self, descending=True, limit=10): + def rank_datasets(self, by_species, descending, limit, format="CSV"): """Return the top or bottom datasets, with counts, ranked by number of species. Args: + by_species: boolean flag indicating whether to rank datasets by + species count (True) or occurrence count (False). 
descending: boolean value, if true return top X datasets in descending order, if false, return bottom X datasets in ascending order limit: number of datasets to return, no more than 300. + format: output format, options "CSV" or "JSON" Returns: records: list of limit records containing dataset_key, occ_count, species_count """ records = [] - datestr = get_current_datadate_str() - datestr = "2024_02_01" - s3_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" + if by_species: + sort_field = "species_count" + else: + sort_field = "occ_count" try: records, errors = self._query_order_s3_table( - s3_path, "species_count", descending, limit) + self._dataset_counts_path, sort_field, descending, limit) except Exception as e: errors = {"error": get_traceback()} return records, errors From 6fe691bec46ed5081b3ebb24d22dc3ce6a4bc802 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Sun, 17 Mar 2024 16:30:33 -0500 Subject: [PATCH 19/81] exposed rank service --- flask_app/analyst/routes.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/flask_app/analyst/routes.py b/flask_app/analyst/routes.py index 7afc0d09..770a82f6 100644 --- a/flask_app/analyst/routes.py +++ b/flask_app/analyst/routes.py @@ -3,6 +3,7 @@ import os from flask_app.analyst.count import CountSvc +from flask_app.analyst.rank import RankSvc from flask_app.common.constants import ( STATIC_DIR, TEMPLATE_DIR, SCHEMA_DIR, SCHEMA_ANALYST_FNAME) from flask_app.common.s2n_type import APIEndpoint @@ -82,6 +83,25 @@ def count_endpoint(): return response +# ..................................................................................... +@app.route("/api/v1/rank/") +def rank_endpoint(): + """Get the available counts. + + Returns: + response: A flask_app.analyst API response object containing the count + API response. + """ + by_species_arg = request.args.get("by_species", default=None, type=bool) + descending_arg = request.args.get("descending", default=True, type=bool) + limit_arg = request.args.get("limit", default=10, type=int) + # if coll_arg is None and org_arg is None: + if by_species_arg is None: + response = RankSvc.get_endpoint() + else: + response = RankSvc.rank_counts(by_species_arg, descending_arg, limit_arg) + return response + # # ..................................................................................... # @app.route("/api/v1/collection/", methods=["GET"]) # def collection_get(): From f3440eb77fa37caf183fdac55a3dd0eb6abcbc4d Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 11:34:59 -0500 Subject: [PATCH 20/81] JSON default format --- sppy/tools/provider/awss3.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 3365ba8f..3c57bcd6 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -128,7 +128,7 @@ def _query_order_s3_table( return recs, errors # ---------------------------------------------------- - def get_dataset_counts(self, dataset_key, format="CSV"): + def get_dataset_counts(self, dataset_key, format="JSON"): """Query the S3 resource for occurrence and species counts for this dataset. 
Args: @@ -162,7 +162,7 @@ def get_org_counts(self, pub_org_key): return (occ_count, species_count) # ---------------------------------------------------- - def rank_datasets(self, by_species, descending, limit, format="CSV"): + def rank_datasets(self, by_species, descending, limit, format="JSON"): """Return the top or bottom datasets, with counts, ranked by number of species. Args: @@ -190,7 +190,7 @@ def rank_datasets(self, by_species, descending, limit, format="CSV"): # ............................................................................. if __name__ == "__main__": - format = "CSV" + format = "JSON" dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" s3q = S3Query(PROJ_BUCKET) recs = s3q.get_dataset_counts(dataset_key, format=format) From 998cbb4c3d05cac36623a38d4946a2e4e638a08e Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 11:36:25 -0500 Subject: [PATCH 21/81] fix args for s3.rank_datasets call; remove format option --- flask_app/analyst/count.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 926451d4..a51edc11 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -18,14 +18,13 @@ class CountSvc(_AnalystService): # ............................................... @classmethod - def get_counts(cls, dataset_key=None, pub_org_key=None, format="JSON"): + def get_counts(cls, dataset_key=None, pub_org_key=None): """Return occurrence and species counts for dataset/organization identifiers. Args: dataset_key: URL parameter for unique GBIF identifier of dataset. pub_org_key: URL parameter for unique GBIF identifier of publishingOrganization. - format: output format, options "CSV" or "JSON" Returns: full_output (flask_app.common.s2n_type.AnalystOutput): including records @@ -48,7 +47,7 @@ def get_counts(cls, dataset_key=None, pub_org_key=None, format="JSON"): if good_params["dataset_key"] is not None: try: records, errors = cls._get_dataset_counts( - good_params["dataset_key"], format) + good_params["dataset_key"]) except Exception: errors = {"error": get_traceback()} else: @@ -72,11 +71,11 @@ def get_counts(cls, dataset_key=None, pub_org_key=None, format="JSON"): # ............................................... @classmethod - def get_ranked_counts(cls, descending=True, limit=10): + def get_ranked_counts(cls, by_species=True, descending=True, limit=10): allrecs = [] try: good_params, errinfo = cls._standardize_params( - cls, descending=descending, limit=limit) + cls, by_species=by_species, descending=descending, limit=limit) except BadRequest as e: errinfo = combine_errinfo(errinfo, {"error": e.description}) @@ -85,8 +84,7 @@ def get_ranked_counts(cls, descending=True, limit=10): # Do Query! try: s3 = S3Query(PROJ_BUCKET) - records, errors = s3.rank_datasets_by_species( - descending=True, limit=limit) + records, errors = s3.rank_datasets(by_species, descending, limit) except Exception: errors = {"error": get_traceback()} else: @@ -97,12 +95,11 @@ def get_ranked_counts(cls, descending=True, limit=10): # ............................................... @classmethod - def _get_dataset_counts(cls, dataset_key, format): + def _get_dataset_counts(cls, dataset_key): """Get counts for datasetKey. Args: dataset_key: unique GBIF identifier for dataset of interest. 
- format: output format, options "CSV" or "JSON" Returns: a flask_app.analyst.s2n_type.AnalystOutput object with optional records as a @@ -112,7 +109,7 @@ def _get_dataset_counts(cls, dataset_key, format): errors = {} s3 = S3Query(PROJ_BUCKET) try: - records = s3.get_dataset_counts(dataset_key, format=format) + records = s3.get_dataset_counts(dataset_key) except Exception: traceback = get_traceback() errors["error"] = [HTTPStatus.INTERNAL_SERVER_ERROR, traceback] @@ -166,14 +163,13 @@ def _get_organization_counts(cls, pub_org_key): # ............................................................................. if __name__ == "__main__": - format = "JSON" dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" svc = CountSvc() response = svc.get_endpoint() AnalystOutput.print_output(response, do_print_rec=True) # print(response) - response = svc.get_counts(dataset_key=dataset_key, pub_org_key=None, format=format) + response = svc.get_counts(dataset_key=dataset_key, pub_org_key=None) AnalystOutput.print_output(response, do_print_rec=True) # print(response) From 2875b6b8c23f0b9ad4424ec402fb8f4fd493cb9c Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 11:36:41 -0500 Subject: [PATCH 22/81] doc --- sphinx/misc/debugging.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/sphinx/misc/debugging.rst b/sphinx/misc/debugging.rst index 0c8b7844..c9d900d8 100644 --- a/sphinx/misc/debugging.rst +++ b/sphinx/misc/debugging.rst @@ -27,6 +27,7 @@ flask run * Analyst: http://127.0.0.1:5000/api/v1/count/?dataset_key=0000e36f-d0e9-46b0-aa23-cc1980f00515 + http://127.0.0.1:5000/api/v1/rank/?by_species=true * Flask will auto-update on file save. * Refresh browser after changes * The frontend endpoint cannot be tested this way, as it depends on frontend From f313b32a59103c6963ec8359953523f00f66d011 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 13:17:55 -0500 Subject: [PATCH 23/81] only encode value to str if returned as bytes --- sppy/tools/provider/gbif.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sppy/tools/provider/gbif.py b/sppy/tools/provider/gbif.py index 7854bc7f..890a527e 100644 --- a/sppy/tools/provider/gbif.py +++ b/sppy/tools/provider/gbif.py @@ -63,10 +63,11 @@ def _assemble_filter_string(self, filter_string=None): @classmethod def _get_output_val(cls, out_dict, name): try: - tmp = out_dict[name] - val = str(tmp).encode(ENCODING) + val = out_dict[name] except Exception: return None + if type(val) is bytes: + val = str(val).encode(ENCODING) return val # ............................................... 
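For reference, the bytes handling that the change above is concerned with can be seen in a standalone sketch (the helper name and the encoding value are illustrative assumptions, not part of the patch); turning a bytes payload into text is a decode, while encode goes the other direction::

    ENCODING = "utf-8"   # assumed to match the project's ENCODING constant

    def normalize_value(val):
        # Illustrative only: pass str (and other types) through unchanged,
        # and decode values that arrive from a provider response as bytes.
        if isinstance(val, bytes):
            return val.decode(ENCODING)
        return val

    assert normalize_value(b"Poa annua") == "Poa annua"
    assert normalize_value("Poa annua") == "Poa annua"
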
From f94bb2b139b4cc8e540d764a9ae466e4b2991890 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 13:19:50 -0500 Subject: [PATCH 24/81] return recs as json or csv --- sppy/tools/provider/awss3.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 3c57bcd6..9b88401c 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -69,14 +69,13 @@ def _query_table(self, s3_path, query_str, format="CSV"): for event in resp["Payload"]: if "Records" in event: recs_str = event["Records"]["Payload"].decode(ENCODING) - rec_strings = recs_str.split("\n") + rec_strings = recs_str.strip().split("\n") for rs in rec_strings: - if rs: - if format == "JSON": - rec = json.loads(rs) - else: - rec = rs.split(",") - recs.append(rec) + if format == "JSON": + rec = json.loads(rs) + else: + rec = rs.split(",") + recs.append(rec) return recs # ---------------------------------------------------- @@ -90,9 +89,7 @@ def _create_dataframe_from_s3obj(self, s3_path): df: pandas DataFrame containing the CSV data. """ # import pyarrow.parquet as pq - # import s3fs s3_uri = f"s3://{self.bucket}/{s3_path}" - # s3_fs = s3fs.S3FileSystem df = pd.read_parquet(s3_uri) return df @@ -123,8 +120,6 @@ def _query_order_s3_table( "species_count": row.species_count, "occ_count": row.occ_count} recs.append(rec) - print(row) - print(rec) return recs, errors # ---------------------------------------------------- From a50599a0ceab12bd5f11154af40e645fcf1fab30 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 14:13:59 -0500 Subject: [PATCH 25/81] remove obsolete --- flask_app/analyst/count.py | 67 -------------------------------------- 1 file changed, 67 deletions(-) diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index a51edc11..7d14e2b5 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -69,30 +69,6 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): return full_out.response - # ............................................... - @classmethod - def get_ranked_counts(cls, by_species=True, descending=True, limit=10): - allrecs = [] - try: - good_params, errinfo = cls._standardize_params( - cls, by_species=by_species, descending=descending, limit=limit) - - except BadRequest as e: - errinfo = combine_errinfo(errinfo, {"error": e.description}) - - else: - # Do Query! - try: - s3 = S3Query(PROJ_BUCKET) - records, errors = s3.rank_datasets(by_species, descending, limit) - except Exception: - errors = {"error": get_traceback()} - else: - allrecs.append(records) - # Combine errors from success or failure - errinfo = combine_errinfo(errinfo, errors) - return allrecs, errinfo - # ............................................... @classmethod def _get_dataset_counts(cls, dataset_key): @@ -116,49 +92,6 @@ def _get_dataset_counts(cls, dataset_key): return records, errors - # ............................................... - @classmethod - def _get_organization_counts(cls, pub_org_key): - """Get counts for publishingOrganizationKey. - - Args: - pub_org_key: Unique identifier for GBIF publishing organizations. - - Returns: - a flask_app.analyst.s2n_type.AnalystOutput object with optional records as a - list of records corresponding to occurrence and counts for the organization. 
- """ - records = [] - errors = {} - s3 = S3Query(PROJ_BUCKET) - try: - (occ_count, species_count) = s3.get_org_counts(pub_org_key) - except Exception: - traceback = get_traceback() - errors["error"] = [HTTPStatus.INTERNAL_SERVER_ERROR, traceback] - else: - records.append((occ_count, species_count)) - return records, errors - - - # # ............................................... - # @classmethod - # def _get_records(cls, dataset_key, pub_org_key): - # allrecs = [] - # # for response metadata - # if dataset_key is not None: - # records, errors = cls._get_dataset_counts(dataset_key) - # allrecs.append(records) - # if pub_org_key is not None: - # records, errors = cls._get_organization_counts(pub_org_key) - # allrecs.append(records) - # - # # Assemble - # full_out = AnalystOutput( - # cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], - # records=allrecs, errors={}) - # - # return full_out # ............................................................................. From b6447d38307d94dc45f2c1fad588a6b9c52c8ca4 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 14:16:07 -0500 Subject: [PATCH 26/81] change rank URL keys and types --- flask_app/analyst/base.py | 10 +++++----- flask_app/analyst/rank.py | 27 +++++++++++++-------------- flask_app/analyst/routes.py | 12 ++++++++---- flask_app/common/s2n_type.py | 14 ++++++++++---- sppy/tools/provider/awss3.py | 21 +++++++++++---------- 5 files changed, 47 insertions(+), 37 deletions(-) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 297eeb76..8408d702 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -55,15 +55,15 @@ def _show_online(cls): # ............................................... @classmethod def _standardize_params( - cls, dataset_key=None, pub_org_key=None, by_species=True, descending=True, + cls, dataset_key=None, pub_org_key=None, count_by=None, order=None, limit=10): """Standardize query parameters to send to appropriate service. Args: dataset_key: unique GBIF dataset identifier for comparisons pub_org_key: unique publishing organization identifier for comparisons - descending: boolean value indicating whether to sort records descending - (True) or ascending (False) + count_by: counts of "occurrence" or "species" + order: sort records "descending" or "ascending" limit: integer indicating how many ranked records to return, value must be less than QUERY_LIMIT. @@ -78,8 +78,8 @@ def _standardize_params( user_kwargs = { "dataset_key": dataset_key, "pub_org_key": pub_org_key, - "by_species": by_species, - "descending": descending, + "count_by": count_by, + "order": order, "limit": limit } diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index 57240b1d..f51ba086 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -18,14 +18,14 @@ class RankSvc(_AnalystService): # ............................................... @classmethod - def rank_counts(cls, by_species, descending=True, limit=1, format="JSON"): + def rank_counts(cls, count_by, order=None, limit=1, format="JSON"): """Return occurrence and species counts for dataset/organization identifiers. Args: - by_species: boolean URL parameter indicating whether to rank datasets by - species count (True) or occurrence count (False). - descending: boolean URL parameter indicating whether to rank top down (True) - or bottom up (False). + count_by: URL parameter indicating rank datasets by counts of "species" or + "occurrence" . 
+ order: URL parameter indicating whether to rank in "descending" or + "ascending" order. limit: integer URL parameter specifying the number of ordered records to return. format: output format, options "CSV" or "JSON" @@ -34,13 +34,13 @@ def rank_counts(cls, by_species, descending=True, limit=1, format="JSON"): as a list of lists (CSV) or dictionaries (JSON) of records containing dataset_key, occurrence count, and species count. """ - if by_species is None: + if count_by is None: return cls.get_endpoint() records = [] try: good_params, errinfo = cls._standardize_params( - by_species=by_species, descending=descending, limit=limit) + count_by=count_by, order=order, limit=limit) except BadRequest as e: errinfo = {"error": e.description} @@ -49,7 +49,7 @@ def rank_counts(cls, by_species, descending=True, limit=1, format="JSON"): # Query for ordered dataset counts try: records, errors = cls._get_ordered_counts( - good_params["by_species"], good_params["descending"], + good_params["count_by"], good_params["order"], good_params["limit"], format) except Exception: errors = {"error": get_traceback()} @@ -66,12 +66,11 @@ def rank_counts(cls, by_species, descending=True, limit=1, format="JSON"): # ............................................... @classmethod - def _get_ordered_counts(cls, by_species, descending, limit, format): + def _get_ordered_counts(cls, count_by, order, limit, format): records = [] s3 = S3Query(PROJ_BUCKET) try: - records, errinfo = s3.rank_datasets( - by_species, descending=descending, limit=limit) + records, errinfo = s3.rank_datasets(count_by, order, limit) except Exception: errinfo = {"error": get_traceback()} @@ -87,11 +86,11 @@ def _get_ordered_counts(cls, by_species, descending, limit, format): response = svc.get_endpoint() AnalystOutput.print_output(response, do_print_rec=True) # print(response) - by_species = True - descending = True + count_by = "species" + order = "ascending" limit = 5 response = svc.rank_counts( - by_species, descending=descending, limit=limit, format=format) + count_by, order=order, limit=limit, format=format) AnalystOutput.print_output(response, do_print_rec=True) # print(response) diff --git a/flask_app/analyst/routes.py b/flask_app/analyst/routes.py index 770a82f6..d314c372 100644 --- a/flask_app/analyst/routes.py +++ b/flask_app/analyst/routes.py @@ -92,14 +92,18 @@ def rank_endpoint(): response: A flask_app.analyst API response object containing the count API response. """ - by_species_arg = request.args.get("by_species", default=None, type=bool) - descending_arg = request.args.get("descending", default=True, type=bool) + count_by_arg = request.args.get("count_by", default=None, type=str) + order_arg = request.args.get("order", default=None, type=str) limit_arg = request.args.get("limit", default=10, type=int) + print( + f"*** count_by_arg={count_by_arg}, order_arg={order_arg}, " + f"limit_arg={limit_arg} ***") # if coll_arg is None and org_arg is None: - if by_species_arg is None: + if count_by_arg is None: response = RankSvc.get_endpoint() else: - response = RankSvc.rank_counts(by_species_arg, descending_arg, limit_arg) + response = RankSvc.rank_counts( + count_by_arg, order=order_arg, limit=limit_arg) return response # # ..................................................................................... 
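As a usage illustration for the renamed keys (values are examples only; host and port assume the local flask run setup described in sphinx/misc/debugging.rst)::

    curl "http://127.0.0.1:5000/api/v1/rank/?count_by=species&order=descending&limit=5"
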
diff --git a/flask_app/common/s2n_type.py b/flask_app/common/s2n_type.py index 95307272..b11efa56 100644 --- a/flask_app/common/s2n_type.py +++ b/flask_app/common/s2n_type.py @@ -174,8 +174,6 @@ class APIService: "description": "GBIF Publishing Organization Key", "default": None }, - "descending": { "type": True, "default": True}, - "limit": {"type": 2, "default": 10, "min": 1, "max": 500}, }, "description": "Return occurrence and species counts for the given dataset or " @@ -187,8 +185,16 @@ class APIService: "name": APIEndpoint.Rank, "endpoint": f"{APIEndpoint.Root}/{APIEndpoint.Rank}", "params": { - "by_species":{ "type": True, "default": True}, - "descending": { "type": True, "default": True}, + "count_by": { + "type": "", + "options": ["occurrence", "species"], + "default": None + }, + "order": { + "type": "", + "options": ["ascending", "descending"], + "default": None + }, "limit": {"type": 2, "default": 10, "min": 1, "max": 500}, }, "description": diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 9b88401c..da511b1a 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -95,13 +95,13 @@ def _create_dataframe_from_s3obj(self, s3_path): # ---------------------------------------------------- def _query_order_s3_table( - self, s3_path, sort_field, descending, limit, format="CSV"): + self, s3_path, sort_field, order, limit, format="CSV"): """Query the S3 resource defined for this class. Args: s3_path: S3 folder and filename within the bucket sort_field: fieldname (column) to sort records on - descending: boolean flag indicating to sort ascending or descending + order: boolean flag indicating to sort ascending or descending limit: number of records to return, limit is 500 format: output format, options "CSV" or "JSON" @@ -112,7 +112,8 @@ def _query_order_s3_table( errors = {} df = self._create_dataframe_from_s3obj(s3_path) # Sort rows (Axis 0/index) by values in sort_field (column) - sorted_df = df.sort_values(by=sort_field, axis=0, ascending=(not descending)) + sorted_df = df.sort_values( + by=sort_field, axis=0, ascending=(order == "ascending")) rec_df = sorted_df.head(limit) for row in rec_df.itertuples(): @@ -157,14 +158,14 @@ def get_org_counts(self, pub_org_key): return (occ_count, species_count) # ---------------------------------------------------- - def rank_datasets(self, by_species, descending, limit, format="JSON"): + def rank_datasets(self, count_by, order, limit, format="JSON"): """Return the top or bottom datasets, with counts, ranked by number of species. Args: - by_species: boolean flag indicating whether to rank datasets by - species count (True) or occurrence count (False). - descending: boolean value, if true return top X datasets in descending - order, if false, return bottom X datasets in ascending order + count_by: string indicating rank datasets by counts of "species" or + "occurrence" . + order: string indicating whether to rank in "descending" or + "ascending" order. limit: number of datasets to return, no more than 300. 
format: output format, options "CSV" or "JSON" @@ -172,13 +173,13 @@ def rank_datasets(self, by_species, descending, limit, format="JSON"): records: list of limit records containing dataset_key, occ_count, species_count """ records = [] - if by_species: + if count_by == "species": sort_field = "species_count" else: sort_field = "occ_count" try: records, errors = self._query_order_s3_table( - self._dataset_counts_path, sort_field, descending, limit) + self._dataset_counts_path, sort_field, order, limit) except Exception as e: errors = {"error": get_traceback()} return records, errors From 8342e2eff03669266e85df5f924a24067d826723 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 14:20:50 -0500 Subject: [PATCH 27/81] debug examples, rm old code --- flask_app/analyst/routes.py | 2 -- sphinx/misc/debugging.rst | 6 ++++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/flask_app/analyst/routes.py b/flask_app/analyst/routes.py index d314c372..9bd338a7 100644 --- a/flask_app/analyst/routes.py +++ b/flask_app/analyst/routes.py @@ -74,8 +74,6 @@ def count_endpoint(): API response. """ ds_arg = request.args.get("dataset_key", default=None, type=str) - # org_arg = request.args.get("organization_id", default=None, type=str) - # if coll_arg is None and org_arg is None: if ds_arg is None: response = CountSvc.get_endpoint() else: diff --git a/sphinx/misc/debugging.rst b/sphinx/misc/debugging.rst index c9d900d8..24659dd4 100644 --- a/sphinx/misc/debugging.rst +++ b/sphinx/misc/debugging.rst @@ -22,12 +22,14 @@ flask run http://127.0.0.1:5000 in browser, * Broker - i.e. http://127.0.0.1:5000/api/v1/name/Acer%20opalus%20Miller?is_accepted=True&gbif_count=False& - or http://127.0.0.1:5000/api/v1/occ/?occid=db8cc0df-1ed3-11e3-bfac-90b11c41863e&provider=gbif + * http://127.0.0.1:5000/api/v1/name/Acer%20opalus%20Miller?is_accepted=True&gbif_count=False& + * http://127.0.0.1:5000/api/v1/occ/?occid=db8cc0df-1ed3-11e3-bfac-90b11c41863e&provider=gbif + * http://127.0.0.1:5000/api/v1/badge/?provider=mopho * Analyst: http://127.0.0.1:5000/api/v1/count/?dataset_key=0000e36f-d0e9-46b0-aa23-cc1980f00515 http://127.0.0.1:5000/api/v1/rank/?by_species=true + * Flask will auto-update on file save. * Refresh browser after changes * The frontend endpoint cannot be tested this way, as it depends on frontend From b6cc4234d67e44f2cdf9d19176f3ff384838a307 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 14:22:42 -0500 Subject: [PATCH 28/81] rm format option from API --- flask_app/analyst/rank.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index f51ba086..6bb57825 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -18,7 +18,7 @@ class RankSvc(_AnalystService): # ............................................... @classmethod - def rank_counts(cls, count_by, order=None, limit=1, format="JSON"): + def rank_counts(cls, count_by, order=None, limit=1): """Return occurrence and species counts for dataset/organization identifiers. Args: @@ -28,7 +28,6 @@ def rank_counts(cls, count_by, order=None, limit=1, format="JSON"): "ascending" order. limit: integer URL parameter specifying the number of ordered records to return. 
- format: output format, options "CSV" or "JSON" full_output (flask_app.common.s2n_type.AnalystOutput): including records as a list of lists (CSV) or dictionaries (JSON) of records @@ -50,7 +49,7 @@ def rank_counts(cls, count_by, order=None, limit=1, format="JSON"): try: records, errors = cls._get_ordered_counts( good_params["count_by"], good_params["order"], - good_params["limit"], format) + good_params["limit"]) except Exception: errors = {"error": get_traceback()} @@ -66,7 +65,7 @@ def rank_counts(cls, count_by, order=None, limit=1, format="JSON"): # ............................................... @classmethod - def _get_ordered_counts(cls, count_by, order, limit, format): + def _get_ordered_counts(cls, count_by, order, limit): records = [] s3 = S3Query(PROJ_BUCKET) try: @@ -79,7 +78,6 @@ def _get_ordered_counts(cls, count_by, order, limit, format): # ............................................................................. if __name__ == "__main__": - format = "CSV" dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" svc = RankSvc() @@ -90,7 +88,7 @@ def _get_ordered_counts(cls, count_by, order, limit, format): order = "ascending" limit = 5 response = svc.rank_counts( - count_by, order=order, limit=limit, format=format) + count_by, order=order, limit=limit) AnalystOutput.print_output(response, do_print_rec=True) # print(response) From 0ebfb237fad98a3c531e43e912cbd19a7549d0af Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 15:44:49 -0500 Subject: [PATCH 29/81] add dataset_name to returned records --- flask_app/analyst/base.py | 15 +++++++++++++-- flask_app/analyst/count.py | 3 +++ flask_app/analyst/rank.py | 4 ++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 8408d702..b0b3a685 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -2,10 +2,9 @@ from werkzeug.exceptions import BadRequest from flask_app.common.base import _SpecifyNetworkService -from sppy.tools.s2n.utils import get_traceback from flask_app.common.s2n_type import AnalystOutput, APIService -# app = Flask(__name__) +from sppy.tools.s2n.utils import get_traceback # ............................................................................. @@ -98,6 +97,18 @@ def _standardize_params( return usr_params, errinfo + # ............................................... + @classmethod + def _add_dataset_names_to_records( + cls, records, dataset_key_field="datasetkey", + dataset_name_field="dataset_name"): + # if import is at top level, causes recursion error in awss3.count_datasets + from sppy.tools.provider.gbif import GbifAPI + gbif = GbifAPI(service="dataset") + for rec in records: + dataset_name, _ = gbif.get_dataset(rec[dataset_key_field]) + rec[dataset_name_field] = dataset_name + # ............................................................................. 
if __name__ == "__main__": diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 7d14e2b5..12d24a7e 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -51,6 +51,9 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): except Exception: errors = {"error": get_traceback()} else: + cls._add_dataset_names_to_records( + records, dataset_key_field="datasetkey", + dataset_name_field="dataset_name") if records: allrecs.append(records) # Combine errors from success or failure diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index 6bb57825..c6dc1021 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -74,6 +74,10 @@ def _get_ordered_counts(cls, count_by, order, limit): except Exception: errinfo = {"error": get_traceback()} + cls._add_dataset_names_to_records( + records, dataset_key_field="datasetkey", + dataset_name_field="dataset_name") + return records, errinfo # ............................................................................. From 849ff4cbf2eaad438cc7558e058483908aabaf4b Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 15:45:13 -0500 Subject: [PATCH 30/81] remove obsolete imports --- sppy/tools/provider/gbif.py | 2 +- sppy/tools/s2n/utils.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/sppy/tools/provider/gbif.py b/sppy/tools/provider/gbif.py index 890a527e..a2c521dd 100644 --- a/sppy/tools/provider/gbif.py +++ b/sppy/tools/provider/gbif.py @@ -12,7 +12,7 @@ from sppy.tools.util.logtools import logit from sppy.tools.provider.api import APIQuery -from sppy.tools.s2n.utils import get_traceback, add_errinfo +from sppy.tools.s2n.utils import add_errinfo # ............................................................................. diff --git a/sppy/tools/s2n/utils.py b/sppy/tools/s2n/utils.py index 3b6895ea..bc3f3241 100644 --- a/sppy/tools/s2n/utils.py +++ b/sppy/tools/s2n/utils.py @@ -3,9 +3,6 @@ import traceback from uuid import UUID -# from flask_app.broker.constants import ICON_API, ServiceProvider -# from flask_app.common.s2n_type import APIEndpoint - # ...................................................... 
def is_valid_uuid(uuid_to_test, version=4): From 13e33b8e921ce3fe35fb74101fd084779d3c3c0d Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 15:47:22 -0500 Subject: [PATCH 31/81] add todo doc --- flask_app/analyst/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index b0b3a685..3c29035a 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -102,6 +102,7 @@ def _standardize_params( def _add_dataset_names_to_records( cls, records, dataset_key_field="datasetkey", dataset_name_field="dataset_name"): + # TODO: change this to a call to an S3 table with all dataset keys/names # if import is at top level, causes recursion error in awss3.count_datasets from sppy.tools.provider.gbif import GbifAPI gbif = GbifAPI(service="dataset") From 73e6a50e5ec26a59da9b5e9eb47f6cb2dbb5f3f9 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 18 Mar 2024 17:02:37 -0500 Subject: [PATCH 32/81] documentation --- .env.broker.conf | 2 +- sphinx/about/install_run_notes.rst | 340 ++++++++++++++--------------- sphinx/aws/aws_workflow.rst | 8 +- sphinx/misc/debugging.rst | 42 ++++ 4 files changed, 205 insertions(+), 187 deletions(-) diff --git a/.env.broker.conf b/.env.broker.conf index 95dab4b7..7cfc46b7 100644 --- a/.env.broker.conf +++ b/.env.broker.conf @@ -1,5 +1,5 @@ SECRET_KEY=dev WORKING_DIRECTORY=/scratch-path -FQDN=analyst.localhost +FQDN=broker.localhost PYTHONPATH=/home/specify/flask_app diff --git a/sphinx/about/install_run_notes.rst b/sphinx/about/install_run_notes.rst index e5d5bb00..5b64decc 100644 --- a/sphinx/about/install_run_notes.rst +++ b/sphinx/about/install_run_notes.rst @@ -6,19 +6,19 @@ Contains * Specify Network API services - * Tools/classes for broker, including + * Tools/classes for broker, including - * Flask application for individual API endpoints and frontend - * classes for Provider API connectors - * standardized API service output (s2n) + * Flask application for individual API endpoints and frontend + * classes for Provider API connectors + * standardized API service output (s2n) - * Tools/classes for analyst, including + * Tools/classes for analyst, including - * AWS scripts and - * Classes for use on EC2 or other AWS resources - * geotools for geospatial intersection/annotations - * aggregation, summary tools for writing tabular summaries - * + * AWS scripts and + * Classes for use on EC2 or other AWS resources + + * geotools for geospatial intersection/annotations + * aggregation, summary tools for writing tabular summaries Deployment =================================== @@ -32,20 +32,18 @@ To run the containers, generate `fullchain.pem` and `privkey.pem` (certificate and the private key) using Let's Encrypt and link these files in `./sp_network/config/`. 
While in development, generate self-signed certificates then link them in -~/git/sp_network/config/ directory for this project: +~/git/sp_network/config/ directory for this project:: -```zsh -$ mkdir ~/certificates + $ mkdir ~/certificates -openssl req \ + openssl req \ -x509 -sha256 -nodes -newkey rsa:2048 -days 365 \ -keyout ~/certificates/privkey.pem \ -out ~/certificates/fullchain.pem -$ cd ~/git/sp_network/config -$ ln -s ~/certificates/privkey.pem -$ ln -s ~/certificates/fullchain.pem -``` + $ cd ~/git/sp_network/config + $ ln -s ~/certificates/privkey.pem + $ ln -s ~/certificates/fullchain.pem To run either the production or the development containers with HTTPS support, generate `fullchain.pem` and `privkey.pem` (certificate and the private @@ -61,54 +59,55 @@ TLS/SSL using Certificate Authority (CA) * Stop apache service * request a certificate for the domain -```commandline -ubuntu@ip-172-31-86-62:~$ sudo systemctl stop apache2 -ubuntu@ip-172-31-86-62:~$ sudo certbot certonly -v -Saving debug log to /var/log/letsencrypt/letsencrypt.log - -How would you like to authenticate with the ACME CA? -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -1: Spin up a temporary webserver (standalone) -2: Place files in webroot directory (webroot) -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Select the appropriate number [1-2] then [enter] (press 'c' to cancel): 1 -Plugins selected: Authenticator standalone, Installer None -Please enter the domain name(s) you would like on your certificate (comma and/or -space separated) (Enter 'c' to cancel): broker-dev.spcoco.org analyst-dev.spcoco.org -Requesting a certificate for broker-dev.spcoco.org and analyst-dev.spcoco.org -Performing the following challenges: -http-01 challenge for broker-dev.spcoco.org -Waiting for verification... -Cleaning up challenges - -Successfully received certificate. -Certificate is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/fullchain.pem -Key is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/privkey.pem -This certificate expires on 2023-10-18. -These files will be updated when the certificate renews. -Certbot has set up a scheduled task to automatically renew this certificate in the background. - -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -If you like Certbot, please consider supporting our work by: - * Donating to ISRG / Let's Encrypt: https://letsencrypt.org/donate - * Donating to EFF: https://eff.org/donate-le -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -ubuntu@ip-172-31-86-62:~$ -``` +:: + + ubuntu@ip-172-31-86-62:~$ sudo systemctl stop apache2 + ubuntu@ip-172-31-86-62:~$ sudo certbot certonly -v + Saving debug log to /var/log/letsencrypt/letsencrypt.log + + How would you like to authenticate with the ACME CA? 
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + 1: Spin up a temporary webserver (standalone) + 2: Place files in webroot directory (webroot) + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + Select the appropriate number [1-2] then [enter] (press 'c' to cancel): 1 + Plugins selected: Authenticator standalone, Installer None + Please enter the domain name(s) you would like on your certificate (comma and/or + space separated) (Enter 'c' to cancel): broker-dev.spcoco.org analyst-dev.spcoco.org + Requesting a certificate for broker-dev.spcoco.org and analyst-dev.spcoco.org + Performing the following challenges: + http-01 challenge for broker-dev.spcoco.org + Waiting for verification... + Cleaning up challenges + + Successfully received certificate. + Certificate is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/fullchain.pem + Key is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/privkey.pem + This certificate expires on 2023-10-18. + These files will be updated when the certificate renews. + Certbot has set up a scheduled task to automatically renew this certificate in the background. + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + If you like Certbot, please consider supporting our work by: + * Donating to ISRG / Let's Encrypt: https://letsencrypt.org/donate + * Donating to EFF: https://eff.org/donate-le + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + ubuntu@ip-172-31-86-62:~$ + * as superuser, link the newly created fullchain.pem and privkey.pem files from the letsencrypt live to the project/config directory * change the owner so that they can be used in Docker containers -```commandline -$ sudo su - -# cp -p /etc/letsencrypt/live/dev.spcoco.org/* /home/ubuntu/certificates/ -# chown ubuntu:ubuntu /home/ubuntu/certificates/* -# exit -$ cd ~/git/sp_network/config -$ ln -s ~/certificates/fullchain.pem -$ ln -s ~/certificates/privkey.pem -``` +:: + + $ sudo su - + # cp -p /etc/letsencrypt/live/dev.spcoco.org/* /home/ubuntu/certificates/ + # chown ubuntu:ubuntu /home/ubuntu/certificates/* + # exit + $ cd ~/git/sp_network/config + $ ln -s ~/certificates/fullchain.pem + $ ln -s ~/certificates/privkey.pem Renew Certbot SSL certificates ......................................... @@ -122,19 +121,19 @@ Amazon EC2 containers do not need apache running, certbot runs its own temp web Test with https://broker.spcoco.org/api/v1/frontend/?occid=01493b05-4310-4f28-9d81-ad20860311f3 -```commandline -$ sudo certbot certificates -$ sudo docker compose stop -$ sudo su - -# certbot renew -# cp -p /etc/letsencrypt/live/dev.spcoco.org/* /home/ubuntu/certificates/ -# chown ubuntu:ubuntu /home/ubuntu/certificates/* -# exit -$ ls -lahtr ~/git/sp_network/config - -$ sudo docker system prune --all --volumes -$ sudo docker compose up -d -``` +:: + + $ sudo certbot certificates + $ sudo docker compose stop + $ sudo su - + # certbot renew + # cp -p /etc/letsencrypt/live/dev.spcoco.org/* /home/ubuntu/certificates/ + # chown ubuntu:ubuntu /home/ubuntu/certificates/* + # exit + $ ls -lahtr ~/git/sp_network/config + + $ sudo docker system prune --all --volumes + $ sudo docker compose up -d TODO: SSL through Amazon ......................................... 
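One common route on Ubuntu is sketched below; it is only an example, and the official Docker Engine documentation remains the supported procedure::

    $ curl -fsSL https://get.docker.com -o get-docker.sh
    $ sudo sh get-docker.sh
    # allow the login user to run docker without sudo
    $ sudo usermod -aG docker $USER
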
@@ -151,12 +150,11 @@ Install Install dependencies --------------------------------------- -Certbot: +Certbot:: + + $ sudo apt update + $ sudo apt install certbot -```commandline -$ sudo apt update -$ sudo apt install certbot -``` Install Docker --------------------------------------- @@ -170,34 +168,33 @@ Install repo from Github * generate an SSH key for communicating with Github * Add SSH key to agent on local machine -```commandline -$ ssh-keygen -t rsa -b 4096 -C "aimee.stewart@ku.edu" -$ eval "$(ssh-agent -s)" -$ ssh-add ~/.ssh/id_rsa -$ cat .ssh/id_rsa.pub -``` +:: + + $ ssh-keygen -t rsa -b 4096 -C "aimee.stewart@ku.edu" + $ eval "$(ssh-agent -s)" + $ ssh-add ~/.ssh/id_rsa + $ cat .ssh/id_rsa.pub + * Add the SSH to Github by printing to console, copying, adding in Github profile * clone the repository -```commandline -$ cat .ssh/id_rsa.pub -$ # -$ cd ~/git -$ git clone git@github.com:specifysystems/sp_network.git -$ git checkout -``` +:: + $ cat .ssh/id_rsa.pub + $ # + $ cd ~/git + $ git clone git@github.com:specifysystems/sp_network.git + $ git checkout Install certificates into config directory ------------------------------------------------------- * Link the certificates in the repo config directory -```commandline -$ cd ~/git/sp_network -$ cd config -$ ln -s ~/certificates/fullchain1.pem -$ ln -s ~/certificates/privkey1.pem -``` +:: + $ cd ~/git/sp_network + $ cd config + $ ln -s ~/certificates/fullchain1.pem + $ ln -s ~/certificates/privkey1.pem Testing --------------------------------------- @@ -206,7 +203,9 @@ On a development server, check the following URL endpoints: * Index page: https://localhost * Broker: + * https://localhost/api/v1/ + * https://localhost/api/v1/badge/ * https://localhost/api/v1/name/ * https://localhost/api/v1/occ/ @@ -224,12 +223,11 @@ Environment variables set in the Docker containers from the .env.broker.conf and .env.broker.conf files are necessary to inform the host machine/container of its FQDN. **Temp solution:** Export these variables to the local environment in the python -virtual environment activation script (bin/activate) script. +virtual environment activation script (bin/activate) script:: + + export SECRET_KEY="dev" + export WORKING_DIRECTORY="scratch-path" -```zsh -export SECRET_KEY="dev" -export WORKING_DIRECTORY="scratch-path" -``` **Specify Network** homepage is now available at https://localhost/ and http://localhost. @@ -249,20 +247,17 @@ Troubleshooting For webserver errors, check logs of nginx container:: -```commandline -$ sudo docker logs --tail 1000 sp_network-nginx-1 -$ sudo docker logs --tail 1000 sp_network-broker-1 -``` + $ sudo docker logs --tail 1000 sp_network-nginx-1 + $ sudo docker logs --tail 1000 sp_network-broker-1 + Error: "... cannot import name 'url_quote' from 'werkzeug.urls'" in broker container Fix: Add Werkzeug==2.2.2 to requirements.txt to ensure it does not use 3.0+ -Then stop/rebuild/start: +Then stop/rebuild/start:: -```commandline -$ sudo docker compose stop -$ sudo docker system prune --all --volumes -$ sudo docker compose up -d -``` + $ sudo docker compose stop + $ sudo docker system prune --all --volumes + $ sudo docker compose up -d Docker manipulation ================================= @@ -272,6 +267,7 @@ Edit the docker environment files * Add the container domain name to the files .env.broker.conf and .env.analyst.conf * Change the FQDN value to the fully qualified domain name of the server. 
+ * If this is a local testing deployment, it will be "localhost" * For a development or production server it will be the FQDN with correct subdomain for each container, i.e FQDN=broker.spcoco.org in .env.broker.conf and @@ -280,9 +276,9 @@ Edit the docker environment files Run the containers (production) ------------------------------------------- -```zsh -sudo docker compose -f docker-compose.yml up -d -``` +Start the containers with the Docker composition file:: + + sudo docker compose -f docker-compose.yml up -d Specify Network is now available at [https://localhost/](https://localhost:443) @@ -292,11 +288,9 @@ Run the containers (development) Note that the development compose file, docker-compose.development.yml, is referenced first on the command line. It has elements that override those defined in the -general compose file, docker-compose.yml. +general compose file, docker-compose.yml:: -```zsh -sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up -``` + sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up Flask has hot-reload enabled. @@ -305,32 +299,25 @@ Rebuild/restart ------------------------------------------- To delete all containers, images, networks and volumes, stop any running -containers: +containers:: + + sudo docker compose stop -```zsh -sudo docker compose stop -``` -And run this command (which ignores running container): +And run this command (which ignores running container):: -```zsh -sudo docker system prune --all --volumes -``` + sudo docker system prune --all --volumes -Then rebuild/restart: +Then rebuild/restart:: -```zsh -sudo docker compose up -d -``` + sudo docker compose up -d Examine container ------------------------------------------- -To examine containers at a shell prompt: +To examine containers at a shell prompt:: -```zsh -sudo docker exec -it sp_network-nginx-1 /bin/sh -``` + sudo docker exec -it sp_network-nginx-1 /bin/sh Error port in use: "Error starting userland proxy: listen tcp4 0.0.0.0:443: bind: address already in use" @@ -338,24 +325,22 @@ Error port in use: See what else is using the port. In my case apache was started on reboot. Bring down all docker containers, shut down httpd, bring up docker. -```zsh -lsof -i -P -n | grep 443 -sudo docker compose down -sudo systemctl stop httpd -sudo docker compose up -d -``` +:: + lsof -i -P -n | grep 443 + sudo docker compose down + sudo systemctl stop httpd + sudo docker compose up -d + Dev Environment ========================== -* Create a virtual environment and install python libs there +* Create a virtual environment and install python libs there:: -```commandline -$ cd ~/git/sp_network -$ python3 -m venv venv -$ . venv/bin/activate -$ pip install -r requirements.txt -``` + $ cd ~/git/sp_network + $ python3 -m venv venv + $ . venv/bin/activate + $ pip install -r requirements.txt Configure Debugger in local IDE @@ -370,19 +355,17 @@ Debug To run flask in debug mode, first set up Flask environment, then start the flask application (in this case, main in flask_app.broker.routes.py). Only one resource (aka broker or analyst) at a time can be tested in this way. -Reset the FLASK_APP variable to test an alternate resource. 
- -** the broker frontend can NOT be tested this way, as it depends on a docker volume +Reset the FLASK_APP variable to test an alternate resource:: -```zsh -export FLASK_ENV=development -export FLASK_APP=flask_app.broker.routes:app -# or -# export FLASK_APP=flask_app.analyst.routes:app -flask run -``` + export FLASK_ENV=development + export FLASK_APP=flask_app.broker.routes:app + # or + # export FLASK_APP=flask_app.analyst.routes:app + flask run * `broker` container is running `debugpy` on localhost, port `5000` +* ** the broker frontend can NOT be tested this way, as it depends on a docker volume + * Test with http, no https!! http://localhost:5000/api/v1/name?namestr=Notemigonus%20crysoleucas%20(Mitchill,%201814) @@ -394,40 +377,33 @@ Troubleshooting pip errors with SSL ------------------------------------------- - * add trusted-host option at command line +* add trusted-host option at command line:: + + pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org ~/git/lmpy -```commandline -pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org ~/git/lmpy -``` - * for processes that call pip, create a pip configuration file , then export as - PIP_CONFIG_FILE environment variable in .bashrc +* for processes that call pip, create a pip configuration file , then export as + PIP_CONFIG_FILE environment variable in .bashrc:: -```commandline -# ~/pip.conf -[install] -trusted-host = pypi.python.org - pypi.org - files.pythonhosted.org + # ~/pip.conf + [install] + trusted-host = pypi.python.org + pypi.org + files.pythonhosted.org -# ~/.bashrc -export PIP_CONFIG_FILE ~/pip.conf -``` + # ~/.bashrc + export PIP_CONFIG_FILE ~/pip.conf pre-commit errors with self-signed certificate --------------------------------------------------------- - * turn off verification (but this leaves you open to man-in-the-middle attacks) +* turn off verification (but this leaves you open to man-in-the-middle attacks):: -```commandline -git config --global http.sslVerify false -``` + git config --global http.sslVerify false - * turn on again with + * turn on again with:: -```commandline -git config --global http.sslVerify true + git config --global http.sslVerify true -``` pre-commit build errors -------------------------------------- diff --git a/sphinx/aws/aws_workflow.rst b/sphinx/aws/aws_workflow.rst index 6ed902f9..33100d8a 100644 --- a/sphinx/aws/aws_workflow.rst +++ b/sphinx/aws/aws_workflow.rst @@ -4,7 +4,7 @@ AWS Workflow Reference =========================================================== - * Stored procedures in rs_stored_procedures.sql +* Stored procedures in rs_stored_procedures.sql Steps @@ -14,9 +14,9 @@ Steps *********************************************************** * Redshift: Subset GBIF data from Amazon Registry of Open Data (AWS ODR) for processing - * First run rs_create_stored_procedures.sql to create procedures for the subset script. - * Next run rs_subset_gbif.sql to subset the data - * + + * First run rs_create_stored_procedures.sql to create procedures for the subset script. 
+ * Next run rs_subset_gbif.sql to subset the data 1.5 TODO *********************************************************** diff --git a/sphinx/misc/debugging.rst b/sphinx/misc/debugging.rst index 24659dd4..92c20cf6 100644 --- a/sphinx/misc/debugging.rst +++ b/sphinx/misc/debugging.rst @@ -16,6 +16,7 @@ Local debugging of flask app ```zsh export FLASK_ENV=development export FLASK_APP=flask_app.broker.routes +export FLASK_APP=flask_app.analyst.routes flask run ``` * With either Analyst or Broker, the development port will be 5000. Connect to @@ -35,3 +36,44 @@ flask run * The frontend endpoint cannot be tested this way, as it depends on frontend **webpack-output** and **static-files** to be mounted as docker volumes. + +Local debugging of Docker +============================================= + +More info in about/install_run_notes + + +Run Docker containers (development) +------------------------------------------- + +Note that the development compose file, docker-compose.development.yml, is referenced +first on the command line. It has elements that override those defined in the +general compose file, docker-compose.yml:: + + sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up + +Flask has hot-reload enabled. + +Rebuild/restart +------------------------------------------- + +To delete all containers, images, networks and volumes, stop any running +containers:: + + sudo docker compose stop + + +And run this command (which ignores running container):: + + sudo docker system prune --all --volumes + +Then rebuild/restart:: + + sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up + +Examine container +------------------------------------------- + +To examine containers at a shell prompt:: + + sudo docker exec -it sp_network-nginx-1 /bin/sh From 1b7087344eeb9ad33311eaeb3668c4d5f114284c Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 19 Mar 2024 16:01:31 -0500 Subject: [PATCH 33/81] debug notes --- sphinx/misc/docker.rst | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 sphinx/misc/docker.rst diff --git a/sphinx/misc/docker.rst b/sphinx/misc/docker.rst new file mode 100644 index 00000000..e69de29b From b10071aca41b8de510ec65ed1dad10e6be05311d Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 19 Mar 2024 16:20:56 -0500 Subject: [PATCH 34/81] debug notes --- sphinx/about/install_run_notes.rst | 6 ++ sphinx/misc/docker.rst | 107 +++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/sphinx/about/install_run_notes.rst b/sphinx/about/install_run_notes.rst index 5b64decc..be249905 100644 --- a/sphinx/about/install_run_notes.rst +++ b/sphinx/about/install_run_notes.rst @@ -196,6 +196,12 @@ Install certificates into config directory $ ln -s ~/certificates/fullchain1.pem $ ln -s ~/certificates/privkey1.pem +Direct Docker to correct FQDN +------------------------------------ + +Edit FQDN value in env.conf (or .env.analyst.conf and .env.broker.conf) to actual FQDN + + Testing --------------------------------------- On a development server, check the following URL endpoints: diff --git a/sphinx/misc/docker.rst b/sphinx/misc/docker.rst index e69de29b..0f43b3f6 100644 --- a/sphinx/misc/docker.rst +++ b/sphinx/misc/docker.rst @@ -0,0 +1,107 @@ +Docker Troubleshooting +############################## + +Out of space error +************************ + +Problem +------------------ + +Running `certbot certificates` failed because the EC2 instance running Docker +containers for Specify Network development shows 
disk full:: + + root@ip-172-31-86-62:~# df -h + Filesystem Size Used Avail Use% Mounted on + /dev/root 7.6G 7.6G 0 100% / + tmpfs 483M 0 483M 0% /dev/shm + tmpfs 194M 21M 173M 11% /run + tmpfs 5.0M 0 5.0M 0% /run/lock + /dev/xvda15 105M 6.1M 99M 6% /boot/efi + overlay 7.6G 7.6G 0 100% /var/lib/docker/overlay2/82d82cc5eb13260207b94443934c7318af651ea96a5fcd88c579f23224ba099d/merged + overlay 7.6G 7.6G 0 100% /var/lib/docker/overlay2/cb0d78289131b3925e21d7eff2d03c79fe432eeba2d69a33c6134db40dc3caf3/merged + overlay 7.6G 7.6G 0 100% /var/lib/docker/overlay2/3bd6d12b36e746f9c74227b6ac9d928a3179d8b604a9dea4fd88625eab84be1f/merged + tmpfs 97M 4.0K 97M 1% /run/user/1000 + + +Research +------------------ + +The disk is small, but the culprit is /var/lib/docker/overlay2 + +Some strategies at: +https://forums.docker.com/t/some-way-to-clean-up-identify-contents-of-var-lib-docker-overlay/30604/19 + +Actual disk usage is correctly reported here (unlike some of the use cases above), so +for now, clean it all out by stopping, pruning, removing images, killing the overlay2 +directory, recreating the overlay2 directory, changing permissions, then rebuilding +and restarting the docker image:: + + $ sudo docker compose stop + $ sudo docker system prune --all --volumes + $ sudo docker image ls + REPOSITORY TAG IMAGE ID CREATED SIZE + e6bf776fc762 2 months ago 1.43GB + 0ece9b23b9b3 2 months ago 108MB + 23e4dc1f7809 2 months ago 108MB + 529b5644c430 4 months ago 42.6MB + + $ sudo docker image rm + $ sudo du -skh /var/lib/docker/overlay2 + 1.2G /var/lib/docker/overlay2 + + $ sudo rm -rf /var/lib/docker/overlay2 + $ df -h + Filesystem Size Used Avail Use% Mounted on + /dev/root 7.6G 4.9G 2.8G 65% / + tmpfs 483M 0 483M 0% /dev/shm + tmpfs 194M 884K 193M 1% /run + tmpfs 5.0M 0 5.0M 0% /run/lock + /dev/xvda15 105M 6.1M 99M 6% /boot/efi + tmpfs 97M 4.0K 97M 1% /run/user/1000 + + $ sudo mkdir /var/lib/docker/overlay2 + $ sudo ls -lahtr /var/lib/docker/overlay2 + total 8.0K + drwx--x--- 12 root root 4.0K Mar 19 20:20 .. + drwxr-xr-x 2 root root 4.0K Mar 19 20:20 . + + $ sudo chmod 710 /var/lib/docker/overlay2 + total 8.0K + drwx--x--- 12 root root 4.0K Mar 19 20:20 .. + drwx--x--- 2 root root 4.0K Mar 19 20:20 . + + + + +Then uninstall docker (previously installed from docker repository noted in +about/install_run_notes), update apt repositories, re-install, reboot:: + + $ sudo apt list docker --installed + Listing... Done + docker/jammy 1.5-2 all + $ sudo apt-get update + $ sudo apt remove docker + ... + $ sudo apt install docker + ... + $ sudo shutdown -r now + +Apparently, ubuntu comes with a docker install, not removed by apt:: + + $ dpkg -l | grep -i docker + ii docker-buildx-plugin 0.10.4-1~ubuntu.22.04~jammy amd64 Docker Buildx cli plugin. + ii docker-ce 5:23.0.4-1~ubuntu.22.04~jammy amd64 Docker: the open-source application container engine + ii docker-ce-cli 5:23.0.4-1~ubuntu.22.04~jammy amd64 Docker CLI: the open-source application container engine + ii docker-ce-rootless-extras 5:23.0.4-1~ubuntu.22.04~jammy amd64 Rootless support for Docker. + ii docker-compose-plugin 2.17.2-1~ubuntu.22.04~jammy amd64 Docker Compose (V2) plugin for the Docker CLI. 
+ ii wmdocker 1.5-2 amd64 System tray for KDE3/GNOME2 docklet applications + + $ sudo sudo apt-get purge -y docker-buildx-plugin docker-ce docker-ce-cli docker-compose-plugin docker-ce-rootless-extras + $ sudo apt-get autoremove -y --purge docker-buildx-plugin docker-ce docker-ce-cli docker-compose-plugin docker-ce-rootless-extras + $ sudo rm -rf /var/lib/docker + $ sudo groupdel docker + $ sudo rm -rf /var/run/docker.sock + +Then rebuild/restart docker:: + + $ sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up From c5f8ba7aef8aa3382d5566c6f8df1f4d0e5a5d3d Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 20 Mar 2024 14:10:28 -0500 Subject: [PATCH 35/81] extension for easy internal links bw docs --- sphinx/conf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sphinx/conf.py b/sphinx/conf.py index 96297158..1f14af2c 100644 --- a/sphinx/conf.py +++ b/sphinx/conf.py @@ -28,8 +28,9 @@ 'sphinx_rtd_theme', # 'autoapi.extension', 'myst_parser', # For MD support - ] - + # for internal links + 'sphinx.ext.autosectionlabel', +] templates_path = ['_templates'] exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] From a8c328c69ae3f4c01aef1c36e5e5b7362a3bb125 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 20 Mar 2024 14:50:28 -0500 Subject: [PATCH 36/81] docs --- sphinx/about/install_run_notes.rst | 204 +------------------------ sphinx/misc/docker.rst | 229 ++++++++++++++++++++--------- sphinx/misc/ssl_certificates.rst | 176 +++++++++++++++------- 3 files changed, 287 insertions(+), 322 deletions(-) diff --git a/sphinx/about/install_run_notes.rst b/sphinx/about/install_run_notes.rst index be249905..bf891443 100644 --- a/sphinx/about/install_run_notes.rst +++ b/sphinx/about/install_run_notes.rst @@ -23,127 +23,6 @@ Contains Deployment =================================== -SSL ------------------------------------ - -Local self-signed certificates -......................................... -To run the containers, generate `fullchain.pem` and `privkey.pem` (certificate -and the private key) using Let's Encrypt and link these files in `./sp_network/config/`. - -While in development, generate self-signed certificates then link them in -~/git/sp_network/config/ directory for this project:: - - $ mkdir ~/certificates - - openssl req \ - -x509 -sha256 -nodes -newkey rsa:2048 -days 365 \ - -keyout ~/certificates/privkey.pem \ - -out ~/certificates/fullchain.pem - - $ cd ~/git/sp_network/config - $ ln -s ~/certificates/privkey.pem - $ ln -s ~/certificates/fullchain.pem - -To run either the production or the development containers with HTTPS -support, generate `fullchain.pem` and `privkey.pem` (certificate and the private -key) using Let's Encrypt, link these files in the `./config/` directory. -Full instructions in the docs/aws-steps.rst page, under `Set up TLS/SSL` - -Modify the `FQDN` environment variable in `.env.conf` as needed. - -TLS/SSL using Certificate Authority (CA) -.................................................. - -* Make sure that DNS has propogated for domain for SSL -* Stop apache service -* request a certificate for the domain - -:: - - ubuntu@ip-172-31-86-62:~$ sudo systemctl stop apache2 - ubuntu@ip-172-31-86-62:~$ sudo certbot certonly -v - Saving debug log to /var/log/letsencrypt/letsencrypt.log - - How would you like to authenticate with the ACME CA? 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 1: Spin up a temporary webserver (standalone) - 2: Place files in webroot directory (webroot) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Select the appropriate number [1-2] then [enter] (press 'c' to cancel): 1 - Plugins selected: Authenticator standalone, Installer None - Please enter the domain name(s) you would like on your certificate (comma and/or - space separated) (Enter 'c' to cancel): broker-dev.spcoco.org analyst-dev.spcoco.org - Requesting a certificate for broker-dev.spcoco.org and analyst-dev.spcoco.org - Performing the following challenges: - http-01 challenge for broker-dev.spcoco.org - Waiting for verification... - Cleaning up challenges - - Successfully received certificate. - Certificate is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/fullchain.pem - Key is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/privkey.pem - This certificate expires on 2023-10-18. - These files will be updated when the certificate renews. - Certbot has set up a scheduled task to automatically renew this certificate in the background. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - If you like Certbot, please consider supporting our work by: - * Donating to ISRG / Let's Encrypt: https://letsencrypt.org/donate - * Donating to EFF: https://eff.org/donate-le - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ubuntu@ip-172-31-86-62:~$ - - -* as superuser, link the newly created fullchain.pem and privkey.pem files from the - letsencrypt live to the project/config directory -* change the owner so that they can be used in Docker containers - -:: - - $ sudo su - - # cp -p /etc/letsencrypt/live/dev.spcoco.org/* /home/ubuntu/certificates/ - # chown ubuntu:ubuntu /home/ubuntu/certificates/* - # exit - $ cd ~/git/sp_network/config - $ ln -s ~/certificates/fullchain.pem - $ ln -s ~/certificates/privkey.pem - -Renew Certbot SSL certificates -......................................... - -SSL certificates are served from the instance (AWS EC2), and need port 80 to be renewed. -These are administered by Letsencrypt using Certbot and are only valid for 90 days at -a time. When it is time for a renewal (approx every 60 days), bring the docker -containers down. Renew the certificates, then bring the containers up again. - -Amazon EC2 containers do not need apache running, certbot runs its own temp web server. - -Test with https://broker.spcoco.org/api/v1/frontend/?occid=01493b05-4310-4f28-9d81-ad20860311f3 - -:: - - $ sudo certbot certificates - $ sudo docker compose stop - $ sudo su - - # certbot renew - # cp -p /etc/letsencrypt/live/dev.spcoco.org/* /home/ubuntu/certificates/ - # chown ubuntu:ubuntu /home/ubuntu/certificates/* - # exit - $ ls -lahtr ~/git/sp_network/config - - $ sudo docker system prune --all --volumes - $ sudo docker compose up -d - -TODO: SSL through Amazon -......................................... 
- -* Create Elastic IP address for EC2 instance -* Request a public certificate through Certificate Manager (ACM) - * Choose DNS validation - * Add tags sp_network, dev or prod, others - - Install ====================================== @@ -185,16 +64,10 @@ Install repo from Github $ git clone git@github.com:specifysystems/sp_network.git $ git checkout -Install certificates into config directory -------------------------------------------------------- - -* Link the certificates in the repo config directory +SSL +----------------------------------- +:ref:`Specify Network SSL certificates` -:: - $ cd ~/git/sp_network - $ cd config - $ ln -s ~/certificates/fullchain1.pem - $ ln -s ~/certificates/privkey1.pem Direct Docker to correct FQDN ------------------------------------ @@ -265,77 +138,10 @@ Then stop/rebuild/start:: $ sudo docker system prune --all --volumes $ sudo docker compose up -d -Docker manipulation +Docker ================================= -Edit the docker environment files -------------------------------------------- - -* Add the container domain name to the files .env.broker.conf and .env.analyst.conf -* Change the FQDN value to the fully qualified domain name of the server. - - * If this is a local testing deployment, it will be "localhost" - * For a development or production server it will be the FQDN with correct subdomain - for each container, i.e FQDN=broker.spcoco.org in .env.broker.conf and - FQDN=analyst.spcoco.org in .env.analyst.conf - -Run the containers (production) -------------------------------------------- - -Start the containers with the Docker composition file:: - - sudo docker compose -f docker-compose.yml up -d - -Specify Network is now available at [https://localhost/](https://localhost:443) - - -Run the containers (development) -------------------------------------------- - -Note that the development compose file, docker-compose.development.yml, is referenced -first on the command line. It has elements that override those defined in the -general compose file, docker-compose.yml:: - - sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up - -Flask has hot-reload enabled. - - -Rebuild/restart -------------------------------------------- - -To delete all containers, images, networks and volumes, stop any running -containers:: - - sudo docker compose stop - - -And run this command (which ignores running container):: - - sudo docker system prune --all --volumes - -Then rebuild/restart:: - - sudo docker compose up -d - -Examine container -------------------------------------------- - -To examine containers at a shell prompt:: - - sudo docker exec -it sp_network-nginx-1 /bin/sh - -Error port in use: -"Error starting userland proxy: listen tcp4 0.0.0.0:443: bind: address already in use" - -See what else is using the port. In my case apache was started on reboot. Bring down -all docker containers, shut down httpd, bring up docker. 
- -:: - lsof -i -P -n | grep 443 - sudo docker compose down - sudo systemctl stop httpd - sudo docker compose up -d +More info at :ref:`Docker` Dev Environment diff --git a/sphinx/misc/docker.rst b/sphinx/misc/docker.rst index 0f43b3f6..074bb1e4 100644 --- a/sphinx/misc/docker.rst +++ b/sphinx/misc/docker.rst @@ -1,10 +1,85 @@ -Docker Troubleshooting +Docker ############################## -Out of space error -************************ +Standard manipulation +================================= -Problem +Edit the docker environment files +------------------------------------------- + +* Add the container domain name to the files .env.broker.conf and .env.analyst.conf +* Change the FQDN value to the fully qualified domain name of the server. + + * If this is a local testing deployment, it will be "localhost" + * For a development or production server it will be the FQDN with correct subdomain + for each container, i.e FQDN=broker.spcoco.org in .env.broker.conf and + FQDN=analyst.spcoco.org in .env.analyst.conf + +Run the containers (production) +------------------------------------------- + +Start the containers with the Docker composition file:: + + sudo docker compose -f docker-compose.yml up -d + +Specify Network is now available at [https://localhost/](https://localhost:443) + + +Run the containers (development) +------------------------------------------- + +Note that the development compose file, docker-compose.development.yml, is referenced +first on the command line. It has elements that override those defined in the +general compose file, docker-compose.yml:: + + sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up + +Flask has hot-reload enabled. + + +Rebuild/restart +------------------------------------------- + +To delete all containers, images, networks and volumes, stop any running +containers:: + + sudo docker compose stop + + +And run this command (which ignores running container):: + + sudo docker system prune --all --volumes + +Then rebuild/restart:: + + sudo docker compose up -d + # or + sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up + +Examine container +------------------------------------------- + +To examine containers at a shell prompt:: + + sudo docker exec -it sp_network-nginx-1 /bin/sh + +Error port in use: +"Error starting userland proxy: listen tcp4 0.0.0.0:443: bind: address already in use" + +See what else is using the port. In my case apache was started on reboot. Bring down +all docker containers, shut down httpd, bring up docker. 
+ +:: + lsof -i -P -n | grep 443 + sudo docker compose down + sudo systemctl stop httpd + sudo docker compose up -d + + +Troubleshooting +================================= + +Out of Space Problem ------------------ Running `certbot certificates` failed because the EC2 instance running Docker @@ -22,86 +97,98 @@ containers for Specify Network development shows disk full:: overlay 7.6G 7.6G 0 100% /var/lib/docker/overlay2/3bd6d12b36e746f9c74227b6ac9d928a3179d8b604a9dea4fd88625eab84be1f/merged tmpfs 97M 4.0K 97M 1% /run/user/1000 - -Research ------------------- - The disk is small, but the culprit is /var/lib/docker/overlay2 Some strategies at: https://forums.docker.com/t/some-way-to-clean-up-identify-contents-of-var-lib-docker-overlay/30604/19 -Actual disk usage is correctly reported here (unlike some of the use cases above), so -for now, clean it all out by stopping, pruning, removing images, killing the overlay2 -directory, recreating the overlay2 directory, changing permissions, then rebuilding -and restarting the docker image:: +Solution: +------------------- - $ sudo docker compose stop - $ sudo docker system prune --all --volumes - $ sudo docker image ls - REPOSITORY TAG IMAGE ID CREATED SIZE - e6bf776fc762 2 months ago 1.43GB - 0ece9b23b9b3 2 months ago 108MB - 23e4dc1f7809 2 months ago 108MB - 529b5644c430 4 months ago 42.6MB - - $ sudo docker image rm - $ sudo du -skh /var/lib/docker/overlay2 - 1.2G /var/lib/docker/overlay2 - - $ sudo rm -rf /var/lib/docker/overlay2 - $ df -h +* The instance was created with a volume of an 8gb default size. +* Stop the instance +* Modify the volume. +* Restart the EC2 instance - ok while the volume is in the optimizing state. +* If the instance does not recognize the extended volume immediately:: + + ubuntu@ip-172-31-91-57:~$ df -h Filesystem Size Used Avail Use% Mounted on - /dev/root 7.6G 4.9G 2.8G 65% / - tmpfs 483M 0 483M 0% /dev/shm - tmpfs 194M 884K 193M 1% /run + /dev/root 7.6G 7.6G 0 100% / + tmpfs 475M 0 475M 0% /dev/shm + tmpfs 190M 11M 180M 6% /run tmpfs 5.0M 0 5.0M 0% /run/lock /dev/xvda15 105M 6.1M 99M 6% /boot/efi - tmpfs 97M 4.0K 97M 1% /run/user/1000 + tmpfs 95M 4.0K 95M 1% /run/user/1000 + ubuntu@ip-172-31-91-57:~$ sudo lsblk + sudo: unable to resolve host ip-172-31-91-57: Temporary failure in name resolution + NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINTS + loop0 7:0 0 24.9M 1 loop /snap/amazon-ssm-agent/7628 + loop1 7:1 0 25.2M 1 loop /snap/amazon-ssm-agent/7983 + loop2 7:2 0 55.7M 1 loop /snap/core18/2796 + loop3 7:3 0 55.7M 1 loop /snap/core18/2812 + loop4 7:4 0 63.9M 1 loop /snap/core20/2105 + loop5 7:5 0 63.9M 1 loop /snap/core20/2182 + loop6 7:6 0 87M 1 loop /snap/lxd/27037 + loop7 7:7 0 87M 1 loop /snap/lxd/27428 + loop8 7:8 0 40.4M 1 loop /snap/snapd/20671 + loop9 7:9 0 39.1M 1 loop /snap/snapd/21184 + xvda 202:0 0 30G 0 disk + ├─xvda1 202:1 0 7.9G 0 part / + ├─xvda14 202:14 0 4M 0 part + └─xvda15 202:15 0 106M 0 part /boot/efi + +* extend the filesystem: + https://docs.aws.amazon.com/ebs/latest/userguide/recognize-expanded-volume-linux.html +* In this case we want to extend xvda1, so:: + + $ sudo growpart /dev/xvda 1 + sudo: unable to resolve host ip-172-31-91-57: Temporary failure in name resolution + mkdir: cannot create directory ‘/tmp/growpart.1496’: No space left on device + FAILED: failed to make temp dir + +* We must free up space to allow extension:: - $ sudo mkdir /var/lib/docker/overlay2 - $ sudo ls -lahtr /var/lib/docker/overlay2 - total 8.0K - drwx--x--- 12 root root 4.0K Mar 19 20:20 .. 
- drwxr-xr-x 2 root root 4.0K Mar 19 20:20 . - - $ sudo chmod 710 /var/lib/docker/overlay2 - total 8.0K - drwx--x--- 12 root root 4.0K Mar 19 20:20 .. - drwx--x--- 2 root root 4.0K Mar 19 20:20 . - - - - -Then uninstall docker (previously installed from docker repository noted in -about/install_run_notes), update apt repositories, re-install, reboot:: - - $ sudo apt list docker --installed - Listing... Done - docker/jammy 1.5-2 all - $ sudo apt-get update - $ sudo apt remove docker - ... - $ sudo apt install docker + $ sudo docker system prune --all --volumes + sudo: unable to resolve host ip-172-31-91-57: Temporary failure in name resolution + WARNING! This will remove: + - all stopped containers + - all networks not used by at least one container + - all volumes not used by at least one container + - all images without at least one container associated to them + - all build cache + + Are you sure you want to continue? [y/N] y + Deleted Containers: + 24768ca767d37f248eff173f13556007468330298329200d533dfa9ca011e409 + 809709d6f8bfa8575009a0d07df16ee78852e9ab3735aa19561ac0dbc0313123 + 64591ed14ecae60721ea367af650683f738636167162f6ed577063582c210aa9 + + Deleted Networks: + sp_network_nginx + + Deleted Images: + untagged: nginx:alpine + untagged: nginx@sha256:a59278fd22a9d411121e190b8cec8aa57b306aa3332459197777583beb728f59 + deleted: sha256:529b5644c430c06553d2e8082c6713fe19a4169c9dc2369cbb960081f52924ff ... - $ sudo shutdown -r now + deleted: sha256:e74dab46dbca98b4be75dfbda3608cd857914b750ecd251c4f1bdbb4ef623c8c -Apparently, ubuntu comes with a docker install, not removed by apt:: + Total reclaimed space: 1.536GB - $ dpkg -l | grep -i docker - ii docker-buildx-plugin 0.10.4-1~ubuntu.22.04~jammy amd64 Docker Buildx cli plugin. - ii docker-ce 5:23.0.4-1~ubuntu.22.04~jammy amd64 Docker: the open-source application container engine - ii docker-ce-cli 5:23.0.4-1~ubuntu.22.04~jammy amd64 Docker CLI: the open-source application container engine - ii docker-ce-rootless-extras 5:23.0.4-1~ubuntu.22.04~jammy amd64 Rootless support for Docker. - ii docker-compose-plugin 2.17.2-1~ubuntu.22.04~jammy amd64 Docker Compose (V2) plugin for the Docker CLI. - ii wmdocker 1.5-2 amd64 System tray for KDE3/GNOME2 docklet applications +* Extend filesystem:: - $ sudo sudo apt-get purge -y docker-buildx-plugin docker-ce docker-ce-cli docker-compose-plugin docker-ce-rootless-extras - $ sudo apt-get autoremove -y --purge docker-buildx-plugin docker-ce docker-ce-cli docker-compose-plugin docker-ce-rootless-extras - $ sudo rm -rf /var/lib/docker - $ sudo groupdel docker - $ sudo rm -rf /var/run/docker.sock + $ sudo growpart /dev/xvda 1 + sudo: unable to resolve host ip-172-31-91-57: Temporary failure in name resolution + CHANGED: partition=1 start=227328 old: size=16549855 end=16777183 new: size=62687199 end=62914527 + $ df -h + Filesystem Size Used Avail Use% Mounted on + /dev/root 7.6G 5.7G 2.0G 75% / + tmpfs 475M 0 475M 0% /dev/shm + tmpfs 190M 18M 173M 10% /run + tmpfs 5.0M 0 5.0M 0% /run/lock + /dev/xvda15 105M 6.1M 99M 6% /boot/efi + tmpfs 95M 4.0K 95M 1% /run/user/1000 -Then rebuild/restart docker:: - $ sudo docker compose -f docker-compose.development.yml -f docker-compose.yml up +* Stop apache2 if running +* Rebuild the docker containers diff --git a/sphinx/misc/ssl_certificates.rst b/sphinx/misc/ssl_certificates.rst index 20e9121d..1ee1878b 100644 --- a/sphinx/misc/ssl_certificates.rst +++ b/sphinx/misc/ssl_certificates.rst @@ -7,55 +7,127 @@ Letsencrypt using Certbot. They are only valid for 90 days at a time. 
TODO: move administration to AWS, and script renewal if necessary - -Renewal procedure -============================================= - -* Change to superuser, then check the validity of your certificates:: - - sudo su - - certbot certificates - -* When it is time for a renewal (approx every 60 days), move to the Specify Network - project directory where Docker was started, and stop the Docker containers:: - - cd /home/ubuntu/git/sp_network - docker compose stop - -* Renew the certificates:: - - certbot renew - -* Move to /etc/letsencrypt/archive/ and find the most recent - certificate names in the directory (certX.pem, chainX.pem, fullchainX.pem, - privkeyX.pem, where X is an integer):: - - cd /etc/letsencrypt/archive/spcoco.org/ - ls -lahtr - -* Copy the new certificates to /home/ubuntu/certificates, changing - the name to cert.pem, chain.pem, fullchain.pem, privkey.pem (no X integer). Then - change the owner from root, to the username (ubuntu):: - - cp cert4.pem /home/ubuntu/certificates/cert.pem - cp chain4.pem /home/ubuntu/certificates/chain.pem - cp fullchain4.pem /home/ubuntu/certificates/fullchain.pem - cp privkey4.pem /home/ubuntu/certificates/privkey.pem - -* Move to the directory with the certificates and change the - owner to ubuntu, then exit superuser:: - - cd /home/ubuntu/certificates - chown ubuntu:ubuntu * - exit - -* Move to the config directory and create symbolic links to the new fullchain.pem - and privkey.pem files:: - - cd /home/ubuntu/git/sp_network/config - ln -s /home/ubuntu/certificates/fullchain.pem - ln -s /home/ubuntu/certificates/privkey.pem - -* Then restart the containers:: - - sudo docker compose up -d +Local self-signed certificates +......................................... +To run the containers, generate `fullchain.pem` and `privkey.pem` (certificate +and the private key) using Let's Encrypt and link these files in `./sp_network/config/`. + +While in development, generate self-signed certificates then link them in +~/git/sp_network/config/ directory for this project:: + + $ mkdir ~/certificates + + openssl req \ + -x509 -sha256 -nodes -newkey rsa:2048 -days 365 \ + -keyout ~/certificates/privkey.pem \ + -out ~/certificates/fullchain.pem + + $ cd ~/git/sp_network/config + $ ln -s ~/certificates/privkey.pem + $ ln -s ~/certificates/fullchain.pem + +To run either the production or the development containers with HTTPS +support, generate `fullchain.pem` and `privkey.pem` (certificate and the private +key) using Let's Encrypt, link these files in the `./config/` directory. +Full instructions in the docs/aws-steps.rst page, under `Set up TLS/SSL` + +Modify the `FQDN` environment variable in `.env.conf` as needed. + +TLS/SSL using Certificate Authority (CA) +.................................................. + +* Make sure that DNS has propogated for domain for SSL +* Stop apache service +* request a certificate for the domain + +:: + + ubuntu@ip-172-31-86-62:~$ sudo systemctl stop apache2 + ubuntu@ip-172-31-86-62:~$ sudo certbot certonly -v + Saving debug log to /var/log/letsencrypt/letsencrypt.log + + How would you like to authenticate with the ACME CA? 
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + 1: Spin up a temporary webserver (standalone) + 2: Place files in webroot directory (webroot) + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + Select the appropriate number [1-2] then [enter] (press 'c' to cancel): 1 + Plugins selected: Authenticator standalone, Installer None + Please enter the domain name(s) you would like on your certificate (comma and/or + space separated) (Enter 'c' to cancel): broker-dev.spcoco.org analyst-dev.spcoco.org + Requesting a certificate for broker-dev.spcoco.org and analyst-dev.spcoco.org + Performing the following challenges: + http-01 challenge for broker-dev.spcoco.org + Waiting for verification... + Cleaning up challenges + + Successfully received certificate. + Certificate is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/fullchain.pem + Key is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/privkey.pem + This certificate expires on 2023-10-18. + These files will be updated when the certificate renews. + Certbot has set up a scheduled task to automatically renew this certificate in the background. + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + If you like Certbot, please consider supporting our work by: + * Donating to ISRG / Let's Encrypt: https://letsencrypt.org/donate + * Donating to EFF: https://eff.org/donate-le + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + ubuntu@ip-172-31-86-62:~$ + + +Install certificates into config directory +------------------------------------------------------- + +* Create a ~/certificates directory to hold certificate files +* as superuser, copy the newly created fullchain.pem and privkey.pem files from the + letsencrypt live +* change the owner so that they can be used in Docker containers +* Link the certificates in the repo config directory + +:: + + $ cd + $ mkdir certificates + $ sudo su - + # cp -p /etc/letsencrypt/live//* /home/ubuntu/certificates/ + # chown ubuntu:ubuntu /home/ubuntu/certificates/* + # exit + $ cd ~/git/sp_network/config + $ ln -s ~/certificates/fullchain.pem + $ ln -s ~/certificates/privkey.pem + +Renew Certbot SSL certificates +......................................... + +SSL certificates are served from the instance (AWS EC2), and need port 80 to be renewed. +These are administered by Letsencrypt using Certbot and are only valid for 90 days at +a time. When it is time for a renewal (approx every 60 days), bring the docker +containers down. Prune the volumes so the new containers and volumes will be created +with the updated certificates. Renew the certificates, then bring the containers up. + +Amazon EC2 containers do not need apache running, certbot runs its own temp web server. + +Test with https://broker.spcoco.org/api/v1/frontend/?occid=01493b05-4310-4f28-9d81-ad20860311f3 + +:: + + $ sudo certbot certificates + $ sudo docker compose stop + $ sudo su - + # certbot renew + # cp -p /etc/letsencrypt/live/spcoco.org/* /home/ubuntu/certificates/ + # chown ubuntu:ubuntu /home/ubuntu/certificates/* + # exit + $ ls -lahtr ~/git/sp_network/config + + $ sudo docker system prune --all --volumes + $ sudo docker compose up -d + +TODO: SSL through Amazon +......................................... 
+ +* Create Elastic IP address for EC2 instance +* Request a public certificate through Certificate Manager (ACM) + * Choose DNS validation + * Add tags sp_network, dev or prod, others From fbb28630724c8c27bcbd33e7db2bcdff19788168 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 20 Mar 2024 15:41:16 -0500 Subject: [PATCH 37/81] remove unused dependencies --- requirements.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index f7a78ccb..4b7aa412 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,4 @@ boto3>=1.34.60 sqlalchemy pandas pandas-sql -pyarrow -s3fs -ggshield \ No newline at end of file +s3fs \ No newline at end of file From ca5ab2aed1e96a4dd8009e67023e57d6d6fd200a Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 20 Mar 2024 15:41:58 -0500 Subject: [PATCH 38/81] call installed python executable --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index a8c33950..9c17826d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,7 +20,7 @@ USER specify COPY --chown=specify:specify ./requirements.txt . -RUN python -m venv venv \ +RUN python3 -m venv venv \ && venv/bin/pip install --no-cache-dir -r ./requirements.txt COPY --chown=specify:specify ./sppy ./sppy From 29c05caf81925352de48441b3f67c9298a74e60b Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 20 Mar 2024 17:09:10 -0500 Subject: [PATCH 39/81] rm duplicate entries for location / --- config/nginx.conf | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/config/nginx.conf b/config/nginx.conf index cab6d732..aeea668d 100644 --- a/config/nginx.conf +++ b/config/nginx.conf @@ -7,7 +7,7 @@ server { server { listen 443 ssl; index index.html; - server_name broker-dev.spcoco.org; + # server_name broker.spcoco.org; ssl_certificate /etc/letsencrypt/fullchain.pem; ssl_certificate_key /etc/letsencrypt/privkey.pem; @@ -31,11 +31,11 @@ server { proxy_set_header Origin "${scheme}://${http_host}"; } - location / { - root /var/www/; - try_files $uri $uri/ = 404; - gzip_static on; - } +# http_host location / { +# root /var/www/; +# try_files $uri $uri/ = 404; +# gzip_static on; +# } location /static/js { root /volumes/webpack-output; @@ -48,11 +48,12 @@ server { rewrite ^/static/(.*)$ /$1 break; gzip_static on; } +} server { listen 443 ssl; index index.html; - server_name analyst-dev.spcoco.org; + # server_name analyst.spcoco.org; ssl_certificate /etc/letsencrypt/fullchain.pem; ssl_certificate_key /etc/letsencrypt/privkey.pem; @@ -76,11 +77,11 @@ server { proxy_set_header Origin "${scheme}://${http_host}"; } - location / { - root /var/www/; - try_files $uri $uri/ = 404; - gzip_static on; - } +# location / { +# root /var/www/; +# try_files $uri $uri/ = 404; +# gzip_static on; +# } location /static/js { root /volumes/webpack-output; From 6f70afe071c1c3e309b8b56945b799312f51889c Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 20 Mar 2024 17:09:48 -0500 Subject: [PATCH 40/81] add missing routes module to application --- docker-compose.development.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.development.yml b/docker-compose.development.yml index 026c7eb6..50f8c4af 100644 --- a/docker-compose.development.yml +++ b/docker-compose.development.yml @@ -21,7 +21,7 @@ services: ports: - "5002:5002" environment: - - FLASK_APP=flask_app.analyst:app + - FLASK_APP=flask_app.analyst.routes:app - FLASK_MANAGE=flask_app.analyst.manage - 
DEBUG_PORT=5002 volumes: From 0b4f61853533ca6fa0f617cb30c14e608488acb4 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 20 Mar 2024 17:37:00 -0500 Subject: [PATCH 41/81] update server_name for each instance --- config/nginx.conf | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/config/nginx.conf b/config/nginx.conf index aeea668d..ef94be1c 100644 --- a/config/nginx.conf +++ b/config/nginx.conf @@ -4,10 +4,11 @@ server { return 301 https://$host$request_uri; } +# Broker server { listen 443 ssl; index index.html; - # server_name broker.spcoco.org; + server_name broker.localhost; ssl_certificate /etc/letsencrypt/fullchain.pem; ssl_certificate_key /etc/letsencrypt/privkey.pem; @@ -31,12 +32,6 @@ server { proxy_set_header Origin "${scheme}://${http_host}"; } -# http_host location / { -# root /var/www/; -# try_files $uri $uri/ = 404; -# gzip_static on; -# } - location /static/js { root /volumes/webpack-output; rewrite ^/static/js/(.*)$ /$1 break; @@ -50,10 +45,11 @@ server { } } +# Analyst server { listen 443 ssl; index index.html; - # server_name analyst.spcoco.org; + server_name analyst.localhost; ssl_certificate /etc/letsencrypt/fullchain.pem; ssl_certificate_key /etc/letsencrypt/privkey.pem; @@ -77,12 +73,6 @@ server { proxy_set_header Origin "${scheme}://${http_host}"; } -# location / { -# root /var/www/; -# try_files $uri $uri/ = 404; -# gzip_static on; -# } - location /static/js { root /volumes/webpack-output; rewrite ^/static/js/(.*)$ /$1 break; From a637192b2d57ab980d59e18a0ef2b87fa77cf396 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 21 Mar 2024 16:05:31 -0500 Subject: [PATCH 42/81] doc --- sphinx/about/install_run_notes.rst | 5 +++-- sphinx/aws/aws-setup.rst | 9 ++++++++ sphinx/misc/ssl_certificates.rst | 36 +++++++++++++++++++++++------- 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/sphinx/about/install_run_notes.rst b/sphinx/about/install_run_notes.rst index bf891443..484b1917 100644 --- a/sphinx/about/install_run_notes.rst +++ b/sphinx/about/install_run_notes.rst @@ -41,7 +41,7 @@ Install Docker Add docker repository, then use apt to install Docker: https://docs.docker.com/engine/install/ubuntu/ -Install repo from Github +Install/Update repo from Github --------------------------------------- * generate an SSH key for communicating with Github @@ -72,7 +72,8 @@ SSL Direct Docker to correct FQDN ------------------------------------ -Edit FQDN value in env.conf (or .env.analyst.conf and .env.broker.conf) to actual FQDN +Edit FQDN value in .env.analyst.conf and .env.broker.conf (referenced by the docker +compose file) and server_name in config/nginx.conf to actual FQDN. Testing diff --git a/sphinx/aws/aws-setup.rst b/sphinx/aws/aws-setup.rst index 828da48d..b367bd92 100644 --- a/sphinx/aws/aws-setup.rst +++ b/sphinx/aws/aws-setup.rst @@ -1,6 +1,15 @@ Authentication #################### +EC2 instance creation +=========================================================== + +* Instance type t3.small (2gb RAM). 
+* Ubuntu Server 22.04 LTS, SSD Volume Type (free tier eligible), x86 architecture +* Security Group: launch-wizard-1 +* 30 Gb General Purpose SSD (gp2) +* For dev, Spot instance (in Advanced options) + For programmatic access to S3 =========================================================== Configure AWS credentials either through diff --git a/sphinx/misc/ssl_certificates.rst b/sphinx/misc/ssl_certificates.rst index 1ee1878b..cc83af66 100644 --- a/sphinx/misc/ssl_certificates.rst +++ b/sphinx/misc/ssl_certificates.rst @@ -42,8 +42,8 @@ TLS/SSL using Certificate Authority (CA) :: - ubuntu@ip-172-31-86-62:~$ sudo systemctl stop apache2 - ubuntu@ip-172-31-86-62:~$ sudo certbot certonly -v + $ sudo systemctl stop apache2 + $ sudo certbot certonly -v Saving debug log to /var/log/letsencrypt/letsencrypt.log How would you like to authenticate with the ACME CA? @@ -53,18 +53,39 @@ TLS/SSL using Certificate Authority (CA) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Select the appropriate number [1-2] then [enter] (press 'c' to cancel): 1 Plugins selected: Authenticator standalone, Installer None + Enter email address (used for urgent renewal and security notices) + (Enter 'c' to cancel): aimee.stewart@ku.edu + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + Please read the Terms of Service at + https://letsencrypt.org/documents/LE-SA-v1.3-September-21-2022.pdf. You must + agree in order to register with the ACME server. Do you agree? + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + (Y)es/(N)o: Y + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + Would you be willing, once your first certificate is successfully issued, to + share your email address with the Electronic Frontier Foundation, a founding + partner of the Let's Encrypt project and the non-profit organization that + develops Certbot? We'd like to send you email about our work encrypting the web, + EFF news, campaigns, and ways to support digital freedom. + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + (Y)es/(N)o: N + Account registered. Please enter the domain name(s) you would like on your certificate (comma and/or - space separated) (Enter 'c' to cancel): broker-dev.spcoco.org analyst-dev.spcoco.org - Requesting a certificate for broker-dev.spcoco.org and analyst-dev.spcoco.org + space separated) (Enter 'c' to cancel): dev.spcoco.org, analyst-dev.spcoco.org, broker-dev.spcoco.org + Requesting a certificate for dev.spcoco.org and 2 more domains Performing the following challenges: + http-01 challenge for analyst-dev.spcoco.org http-01 challenge for broker-dev.spcoco.org + http-01 challenge for dev.spcoco.org Waiting for verification... Cleaning up challenges Successfully received certificate. - Certificate is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/fullchain.pem - Key is saved at: /etc/letsencrypt/live/broker-dev.spcoco.org/privkey.pem - This certificate expires on 2023-10-18. + Certificate is saved at: /etc/letsencrypt/live/dev.spcoco.org/fullchain.pem + Key is saved at: /etc/letsencrypt/live/dev.spcoco.org/privkey.pem + This certificate expires on 2024-06-19. These files will be updated when the certificate renews. Certbot has set up a scheduled task to automatically renew this certificate in the background. 
@@ -73,7 +94,6 @@ TLS/SSL using Certificate Authority (CA) * Donating to ISRG / Let's Encrypt: https://letsencrypt.org/donate * Donating to EFF: https://eff.org/donate-le - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ubuntu@ip-172-31-86-62:~$ Install certificates into config directory From c2422f98113251907859813caeafb62bd2c03dab Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 10:51:10 -0500 Subject: [PATCH 43/81] doc --- sphinx/about/install_run_notes.rst | 114 +++++++++++++++++------------ 1 file changed, 68 insertions(+), 46 deletions(-) diff --git a/sphinx/about/install_run_notes.rst b/sphinx/about/install_run_notes.rst index 484b1917..7a7f9d0e 100644 --- a/sphinx/about/install_run_notes.rst +++ b/sphinx/about/install_run_notes.rst @@ -64,6 +64,18 @@ Install/Update repo from Github $ git clone git@github.com:specifysystems/sp_network.git $ git checkout +DNS +---------------------- + +If this is a development or production server with an actual domain, first point the +DNS record (through whatever service is managing the domain, GoDaddy in the case of +spcoco.org) to the static IP address for the server. + +For AWS, create (or modify) an Elastic IP address to point to the EC2 instance. + +If replacing an EC2 instance, disassociate the Elastic IP address from the old EC2 +instance, and associate it with the new instance. + SSL ----------------------------------- :ref:`Specify Network SSL certificates` @@ -76,25 +88,41 @@ Edit FQDN value in .env.analyst.conf and .env.broker.conf (referenced by the doc compose file) and server_name in config/nginx.conf to actual FQDN. -Testing ---------------------------------------- -On a development server, check the following URL endpoints: +Docker +================================= -* Index page: https://localhost +More info at :ref:`Docker` + + +Test +=========================== +On a development server, check the following URL endpoints: * Broker: - * https://localhost/api/v1/ + * https://localhost.broker + * https://localhost.broker/api/v1/ + + * https://localhost.broker/api/v1/badge/ + * https://localhost.broker/api/v1/name/ + * https://localhost.broker/api/v1/occ/ + * https://localhost.broker/api/v1/frontend/ + + * https://localhost.broker/api/v1/badge/gbif?icon_status=active + * https://localhost.broker/api/v1/occ/?occid=a7156437-55ec-4c6f-89de-938f9361753d + * https://localhost.broker/api/v1/name/Harengula%20jaguana + * https://localhost.broker/api/v1/frontend/?occid=a7156437-55ec-4c6f-89de-938f9361753d + +* Analyst: - * https://localhost/api/v1/badge/ - * https://localhost/api/v1/name/ - * https://localhost/api/v1/occ/ - * https://localhost/api/v1/frontend/ + * https://localhost.analyst + * https://localhost.analyst/api/v1/ - * https://localhost/api/v1/badge/gbif?icon_status=active - * https://localhost/api/v1/occ/?occid=a7156437-55ec-4c6f-89de-938f9361753d - * https://localhost/api/v1/name/Harengula%20jaguana - * https://localhost/api/v1/frontend/?occid=a7156437-55ec-4c6f-89de-938f9361753d + * https://localhost.analyst/api/v1/count/ + * https://localhost.analyst/api/v1/rank/ + + * http://localhost.analyst/api/v1/count/?dataset_key=0000e36f-d0e9-46b0-aa23-cc1980f00515 + * http://localhost.analyst/api/v1/rank/?by_species=true For local testing in a development environment, tests in the tests directory require the lmtest module available at https://github.com/lifemapper/lmtest. 
@@ -109,7 +137,7 @@ virtual environment activation script (bin/activate) script:: export WORKING_DIRECTORY="scratch-path" -**Specify Network** homepage is now available at https://localhost/ and http://localhost. +**Specify Network** homepage is now available at https://localhost/ **Broker** (aka back-end): @@ -122,29 +150,6 @@ needed. **Flask** is watching for back-end file changes and restarts the server when needed. -Troubleshooting -=========================================== - -For webserver errors, check logs of nginx container:: - - $ sudo docker logs --tail 1000 sp_network-nginx-1 - $ sudo docker logs --tail 1000 sp_network-broker-1 - - -Error: "... cannot import name 'url_quote' from 'werkzeug.urls'" in broker container -Fix: Add Werkzeug==2.2.2 to requirements.txt to ensure it does not use 3.0+ -Then stop/rebuild/start:: - - $ sudo docker compose stop - $ sudo docker system prune --all --volumes - $ sudo docker compose up -d - -Docker -================================= - -More info at :ref:`Docker` - - Dev Environment ========================== @@ -156,13 +161,15 @@ Dev Environment $ pip install -r requirements.txt -Configure Debugger in local IDE +Configure Debugger ======================================== +Pycharm +------------------ [Instructions for PyCharm] (https://kartoza.com/en/blog/using-docker-compose-based-python-interpreter-in-pycharm/) -Debug +Flask ------------------------------------------- To run flask in debug mode, first set up Flask environment, then start the flask @@ -187,6 +194,28 @@ Reset the FLASK_APP variable to test an alternate resource:: Troubleshooting ====================================== + +For webserver errors +----------------------- + +Check logs of nginx container:: + + $ sudo docker logs --tail 1000 sp_network-nginx-1 + $ sudo docker logs --tail 1000 sp_network-broker-1 + + +Import error from werkzeug.urls +-------------------------------------- + +Error: "... 
cannot import name 'url_quote' from 'werkzeug.urls'" in broker container +Fix: Add Werkzeug==2.2.2 to requirements.txt to ensure it does not use 3.0+ +Then stop/rebuild/start:: + + $ sudo docker compose stop + $ sudo docker system prune --all --volumes + $ sudo docker compose up -d + + pip errors with SSL ------------------------------------------- @@ -225,13 +254,6 @@ pre-commit build errors * Updated .pre-commit-config.yaml isort version to latest, https://github.com/PyCQA/isort, fixed build -AWS setup -=================================== - -* Add raw GBIF data to S3 - - - Dependencies: ============== From a22f266d7615a19cc3c6265c3642e2bd6fa1a1a9 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 10:51:25 -0500 Subject: [PATCH 44/81] remove unused dependency --- requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4b7aa412..ab01b10e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,5 +8,4 @@ awscli boto3>=1.34.60 sqlalchemy pandas -pandas-sql -s3fs \ No newline at end of file +pandas-sql \ No newline at end of file From 07e673d6fcce8e3da91e254334837b043cc096cb Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 10:52:01 -0500 Subject: [PATCH 45/81] disable dataset name resolution with GBIF API --- flask_app/analyst/base.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 3c29035a..8859d8ad 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -102,13 +102,14 @@ def _standardize_params( def _add_dataset_names_to_records( cls, records, dataset_key_field="datasetkey", dataset_name_field="dataset_name"): - # TODO: change this to a call to an S3 table with all dataset keys/names - # if import is at top level, causes recursion error in awss3.count_datasets - from sppy.tools.provider.gbif import GbifAPI - gbif = GbifAPI(service="dataset") - for rec in records: - dataset_name, _ = gbif.get_dataset(rec[dataset_key_field]) - rec[dataset_name_field] = dataset_name + pass + # # TODO: change this to a call to an S3 table with all dataset keys/names + # # if import is at top level, causes recursion error in awss3.count_datasets + # from sppy.tools.provider.gbif import GbifAPI + # gbif = GbifAPI(service="dataset") + # for rec in records: + # dataset_name, _ = gbif.get_dataset(rec[dataset_key_field]) + # rec[dataset_name_field] = dataset_name # ............................................................................. 
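The resolution disabled above made one GBIF Dataset API call per record to fill in the dataset name, which is the behavior the TODO proposes replacing with a pre-built S3 lookup table. A minimal sketch of that per-record call, assuming only the public GBIF v1 dataset endpoint (the helper name and timeout are illustrative, not code from this repository):

    import requests

    def resolve_dataset_title(dataset_key, timeout=10):
        """Return the title for a GBIF datasetKey, or None if the lookup fails."""
        url = f"https://api.gbif.org/v1/dataset/{dataset_key}"
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException:
            return None
        # The same GBIF record also carries the citation under ["citation"]["text"].
        return response.json().get("title")

    # Per-record use, e.g.: rec["dataset_name"] = resolve_dataset_title(rec["datasetkey"])

One HTTP round trip per returned record is the cost that the lookup-table patches later in this series avoid.
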
From 5c4444bf43fade145fbd302d2b97627444d762ea Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 15:59:23 -0500 Subject: [PATCH 46/81] enclose error message in list --- flask_app/analyst/count.py | 4 ++-- flask_app/analyst/rank.py | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 12d24a7e..280360a0 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -40,7 +40,7 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): dataset_key=dataset_key, pub_org_key=pub_org_key) except BadRequest as e: - errinfo = {"error": e.description} + errinfo = {"error": [e.description]} else: # Query dataset counts @@ -49,7 +49,7 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): records, errors = cls._get_dataset_counts( good_params["dataset_key"]) except Exception: - errors = {"error": get_traceback()} + errors = {"error": [get_traceback()]} else: cls._add_dataset_names_to_records( records, dataset_key_field="datasetkey", diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index c6dc1021..61868953 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -42,7 +42,7 @@ def rank_counts(cls, count_by, order=None, limit=1): count_by=count_by, order=order, limit=limit) except BadRequest as e: - errinfo = {"error": e.description} + errinfo = {"error": [e.description]} else: # Query for ordered dataset counts @@ -51,7 +51,7 @@ def rank_counts(cls, count_by, order=None, limit=1): good_params["count_by"], good_params["order"], good_params["limit"]) except Exception: - errors = {"error": get_traceback()} + errors = {"error": [get_traceback()]} # Combine errors from success or failure errinfo = combine_errinfo(errinfo, errors) @@ -72,7 +72,7 @@ def _get_ordered_counts(cls, count_by, order, limit): records, errinfo = s3.rank_datasets(count_by, order, limit) except Exception: - errinfo = {"error": get_traceback()} + errinfo = {"error": [get_traceback()]} cls._add_dataset_names_to_records( records, dataset_key_field="datasetkey", @@ -89,10 +89,9 @@ def _get_ordered_counts(cls, count_by, order, limit): AnalystOutput.print_output(response, do_print_rec=True) # print(response) count_by = "species" - order = "ascending" + order = "descending" limit = 5 - response = svc.rank_counts( - count_by, order=order, limit=limit) + response = svc.rank_counts(count_by) AnalystOutput.print_output(response, do_print_rec=True) # print(response) From d552b675eefadbcbec096966b0c485f39de525f2 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 15:59:38 -0500 Subject: [PATCH 47/81] doc --- sphinx/about/install_run_notes.rst | 15 ++++++++++-- sphinx/aws/aws-setup.rst | 37 +++++++++++++++++------------- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/sphinx/about/install_run_notes.rst b/sphinx/about/install_run_notes.rst index 7a7f9d0e..cdff1d87 100644 --- a/sphinx/about/install_run_notes.rst +++ b/sphinx/about/install_run_notes.rst @@ -29,10 +29,10 @@ Install Install dependencies --------------------------------------- -Certbot:: +AWS Client, Certbot:: $ sudo apt update - $ sudo apt install certbot + $ sudo apt install awscli, certbot Install Docker @@ -93,6 +93,12 @@ Docker More info at :ref:`Docker` +AWS Config +================ + +Boto3 getting Error "botocore.exceptions.NoCredentialsError + +Create credentials file on host EC2 instance Test =========================== @@ -153,6 +159,11 @@ needed. 
Dev Environment ========================== +* Base system libraries:: + + sudo apt get update + sudo apt install awscli, certbot, apt install python3.10-venv + * Create a virtual environment and install python libs there:: $ cd ~/git/sp_network diff --git a/sphinx/aws/aws-setup.rst b/sphinx/aws/aws-setup.rst index b367bd92..adf883b7 100644 --- a/sphinx/aws/aws-setup.rst +++ b/sphinx/aws/aws-setup.rst @@ -1,14 +1,34 @@ Authentication #################### +Create an IAM role for the EC2/Redshift/S3 interaction +*********************************************************** + +* Create a Role (Redshift-S3) for service Redshift to read/write to S3 + + * Add a policy allowing read and write access to the specnet S3 bucket + * Step 1: Trusted entity type = AWS service, Use Case = Redshift - Customizable. + + * TODO: change to Redshift - Scheduler when we automate the workflow + + * Step 2: Add permissions + + * AmazonRedshiftAllCommandsFullAccess (AWS managed) + * AmazonS3FullAccess (AWS managed) + EC2 instance creation =========================================================== -* Instance type t3.small (2gb RAM). +* Instance type t3.small + + * Build fails with t2.micro or t3.micro with 1gb RAM + * t3.small is 2gb RAM + * Ubuntu Server 22.04 LTS, SSD Volume Type (free tier eligible), x86 architecture * Security Group: launch-wizard-1 * 30 Gb General Purpose SSD (gp2) * For dev, Spot instance (in Advanced options) +* Modify IAM role - for role created above (i.e. specnet_ec2_role) For programmatic access to S3 =========================================================== @@ -47,21 +67,6 @@ Overview or preserved specimen. This brings the full dataset from about 2.6 billion down to 2.3 billion. -Create an IAM role for the Redshift/S3 interaction -*********************************************************** - -* Create a Role (Redshift-S3) for service Redshift to read/write to S3 - - * Add a policy allowing read and write access to the specnet S3 bucket - * Step 1: Trusted entity type = AWS service, Use Case = Redshift - Customizable. - - * TODO: change to Redshift - Scheduler when we automate the workflow - - * Step 2: Add permissions - - * AmazonRedshiftAllCommandsFullAccess (AWS managed) - * AmazonS3FullAccess (AWS managed) - Create a new workgroup (and namespace) *********************************************************** From 7c5ce78686be1e435c211ac1f53fd1f57dad5c2d Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 16:00:34 -0500 Subject: [PATCH 48/81] enclose error message in list --- sppy/tools/provider/awss3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index da511b1a..d7bfde71 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -181,7 +181,7 @@ def rank_datasets(self, count_by, order, limit, format="JSON"): records, errors = self._query_order_s3_table( self._dataset_counts_path, sort_field, order, limit) except Exception as e: - errors = {"error": get_traceback()} + errors = {"error": [get_traceback()]} return records, errors # ............................................................................. 
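Patches 46 and 48 change each error value from a bare string to a single-element list so that messages collected at different steps can be merged under one severity key without overwriting each other. A minimal sketch of that convention (merge_errinfo below is an illustration only, not the repository's combine_errinfo implementation):

    def merge_errinfo(first, second):
        """Merge two {severity: [messages]} dicts, keeping all messages."""
        merged = {severity: list(msgs) for severity, msgs in first.items()}
        for severity, msgs in second.items():
            merged.setdefault(severity, []).extend(msgs)
        return merged

    # merge_errinfo({"error": ["bad parameter"]}, {"error": ["query failed"]})
    # -> {"error": ["bad parameter", "query failed"]}
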
From c930e622d6ee5269a4eba94593f5eeeed1220fea Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 16:01:15 -0500 Subject: [PATCH 49/81] add dataset name, citation to table; unfinished --- sppy/aws/aws_tools.py | 117 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 1 deletion(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index c01fba23..c775a4f1 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -7,13 +7,16 @@ from botocore.exceptions import ClientError import csv import datetime +from http import HTTPStatus import logging from logging.handlers import RotatingFileHandler import pandas as pd import os +import requests +import xml.etree.ElementTree as ET from sppy.aws.aws_constants import ( - INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, + ENCODING, INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, USER_DATA_TOKEN) @@ -674,4 +677,116 @@ def create_dataframe_from_s3obj( df = pd.read_parquet(s3_uri) return df +# ............................................... +def _get_nested_output_val(output, key_list): + while key_list: + key = key_list[0] + key_list = key_list[1:] + try: + output = output[key] + if not key_list: + val = output + if type(val) is bytes: + val = str(val).encode(ENCODING) + return str(output).encode(ENCODING) + except Exception: + return None + +# ............................................... +def _get_values_for_keys(output, keys): + values = [] + # Get values from JSON response + for key in keys: + if type(key) is list or type(key) is tuple: + val = _get_nested_output_val(output, key) + else: + try: + val = output[key] + except Exception: + val = None + if type(val) is bytes: + val = str(val).encode(ENCODING) + values.append(val) + return values + + +# ............................................... +def _get_api_response_vals(url, keys): + values = [] + output = {} + try: + response = requests.get(url) + except Exception as e: + errmsg = str(e) + else: + try: + status_code = response.status_code + reason = response.reason + except Exception: + status_code = HTTPStatus.INTERNAL_SERVER_ERROR + reason = "Unknown API status_code/reason" + if status_code == HTTPStatus.OK: + # Parse response + try: + output = response.json() + except Exception: + output = response.content + if type(output) is bytes: + output = ET.fromstring(str(output)) + try: + output = ET.parse(output) + except Exception as e: + errmsg = f"Provider error: Invalid JSON response ({output})" + # Get values from JSON response + _get_values_for_keys(output, keys) + return values + +# ............................................... +def get_dataset(dataset_key): + """Return title from one dataset record with this key. + + Args: + dataset_key: GBIF identifier for this dataset + + Returns: + dataset_name: the name of the dataset. + citation: the preferred citation for the dataset. + + Raises: + Exception: on query failure. + """ + url = f"https://api.gbif.org/v1/dataset/{dataset_key}" + title, citation = _get_api_response_vals(url, ["title", ["citation", "text"]]) + return title, citation + +# ---------------------------------------------------- +def create_dataset_name_lookup( + bucket, s3_folders, s3_fname, ds_key_fieldname, datatype="parquet", region=REGION, encoding="utf-8"): + """Read CSV data from S3 into a pandas DataFrame. + Args: + bucket: name of the bucket containing the CSV data. 
+ s3_path: the object name with enclosing S3 bucket folders. + ds_key_fieldname: fieldname of the column with GBIF datasetKey + region: AWS region to query. + datatype: tabular datatype, options are "csv", "parquet" + + Returns: + df: pandas DataFrame containing the CSV data. + """ + lookup_name = "dataset_name_citation" + input_path = f"{s3_folders}/{s3_fname}" + output_path = f"{s3_folders}/{lookup_name}" + ds_table = create_dataframe_from_s3obj( + bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) + ds_names = [] + ds_citations = [] + for rec in ds_table.itertuples(): + title, citation = get_dataset(rec.datasetkey) + ds_names.append(title) + ds_citations.append(citation) + # dataset_name and dataset_citation are the new fieldnames to be assigned + ds_table.assign(dataset_name=ds_names, dataset_citation=ds_citations) + tmp_filename = f"/tmp/{lookup_name}" + ds_table.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=ENCODING) + upload_to_s3(tmp_filename, bucket, output_path, region=region) \ No newline at end of file From 32aa000c32e7e1e9fa9a09d3200146786b326498 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 16:28:07 -0500 Subject: [PATCH 50/81] add pyarrow for parquet support --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ab01b10e..284c4368 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ awscli boto3>=1.34.60 sqlalchemy pandas -pandas-sql \ No newline at end of file +pandas-sql +pyarrow From efd163ca2105f155726679776dcbfdd507bd80a9 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 17:18:37 -0500 Subject: [PATCH 51/81] error doc --- sphinx/aws/aws-setup.rst | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/sphinx/aws/aws-setup.rst b/sphinx/aws/aws-setup.rst index adf883b7..0e1896d9 100644 --- a/sphinx/aws/aws-setup.rst +++ b/sphinx/aws/aws-setup.rst @@ -272,6 +272,8 @@ Enable S3 access from local machine and EC2 Error: SSL *************************************** +First time: + Error message :: SSL validation failed for https://ec2.us-east-1.amazonaws.com/ @@ -285,6 +287,38 @@ Test with:: Fix: Set up to work with Secret containing security key +Second time (in python code): +>>> response = requests.get(url) +Traceback (most recent call last): + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/connectionpool.py", line 703, in urlopen + httplib_response = self._make_request( + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/connectionpool.py", line 386, in _make_request + self._validate_conn(conn) + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/connectionpool.py", line 1042, in _validate_conn + conn.connect() + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/connection.py", line 419, in connect + self.sock = ssl_wrap_socket( + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/util/ssl_.py", line 449, in ssl_wrap_socket + ssl_sock = _ssl_wrap_socket_impl( + File "/home/astewart/git/sp_network/venv/lib/python3.8/site-packages/urllib3/util/ssl_.py", line 493, in _ssl_wrap_socket_impl + return ssl_context.wrap_socket(sock, server_hostname=server_hostname) + File "/usr/lib/python3.8/ssl.py", line 500, in wrap_socket + return self.sslsocket_class._create( + File "/usr/lib/python3.8/ssl.py", line 1069, in _create + self.do_handshake() + File 
"/usr/lib/python3.8/ssl.py", line 1338, in do_handshake + self._sslobj.do_handshake() +ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1131) + + +https://stackoverflow.com/questions/51925384/unable-to-get-local-issuer-certificate-when-using-requests + +pip install certifi + +import certifi +certifi.where() + + Workflow for Specify Network Analyst pre-computations =========================================================== From 67a30e14621d7fcfc825a504416dabcdc9e75709 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 22 Mar 2024 17:18:50 -0500 Subject: [PATCH 52/81] debugging --- sppy/aws/aws_tools.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index c775a4f1..c8b52ae6 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -789,4 +789,35 @@ def create_dataset_name_lookup( ds_table.assign(dataset_name=ds_names, dataset_citation=ds_citations) tmp_filename = f"/tmp/{lookup_name}" ds_table.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=ENCODING) - upload_to_s3(tmp_filename, bucket, output_path, region=region) \ No newline at end of file + upload_to_s3(tmp_filename, bucket, output_path, region=region) + + +# ............................................................................. +if __name__ == "__main__": + from sppy.aws.aws_tools import * + from sppy.aws.aws_constants import * + + bucket=PROJ_BUCKET + s3_folders="summary" + s3_fname="dataset_counts_2024_02_01_000.parquet" + lookup_name = "dataset_name_citation" + input_path = f"{s3_folders}/{s3_fname}" + output_path = f"{s3_folders}/{lookup_name}" + + ds_table: object = create_dataframe_from_s3obj( + bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) + + i = 0 + for rec in ds_table.itertuples(): + print(i) + print(rec) + i = i + 1 + if i == 5: + break + + dataset_key = rec.datasetkey + + url = f"https://api.gbif.org/v1/dataset/{dataset_key}" + response = requests.get(url) + + From 271cd137db1fca2e4371ada00ad8d45715134afb Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 25 Mar 2024 16:45:51 -0500 Subject: [PATCH 53/81] rm unused pandas-sql --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 284c4368..5b7074f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,5 +8,4 @@ awscli boto3>=1.34.60 sqlalchemy pandas -pandas-sql pyarrow From 8161ef52aaa9236343122a244b42cb222fe4c899 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 25 Mar 2024 16:46:55 -0500 Subject: [PATCH 54/81] create lookup tables for GBIF datasets and publishingOrganizations; untested --- sppy/aws/aws_tools.py | 214 +++++++++++++++++++++++++++++++++++------- 1 file changed, 179 insertions(+), 35 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index c8b52ae6..8e4944a8 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -686,9 +686,9 @@ def _get_nested_output_val(output, key_list): output = output[key] if not key_list: val = output - if type(val) is bytes: - val = str(val).encode(ENCODING) - return str(output).encode(ENCODING) + # if type(val) is bytes: + # val = val.decode(ENCODING) + return val except Exception: return None @@ -704,8 +704,8 @@ def _get_values_for_keys(output, keys): val = output[key] except Exception: val = None - if type(val) is bytes: - val = str(val).encode(ENCODING) + # if type(val) is bytes: + 
# val = val.decode(ENCODING) values.append(val) return values @@ -713,7 +713,6 @@ def _get_values_for_keys(output, keys): # ............................................... def _get_api_response_vals(url, keys): values = [] - output = {} try: response = requests.get(url) except Exception as e: @@ -738,11 +737,11 @@ def _get_api_response_vals(url, keys): except Exception as e: errmsg = f"Provider error: Invalid JSON response ({output})" # Get values from JSON response - _get_values_for_keys(output, keys) + values = _get_values_for_keys(output, keys) return values # ............................................... -def get_dataset(dataset_key): +def get_dataset_name_citation(dataset_key): """Return title from one dataset record with this key. Args: @@ -756,55 +755,168 @@ def get_dataset(dataset_key): Exception: on query failure. """ url = f"https://api.gbif.org/v1/dataset/{dataset_key}" - title, citation = _get_api_response_vals(url, ["title", ["citation", "text"]]) - return title, citation + name, citation = _get_api_response_vals(url, ["title", ["citation", "text"]]) + return name, citation + + +# ............................................... +def _parse_records(ret_records, keys): + small_recs = [] + for rec in ret_records: + values = _get_values_for_keys(rec, keys) + small_recs.append(values) + return small_recs + +# ............................................... +def _get_records(url, keys): + small_recs = [] + is_end = True + try: + response = requests.get(url) + except Exception as e: + errmsg = str(e) + else: + try: + status_code = response.status_code + reason = response.reason + except Exception as e: + status_code = HTTPStatus.INTERNAL_SERVER_ERROR + reason = str(e) + if status_code == HTTPStatus.OK: + # Parse response + try: + output = response.json() + except Exception: + output = response.content + if type(output) is bytes: + output = ET.fromstring(str(output)) + try: + output = ET.parse(output) + except Exception: + reason = f"Provider error: Invalid JSON response ({output})" + # Last query? + try: + is_end = output["endOfRecords"] + except KeyError: + print("Missing endOfRecords flag") + # Get values from JSON response + try: + ret_records = output["results"] + except KeyError: + reason = "No results returned" + else: + small_recs = _parse_records(ret_records, keys) + return small_recs, is_end + + +# ............................................... +def create_dataset_lookup(): + """Return title from one dataset record with this key. + + Returns: + dataframe of records containing GBIF dataset key, title, and citation + + Raises: + Exception: on query failure. + """ + all_recs = [] + is_end = False + keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] + offset = 0 + limit = 100 + while is_end is False: + url = f"https://api.gbif.org/v1/dataset?offset={offset}&limit={limit}" + small_recs, is_end = _get_records(url, keys) + all_recs.append(small_recs) + lookup_df = pd.DataFrame( + all_recs, + columns=["datasetKey", "publishingOrganizationKey", "title", "citation"]) + return lookup_df # ---------------------------------------------------- -def create_dataset_name_lookup( - bucket, s3_folders, s3_fname, ds_key_fieldname, datatype="parquet", region=REGION, encoding="utf-8"): +def create_dataset_lookup( + bucket, s3_folders, lookup_fname, region=REGION, encoding="utf-8"): """Read CSV data from S3 into a pandas DataFrame. Args: bucket: name of the bucket containing the CSV data. - s3_path: the object name with enclosing S3 bucket folders. 
- ds_key_fieldname: fieldname of the column with GBIF datasetKey + s3_folders: S3 bucket folders for output lookup table + lookup_fname: output table for looking up dataset name and citation region: AWS region to query. - datatype: tabular datatype, options are "csv", "parquet" + encoding: encoding of the input data Returns: df: pandas DataFrame containing the CSV data. """ - lookup_name = "dataset_name_citation" - input_path = f"{s3_folders}/{s3_fname}" - output_path = f"{s3_folders}/{lookup_name}" - ds_table = create_dataframe_from_s3obj( - bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) - ds_names = [] - ds_citations = [] - for rec in ds_table.itertuples(): - title, citation = get_dataset(rec.datasetkey) - ds_names.append(title) - ds_citations.append(citation) - # dataset_name and dataset_citation are the new fieldnames to be assigned - ds_table.assign(dataset_name=ds_names, dataset_citation=ds_citations) - tmp_filename = f"/tmp/{lookup_name}" - ds_table.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=ENCODING) + all_recs = [] + is_end = False + keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] + offset = 0 + limit = 100 + while is_end is False: + url = f"https://api.gbif.org/v1/dataset?offset={offset}&limit={limit}" + small_recs, is_end = _get_records(url, keys) + all_recs.append(small_recs) + lookup_df = pd.DataFrame( + all_recs, + columns=["datasetKey", "publishingOrganizationKey", "title", "citation"]) + + output_path = f"{s3_folders}/{lookup_fname}" + tmp_filename = f"/tmp/{lookup_fname}" + # Output data written as CSV + lookup_df.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) upload_to_s3(tmp_filename, bucket, output_path, region=region) - - + + +# ---------------------------------------------------- +def create_puborg_lookup( + bucket, s3_folders, lookup_fname, region=REGION, encoding="utf-8"): + """Read CSV data from S3 into a pandas DataFrame. + + Args: + bucket: name of the bucket containing the CSV data. + s3_folders: S3 bucket folders for output lookup table + lookup_fname: output table for looking up dataset name and citation + region: AWS region to query. + encoding: encoding of the input data + + Returns: + df: pandas DataFrame containing the CSV data. + """ + all_recs = [] + is_end = False + keys = ["key", "title"] + offset = 0 + limit = 100 + while is_end is False: + url = f"https://api.gbif.org/v1/organization?offset={offset}&limit={limit}" + small_recs, is_end = _get_records(url, keys) + all_recs.append(small_recs) + lookup_df = pd.DataFrame(all_recs, columns=["publishingOrganizationKey", "title"]) + + output_path = f"{s3_folders}/{lookup_fname}" + tmp_filename = f"/tmp/{lookup_fname}" + # Output data written as CSV + lookup_df.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) + upload_to_s3(tmp_filename, bucket, output_path, region=region) + # ............................................................................. 
if __name__ == "__main__": from sppy.aws.aws_tools import * from sppy.aws.aws_constants import * + import certifi + + cert = certifi.where() + bucket=PROJ_BUCKET s3_folders="summary" s3_fname="dataset_counts_2024_02_01_000.parquet" - lookup_name = "dataset_name_citation" + lookup_name = "dataset_name_2024_02_01_" input_path = f"{s3_folders}/{s3_fname}" output_path = f"{s3_folders}/{lookup_name}" - ds_table: object = create_dataframe_from_s3obj( + ds_table = create_dataframe_from_s3obj( bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) i = 0 @@ -818,6 +930,38 @@ def create_dataset_name_lookup( dataset_key = rec.datasetkey url = f"https://api.gbif.org/v1/dataset/{dataset_key}" - response = requests.get(url) + # response = requests.get(url) + r = requests.get(url, cert=cert) + +""" +from sppy.aws.aws_tools import * +from sppy.aws.aws_constants import * + +import certifi +cert = certifi.where() + +bucket=PROJ_BUCKET +s3_folders="summary" +s3_fname="dataset_counts_2024_02_01_000.parquet" +lookup_name = "dataset_name_2024_02_01_" +input_path = f"{s3_folders}/{s3_fname}" +output_path = f"{s3_folders}/{lookup_name}" + +ds_table = create_dataframe_from_s3obj( + bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) + +i = 0 +for rec in ds_table.itertuples(): + print(i) + print(rec) + i = i + 1 + if i == 5: + break + +dataset_key = rec.datasetkey +url = f"https://api.gbif.org/v1/dataset/{dataset_key}" +# response = requests.get(url) +r = requests.get(url, cert=cert) +""" From 61178203692737f093d1b150ebf046ae9be8ae4d Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 26 Mar 2024 14:30:45 -0500 Subject: [PATCH 55/81] increment offset in paging loop; cleanup --- sppy/aws/aws_tools.py | 96 +++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 53 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 8e4944a8..3338e041 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -17,7 +17,7 @@ from sppy.aws.aws_constants import ( ENCODING, INSTANCE_TYPE, KEY_NAME, LOGFILE_MAX_BYTES, LOG_FORMAT, LOG_DATE_FORMAT, - PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, + PROJ_BUCKET, PROJ_NAME, REGION, SECURITY_GROUP_ID, SPOT_TEMPLATE_BASENAME, USER_DATA_TOKEN) @@ -814,7 +814,8 @@ def create_dataset_lookup(): """Return title from one dataset record with this key. Returns: - dataframe of records containing GBIF dataset key, title, and citation + dataframe of records containing GBIF dataset key, GBIF publishingOrg key, + dataset title, and dataset citation Raises: Exception: on query failure. @@ -828,13 +829,14 @@ def create_dataset_lookup(): url = f"https://api.gbif.org/v1/dataset?offset={offset}&limit={limit}" small_recs, is_end = _get_records(url, keys) all_recs.append(small_recs) + offset += limit lookup_df = pd.DataFrame( all_recs, columns=["datasetKey", "publishingOrganizationKey", "title", "citation"]) return lookup_df # ---------------------------------------------------- -def create_dataset_lookup( +def create_s3_dataset_lookup( bucket, s3_folders, lookup_fname, region=REGION, encoding="utf-8"): """Read CSV data from S3 into a pandas DataFrame. @@ -845,21 +847,11 @@ def create_dataset_lookup( region: AWS region to query. encoding: encoding of the input data - Returns: - df: pandas DataFrame containing the CSV data. 
+ Postcondition: + CSV table with dataset key, pubOrgKey, dataset name, dataset citation written + to the named S3 object in bucket and folders """ - all_recs = [] - is_end = False - keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] - offset = 0 - limit = 100 - while is_end is False: - url = f"https://api.gbif.org/v1/dataset?offset={offset}&limit={limit}" - small_recs, is_end = _get_records(url, keys) - all_recs.append(small_recs) - lookup_df = pd.DataFrame( - all_recs, - columns=["datasetKey", "publishingOrganizationKey", "title", "citation"]) + lookup_df = create_dataset_lookup() output_path = f"{s3_folders}/{lookup_fname}" tmp_filename = f"/tmp/{lookup_fname}" @@ -869,19 +861,20 @@ def create_dataset_lookup( # ---------------------------------------------------- -def create_puborg_lookup( +def create_s3_puborg_lookup( bucket, s3_folders, lookup_fname, region=REGION, encoding="utf-8"): """Read CSV data from S3 into a pandas DataFrame. Args: bucket: name of the bucket containing the CSV data. s3_folders: S3 bucket folders for output lookup table - lookup_fname: output table for looking up dataset name and citation + lookup_fname: output table for looking up organization name region: AWS region to query. encoding: encoding of the input data - Returns: - df: pandas DataFrame containing the CSV data. + Postcondition: + CSV table with pubOrgKey, pubOrg name written to the named S3 object in + bucket and folders """ all_recs = [] is_end = False @@ -892,6 +885,7 @@ def create_puborg_lookup( url = f"https://api.gbif.org/v1/organization?offset={offset}&limit={limit}" small_recs, is_end = _get_records(url, keys) all_recs.append(small_recs) + offset += limit lookup_df = pd.DataFrame(all_recs, columns=["publishingOrganizationKey", "title"]) output_path = f"{s3_folders}/{lookup_fname}" @@ -900,15 +894,9 @@ def create_puborg_lookup( lookup_df.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) upload_to_s3(tmp_filename, bucket, output_path, region=region) + # ............................................................................. 
if __name__ == "__main__": - from sppy.aws.aws_tools import * - from sppy.aws.aws_constants import * - - import certifi - - cert = certifi.where() - bucket=PROJ_BUCKET s3_folders="summary" s3_fname="dataset_counts_2024_02_01_000.parquet" @@ -930,38 +918,40 @@ def create_puborg_lookup( dataset_key = rec.datasetkey url = f"https://api.gbif.org/v1/dataset/{dataset_key}" - # response = requests.get(url) - r = requests.get(url, cert=cert) + response = requests.get(url) + # import certifi + # cert = certifi.where() + # r = requests.get(url, cert=cert) """ from sppy.aws.aws_tools import * from sppy.aws.aws_constants import * -import certifi -cert = certifi.where() - bucket=PROJ_BUCKET s3_folders="summary" -s3_fname="dataset_counts_2024_02_01_000.parquet" -lookup_name = "dataset_name_2024_02_01_" -input_path = f"{s3_folders}/{s3_fname}" -output_path = f"{s3_folders}/{lookup_name}" - -ds_table = create_dataframe_from_s3obj( - bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) +lookup_fname = "dataset_name_2024_02_01_" + +create_s3_dataset_lookup( + bucket, s3_folders, lookup_fname, region=REGION, encoding="utf-8") + +# ds_table = create_dataframe_from_s3obj( +# bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) +# +# i = 0 +# for rec in ds_table.itertuples(): +# print(i) +# print(rec) +# i = i + 1 +# if i == 5: +# break +# +# dataset_key = rec.datasetkey +# +# url = f"https://api.gbif.org/v1/dataset/{dataset_key}" + +# import certifi +# cert = certifi.where() +# r = requests.get(url, cert=cert) -i = 0 -for rec in ds_table.itertuples(): - print(i) - print(rec) - i = i + 1 - if i == 5: - break - -dataset_key = rec.datasetkey - -url = f"https://api.gbif.org/v1/dataset/{dataset_key}" # response = requests.get(url) -r = requests.get(url, cert=cert) - """ From 4fbfec35708c59419e98861ea3cc6f93fa1b79e9 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 26 Mar 2024 15:41:03 -0500 Subject: [PATCH 56/81] generalize to create S3 lookup table from any API query --- sppy/aws/aws_tools.py | 134 ++++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 70 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 3338e041..3d05fde6 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -809,66 +809,86 @@ def _get_records(url, keys): return small_recs, is_end -# ............................................... -def create_dataset_lookup(): - """Return title from one dataset record with this key. +# ---------------------------------------------------- +def create_s3_lookup( + bucket, s3_folders, base_url, response_keys, output_fname, output_columns, + region=REGION, encoding="utf-8"): + """Query an API, read the data and write a subset to a table in S3. - Returns: - dataframe of records containing GBIF dataset key, GBIF publishingOrg key, - dataset title, and dataset citation + Args: + bucket: name of the bucket containing the CSV data. + s3_folders: S3 bucket folders for output lookup table + base_url: API URL without any key value pairs for the data service + response_keys: list of keys within the API response to pull values from. A key + can be an ordered list of keys nested within several elements of the tree, + from outermost to innermost. + output_columns: list of column headings for output lookup table + output_fname: output table for looking up dataset name and citation + region: AWS region containing the destination bucket. + encoding: encoding of the input data - Raises: - Exception: on query failure. 
+ Postcondition: + CSV table with output_columns and values for each written to the named S3 object + in bucket and folders """ all_recs = [] is_end = False - keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] offset = 0 limit = 100 + while is_end is False: - url = f"https://api.gbif.org/v1/dataset?offset={offset}&limit={limit}" - small_recs, is_end = _get_records(url, keys) + url = f"{base_url}?offset={offset}&limit={limit}" + small_recs, is_end = _get_records(url, response_keys) all_recs.append(small_recs) offset += limit + if offset % 1000 == 0: + print(f"Offset = {offset}") + lookup_df = pd.DataFrame( all_recs, - columns=["datasetKey", "publishingOrganizationKey", "title", "citation"]) - return lookup_df + columns=output_columns) + print(f"Lookup table contains {lookup_df.shape[0]} rows") + + output_path = f"{s3_folders}/{output_fname}" + tmp_filename = f"/tmp/{output_fname}" + # Output data written as CSV + lookup_df.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) + print(f"Wrote {tmp_filename}") + upload_to_s3(tmp_filename, bucket, output_path, region=region) + print(f"Uploaded to s3://{bucket}/{output_path}") + # ---------------------------------------------------- -def create_s3_dataset_lookup( - bucket, s3_folders, lookup_fname, region=REGION, encoding="utf-8"): - """Read CSV data from S3 into a pandas DataFrame. +def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8"): + """Query the GBIF Dataset API, write a subset of the response to a table in S3. Args: bucket: name of the bucket containing the CSV data. s3_folders: S3 bucket folders for output lookup table - lookup_fname: output table for looking up dataset name and citation - region: AWS region to query. + region: AWS region containing the destination bucket. encoding: encoding of the input data Postcondition: CSV table with dataset key, pubOrgKey, dataset name, dataset citation written to the named S3 object in bucket and folders """ - lookup_df = create_dataset_lookup() - - output_path = f"{s3_folders}/{lookup_fname}" - tmp_filename = f"/tmp/{lookup_fname}" - # Output data written as CSV - lookup_df.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) - upload_to_s3(tmp_filename, bucket, output_path, region=region) - + base_url = "https://api.gbif.org/v1/dataset" + response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] + data_date = get_current_datadate_str() + output_fname = f"dataset_name_{data_date}_" + output_fname = "dataset_name_2024_02_01_" + output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] + create_s3_lookup( + bucket, s3_folders, base_url, response_keys, output_fname, output_columns, + region=region, encoding=encoding) # ---------------------------------------------------- -def create_s3_puborg_lookup( - bucket, s3_folders, lookup_fname, region=REGION, encoding="utf-8"): - """Read CSV data from S3 into a pandas DataFrame. +def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="utf-8"): + """Query the GBIF Organization API, write a subset of the response to a table in S3. Args: bucket: name of the bucket containing the CSV data. s3_folders: S3 bucket folders for output lookup table - lookup_fname: output table for looking up organization name region: AWS region to query. 
encoding: encoding of the input data @@ -876,52 +896,27 @@ def create_s3_puborg_lookup( CSV table with pubOrgKey, pubOrg name written to the named S3 object in bucket and folders """ - all_recs = [] - is_end = False - keys = ["key", "title"] - offset = 0 - limit = 100 - while is_end is False: - url = f"https://api.gbif.org/v1/organization?offset={offset}&limit={limit}" - small_recs, is_end = _get_records(url, keys) - all_recs.append(small_recs) - offset += limit - lookup_df = pd.DataFrame(all_recs, columns=["publishingOrganizationKey", "title"]) - - output_path = f"{s3_folders}/{lookup_fname}" - tmp_filename = f"/tmp/{lookup_fname}" - # Output data written as CSV - lookup_df.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) - upload_to_s3(tmp_filename, bucket, output_path, region=region) + base_url = "https://api.gbif.org/v1/dataset" + response_keys = ["key", "title"] + data_date = get_current_datadate_str() + output_fname = f"organization_name_{data_date}_" + output_fname = "organization_name_2024_02_01_" + output_columns = ["publishingOrganizationKey", "title"] + create_s3_lookup( + bucket, s3_folders, base_url, response_keys, output_fname, output_columns, + region=region, encoding=encoding) # ............................................................................. if __name__ == "__main__": bucket=PROJ_BUCKET s3_folders="summary" - s3_fname="dataset_counts_2024_02_01_000.parquet" - lookup_name = "dataset_name_2024_02_01_" - input_path = f"{s3_folders}/{s3_fname}" - output_path = f"{s3_folders}/{lookup_name}" - - ds_table = create_dataframe_from_s3obj( - bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) - i = 0 - for rec in ds_table.itertuples(): - print(i) - print(rec) - i = i + 1 - if i == 5: - break + create_s3_dataset_lookup( + bucket, s3_folders, region=REGION, encoding="utf-8") + create_s3_organization_lookup( + bucket, s3_folders, region=REGION, encoding="utf-8") - dataset_key = rec.datasetkey - - url = f"https://api.gbif.org/v1/dataset/{dataset_key}" - response = requests.get(url) - # import certifi - # cert = certifi.where() - # r = requests.get(url, cert=cert) """ from sppy.aws.aws_tools import * @@ -929,10 +924,9 @@ def create_s3_puborg_lookup( bucket=PROJ_BUCKET s3_folders="summary" -lookup_fname = "dataset_name_2024_02_01_" create_s3_dataset_lookup( - bucket, s3_folders, lookup_fname, region=REGION, encoding="utf-8") + bucket, s3_folders, region=REGION, encoding="utf-8") # ds_table = create_dataframe_from_s3obj( # bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) From 87297e7e8ac4bfdb91241f5c633421009c829aa7 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Tue, 26 Mar 2024 15:59:12 -0500 Subject: [PATCH 57/81] add new tables for s3 query --- sppy/tools/provider/awss3.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index d7bfde71..93444a65 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -31,7 +31,8 @@ def __init__( datestr = get_current_datadate_str() datestr = "2024_02_01" self._dataset_counts_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" - self._dataset_lists_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" + self._dataset_lists_path = f"{SUMMARY_FOLDER}/dataset_lists_{datestr}_000.parquet" + self._dataset_names_path = f"{SUMMARY_FOLDER}/dataset_names_{datestr}_000.csv" # ---------------------------------------------------- def _query_table(self, 
s3_path, query_str, format="CSV"): From cc6b1bb21e5e1130118bbee0c507199680f8f8a1 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 13:41:23 -0500 Subject: [PATCH 58/81] rename S3Query class to SpNetAnalyses --- flask_app/analyst/base.py | 15 +-------------- flask_app/analyst/count.py | 4 ++-- flask_app/analyst/rank.py | 4 ++-- 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 8859d8ad..263441d7 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -5,6 +5,7 @@ from flask_app.common.s2n_type import AnalystOutput, APIService from sppy.tools.s2n.utils import get_traceback +from sppy.tools.provider.awss3 import SpNetAnalyses # ............................................................................. @@ -97,20 +98,6 @@ def _standardize_params( return usr_params, errinfo - # ............................................... - @classmethod - def _add_dataset_names_to_records( - cls, records, dataset_key_field="datasetkey", - dataset_name_field="dataset_name"): - pass - # # TODO: change this to a call to an S3 table with all dataset keys/names - # # if import is at top level, causes recursion error in awss3.count_datasets - # from sppy.tools.provider.gbif import GbifAPI - # gbif = GbifAPI(service="dataset") - # for rec in records: - # dataset_name, _ = gbif.get_dataset(rec[dataset_key_field]) - # rec[dataset_name_field] = dataset_name - # ............................................................................. if __name__ == "__main__": diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 280360a0..4dabf0f3 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -6,7 +6,7 @@ from flask_app.analyst.base import _AnalystService from sppy.aws.aws_constants import PROJ_BUCKET -from sppy.tools.provider.awss3 import S3Query +from sppy.tools.provider.awss3 import SpNetAnalyses from sppy.tools.s2n.utils import (combine_errinfo, get_traceback) @@ -86,7 +86,7 @@ def _get_dataset_counts(cls, dataset_key): """ records = [] errors = {} - s3 = S3Query(PROJ_BUCKET) + s3 = SpNetAnalyses(PROJ_BUCKET) try: records = s3.get_dataset_counts(dataset_key) except Exception: diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index 61868953..f0429031 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -6,7 +6,7 @@ from flask_app.analyst.base import _AnalystService from sppy.aws.aws_constants import PROJ_BUCKET -from sppy.tools.provider.awss3 import S3Query +from sppy.tools.provider.awss3 import SpNetAnalyses from sppy.tools.s2n.utils import (combine_errinfo, get_traceback) @@ -67,7 +67,7 @@ def rank_counts(cls, count_by, order=None, limit=1): @classmethod def _get_ordered_counts(cls, count_by, order, limit): records = [] - s3 = S3Query(PROJ_BUCKET) + s3 = SpNetAnalyses(PROJ_BUCKET) try: records, errinfo = s3.rank_datasets(count_by, order, limit) From 8e14e1b21bbe2a889559750098d878b152afaf99 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 13:55:36 -0500 Subject: [PATCH 59/81] structure S3 table metadata into a dictionary in class --- flask_app/analyst/rank.py | 2 +- sppy/tools/provider/awss3.py | 107 +++++++++++++++++++++++++++++------ 2 files changed, 90 insertions(+), 19 deletions(-) diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index f0429031..4aa55b26 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -69,7 +69,7 @@ def _get_ordered_counts(cls, count_by, order, 
limit): records = [] s3 = SpNetAnalyses(PROJ_BUCKET) try: - records, errinfo = s3.rank_datasets(count_by, order, limit) + records, errinfo = s3.rank_dataset_counts(count_by, order, limit) except Exception: errinfo = {"error": [get_traceback()]} diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/awss3.py index 93444a65..444345d5 100644 --- a/sppy/tools/provider/awss3.py +++ b/sppy/tools/provider/awss3.py @@ -10,7 +10,7 @@ # ............................................................................. -class S3Query(): +class SpNetAnalyses(): """Class for retrieving SpecifyNetwork summary data from AWS S3.""" # ............................................... @@ -30,9 +30,30 @@ def __init__( self.exp_type = 'SQL' datestr = get_current_datadate_str() datestr = "2024_02_01" - self._dataset_counts_path = f"{SUMMARY_FOLDER}/dataset_counts_{datestr}_000.parquet" - self._dataset_lists_path = f"{SUMMARY_FOLDER}/dataset_lists_{datestr}_000.parquet" - self._dataset_names_path = f"{SUMMARY_FOLDER}/dataset_names_{datestr}_000.csv" + self._summary_path = "summary" + self._summary_tables = { + "dataset_counts": { + "fname": f"dataset_counts_{datestr}_000.parquet", + "fields": ["datasetkey", "occ_count", "species_count"], + "key": "datasetkey" + }, + "dataset_species_lists": { + "fname": f"dataset_lists_{datestr}_000.parquet", + "fields": ["datasetkey", "taxonkey", "species", "occ_count"], + "key": "datasetkey" + }, + "dataset_meta": { + "fname": f"dataset_names_{datestr}_000.csv", + "fields": [ + "datasetKey", "publishingOrganizationKey", "title", "citation"], + "key": "datasetKey" + }, + "organization_meta": { + "fname": f"organization_names_{datestr}_000.csv", + "fields": ["publishingOrganizationKey", "title"], + "key": "publishingOrganizationKey" + } + } # ---------------------------------------------------- def _query_table(self, s3_path, query_str, format="CSV"): @@ -135,31 +156,75 @@ def get_dataset_counts(self, dataset_key, format="JSON"): Returns: records: empty list or list of 1 record (list) """ + fields = self._summary_tables["dataset_counts"]["fields"] + key_idx = fields.index(self._summary_tables["dataset_counts"]["key"]) + + table_path = \ + f"{self._summary_path}/{self._summary_tables['dataset_counts']['fname']}" query_str = ( - "SELECT datasetkey, occ_count, species_count FROM s3object s " - f"WHERE s.datasetkey = '{dataset_key}'" + f"SELECT * FROM s3object s WHERE s.datasetkey = '{dataset_key}'" ) # Returns empty list or list of 1 record - records = self._query_table(self._dataset_counts_path, query_str, format=format) + records = self._query_table(table_path, query_str, format=format) + self.add_dataset_lookup_vals(records, key_idx=key_idx) return records # ---------------------------------------------------- - def get_org_counts(self, pub_org_key): - """Query S3 for occurrence and species counts for this organization. + def add_dataset_lookup_vals(self, records, key_idx=0, format="JSON"): + """Query the S3 resource for occurrence and species counts for this dataset. Args: - pub_org_key: unique GBIF identifier for organization of interest. + key: unique GBIF identifier for object of interest. + format: output format, options "CSV" or "JSON" Returns: - records: empty list or list of 1 record containing occ_count, species_count - - TODO: implement this? 
+ records: empty list or list of 1 record (list) """ - (occ_count, species_count) = (0,0) - return (occ_count, species_count) + table_path = \ + f"{self._summary_path}/{self._summary_tables['dataset_meta']['fname']}" + fields = self._summary_tables["dataset_meta"]["fields"] + key_fld = fields[0] + new_flds = fields[1:] + qry_flds = " ".join(new_flds) + + for rec in records: + query_str = ( + f"SELECT {qry_flds} FROM s3object s WHERE s.{key_fld} = " + f"'{rec[key_idx]}'" + ) + # Returns empty list or list of 1 record + meta_recs = self._query_table(table_path, query_str, format=format) + try: + meta = meta_recs[0] + except IndexError: + if format == "CSV": + # Add placeholders for empty values + rec.extend(["" for f in new_flds]) + else: + for fld in new_flds: + if format == "JSON": + rec.update(meta) + else: + rec.extend(meta) + return records + + # # ---------------------------------------------------- + # def get_org_counts(self, pub_org_key): + # """Query S3 for occurrence and species counts for this organization. + # + # Args: + # pub_org_key: unique GBIF identifier for organization of interest. + # + # Returns: + # records: empty list or list of 1 record containing occ_count, species_count + # + # TODO: implement this? + # """ + # (occ_count, species_count) = (0,0) + # return (occ_count, species_count) # ---------------------------------------------------- - def rank_datasets(self, count_by, order, limit, format="JSON"): + def rank_dataset_counts(self, count_by, order, limit, format="JSON"): """Return the top or bottom datasets, with counts, ranked by number of species. Args: @@ -174,22 +239,28 @@ def rank_datasets(self, count_by, order, limit, format="JSON"): records: list of limit records containing dataset_key, occ_count, species_count """ records = [] + table_path = \ + f"{self._summary_path}/{self._summary_tables['dataset_counts']['fname']}" + fields = self._summary_tables["dataset_counts"]["fields"] + key_idx = fields.index(self._summary_tables["dataset_counts"]["key"]) if count_by == "species": sort_field = "species_count" else: sort_field = "occ_count" try: records, errors = self._query_order_s3_table( - self._dataset_counts_path, sort_field, order, limit) + table_path, sort_field, order, limit) except Exception as e: errors = {"error": [get_traceback()]} + + self.add_dataset_lookup_vals(records, key_idx=key_idx) return records, errors # ............................................................................. if __name__ == "__main__": format = "JSON" dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515" - s3q = S3Query(PROJ_BUCKET) + s3q = SpNetAnalyses(PROJ_BUCKET) recs = s3q.get_dataset_counts(dataset_key, format=format) for r in recs: print(r) From 14bab92a404e77bb942d319824f1b3dfa5ca900f Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 14:39:26 -0500 Subject: [PATCH 60/81] separate create and write dataframe --- sppy/aws/aws_tools.py | 109 +++++++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 43 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 3d05fde6..34c43d85 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -677,20 +677,22 @@ def create_dataframe_from_s3obj( df = pd.read_parquet(s3_uri) return df -# ............................................... 
-def _get_nested_output_val(output, key_list): - while key_list: - key = key_list[0] - key_list = key_list[1:] - try: - output = output[key] - if not key_list: - val = output - # if type(val) is bytes: - # val = val.decode(ENCODING) - return val - except Exception: - return None + +# # ............................................... +# def _get_nested_output_val(output, key_list): +# while key_list: +# key = key_list[0] +# key_list = key_list[1:] +# try: +# output = output[key] +# if not key_list: +# val = output +# # if type(val) is bytes: +# # val = val.decode(ENCODING) +# return val +# except Exception: +# return None + # ............................................... def _get_values_for_keys(output, keys): @@ -698,14 +700,21 @@ def _get_values_for_keys(output, keys): # Get values from JSON response for key in keys: if type(key) is list or type(key) is tuple: - val = _get_nested_output_val(output, key) + key_list = key + while key_list: + key = key_list[0] + key_list = key_list[1:] + try: + output = output[key] + if not key_list: + val = output + except Exception: + val = None else: try: val = output[key] except Exception: val = None - # if type(val) is bytes: - # val = val.decode(ENCODING) values.append(val) return values @@ -810,32 +819,23 @@ def _get_records(url, keys): # ---------------------------------------------------- -def create_s3_lookup( - bucket, s3_folders, base_url, response_keys, output_fname, output_columns, - region=REGION, encoding="utf-8"): +def create_dataframe_from_api(base_url, response_keys, output_columns): """Query an API, read the data and write a subset to a table in S3. Args: - bucket: name of the bucket containing the CSV data. - s3_folders: S3 bucket folders for output lookup table base_url: API URL without any key value pairs for the data service response_keys: list of keys within the API response to pull values from. A key can be an ordered list of keys nested within several elements of the tree, from outermost to innermost. output_columns: list of column headings for output lookup table - output_fname: output table for looking up dataset name and citation - region: AWS region containing the destination bucket. - encoding: encoding of the input data - Postcondition: - CSV table with output_columns and values for each written to the named S3 object - in bucket and folders + Returns: + dataframe: Pandas dataframe with rows of data for the output_columns """ all_recs = [] is_end = False offset = 0 limit = 100 - while is_end is False: url = f"{base_url}?offset={offset}&limit={limit}" small_recs, is_end = _get_records(url, response_keys) @@ -843,16 +843,32 @@ def create_s3_lookup( offset += limit if offset % 1000 == 0: print(f"Offset = {offset}") + dataframe = pd.DataFrame(all_recs, columns=output_columns) + print(f"Lookup table contains {dataframe.shape[0]} rows") + return dataframe + + +# ---------------------------------------------------- +def write_dataframe_to_s3( + dataframe, bucket, s3_folders, output_fname, region=REGION, encoding="utf-8"): + """Query an API, read the data and write a subset to a table in S3. - lookup_df = pd.DataFrame( - all_recs, - columns=output_columns) - print(f"Lookup table contains {lookup_df.shape[0]} rows") + Args: + dataframe: Pandas dataframe with rows of data + bucket: name of the bucket containing the CSV data. + s3_folders: S3 bucket folders for output lookup table + output_fname: output table for looking up dataset name and citation + region: AWS region containing the destination bucket. 
+ encoding: encoding of the input data + Postcondition: + CSV table with output_columns and values for each written to the named S3 object + in bucket and folders + """ output_path = f"{s3_folders}/{output_fname}" tmp_filename = f"/tmp/{output_fname}" # Output data written as CSV - lookup_df.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) + dataframe.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) print(f"Wrote {tmp_filename}") upload_to_s3(tmp_filename, bucket, output_path, region=region) print(f"Uploaded to s3://{bucket}/{output_path}") @@ -878,9 +894,10 @@ def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8" output_fname = f"dataset_name_{data_date}_" output_fname = "dataset_name_2024_02_01_" output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] - create_s3_lookup( - bucket, s3_folders, base_url, response_keys, output_fname, output_columns, - region=region, encoding=encoding) + lookup_df = create_dataframe_from_api(base_url, response_keys, output_columns) + write_dataframe_to_s3( + lookup_df, bucket, s3_folders, output_fname, region=region, encoding=encoding) + # ---------------------------------------------------- def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="utf-8"): @@ -902,25 +919,31 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="u output_fname = f"organization_name_{data_date}_" output_fname = "organization_name_2024_02_01_" output_columns = ["publishingOrganizationKey", "title"] - create_s3_lookup( - bucket, s3_folders, base_url, response_keys, output_fname, output_columns, - region=region, encoding=encoding) + lookup_df = create_dataframe_from_api(base_url, response_keys, output_columns) + write_dataframe_to_s3( + lookup_df, bucket, s3_folders, output_fname, region=region, encoding=encoding) # ............................................................................. 
if __name__ == "__main__": bucket=PROJ_BUCKET + region=REGION + encoding=ENCODING s3_folders="summary" + base_url = "https://api.gbif.org/v1/dataset" + response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] + output_fname = "dataset_name_2024_02_01_" + output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] + create_s3_dataset_lookup( bucket, s3_folders, region=REGION, encoding="utf-8") - create_s3_organization_lookup( - bucket, s3_folders, region=REGION, encoding="utf-8") + # create_s3_organization_lookup( + # bucket, s3_folders, region=REGION, encoding="utf-8") """ from sppy.aws.aws_tools import * -from sppy.aws.aws_constants import * bucket=PROJ_BUCKET s3_folders="summary" From e71bf5abbdcd075051b07fd5629e715eee994739 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 15:01:38 -0500 Subject: [PATCH 61/81] add metadata implemented in SpNetAnalyses class --- flask_app/analyst/count.py | 17 +++-------------- flask_app/analyst/rank.py | 4 ---- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index 4dabf0f3..a9c1715f 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -34,7 +34,7 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): if dataset_key is None and pub_org_key is None: return cls.get_endpoint() - allrecs = [] + records = [] try: good_params, errinfo = cls._standardize_params( dataset_key=dataset_key, pub_org_key=pub_org_key) @@ -50,25 +50,14 @@ def get_counts(cls, dataset_key=None, pub_org_key=None): good_params["dataset_key"]) except Exception: errors = {"error": [get_traceback()]} - else: - cls._add_dataset_names_to_records( - records, dataset_key_field="datasetkey", - dataset_name_field="dataset_name") - if records: - allrecs.append(records) - # Combine errors from success or failure - errinfo = combine_errinfo(errinfo, errors) - # Query organization counts - if good_params["pub_org_key"] is not None: - errors = { - "warning": "Count by Publishing Organization is not implemented"} + # Combine errors from success or failure errinfo = combine_errinfo(errinfo, errors) # Assemble full_out = AnalystOutput( cls.SERVICE_TYPE["name"], description=cls.SERVICE_TYPE["description"], - records=allrecs, errors=errinfo) + records=records, errors=errinfo) return full_out.response diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index 4aa55b26..02f5ae94 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -74,10 +74,6 @@ def _get_ordered_counts(cls, count_by, order, limit): except Exception: errinfo = {"error": [get_traceback()]} - cls._add_dataset_names_to_records( - records, dataset_key_field="datasetkey", - dataset_name_field="dataset_name") - return records, errinfo # ............................................................................. 
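The aws_tools.py hunks above split the GBIF lookup-table build into a paginated API
read (create_dataframe_from_api) and a separate S3 write (write_dataframe_to_s3).
A minimal sketch of that offset/limit paging pattern, assuming a GBIF-style paged
response carrying "offset", "limit", "endOfRecords", and "results", with a citation
object holding a "text" field; the helper name fetch_dataset_lookup, the max_pages
guard, and the column subset are illustrative, not part of the patch::

    import pandas as pd
    import requests

    def fetch_dataset_lookup(base_url="https://api.gbif.org/v1/dataset",
                             limit=100, max_pages=None):
        """Page through a GBIF-style API until endOfRecords is True."""
        records, offset, pages = [], 0, 0
        while True:
            resp = requests.get(base_url, params={"offset": offset, "limit": limit})
            resp.raise_for_status()
            payload = resp.json()
            for rec in payload.get("results", []):
                # Keep only the fields needed for the lookup table.
                records.append({
                    "datasetKey": rec.get("key"),
                    "publishingOrganizationKey": rec.get("publishingOrganizationKey"),
                    "title": rec.get("title"),
                    "citation": (rec.get("citation") or {}).get("text"),
                })
            offset += limit
            pages += 1
            if payload.get("endOfRecords", True) or (max_pages and pages >= max_pages):
                break
        return pd.DataFrame(records)

    # Example: a small two-page test pull before running a full harvest.
    # lookup_df = fetch_dataset_lookup(max_pages=2)

Reading everything into a single dataframe is fine for small tests; the later patches
in this series switch to chunked CSV writes precisely because the full dataset listing
does not fit comfortably in memory on the EC2 instance.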
From e664e437d61b1250313cb67589c0880c5e9bcff7 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 15:02:06 -0500 Subject: [PATCH 62/81] testing --- sppy/aws/aws_tools.py | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 34c43d85..80dc3070 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -946,29 +946,15 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="u from sppy.aws.aws_tools import * bucket=PROJ_BUCKET +region=REGION +encoding=ENCODING s3_folders="summary" -create_s3_dataset_lookup( - bucket, s3_folders, region=REGION, encoding="utf-8") +base_url = "https://api.gbif.org/v1/dataset" +response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] +output_fname = "dataset_name_2024_02_01_" +output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] -# ds_table = create_dataframe_from_s3obj( -# bucket, input_path, datatype="parquet", region=REGION, encoding=ENCODING) -# -# i = 0 -# for rec in ds_table.itertuples(): -# print(i) -# print(rec) -# i = i + 1 -# if i == 5: -# break -# -# dataset_key = rec.datasetkey -# -# url = f"https://api.gbif.org/v1/dataset/{dataset_key}" - -# import certifi -# cert = certifi.where() -# r = requests.get(url, cert=cert) - -# response = requests.get(url) +create_s3_dataset_lookup( + bucket, s3_folders, region=REGION, encoding="utf-8") """ From 27436750c08f5364fea7eb8c45490798585ec8f8 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 15:17:23 -0500 Subject: [PATCH 63/81] rename awss3 module to reflect provider of SpNetwork data --- flask_app/analyst/base.py | 2 +- flask_app/analyst/count.py | 2 +- flask_app/analyst/rank.py | 2 +- sppy/tools/provider/{awss3.py => spnet.py} | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename sppy/tools/provider/{awss3.py => spnet.py} (100%) diff --git a/flask_app/analyst/base.py b/flask_app/analyst/base.py index 263441d7..7a606862 100644 --- a/flask_app/analyst/base.py +++ b/flask_app/analyst/base.py @@ -5,7 +5,7 @@ from flask_app.common.s2n_type import AnalystOutput, APIService from sppy.tools.s2n.utils import get_traceback -from sppy.tools.provider.awss3 import SpNetAnalyses +from sppy.tools.provider.spnet import SpNetAnalyses # ............................................................................. 
diff --git a/flask_app/analyst/count.py b/flask_app/analyst/count.py index a9c1715f..ab09d232 100644 --- a/flask_app/analyst/count.py +++ b/flask_app/analyst/count.py @@ -6,7 +6,7 @@ from flask_app.analyst.base import _AnalystService from sppy.aws.aws_constants import PROJ_BUCKET -from sppy.tools.provider.awss3 import SpNetAnalyses +from sppy.tools.provider.spnet import SpNetAnalyses from sppy.tools.s2n.utils import (combine_errinfo, get_traceback) diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index 02f5ae94..83571fef 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -6,7 +6,7 @@ from flask_app.analyst.base import _AnalystService from sppy.aws.aws_constants import PROJ_BUCKET -from sppy.tools.provider.awss3 import SpNetAnalyses +from sppy.tools.provider.spnet import SpNetAnalyses from sppy.tools.s2n.utils import (combine_errinfo, get_traceback) diff --git a/sppy/tools/provider/awss3.py b/sppy/tools/provider/spnet.py similarity index 100% rename from sppy/tools/provider/awss3.py rename to sppy/tools/provider/spnet.py From 897efd769995cb4534cfb6076ce0cd6475ecdf6c Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 16:19:19 -0500 Subject: [PATCH 64/81] chg defaults --- flask_app/analyst/rank.py | 2 +- flask_app/common/s2n_type.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flask_app/analyst/rank.py b/flask_app/analyst/rank.py index 83571fef..2eeafa92 100644 --- a/flask_app/analyst/rank.py +++ b/flask_app/analyst/rank.py @@ -18,7 +18,7 @@ class RankSvc(_AnalystService): # ............................................... @classmethod - def rank_counts(cls, count_by, order=None, limit=1): + def rank_counts(cls, count_by, order=None, limit=10): """Return occurrence and species counts for dataset/organization identifiers. Args: diff --git a/flask_app/common/s2n_type.py b/flask_app/common/s2n_type.py index b11efa56..f5e6721e 100644 --- a/flask_app/common/s2n_type.py +++ b/flask_app/common/s2n_type.py @@ -193,7 +193,7 @@ class APIService: "order": { "type": "", "options": ["ascending", "descending"], - "default": None + "default": "descending" }, "limit": {"type": 2, "default": 10, "min": 1, "max": 500}, }, From c308bca73982058075c0a0e129dae82e0f9497c7 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 16:20:38 -0500 Subject: [PATCH 65/81] write csv files locally to reduce memory requirements, before combining and writing to S3 --- sppy/aws/aws_tools.py | 102 +++++++++++++++++++++++++++++++++++------- 1 file changed, 87 insertions(+), 15 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 80dc3070..3511552f 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -835,19 +835,95 @@ def create_dataframe_from_api(base_url, response_keys, output_columns): all_recs = [] is_end = False offset = 0 - limit = 100 + limit = 1000 while is_end is False: url = f"{base_url}?offset={offset}&limit={limit}" small_recs, is_end = _get_records(url, response_keys) all_recs.append(small_recs) offset += limit - if offset % 1000 == 0: + if offset % 5000 == 0: print(f"Offset = {offset}") dataframe = pd.DataFrame(all_recs, columns=output_columns) print(f"Lookup table contains {dataframe.shape[0]} rows") return dataframe +# ---------------------------------------------------- +def create_csvfiles_from_api(base_url, response_keys, output_columns, output_fname): + """Query an API, read the data and write a subset to a table in S3. 
+ + Args: + base_url: API URL without any key value pairs for the data service + response_keys: list of keys within the API response to pull values from. A key + can be an ordered list of keys nested within several elements of the tree, + from outermost to innermost. + output_columns: list of column headings for output lookup table + output_fname: base output filename for temporary CSV files + + Returns: + csv_files: Local CSV files with records. The first file in the list will have + a header, the rest will not. + """ + csv_files = [] + records = [] + is_end = False + offset = 0 + read_limit = 1000 + write_limit = 5000 + write_header = True + while is_end is False: + url = f"{base_url}?offset={offset}&limit={read_limit}" + small_recs, is_end = _get_records(url, response_keys) + records.append(small_recs) + offset += read_limit + # Write to tempfile every 5000 + if offset % write_limit == 0: + print(f"Offset = {offset}") + dataframe = pd.DataFrame(records, columns=output_columns) + tmp_filename = f"/tmp/{output_fname}_{offset}_" + dataframe.to_csv( + path_or_buf=tmp_filename, sep='\t', header=write_header, + encoding=encoding) + # Only write header to first file, others will be appended + write_header = False + csv_files.append(tmp_filename) + print(f"Wrote {tmp_filename} with {dataframe.shape[0]} rows") + # reset records in memory + records = [] + return csv_files + + +# ---------------------------------------------------- +def write_csvfiles_to_s3( + csv_fnames, bucket, s3_folders, output_fname, region=REGION, encoding="utf-8"): + """Query an API, read the data and write a subset to a table in S3. + + Args: + csvfiles: input CSV files for S3 table. The first file in the list will have + a header, the rest will not. + bucket: name of the bucket containing the CSV data. + s3_folders: S3 bucket folders for output lookup table + output_fname: output table for looking up dataset name and citation + region: AWS region containing the destination bucket. + encoding: encoding of the input data + + Postcondition: + CSV table with output_columns and values for each written to the named S3 object + in bucket and folders + """ + output_path = f"{s3_folders}/{output_fname}" + combined_fname = f"/tmp/{output_fname}" + with open(combined_fname, "a") as outf: + # Output data written as CSV + for fname in csv_fnames: + with open(fname, "r") as inf: + data = inf.read() + outf.write(data) + print(f"Wrote {combined_fname}") + upload_to_s3(combined_fname, bucket, output_path, region=region) + print(f"Uploaded to s3://{bucket}/{output_path}") + + # ---------------------------------------------------- def write_dataframe_to_s3( dataframe, bucket, s3_folders, output_fname, region=REGION, encoding="utf-8"): @@ -884,6 +960,11 @@ def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8" region: AWS region containing the destination bucket. encoding: encoding of the input data + Note: + There are >100k records for datasets and limited memory on this EC2 instance, + so we write them as temporary CSV files, then combine them, then create a + dataframe and upload. 
+ Postcondition: CSV table with dataset key, pubOrgKey, dataset name, dataset citation written to the named S3 object in bucket and folders @@ -894,9 +975,10 @@ def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8" output_fname = f"dataset_name_{data_date}_" output_fname = "dataset_name_2024_02_01_" output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] - lookup_df = create_dataframe_from_api(base_url, response_keys, output_columns) - write_dataframe_to_s3( - lookup_df, bucket, s3_folders, output_fname, region=region, encoding=encoding) + csv_fnames = create_csvfiles_from_api( + base_url, response_keys, output_columns, output_fname) + write_csvfiles_to_s3( + csv_fnames, bucket, s3_folders, output_fname, region=region, encoding=encoding) # ---------------------------------------------------- @@ -931,11 +1013,6 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="u encoding=ENCODING s3_folders="summary" - base_url = "https://api.gbif.org/v1/dataset" - response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] - output_fname = "dataset_name_2024_02_01_" - output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] - create_s3_dataset_lookup( bucket, s3_folders, region=REGION, encoding="utf-8") # create_s3_organization_lookup( @@ -950,11 +1027,6 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="u encoding=ENCODING s3_folders="summary" -base_url = "https://api.gbif.org/v1/dataset" -response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] -output_fname = "dataset_name_2024_02_01_" -output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] - create_s3_dataset_lookup( bucket, s3_folders, region=REGION, encoding="utf-8") """ From f1d0dcfcb597ff0fac808c09045a95f03fb070d9 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 16:21:19 -0500 Subject: [PATCH 66/81] comment out metadata add until tables are populated --- sppy/tools/provider/spnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sppy/tools/provider/spnet.py b/sppy/tools/provider/spnet.py index 444345d5..4a213153 100644 --- a/sppy/tools/provider/spnet.py +++ b/sppy/tools/provider/spnet.py @@ -166,7 +166,7 @@ def get_dataset_counts(self, dataset_key, format="JSON"): ) # Returns empty list or list of 1 record records = self._query_table(table_path, query_str, format=format) - self.add_dataset_lookup_vals(records, key_idx=key_idx) + # self.add_dataset_lookup_vals(records, key_idx=key_idx) return records # ---------------------------------------------------- @@ -253,7 +253,7 @@ def rank_dataset_counts(self, count_by, order, limit, format="JSON"): except Exception as e: errors = {"error": [get_traceback()]} - self.add_dataset_lookup_vals(records, key_idx=key_idx) + # self.add_dataset_lookup_vals(records, key_idx=key_idx) return records, errors # ............................................................................. 
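The create_csvfiles_from_api / write_csvfiles_to_s3 additions above write paged
results to local CSV chunks (only the first chunk carries a header) and then
concatenate them for a single S3 upload. A minimal sketch of that combine-and-upload
step, assuming boto3 credentials are already configured; the function name, bucket,
key, and region defaults below are illustrative, and upload_to_s3 in the patch is
assumed to wrap a call like boto3's upload_file::

    import boto3

    def combine_and_upload(chunk_paths, bucket, key, region="us-east-1"):
        """Concatenate CSV chunks in order and upload the result to S3 once."""
        combined = "/tmp/combined_lookup.csv"
        with open(combined, "w", encoding="utf-8") as out:
            for path in chunk_paths:
                with open(path, "r", encoding="utf-8") as chunk:
                    out.write(chunk.read())
        # Upload the combined file once; the individual chunks never go to S3.
        s3_client = boto3.client("s3", region_name=region)
        s3_client.upload_file(combined, bucket, key)
        return f"s3://{bucket}/{key}"

    # Example (illustrative names):
    # combine_and_upload(
    #     ["/tmp/dataset_meta_5000.csv", "/tmp/dataset_meta_10000.csv"],
    #     "my-bucket", "summary/dataset_meta_2024_02_01.csv")

Writing chunks to disk and uploading one combined object keeps memory use bounded and
leaves a single lookup table per date in the summary folder.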
From d677746583d011f258646b16b4dbc9d0e8763e58 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 16:26:11 -0500 Subject: [PATCH 67/81] extend record list --- sppy/aws/aws_tools.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 3511552f..5338cf15 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -839,7 +839,7 @@ def create_dataframe_from_api(base_url, response_keys, output_columns): while is_end is False: url = f"{base_url}?offset={offset}&limit={limit}" small_recs, is_end = _get_records(url, response_keys) - all_recs.append(small_recs) + all_recs.extend(small_recs) offset += limit if offset % 5000 == 0: print(f"Offset = {offset}") @@ -874,7 +874,7 @@ def create_csvfiles_from_api(base_url, response_keys, output_columns, output_fna while is_end is False: url = f"{base_url}?offset={offset}&limit={read_limit}" small_recs, is_end = _get_records(url, response_keys) - records.append(small_recs) + records.extend(small_recs) offset += read_limit # Write to tempfile every 5000 if offset % write_limit == 0: From d6073dd8f4c9527207dcc8a91b592491c1166bd9 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 17:06:53 -0500 Subject: [PATCH 68/81] bugfix --- sppy/aws/aws_tools.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 5338cf15..2d3c7873 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -849,7 +849,8 @@ def create_dataframe_from_api(base_url, response_keys, output_columns): # ---------------------------------------------------- -def create_csvfiles_from_api(base_url, response_keys, output_columns, output_fname): +def create_csvfiles_from_api( + base_url, response_keys, output_columns, output_fname, encoding="utf-8"): """Query an API, read the data and write a subset to a table in S3. Args: @@ -859,6 +860,7 @@ def create_csvfiles_from_api(base_url, response_keys, output_columns, output_fna from outermost to innermost. output_columns: list of column headings for output lookup table output_fname: base output filename for temporary CSV files + encoding: encoding of the input data Returns: csv_files: Local CSV files with records. 
The first file in the list will have @@ -880,7 +882,7 @@ def create_csvfiles_from_api(base_url, response_keys, output_columns, output_fna if offset % write_limit == 0: print(f"Offset = {offset}") dataframe = pd.DataFrame(records, columns=output_columns) - tmp_filename = f"/tmp/{output_fname}_{offset}_" + tmp_filename = f"/tmp/{output_fname}{offset}.csv" dataframe.to_csv( path_or_buf=tmp_filename, sep='\t', header=write_header, encoding=encoding) From e54833bc1b6455d99665e07dd5ab21e2469986d0 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Wed, 27 Mar 2024 17:07:07 -0500 Subject: [PATCH 69/81] rm unused pyarrow --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5b7074f5..6687eff1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,3 @@ awscli boto3>=1.34.60 sqlalchemy pandas -pyarrow From b3c53e984fd1217c70a26bcf9daecff0df667a24 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 28 Mar 2024 12:07:19 -0500 Subject: [PATCH 70/81] testing for proper csv to s3 formatting --- sppy/aws/aws_tools.py | 111 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 108 insertions(+), 3 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 2d3c7873..5508cefa 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -776,10 +776,44 @@ def _parse_records(ret_records, keys): small_recs.append(values) return small_recs +# ............................................... +def _get_single_record(url, keys): + rec = None + try: + response = requests.get(url) + except Exception as e: + errmsg = str(e) + else: + try: + status_code = response.status_code + reason = response.reason + except Exception as e: + status_code = HTTPStatus.INTERNAL_SERVER_ERROR + reason = str(e) + if status_code == HTTPStatus.OK: + # Parse response + try: + output = response.json() + except Exception: + output = response.content + if type(output) is bytes: + output = ET.fromstring(str(output)) + try: + output = ET.parse(output) + except Exception: + reason = f"Provider error: Invalid JSON response ({output})" + else: + # Output is only one record + small_recs = _parse_records([output], keys) + try: + rec = small_recs[0] + except Exception as e: + print(f"Error: no output record ({e})") + return rec + + # ............................................... def _get_records(url, keys): - small_recs = [] - is_end = True try: response = requests.get(url) except Exception as e: @@ -817,7 +851,6 @@ def _get_records(url, keys): small_recs = _parse_records(ret_records, keys) return small_recs, is_end - # ---------------------------------------------------- def create_dataframe_from_api(base_url, response_keys, output_columns): """Query an API, read the data and write a subset to a table in S3. @@ -885,6 +918,7 @@ def create_csvfiles_from_api( tmp_filename = f"/tmp/{output_fname}{offset}.csv" dataframe.to_csv( path_or_buf=tmp_filename, sep='\t', header=write_header, + columns=output_columns, doublequote=False, escapechar="\\", encoding=encoding) # Only write header to first file, others will be appended write_header = False @@ -1008,12 +1042,81 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="u lookup_df, bucket, s3_folders, output_fname, region=region, encoding=encoding) +# ---------------------------------------------------- +def create_csvfile_from_api( + base_url, keys, response_keys, output_columns, output_fname, encoding="utf-8"): + """Query an API, read the data and write a subset to a table in S3. 
+ + Args: + base_url: API URL without any key value pairs for the data service + keys: unique identifiers to query the API for + response_keys: list of keys within the API response to pull values from. A key + can be an ordered list of keys nested within several elements of the tree, + from outermost to innermost. + output_columns: list of column headings for output lookup table + output_fname: base output filename for temporary CSV files + encoding: encoding of the input data + + Returns: + csv_files: Local CSV files with records. The first file in the list will have + a header, the rest will not. + """ + records = [] + for key in keys: + url = f"{base_url}/{key}" + rec = _get_single_record(url, response_keys) + records.append(rec) + dataframe = pd.DataFrame(records, columns=output_columns) + tmp_filename = f"/tmp/{output_fname}.csv" + dataframe.to_csv( + path_or_buf=tmp_filename, sep='\t', header=True, + columns=output_columns, doublequote=False, escapechar="\\", + encoding=encoding) + print(f"Wrote {tmp_filename} with {dataframe.shape[0]} rows") + return tmp_filename + +# ---------------------------------------------------- +def create_test_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8"): + """Query the GBIF Dataset API, write a subset of the response to a table in S3. + + Args: + bucket: name of the bucket containing the CSV data. + s3_folders: S3 bucket folders for output lookup table + region: AWS region containing the destination bucket. + encoding: encoding of the input data + + Note: + There are >100k records for datasets and limited memory on this EC2 instance, + so we write them as temporary CSV files, then combine them, then create a + dataframe and upload. + + Postcondition: + CSV table with dataset key, pubOrgKey, dataset name, dataset citation written + to the named S3 object in bucket and folders + """ + base_url = "https://api.gbif.org/v1/dataset" + response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] + data_date = get_current_datadate_str() + output_fname = f"dataset_name_{data_date}_" + output_fname = "dataset_name_test_2024_02_01_" + output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] + csv_fname = create_csvfile_from_api( + base_url, keys, response_keys, output_columns, output_fname, encoding="utf-8") + write_csvfiles_to_s3( + [csv_fname], bucket, s3_folders, output_fname, region=region, encoding=encoding) + # ............................................................................. 
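# ----------------------------------------------------
# Illustrative sketch only: the helper below mirrors the to_csv settings used in the
# functions above (tab delimiter, doublequote=False, escapechar="\\"). The function
# name and the sample record are made up; the sketch simply shows what a title with
# embedded quotes looks like after writing with those settings.
def _illustrate_csv_quote_settings(tmp_path="/tmp/quote_check.csv", encoding="utf-8"):
    import pandas as pd  # already imported at module level as pd
    df = pd.DataFrame(
        [{"datasetKey": "k1", "title": 'Herbarium "test" records'}])
    df.to_csv(
        path_or_buf=tmp_path, sep="\t", header=True, doublequote=False,
        escapechar="\\", encoding=encoding)
    with open(tmp_path, "r", encoding=encoding) as f:
        return f.read()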
if __name__ == "__main__": bucket=PROJ_BUCKET region=REGION encoding=ENCODING s3_folders="summary" + keys = [ + "5a95fa0a-5ef3-432a-b95b-816cd85b2f9b", + "ee789ae4-ef51-4ff2-931b-bc61b2dbe40e", + "c8fded56-3ddb-4e26-8863-ba8d55862689", + "3c83d5da-822a-439c-897a-7569e82c4ebc" + ] create_s3_dataset_lookup( bucket, s3_folders, region=REGION, encoding="utf-8") @@ -1022,6 +1125,8 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="u """ +# Note: Test with quoted data such as: +# http://api.gbif.org/v1/dataset/3c83d5da-822a-439c-897a-7569e82c4ebc from sppy.aws.aws_tools import * bucket=PROJ_BUCKET From 00d4cef09e99a6afee491cb9c919c204ace9ba47 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 28 Mar 2024 12:07:27 -0500 Subject: [PATCH 71/81] doc --- sphinx/about/install_run_notes.rst | 4 ++-- sphinx/misc/docker.rst | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/sphinx/about/install_run_notes.rst b/sphinx/about/install_run_notes.rst index cdff1d87..e8106037 100644 --- a/sphinx/about/install_run_notes.rst +++ b/sphinx/about/install_run_notes.rst @@ -199,8 +199,8 @@ Reset the FLASK_APP variable to test an alternate resource:: * Test with http, no https!! - http://localhost:5000/api/v1/name?namestr=Notemigonus%20crysoleucas%20(Mitchill,%201814) - http://localhost:5000/api/v1/occ?occid=01493b05-4310-4f28-9d81-ad20860311f3 + http://broker.localhost:5000/api/v1/name?namestr=Notemigonus%20crysoleucas%20(Mitchill,%201814) + http://broker.localhost:5000/api/v1/occ?occid=01493b05-4310-4f28-9d81-ad20860311f3 Troubleshooting ====================================== diff --git a/sphinx/misc/docker.rst b/sphinx/misc/docker.rst index 074bb1e4..fd91fc8f 100644 --- a/sphinx/misc/docker.rst +++ b/sphinx/misc/docker.rst @@ -75,6 +75,19 @@ all docker containers, shut down httpd, bring up docker. sudo systemctl stop httpd sudo docker compose up -d +Run Docker on OSX +================================= + +Need to bind server to 0.0.0.0 instead of 127.0.0.1 + +Test by getting internal IP, using ifconfig, then command to see if connects successfully:: + + nc -v x.x.x.x 443 + +Then can use same IP in browser, i.e. https://x.x.x.x/api/v1/name/ +This only exposes the broker, not the analyst services. 
+ + Troubleshooting ================================= From 4af2758e9141a9400690e69ba8f5d74363908f74 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 28 Mar 2024 14:56:18 -0500 Subject: [PATCH 72/81] testing for csv quoting, escapechars --- sppy/aws/aws_tools.py | 99 +++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 46 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 5508cefa..ab65ebba 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -614,7 +614,7 @@ def get_logger(log_name, log_dir=None, log_level=logging.INFO): # create file handler handler = RotatingFileHandler( filename, mode="w", maxBytes=LOGFILE_MAX_BYTES, backupCount=10, - encoding="utf-8" + encoding=ENCODING ) formatter = logging.Formatter(LOG_FORMAT, LOG_DATE_FORMAT) handler.setLevel(log_level) @@ -643,14 +643,14 @@ def create_dataframe_from_gbifcsv_s3_bucket(bucket, csv_path, region=REGION): s3_client = boto3.client("s3", region_name=region) s3_obj = s3_client.get_object(Bucket=bucket, Key=csv_path) df = pd.read_csv( - s3_obj["Body"], delimiter="\t", encoding="utf-8", low_memory=False, + s3_obj["Body"], delimiter="\t", encoding=ENCODING, low_memory=False, quoting=csv.QUOTE_NONE) return df # ---------------------------------------------------- def create_dataframe_from_s3obj( - bucket, s3_path, datatype="parquet", region=REGION, encoding="utf-8"): + bucket, s3_path, datatype="parquet", region=REGION, encoding=ENCODING): """Read CSV data from S3 into a pandas DataFrame. Args: @@ -678,22 +678,6 @@ def create_dataframe_from_s3obj( return df -# # ............................................... -# def _get_nested_output_val(output, key_list): -# while key_list: -# key = key_list[0] -# key_list = key_list[1:] -# try: -# output = output[key] -# if not key_list: -# val = output -# # if type(val) is bytes: -# # val = val.decode(ENCODING) -# return val -# except Exception: -# return None - - # ............................................... def _get_values_for_keys(output, keys): values = [] @@ -801,14 +785,15 @@ def _get_single_record(url, keys): try: output = ET.parse(output) except Exception: + output = None reason = f"Provider error: Invalid JSON response ({output})" - else: - # Output is only one record - small_recs = _parse_records([output], keys) - try: - rec = small_recs[0] - except Exception as e: - print(f"Error: no output record ({e})") + if output: + # Output is only one record + small_recs = _parse_records([output], keys) + try: + rec = small_recs[0] + except Exception as e: + print(f"Error: no output record ({e})") return rec @@ -883,7 +868,7 @@ def create_dataframe_from_api(base_url, response_keys, output_columns): # ---------------------------------------------------- def create_csvfiles_from_api( - base_url, response_keys, output_columns, output_fname, encoding="utf-8"): + base_url, response_keys, output_columns, output_fname, encoding=ENCODING): """Query an API, read the data and write a subset to a table in S3. 
Args: @@ -915,7 +900,7 @@ def create_csvfiles_from_api( if offset % write_limit == 0: print(f"Offset = {offset}") dataframe = pd.DataFrame(records, columns=output_columns) - tmp_filename = f"/tmp/{output_fname}{offset}.csv" + tmp_filename = f"/tmp/{output_fname}_{offset}.csv" dataframe.to_csv( path_or_buf=tmp_filename, sep='\t', header=write_header, columns=output_columns, doublequote=False, escapechar="\\", @@ -931,7 +916,7 @@ def create_csvfiles_from_api( # ---------------------------------------------------- def write_csvfiles_to_s3( - csv_fnames, bucket, s3_folders, output_fname, region=REGION, encoding="utf-8"): + csv_fnames, bucket, s3_folders, output_fname, region=REGION, encoding=ENCODING): """Query an API, read the data and write a subset to a table in S3. Args: @@ -948,7 +933,7 @@ def write_csvfiles_to_s3( in bucket and folders """ output_path = f"{s3_folders}/{output_fname}" - combined_fname = f"/tmp/{output_fname}" + combined_fname = f"/tmp/{output_fname}.csv" with open(combined_fname, "a") as outf: # Output data written as CSV for fname in csv_fnames: @@ -962,7 +947,7 @@ def write_csvfiles_to_s3( # ---------------------------------------------------- def write_dataframe_to_s3( - dataframe, bucket, s3_folders, output_fname, region=REGION, encoding="utf-8"): + dataframe, bucket, s3_folders, output_fname, region=REGION, encoding=ENCODING): """Query an API, read the data and write a subset to a table in S3. Args: @@ -980,14 +965,16 @@ def write_dataframe_to_s3( output_path = f"{s3_folders}/{output_fname}" tmp_filename = f"/tmp/{output_fname}" # Output data written as CSV - dataframe.to_csv(path_or_buf=tmp_filename, sep='\t', header=True, encoding=encoding) + dataframe.to_csv( + path_or_buf=tmp_filename, sep='\t', header=True, doublequote=False, + escapechar="\\", encoding=encoding) print(f"Wrote {tmp_filename}") upload_to_s3(tmp_filename, bucket, output_path, region=region) print(f"Uploaded to s3://{bucket}/{output_path}") # ---------------------------------------------------- -def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8"): +def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding=ENCODING): """Query the GBIF Dataset API, write a subset of the response to a table in S3. Args: @@ -1008,8 +995,8 @@ def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8" base_url = "https://api.gbif.org/v1/dataset" response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] data_date = get_current_datadate_str() - output_fname = f"dataset_name_{data_date}_" - output_fname = "dataset_name_2024_02_01_" + output_fname = f"dataset_name_{data_date}" + output_fname = "dataset_name_2024_02_01" output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] csv_fnames = create_csvfiles_from_api( base_url, response_keys, output_columns, output_fname) @@ -1018,7 +1005,7 @@ def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8" # ---------------------------------------------------- -def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="utf-8"): +def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding=ENCODING): """Query the GBIF Organization API, write a subset of the response to a table in S3. 
Args: @@ -1034,8 +1021,8 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="u base_url = "https://api.gbif.org/v1/dataset" response_keys = ["key", "title"] data_date = get_current_datadate_str() - output_fname = f"organization_name_{data_date}_" - output_fname = "organization_name_2024_02_01_" + output_fname = f"organization_name_{data_date}" + output_fname = "organization_name_2024_02_01" output_columns = ["publishingOrganizationKey", "title"] lookup_df = create_dataframe_from_api(base_url, response_keys, output_columns) write_dataframe_to_s3( @@ -1044,7 +1031,7 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding="u # ---------------------------------------------------- def create_csvfile_from_api( - base_url, keys, response_keys, output_columns, output_fname, encoding="utf-8"): + base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING): """Query an API, read the data and write a subset to a table in S3. Args: @@ -1076,12 +1063,13 @@ def create_csvfile_from_api( return tmp_filename # ---------------------------------------------------- -def create_test_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="utf-8"): +def create_test_s3_dataset_lookup(bucket, s3_folders, keys, region=REGION, encoding=ENCODING): """Query the GBIF Dataset API, write a subset of the response to a table in S3. Args: bucket: name of the bucket containing the CSV data. s3_folders: S3 bucket folders for output lookup table + keys: unique identifiers to query the API for region: AWS region containing the destination bucket. encoding: encoding of the input data @@ -1101,7 +1089,7 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="u output_fname = "dataset_name_test_2024_02_01_" output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] csv_fname = create_csvfile_from_api( - base_url, keys, response_keys, output_columns, output_fname, encoding="utf-8") + base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING) write_csvfiles_to_s3( [csv_fname], bucket, s3_folders, output_fname, region=region, encoding=encoding) @@ -1118,10 +1106,10 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="u "3c83d5da-822a-439c-897a-7569e82c4ebc" ] - create_s3_dataset_lookup( - bucket, s3_folders, region=REGION, encoding="utf-8") + + create_test_s3_dataset_lookup(bucket, s3_folders, keys) # create_s3_organization_lookup( - # bucket, s3_folders, region=REGION, encoding="utf-8") + # bucket, s3_folders, region=REGION, encoding=ENCODING) """ @@ -1133,7 +1121,26 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding="u region=REGION encoding=ENCODING s3_folders="summary" +keys = [ + "5a95fa0a-5ef3-432a-b95b-816cd85b2f9b", + "ee789ae4-ef51-4ff2-931b-bc61b2dbe40e", + "c8fded56-3ddb-4e26-8863-ba8d55862689", + "3c83d5da-822a-439c-897a-7569e82c4ebc" + ] -create_s3_dataset_lookup( - bucket, s3_folders, region=REGION, encoding="utf-8") +base_url = "https://api.gbif.org/v1/dataset" +response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] +data_date = get_current_datadate_str() +output_fname = f"dataset_name_{data_date}_" +output_fname = "dataset_name_test_2024_02_01_" +output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] + + +csv_fname = create_csvfile_from_api( + base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING) +write_csvfiles_to_s3( + 
[csv_fname], bucket, s3_folders, output_fname, region=region, encoding=encoding) + + +create_test_s3_dataset_lookup(bucket, s3_folders, keys) """ From ce5893b5e6906a9d2285e8fd460e7bf70b4fd708 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Thu, 28 Mar 2024 16:11:10 -0500 Subject: [PATCH 73/81] add certificate for local GBIF api query --- sppy/aws/aws_tools.py | 77 +++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index ab65ebba..90283fdf 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -6,6 +6,7 @@ import boto3 from botocore.exceptions import ClientError import csv +import certifi import datetime from http import HTTPStatus import logging @@ -704,10 +705,10 @@ def _get_values_for_keys(output, keys): # ............................................... -def _get_api_response_vals(url, keys): +def _get_api_response_vals(url, keys, certificate=None): values = [] try: - response = requests.get(url) + response = requests.get(url, verify=certificate) except Exception as e: errmsg = str(e) else: @@ -734,7 +735,7 @@ def _get_api_response_vals(url, keys): return values # ............................................... -def get_dataset_name_citation(dataset_key): +def get_dataset_name_citation(dataset_key, certificate=None): """Return title from one dataset record with this key. Args: @@ -748,7 +749,8 @@ def get_dataset_name_citation(dataset_key): Exception: on query failure. """ url = f"https://api.gbif.org/v1/dataset/{dataset_key}" - name, citation = _get_api_response_vals(url, ["title", ["citation", "text"]]) + name, citation = _get_api_response_vals( + url, ["title", ["citation", "text"]], certificate=certificate) return name, citation @@ -761,10 +763,13 @@ def _parse_records(ret_records, keys): return small_recs # ............................................... -def _get_single_record(url, keys): +def _get_single_record(url, keys, certificate=None): rec = None try: - response = requests.get(url) + if certificate: + response = requests.get(url, verify=certificate) + else: + response = requests.get(url) except Exception as e: errmsg = str(e) else: @@ -798,11 +803,16 @@ def _get_single_record(url, keys): # ............................................... 
-def _get_records(url, keys): +def _get_records(url, keys, certificate=None): + small_recs = [] + status_code = 0 try: - response = requests.get(url) + if certificate: + response = requests.get(url, verify=certificate) + else: + response = requests.get(url) except Exception as e: - errmsg = str(e) + reason = str(e) else: try: status_code = response.status_code @@ -834,6 +844,8 @@ def _get_records(url, keys): reason = "No results returned" else: small_recs = _parse_records(ret_records, keys) + if not small_recs: + print(f"No records returned, status {status_code}, reason {reason}") return small_recs, is_end # ---------------------------------------------------- @@ -854,9 +866,10 @@ def create_dataframe_from_api(base_url, response_keys, output_columns): is_end = False offset = 0 limit = 1000 + certificate = certifi.where() while is_end is False: url = f"{base_url}?offset={offset}&limit={limit}" - small_recs, is_end = _get_records(url, response_keys) + small_recs, is_end = _get_records(url, response_keys, certificate=certificate) all_recs.extend(small_recs) offset += limit if offset % 5000 == 0: @@ -888,17 +901,19 @@ def create_csvfiles_from_api( records = [] is_end = False offset = 0 - read_limit = 1000 + read_limit = 500 write_limit = 5000 write_header = True + certificate = certifi.where() while is_end is False: url = f"{base_url}?offset={offset}&limit={read_limit}" - small_recs, is_end = _get_records(url, response_keys) - records.extend(small_recs) + print(url) + small_recs, is_end = _get_records(url, response_keys, certificate=certificate) + if small_recs: + records.extend(small_recs) offset += read_limit # Write to tempfile every 5000 if offset % write_limit == 0: - print(f"Offset = {offset}") dataframe = pd.DataFrame(records, columns=output_columns) tmp_filename = f"/tmp/{output_fname}_{offset}.csv" dataframe.to_csv( @@ -1031,7 +1046,7 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding=EN # ---------------------------------------------------- def create_csvfile_from_api( - base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING): + base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING): """Query an API, read the data and write a subset to a table in S3. Args: @@ -1049,9 +1064,10 @@ def create_csvfile_from_api( a header, the rest will not. 
""" records = [] + certificate = certifi.where() for key in keys: url = f"{base_url}/{key}" - rec = _get_single_record(url, response_keys) + rec = _get_single_record(url, response_keys, certificate=certificate) records.append(rec) dataframe = pd.DataFrame(records, columns=output_columns) tmp_filename = f"/tmp/{output_fname}.csv" @@ -1088,8 +1104,10 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, keys, region=REGION, encod output_fname = f"dataset_name_{data_date}_" output_fname = "dataset_name_test_2024_02_01_" output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] + certificate = certifi.where() csv_fname = create_csvfile_from_api( - base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING) + base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING, + certificate=certificate) write_csvfiles_to_s3( [csv_fname], bucket, s3_folders, output_fname, region=region, encoding=encoding) @@ -1106,8 +1124,8 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, keys, region=REGION, encod "3c83d5da-822a-439c-897a-7569e82c4ebc" ] - - create_test_s3_dataset_lookup(bucket, s3_folders, keys) + create_s3_dataset_lookup(bucket, s3_folders) + # create_test_s3_dataset_lookup(bucket, s3_folders, keys) # create_s3_organization_lookup( # bucket, s3_folders, region=REGION, encoding=ENCODING) @@ -1121,26 +1139,7 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, keys, region=REGION, encod region=REGION encoding=ENCODING s3_folders="summary" -keys = [ - "5a95fa0a-5ef3-432a-b95b-816cd85b2f9b", - "ee789ae4-ef51-4ff2-931b-bc61b2dbe40e", - "c8fded56-3ddb-4e26-8863-ba8d55862689", - "3c83d5da-822a-439c-897a-7569e82c4ebc" - ] -base_url = "https://api.gbif.org/v1/dataset" -response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] -data_date = get_current_datadate_str() -output_fname = f"dataset_name_{data_date}_" -output_fname = "dataset_name_test_2024_02_01_" -output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] - -csv_fname = create_csvfile_from_api( - base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING) -write_csvfiles_to_s3( - [csv_fname], bucket, s3_folders, output_fname, region=region, encoding=encoding) - - -create_test_s3_dataset_lookup(bucket, s3_folders, keys) +create_s3_dataset_lookup(bucket, s3_folders) """ From c1401bddc16bf7f9a4138a152b7a8d85f28e69c2 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Fri, 29 Mar 2024 17:09:00 -0500 Subject: [PATCH 74/81] modified to query only datasetkeys encountered bc pulling all at once causes JSON parsing errors of GBIF response --- sppy/aws/aws_tools.py | 214 ++++++++++++++++++++++++++++-------------- 1 file changed, 143 insertions(+), 71 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 90283fdf..176148a6 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -7,8 +7,9 @@ from botocore.exceptions import ClientError import csv import certifi -import datetime +import datetime as DT from http import HTTPStatus +import json import logging from logging.handlers import RotatingFileHandler import pandas as pd @@ -241,7 +242,7 @@ def create_token(type=None): """ if type is None: type = PROJ_NAME - token = f"{type}_{datetime.datetime.now().timestamp()}" + token = f"{type}_{DT.datetime.now().timestamp()}" return token @@ -252,7 +253,7 @@ def get_today_str(): Returns: date_str(str): string representing date in YYYY-MM-DD format. 
""" - n = datetime.datetime.now() + n = DT.datetime.now() date_str = f"{n.year}_{n.month:02d}_{n.day:02d}" return date_str @@ -264,7 +265,7 @@ def get_current_datadate_str(): Returns: date_str(str): string representing date in YYYY-MM-DD format. """ - n = datetime.datetime.now() + n = DT.datetime.now() date_str = f"{n.year}_{n.month:02d}_01" return date_str @@ -276,7 +277,7 @@ def get_previous_datadate_str(): Returns: date_str(str): string representing date in YYYY-MM-DD format. """ - n = datetime.datetime.now() + n = DT.datetime.now() yr = n.year mo = n.month - 1 if n.month == 0: @@ -754,6 +755,54 @@ def get_dataset_name_citation(dataset_key, certificate=None): return name, citation +# ---------------------------------------------------- +def _query_table(bucket, s3_path, query_str, region=REGION, format="CSV"): + """Query the S3 resource defined for this class. + + Args: + bucket: + s3_path: S3 folder and filename within the bucket + query_str: a SQL query for S3 select. + region: + format: output format, options "CSV" or "JSON" + + Returns: + list of records matching the query + """ + recs = [] + if format not in ("JSON", "CSV"): + format = "JSON" + if format == "JSON": + out_serialization = {"JSON": {}} + elif format == "CSV": + out_serialization = { + "CSV": { + "QuoteFields": "ASNEEDED", + "FieldDelimiter": ",", + "QuoteCharacter": '"'} + } + s3 = boto3.client("s3", region_name=region) + resp = s3.select_object_content( + Bucket=bucket, + Key=s3_path, + ExpressionType="SQL", + Expression=query_str, + InputSerialization={"Parquet": {}}, + OutputSerialization=out_serialization + ) + for event in resp["Payload"]: + if "Records" in event: + recs_str = event["Records"]["Payload"].decode(ENCODING) + rec_strings = recs_str.strip().split("\n") + for rs in rec_strings: + if format == "JSON": + rec = json.loads(rs) + else: + rec = rs.split(",") + recs.append(rec) + return recs + + # ............................................... def _parse_records(ret_records, keys): small_recs = [] @@ -805,7 +854,8 @@ def _get_single_record(url, keys, certificate=None): # ............................................... 
def _get_records(url, keys, certificate=None): small_recs = [] - status_code = 0 + status_code = None + is_end = count = None try: if certificate: response = requests.get(url, verify=certificate) @@ -837,6 +887,11 @@ def _get_records(url, keys, certificate=None): is_end = output["endOfRecords"] except KeyError: print("Missing endOfRecords flag") + # Expected count + try: + is_end = output["count"] + except KeyError: + print("Missing count") # Get values from JSON response try: ret_records = output["results"] @@ -846,7 +901,7 @@ def _get_records(url, keys, certificate=None): small_recs = _parse_records(ret_records, keys) if not small_recs: print(f"No records returned, status {status_code}, reason {reason}") - return small_recs, is_end + return small_recs, is_end, count # ---------------------------------------------------- def create_dataframe_from_api(base_url, response_keys, output_columns): @@ -869,7 +924,8 @@ def create_dataframe_from_api(base_url, response_keys, output_columns): certificate = certifi.where() while is_end is False: url = f"{base_url}?offset={offset}&limit={limit}" - small_recs, is_end = _get_records(url, response_keys, certificate=certificate) + small_recs, is_end, count = _get_records( + url, response_keys, certificate=certificate) all_recs.extend(small_recs) offset += limit if offset % 5000 == 0: @@ -900,15 +956,15 @@ def create_csvfiles_from_api( csv_files = [] records = [] is_end = False - offset = 0 + offset = 7000 read_limit = 500 write_limit = 5000 - write_header = True certificate = certifi.where() while is_end is False: url = f"{base_url}?offset={offset}&limit={read_limit}" print(url) - small_recs, is_end = _get_records(url, response_keys, certificate=certificate) + small_recs, is_end, count = _get_records( + url, response_keys, certificate=certificate) if small_recs: records.extend(small_recs) offset += read_limit @@ -916,12 +972,11 @@ def create_csvfiles_from_api( if offset % write_limit == 0: dataframe = pd.DataFrame(records, columns=output_columns) tmp_filename = f"/tmp/{output_fname}_{offset}.csv" + # Only write header to first file (offset == 0), others will be appended dataframe.to_csv( - path_or_buf=tmp_filename, sep='\t', header=write_header, + path_or_buf=tmp_filename, sep='\t', header=(offset == 0), columns=output_columns, doublequote=False, escapechar="\\", encoding=encoding) - # Only write header to first file, others will be appended - write_header = False csv_files.append(tmp_filename) print(f"Wrote {tmp_filename} with {dataframe.shape[0]} rows") # reset records in memory @@ -988,35 +1043,35 @@ def write_dataframe_to_s3( print(f"Uploaded to s3://{bucket}/{output_path}") -# ---------------------------------------------------- -def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding=ENCODING): - """Query the GBIF Dataset API, write a subset of the response to a table in S3. - - Args: - bucket: name of the bucket containing the CSV data. - s3_folders: S3 bucket folders for output lookup table - region: AWS region containing the destination bucket. - encoding: encoding of the input data - - Note: - There are >100k records for datasets and limited memory on this EC2 instance, - so we write them as temporary CSV files, then combine them, then create a - dataframe and upload. 
- - Postcondition: - CSV table with dataset key, pubOrgKey, dataset name, dataset citation written - to the named S3 object in bucket and folders - """ - base_url = "https://api.gbif.org/v1/dataset" - response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] - data_date = get_current_datadate_str() - output_fname = f"dataset_name_{data_date}" - output_fname = "dataset_name_2024_02_01" - output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] - csv_fnames = create_csvfiles_from_api( - base_url, response_keys, output_columns, output_fname) - write_csvfiles_to_s3( - csv_fnames, bucket, s3_folders, output_fname, region=region, encoding=encoding) +# # ---------------------------------------------------- +# def create_s3_dataset_lookup(bucket, s3_folders, region=REGION, encoding=ENCODING): +# """Query the GBIF Dataset API, write a subset of the response to a table in S3. +# +# Args: +# bucket: name of the bucket containing the CSV data. +# s3_folders: S3 bucket folders for output lookup table +# region: AWS region containing the destination bucket. +# encoding: encoding of the input data +# +# Note: +# There are >100k records for datasets and limited memory on this EC2 instance, +# so we write them as temporary CSV files, then combine them, then create a +# dataframe and upload. +# +# Postcondition: +# CSV table with dataset key, pubOrgKey, dataset name, dataset citation written +# to the named S3 object in bucket and folders +# """ +# base_url = "https://api.gbif.org/v1/dataset" +# response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] +# data_date = get_current_datadate_str() +# output_fname = f"dataset_name_{data_date}" +# output_fname = "dataset_name_2024_02_01" +# output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] +# csv_fnames = create_csvfiles_from_api( +# base_url, response_keys, output_columns, output_fname) +# write_csvfiles_to_s3( +# csv_fnames, bucket, s3_folders, output_fname, region=region, encoding=encoding) # ---------------------------------------------------- @@ -1045,8 +1100,9 @@ def create_s3_organization_lookup(bucket, s3_folders, region=REGION, encoding=EN # ---------------------------------------------------- -def create_csvfile_from_api( - base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING): +def create_csvfiles_from_apiqueries( + base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING, + certificate=None): """Query an API, read the data and write a subset to a table in S3. Args: @@ -1058,28 +1114,37 @@ def create_csvfile_from_api( output_columns: list of column headings for output lookup table output_fname: base output filename for temporary CSV files encoding: encoding of the input data + certificate: local SSL certificate required by some APIs Returns: csv_files: Local CSV files with records. The first file in the list will have a header, the rest will not. 
""" + tmp_filenames = [] records = [] - certificate = certifi.where() - for key in keys: - url = f"{base_url}/{key}" + write_chunk = 1000 + for i in range(len(keys)): + url = f"{base_url}/{keys[i]}" rec = _get_single_record(url, response_keys, certificate=certificate) records.append(rec) - dataframe = pd.DataFrame(records, columns=output_columns) - tmp_filename = f"/tmp/{output_fname}.csv" - dataframe.to_csv( - path_or_buf=tmp_filename, sep='\t', header=True, - columns=output_columns, doublequote=False, escapechar="\\", - encoding=encoding) - print(f"Wrote {tmp_filename} with {dataframe.shape[0]} rows") - return tmp_filename + if i % 100 == 0: + print(f"{DT.datetime.now().isoformat()} Queried key {i} of {len(keys)}") + if i % write_chunk == 0 and i > 0: + print(f"{DT.datetime.now().isoformat()} Queried key {i} of {len(keys)}") + dataframe = pd.DataFrame(records, columns=output_columns) + tmp_filename = f"/tmp/{output_fname}_{i}.csv" + dataframe.to_csv( + path_or_buf=tmp_filename, sep='\t', header=True, + columns=output_columns, doublequote=False, escapechar="\\", + encoding=encoding) + print(f"Wrote {tmp_filename} with {dataframe.shape[0]} rows") + records = [] + tmp_filenames.append(tmp_filename) + return tmp_filenames # ---------------------------------------------------- -def create_test_s3_dataset_lookup(bucket, s3_folders, keys, region=REGION, encoding=ENCODING): +def create_s3_dataset_lookup_by_keys( + bucket, s3_folders, region=REGION, encoding=ENCODING): """Query the GBIF Dataset API, write a subset of the response to a table in S3. Args: @@ -1098,18 +1163,24 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, keys, region=REGION, encod CSV table with dataset key, pubOrgKey, dataset name, dataset citation written to the named S3 object in bucket and folders """ + input_fname = "dataset_counts_2024_02_01_000.parquet" + s3_path = f"{s3_folders}/{input_fname}" + query_str = "SELECT datasetkey from s3object s" + key_records = _query_table(bucket, s3_path, query_str, format="CSV") + keys = [r[0] for r in key_records] + base_url = "https://api.gbif.org/v1/dataset" response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] data_date = get_current_datadate_str() - output_fname = f"dataset_name_{data_date}_" - output_fname = "dataset_name_test_2024_02_01_" + output_fname = f"dataset_name_{data_date}" + output_fname = "dataset_name_test_2024_02_01" output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] certificate = certifi.where() - csv_fname = create_csvfile_from_api( + csv_fnames = create_csvfiles_from_apiqueries( base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING, certificate=certificate) write_csvfiles_to_s3( - [csv_fname], bucket, s3_folders, output_fname, region=region, encoding=encoding) + csv_fnames, bucket, s3_folders, output_fname, region=region, encoding=encoding) # ............................................................................. 
if __name__ == "__main__": @@ -1117,14 +1188,15 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, keys, region=REGION, encod region=REGION encoding=ENCODING s3_folders="summary" - keys = [ - "5a95fa0a-5ef3-432a-b95b-816cd85b2f9b", - "ee789ae4-ef51-4ff2-931b-bc61b2dbe40e", - "c8fded56-3ddb-4e26-8863-ba8d55862689", - "3c83d5da-822a-439c-897a-7569e82c4ebc" - ] - - create_s3_dataset_lookup(bucket, s3_folders) + # keys = [ + # "5a95fa0a-5ef3-432a-b95b-816cd85b2f9b", + # "ee789ae4-ef51-4ff2-931b-bc61b2dbe40e", + # "c8fded56-3ddb-4e26-8863-ba8d55862689", + # "3c83d5da-822a-439c-897a-7569e82c4ebc" + # ] + create_s3_dataset_lookup_by_keys( + bucket, s3_folders, region=REGION, encoding=ENCODING) + # create_s3_dataset_lookup(bucket, s3_folders) # create_test_s3_dataset_lookup(bucket, s3_folders, keys) # create_s3_organization_lookup( # bucket, s3_folders, region=REGION, encoding=ENCODING) @@ -1140,6 +1212,6 @@ def create_test_s3_dataset_lookup(bucket, s3_folders, keys, region=REGION, encod encoding=ENCODING s3_folders="summary" - -create_s3_dataset_lookup(bucket, s3_folders) +create_s3_dataset_lookup_by_keys( + bucket, s3_folders, region=REGION, encoding=ENCODING) """ From 36e59dde6b04061b7bc261015d8639afee282723 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 1 Apr 2024 10:09:55 -0500 Subject: [PATCH 75/81] make sure there are records before creating dataframe --- sppy/aws/aws_tools.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 176148a6..4c9b9506 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -1126,20 +1126,24 @@ def create_csvfiles_from_apiqueries( for i in range(len(keys)): url = f"{base_url}/{keys[i]}" rec = _get_single_record(url, response_keys, certificate=certificate) - records.append(rec) - if i % 100 == 0: - print(f"{DT.datetime.now().isoformat()} Queried key {i} of {len(keys)}") + if rec: + records.append(rec) if i % write_chunk == 0 and i > 0: - print(f"{DT.datetime.now().isoformat()} Queried key {i} of {len(keys)}") - dataframe = pd.DataFrame(records, columns=output_columns) - tmp_filename = f"/tmp/{output_fname}_{i}.csv" - dataframe.to_csv( - path_or_buf=tmp_filename, sep='\t', header=True, - columns=output_columns, doublequote=False, escapechar="\\", - encoding=encoding) - print(f"Wrote {tmp_filename} with {dataframe.shape[0]} rows") - records = [] - tmp_filenames.append(tmp_filename) + print( + f"{DT.datetime.now().isoformat()} Create dataframe for {len(records)} " + f"records; key {i} of {len(keys)}") + if records: + dataframe = pd.DataFrame(records, columns=output_columns) + tmp_filename = f"/tmp/{output_fname}_{i}.csv" + dataframe.to_csv( + path_or_buf=tmp_filename, sep='\t', header=(i < write_chunk+1), + columns=output_columns, doublequote=False, escapechar="\\", + encoding=encoding) + print( + f"Wrote {tmp_filename} with {len(records)} records and " + f"{dataframe.shape[0]} rows") + records = [] + tmp_filenames.append(tmp_filename) return tmp_filenames # ---------------------------------------------------- From b569dd04b7abcd16035fd6ae08c8f51a0df60883 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 1 Apr 2024 11:40:38 -0500 Subject: [PATCH 76/81] check for existence of dataset metadata before trying to use --- sppy/aws/aws_tools.py | 8 +++---- sppy/tools/provider/spnet.py | 43 ++++++++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index 
4c9b9506..ae85c23e 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -1065,8 +1065,8 @@ def write_dataframe_to_s3( # base_url = "https://api.gbif.org/v1/dataset" # response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] # data_date = get_current_datadate_str() -# output_fname = f"dataset_name_{data_date}" -# output_fname = "dataset_name_2024_02_01" +# output_fname = f"dataset_meta_{data_date}" +# output_fname = "dataset_meta_2024_02_01" # output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] # csv_fnames = create_csvfiles_from_api( # base_url, response_keys, output_columns, output_fname) @@ -1176,8 +1176,8 @@ def create_s3_dataset_lookup_by_keys( base_url = "https://api.gbif.org/v1/dataset" response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] data_date = get_current_datadate_str() - output_fname = f"dataset_name_{data_date}" - output_fname = "dataset_name_test_2024_02_01" + output_fname = f"dataset_meta_{data_date}" + output_fname = "dataset_meta_test_2024_02_01" output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] certificate = certifi.where() csv_fnames = create_csvfiles_from_apiqueries( diff --git a/sppy/tools/provider/spnet.py b/sppy/tools/provider/spnet.py index 4a213153..fde4a41c 100644 --- a/sppy/tools/provider/spnet.py +++ b/sppy/tools/provider/spnet.py @@ -28,33 +28,57 @@ def __init__( self.region = region self.encoding = encoding self.exp_type = 'SQL' - datestr = get_current_datadate_str() - datestr = "2024_02_01" + self.datestr = get_current_datadate_str() + self.datestr = "2024_02_01" self._summary_path = "summary" self._summary_tables = { "dataset_counts": { - "fname": f"dataset_counts_{datestr}_000.parquet", + "fname": f"dataset_counts_{self.datestr}_000.parquet", "fields": ["datasetkey", "occ_count", "species_count"], "key": "datasetkey" }, "dataset_species_lists": { - "fname": f"dataset_lists_{datestr}_000.parquet", + "fname": f"dataset_lists_{self.datestr}_000.parquet", "fields": ["datasetkey", "taxonkey", "species", "occ_count"], "key": "datasetkey" }, "dataset_meta": { - "fname": f"dataset_names_{datestr}_000.csv", + "fname": f"dataset_meta_{self.datestr}.csv", "fields": [ "datasetKey", "publishingOrganizationKey", "title", "citation"], "key": "datasetKey" }, "organization_meta": { - "fname": f"organization_names_{datestr}_000.csv", + "fname": f"organization_meta_{self.datestr}.csv", "fields": ["publishingOrganizationKey", "title"], "key": "publishingOrganizationKey" } } + # ---------------------------------------------------- + def _list_summaries(self): + summary_objs = [] + s3 = boto3.client("s3", region_name=self.region) + summ_objs = s3.list_objects_v2(Bucket=self.bucket, Prefix=self._summary_path) + prefix = f"{self._summary_path}/" + try: + contents = summ_objs["Contents"] + except KeyError: + pass + else: + for item in contents: + fname = item["Key"].strip(prefix) + if len(fname) > 1: + summary_objs.append(fname) + return summary_objs + + # ---------------------------------------------------- + def _dataset_metadata_exists(self): + fnames = self._list_summaries() + if self._summary_tables["dataset_meta"]["fname"] in fnames: + return True + return False + # ---------------------------------------------------- def _query_table(self, s3_path, query_str, format="CSV"): """Query the S3 resource defined for this class. 
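# A minimal sketch of the metadata-existence check added above, assuming only
# boto3 and read access to the bucket: list the keys under the summary prefix
# and test whether the expected dataset_meta file is among them. The bucket,
# prefix, and filename below are illustrative, not the project's configuration.
import boto3

def summary_object_exists(bucket, prefix, filename, region="us-east-1"):
    s3 = boto3.client("s3", region_name=region)
    # list_objects_v2 returns up to 1,000 keys per call; the "Contents" key is
    # absent when nothing matches the prefix.
    response = s3.list_objects_v2(Bucket=bucket, Prefix=f"{prefix}/")
    keys = [item["Key"] for item in response.get("Contents", [])]
    return f"{prefix}/{filename}" in keys

# Example: summary_object_exists(
#     "specnet-bucket", "summary", "dataset_meta_2024_02_01.csv")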
@@ -166,7 +190,8 @@ def get_dataset_counts(self, dataset_key, format="JSON"): ) # Returns empty list or list of 1 record records = self._query_table(table_path, query_str, format=format) - # self.add_dataset_lookup_vals(records, key_idx=key_idx) + if self._dataset_metadata_exists(): + self.add_dataset_lookup_vals(records, key_idx=key_idx) return records # ---------------------------------------------------- @@ -206,7 +231,6 @@ def add_dataset_lookup_vals(self, records, key_idx=0, format="JSON"): rec.update(meta) else: rec.extend(meta) - return records # # ---------------------------------------------------- # def get_org_counts(self, pub_org_key): @@ -253,7 +277,8 @@ def rank_dataset_counts(self, count_by, order, limit, format="JSON"): except Exception as e: errors = {"error": [get_traceback()]} - # self.add_dataset_lookup_vals(records, key_idx=key_idx) + if self._dataset_metadata_exists(): + self.add_dataset_lookup_vals(records, key_idx=key_idx) return records, errors # ............................................................................. From a1ff9943a758e542fa864e87474cf703f75c97f8 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 1 Apr 2024 11:48:37 -0500 Subject: [PATCH 77/81] pandas.read_parquet requires pyarrow (or fastparquet) --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 6687eff1..a63db974 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ awscli boto3>=1.34.60 sqlalchemy pandas +pyarrow \ No newline at end of file From 5e814da2db1d16b6095325788bffa2aa02648846 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 1 Apr 2024 12:39:25 -0500 Subject: [PATCH 78/81] upgrade pip to fix failing pyarrow install --- Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile b/Dockerfile index 9c17826d..7c40e2b7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,7 +20,10 @@ USER specify COPY --chown=specify:specify ./requirements.txt . +RUN pip3 install --upgrade pip + RUN python3 -m venv venv \ + && venv/bin/pip install --upgrade pip \ && venv/bin/pip install --no-cache-dir -r ./requirements.txt COPY --chown=specify:specify ./sppy ./sppy From 7fe5b1cfc847ffde6815df96a285095bd27f0cfc Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 1 Apr 2024 14:42:08 -0500 Subject: [PATCH 79/81] replace failing pyarrow dependency build with fastparquet; upgrade python docker image --- Dockerfile | 4 +--- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7c40e2b7..8fe67c1d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ # syntax=docker/dockerfile:1 # ........................................................ # Backend base image -FROM python:3.10.0rc2-alpine3.14 as base +FROM python:3.12.2-alpine3.19 as base LABEL maintainer="Specify Collections Consortium " @@ -20,8 +20,6 @@ USER specify COPY --chown=specify:specify ./requirements.txt . 
-RUN pip3 install --upgrade pip - RUN python3 -m venv venv \ && venv/bin/pip install --upgrade pip \ && venv/bin/pip install --no-cache-dir -r ./requirements.txt diff --git a/requirements.txt b/requirements.txt index a63db974..e9944230 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,6 @@ gunicorn==20.1.0 rtree>=1.0.0 awscli boto3>=1.34.60 +fastparquet sqlalchemy pandas -pyarrow \ No newline at end of file From cf550a1e8cd71fd78afc804d8878a9131d85664d Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 1 Apr 2024 14:42:37 -0500 Subject: [PATCH 80/81] add testing flag --- sppy/aws/aws_tools.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/sppy/aws/aws_tools.py b/sppy/aws/aws_tools.py index ae85c23e..8380b173 100644 --- a/sppy/aws/aws_tools.py +++ b/sppy/aws/aws_tools.py @@ -1148,13 +1148,12 @@ def create_csvfiles_from_apiqueries( # ---------------------------------------------------- def create_s3_dataset_lookup_by_keys( - bucket, s3_folders, region=REGION, encoding=ENCODING): + bucket, s3_folders, region=REGION, encoding=ENCODING, is_test=False): """Query the GBIF Dataset API, write a subset of the response to a table in S3. Args: bucket: name of the bucket containing the CSV data. s3_folders: S3 bucket folders for output lookup table - keys: unique identifiers to query the API for region: AWS region containing the destination bucket. encoding: encoding of the input data @@ -1167,22 +1166,33 @@ def create_s3_dataset_lookup_by_keys( CSV table with dataset key, pubOrgKey, dataset name, dataset citation written to the named S3 object in bucket and folders """ - input_fname = "dataset_counts_2024_02_01_000.parquet" - s3_path = f"{s3_folders}/{input_fname}" - query_str = "SELECT datasetkey from s3object s" - key_records = _query_table(bucket, s3_path, query_str, format="CSV") - keys = [r[0] for r in key_records] + # Current filenames + data_date = get_current_datadate_str() + data_date = "2024_02_01" + input_fname = f"dataset_counts_{data_date}_000.parquet" + output_fname = f"dataset_meta_{data_date}" + # Data and query parameters base_url = "https://api.gbif.org/v1/dataset" response_keys = ["key", "publishingOrganizationKey", "title", ["citation", "text"]] - data_date = get_current_datadate_str() - output_fname = f"dataset_meta_{data_date}" - output_fname = "dataset_meta_test_2024_02_01" output_columns = ["datasetKey", "publishingOrganizationKey", "title", "citation"] certificate = certifi.where() + + # Get keys for dataset resolution + s3_path = f"{s3_folders}/{input_fname}" + query_str = "SELECT datasetkey from s3object s" + key_records = _query_table(bucket, s3_path, query_str, format="CSV") + keys = [r[0] for r in key_records] + if is_test: + keys = keys[:2100] + output_fname = f"dataset_meta_test_{data_date}" + + # Write tempfiles locally csv_fnames = create_csvfiles_from_apiqueries( base_url, keys, response_keys, output_columns, output_fname, encoding=ENCODING, certificate=certificate) + + # Aggregate and write all records to S3 write_csvfiles_to_s3( csv_fnames, bucket, s3_folders, output_fname, region=region, encoding=encoding) @@ -1210,6 +1220,7 @@ def create_s3_dataset_lookup_by_keys( # Note: Test with quoted data such as: # http://api.gbif.org/v1/dataset/3c83d5da-822a-439c-897a-7569e82c4ebc from sppy.aws.aws_tools import * +from sppy.aws.aws_tools import _query_table bucket=PROJ_BUCKET region=REGION @@ -1217,5 +1228,7 @@ def create_s3_dataset_lookup_by_keys( s3_folders="summary" 
create_s3_dataset_lookup_by_keys( - bucket, s3_folders, region=REGION, encoding=ENCODING) + bucket, s3_folders, region=REGION, encoding=ENCODING, is_test=False) + + """ From 759f21ed1cbeb2e21b8aa789c3d0a245d65f44d5 Mon Sep 17 00:00:00 2001 From: Aimee Stewart Date: Mon, 1 Apr 2024 14:43:55 -0500 Subject: [PATCH 81/81] pandas/fastparquet requires s3fs --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index e9944230..8a96d76e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ gunicorn==20.1.0 rtree>=1.0.0 awscli boto3>=1.34.60 +s3fs fastparquet sqlalchemy pandas
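Taken together, the last few dependency patches describe the pandas read path for the S3 summary tables: pandas.read_parquet needs a parquet engine (the pyarrow build failed in the Docker image, so fastparquet replaces it), and opening an s3:// URL directly needs the s3fs filesystem layer. Below is a minimal sketch of that read path, assuming AWS credentials come from the environment and using an illustrative bucket name rather than the project's configuration.

import pandas as pd

# fastparquet supplies the parquet engine; s3fs lets pandas open "s3://" URLs
# directly, so no explicit boto3 download is needed.
df = pd.read_parquet(
    "s3://specnet-bucket/summary/dataset_counts_2024_02_01_000.parquet",
    engine="fastparquet",
)
# Columns listed for this table earlier in the series:
# ["datasetkey", "occ_count", "species_count"]
print(df.head())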