diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..eb50fa2 --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,76 @@ +name: Deploy ingestor + +on: + push: + branches: + - version/aeolus + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Configure aws credentials + uses: aws-actions/configure-aws-credentials@master + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_PROD }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY_PROD }} + aws-region: ${{ secrets.AWS_REGION }} + + - name: Get envionmental values + uses: aws-actions/aws-secretsmanager-get-secrets@v2 + with: + secret-ids: | + AEOLUS, openaq-env/aeolus + name-transformation: uppercase + parse-json-secrets: true + + - uses: actions/setup-node@v4 + with: + node-version: "20" + + + - name: Install CDK + run: | + npm install -g aws-cdk + + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install Poetry + uses: snok/install-poetry@v1 + + - name: Deploy stack + env: + ENV: "aeolus" + PROJECT: "openaq" + + ## deployment variables + # CDK_ACCOUNT: ${{ secrets.CDK_ACCOUNT }} + # CDK_REGION: ${{ secrets.CDK_REGION }} + + VPC_ID: ${{ env.AEOLUS_VPC_ID }} + + TOPIC_ARN: ${{ env.AEOLUS_FETCH_OBJECT_TOPIC_ARN }} + + ## application variables + DATABASE_READ_USER: ${{ env.AEOLUS_DATABASE_READ_USER }} + DATABASE_READ_PASSWORD: ${{ env.AEOLUS_DATABASE_READ_PASSWORD }} + DATABASE_WRITE_USER: ${{ env.AEOLUS_DATABASE_WRITE_USER }} + DATABASE_WRITE_PASSWORD: ${{ env.AEOLUS_DATABASE_WRITE_PASSWORD }} + DATABASE_DB: ${{ env.AEOLUS_DATABASE_DB }} + DATABASE_HOST: ${{ env.AEOLUS_DATABASE_HOST }} + DATABASE_PORT: ${{ env.AEOLUS_DATABASE_PORT }} + FETCH_BUCKET: ${{ env.AEOLUS_FETCH_BUCKET }} + ETL_BUCKET: ${{ env.AEOLUS_FETCH_BUCKET }} + PAUSE_INGESTING: False + + + working-directory: ./cdk + run: | + poetry self add poetry-plugin-export + poetry install + cdk deploy openaq-ingest-aeolus --require-approval never diff --git a/README.md b/README.md index d90d3e4..a5a4068 100644 --- a/README.md +++ b/README.md @@ -2,3 +2,5 @@ # Testing a realtime file + +# Testing files diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000..e58dbda --- /dev/null +++ b/benchmark.py @@ -0,0 +1,95 @@ +import logging +import os +import sys +import argparse +from time import time +import re + +logger = logging.getLogger(__name__) + +parser = argparse.ArgumentParser( + description=""" +Test benchmarks for ingestion + """) + +parser.add_argument( + '--name', + type=str, + required=False, + default="4xlarge", + help='Name to use for the test' + ) +parser.add_argument( + '--env', + type=str, + default='.env', + required=False, + help='The dot env file to use' + ) +parser.add_argument( + '--debug', + action="store_true", + help='Output at DEBUG level' + ) +args = parser.parse_args() + +if 'DOTENV' not in os.environ.keys() and args.env is not None: + os.environ['DOTENV'] = args.env + +if args.debug: + os.environ['LOG_LEVEL'] = 'DEBUG' + +from ingest.settings import settings +from fake import config, get_locations, as_realtime +from ingest.fetch import load_realtime + +logging.basicConfig( + format='[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s', + level=settings.LOG_LEVEL.upper(), + force=True, +) + +f = open(f"benchmark_ingest_output_{args.name}.csv", "w") 
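A note on ordering in benchmark.py above: DOTENV and LOG_LEVEL are pushed into os.environ before the "from ingest.settings import settings" import because the settings object is constructed at import time and reads its env file then. The real ingest/settings.py is not shown in this diff, so the following is only a sketch of that pattern, modeled on the cdk/config.py changes further down:

    # Sketch only; assumes ingest/settings.py follows the same pydantic-settings
    # pattern as cdk/config.py in this diff. Settings() is built when the module
    # is first imported, so DOTENV, LOG_LEVEL, etc. must already be in os.environ.
    import os
    from pydantic_settings import BaseSettings, SettingsConfigDict

    class Settings(BaseSettings):
        LOG_LEVEL: str = "INFO"        # benchmark.py flips this to DEBUG via --debug
        USE_TEMP_TABLES: bool = True   # check.py --keep turns this off further down

        model_config = SettingsConfigDict(
            extra="ignore",
            env_file=os.environ.get("DOTENV", ".env"),
            env_file_encoding="utf-8",
        )

    settings = Settings()  # evaluated once, on first import

With that ordering, a run such as "python benchmark.py --name 4xlarge --env .env.benchmark --debug" (file names illustrative) picks up the intended environment before any ingest code loads, then writes one CSV row per generated key in the loop that follows.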
+f.writelines("name,key,locations,inserted_nodes,updated_nodes,total_meas,inserted_meas,ingest_time,process_time,log_time,copy_time,load_process_time\n") +n = 10 +locations = [50, 250, 1000] +keys = [] +ii = 1 + +## make a set of files +for r in locations: + for i in range(n): + config(source=f"benchmark-test-{r}-{i+1}", gz=True) + l = get_locations(n=r) + key = as_realtime(l["locations"], l["latitude"], l["longitude"]) + keys.append({ "key": key, "locations": len(l["locations"]) }) + ii=+1 + + +## ingest each of the +for i, k in enumerate(keys): + key = k["key"] + locations = k["locations"] + logger.info(f"Ingesting {i+1} of {len(keys)}: {key} with {locations} locations") + + start_time = time() + copy_time, load_process_time, log_time, notice = load_realtime([ + (-1, key, None) + ]) + m = re.findall('([a-z-]+): (.+?),', notice) + + process_time = round(float(m[17][1])) + total_meas = int(m[0][1]) + inserted_meas = int(m[9][1]) + updated_nodes = int(m[8][1]) + inserted_nodes = int(m[11][1]) + ingest_time = round((time() - start_time)*1000) + f.writelines(f"'{args.name}','{key}',{locations},{inserted_nodes},{updated_nodes},{total_meas},{inserted_meas},{ingest_time},{process_time},{log_time},{copy_time},{load_process_time}\n") + + logger.info( + "loaded realtime records, timer: %0.4f, process: %0.4f", + ingest_time, process_time + ) + + +f.close() diff --git a/cdk/app.py b/cdk/app.py index 8318018..e0b9a04 100644 --- a/cdk/app.py +++ b/cdk/app.py @@ -3,6 +3,7 @@ Environment, Tags, ) +import os from lambda_ingest_stack import LambdaIngestStack @@ -19,16 +20,23 @@ app = aws_cdk.App() +env = Environment( + account=os.environ['CDK_DEFAULT_ACCOUNT'], + region=os.environ['CDK_DEFAULT_REGION'] + ) + ingest = LambdaIngestStack( app, f"openaq-ingest-{settings.ENV}", env_name=settings.ENV, lambda_env=lambda_env, fetch_bucket=settings.FETCH_BUCKET, - ingest_lambda_timeout=settings.INGEST_LAMBDA_TIMEOUT, - ingest_lambda_memory_size=settings.INGEST_LAMBDA_MEMORY_SIZE, - ingest_rate_minutes=settings.INGEST_RATE_MINUTES, + vpc_id=settings.VPC_ID, + lambda_timeout=settings.LAMBDA_TIMEOUT, + lambda_memory_size=settings.LAMBDA_MEMORY_SIZE, + rate_minutes=settings.RATE_MINUTES, topic_arn=settings.TOPIC_ARN, + env=env, ) Tags.of(ingest).add("project", settings.PROJECT) diff --git a/cdk/cdk.json b/cdk/cdk.json index f1770f9..76af4a2 100644 --- a/cdk/cdk.json +++ b/cdk/cdk.json @@ -1,5 +1,5 @@ { - "app": "python3.8 app.py", + "app": "poetry run python app.py", "context": { "aws-cdk:enableDiffNoFail": "true", "@aws-cdk/core:stackRelativeExports": "true", diff --git a/cdk/config.py b/cdk/config.py index ccae88d..03cb150 100644 --- a/cdk/config.py +++ b/cdk/config.py @@ -1,5 +1,8 @@ from typing import List -from pydantic import BaseSettings +from pydantic_settings import ( + BaseSettings, + SettingsConfigDict, + ) from pathlib import Path from os import environ @@ -8,18 +11,17 @@ class Settings(BaseSettings): FETCH_BUCKET: str ENV: str = "staging" PROJECT: str = "openaq" - INGEST_LAMBDA_TIMEOUT: int = 900 - INGEST_LAMBDA_MEMORY_SIZE: int = 1536 - INGEST_RATE_MINUTES: int = 15 + LAMBDA_TIMEOUT: int = 900 + LAMBDA_MEMORY_SIZE: int = 1536 + RATE_MINUTES: int = 15 LOG_LEVEL: str = 'INFO' TOPIC_ARN: str = None + VPC_ID: str = None - class Config: - parent = Path(__file__).resolve().parent.parent - if 'DOTENV' in environ: - env_file = Path.joinpath(parent, environ['DOTENV']) - else: - env_file = Path.joinpath(parent, ".env") + + model_config = SettingsConfigDict( + extra="ignore", env_file=f"../{environ.get('DOTENV', 
'.env')}", env_file_encoding="utf-8" + ) settings = Settings() diff --git a/cdk/lambda_ingest_stack.py b/cdk/lambda_ingest_stack.py index 3b2e380..b12d179 100644 --- a/cdk/lambda_ingest_stack.py +++ b/cdk/lambda_ingest_stack.py @@ -2,8 +2,10 @@ from typing import Dict from aws_cdk import ( + Environment, aws_lambda, aws_s3, + aws_ec2, Stack, Duration, aws_events, @@ -24,18 +26,23 @@ def __init__( self, scope: Construct, id: str, + env: Environment, env_name: str, lambda_env: Dict, fetch_bucket: str, - ingest_lambda_timeout: int, - ingest_lambda_memory_size: int, - ingest_rate_minutes: int = 15, + lambda_timeout: int, + lambda_memory_size: int, + rate_minutes: int = 15, topic_arn: str = None, + vpc_id: str = None, **kwargs, ) -> None: """Lambda plus cronjob to ingest metadata, realtime and pipeline data""" - super().__init__(scope, id, *kwargs) + super().__init__(scope, id, env=env,*kwargs) + + if vpc_id is not None: + vpc_id = aws_ec2.Vpc.from_lookup(self, f"{id}-vpc", vpc_id=vpc_id) ingest_function = aws_lambda.Function( self, @@ -58,11 +65,12 @@ def __init__( ], ), handler="ingest.handler.handler", - runtime=aws_lambda.Runtime.PYTHON_3_8, + vpc=vpc_id, + runtime=aws_lambda.Runtime.PYTHON_3_12, allow_public_subnet=True, - memory_size=ingest_lambda_memory_size, + memory_size=lambda_memory_size, environment=stringify_settings(lambda_env), - timeout=Duration.seconds(ingest_lambda_timeout), + timeout=Duration.seconds(lambda_timeout), layers=[ create_dependencies_layer( self, @@ -81,12 +89,12 @@ def __init__( # Set how often the ingester will run # If 0 the ingester will not run automatically - if ingest_rate_minutes > 0: + if rate_minutes > 0: aws_events.Rule( self, f"{id}-ingest-event-rule", schedule=aws_events.Schedule.cron( - minute=f"0/{ingest_rate_minutes}" + minute=f"0/{rate_minutes}" ), targets=[ aws_events_targets.LambdaFunction(ingest_function), diff --git a/cdk/requirements.txt b/cdk/requirements.txt deleted file mode 100644 index f44b370..0000000 --- a/cdk/requirements.txt +++ /dev/null @@ -1,14 +0,0 @@ -attrs==21.4.0 -aws-cdk-lib==2.3.0 -aws-cdk.aws-apigatewayv2-alpha==2.3.0a0 -aws-cdk.aws-apigatewayv2-integrations-alpha==2.3.0a0 -cattrs==22.1.0 -constructs==10.1.16 -exceptiongroup==1.0.0rc7 -jsii==1.59.0 -publication==0.0.3 -pydantic==1.9.1 -python-dateutil==2.8.2 -python-dotenv==0.20.0 -six==1.16.0 -typing_extensions==4.2.0 diff --git a/cdk/utils.py b/cdk/utils.py index 42bea63..1e7cec4 100644 --- a/cdk/utils.py +++ b/cdk/utils.py @@ -19,20 +19,22 @@ def create_dependencies_layer( function_name: str, requirements_path: Path ) -> aws_lambda.LayerVersion: - requirements_file = str(requirements_path.resolve()) + #requirements_file = str(requirements_path.resolve()) output_dir = f'../.build/{function_name}' layer_id = f'openaq-{function_name}-{env_name}-dependencies' - if not environ.get('SKIP_PIP'): - print(f'Building {layer_id} from {requirements_file} into {output_dir}') + if not environ.get('SKIP_BUILD'): + print(f'Building {layer_id} into {output_dir}') subprocess.run( - f"""python3.8 -m pip install -qq -r {requirements_file} \ + f""" + poetry export --without=cdk -o requirements.txt --without-hashes && \ + poetry run python -m pip install -qq -r requirements.txt \ -t {output_dir}/python && \ cd {output_dir}/python && \ find . -type f -name '*.pyc' | \ while read f; do n=$(echo $f | \ sed 's/__pycache__\///' | \ - sed 's/.cpython-[2-3] [0-9]//'); \ + sed 's/.cpython-[2-3][0-9]//'); \ cp $f $n; \ done \ && find . 
-type d -a -name '__pycache__' -print0 | xargs -0 rm -rf \ @@ -47,5 +49,5 @@ def create_dependencies_layer( self, layer_id, code=layer_code, - compatible_runtimes=[aws_lambda.Runtime.PYTHON_3_8] + compatible_runtimes=[aws_lambda.Runtime.PYTHON_3_12] ) diff --git a/check.py b/check.py index 22976cf..71020fc 100644 --- a/check.py +++ b/check.py @@ -1,9 +1,11 @@ import argparse import logging import os -import json +import sys +import orjson +import psycopg2 -logger = logging.getLogger(__name__) +logger = logging.getLogger('check.py') #os.chdir('/home/christian/git/caparker/openaq-ingestor/ingest') #print(os.getcwd()) @@ -16,6 +18,12 @@ """) parser.add_argument('--id', type=int, required=False, help='The fetchlogs_id value') +parser.add_argument('--file', type=str, required=False, + help='A local file to load') +parser.add_argument('--batch', type=str, required=False, + help='The batch id value. Loads files based on batch uuid.') +parser.add_argument('--pattern', type=str, required=False, + help='A reqex to match keys for loading') parser.add_argument('--env', type=str, required=False, help='The dot env file to use') parser.add_argument('--profile', type=str, required=False, @@ -24,11 +32,11 @@ help="""Either the number of entries to list (sorted by date) or the number of days to go back if using the summary or rejects arguments""") -parser.add_argument('--pipeline', type=int, required=False, default=1, +parser.add_argument('--pipeline', type=int, required=False, default=0, help="""The number of pipeline files to load at a time""") -parser.add_argument('--metadata', type=int, required=False, default=1, +parser.add_argument('--metadata', type=int, required=False, default=0, help="""The number of metadata files to load at a time""") -parser.add_argument('--realtime', type=int, required=False, default=1, +parser.add_argument('--realtime', type=int, required=False, default=0, help="""The number of realtime files to load at a time""") parser.add_argument('--fix', action="store_true", help='Automatically attempt to fix the problem') @@ -48,6 +56,8 @@ help='Show list of errors') parser.add_argument('--resubmit', action="store_true", help='Mark the fetchlogs file for resubmittal') +parser.add_argument('--keep', action="store_true", + help='Do not use TEMP tables for the ingest staging tables') args = parser.parse_args() if 'DOTENV' not in os.environ.keys() and args.env is not None: @@ -62,32 +72,42 @@ if args.debug: os.environ['LOG_LEVEL'] = 'DEBUG' +if args.keep: + os.environ['USE_TEMP_TABLES'] = 'False' + from botocore.exceptions import ClientError -from ingest.handler import cronhandler, logger +from ingest.handler import cronhandler from ingest.settings import settings from ingest.lcs import ( - load_metadata_db, - load_measurements_db, - load_measurements_file, + load_metadata, + load_metadata_batch, +) + +from ingest.lcsV2 import ( load_measurements, - get_measurements, + load_measurements_batch, ) from ingest.fetch import ( load_realtime, + create_staging_table, parse_json, ) from ingest.utils import ( + load_fetchlogs, load_errors_list, load_errors_summary, load_rejects_summary, + get_data, get_object, put_object, get_logs_from_ids, get_logs_from_pattern, mark_success, + StringIteratorIO, + deconstruct_path, ) @@ -107,18 +127,19 @@ def check_realtime_key(key: str, fix: bool = False): n = len(lines) errors = [] for jdx, line in enumerate(lines): - try: - # first just try and load it - obj = json.loads(line) - except Exception as e: - errors.append(jdx) - print(f"*** Loading error on line 
#{jdx} (of {n}): {e}\n{line}") - try: - # then we can try to parse it - parse_json(obj) - except Exception as e: - errors.append(jdx) - print(f"*** Parsing error on line #{jdx} (of {n}): {e}\n{line}") + if len(line) > 0: + try: + # first just try and load it + obj = orjson.loads(line) + except Exception as e: + errors.append(jdx) + print(f"*** Loading error on line #{jdx} (of {n}): {e}\n{line}") + try: + # then we can try to parse it + parse_json(obj) + except Exception as e: + errors.append(jdx) + print(f"*** Parsing error on line #{jdx} (of {n}): {e}\n{line}") if len(errors) > 0 and fix: # remove the bad rows and then replace the file @@ -135,6 +156,15 @@ def check_realtime_key(key: str, fix: bool = False): mark_success(key=key, reset=True) +if args.file is not None: + # check if the files exists + # is it a realtime file or a lcs file? + # upload the file + load_realtime([ + (-1, args.file, None) + ]) + sys.exit() + # If we have passed an id than we check that if args.id is not None: # get the details for that id @@ -142,27 +172,65 @@ def check_realtime_key(key: str, fix: bool = False): # get just the keys keys = [log[1] for log in logs] # loop through and check each + logger.info(f"Downloading {len(keys)} files") for idx, key in enumerate(keys): + if args.download: + # we may be using the new source pat + p = deconstruct_path(key) + download_path = f'~/Downloads/{p["bucket"]}/{p["key"]}'; + logger.info(f'downloading to {download_path}') + txt = get_object(**p) + fpath = os.path.expanduser(download_path) + os.makedirs(os.path.dirname(fpath), exist_ok=True) + with open(fpath.replace('.gz', ''), 'w') as f: + f.write(txt) # if we are resubmiting we dont care # what type of file it is - if args.resubmit: + elif args.resubmit: mark_success(key, reset=True, message='resubmitting') # figure out what type of file it is elif 'realtime' in key: if args.load: - load_realtime([key]) + load_realtime([ + (args.id, key, None) + ]) else: check_realtime_key(key, args.fix) + elif 'stations' in key: + load_metadata([ + {"id": args.id, "Key": key, "LastModified": None} + ]) else: - print(key) + load_measurements([ + (args.id, key, None) + ]) + +elif args.batch is not None: + # load_measurements_batch(args.batch) + load_metadata_batch(args.batch) + +elif args.pattern is not None: + keys = load_fetchlogs(pattern=args.pattern, limit=25, ascending=True) + # loop through and check each + for row in keys: + id = row[0] + key = row[1] + last = row[2] + logger.debug(f"{key}: {id}") + if args.load: + if 'realtime' in key: + load_realtime([ + (id, key, last) + ]) + elif 'stations' in key: + load_metadata([ + {"id": id, "Key": key, "LastModified": last} + ]) + else: + load_measurements([ + (id, key, last) + ]) - if args.download: - print(f'downloading: {key}') - txt = get_object(key) - fpath = os.path.expanduser(f'~/{key}') - os.makedirs(os.path.dirname(fpath), exist_ok=True) - with open(fpath.replace('.gz',''), 'w') as f: - f.write(txt) # Otherwise if we set the summary flag return a daily summary of errors diff --git a/ingest/etl_process_measurements.sql b/ingest/etl_process_measurements.sql new file mode 100644 index 0000000..7c7a44a --- /dev/null +++ b/ingest/etl_process_measurements.sql @@ -0,0 +1,503 @@ +-- lcs_meas_ingest +DO $$ +DECLARE +__process_start timestamptz := clock_timestamp(); +__total_measurements int; +__inserted_measurements int; +__rejected_measurements int := 0; +__rejected_nodes int := 0; +__total_nodes int := 0; +__updated_nodes int := 0; +__inserted_nodes int := 0; +__exported_days int; 
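An aside on the counters declared in this block: they feed the ingest_stats upsert and the closing RAISE NOTICE at the end of the script, and that notice text is what load_realtime() hands back to benchmark.py above, which currently pulls values out by regex position (m[17][1], m[0][1], and so on). A name-based parse of the same "key: value, key: value" summary format would be less brittle if fields are ever added; this is a suggestion sketched here, not code from this change set:

    # Sketch: turn an ingest summary NOTICE such as
    # "inserted-measurements: 123, process-time-ms: 456, ..." into a dict keyed
    # by field name instead of indexing re.findall() output positionally.
    import re

    def parse_ingest_notice(notice: str) -> dict:
        # same pattern benchmark.py uses, extended so the final pair
        # (which has no trailing comma) is captured as well
        pairs = re.findall(r'([a-z-]+): (.+?)(?:,|$)', notice)
        return {name: value.strip() for name, value in pairs}

    # usage: parse_ingest_notice(notice)["inserted-measurements"]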
+__start_datetime timestamptz; +__end_datetime timestamptz; +__inserted_start_datetime timestamptz; +__inserted_end_datetime timestamptz; +__process_time_ms int; +__insert_time_ms int; +__cache_time_ms int; +__error_context text; +__ingest_method text := 'lcs'; +BEGIN + + +DELETE +FROM staging_measurements +WHERE ingest_id IS NULL +OR datetime is NULL +OR value IS NULL; + +--DELETE +--FROM staging_measurements +--WHERE datetime < '2018-01-01'::timestamptz +--OR datetime>now(); + +DELETE +FROM rejects +WHERE fetchlogs_id IN (SELECT fetchlogs_id FROM staging_measurements) +AND tbl ~* '^meas'; + + +SELECT COUNT(1) +, MIN(datetime) +, MAX(datetime) +INTO __total_measurements +, __start_datetime +, __end_datetime +FROM staging_measurements; + + +-- The ranking is to deal with the current possibility +-- that duplicate sensors with the same ingest/source id are created + -- this is a short term fix + -- a long term fix would not allow duplicate source_id's +WITH staged_sensors AS ( + -- this first part significantly speeds it up on slow machines + SELECT DISTINCT ingest_id + FROM staging_measurements +), ranked_sensors AS ( + SELECT s.sensors_id + , s.source_id + , RANK() OVER (PARTITION BY s.source_id ORDER BY added_on ASC) as rnk + FROM sensors s + JOIN staged_sensors m ON (s.source_id = m.ingest_id) +), active_sensors AS ( + SELECT source_id + , sensors_id + FROM ranked_sensors + WHERE rnk = 1) + UPDATE staging_measurements + SET sensors_id=s.sensors_id + FROM active_sensors s + WHERE s.source_id=ingest_id; + + +-- Now we have to fill in any missing information +-- first add the nodes and systems that dont exist +-- add just the bare minimum amount of data to the system +-- we assume that the node information will be added later +WITH nodes AS ( +INSERT INTO sensor_nodes ( + source_name +, site_name +, source_id +, metadata) +SELECT source_name +, source_name +, source_id +, jsonb_build_object('fetchlogs_id', MIN(fetchlogs_id)) +FROM staging_measurements +WHERE sensors_id IS NULL +GROUP BY 1,2,3 +ON CONFLICT (source_name, source_id) DO UPDATE +SET source_id = EXCLUDED.source_id +, metadata = EXCLUDED.metadata||COALESCE(sensor_nodes.metadata, '{}'::jsonb) +RETURNING sensor_nodes_id, source_id) +INSERT INTO sensor_systems ( + sensor_nodes_id +, source_id) +SELECT sensor_nodes_id +, source_id +FROM nodes +ON CONFLICT DO NOTHING; + +-- now create a sensor for each +-- this method depends on us having a match for the parameter +WITH sen AS ( + SELECT ingest_id + , source_name + , source_id + , measurand as parameter + FROM staging_measurements + WHERE sensors_id IS NULL + GROUP BY 1,2,3,4 +), inserts AS ( +INSERT INTO sensors (sensor_systems_id, measurands_id, source_id) +SELECT sy.sensor_systems_id +, m.measurands_id +, ingest_id +FROM sen s +JOIN measurands_map_view m ON (s.parameter = m.key) +JOIN sensor_nodes n ON (s.source_name = n.source_name AND s.source_id = n.source_id) +JOIN sensor_systems sy ON (sy.sensor_nodes_id = n.sensor_nodes_id AND s.source_id = sy.source_id) +ON CONFLICT DO NOTHING +RETURNING sensor_systems_id) +SELECT COUNT(DISTINCT sensor_systems_id) INTO __inserted_nodes +FROM inserts; + +-- try again to find the sensors +UPDATE staging_measurements +SET sensors_id=s.sensors_id +FROM sensors s +WHERE s.source_id=ingest_id +AND staging_measurements.sensors_id IS NULL; + + +SELECT COUNT(DISTINCT sensors_id) +INTO __total_nodes +FROM staging_measurements; + + +__process_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + +-- reject any missing. 
Most likely due to issues +-- with the measurand +WITH r AS ( +INSERT INTO rejects (t,tbl,r,fetchlogs_id) +SELECT + current_timestamp + , 'meas-missing-sensors-id' + , to_jsonb(staging_measurements) + , fetchlogs_id +FROM staging_measurements +WHERE sensors_id IS NULL +RETURNING 1) +SELECT COUNT(1) INTO __rejected_measurements +FROM r; + +-- restart the clock to measure just inserts +__process_start := clock_timestamp(); + +WITH inserts AS ( +INSERT INTO measurements ( + sensors_id, + datetime, + value, + lon, + lat +) SELECT + --DISTINCT + sensors_id, + datetime, + value, + lon, + lat +FROM staging_measurements +WHERE sensors_id IS NOT NULL +ON CONFLICT DO NOTHING +RETURNING sensors_id, datetime, value, lat, lon +), inserted as ( + INSERT INTO staging_inserted_measurements (sensors_id, datetime, value, lat, lon) + SELECT sensors_id + , datetime + , value + , lat + , lon + FROM inserts + RETURNING sensors_id, datetime +) +SELECT MIN(datetime) +, MAX(datetime) +, COUNT(1) +INTO __inserted_start_datetime +, __inserted_end_datetime +, __inserted_measurements +FROM inserted; + +__insert_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + +-- mark the fetchlogs as done +WITH inserted AS ( + SELECT m.fetchlogs_id + , COUNT(m.*) as n_records + , COUNT(t.*) as n_inserted + , MIN(m.datetime) as fr_datetime + , MAX(m.datetime) as lr_datetime + , MIN(t.datetime) as fi_datetime + , MAX(t.datetime) as li_datetime + FROM staging_measurements m + LEFT JOIN staging_inserted_measurements t ON (t.sensors_id = m.sensors_id AND t.datetime = m.datetime) + GROUP BY m.fetchlogs_id) +UPDATE fetchlogs +SET completed_datetime = CURRENT_TIMESTAMP +, inserted = COALESCE(n_inserted, 0) +, records = COALESCE(n_records, 0) +, first_recorded_datetime = fr_datetime +, last_recorded_datetime = lr_datetime +, first_inserted_datetime = fi_datetime +, last_inserted_datetime = li_datetime +FROM inserted +WHERE inserted.fetchlogs_id = fetchlogs.fetchlogs_id; + +-- track the time required to update cache tables +__process_start := clock_timestamp(); + +-- -- Now we can use those staging_inserted_measurements to update the cache tables +-- INSERT INTO sensors_latest ( +-- sensors_id +-- , datetime +-- , value +-- , lat +-- , lon +-- ) +-- ---- identify the row that has the latest value +-- WITH numbered AS ( +-- SELECT sensors_id +-- , datetime +-- , value +-- , lat +-- , lon +-- , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) as rn +-- FROM staging_inserted_measurements +-- ), latest AS ( +-- ---- only insert those rows +-- SELECT sensors_id +-- , datetime +-- , value +-- , lat +-- , lon +-- FROM numbered +-- WHERE rn = 1 +-- ) +-- SELECT l.sensors_id +-- , l.datetime +-- , l.value +-- , l.lat +-- , l.lon +-- FROM latest l +-- LEFT JOIN sensors_latest sl ON (l.sensors_id = sl.sensors_id) +-- WHERE sl.sensors_id IS NULL +-- OR l.datetime > sl.datetime +-- ON CONFLICT (sensors_id) DO UPDATE +-- SET datetime = EXCLUDED.datetime +-- , value = EXCLUDED.value +-- , lat = EXCLUDED.lat +-- , lon = EXCLUDED.lon +-- , modified_on = now() +-- --, fetchlogs_id = EXCLUDED.fetchlogs_id +-- ; +-- update the exceedances +INSERT INTO sensor_exceedances (sensors_id, threshold_value, datetime_latest) + SELECT + m.sensors_id + , t.value + , MAX(datetime) + FROM staging_inserted_measurements m + JOIN sensors s ON (m.sensors_id = s.sensors_id) + JOIN thresholds t ON (s.measurands_id = t.measurands_id) + AND m.value > t.value + GROUP BY 1, 2 + ON CONFLICT (sensors_id, threshold_value) DO UPDATE SET + 
datetime_latest = GREATEST(sensor_exceedances.datetime_latest, EXCLUDED.datetime_latest) + , updated_on = now(); + + +INSERT INTO sensors_rollup ( + sensors_id + , datetime_first + , datetime_last + , value_latest + , value_count + , value_avg + , value_sd + , value_min + , value_max + , geom_latest + ) +---- identify the row that has the latest value +WITH numbered AS ( + SELECT sensors_id + , datetime + , value + , lat + , lon + , sum(1) OVER (PARTITION BY sensors_id) as value_count + , min(datetime) OVER (PARTITION BY sensors_id) as datetime_min + , avg(value) OVER (PARTITION BY sensors_id) as value_avg + , stddev(value) OVER (PARTITION BY sensors_id) as value_sd + , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) as rn + FROM staging_inserted_measurements +), latest AS ( +---- only insert those rows + SELECT sensors_id + , datetime + , value + , value_count + , value_avg + , value_sd + , datetime_min + , lat + , lon + FROM numbered + WHERE rn = 1 +) +SELECT l.sensors_id +, l.datetime_min -- first +, l.datetime -- last +, l.value -- last value +, l.value_count +, l.value_avg +, COALESCE(l.value_sd, 0) +, l.value -- min +, l.value -- max +, public.pt3857(lon, lat) +FROM latest l +LEFT JOIN sensors_rollup sr ON (l.sensors_id = sr.sensors_id) +WHERE sr.sensors_id IS NULL +OR l.datetime > sr.datetime_last +OR l.datetime_min < sr.datetime_first +ON CONFLICT (sensors_id) DO UPDATE +SET datetime_last = GREATEST(sensors_rollup.datetime_last, EXCLUDED.datetime_last) +, value_latest = CASE WHEN EXCLUDED.datetime_last > sensors_rollup.datetime_last + THEN EXCLUDED.value_latest + ELSE sensors_rollup.value_latest + END +, geom_latest = CASE WHEN EXCLUDED.datetime_last > sensors_rollup.datetime_last + THEN EXCLUDED.geom_latest + ELSE sensors_rollup.geom_latest + END +, value_count = sensors_rollup.value_count + EXCLUDED.value_count +, value_min = LEAST(sensors_rollup.value_min, EXCLUDED.value_latest) +, value_max = GREATEST(sensors_rollup.value_max, EXCLUDED.value_latest) +, datetime_first = LEAST(sensors_rollup.datetime_first, EXCLUDED.datetime_first) +, modified_on = now() +--, fetchlogs_id = EXCLUDED.fetchlogs_id +; + + +-- Update the table that will help to track hourly rollups +-- this is a replacement to the hourly stats table + WITH inserted_hours AS ( + -- first we group things, adding an hour to make it time-ending after truncating + SELECT datetime + '1h'::interval as datetime + , utc_offset(datetime + '1h'::interval, tz.tzid) as tz_offset + FROM staging_inserted_measurements m + JOIN sensors s ON (s.sensors_id = m.sensors_id) + JOIN sensor_systems sy ON (s.sensor_systems_id = sy.sensor_systems_id) + JOIN sensor_nodes sn ON (sy.sensor_nodes_id = sn.sensor_nodes_id) + JOIN timezones tz ON (sn.timezones_id = tz.timezones_id) + GROUP BY 1, 2 + ) + INSERT INTO hourly_data_queue (datetime, tz_offset) + SELECT as_utc_hour(datetime, tz_offset), tz_offset + FROM inserted_hours + GROUP BY 1, 2 + ON CONFLICT (datetime, tz_offset) DO UPDATE + SET modified_on = now(); + + +--Update the export queue/logs to export these records +--wrap it in a block just in case the database does not have this module installed +--we subtract the second because the data is assumed to be time ending +WITH e AS ( +INSERT INTO open_data_export_logs (sensor_nodes_id, day, records, measurands, modified_on) +SELECT sn.sensor_nodes_id +, ((m.datetime - '1sec'::interval) AT TIME ZONE (COALESCE(sn.metadata->>'timezone', 'UTC'))::text)::date as day +, COUNT(1) +, COUNT(DISTINCT p.measurands_id) +, MAX(now()) 
+FROM staging_inserted_measurements m -- meas m +JOIN sensors s ON (m.sensors_id = s.sensors_id) +JOIN measurands p ON (s.measurands_id = p.measurands_id) +JOIN sensor_systems ss ON (s.sensor_systems_id = ss.sensor_systems_id) +JOIN sensor_nodes sn ON (ss.sensor_nodes_id = sn.sensor_nodes_id) +GROUP BY sn.sensor_nodes_id +, ((m.datetime - '1sec'::interval) AT TIME ZONE (COALESCE(sn.metadata->>'timezone', 'UTC'))::text)::date +ON CONFLICT (sensor_nodes_id, day) DO UPDATE +SET records = EXCLUDED.records +, measurands = EXCLUDED.measurands +, modified_on = EXCLUDED.modified_on +RETURNING 1) +SELECT COUNT(1) INTO __exported_days +FROM e; + + +__cache_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + +INSERT INTO ingest_stats ( + ingest_method + -- total + , total_measurements_processed + , total_measurements_inserted + , total_measurements_rejected + , total_nodes_processed + , total_nodes_inserted + , total_nodes_updated + , total_nodes_rejected + -- total times + , total_process_time_ms + , total_insert_time_ms + , total_cache_time_ms + -- latest + , latest_measurements_processed + , latest_measurements_inserted + , latest_measurements_rejected + , latest_nodes_processed + , latest_nodes_inserted + , latest_nodes_updated + , latest_nodes_rejected + -- times + , latest_process_time_ms + , latest_insert_time_ms + , latest_cache_time_ms + ) VALUES ( + -- totals + __ingest_method + , __total_measurements + , __inserted_measurements + , __rejected_measurements + , __total_nodes + , __inserted_nodes + , __updated_nodes + , __rejected_nodes + -- times + , __process_time_ms + , __insert_time_ms + , __cache_time_ms + -- latest + , __total_measurements + , __inserted_measurements + , __rejected_measurements + , __total_nodes + , __inserted_nodes + , __updated_nodes + , __rejected_nodes + -- times + , __process_time_ms + , __insert_time_ms + , __cache_time_ms +) ON CONFLICT (ingest_method) DO UPDATE SET + -- totals + total_measurements_processed = ingest_stats.total_measurements_processed + EXCLUDED.total_measurements_processed + , total_measurements_inserted = ingest_stats.total_measurements_inserted + EXCLUDED.total_measurements_inserted + , total_measurements_rejected = ingest_stats.total_measurements_rejected + EXCLUDED.total_measurements_rejected + , total_nodes_processed = ingest_stats.total_nodes_processed + EXCLUDED.total_nodes_processed + , total_nodes_inserted = ingest_stats.total_nodes_inserted + EXCLUDED.total_nodes_inserted + , total_nodes_updated = ingest_stats.total_nodes_updated + EXCLUDED.total_nodes_updated + , total_nodes_rejected = ingest_stats.total_nodes_rejected + EXCLUDED.total_nodes_rejected + , total_process_time_ms = ingest_stats.total_process_time_ms + EXCLUDED.total_process_time_ms + , total_insert_time_ms = ingest_stats.total_insert_time_ms + EXCLUDED.total_insert_time_ms + , total_cache_time_ms = ingest_stats.total_cache_time_ms + EXCLUDED.total_cache_time_ms + -- latest + , latest_measurements_processed = EXCLUDED.latest_measurements_processed + , latest_measurements_inserted = EXCLUDED.latest_measurements_inserted + , latest_measurements_rejected = EXCLUDED.latest_measurements_rejected + , latest_nodes_processed = EXCLUDED.latest_nodes_processed + , latest_nodes_inserted = EXCLUDED.latest_nodes_inserted + , latest_nodes_updated = EXCLUDED.latest_nodes_updated + , latest_nodes_rejected = EXCLUDED.latest_nodes_rejected + -- times + , latest_process_time_ms = EXCLUDED.latest_process_time_ms + , latest_insert_time_ms = 
EXCLUDED.latest_insert_time_ms + , latest_cache_time_ms = EXCLUDED.latest_cache_time_ms + , ingest_count = ingest_stats.ingest_count + 1 + , ingested_on = EXCLUDED.ingested_on; + + +RAISE NOTICE 'inserted-measurements: %, inserted-from: %, inserted-to: %, rejected-measurements: %, exported-sensor-days: %, process-time-ms: %, insert-time-ms: %, cache-time-ms: %, source: lcs' + , __inserted_measurements + , __inserted_start_datetime + , __inserted_end_datetime + , __rejected_measurements + , __exported_days + , __process_time_ms + , __insert_time_ms + , __cache_time_ms; + + +EXCEPTION WHEN OTHERS THEN + GET STACKED DIAGNOSTICS __error_context = PG_EXCEPTION_CONTEXT; + RAISE NOTICE 'Failed to ingest measurements: %, %', SQLERRM, __error_context; + +END $$; diff --git a/ingest/etl_process_nodes.sql b/ingest/etl_process_nodes.sql new file mode 100644 index 0000000..5b78e6c --- /dev/null +++ b/ingest/etl_process_nodes.sql @@ -0,0 +1,420 @@ +-- lcs_ingest_full +DO $$ +DECLARE +__process_start timestamptz := clock_timestamp(); +__inserted_nodes int; +__inserted_sensors int; +__rejected_nodes int; +__rejected_systems int; +__rejected_sensors int; +__rejected_measurands int; + +BEGIN + +-------------------------- +-- lcs_ingest_nodes.sql -- +-------------------------- + +DELETE +FROM staging_sensornodes +WHERE staging_sensornodes.ingest_id IS NULL; + +DELETE +FROM staging_sensorsystems +WHERE staging_sensorsystems.ingest_id IS NULL +OR ingest_sensor_nodes_id IS NULL; + +DELETE +FROM staging_sensors +WHERE staging_sensors.ingest_id IS NULL +OR ingest_sensor_systems_id IS NULL; + +UPDATE staging_sensors +SET units = 'µg/m³' +WHERE units IN ('µg/m��','��g/m³', 'ug/m3'); + + + +-- match the locations to existing nodes using the source_name/id combo +UPDATE staging_sensornodes +SET sensor_nodes_id = s.sensor_nodes_id +, timezones_id = s.timezones_id +, countries_id = s.countries_id +, is_new = false +, is_moved = st_astext(s.geom) != st_astext(staging_sensornodes.geom) +FROM sensor_nodes s +WHERE s.source_name = staging_sensornodes.source_name +AND s.source_id = staging_sensornodes.source_id +AND ( staging_sensornodes.matching_method IS NULL + OR staging_sensornodes.matching_method = 'ingest-id'); + + +-- now update them using the source + spatial method +UPDATE staging_sensornodes +SET sensor_nodes_id = s.sensor_nodes_id +, timezones_id = s.timezones_id +, countries_id = s.countries_id +, is_new = false +, is_moved = st_astext(s.geom) != st_astext(staging_sensornodes.geom) +FROM sensor_nodes s +WHERE s.source_name = staging_sensornodes.source_name +AND st_distance(staging_sensornodes.geom, s.geom) < 0.00001 -- about 1.11 meters difference +AND staging_sensornodes.matching_method = 'source-spatial'; + + +-- only update the nodes where the geom has changed +-- the geom queries are really slow so we dont want to be doing that all the time +-- ~18 locations per second +UPDATE staging_sensornodes SET + timezones_id = get_timezones_id(geom) +, countries_id = get_countries_id(geom) +WHERE is_new + OR is_moved + OR timezones_id IS NULL + OR countries_id IS NULL; + + +-- we are going to update the source_id where we are matching via geometry +-- for ingest-id matches this should not matter. 
+UPDATE sensor_nodes +SET source_id = COALESCE(s.source_id, sensor_nodes.source_id) + , geom = COALESCE(s.geom, sensor_nodes.geom) + , site_name = COALESCE(s.site_name, sensor_nodes.site_name) + , timezones_id = COALESCE(s.timezones_id, sensor_nodes.timezones_id) + , countries_id = COALESCE(s.countries_id, sensor_nodes.countries_id) + , ismobile = COALESCE(s.ismobile, sensor_nodes.ismobile) + , metadata = COALESCE(s.metadata, '{}') || COALESCE(sensor_nodes.metadata, '{}') + , modified_on = now() +FROM staging_sensornodes s +WHERE sensor_nodes.sensor_nodes_id = s.sensor_nodes_id; + + +-- And now we insert those into the sensor nodes table +WITH inserts AS ( +INSERT INTO sensor_nodes ( + site_name +, source_name +, ismobile +, geom +, metadata +, source_id +, timezones_id +, providers_id +, countries_id +) +SELECT site_name +, source_name +, ismobile +, geom +, metadata +, source_id +, timezones_id +-- default to the unknown provider +-- just to make sure we have one set +, COALESCE(get_providers_id(source_name), 1) +, countries_id +FROM staging_sensornodes +WHERE sensor_nodes_id IS NULL +ON CONFLICT (source_name, source_id) DO UPDATE +SET + site_name=coalesce(EXCLUDED.site_name,sensor_nodes.site_name) + , source_id=COALESCE(EXCLUDED.source_id, sensor_nodes.source_id) + , ismobile=coalesce(EXCLUDED.ismobile,sensor_nodes.ismobile) + , geom=coalesce(EXCLUDED.geom,sensor_nodes.geom) + , metadata=COALESCE(sensor_nodes.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}') + , timezones_id = COALESCE(EXCLUDED.timezones_id, sensor_nodes.timezones_id) + , providers_id = COALESCE(EXCLUDED.providers_id, sensor_nodes.providers_id) + , modified_on = now() +RETURNING 1) +SELECT COUNT(1) INTO __inserted_nodes +FROM inserts; + +---------------------------- +-- lcs_ingest_systems.sql -- +---------------------------- + +-- fill in any new sensor_nodes_id +UPDATE staging_sensornodes +SET sensor_nodes_id = sensor_nodes.sensor_nodes_id +FROM sensor_nodes +WHERE staging_sensornodes.sensor_nodes_id is null +AND sensor_nodes.source_name = staging_sensornodes.source_name +AND sensor_nodes.source_id = staging_sensornodes.source_id; + +-- log anything we were not able to get an id for +WITH r AS ( +INSERT INTO rejects (t, tbl,r,fetchlogs_id) +SELECT now() +, 'staging_sensornodes-missing-nodes-id' +, to_jsonb(staging_sensornodes) +, fetchlogs_id +FROM staging_sensornodes +WHERE sensor_nodes_id IS NULL +RETURNING 1) +SELECT COUNT(1) INTO __rejected_nodes +FROM r; + +-------------------- +-- Sensor Systems -- +-------------------- + + +-- make sure that we have a system entry for every ingest_id +-- this is to deal with fetchers that do not add these data +INSERT INTO staging_sensorsystems (sensor_nodes_id, ingest_id, fetchlogs_id, metadata) +SELECT sensor_nodes_id +--, source_id -- the ingest_id has the source_name in it and we dont need/want that +, ingest_id +, fetchlogs_id +, '{"note":"automatically added for sensor node"}' +FROM staging_sensornodes +WHERE is_new AND ingest_id NOT IN (SELECT ingest_sensor_nodes_id FROM staging_sensorsystems) +ON CONFLICT (ingest_id) DO UPDATE + SET sensor_nodes_id = EXCLUDED.sensor_nodes_id + ; + +-- Now match the sensor nodes to the system +UPDATE staging_sensorsystems +SET sensor_nodes_id = staging_sensornodes.sensor_nodes_id +FROM staging_sensornodes +WHERE staging_sensorsystems.ingest_sensor_nodes_id = staging_sensornodes.ingest_id; + +-- And match to any existing sensor systems +UPDATE staging_sensorsystems +SET sensor_systems_id = sensor_systems.sensor_systems_id +, is_new = 
false +FROM sensor_systems +WHERE sensor_systems.sensor_nodes_id = staging_sensorsystems.sensor_nodes_id +AND sensor_systems.source_id = staging_sensorsystems.ingest_id; + + +-- log anything we were not able to get an id for +WITH r AS ( +INSERT INTO rejects (t,tbl,r,fetchlogs_id) +SELECT now() +, 'staging_sensorsystems-missing-nodes-id' +, to_jsonb(staging_sensorsystems) +, fetchlogs_id +FROM staging_sensorsystems +WHERE sensor_nodes_id IS NULL +RETURNING 1) +SELECT COUNT(1) INTO __rejected_systems +FROM r; + +-- And finally we add/update the sensor systems +INSERT INTO sensor_systems (sensor_nodes_id, source_id, instruments_id, metadata) +SELECT sensor_nodes_id +, s.ingest_id +, i.instruments_id +, metadata +FROM staging_sensorsystems s +LEFT JOIN instruments i ON (s.instrument_ingest_id = i.ingest_id) +WHERE sensor_nodes_id IS NOT NULL +GROUP BY sensor_nodes_id, s.ingest_id, instruments_id, metadata +ON CONFLICT (sensor_nodes_id, source_id) DO UPDATE SET + metadata=COALESCE(sensor_systems.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}') + , instruments_id = EXCLUDED.instruments_id + , modified_on = now(); + +---------------------------- +-- lcs_ingest_sensors.sql -- +---------------------------- + +-- Match the sensor system data +UPDATE staging_sensorsystems +SET sensor_systems_id = sensor_systems.sensor_systems_id +FROM sensor_systems +WHERE staging_sensorsystems.sensor_systems_id IS NULL +AND staging_sensorsystems.sensor_nodes_id=sensor_systems.sensor_nodes_id +AND staging_sensorsystems.ingest_id=sensor_systems.source_id +; + +WITH r AS ( +INSERT INTO rejects (t, tbl,r,fetchlogs_id) +SELECT + now() +, 'staging_sensorsystems-missing-systems-id' +, to_jsonb(staging_sensorsystems) +, fetchlogs_id +FROM staging_sensorsystems +WHERE sensor_systems_id IS NULL +RETURNING 1) +SELECT COUNT(1) INTO __rejected_systems +FROM r; + +------------- +-- SENSORS -- +------------- + + -- We do not want to create default sensors because we are not dealling with measurements here +UPDATE staging_sensors +SET sensor_systems_id = staging_sensorsystems.sensor_systems_id +FROM staging_sensorsystems +WHERE staging_sensors.ingest_sensor_systems_id = staging_sensorsystems.ingest_id; + +WITH r AS ( +INSERT INTO rejects (t,tbl,r,fetchlogs_id) +SELECT + now() +, 'staging_sensors-missing-systems-id' +, to_jsonb(staging_sensors) +, fetchlogs_id +FROM staging_sensors +WHERE sensor_systems_id IS NULL +RETURNING 1) +SELECT COUNT(1) INTO __rejected_sensors +FROM r; + + +UPDATE staging_sensors +SET sensors_id = sensors.sensors_id +FROM sensors +WHERE sensors.sensor_systems_id=staging_sensors.sensor_systems_id +AND sensors.source_id = staging_sensors.ingest_id; + + +UPDATE staging_sensors +SET measurands_id = m.measurands_id +FROM (SELECT measurand, MIN(measurands_id) AS measurands_id FROM measurands GROUP BY measurand) as m +WHERE staging_sensors.measurand=m.measurand +--AND staging_sensors.units=measurands.units +; + + +WITH r AS ( +INSERT INTO rejects (t, tbl,r,fetchlogs_id) +SELECT + now() +, 'staging_sensors-missing-measurands-id' +, to_jsonb(staging_sensors) +, fetchlogs_id +FROM staging_sensors +WHERE measurands_id IS NULL +RETURNING 1) +SELECT COUNT(1) INTO __rejected_measurands +FROM r; + +WITH inserts AS ( +INSERT INTO sensors ( + source_id +, sensor_systems_id +, measurands_id +, data_logging_period_seconds +, data_averaging_period_seconds +, sensor_statuses_id +, metadata) +SELECT ingest_id +, sensor_systems_id +, measurands_id +, logging_interval_seconds +, averaging_interval_seconds +, 
COALESCE(ss.sensor_statuses_id, 1) +, metadata +FROM staging_sensors s +LEFT JOIN sensor_statuses ss ON (ss.short_code = s.status) +WHERE measurands_id is not null +AND sensor_systems_id is not null +GROUP BY ingest_id +, sensor_systems_id +, measurands_id +, logging_interval_seconds +, averaging_interval_seconds +, ss.sensor_statuses_id +, metadata +ON CONFLICT (sensor_systems_id, measurands_id, source_id) DO UPDATE +SET metadata = COALESCE(sensors.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}') + , data_logging_period_seconds = EXCLUDED.data_logging_period_seconds + , data_averaging_period_seconds = EXCLUDED.data_averaging_period_seconds + , sensor_statuses_id = EXCLUDED.sensor_statuses_id + , modified_on = now() +RETURNING 1) +SELECT COUNT(1) INTO __inserted_sensors +FROM inserts; + +UPDATE staging_sensors +SET sensors_id = sensors.sensors_id +FROM sensors +WHERE sensors.sensor_systems_id=staging_sensors.sensor_systems_id +AND sensors.source_id = staging_sensors.ingest_id; + +WITH r AS ( +INSERT INTO rejects (t,tbl,r,fetchlogs_id) +SELECT + now() + , 'staging_sensors-missing-sensors-id' + , to_jsonb(staging_sensors) + , fetchlogs_id +FROM staging_sensors +WHERE sensors_id IS NULL +RETURNING 1) +SELECT COUNT(1) INTO __rejected_sensors +FROM r; + + +-- update the period so that we dont have to keep doing it later +-- we could do this on import as well if we feel this is slowing us down +UPDATE staging_flags + SET period = tstzrange(COALESCE(datetime_from, '-infinity'::timestamptz),COALESCE(datetime_to, 'infinity'::timestamptz), '[]'); + +-- Now we have to match things +-- get the right node id and sensors id for the flags +UPDATE staging_flags +SET sensors_id = s.sensors_id + , sensor_nodes_id = sy.sensor_nodes_id +FROM sensors s +JOIN sensor_systems sy ON (s.sensor_systems_id = sy.sensor_systems_id) +WHERE staging_flags.sensor_ingest_id = s.source_id; + +-- and then get the right flags_id +UPDATE staging_flags +SET flag_types_id = ft.flag_types_id +FROM flag_types ft +WHERE split_part(staging_flags.ingest_id, '::', 1) = ft.ingest_id; + +-- now we should look to see if we should be just extending a flag +UPDATE staging_flags sf + SET flags_id = fm.flags_id + FROM flags fm + -- where the core information is the same (exactly) + WHERE sf.sensor_nodes_id = fm.sensor_nodes_id + AND sf.flag_types_id = fm.flag_types_id + AND ((sf.note = fm.note) OR (sf.note IS NULL AND fm.note IS NULL)) + -- the periods touch or overlap + AND fm.period && sf.period + -- and the flagged record sensors contains the current sensors + AND fm.sensors_ids @> ARRAY[sf.sensors_id]; + +-- and finally we will insert the new flags +INSERT INTO flags (flag_types_id, sensor_nodes_id, sensors_ids, period, note) + SELECT flag_types_id + , sensor_nodes_id + , CASE WHEN sensors_id IS NOT NULL THEN ARRAY[sensors_id] ELSE NULL END + , period + , note + FROM staging_flags + WHERE flag_types_id IS NOT NULL + AND sensor_nodes_id IS NOT NULL + AND flags_id IS NULL; + +-- And then update any that need to be updated + UPDATE flags fm + SET period = sf.period + fm.period + , note = sf.note + , modified_on = now() + FROM staging_flags sf + WHERE sf.flags_id = fm.flags_id; + + +------------------ +-- Return stats -- +------------------ + +RAISE NOTICE 'inserted-nodes: %, inserted-sensors: %, rejected-nodes: %, rejected-sensors: %, rejected-measurands: %, process-time-ms: %, source: lcs' + , __inserted_nodes + , __inserted_sensors + , __rejected_nodes + , __rejected_sensors + , __rejected_measurands + , 1000 * (extract(epoch FROM 
clock_timestamp() - __process_start)); + +END $$; diff --git a/ingest/fetch.py b/ingest/fetch.py index 450fca3..3f70892 100644 --- a/ingest/fetch.py +++ b/ingest/fetch.py @@ -1,10 +1,10 @@ import gzip import io -import json import os import logging -import time +from time import time from datetime import datetime, timedelta +import orjson import boto3 import psycopg2 @@ -15,15 +15,17 @@ StringIteratorIO, clean_csv_value, get_query, + get_data, load_fail, load_success, + load_fetchlogs, ) app = typer.Typer() dir_path = os.path.dirname(os.path.realpath(__file__)) -logger = logging.getLogger(__name__) +logger = logging.getLogger('fetch') FETCH_BUCKET = settings.FETCH_BUCKET s3 = boto3.resource("s3") @@ -64,7 +66,7 @@ def parse_json(j, key: str = None): else: coords = None - data = json.dumps(j) + data = orjson.dumps(j).decode() row = [ location, @@ -87,11 +89,14 @@ def parse_json(j, key: str = None): def create_staging_table(cursor): - cursor.execute(get_query("fetch_staging.sql")) + cursor.execute(get_query( + "fetch_staging.sql", + table="TEMP TABLE" if settings.USE_TEMP_TABLES else 'TABLE' + )) -def copy_data(cursor, key): - obj = s3.Object(FETCH_BUCKET, key) +def copy_data(cursor, key, fetchlogsId=None): + #obj = s3.Object(FETCH_BUCKET, key) # This should not be checked here, # if we ask it to copy data it should do that # if we want to prevent duplicate attemps we should @@ -102,13 +107,15 @@ def copy_data(cursor, key): # we are also removing the try/catch # if it fails we want to deal with it elsewhere logger.debug(f"Copying data for {key}") - with gzip.GzipFile(fileobj=obj.get()["Body"]) as gz: - f = io.BufferedReader(gz) + with get_data(key) as f: + # make sure that the file is complete iterator = StringIteratorIO( - (parse_json(json.loads(line)) for line in f) + (f"{fetchlogsId}\t"+parse_json(orjson.loads(line)) for line in f) ) + query = """ COPY tempfetchdata ( + fetchlogs_id, location, value, unit, @@ -125,6 +132,7 @@ def copy_data(cursor, key): avpd_value ) FROM STDIN; """ + logger.debug("Loading data from STDIN") cursor.copy_expert(query, iterator) @@ -132,7 +140,7 @@ def copy_file(cursor, file): with gzip.GzipFile(file) as gz: f = io.BufferedReader(gz) iterator = StringIteratorIO( - (parse_json(json.loads(line)) for line in f) + (parse_json(orjson.loads(line)) for line in f) ) try: query = get_query("fetch_copy.sql") @@ -141,11 +149,17 @@ def copy_file(cursor, file): # load_success(cursor, file) except Exception as e: + logger.warning(f'File copy failed: {e}') load_fail(cursor, file, e) def process_data(cursor): - query = get_query("fetch_ingest_full.sql") + # see file for details on how + # to use the variables + query = get_query( + "fetch_ingest_full.sql", + table="TEMP TABLE" if settings.USE_TEMP_TABLES else 'TABLE' + ) cursor.execute(query) # if results: # mindate, maxdate = results @@ -201,7 +215,7 @@ def load_fetch_file(file: str): @app.command() def load_fetch_day(day: str): - start = time.time() + start = time() conn = boto3.client("s3") prefix = f"realtime-gzipped/{day}" keys = [] @@ -221,7 +235,7 @@ def load_fetch_day(day: str): create_staging_table(cursor) for key in keys: copy_data(cursor, key) - print(f"All data copied {time.time()-start}") + print(f"All data copied {time()-start}") filter_data(cursor) mindate, maxdate = process_data(cursor) update_rollups(cursor, mindate=mindate, maxdate=maxdate) @@ -272,62 +286,69 @@ def submit_file_error(ids, e): @app.command() def load_db(limit: int = 50, ascending: bool = False): - order = 'ASC' if ascending else 'DESC' - 
with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: - connection.set_session(autocommit=True) - with connection.cursor() as cursor: - cursor.execute( - f""" - SELECT key - ,last_modified - ,fetchlogs_id - FROM fetchlogs - WHERE key~E'^realtime-gzipped/.*\\.ndjson.gz$' - AND completed_datetime is null - ORDER BY last_modified {order} nulls last - LIMIT %s - ; - """, - (limit,), - ) - rows = cursor.fetchall() - keys = [r[0] for r in rows] - if len(keys) > 0: - try: - load_realtime(keys) - except Exception as e: - # catch and continue to next page - ids = [r[2] for r in rows] - logger.error(f""" - Error processing realtime files: {e}, {ids} - """) - submit_file_error(ids, e) - finally: - connection.commit() + pattern = '^realtime-gzipped/.*\\.ndjson.gz$' + rows = load_fetchlogs(pattern, limit, ascending) + if len(rows) > 0: + try: + load_realtime(rows) + except Exception as e: + # catch and continue to next page + ids = [r[2] for r in rows] + logger.error(f""" + Error processing realtime files: {e}, {ids} + """) + submit_file_error(ids, e) - return len(keys) + return len(rows) -def load_realtime(keys): +def load_realtime(rows): # create a connection and share for all keys + logger.debug(f"Loading {len(rows)} keys") + log_time = -1 + process_time = -1 + copy_time = 0 with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: connection.set_session(autocommit=True) with connection.cursor() as cursor: # create all the data staging table create_staging_table(cursor) + logger.debug('Created realtime staging tables') # now copy all the data - for key in keys: - copy_data(cursor, key) + keys = [] + start = time() + for row in rows: + key = row[1] + fetchlogsId = row[0] + logger.debug(f"Loading {key}, id: {fetchlogsId}") + try: + copy_data(cursor, key, fetchlogsId) + keys.append(key) + copy_time += (time() - start) + except Exception as e: + # all until now is lost + # reset things and try to recover + connection.rollback() + keys = [] + load_fail(cursor, fetchlogsId, e) + break + # finally process the data as one - process_data(cursor) - # we are outputing some stats - for notice in connection.notices: - print(notice) - # mark files as done - load_success(cursor, keys) + if len(keys) > 0: + logger.debug(f"Processing realtime files") + start = time() + process_data(cursor) + process_time = time() - start + # we are outputing some stats + for notice in connection.notices: + logger.info(notice) + # mark files as done + start = time() + load_success(cursor, keys) + log_time = time() - start # close and commit connection.commit() - + return round(copy_time*1000), round(process_time*1000), round(log_time*1000), notice if __name__ == "__main__": app() diff --git a/ingest/fetch_filter.sql b/ingest/fetch_filter.sql deleted file mode 100644 index 156f6ad..0000000 --- a/ingest/fetch_filter.sql +++ /dev/null @@ -1,13 +0,0 @@ -DELETE FROM tempfetchdata -WHERE -datetime <= ( - SELECT max(range_end) - FROM timescaledb_information.chunks - WHERE - hypertable_name IN ('rollups', 'measurements') - AND is_compressed -); -DELETE FROM tempfetchdata WHERE datetime > now(); -DELETE FROM tempfetchdata WHERE datetime < (SELECT max(datetime) - '2 days'::interval from tempfetchdata) -; -SELECT min(datetime), max(datetime) FROM tempfetchdata; \ No newline at end of file diff --git a/ingest/fetch_ingest1.sql b/ingest/fetch_ingest1.sql deleted file mode 100644 index 12ef519..0000000 --- a/ingest/fetch_ingest1.sql +++ /dev/null @@ -1,45 +0,0 @@ -CREATE TEMP TABLE IF NOT EXISTS tempfetchdata_sensors AS -WITH t 
AS ( -SELECT DISTINCT - location as site_name, - unit as units, - parameter as measurand, - country, - city, - jsonb_merge_agg(data) as data, - source_name, - coords::geometry as geom, - source_type, - mobile as ismobile, - avpd_unit, - avpd_value, - coords::geometry as cgeom, - NULL::int as sensor_nodes_id, - null::int as sensor_systems_id, - null::int as measurands_id, - null::int as sensors_id, - null::jsonb as node_metadata, - null::jsonb as sensor_metadata, - array_agg(tfdid) as tfdids -FROM tempfetchdata -GROUP BY - location, - unit, - parameter, - country, - city, - coords, - source_type, - source_name, - mobile, - avpd_unit, - avpd_value, - sensor_nodes_id, - sensor_systems_id, - measurands_id, - sensors_id, - node_metadata, - sensor_metadata -) -SELECT row_number() over () as tfsid, * FROM t; -CREATE INDEX ON tempfetchdata_sensors (tfsid); \ No newline at end of file diff --git a/ingest/fetch_ingest2.sql b/ingest/fetch_ingest2.sql deleted file mode 100644 index 23beb0f..0000000 --- a/ingest/fetch_ingest2.sql +++ /dev/null @@ -1,24 +0,0 @@ --- Cleanup fields - -UPDATE tempfetchdata_sensors t SET -geom = NULL WHERE st_x(geom) = 0 and st_y(geom) =0; - -UPDATE tempfetchdata_sensors SET units = 'µg/m³' -WHERE units IN ('µg/m��','��g/m³'); - -UPDATE tempfetchdata_sensors SET -node_metadata = - jsonb_strip_nulls( - COALESCE(data, '{}'::jsonb) - || - jsonb_build_object( - 'source_type', - 'government', - 'origin', - 'openaq' - ) - ), -sensor_metadata = jsonb_strip_nulls(jsonb_build_object( - 'data_averaging_period_seconds', avpd_value * 3600 - )) -; \ No newline at end of file diff --git a/ingest/fetch_ingest3.sql b/ingest/fetch_ingest3.sql deleted file mode 100644 index 1a65a4d..0000000 --- a/ingest/fetch_ingest3.sql +++ /dev/null @@ -1,79 +0,0 @@ -/* -CREATE TEMP TABLE tempfetchdata_nodes AS -SELECT * FROM (SELECT - first_notnull(site_name) as site_name, - first_notnull(source_name) as source_name, - first_notnull(country) as country, - first_notnull(city) as city, - --jsonb_merge_agg(node_metadata) as metadata, - first_notnull(ismobile) as ismobile, - null::int as sensor_nodes_id, - null::int as sensor_systems_id, - st_centroid(st_collect(geom)) as geom, - array_agg(tfsid) as tfsids -FROM tempfetchdata_sensors -WHERE geom IS NOT NULL -GROUP BY - sensor_nodes_id,st_snaptogrid(geom, .0001) -) AS wgeom -UNION ALL -SELECT * FROM -(SELECT - site_name, - source_name, - first_notnull(country) as country, - first_notnull(city) as city, - --jsonb_merge_agg(node_metadata) as metadata, - first_notnull(ismobile) as ismobile, - null::int as sensor_nodes_id, - null::int as sensor_systems_id, - null::geometry as geom, - array_agg(tfsid) as tfsids -FROM tempfetchdata_sensors -WHERE geom IS NULL -AND site_name is not null -and source_name is not null -GROUP BY - site_name, source_name, sensor_nodes_id -) as nogeom -; -*/ - -CREATE TEMP TABLE tempfetchdata_nodes AS -SELECT * FROM (SELECT - site_name, - source_name, - country, - city, - node_metadata as metadata, - ismobile, - null::int as sensor_nodes_id, - null::int as sensor_systems_id, - st_centroid(st_collect(geom)) as geom, - array_agg(tfsid) as tfsids -FROM tempfetchdata_sensors -WHERE geom IS NOT NULL -GROUP BY - 1,2,3,4,5,6,7,8,st_snaptogrid(geom, .0001) -) AS wgeom -UNION ALL -SELECT * FROM -(SELECT - site_name, - source_name, - country, - city, - node_metadata as metadata, - ismobile, - null::int as sensor_nodes_id, - null::int as sensor_systems_id, - null::geometry as geom, - array_agg(tfsid) as tfsids -FROM tempfetchdata_sensors -WHERE geom 
IS NULL -AND site_name is not null -and source_name is not null -GROUP BY - 1,2,3,4,5,6,7,8,9 -) as nogeom -; diff --git a/ingest/fetch_ingest4.sql b/ingest/fetch_ingest4.sql deleted file mode 100644 index 6c6ae00..0000000 --- a/ingest/fetch_ingest4.sql +++ /dev/null @@ -1,19 +0,0 @@ --- Lookup Node Ids - -UPDATE tempfetchdata_nodes t -SET sensor_nodes_id = sn.sensor_nodes_id FROM -sensor_nodes sn -WHERE t.geom is not null -AND st_dwithin(sn.geom, t.geom, .0001) -AND origin='OPENAQ'; - -UPDATE tempfetchdata_nodes t -SET sensor_nodes_id = sn.sensor_nodes_id FROM -sensor_nodes sn -WHERE -t.sensor_nodes_id is null AND -t.site_name is not null -and t.source_name is not null -and t.site_name = sn.site_name -and t.source_name=sn.source_name -and origin='OPENAQ'; \ No newline at end of file diff --git a/ingest/fetch_ingest5.sql b/ingest/fetch_ingest5.sql deleted file mode 100644 index 644dfb5..0000000 --- a/ingest/fetch_ingest5.sql +++ /dev/null @@ -1,35 +0,0 @@ --- Update any records that have changed - -UPDATE sensor_nodes s SET - site_name = COALESCE(t.site_name, s.site_name), - source_name = COALESCE(t.source_name, s.source_name), - city = COALESCE(t.city, s.city), - country = COALESCE(t.country, s.country), - ismobile = COALESCE(t.ismobile, s.ismobile), - metadata = COALESCE(s.metadata, '{}'::jsonb) || t.metadata, - geom = COALESCE(t.geom, s.geom) -FROM tempfetchdata_nodes t -WHERE t.sensor_nodes_id = s.sensor_nodes_id AND -( - (s.geom IS NULL and t.geom IS NOT NULL) -OR - - ROW( - t.sensor_nodes_id, - t.ismobile, - t.site_name, - t.source_name, - t.city, - t.country, - t.metadata - ) IS DISTINCT FROM ( - s.sensor_nodes_id, - s.ismobile, - s.site_name, - s.source_name, - s.city, - s.country, - s.metadata - ) -) -; diff --git a/ingest/fetch_ingest6.sql b/ingest/fetch_ingest6.sql deleted file mode 100644 index 2025749..0000000 --- a/ingest/fetch_ingest6.sql +++ /dev/null @@ -1,27 +0,0 @@ --- Create new nodes where they don't exist -WITH sn AS ( -INSERT INTO sensor_nodes ( - site_name, - metadata, - geom, - source_name, - city, - country, - ismobile -) -SELECT - site_name, - metadata, - geom, - source_name, - city, - country, - ismobile -FROM tempfetchdata_nodes t -WHERE t.sensor_nodes_id is NULL -RETURNING * -) -UPDATE tempfetchdata_nodes tf SET sensor_nodes_id = sn.sensor_nodes_id -FROM sn WHERE tf.sensor_nodes_id is null -and row(tf.site_name, tf.geom, tf.source_name) is not distinct -from row(sn.site_name, sn.geom, sn.source_name); \ No newline at end of file diff --git a/ingest/fetch_ingest7.sql b/ingest/fetch_ingest7.sql deleted file mode 100644 index 6df2009..0000000 --- a/ingest/fetch_ingest7.sql +++ /dev/null @@ -1,151 +0,0 @@ --- Get sensor systems - - -UPDATE tempfetchdata_nodes t -SET sensor_systems_id = ss.sensor_systems_id FROM -sensor_systems ss -WHERE t.sensor_nodes_id = ss.sensor_nodes_id; - --- Add any rows that did not get an id --- into the rejects table and then delete -INSERT INTO rejects -SELECT clock_timestamp(), 'sensor_nodes', to_jsonb(tf) FROM -tempfetchdata_nodes tf WHERE sensor_nodes_id IS NULL; -DELETE FROM tempfetchdata_nodes WHERE sensor_nodes_id IS NULL; - --- create sensor systems that don't exist -WITH ss AS ( -INSERT INTO sensor_systems (sensor_nodes_id) -SELECT DISTINCT sensor_nodes_id FROM tempfetchdata_nodes t -WHERE t.sensor_systems_id is NULL AND t.sensor_nodes_id IS NOT NULL -RETURNING * -) UPDATE tempfetchdata_nodes tf -SET sensor_systems_id = ss.sensor_systems_id -FROM ss WHERE tf.sensor_nodes_id=ss.sensor_nodes_id -and tf.sensor_systems_id is null; - 
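The legacy per-step files removed in this part of the diff (fetch_filter.sql and fetch_ingest1.sql through fetch_ingest7.sql) were already folded into fetch_ingest_full.sql, whose hunk further down keeps "-- File #1 --" style markers and now takes a {table} placeholder so the staging tables can be created as TEMP TABLE in production or as plain TABLE for debugging (settings.USE_TEMP_TABLES, toggled by check.py --keep). The get_query() helper that renders those templates lives in ingest/utils.py and is not part of this diff; a minimal sketch of how such a helper could work, under that assumption:

    # Assumed sketch of get_query(name, **params) from ingest.utils; the real
    # implementation is not shown in this diff and may differ.
    import os

    def get_query(file: str, **params) -> str:
        query_path = os.path.join(os.path.dirname(__file__), file)
        with open(query_path) as f:
            query = f.read()
        # substitute placeholders such as {table} with plain string replacement,
        # so literal braces elsewhere in the SQL are left untouched
        for key, value in params.items():
            query = query.replace("{" + key + "}", str(value))
        return query

Used as in ingest/fetch.py below, get_query("fetch_ingest_full.sql", table="TEMP TABLE" if settings.USE_TEMP_TABLES else "TABLE") renders one template either way, and running check.py with --keep leaves the staging tables in place for inspection.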
--- Add any rows that did not get an id --- into the rejects table and then delete -INSERT INTO rejects -SELECT clock_timestamp(), 'sensor_systems', to_jsonb(tf) FROM -tempfetchdata_nodes tf WHERE sensor_systems_id IS NULL; -DELETE FROM tempfetchdata_nodes WHERE sensor_systems_id IS NULL; - --- merge sensor node / system ids back to sensors table -UPDATE tempfetchdata_sensors ts SET - sensor_nodes_id = tn.sensor_nodes_id, - sensor_systems_id = tn.sensor_systems_id -FROM - tempfetchdata_nodes tn -WHERE - ts.tfsid = ANY(tn.tfsids); - - --- add any measurands that don't exist -UPDATE tempfetchdata_sensors t SET measurands_id= m.measurands_id FROM -measurands m -WHERE t.measurand = m.measurand AND t.units = m.units; - -WITH m AS ( -INSERT INTO measurands (measurand, units) -SELECT DISTINCT measurand, units FROM tempfetchdata_sensors t -WHERE t.measurands_id is NULL -RETURNING * -) UPDATE tempfetchdata_sensors tf SET measurands_id = m.measurands_id -FROM m WHERE tf.measurand=m.measurand -and tf.units=m.units and tf.measurands_id is null; - --- get cleaned sensors table -CREATE TEMP TABLE IF NOT EXISTS tempfetchdata_sensors_clean AS -SELECT - null::int as sensors_id, - sensor_nodes_id, - sensor_systems_id, - measurands_id, - jsonb_merge_agg(sensor_metadata) as metadata, - array_merge_agg(tfdids) as tfdids -FROM tempfetchdata_sensors -GROUP BY 1,2,3,4; - - --- get sensor id -UPDATE tempfetchdata_sensors_clean t -SET sensors_id = s.sensors_id -FROM sensors s -WHERE t.sensor_systems_id = s.sensor_systems_id -AND t.measurands_id = s.measurands_id -; - --- Add any rows that did not get an id --- into the rejects table and then delete -INSERT INTO rejects -SELECT clock_timestamp() -, 'sensors' -, to_jsonb(tf) -FROM tempfetchdata_sensors_clean tf -WHERE sensor_systems_id IS NULL -OR measurands_id IS NULL; - -DELETE -FROM tempfetchdata_sensors_clean -WHERE sensor_systems_id IS NULL -OR measurands_id IS NULL; - --- add any sensors that don't exist -WITH s AS ( - INSERT INTO sensors ( - sensor_systems_id, - measurands_id, - metadata - ) - SELECT - sensor_systems_id, - measurands_id, - metadata - FROM - tempfetchdata_sensors_clean tf - WHERE - tf.sensors_id IS NULL - RETURNING * -) UPDATE tempfetchdata_sensors_clean tfc - SET - sensors_id = s.sensors_id - FROM s - WHERE - tfc.sensors_id IS NULL - AND - s.sensor_systems_id = tfc.sensor_systems_id - AND - s.measurands_id = tfc.measurands_id -; - -UPDATE tempfetchdata t -SET sensors_id = ts.sensors_id -FROM tempfetchdata_sensors_clean ts -WHERE t.tfdid = ANY(ts.tfdids); - --- Add any rows that did not get an id into --- the rejects table and then delete -INSERT INTO rejects -SELECT clock_timestamp() -, 'sensors' -, to_jsonb(tf) -FROM tempfetchdata tf -WHERE sensors_id IS NULL; - -DELETE -FROM tempfetchdata -WHERE sensors_id IS NULL; - -INSERT INTO measurements (sensors_id, datetime, value) -SELECT sensors_id, datetime, value -FROM tempfetchdata -ON CONFLICT DO NOTHING; - - -UPDATE fetchlogs -SET completed_datetime=clock_timestamp() -, last_message = NULL -- reset any previous error -WHERE key IN (SELECT key FROM ingestfiles); - -SELECT min(datetime), max(datetime) FROM tempfetchdata; diff --git a/ingest/fetch_ingest_full.sql b/ingest/fetch_ingest_full.sql index 089c647..3234d53 100644 --- a/ingest/fetch_ingest_full.sql +++ b/ingest/fetch_ingest_full.sql @@ -1,17 +1,20 @@ --- Get sensor systems +-- fetch_ingest_full DO $$ DECLARE __process_start timestamptz := clock_timestamp(); +__min_measurement_date date := '1970-01-01'::date; +__max_measurement_date date 
:= current_date + 1; __total_measurements int; +__total_nodes int; __updated_nodes int; __inserted_nodes int; __inserted_sensors int; __inserted_measurements int; __inserted_measurands int; -__rejected_nodes int; +__rejected_nodes int := 0; __rejected_systems int; __rejected_sensors int; -__rejected_measurements int; +__rejected_measurements int := 0; __start_datetime timestamptz; __end_datetime timestamptz; __inserted_start_datetime timestamptz; @@ -20,41 +23,22 @@ __deleted_timescaledb int; __deleted_future_measurements int; __deleted_past_measurements int; __exported_days int; +__process_time_ms int; +__insert_time_ms int; +__node_time_ms int; +__cache_time_ms int; +__ingest_method text := 'realtime'; +__inserted_spatial_rollups int := 0; BEGIN -SELECT now() INTO __process_start; +-- REQUIRED +-- {table} should be `TEMP TABLE` in production but could be changed to +-- just `TABLE` if you are debugging and want the temp tables to persist --------------------------- -- File fetch_filter.sql -- --------------------------- --- Note: I am including this because it already existed --- I am not sure why its here --- update: it is likely here because we cannot insert data into --- compressed partitions - -WITH deletes AS ( - DELETE - FROM tempfetchdata - WHERE datetime <= ( - SELECT COALESCE(max(range_end), '1970-01-01'::timestamp) - FROM timescaledb_information.chunks - WHERE hypertable_name IN ('rollups', 'measurements') - AND is_compressed - ) - RETURNING 1) -SELECT COUNT(1) INTO __deleted_timescaledb -FROM deletes; - --- This makes sense though we should track in case its systemic -WITH deletes AS ( - DELETE - FROM tempfetchdata - WHERE datetime > now() - RETURNING 1) -SELECT COUNT(1) INTO __deleted_future_measurements -FROM deletes; - -- this seems questionable, I dont want to pass data to this -- process only to have some of it filtered out because its too old -- Commenting this out because it will prevent us from submitting patch @@ -67,8 +51,15 @@ FROM deletes; -- SELECT COUNT(1) INTO __deleted_past_measurements -- FROM deletes; ----------------------------------- +-- use the partitions to determine start and end date +SELECT partition_start_date + , partition_end_date +INTO __min_measurement_date + , __max_measurement_date +FROM data_table_stats +WHERE table_name = 'public.measurements'; +--------------------------------- -- start with simple count SELECT COUNT(1) , MIN(datetime) @@ -76,13 +67,14 @@ SELECT COUNT(1) INTO __total_measurements , __start_datetime , __end_datetime -FROM tempfetchdata; +FROM tempfetchdata +WHERE datetime <= now(); -- Now we start the old fetch_ingest#.sql files ------------- -- File #1 -- ------------- -CREATE TEMP TABLE IF NOT EXISTS tempfetchdata_sensors AS +CREATE {table} IF NOT EXISTS tempfetchdata_sensors AS WITH t AS ( SELECT DISTINCT location as site_name, @@ -97,14 +89,15 @@ SELECT DISTINCT mobile as ismobile, avpd_unit, avpd_value, - coords::geometry as cgeom, - NULL::int as sensor_nodes_id, +-- coords::geometry as cgeom, + null::int as sensor_nodes_id, null::int as sensor_systems_id, null::int as measurands_id, null::int as sensors_id, null::jsonb as node_metadata, null::jsonb as sensor_metadata, - array_agg(tfdid) as tfdids + array_agg(tfdid) as tfdids, + fetchlogs_id FROM tempfetchdata GROUP BY location, @@ -123,10 +116,12 @@ GROUP BY measurands_id, sensors_id, node_metadata, - sensor_metadata + sensor_metadata, + fetchlogs_id ) SELECT row_number() over () as tfsid, * FROM t; + CREATE INDEX ON tempfetchdata_sensors (tfsid); ------------- -- File 
#2 -- @@ -141,20 +136,21 @@ AND st_y(geom) = 0; UPDATE tempfetchdata_sensors SET units = 'µg/m³' -WHERE units IN ('µg/m��','��g/m³'); +WHERE units IN ('µg/m��','��g/m³', 'ug/m3'); UPDATE tempfetchdata_sensors SET node_metadata = jsonb_strip_nulls( - COALESCE(data, '{}'::jsonb) + COALESCE(data, '{{}}'::jsonb) || jsonb_build_object( - 'source_type', - 'government', - 'origin', - 'openaq' + 'source_type', 'government', + 'origin','openaq', + 'fetchlogs_id', fetchlogs_id ) ), + -- the following assumes that avpd_unit is always hours + -- which at the last check (2022-12-07) it was sensor_metadata = jsonb_strip_nulls(jsonb_build_object( 'data_averaging_period_seconds', avpd_value * 3600 )) @@ -164,23 +160,26 @@ sensor_metadata = jsonb_strip_nulls(jsonb_build_object( -- File #3 -- ------------- -CREATE TEMP TABLE IF NOT EXISTS tempfetchdata_nodes AS +CREATE {table} IF NOT EXISTS tempfetchdata_nodes AS SELECT * FROM (SELECT site_name, source_name, country, city, - node_metadata as metadata, + node_metadata::jsonb as metadata, ismobile, null::int as sensor_nodes_id, null::int as sensor_systems_id, null::boolean as added, + null::text as method, st_centroid(st_collect(geom)) as geom, array_agg(tfsid) as tfsids + , array_agg(st_astext(geom)) as points + , COUNT(DISTINCT st_astext(geom)) as n_points FROM tempfetchdata_sensors WHERE geom IS NOT NULL GROUP BY - 1,2,3,4,5,6,7,8,9,st_snaptogrid(geom, .0001) + 1,2,3,4,5,6,7,8,9,st_snaptogrid(geom, .00001) ) AS wgeom UNION ALL SELECT * FROM @@ -189,22 +188,29 @@ SELECT * FROM source_name, country, city, - node_metadata as metadata, + node_metadata::jsonb as metadata, ismobile, null::int as sensor_nodes_id, null::int as sensor_systems_id, null::boolean as added, + null::text as method, null::geometry as geom, array_agg(tfsid) as tfsids + , null::text[] as points + , 0 as n_points FROM tempfetchdata_sensors WHERE geom IS NULL -AND site_name is not null -and source_name is not null +AND site_name IS NOT NULL +AND source_name IS NOT NULL GROUP BY 1,2,3,4,5,6,7,8,9,10 ) as nogeom ; +SELECT COUNT(1) +INTO __total_nodes +FROM tempfetchdata_nodes; + ------------- -- File #4 -- ------------- @@ -214,6 +220,7 @@ GROUP BY UPDATE tempfetchdata_nodes t SET sensor_nodes_id = sn.sensor_nodes_id , added = FALSE +, method = 'spatial' FROM sensor_nodes sn WHERE t.geom IS NOT NULL AND st_dwithin(sn.geom, t.geom, .0001) @@ -222,18 +229,76 @@ AND origin='OPENAQ'; UPDATE tempfetchdata_nodes t SET sensor_nodes_id = sn.sensor_nodes_id , added = FALSE +, method = 'source_id' FROM sensor_nodes sn WHERE t.sensor_nodes_id is null AND t.site_name is not null AND t.source_name is not null AND t.site_name = sn.site_name AND t.source_name=sn.source_name +AND t.geom IS NULL AND origin='OPENAQ'; + +__process_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + ------------- -- File #5 -- ------------- +--DROP TABLE IF EXISTS checkrealtime_matched; +-- CREATE TABLE IF NOT EXISTS checkrealtime_matched ( +-- sensor_nodes_id int +-- , site_name text +-- , source_name text +-- , city text +-- , country text +-- , origin text +-- , method text +-- , geom_old geometry +-- , geom_new geometry +-- , added_on timestamptz DEFAULT now() +-- ); + + +-- INSERT INTO checkrealtime_matched +-- SELECT t.sensor_nodes_id +-- , format('%s -> %s', s.site_name, t.site_name) +-- , format('%s -> %s', s.source_name, t.source_name) +-- , format('%s -> %s', s.city, t.city) +-- , format('%s -> %s', s.country, t.country) +-- , origin +-- , method +-- , s.geom +-- , t.geom +-- FROM 
tempfetchdata_nodes t +-- JOIN sensor_nodes s ON (t.sensor_nodes_id = s.sensor_nodes_id) +-- WHERE ROW( +-- t.site_name, +-- t.source_name, +-- t.city, +-- t.country, +-- t.metadata +-- ) IS DISTINCT FROM ( +-- s.site_name, +-- s.source_name, +-- s.city, +-- s.country, +-- s.metadata - 'timezone' +-- ); + +-- SELECT sensor_nodes_id +-- , method +-- , site_name +-- , source_name +-- , city +-- , country +-- , ROUND(st_distancesphere(geom_new, geom_old)::numeric, 1) as distance +-- FROM checkrealtime_matched +-- WHERE st_distancesphere(geom_new, geom_old) > 0 +-- GROUP BY 1,2,3,4,5,6, 7 +-- LIMIT 100; + -- Update any records that have changed WITH updates AS ( UPDATE sensor_nodes s SET @@ -242,11 +307,14 @@ UPDATE sensor_nodes s SET city = COALESCE(t.city, s.city), country = COALESCE(t.country, s.country), ismobile = COALESCE(t.ismobile, s.ismobile), - metadata = COALESCE(s.metadata, '{}'::jsonb) || t.metadata, + metadata = COALESCE(s.metadata, '{{}}'::jsonb) || t.metadata, geom = COALESCE(t.geom, s.geom) - --, modified_on = now() + --, timezones_id = get_timezones_id(COALESCE(t.geom, s.geom)) + , providers_id = get_providers_id(COALESCE(t.source_name, s.source_name)) + , modified_on = now() FROM tempfetchdata_nodes t -WHERE t.sensor_nodes_id = s.sensor_nodes_id AND +WHERE t.sensor_nodes_id = s.sensor_nodes_id +AND ( (s.geom IS NULL and t.geom IS NOT NULL) OR @@ -258,7 +326,7 @@ OR t.source_name, t.city, t.country, - t.metadata + t.metadata - ARRAY['imported','fetchlogs_id']::text[] ) IS DISTINCT FROM ( s.sensor_nodes_id, s.ismobile, @@ -266,13 +334,62 @@ OR s.source_name, s.city, s.country, - s.metadata + s.metadata - ARRAY['imported','fetchlogs_id']::text[] ) ) RETURNING 1) SELECT COUNT(1) INTO __updated_nodes FROM updates; + +-- SELECT s.sensor_nodes_id +-- , t.site_name +-- , s.site_name +-- , t.metadata - ARRAY['imported','fetchlogs_id']::text[] as temp +-- , s.metadata - ARRAY['imported','fetchlogs_id']::text[] as node +-- FROM tempfetchdata_nodes t +-- JOIN sensor_nodes s ON (t.sensor_nodes_id = s.sensor_nodes_id) +-- WHERE (s.geom IS NULL and t.geom IS NOT NULL) +-- OR +-- ROW ( +-- t.sensor_nodes_id, +-- -- t.ismobile, +-- -- t.site_name, +-- -- t.source_name, +-- -- t.city, +-- -- t.country, +-- t.metadata - ARRAY['imported','fetchlogs_id']::text[] +-- ) IS DISTINCT FROM ( +-- s.sensor_nodes_id, +-- -- s.ismobile, +-- -- s.site_name, +-- -- s.source_name, +-- -- s.city, +-- -- s.country, +-- s.metadata - ARRAY['imported','fetchlogs_id']::text[] +-- ) +-- LIMIT 20; + +-- SELECT h.site_name +-- , n.site_name +-- , st_astext(h.geom) +-- , st_astext(n.geom) +-- , h.origin +-- , n.origin +-- , h.metadata - ARRAY['imported','fetchlogs_id']::text[] as history +-- , n.metadata - ARRAY['imported','fetchlogs_id']::text[] as current +-- FROM sensor_nodes_history h +-- JOIN sensor_nodes n USING (sensor_nodes_id) +-- WHERE created > now() - '2min'::interval; + +-- SELECT source_name +-- , COALESCE(jsonb_array_length(metadata->'attribution'), 0) as attributes +-- , COUNT(1) as n +-- FROM sensor_nodes +-- GROUP BY 1,2 +-- ORDER BY 2 DESC +-- LIMIT 500; + ------------- -- File #6 -- ------------- @@ -287,7 +404,10 @@ INSERT INTO sensor_nodes ( city, country, ismobile, - origin + origin, + timezones_id, + providers_id, + countries_id ) SELECT site_name, @@ -298,6 +418,9 @@ SELECT country, ismobile, 'OPENAQ' + , get_timezones_id(geom) + , get_providers_id(source_name) + , get_countries_id(geom) FROM tempfetchdata_nodes t WHERE t.sensor_nodes_id is NULL RETURNING * @@ -395,7 +518,7 @@ SELECT COUNT(1) 
INTO __inserted_measurands FROM inserts; -- get cleaned sensors table -CREATE TEMP TABLE IF NOT EXISTS tempfetchdata_sensors_clean AS +CREATE {table} IF NOT EXISTS tempfetchdata_sensors_clean AS SELECT null::int as sensors_id, sensor_nodes_id, @@ -489,6 +612,29 @@ DELETE FROM tempfetchdata WHERE sensors_id IS NULL; +--DELETE +--FROM measurements m +--USING tempfetchdata t +--WHERE m.datetime = t.datetime +--AND m.sensors_id = t.sensors_id; + +__node_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); +-- restart the clock to measure just inserts +__process_start := clock_timestamp(); + + +-- moved down +-- count the future measurements +SELECT COUNT(1) INTO __deleted_future_measurements +FROM tempfetchdata +WHERE datetime > __max_measurement_date +; + + SELECT COUNT(1) INTO __deleted_past_measurements +FROM tempfetchdata +WHERE datetime < __min_measurement_date +; + WITH inserts AS ( INSERT INTO measurements (sensors_id, datetime, value) @@ -496,12 +642,15 @@ WITH inserts AS ( , datetime , value FROM tempfetchdata + WHERE datetime > __min_measurement_date + AND datetime < __max_measurement_date ON CONFLICT DO NOTHING - RETURNING sensors_id, datetime + RETURNING sensors_id, datetime, value ), inserted as ( - INSERT INTO temp_inserted_measurements (sensors_id, datetime) + INSERT INTO temp_inserted_measurements (sensors_id, datetime, value) SELECT sensors_id , datetime + , value FROM inserts RETURNING sensors_id, datetime ) @@ -513,18 +662,203 @@ INTO __inserted_start_datetime , __inserted_measurements FROM inserted; - --- No longer going to manage the fetch log in this way --- WITH updates AS ( --- UPDATE fetchlogs --- SET completed_datetime = clock_timestamp() --- , last_message = NULL -- reset any previous error --- WHERE key IN (SELECT key FROM ingestfiles) --- RETURNING 1) --- SELECT COUNT(1) INTO __keys --- FROM updates; +__insert_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + +-- mark the fetchlogs as done +WITH inserted AS ( + SELECT m.fetchlogs_id + , COUNT(m.*) as n_records + , COUNT(t.*) as n_inserted + , MIN(m.datetime) as fr_datetime + , MAX(m.datetime) as lr_datetime + , MIN(t.datetime) as fi_datetime + , MAX(t.datetime) as li_datetime + FROM tempfetchdata m + LEFT JOIN temp_inserted_measurements t ON (t.sensors_id = m.sensors_id AND t.datetime = m.datetime) + GROUP BY m.fetchlogs_id) +UPDATE fetchlogs +SET completed_datetime = CURRENT_TIMESTAMP +, inserted = COALESCE(n_inserted, 0) +, records = COALESCE(n_records, 0) +, first_recorded_datetime = fr_datetime +, last_recorded_datetime = lr_datetime +, first_inserted_datetime = fi_datetime +, last_inserted_datetime = li_datetime +FROM inserted +WHERE inserted.fetchlogs_id = fetchlogs.fetchlogs_id; + +-- track the time required to update cache tables +__process_start := clock_timestamp(); + +-- -- Now we can use those temp_inserted_measurements to update the cache tables +-- INSERT INTO sensors_latest ( +-- sensors_id +-- , datetime +-- , value +-- ) +-- ---- identify the row that has the latest value +-- WITH numbered AS ( +-- SELECT sensors_id +-- , datetime +-- , value +-- , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) as rn +-- FROM temp_inserted_measurements +-- ), latest AS ( +-- ---- only insert those rows +-- SELECT sensors_id +-- , datetime +-- , value +-- FROM numbered +-- WHERE rn = 1 +-- ) +-- SELECT l.sensors_id +-- , l.datetime +-- , l.value +-- FROM latest l +-- LEFT JOIN sensors_latest sl ON (l.sensors_id = sl.sensors_id) +-- WHERE 
sl.sensors_id IS NULL +-- OR l.datetime > sl.datetime +-- ON CONFLICT (sensors_id) DO UPDATE +-- SET datetime = EXCLUDED.datetime +-- , value = EXCLUDED.value +-- , modified_on = now() +-- --, fetchlogs_id = EXCLUDED.fetchlogs_id +-- ; + +-- update the exceedances +INSERT INTO sensor_exceedances (sensors_id, threshold_value, datetime_latest) + SELECT + m.sensors_id + , t.value + , MAX(datetime) + FROM temp_inserted_measurements m + JOIN sensors s ON (m.sensors_id = s.sensors_id) + JOIN thresholds t ON (s.measurands_id = t.measurands_id) + AND m.value > t.value + GROUP BY 1, 2 + ON CONFLICT (sensors_id, threshold_value) DO UPDATE SET + datetime_latest = GREATEST(sensor_exceedances.datetime_latest, EXCLUDED.datetime_latest) + , updated_on = now(); + +INSERT INTO sensors_rollup ( + sensors_id + , datetime_first + , datetime_last + , value_latest + , value_count + , value_avg + , value_min + , value_max + ) +---- identify the row that has the latest value +WITH numbered AS ( + SELECT sensors_id + , datetime + , value + , sum(1) OVER (PARTITION BY sensors_id) as value_count + , min(datetime) OVER (PARTITION BY sensors_id) as datetime_min + , avg(value) OVER (PARTITION BY sensors_id) as value_avg + , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) as rn + FROM temp_inserted_measurements +), latest AS ( +---- only insert those rows + SELECT sensors_id + , datetime + , value + , value_count + , value_avg + , datetime_min + FROM numbered + WHERE rn = 1 +) +SELECT l.sensors_id +, l.datetime_min -- first +, l.datetime -- last +, l.value -- last value +, l.value_count +, l.value_avg +, l.value -- min +, l.value -- max +FROM latest l +LEFT JOIN sensors_rollup sr ON (l.sensors_id = sr.sensors_id) +WHERE sr.sensors_id IS NULL +OR l.datetime > sr.datetime_last +OR l.datetime_min < sr.datetime_first +ON CONFLICT (sensors_id) DO UPDATE +SET datetime_last = GREATEST(sensors_rollup.datetime_last, EXCLUDED.datetime_last) +, value_latest = CASE WHEN EXCLUDED.datetime_last > sensors_rollup.datetime_last + THEN EXCLUDED.value_latest + ELSE sensors_rollup.value_latest + END +, value_count = sensors_rollup.value_count + EXCLUDED.value_count +, value_min = LEAST(sensors_rollup.value_min, EXCLUDED.value_latest) +, value_max = GREATEST(sensors_rollup.value_max, EXCLUDED.value_latest) +, datetime_first = LEAST(sensors_rollup.datetime_first, EXCLUDED.datetime_first) +, modified_on = now() +--, fetchlogs_id = EXCLUDED.fetchlogs_id +; +-- WITH spatial_inserts AS ( +-- INSERT INTO sensor_nodes_spatial_rollup ( +-- sensor_nodes_id +-- , geom +-- , cell_size +-- , start_datetime +-- , end_datetime +-- , measurements_count +-- , added_on) +-- SELECT sensor_nodes_id +-- , st_snaptogrid(s.geom, 250) +-- , 250 +-- , MIN(datetime) as start_datetime +-- , MAX(datetime) as end_datetime +-- , COUNT(DISTINCT datetime) as measurements +-- , now() +-- FROM temp_inserted_measurements +-- JOIN tempfetchdata_sensors s USING (sensors_id) +-- JOIN sensor_systems ss USING (sensor_systems_id) +-- WHERE lat IS NOT NULL +-- AND lon IS NOT NULL +-- GROUP BY 1,2 +-- ON CONFLICT (sensor_nodes_id, geom) DO UPDATE SET +-- start_datetime = LEAST(sensor_nodes_spatial_rollup.start_datetime, EXCLUDED.start_datetime) +-- , end_datetime = GREATEST(sensor_nodes_spatial_rollup.end_datetime, EXCLUDED.end_datetime) +-- , measurements_count = sensor_nodes_spatial_rollup.measurements_count + EXCLUDED.measurements_count +-- , modified_on = now() +-- RETURNING 1) +-- SELECT COUNT(1) INTO __inserted_spatial_rollups +-- FROM spatial_inserts; + + 
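The sensors_rollup upsert above reduces the freshly inserted measurements to one row per sensor before merging into the rollup: row_number() partitioned by sensors_id and ordered by datetime DESC marks the newest reading as rn = 1, while the window aggregates carry the per-sensor count, average and first timestamp along with it. A self-contained sketch of that pattern with throwaway values (not part of the ingest schema):

WITH readings(sensors_id, datetime, value) AS (
  VALUES (1, '2024-01-01 00:00+00'::timestamptz, 10.0)
       , (1, '2024-01-01 01:00+00'::timestamptz, 12.5)
       , (2, '2024-01-01 00:30+00'::timestamptz, 7.2)
), numbered AS (
  SELECT sensors_id
       , datetime
       , value
       , sum(1) OVER (PARTITION BY sensors_id) AS value_count
       , avg(value) OVER (PARTITION BY sensors_id) AS value_avg
       , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) AS rn
  FROM readings
)
SELECT sensors_id, datetime AS datetime_last, value AS value_latest, value_count, value_avg
FROM numbered
WHERE rn = 1;

Only the rn = 1 rows reach the ON CONFLICT branch, which then folds the counts, min/max and latest value into any existing sensors_rollup row.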
+-- Update the table that will help to track hourly rollups +--INSERT INTO hourly_stats (datetime) +-- SELECT date_trunc('hour', datetime) +-- FROM temp_inserted_measurements +-- GROUP BY 1 +--ON CONFLICT (datetime) DO UPDATE +--SET modified_on = now(); + + WITH inserted_hours AS ( + -- first we group things, adding an hour to make it time-ending after truncating + SELECT datetime + '1h'::interval as datetime + , utc_offset(datetime + '1h'::interval, tz.tzid) as tz_offset + FROM temp_inserted_measurements m + JOIN sensors s ON (s.sensors_id = m.sensors_id) + JOIN sensor_systems sy ON (s.sensor_systems_id = sy.sensor_systems_id) + JOIN sensor_nodes sn ON (sy.sensor_nodes_id = sn.sensor_nodes_id) + JOIN timezones tz ON (sn.timezones_id = tz.timezones_id) + GROUP BY 1, 2 + ) + INSERT INTO hourly_data_queue (datetime, tz_offset) + SELECT as_utc_hour(datetime, tz_offset), tz_offset + FROM inserted_hours + GROUP BY 1, 2 + ON CONFLICT (datetime, tz_offset) DO UPDATE + SET modified_on = now(); + + +-- update the table that will track the daily exports WITH e AS ( INSERT INTO open_data_export_logs (sensor_nodes_id, day, records, measurands, modified_on) SELECT sn.sensor_nodes_id @@ -548,7 +882,91 @@ SELECT COUNT(1) INTO __exported_days FROM e; -RAISE NOTICE 'total-measurements: %, deleted-timescaledb: %, deleted-future-measurements: %, deleted-past-measurements: %, from: %, to: %, inserted-from: %, inserted-to: %, updated-nodes: %, inserted-measurements: %, inserted-measurands: %, inserted-nodes: %, rejected-nodes: %, rejected-systems: %, rejected-sensors: %, exported-sensor-days: %, process-time-ms: %, source: fetch' +__cache_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + + +INSERT INTO ingest_stats ( + ingest_method + -- total + , total_measurements_processed + , total_measurements_inserted + , total_measurements_rejected + , total_nodes_processed + , total_nodes_inserted + , total_nodes_updated + , total_nodes_rejected + -- total times + , total_process_time_ms + , total_insert_time_ms + , total_cache_time_ms + -- latest + , latest_measurements_processed + , latest_measurements_inserted + , latest_measurements_rejected + , latest_nodes_processed + , latest_nodes_inserted + , latest_nodes_updated + , latest_nodes_rejected + -- times + , latest_process_time_ms + , latest_insert_time_ms + , latest_cache_time_ms + ) VALUES ( + -- totals + __ingest_method + , __total_measurements + , __inserted_measurements + , __rejected_measurements + , __total_nodes + , __inserted_nodes + , __updated_nodes + , __rejected_nodes + -- times + , __process_time_ms + , __insert_time_ms + , __cache_time_ms + -- latest + , __total_measurements + , __inserted_measurements + , __rejected_measurements + , __total_nodes + , __inserted_nodes + , __updated_nodes + , __rejected_nodes + -- times + , __process_time_ms + , __insert_time_ms + , __cache_time_ms +) ON CONFLICT (ingest_method) DO UPDATE SET + -- totals + total_measurements_processed = ingest_stats.total_measurements_processed + EXCLUDED.total_measurements_processed + , total_measurements_inserted = ingest_stats.total_measurements_inserted + EXCLUDED.total_measurements_inserted + , total_measurements_rejected = ingest_stats.total_measurements_rejected + EXCLUDED.total_measurements_rejected + , total_nodes_processed = ingest_stats.total_nodes_processed + EXCLUDED.total_nodes_processed + , total_nodes_inserted = ingest_stats.total_nodes_inserted + EXCLUDED.total_nodes_inserted + , total_nodes_updated = ingest_stats.total_nodes_updated + 
EXCLUDED.total_nodes_updated + , total_nodes_rejected = ingest_stats.total_nodes_rejected + EXCLUDED.total_nodes_rejected + , total_process_time_ms = ingest_stats.total_process_time_ms + EXCLUDED.total_process_time_ms + , total_insert_time_ms = ingest_stats.total_insert_time_ms + EXCLUDED.total_insert_time_ms + , total_cache_time_ms = ingest_stats.total_cache_time_ms + EXCLUDED.total_cache_time_ms + -- latest + , latest_measurements_processed = EXCLUDED.latest_measurements_processed + , latest_measurements_inserted = EXCLUDED.latest_measurements_inserted + , latest_measurements_rejected = EXCLUDED.latest_measurements_rejected + , latest_nodes_processed = EXCLUDED.latest_nodes_processed + , latest_nodes_inserted = EXCLUDED.latest_nodes_inserted + , latest_nodes_updated = EXCLUDED.latest_nodes_updated + , latest_nodes_rejected = EXCLUDED.latest_nodes_rejected + -- times + , latest_process_time_ms = EXCLUDED.latest_process_time_ms + , latest_insert_time_ms = EXCLUDED.latest_insert_time_ms + , latest_cache_time_ms = EXCLUDED.latest_cache_time_ms + , ingest_count = ingest_stats.ingest_count + 1 + , ingested_on = EXCLUDED.ingested_on; + + + +RAISE NOTICE 'total-measurements: %, deleted-timescaledb: %, deleted-future-measurements: %, deleted-past-measurements: %, from: %, to: %, inserted-from: %, inserted-to: %, updated-nodes: %, inserted-measurements: %, inserted-measurands: %, inserted-nodes: %, rejected-nodes: %, rejected-systems: %, rejected-sensors: %, exported-sensor-days: %, inserted-spatial-rollups: %, process-time-ms: %, insert-time-ms: %, cache-time-ms: %, source: fetch' , __total_measurements , __deleted_timescaledb , __deleted_future_measurements @@ -565,7 +983,10 @@ RAISE NOTICE 'total-measurements: %, deleted-timescaledb: %, deleted-future-meas , __rejected_systems , __rejected_sensors , __exported_days - , 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + , __inserted_spatial_rollups + , __process_time_ms + , __insert_time_ms + , __cache_time_ms; END $$; diff --git a/ingest/fetch_staging.sql b/ingest/fetch_staging.sql index 5ea6e43..48ddc93 100644 --- a/ingest/fetch_staging.sql +++ b/ingest/fetch_staging.sql @@ -1,4 +1,11 @@ -CREATE TEMP TABLE IF NOT EXISTS tempfetchdata ( +-- DROP TABLE IF EXISTS tempfetchdata +-- , temp_inserted_measurements +-- , tempfetchdata_nodes +-- , tempfetchdata_sensors +-- , tempfetchdata_sensors_clean; + +CREATE {table} IF NOT EXISTS tempfetchdata ( + fetchlogs_id int, location text, value float, unit text, @@ -17,15 +24,19 @@ CREATE TEMP TABLE IF NOT EXISTS tempfetchdata ( sensors_id int ); -CREATE TEMP TABLE IF NOT EXISTS ingestfiles( - key text -); +--CREATE {table} IF NOT EXISTS ingestfiles( +-- key text +--); -- This table will hold measurements that have -- actually been inserted into the measurements table -- this is to deal with the overlap that we see in the -- incoming files -CREATE TEMP TABLE IF NOT EXISTS temp_inserted_measurements ( - sensors_id int, - datetime timestamptz +CREATE {table} IF NOT EXISTS temp_inserted_measurements ( + sensors_id int + , datetime timestamptz + , value double precision + , lat double precision + , lon double precision + , fetchlogs_id int ); diff --git a/ingest/handler.py b/ingest/handler.py index 2bb72cc..f827c3b 100644 --- a/ingest/handler.py +++ b/ingest/handler.py @@ -2,7 +2,8 @@ import logging import psycopg2 from .settings import settings -from .lcs import load_measurements_db, load_metadata_db +from .lcs import load_metadata_db +from .lcsV2 import load_measurements_db from .fetch import 
load_db from time import time import json @@ -38,7 +39,6 @@ def handler(event, context): else: keys = getKeysFromS3Record(record) - logger.debug(keys) for obj in keys: bucket = obj['bucket'] key = obj['key'] @@ -47,24 +47,29 @@ def handler(event, context): ) try: + file_size = lov2["Contents"][0]["Size"] last_modified = lov2["Contents"][0]["LastModified"] except KeyError: logger.error(""" - could not get last modified time from obj + could not get info from obj """) + file_size = None last_modified = datetime.now().replace( tzinfo=timezone.utc ) cursor.execute( """ - INSERT INTO fetchlogs (key, last_modified) - VALUES(%s, %s) + INSERT INTO fetchlogs (key + , file_size + , last_modified + ) + VALUES(%s, %s, %s) ON CONFLICT (key) DO UPDATE SET last_modified=EXCLUDED.last_modified, completed_datetime=NULL RETURNING *; """, - (key, last_modified,), + (key, file_size, last_modified,), ) row = cursor.fetchone() connection.commit() @@ -99,6 +104,10 @@ def getKeysFromS3Record(record): def cronhandler(event, context): + if settings.PAUSE_INGESTING: + logger.info('Ingesting is paused') + return None + start_time = time() timeout = settings.INGEST_TIMEOUT # manual timeout for testing ascending = settings.FETCH_ASCENDING if 'ascending' not in event else event['ascending'] @@ -107,86 +116,58 @@ def cronhandler(event, context): metadata_limit = settings.METADATA_LIMIT if 'metadata_limit' not in event else event['metadata_limit'] logger.info(f"Running cron job: {event['source']}, ascending: {ascending}") - with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: - connection.set_session(autocommit=True) - with connection.cursor() as cursor: - cursor.execute( - """ - SELECT count(*) - FROM fetchlogs - WHERE completed_datetime is null - AND key ~*'stations'; - """, - ) - metadata = cursor.fetchone() - cursor.execute( - """ - SELECT count(*) - FROM fetchlogs - WHERE completed_datetime is null - AND key ~*'measures'; - """, - ) - pipeline = cursor.fetchone() - cursor.execute( - """ - SELECT count(*) - FROM fetchlogs - WHERE completed_datetime is null - AND key ~*'realtime'; - """, - ) - realtime = cursor.fetchone() - for notice in connection.notices: - logger.debug(notice) - - metadata = 0 if metadata is None else metadata[0] - realtime = 0 if realtime is None else realtime[0] - pipeline = 0 if pipeline is None else pipeline[0] - logger.info(f"{metadata_limit}/{metadata} metadata, {realtime_limit}/{realtime} openaq, {pipeline_limit}/{pipeline} pipeline records pending") # these exceptions are just a failsafe so that if something # unaccounted for happens we can still move on to the next # process. 
In case of this type of exception we will need to # fix it asap try: - if metadata > 0 and metadata_limit > 0: + if metadata_limit > 0: cnt = 0 - while cnt < metadata and (time() - start_time) < timeout: - cnt += load_metadata_db(metadata_limit, ascending) + loaded = 1 + while ( + loaded > 0 + and (time() - start_time) < timeout + ): + loaded = load_metadata_db(metadata_limit, ascending) + cnt += loaded logger.info( - "loaded %s of %s metadata records, timer: %0.4f", - cnt, metadata, time() - start_time + "loaded %s metadata records, timer: %0.4f", + cnt, time() - start_time ) except Exception as e: logger.error(f"load metadata failed: {e}") try: - if realtime > 0 and realtime_limit > 0: + if realtime_limit > 0: cnt = 0 loaded = 1 while ( loaded > 0 - and cnt < realtime and (time() - start_time) < timeout ): loaded = load_db(realtime_limit, ascending) cnt += loaded logger.info( - "loaded %s of %s fetch records, timer: %0.4f", - cnt, realtime, time() - start_time + "loaded %s fetch records, timer: %0.4f", + cnt, time() - start_time ) except Exception as e: logger.error(f"load realtime failed: {e}") try: - if pipeline > 0 and pipeline_limit > 0: + if pipeline_limit > 0: cnt = 0 - while cnt < pipeline and (time() - start_time) < timeout: - cnt += load_measurements_db(pipeline_limit, ascending) + loaded = 1 + while ( + loaded > 0 + and (time() - start_time) < timeout + ): + loaded = load_measurements_db(pipeline_limit, ascending) + cnt += loaded logger.info( - "loaded %s of %s pipeline records, timer: %0.4f", - cnt, pipeline, time() - start_time + "loaded %s pipeline records, timer: %0.4f", + cnt, time() - start_time ) except Exception as e: logger.error(f"load pipeline failed: {e}") diff --git a/ingest/lcs.py b/ingest/lcs.py index 43b2da0..c59df22 100644 --- a/ingest/lcs.py +++ b/ingest/lcs.py @@ -4,6 +4,7 @@ import dateparser import pytz import orjson +import uuid import csv from time import time from urllib.parse import unquote_plus @@ -14,7 +15,13 @@ import typer from io import StringIO from .settings import settings -from .utils import get_query, clean_csv_value, StringIteratorIO, fix_units +from .utils import ( + get_query, + clean_csv_value, + StringIteratorIO, + fix_units, + load_fetchlogs, +) s3 = boto3.resource("s3") s3c = boto3.client("s3") @@ -22,7 +29,7 @@ app = typer.Typer() dir_path = os.path.dirname(os.path.realpath(__file__)) -FETCH_BUCKET = settings.ETL_BUCKET +FETCH_BUCKET = settings.FETCH_BUCKET logger = logging.getLogger(__name__) @@ -91,7 +98,7 @@ def system(self, j, node_id, fetchlogsId): self.systems.append(system) def node(self, j): - node = {} + node = {"fetchlogs_id": None} metadata = {} if "sensor_node_id" in j: id = j["sensor_node_id"] @@ -99,9 +106,7 @@ def node(self, j): return None # if we have passed the fetchlogs_id we should track it if "fetchlogs_id" in j: - fetchlogsId = j["fetchlogs_id"] - else: - fetchlogsId = None + node["fetchlogs_id"] = j["fetchlogs_id"] for key, value in j.items(): key = str.replace(key, "sensor_node_", "") @@ -122,7 +127,7 @@ def node(self, j): except Exception: node["geom"] = None elif key == "sensor_systems": - self.system(value, id, fetchlogsId) + self.system(value, id, node["fetchlogs_id"]) else: metadata[key] = value node["metadata"] = orjson.dumps(metadata).decode() @@ -159,12 +164,12 @@ def get_station(self, key, fetchlogsId): self.node(obj) def load_data(self): - logger.debug(f"load_data: {self.keys}") + logger.debug(f"load_data: {self.keys}, {self.nodes}") with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: 
connection.set_session(autocommit=True) with connection.cursor() as cursor: start_time = time() - self.create_staging_table(cursor) + create_staging_table(cursor) write_csv( cursor, @@ -247,17 +252,7 @@ def load_data(self): def process_data(self, cursor): query = get_query("lcs_ingest_full.sql") cursor.execute(query) - # query = get_query("lcs_ingest_nodes.sql") - # cursor.execute(query) - - # query = get_query("lcs_ingest_systems.sql") - # cursor.execute(query) - - # query = get_query("lcs_ingest_sensors.sql") - # cursor.execute(query) - def create_staging_table(self, cursor): - cursor.execute(get_query("lcs_staging.sql")) def get_metadata(self): hasnew = False @@ -266,6 +261,7 @@ def get_metadata(self): id = obj["id"] last_modified = obj["LastModified"] try: + logger.debug(f"Loading station file: {id}:{key}") self.get_station(key, id) self.keys.append( { @@ -286,6 +282,14 @@ def get_metadata(self): self.load_data() + +def create_staging_table(cursor): + # table and batch are used primarily for testing + cursor.execute(get_query( + "lcs_staging.sql", + table="TEMP TABLE" if settings.USE_TEMP_TABLES else 'TABLE' + )) + def write_csv(cursor, data, table, columns): fields = ",".join(columns) sio = StringIO() @@ -316,23 +320,40 @@ def load_metadata_bucketscan(count=100): break -def load_metadata_db(count=250, ascending: bool = False): +def load_metadata_db(limit=250, ascending: bool = False): order = 'ASC' if ascending else 'DESC' + pattern = 'lcs-etl-pipeline/stations/' + rows = load_fetchlogs(pattern, limit, ascending) + contents = [] + for row in rows: + logger.debug(row) + contents.append( + { + "Key": unquote_plus(row[1]), + "LastModified": row[2], + "id": row[0], + } + ) + if len(contents) > 0: + load_metadata(contents) + # data = LCSData(contents) + # data.get_metadata() + return len(rows) + + +def load_metadata_batch(batch: str): with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: connection.set_session(autocommit=True) with connection.cursor() as cursor: cursor.execute( - f""" + """ SELECT key , last_modified , fetchlogs_id FROM fetchlogs - WHERE key~'lcs-etl-pipeline/stations/' - AND completed_datetime is null - ORDER BY last_modified {order} nulls last - LIMIT %s; + WHERE batch_uuid = %s """, - (count,), + (batch,), ) rows = cursor.fetchall() rowcount = cursor.rowcount @@ -348,11 +369,23 @@ def load_metadata_db(count=250, ascending: bool = False): for notice in connection.notices: logger.debug(notice) if len(contents) > 0: - data = LCSData(contents) - data.get_metadata() + load_metadata(contents) + # data = LCSData(contents) + # data.get_metadata() return rowcount +def load_metadata(keys): + logger.debug(f'Load metadata: {len(keys)}') + data = LCSData(keys) + try: + data.get_metadata() + except Exception as e: + ids = ','.join([str(k['id']) for k in keys]) + logger.error(f'load error: {e} ids: {ids}') + raise + + def select_object(key): key = unquote_plus(key) if str.endswith(key, ".gz"): @@ -362,7 +395,7 @@ def select_object(key): try: content = "" resp = s3c.select_object_content( - Bucket=settings.ETL_BUCKET, + Bucket=settings.FETCH_BUCKET, Key=key, ExpressionType="SQL", Expression=""" @@ -426,13 +459,19 @@ def get_measurements(key, fetchlogsId): dt = row[2] try: - dt = datetime.fromtimestamp(int(dt), timezone.utc) + if dt.isnumeric(): + if len(dt) == 13: + dt = datetime.fromtimestamp(int(dt)/1000.0, timezone.utc) + else: + dt = datetime.fromtimestamp(int(dt), timezone.utc) + row[2] = dt.isoformat() except Exception: try: dt = 
dateparser.parse(dt).replace(tzinfo=timezone.utc) except Exception: logger.warning(f"Exception in parsing date for {dt} {Exception}") - row[2] = dt.isoformat() + + #row[2] = dt.isoformat() # addd the log id for tracing purposes row.insert(5, fetchlogsId) ret.append(row) @@ -442,6 +481,7 @@ def get_measurements(key, fetchlogsId): def submit_file_error(key, e): """Update the log to reflect the error and prevent a retry""" + logger.error(f"{key}: {e}") with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: connection.set_session(autocommit=True) with connection.cursor() as cursor: @@ -451,26 +491,25 @@ def submit_file_error(key, e): SET completed_datetime = clock_timestamp() , last_message = %s WHERE key = %s - """ - ), - (f"ERROR: {e}", key), + """, + (f"ERROR: {e}", key), + ) + def to_tsv(row): tsv = "\t".join(map(clean_csv_value, row)) + "\n" return tsv return "" + def load_measurements_file(fetchlogs_id: int): with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: connection.set_session(autocommit=True) with connection.cursor() as cursor: cursor.execute( """ - SELECT key - , init_datetime - , loaded_datetime - , completed_datetime - , last_message + SELECT fetchlogs_id + , key FROM fetchlogs WHERE fetchlogs_id = %s LIMIT 1 @@ -479,34 +518,29 @@ def load_measurements_file(fetchlogs_id: int): (fetchlogs_id,), ) rows = cursor.fetchall() - print(rows) - keys = [r[0] for r in rows] - load_measurements(keys) + load_measurements(rows) + + +def load_measurements_batch(batch: str): + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute( + """ + SELECT fetchlogs_id + , key + FROM fetchlogs + WHERE batch_uuid = %s + """, + (batch,), + ) + rows = cursor.fetchall() + load_measurements(rows) def load_measurements_db(limit=250, ascending: bool = False): - order = 'ASC' if ascending else 'DESC' - conn = psycopg2.connect(settings.DATABASE_WRITE_URL) - cur = conn.cursor() - cur.execute( - f""" - SELECT fetchlogs_id - , key - , last_modified - FROM fetchlogs - WHERE key~E'^lcs-etl-pipeline/measures/.*\\.csv' - AND completed_datetime is null - ORDER BY last_modified {order} nulls last - LIMIT %s - ; - """, - (limit,), - ) - rows = cur.fetchall() - # keys = [r[0] for r in rows] - conn.commit() - cur.close() - conn.close() + pattern = '^lcs-etl-pipeline/measures/.*\\.csv' + rows = load_fetchlogs(pattern, limit, ascending) load_measurements(rows) return len(rows) @@ -527,13 +561,12 @@ def load_measurements(rows): logger.info("load_measurements:get: %s keys; %s rows; %0.4f seconds", len(rows), len(data), time() - start_time) if len(data) > 0: - with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: connection.set_session(autocommit=True) with connection.cursor() as cursor: - cursor.execute(get_query("lcs_meas_staging.sql")) - start = time() + create_staging_table(cursor) + write_csv( cursor, new, "keys", ["key",], ) @@ -551,43 +584,11 @@ def load_measurements(rows): mrows = cursor.rowcount status = cursor.statusmessage logger.debug(f"COPY Rows: {mrows} Status: {status}") - cursor.execute( - """ - INSERT INTO fetchlogs( - key, - loaded_datetime - ) SELECT key, clock_timestamp() - FROM keys - ON CONFLICT (key) DO - UPDATE - SET - loaded_datetime=EXCLUDED.loaded_datetime - ; - """ - ) - connection.commit() + cursor.execute(get_query("lcs_meas_ingest.sql")) for notice in connection.notices: print(notice) - #irows = cursor.rowcount - #logger.info("load_measurements:insert: %s 
rows; %0.4f seconds", irows, time() - start) - #status = cursor.statusmessage - #logger.debug(f"INGEST Rows: {irows} Status: {status}") - cursor.execute( - """ - INSERT INTO fetchlogs( - key, - completed_datetime - ) SELECT key, clock_timestamp() - FROM keys - ON CONFLICT (key) DO - UPDATE - SET - completed_datetime=EXCLUDED.completed_datetime - ; - """ - ) logger.info( "load_measurements: keys: %s; rows: %s; time: %0.4f", len(rows), mrows, time() - start_time) diff --git a/ingest/lcsV2.py b/ingest/lcsV2.py new file mode 100644 index 0000000..f3fecae --- /dev/null +++ b/ingest/lcsV2.py @@ -0,0 +1,959 @@ +import os +import logging +from datetime import datetime, timezone +import dateparser +import pytz +import orjson +import uuid +import csv +from time import time +from urllib.parse import unquote_plus +import warnings +import re + +import boto3 +import psycopg2 +import typer +from io import StringIO +from .settings import settings +from .utils import ( + get_query, + clean_csv_value, + StringIteratorIO, + fix_units, + load_fetchlogs, + select_object, + get_file, +) + +s3 = boto3.resource("s3") +s3c = boto3.client("s3") + +app = typer.Typer() +dir_path = os.path.dirname(os.path.realpath(__file__)) + +FETCH_BUCKET = settings.FETCH_BUCKET + +logger = logging.getLogger(__name__) + +warnings.filterwarnings( + "ignore", + message="The localize method is no longer necessary, as this time zone supports the fold attribute", +) + + +def to_geometry(key, data): + # could be passed as lat/lng or coordinates + if key in ['lat','lon']: + lat = data.get('lat') + lon = data.get('lon') + elif key == 'coordinates': + lat = data.get('coordinates', {}).get('lat') + lon = data.get('coordinates', {}).get('lon') + if None in [lat, lon]: + raise Exception('Missing value for coordinates') + # could add more checks + return f"SRID=4326;POINT({lon} {lat})" + +def to_timestamp(key, data): + dt = data.get(key) + value = None + if dt in [None, '']: + logger.warning('Passed none type value for timestamp') + # no need for exception, we check for nones later + return None; + if dt.isnumeric(): + if len(dt) == 13: + dt = datetime.fromtimestamp(int(dt)/1000.0, timezone.utc) + else: + dt = datetime.fromtimestamp(int(dt), timezone.utc) + else: + return dt + dt = dateparser.parse(dt).replace(tzinfo=timezone.utc) + + return dt.isoformat() + + +class IngestClient: + def __init__( + self, key=None, fetchlogs_id=None, data=None + ): + self.key = key + self.fetchlogs_id = fetchlogs_id + self.keys = [] + self.st = datetime.now().replace(tzinfo=pytz.UTC) + self.sensors = [] + self.systems = [] + self.flags = [] + self.nodes = [] + self.node_ids = [] + self.system_ids = [] + self.sensor_ids = [] + self.measurements = [] + self.matching_method = 'ingest-id' + self.source = None + self.node_map = { + "fetchlogs_id": {}, + "site_name": { "col":"site_name" }, + "source_name": {}, + "ismobile": {}, + "ingest_id": {}, + "matching_method": {}, + "location": {"col":"ingest_id"}, + "sensor_node_id": {"col":"ingest_id"}, + "label": {"col":"site_name"}, + "coordinates": {"col":"geom","func": to_geometry }, + "geometry": {"col":"geom", "func": to_geometry }, + "lat": {"col":"geom","func": to_geometry }, + "lon": {"col":"geom","func": to_geometry }, + } + self.measurement_map = { + "sensor_id": {"col": "ingest_id"}, + "ingest_id": {"col": "ingest_id"}, + "timestamp": {"col": "datetime", "func": to_timestamp }, + "datetime": {"col": "datetime", "func": to_timestamp }, + "measure": {"col": "value"}, + "value": {}, + "lat": {}, + "lon": {}, + } + # 
if fetchlogs_id but no key or data + # get key + # if key, load data + # if data + if data is not None and isinstance(data, dict): + self.load(data) + + def process(self, key, data, mp): + col = None + value = None + m = mp.get(key) + if m is not None: + col = m.get('col', key) + func = m.get('func') + if func is None: + # just return value + value = data.get(key) + else: + # functions require key and data + value = func(key, data) + return col, value + + def dump(self, load: bool = True): + """ + Dump any data that is currenly loaded into the database + We will dump if there is data OR if we have loaded any keys + We do this because its possible that a file is empty but we + need to run the dump method to get the file to be marked as finished + """ + logger.debug(f"Dumping data from {len(self.keys)} files") + if len(self.nodes)>0 or len(self.keys)>0: + self.dump_locations(load) + if len(self.measurements)>0 or len(self.keys)>0: + self.dump_measurements(load) + + def dump_locations(self, load: bool = True): + """ + Dump the nodes into the temporary tables + """ + db_table = "TEMP TABLE" if (settings.USE_TEMP_TABLES and load) else "TABLE" + logger.debug(f"Dumping {len(self.nodes)} nodes using {db_table} ({settings.USE_TEMP_TABLES}|{load})") + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + start_time = time() + + cursor.execute(get_query( + "temp_locations_dump.sql", + table=db_table + )) + + write_csv( + cursor, + self.keys, + f"staging_keys", + [ + "key", + "last_modified", + "fetchlogs_id", + ], + ) + # update by id instead of key due to matching issue + cursor.execute( + """ + UPDATE fetchlogs + SET loaded_datetime = clock_timestamp() + , last_message = 'load_data' + WHERE fetchlogs_id IN (SELECT fetchlogs_id FROM staging_keys) + """ + ) + connection.commit() + + write_csv( + cursor, + self.nodes, + "staging_sensornodes", + [ + "ingest_id", + "site_name", + "matching_method", + "source_name", + "source_id", + "ismobile", + "geom", + "metadata", + "fetchlogs_id", + ], + ) + + write_csv( + cursor, + self.systems, + "staging_sensorsystems", + [ + "ingest_id", + "instrument_ingest_id", + "ingest_sensor_nodes_id", + "metadata", + "fetchlogs_id", + ], + ) + + write_csv( + cursor, + self.sensors, + "staging_sensors", + [ + "ingest_id", + "ingest_sensor_systems_id", + "measurand", + "units", + "status", + "logging_interval_seconds", + "averaging_interval_seconds", + "metadata", + "fetchlogs_id", + ], + ) + + write_csv( + cursor, + self.flags, + "staging_flags", + [ + "ingest_id", + "sensor_ingest_id", + "datetime_from", + "datetime_to", + "note", + "metadata", + "fetchlogs_id", + ], + ) + + connection.commit() + + # and now we load all the nodes,systems and sensors + if load: + query = get_query("etl_process_nodes.sql") + cursor.execute(query) + + for notice in connection.notices: + logger.debug(notice) + + cursor.execute( + """ + UPDATE fetchlogs + SET completed_datetime = clock_timestamp() + , last_message = NULL + WHERE fetchlogs_id IN (SELECT fetchlogs_id FROM staging_keys) + """ + ) + + connection.commit() + logger.info("dump_locations: locations: %s; time: %0.4f", len(self.nodes), time() - start_time) + for notice in connection.notices: + logger.debug(notice) + + + + def dump_measurements(self, load: bool = True): + db_table = "TEMP TABLE" if (settings.USE_TEMP_TABLES and load) else "TABLE" + logger.debug(f"Dumping {len(self.measurements)} measurements using {db_table} 
({settings.USE_TEMP_TABLES}|{load})") + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + start_time = time() + + cursor.execute(get_query( + "temp_measurements_dump.sql", + table=db_table + )) + + iterator = StringIteratorIO( + (to_tsv(line) for line in self.measurements) + ) + cursor.copy_expert( + """ + COPY staging_measurements (ingest_id, source_name, source_id, measurand, value, datetime, lon, lat, fetchlogs_id) + FROM stdin; + """, + iterator, + ) + + if load: + logger.info(f'processing {len(self.measurements)} measurements'); + query = get_query("etl_process_measurements.sql") + try: + cursor.execute(query) + connection.commit() + logger.info("dump_measurements: measurements: %s; time: %0.4f", len(self.measurements), time() - start_time) + for notice in connection.notices: + logger.debug(notice) + + except Exception as err: + logger.error(err) + + + def load(self, data = {}): + if "meta" in data.keys(): + self.load_metadata(data.get('meta')) + if "locations" in data.keys(): + self.load_locations(data.get('locations')) + if "measures" in data.keys(): + self.load_measurements(data.get('measures')) + + + def reset(self): + """ + Reset the client to the new state. Mostly for testing purposes + """ + logger.debug("Reseting the client data") + self.measurements = [] + self.nodes = [] + self.systems = [] + self.sensors = [] + self.flags = [] + self.keys = [] + self.key = None + self.fetchlogs_id = None + self.node_ids = [] + self.system_ids = [] + self.sensor_ids = [] + + + def load_keys(self, rows): + # for each fetchlog we need to read and load + for row in rows: + key = row[1] + fetchlogs_id = row[0] + last_modified = row[2] + self.load_key(key, fetchlogs_id, last_modified) + + + def load_key(self, key, fetchlogs_id, last_modified): + logger.debug(f"Loading key: {fetchlogs_id}//:{key}") + is_csv = bool(re.search(r"\.csv(.gz)?$", key)) + is_json = bool(re.search(r"\.(nd)?json(.gz)?$", key)) + self.fetchlogs_id = fetchlogs_id + + # is it a local file? 
This is used for dev + # but likely fine to leave in + if os.path.exists(os.path.expanduser(key)): + content = get_file(os.path.expanduser(key)).read() + else: + content = select_object(key) + + if is_json: + logger.debug(f"Read JSON containing {len(content)} characters") + else: + logger.debug(f"Read CSV containing {len(content)} lines") + + if is_csv: + # all csv data will be measurements + for rw in csv.reader(content.split("\n")): + self.add_measurement(rw) + elif is_json: + # all json data should just be parsed and loaded + data = orjson.loads(content) + self.load(data) + else: + raise Exception('No idea what to do') + + # add the key to the table to update + self.keys.append({"key": key, "last_modified": last_modified, "fetchlogs_id": fetchlogs_id}) + + + def load_metadata(self, meta): + if "source" in meta.keys(): + self.source = meta.get('source') + if "matching_method" in meta.keys(): + self.matching_method = meta.get('matching_method') + if "schema" in meta.keys(): + self.schema = meta.get('schema') + + def load_locations(self, locations): + for loc in locations: + self.add_node(loc) + + def load_measurements(self, measurements): + logger.debug(f'Loading {len(measurements)} measurements') + for meas in measurements: + self.add_measurement(meas) + + + def add_sensor(self, j, system_id, fetchlogsId): + for s in j: + sensor = {} + metadata = {} + sensor["ingest_sensor_systems_id"] = system_id + sensor["fetchlogs_id"] = fetchlogsId + + if "sensor_id" in s: + id = s.get("sensor_id") + elif "id" in s: + id = s.get("id") + else: + id = system_id + + if id in self.sensor_ids: + # would it make more sense to merge or skip or throw error? + # merge and submit a warning maybe? + continue + + sensor["ingest_id"] = id + + for key, value in s.items(): + key = str.replace(key, "sensor_", "") + if key == "flags": + self.add_flags(value, id, fetchlogsId) + elif key == "measurand_parameter": + sensor["measurand"] = value + elif key == "measurand_unit": + sensor["units"] = fix_units(value) + elif key == "status": + sensor["status"] = value + elif key == "interval_seconds": + sensor["logging_interval_seconds"] = value + sensor["averaging_interval_seconds"] = value + else: + metadata[key] = value + if not sensor.get('measurand'): + # get it from the ingest id + ingest_arr = sensor.get('ingest_id').split('-') + sensor['measurand'] = ingest_arr[-1] # take the last one + sensor["metadata"] = orjson.dumps(metadata).decode() + self.sensors.append(sensor) + self.sensor_ids.append(id) + + def add_flags(self, flags, sensor_id, fetchlogsId): + for f in flags: + flag = {} + metadata = {} + flag["sensor_ingest_id"] = sensor_id + flag["fetchlogs_id"] = fetchlogsId + for key, value in f.items(): + key = str.replace(key, "flag_", "") + if key == "id": + v = str.replace(value, f"{sensor_id}-", "") + flag["ingest_id"] = v + + elif key == 'datetime_from': + flag["datetime_from"] = value + elif key == 'datetime_to': + flag["datetime_to"] = value + elif key == 'note': + flag["note"] = value + else: + metadata[key] = value + + flag["metadata"] = orjson.dumps(metadata).decode() + self.flags.append(flag) + + def add_systems(self, j, node_id, fetchlogsId): + for s in j: + system = {} + metadata = {} + if "sensor_system_id" in s: + id = s.get("sensor_system_id") + elif "system_id" in s: + id = s.get("system_id") + else: + id = node_id + + if id in self.system_ids: + # would it make more sense to merge or skip or throw error? 
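+            # current behavior: the first system definition with a given ingest id wins;
+            # later duplicates in the same batch are skipped rather than merged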
+ continue + + ingest_arr = id.split('-') + # this will not work with a uuid passed as a site id + if len(ingest_arr) == 3: + system["instrument_ingest_id"] = ingest_arr[-1]; + + system["ingest_sensor_nodes_id"] = node_id + system["ingest_id"] = id + system["fetchlogs_id"] = fetchlogsId + for key, value in s.items(): + key = str.replace(key, "sensor_system_", "") + if key == "sensors": + self.add_sensor(value, id, fetchlogsId) + else: + metadata[key] = value + system["metadata"] = orjson.dumps(metadata).decode() + self.systems.append(system) + self.system_ids.append(id) + + def add_node(self, j): + fetchlogs_id = j.get('fetchlogs_id', self.fetchlogs_id) + node = { "fetchlogs_id": fetchlogs_id } + metadata = {} + mp = self.node_map + + for k, v in j.items(): + # pass the whole measure + col, value = self.process(k, j, self.node_map) + if col is not None: + node[col] = value + else: + if not k in ['systems','sensor_system']: + metadata[k] = v + + # make sure we actually have data to add + if len(node.keys())>0: + # check for id + ingest_id = node.get('ingest_id') + if ingest_id is None: + raise Exception('Missing ingest id') + + ingest_arr = ingest_id.split('-') + # source name could be set explicitly + # or in the ingest id + # or in the metadata + if node.get('source_name') is None: + if len(ingest_arr)>1: + node['source_name'] = ingest_arr[0] + elif self.source is not None: + node['source_name'] = self.source + else: + raise Exception('Could not find source name') + + # support ingest id that is just the source id + if node.get('source_id') is None: + if len(ingest_arr)>1: + # updated to handle uuid + node['source_id'] = '-'.join(ingest_arr[1:len(ingest_arr)]) + else: + node['source_id'] = ingest_arr[0] + + if node.get('matching_method') is None: + node['matching_method'] = self.matching_method + + # prevent adding the node more than once + # this does not save processing time of course + if ingest_id not in self.node_ids: + node["metadata"] = orjson.dumps(metadata).decode() + self.node_ids.append(ingest_id) + self.nodes.append(node) + # now look for systems + if "sensor_system" in j.keys(): + self.add_systems(j.get('sensor_system'), node.get('ingest_id'), node.get('fetchlogs_id')) + elif "systems" in j.keys(): + self.add_systems(j.get("systems"), node.get('ingest_id'), node.get('fetchlogs_id')) + else: + # no systems + logger.debug(j.keys()) + else: + logger.warning('nothing mapped to node') + + + def add_measurement(self, m): + # create a row with + # ingest_id,datetime,value,lon,lat + # where ingest id will be what links to the sensor + meas = {} + lat = None + lon = None + + # csv method + if isinstance(m, list): + if len(m) < 3: + logger.warning(f'Not enough data in list value: {m}') + return + + fetchlogs_id = self.fetchlogs_id + ingest_id = m[0] + value = m[1] + # using the same key/data format as below + datetime = to_timestamp('dt', {"dt": m[2]}) + if len(m) == 5: + lat = m[3] + lon = m[4] + + elif isinstance(m, dict): + for k, v in m.items(): + # pass the whole measure + col, value = self.process(k, m, self.measurement_map) + if col is not None: + meas[col] = value + + ingest_id = meas.get('ingest_id') + datetime = meas.get('datetime') + value = meas.get('value') + lon = meas.get('lon', None) + lat = meas.get('lat', None) + fetchlogs_id = m.get('fetchlogs_id', self.fetchlogs_id) + + # parse the ingest id here + ingest_arr = ingest_id.split('-') + if len(ingest_arr) < 3: + logger.warning(f'Not enough information in ingest-id: `{ingest_id}`') + return + + source_name = 
ingest_arr[0] + source_id = '-'.join(ingest_arr[1:len(ingest_arr)-1]) + measurand = ingest_arr[-1] + + if not None in [ingest_id, datetime, source_name, source_id, measurand]: + self.measurements.append([ingest_id, source_name, source_id, measurand, value, datetime, lon, lat, fetchlogs_id]) + + + + def refresh_cached_tables(self): + """ + Refresh the cached tables that we use for most production endpoints. + Right now this is just for testing purposes + """ + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + logger.debug("Refreshing the cached tables") + cursor.execute("REFRESH MATERIALIZED VIEW locations_view_cached;") + cursor.execute("REFRESH MATERIALIZED VIEW locations_manufacturers_cached;") + cursor.execute("REFRESH MATERIALIZED VIEW locations_latest_measurements_cached;") + cursor.execute("REFRESH MATERIALIZED VIEW providers_view_cached;") + cursor.execute("REFRESH MATERIALIZED VIEW countries_view_cached;") + cursor.execute("REFRESH MATERIALIZED VIEW parameters_view_cached;") + + + + def process_hourly_data(self,n: int = 1000): + """ + Process any pending hourly data rollups. + Right now this is just for testing purposes + """ + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute("SELECT datetime, tz_offset FROM fetch_hourly_data_jobs(%s)", (n,)) + rows = cursor.fetchall() + for row in rows: + cursor.execute("SELECT update_hourly_data(%s, %s)", row) + connection.commit() + + + def process_daily_data(self,n: int = 500): + """ + Process any pending daily data rollups. + Right now this is just for testing purposes + """ + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute("SELECT datetime, tz_offset FROM fetch_daily_data_jobs(%s)", (n,)) + rows = cursor.fetchall() + for row in rows: + cursor.execute("SELECT update_daily_data(%s, %s)", row) + connection.commit() + + + def process_annual_data(self,n: int = 25): + """ + Process any pending annual data rollups. 
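+        Pulls pending jobs from fetch_annual_data_jobs(n) and runs
+        update_annual_data() for each returned (datetime, tz_offset) pair.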
+ Right now this is just for testing purposes + """ + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute("SELECT datetime, tz_offset FROM fetch_annual_data_jobs(%s)", (n,)) + rows = cursor.fetchall() + for row in rows: + cursor.execute("SELECT update_annual_data(%s, %s)", row) + connection.commit() + + + def get_metadata(self): + hasnew = False + for obj in self.page: + key = obj["Key"] + id = obj["id"] + last_modified = obj["LastModified"] + try: + logger.debug(f"Loading station file: {id}:{key}") + self.get_station(key, id) + self.keys.append( + { + "key": key, + "last_modified": last_modified, + "fetchlogs_id": id + } + ) + hasnew = True + except Exception as e: + # catch and continue to next page + logger.error( + f"Could not process file: {id}: {key}: {e}" + ) + + if hasnew: + logger.debug(f"get_metadata:hasnew - {self.keys}") + self.load_data() + +def create_staging_table(cursor): + # table and batch are used primarily for testing + cursor.execute(get_query( + "etl_staging_v2.sql", + table="TEMP TABLE" if settings.USE_TEMP_TABLES else 'TABLE' + )) + +def write_csv(cursor, data, table, columns): + fields = ",".join(columns) + sio = StringIO() + writer = csv.DictWriter(sio, columns) + writer.writerows(data) + sio.seek(0) + cursor.copy_expert( + f""" + copy {table} ({fields}) from stdin with csv; + """, + sio, + ) + logger.debug(f"table: {table}; rowcount: {cursor.rowcount}") + + + + +def load_metadata_bucketscan(count=100): + paginator = s3c.get_paginator("list_objects_v2") + for page in paginator.paginate( + Bucket=FETCH_BUCKET, + Prefix="lcs-etl-pipeline/stations", + PaginationConfig={"PageSize": count}, + ): + try: + contents = page["Contents"] + data = LCSData(contents) + data.get_metadata() + except KeyError: + break + + +def load_metadata_db(limit=250, ascending: bool = False): + order = 'ASC' if ascending else 'DESC' + pattern = 'lcs-etl-pipeline/stations/' + rows = load_fetchlogs(pattern, limit, ascending) + contents = [] + for row in rows: + logger.debug(row) + contents.append( + { + "Key": unquote_plus(row[1]), + "LastModified": row[2], + "id": row[0], + } + ) + if len(contents) > 0: + load_metadata(contents) + # data = LCSData(contents) + # data.get_metadata() + return len(rows) + + +def load_metadata_batch(batch: str): + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute( + """ + SELECT key + , last_modified + , fetchlogs_id + FROM fetchlogs + WHERE batch_uuid = %s + """, + (batch,), + ) + rows = cursor.fetchall() + rowcount = cursor.rowcount + contents = [] + for row in rows: + contents.append( + { + "Key": unquote_plus(row[0]), + "LastModified": row[1], + "id": row[2], + } + ) + for notice in connection.notices: + logger.debug(notice) + if len(contents) > 0: + load_metadata(contents) + # data = LCSData(contents) + # data.get_metadata() + return rowcount + + +def load_metadata(keys): + logger.debug(f'Load metadata: {len(keys)}') + data = LCSData(keys) + try: + data.get_metadata() + except Exception as e: + ids = ','.join([str(k['id']) for k in keys]) + logger.error(f'load error: {e} ids: {ids}') + raise + + +def get_measurements(key, fetchlogsId): + start = time() + content = select_object(key) + fetch_time = time() - start + + ret = [] + start = time() + for row in csv.reader(content.split("\n")): + if len(row) not in [3, 5]: + continue + if len(row) 
== 5: + try: + lon = float(row[3]) + lat = float(row[4]) + if not ( + lon is None + or lat is None + or lat == "" + or lon == "" + or lon == 0 + or lat == 0 + or lon < -180 + or lon > 180 + or lat < -90 + or lat > 90 + ): + row[3] = lon + row[4] = lat + else: + row[3] = None + row[4] = None + except Exception: + row[3] = None + row[4] = None + else: + row.insert(3, None) + row.insert(4, None) + if row[0] == "" or row[0] is None: + continue + dt = row[2] + + try: + if dt.isnumeric(): + if len(dt) == 13: + dt = datetime.fromtimestamp(int(dt)/1000.0, timezone.utc) + else: + dt = datetime.fromtimestamp(int(dt), timezone.utc) + row[2] = dt.isoformat() + except Exception: + try: + dt = dateparser.parse(dt).replace(tzinfo=timezone.utc) + except Exception: + logger.warning(f"Exception in parsing date for {dt} {Exception}") + + #row[2] = dt.isoformat() + # addd the log id for tracing purposes + row.insert(5, fetchlogsId) + ret.append(row) + logger.info("get_measurements:csv: %s; size: %s; rows: %s; fetching: %0.4f; reading: %0.4f", key, len(content)/1000, len(ret), fetch_time, time() - start) + return ret + + +def submit_file_error(key, e): + """Update the log to reflect the error and prevent a retry""" + logger.error(f"{key}: {e}") + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute( + """ + UPDATE fetchlogs + SET completed_datetime = clock_timestamp() + , last_message = %s + WHERE key = %s + """, + (f"ERROR: {e}", key), + ) + + +def to_tsv(row): + tsv = "\t".join(map(clean_csv_value, row)) + "\n" + return tsv + return "" + + +def load_measurements_file(fetchlogs_id: int): + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute( + """ + SELECT fetchlogs_id + , key + FROM fetchlogs + WHERE fetchlogs_id = %s + LIMIT 1 + ; + """, + (fetchlogs_id,), + ) + rows = cursor.fetchall() + load_measurements(rows) + + +def load_measurements_batch(batch: str): + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute( + """ + SELECT fetchlogs_id + , key + FROM fetchlogs + WHERE batch_uuid = %s + """, + (batch,), + ) + rows = cursor.fetchall() + load_measurements(rows) + + +def load_measurements_db( + limit=250, + ascending: bool = False, + pattern = '^lcs-etl-pipeline/measures/.*\\.(csv|json)' + ): + rows = load_fetchlogs(pattern, limit, ascending) + load_measurements(rows) + return len(rows) + + +# Keep seperate from above so we can test rows not from the database +def load_measurements(rows): + logger.debug(f"loading {len(rows)} measurements") + start_time = time() + # get a client object to hold all the data + client = IngestClient() + # load all the keys + client.load_keys(rows) + # and finally we can dump it all into the db + client.dump() + # write to the log + logger.info("load_measurements:get: %s keys; %s measurements; %s locations; %0.4f seconds", + len(client.keys), len(client.measurements), len(client.nodes), time() - start_time) diff --git a/ingest/lcs_ingest_full.sql b/ingest/lcs_ingest_full.sql index 612cc36..a9523fd 100644 --- a/ingest/lcs_ingest_full.sql +++ b/ingest/lcs_ingest_full.sql @@ -1,4 +1,4 @@ --- Get sensor systems +-- lcs_ingest_full DO $$ DECLARE __process_start timestamptz := clock_timestamp(); @@ -29,12 +29,32 @@ FROM ms_sensors WHERE ms_sensors.ingest_id IS 
NULL OR ingest_sensor_systems_id IS NULL; + +UPDATE ms_sensors +SET units = 'µg/m³' +WHERE units IN ('µg/m��','��g/m³', 'ug/m3'); + +-- first thing we want to do is to get the source +-- and the source_id from the ingest id +-- adding the station forces our method to treat the station as the parameter +-- the first section as the source name and then the rest as the source id +-- this is required for ingest_ids that use `-` in the source_id +-- e.g. something-blah-blah-blah-pm10 +-- where the sensor node ingest id would be +-- something-blah-blah-blah +-- and blah could be read as a parameter value +UPDATE ms_sensornodes +SET source_id = CASE + WHEN source_name ~* 'purpleair|habitatmap' THEN ingest_id + ELSE split_ingest_id(ingest_id||'-station', 2) -- station is a placeholder + END; + + -- match the sensor nodes to get the sensor_nodes_id UPDATE ms_sensornodes SET sensor_nodes_id = sensor_nodes.sensor_nodes_id FROM sensor_nodes WHERE sensor_nodes.source_name = ms_sensornodes.source_name -AND sensor_nodes.source_id = ms_sensornodes.ingest_id; +AND sensor_nodes.source_id = ms_sensornodes.source_id; -- And now we insert those into the sensor nodes table -- we are grouping to deal with any duplicates that currently exist @@ -46,21 +66,30 @@ INSERT INTO sensor_nodes ( , geom , metadata , source_id +, timezones_id +, providers_id +, countries_id ) SELECT site_name , source_name , ismobile , geom , metadata -, ingest_id +, source_id +, get_timezones_id(geom) +, get_providers_id(source_name) +, get_countries_id(geom) FROM ms_sensornodes -GROUP BY site_name, source_name, ismobile, geom, metadata, ingest_id +GROUP BY 1,2,3,4,5,6,7,8 ON CONFLICT (source_name, source_id) DO UPDATE SET - site_name=coalesce(EXCLUDED.site_name,sensor_nodes.site_name), - ismobile=coalesce(EXCLUDED.ismobile,sensor_nodes.ismobile), - geom=coalesce(EXCLUDED.geom,sensor_nodes.geom), - metadata=COALESCE(sensor_nodes.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}') + site_name=coalesce(EXCLUDED.site_name,sensor_nodes.site_name) + , ismobile=coalesce(EXCLUDED.ismobile,sensor_nodes.ismobile) + , geom=coalesce(EXCLUDED.geom,sensor_nodes.geom) + , metadata=COALESCE(sensor_nodes.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}') + , timezones_id = COALESCE(EXCLUDED.timezones_id, sensor_nodes.timezones_id) + , providers_id = COALESCE(EXCLUDED.providers_id, sensor_nodes.providers_id) + , modified_on = now() RETURNING 1) SELECT COUNT(1) INTO __inserted_nodes FROM inserts; @@ -75,7 +104,7 @@ SET sensor_nodes_id = sensor_nodes.sensor_nodes_id FROM sensor_nodes WHERE ms_sensornodes.sensor_nodes_id is null AND sensor_nodes.source_name = ms_sensornodes.source_name -AND sensor_nodes.source_id = ms_sensornodes.ingest_id; +AND sensor_nodes.source_id = ms_sensornodes.source_id; -- log anything we were not able to get an id for WITH r AS ( @@ -124,7 +153,8 @@ FROM ms_sensorsystems WHERE sensor_nodes_id IS NOT NULL GROUP BY sensor_nodes_id, ingest_id, metadata ON CONFLICT (sensor_nodes_id, source_id) DO UPDATE SET - metadata=COALESCE(sensor_systems.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}'); + metadata=COALESCE(sensor_systems.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}') + , modified_on = now(); ---------------------------- -- lcs_ingest_sensors.sql -- @@ -183,27 +213,7 @@ from measurands WHERE ms_sensors.measurand=measurands.measurand and ms_sensors.units=measurands.units; --- Removed the following because it has the ids hard coded in --- if we want to continue to filter these out we should do it at the fetcher
-------------------------------------------------------------------------------------------------------------- --- UPDATE ms_sensors -- --- SET measurands_id = 10 -- --- WHERE ms_sensors.measurand='ozone' -- --- AND ms_sensors.units='ppm'; -- --- -- --- UPDATE ms_sensors SET measurands_id = 126 WHERE measurands_id is null and ms_sensors.measurand='um010'; -- --- UPDATE ms_sensors SET measurands_id = 130 WHERE measurands_id is null and ms_sensors.measurand='um025'; -- --- UPDATE ms_sensors SET measurands_id = 135 WHERE measurands_id is null and ms_sensors.measurand='um100'; -- --- UPDATE ms_sensors SET measurands_id = 19 WHERE measurands_id is null and ms_sensors.measurand='pm1'; -- --- UPDATE ms_sensors SET measurands_id = 2 WHERE measurands_id is null and ms_sensors.measurand='pm25'; -- --- UPDATE ms_sensors SET measurands_id = 1 WHERE measurands_id is null and ms_sensors.measurand='pm10'; -- --- -- --- DELETE -- --- FROM ms_sensors -- --- WHERE ingest_id ~* 'purple' -- --- AND measurands_id is null -- --- AND measurand in ('um003','um050','um005'); -- -------------------------------------------------------------------------------------------------------------- + WITH r AS ( INSERT INTO rejects (t, tbl,r,fetchlogs_id) SELECT diff --git a/ingest/lcs_ingest_nodes.sql b/ingest/lcs_ingest_nodes.sql deleted file mode 100644 index f077150..0000000 --- a/ingest/lcs_ingest_nodes.sql +++ /dev/null @@ -1,28 +0,0 @@ -DELETE FROM ms_sensornodes WHERE ms_sensornodes.ingest_id IS NULL; -DELETE FROM ms_sensorsystems WHERE ms_sensorsystems.ingest_id is null or ingest_sensor_nodes_id IS NULL; -DELETE FROM ms_sensors WHERE ms_sensors.ingest_id is null OR ingest_sensor_systems_id IS NULL; - -SELECT notify('After Deletes'); - -UPDATE ms_sensornodes -SET sensor_nodes_id = sensor_nodes.sensor_nodes_id -FROM sensor_nodes -WHERE -sensor_nodes.source_name = ms_sensornodes.source_name -AND -sensor_nodes.source_id = ms_sensornodes.ingest_id; - - -INSERT INTO sensor_nodes (site_name, source_name, ismobile, geom, metadata, source_id) -SELECT site_name, source_name, ismobile, geom, metadata, ingest_id FROM -ms_sensornodes -ON CONFLICT (source_name, source_id) DO -UPDATE - SET - site_name=coalesce(EXCLUDED.site_name,sensor_nodes.site_name), - ismobile=coalesce(EXCLUDED.ismobile,sensor_nodes.ismobile), - geom=coalesce(EXCLUDED.geom,sensor_nodes.geom), - metadata=sensor_nodes.metadata || EXCLUDED.metadata -; - -SELECT notify('After nodes'); diff --git a/ingest/lcs_ingest_sensors.sql b/ingest/lcs_ingest_sensors.sql deleted file mode 100644 index f0750e2..0000000 --- a/ingest/lcs_ingest_sensors.sql +++ /dev/null @@ -1,114 +0,0 @@ - -UPDATE ms_sensorsystems -SET sensor_systems_id = sensor_systems.sensor_systems_id -FROM sensor_systems -WHERE ms_sensorsystems.sensor_systems_id IS NULL -AND -ms_sensorsystems.sensor_nodes_id=sensor_systems.sensor_nodes_id -AND -ms_sensorsystems.ingest_id=sensor_systems.source_id -; - -INSERT INTO rejects (t, tbl,r) SELECT - now(), - 'ms_sensorsystems', - to_jsonb(ms_sensorsystems) -FROM ms_sensorsystems WHERE sensor_systems_id IS NULL; - -UPDATE ms_sensors -SET sensor_systems_id = ms_sensorsystems.sensor_systems_id -FROM ms_sensorsystems WHERE -ms_sensors.ingest_sensor_systems_id = ms_sensorsystems.ingest_id; - -INSERT INTO rejects (t, tbl,r) SELECT - now(), - 'ms_sensors', - to_jsonb(ms_sensors) -FROM ms_sensors WHERE sensor_systems_id IS NULL; - - -UPDATE ms_sensors -SET sensors_id = sensors.sensors_id -FROM sensors -WHERE -sensors.sensor_systems_id=ms_sensors.sensor_systems_id -AND 
-sensors.source_id = ms_sensors.ingest_id; - -SELECT count(*) from measurands; - -/* -INSERT INTO measurands (measurand, units) -SELECT DISTINCT measurand, units FROM ms_sensors -ON CONFLICT DO NOTHING; - -SELECT count(*) from measurands; -*/ - -UPDATE ms_sensors -SET measurands_id = measurands.measurands_id -from measurands -WHERE ms_sensors.measurand=measurands.measurand -and ms_sensors.units=measurands.units; - -UPDATE ms_sensors -SET measurands_id = 10 -WHERE -ms_sensors.measurand='ozone' -AND -ms_sensors.units='ppm'; - -UPDATE ms_sensors SET measurands_id = 126 WHERE measurands_id is null and ms_sensors.measurand='um010'; -UPDATE ms_sensors SET measurands_id = 130 WHERE measurands_id is null and ms_sensors.measurand='um025'; -UPDATE ms_sensors SET measurands_id = 135 WHERE measurands_id is null and ms_sensors.measurand='um100'; -UPDATE ms_sensors SET measurands_id = 19 WHERE measurands_id is null and ms_sensors.measurand='pm1'; -UPDATE ms_sensors SET measurands_id = 2 WHERE measurands_id is null and ms_sensors.measurand='pm25'; -UPDATE ms_sensors SET measurands_id = 1 WHERE measurands_id is null and ms_sensors.measurand='pm10'; - -DELETE FROM ms_sensors WHERE ingest_id ~* 'purple' AND measurands_id is null AND measurand in ('um003','um050','um005'); - -INSERT INTO rejects (t, tbl,r) SELECT - now(), - 'ms_sensors no measurand', - to_jsonb(ms_sensors) -FROM ms_sensors WHERE measurands_id IS NULL; - -INSERT INTO sensors ( - source_id -, sensor_systems_id -, measurands_id -, metadata) -SELECT ingest_id -, sensor_systems_id -, measurands_id -, metadata -FROM ms_sensors -WHERE measurands_id is not null -AND sensor_systems_id is not null -GROUP BY ingest_id -, sensor_systems_id -, measurands_id -, metadata -ON CONFLICT (sensor_systems_id, measurands_id, source_id) DO -UPDATE SET - metadata=sensors.metadata || EXCLUDED.metadata -; - - -SELECT notify('After sensors'); - - -UPDATE ms_sensors -SET sensors_id = sensors.sensors_id -FROM sensors -WHERE -sensors.sensor_systems_id=ms_sensors.sensor_systems_id -AND -sensors.source_id = ms_sensors.ingest_id; - - -INSERT INTO rejects (tbl,r) -SELECT - 'ms_sensors', - to_jsonb(ms_sensors) -FROM ms_sensors WHERE sensors_id IS NULL; diff --git a/ingest/lcs_ingest_systems.sql b/ingest/lcs_ingest_systems.sql deleted file mode 100644 index 4d0b592..0000000 --- a/ingest/lcs_ingest_systems.sql +++ /dev/null @@ -1,53 +0,0 @@ - --- fill in any new sensor_nodes_id -UPDATE ms_sensornodes -SET sensor_nodes_id = sensor_nodes.sensor_nodes_id -FROM sensor_nodes -WHERE -ms_sensornodes.sensor_nodes_id is null -AND -sensor_nodes.source_name = ms_sensornodes.source_name -AND -sensor_nodes.source_id = ms_sensornodes.ingest_id; - --- log anything we were not able to get an id for -INSERT INTO rejects (t, tbl,r) SELECT - now(), - 'ms_sensornodes', - to_jsonb(ms_sensornodes) -FROM ms_sensornodes WHERE sensor_nodes_id IS NULL; - - -UPDATE ms_sensorsystems -SET sensor_nodes_id = ms_sensornodes.sensor_nodes_id -FROM ms_sensornodes WHERE -ms_sensorsystems.ingest_sensor_nodes_id = ms_sensornodes.ingest_id; - -UPDATE ms_sensorsystems -SET sensor_systems_id = sensor_systems.sensor_systems_id -FROM sensor_systems -WHERE -sensor_systems.sensor_nodes_id = ms_sensorsystems.sensor_nodes_id -AND -sensor_systems.source_id = ms_sensorsystems.ingest_id; - --- log anything we were not able to get an id for -INSERT INTO rejects (t, tbl,r) SELECT - now(), - 'ms_sensorsystems', - to_jsonb(ms_sensorsystems) -FROM ms_sensorsystems WHERE sensor_nodes_id IS NULL; - -SELECT notify('immediately before 
insert on systems'); - -INSERT INTO sensor_systems (sensor_nodes_id, source_id, metadata) -SELECT sensor_nodes_id, ingest_id, metadata -FROM ms_sensorsystems -WHERE sensor_nodes_id IS NOT NULL -ON CONFLICT (sensor_nodes_id, source_id) -DO -UPDATE SET - metadata=sensor_systems.metadata || EXCLUDED.metadata -; - -SELECT notify('After systems'); diff --git a/ingest/lcs_meas_ingest.sql b/ingest/lcs_meas_ingest.sql index 953791e..8ab06a7 100644 --- a/ingest/lcs_meas_ingest.sql +++ b/ingest/lcs_meas_ingest.sql @@ -1,10 +1,24 @@ --- Get sensor systems +-- lcs_meas_ingest DO $$ DECLARE __process_start timestamptz := clock_timestamp(); +__total_measurements int; __inserted_measurements int; -__rejected_measurements int; +__rejected_measurements int := 0; +__rejected_nodes int := 0; +__total_nodes int := 0; +__updated_nodes int := 0; +__inserted_nodes int := 0; __exported_days int; +__start_datetime timestamptz; +__end_datetime timestamptz; +__inserted_start_datetime timestamptz; +__inserted_end_datetime timestamptz; +__process_time_ms int; +__insert_time_ms int; +__cache_time_ms int; +__error_context text; +__ingest_method text := 'lcs'; BEGIN DELETE @@ -13,21 +27,139 @@ WHERE ingest_id IS NULL OR datetime is NULL OR value IS NULL; -DELETE -FROM meas -WHERE datetime < '2018-01-01'::timestamptz -OR datetime>now(); +--DELETE +--FROM meas +--WHERE datetime < '2018-01-01'::timestamptz +--OR datetime>now(); DELETE FROM rejects WHERE fetchlogs_id IN (SELECT fetchlogs_id FROM meas) AND tbl ~* '^meas'; + +SELECT COUNT(1) +, MIN(datetime) +, MAX(datetime) +INTO __total_measurements +, __start_datetime +, __end_datetime +FROM meas; + + +-- -- The ranking is to deal with the current possibility +-- -- that duplicate sensors with the same ingest/source id are created +-- -- this is a short term fix +-- -- a long term fix would not allow duplicate source_id's +-- WITH ranked_sensors AS ( +-- SELECT s.sensors_id +-- , s.source_id +-- , RANK() OVER (PARTITION BY s.source_id ORDER BY added_on ASC) as rnk +-- FROM sensors s +-- JOIN meas m ON (s.source_id = m.ingest_id) +-- WHERE s.is_active +-- ), active_sensors AS ( +-- SELECT source_id +-- , sensors_id +-- FROM ranked_sensors +-- WHERE rnk = 1) +-- UPDATE meas +-- SET sensors_id=s.sensors_id +-- FROM active_sensors s +-- WHERE s.source_id=ingest_id; + +-- The ranking is to deal with the current possibility +-- that duplicate sensors with the same ingest/source id are created + -- this is a short term fix + -- a long term fix would not allow duplicate source_id's +WITH staged_sensors AS ( + -- this first part signficantly speeds it up on slow machines + SELECT DISTINCT ingest_id + FROM meas +), ranked_sensors AS ( + SELECT s.sensors_id + , s.source_id + , RANK() OVER (PARTITION BY s.source_id ORDER BY added_on ASC) as rnk + FROM sensors s + JOIN staged_sensors m ON (s.source_id = m.ingest_id) +), active_sensors AS ( + SELECT source_id + , sensors_id + FROM ranked_sensors + WHERE rnk = 1) + UPDATE meas + SET sensors_id=s.sensors_id + FROM active_sensors s + WHERE s.source_id=ingest_id; + + +-- first the sensor nodes +WITH nodes AS ( +INSERT INTO sensor_nodes ( + source_name +, site_name +, source_id +, metadata) +SELECT split_ingest_id(ingest_id, 1) as source_name +, split_ingest_id(ingest_id, 2) as site_name +, split_ingest_id(ingest_id, 2) as source_id +, jsonb_build_object('fetchlogs_id', MIN(fetchlogs_id)) +FROM meas +WHERE sensors_id IS NULL +GROUP BY 1,2,3 +ON CONFLICT (source_name, source_id) DO UPDATE +SET source_id = EXCLUDED.source_id +, metadata = 
EXCLUDED.metadata||COALESCE(sensor_nodes.metadata, '{}'::jsonb) +RETURNING sensor_nodes_id, source_id) +INSERT INTO sensor_systems ( + sensor_nodes_id +, source_id) +SELECT sensor_nodes_id +, source_id +FROM nodes +ON CONFLICT DO NOTHING; + +-- now create a sensor for each +-- this method depends on us having a match for the parameter +WITH sen AS ( + SELECT ingest_id + , split_ingest_id(ingest_id, 1) as source_name + , split_ingest_id(ingest_id, 2) as source_id + , split_ingest_id(ingest_id, 3) as parameter + FROM meas + WHERE sensors_id IS NULL + GROUP BY 1,2,3,4 +), inserts AS ( +INSERT INTO sensors (sensor_systems_id, measurands_id, source_id) +SELECT sy.sensor_systems_id +, m.measurands_id +, ingest_id +FROM sen s +JOIN measurands_map_view m ON (s.parameter = m.key) +JOIN sensor_nodes n ON (s.source_name = n.source_name AND s.source_id = n.source_id) +JOIN sensor_systems sy ON (sy.sensor_nodes_id = n.sensor_nodes_id AND s.source_id = sy.source_id) +ON CONFLICT DO NOTHING +RETURNING sensor_systems_id) +SELECT COUNT(DISTINCT sensor_systems_id) INTO __inserted_nodes +FROM inserts; + +-- try again to find the sensors UPDATE meas SET sensors_id=s.sensors_id FROM sensors s -WHERE s.source_id=ingest_id; +WHERE s.source_id=ingest_id +AND meas.sensors_id IS NULL; + + +SELECT COUNT(DISTINCT sensors_id) +INTO __total_nodes +FROM meas; + +__process_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + +-- reject any missing. Most likely due to issues +-- with the measurand WITH r AS ( INSERT INTO rejects (t,tbl,r,fetchlogs_id) SELECT @@ -41,21 +173,10 @@ RETURNING 1) SELECT COUNT(1) INTO __rejected_measurements FROM r; +-- restart the clock to measure just inserts +__process_start := clock_timestamp(); -DELETE -FROM meas -WHERE sensors_id IS NULL; - --- --Some fake data to make it easier to test this section --- TRUNCATE meas; --- INSERT INTO meas (ingest_id, sensors_id, value, datetime) --- SELECT 'fake-ingest' --- , (SELECT sensors_id FROM sensors ORDER BY random() LIMIT 1) --- , -99 --- , generate_series(now() - '3day'::interval, current_date, '1hour'::interval); - - -WITH m AS ( +WITH inserts AS ( INSERT INTO measurements ( sensors_id, datetime, @@ -63,7 +184,7 @@ INSERT INTO measurements ( lon, lat ) SELECT - DISTINCT + --DISTINCT sensors_id, datetime, value, @@ -72,13 +193,194 @@ INSERT INTO measurements ( FROM meas WHERE sensors_id IS NOT NULL ON CONFLICT DO NOTHING -RETURNING 1) -SELECT COUNT(1) INTO __inserted_measurements -FROM m; +RETURNING sensors_id, datetime, value, lat, lon +), inserted as ( + INSERT INTO temp_inserted_measurements (sensors_id, datetime, value, lat, lon) + SELECT sensors_id + , datetime + , value + , lat + , lon + FROM inserts + RETURNING sensors_id, datetime +) +SELECT MIN(datetime) +, MAX(datetime) +, COUNT(1) +INTO __inserted_start_datetime +, __inserted_end_datetime +, __inserted_measurements +FROM inserted; + +__insert_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + +-- mark the fetchlogs as done +WITH inserted AS ( + SELECT m.fetchlogs_id + , COUNT(m.*) as n_records + , COUNT(t.*) as n_inserted + , MIN(m.datetime) as fr_datetime + , MAX(m.datetime) as lr_datetime + , MIN(t.datetime) as fi_datetime + , MAX(t.datetime) as li_datetime + FROM meas m + LEFT JOIN temp_inserted_measurements t ON (t.sensors_id = m.sensors_id AND t.datetime = m.datetime) + GROUP BY m.fetchlogs_id) +UPDATE fetchlogs +SET completed_datetime = CURRENT_TIMESTAMP +, inserted = COALESCE(n_inserted, 0) +, records = COALESCE(n_records, 
0) +, first_recorded_datetime = fr_datetime +, last_recorded_datetime = lr_datetime +, first_inserted_datetime = fi_datetime +, last_inserted_datetime = li_datetime +FROM inserted +WHERE inserted.fetchlogs_id = fetchlogs.fetchlogs_id; + +-- track the time required to update cache tables +__process_start := clock_timestamp(); + +-- -- Now we can use those temp_inserted_measurements to update the cache tables +-- INSERT INTO sensors_latest ( +-- sensors_id +-- , datetime +-- , value +-- , lat +-- , lon +-- ) +-- ---- identify the row that has the latest value +-- WITH numbered AS ( +-- SELECT sensors_id +-- , datetime +-- , value +-- , lat +-- , lon +-- , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) as rn +-- FROM temp_inserted_measurements +-- ), latest AS ( +-- ---- only insert those rows +-- SELECT sensors_id +-- , datetime +-- , value +-- , lat +-- , lon +-- FROM numbered +-- WHERE rn = 1 +-- ) +-- SELECT l.sensors_id +-- , l.datetime +-- , l.value +-- , l.lat +-- , l.lon +-- FROM latest l +-- LEFT JOIN sensors_latest sl ON (l.sensors_id = sl.sensors_id) +-- WHERE sl.sensors_id IS NULL +-- OR l.datetime > sl.datetime +-- ON CONFLICT (sensors_id) DO UPDATE +-- SET datetime = EXCLUDED.datetime +-- , value = EXCLUDED.value +-- , lat = EXCLUDED.lat +-- , lon = EXCLUDED.lon +-- , modified_on = now() +-- --, fetchlogs_id = EXCLUDED.fetchlogs_id +-- ; +-- update the exceedances +INSERT INTO sensor_exceedances (sensors_id, threshold_value, datetime_latest) + SELECT + m.sensors_id + , t.value + , MAX(datetime) + FROM temp_inserted_measurements m + JOIN sensors s ON (m.sensors_id = s.sensors_id) + JOIN thresholds t ON (s.measurands_id = t.measurands_id) + AND m.value > t.value + GROUP BY 1, 2 + ON CONFLICT (sensors_id, threshold_value) DO UPDATE SET + datetime_latest = GREATEST(sensor_exceedances.datetime_latest, EXCLUDED.datetime_latest) + , updated_on = now(); + --- Update the export queue/logs to export these records --- wrap it in a block just in case the database does not have this module installed --- we subtract the second because the data is assumed to be time ending +INSERT INTO sensors_rollup ( + sensors_id + , datetime_first + , datetime_last + , value_latest + , value_count + , value_avg + , value_min + , value_max + , geom_latest + ) +---- identify the row that has the latest value +WITH numbered AS ( + SELECT sensors_id + , datetime + , value + , lat + , lon + , sum(1) OVER (PARTITION BY sensors_id) as value_count + , min(datetime) OVER (PARTITION BY sensors_id) as datetime_min + , avg(value) OVER (PARTITION BY sensors_id) as value_avg + , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) as rn + FROM temp_inserted_measurements +), latest AS ( +---- only insert those rows + SELECT sensors_id + , datetime + , value + , value_count + , value_avg + , datetime_min + , lat + , lon + FROM numbered + WHERE rn = 1 +) +SELECT l.sensors_id +, l.datetime_min -- first +, l.datetime -- last +, l.value -- last value +, l.value_count +, l.value_avg +, l.value -- min +, l.value -- max +, public.pt3857(lon, lat) +FROM latest l +LEFT JOIN sensors_rollup sr ON (l.sensors_id = sr.sensors_id) +WHERE sr.sensors_id IS NULL +OR l.datetime > sr.datetime_last +OR l.datetime_min < sr.datetime_first +ON CONFLICT (sensors_id) DO UPDATE +SET datetime_last = GREATEST(sensors_rollup.datetime_last, EXCLUDED.datetime_last) +, value_latest = CASE WHEN EXCLUDED.datetime_last > sensors_rollup.datetime_last + THEN EXCLUDED.value_latest + ELSE sensors_rollup.value_latest + END +, 
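+-- as with value_latest above, geom_latest is only replaced when the incoming
+-- batch carries a more recent datetime_last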
geom_latest = CASE WHEN EXCLUDED.datetime_last > sensors_rollup.datetime_last + THEN EXCLUDED.geom_latest + ELSE sensors_rollup.geom_latest + END +, value_count = sensors_rollup.value_count + EXCLUDED.value_count +, value_min = LEAST(sensors_rollup.value_min, EXCLUDED.value_latest) +, value_max = GREATEST(sensors_rollup.value_max, EXCLUDED.value_latest) +, datetime_first = LEAST(sensors_rollup.datetime_first, EXCLUDED.datetime_first) +, modified_on = now() +--, fetchlogs_id = EXCLUDED.fetchlogs_id +; + + +-- Update the table that will help to track hourly rollups +INSERT INTO hourly_stats (datetime) + SELECT date_trunc('hour', datetime) + FROM temp_inserted_measurements + GROUP BY 1 +ON CONFLICT (datetime) DO UPDATE +SET modified_on = now(); + + +--Update the export queue/logs to export these records +--wrap it in a block just in case the database does not have this module installed +--we subtract the second because the data is assumed to be time ending WITH e AS ( INSERT INTO open_data_export_logs (sensor_nodes_id, day, records, measurands, modified_on) SELECT sn.sensor_nodes_id @@ -86,7 +388,7 @@ SELECT sn.sensor_nodes_id , COUNT(1) , COUNT(DISTINCT p.measurands_id) , MAX(now()) -FROM meas m +FROM temp_inserted_measurements m -- meas m JOIN sensors s ON (m.sensors_id = s.sensors_id) JOIN measurands p ON (s.measurands_id = p.measurands_id) JOIN sensor_systems ss ON (s.sensor_systems_id = ss.sensor_systems_id) @@ -101,14 +403,102 @@ RETURNING 1) SELECT COUNT(1) INTO __exported_days FROM e; -RAISE NOTICE 'inserted-measurements: %, rejected-measurements: %, exported-sensor-days: %, process-time-ms: %, source: lcs' + +__cache_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + +INSERT INTO ingest_stats ( + ingest_method + -- total + , total_measurements_processed + , total_measurements_inserted + , total_measurements_rejected + , total_nodes_processed + , total_nodes_inserted + , total_nodes_updated + , total_nodes_rejected + -- total times + , total_process_time_ms + , total_insert_time_ms + , total_cache_time_ms + -- latest + , latest_measurements_processed + , latest_measurements_inserted + , latest_measurements_rejected + , latest_nodes_processed + , latest_nodes_inserted + , latest_nodes_updated + , latest_nodes_rejected + -- times + , latest_process_time_ms + , latest_insert_time_ms + , latest_cache_time_ms + ) VALUES ( + -- totals + __ingest_method + , __total_measurements + , __inserted_measurements + , __rejected_measurements + , __total_nodes + , __inserted_nodes + , __updated_nodes + , __rejected_nodes + -- times + , __process_time_ms + , __insert_time_ms + , __cache_time_ms + -- latest + , __total_measurements + , __inserted_measurements + , __rejected_measurements + , __total_nodes + , __inserted_nodes + , __updated_nodes + , __rejected_nodes + -- times + , __process_time_ms + , __insert_time_ms + , __cache_time_ms +) ON CONFLICT (ingest_method) DO UPDATE SET + -- totals + total_measurements_processed = ingest_stats.total_measurements_processed + EXCLUDED.total_measurements_processed + , total_measurements_inserted = ingest_stats.total_measurements_inserted + EXCLUDED.total_measurements_inserted + , total_measurements_rejected = ingest_stats.total_measurements_rejected + EXCLUDED.total_measurements_rejected + , total_nodes_processed = ingest_stats.total_nodes_processed + EXCLUDED.total_nodes_processed + , total_nodes_inserted = ingest_stats.total_nodes_inserted + EXCLUDED.total_nodes_inserted + , total_nodes_updated = ingest_stats.total_nodes_updated + 
EXCLUDED.total_nodes_updated + , total_nodes_rejected = ingest_stats.total_nodes_rejected + EXCLUDED.total_nodes_rejected + , total_process_time_ms = ingest_stats.total_process_time_ms + EXCLUDED.total_process_time_ms + , total_insert_time_ms = ingest_stats.total_insert_time_ms + EXCLUDED.total_insert_time_ms + , total_cache_time_ms = ingest_stats.total_cache_time_ms + EXCLUDED.total_cache_time_ms + -- latest + , latest_measurements_processed = EXCLUDED.latest_measurements_processed + , latest_measurements_inserted = EXCLUDED.latest_measurements_inserted + , latest_measurements_rejected = EXCLUDED.latest_measurements_rejected + , latest_nodes_processed = EXCLUDED.latest_nodes_processed + , latest_nodes_inserted = EXCLUDED.latest_nodes_inserted + , latest_nodes_updated = EXCLUDED.latest_nodes_updated + , latest_nodes_rejected = EXCLUDED.latest_nodes_rejected + -- times + , latest_process_time_ms = EXCLUDED.latest_process_time_ms + , latest_insert_time_ms = EXCLUDED.latest_insert_time_ms + , latest_cache_time_ms = EXCLUDED.latest_cache_time_ms + , ingest_count = ingest_stats.ingest_count + 1 + , ingested_on = EXCLUDED.ingested_on; + + +RAISE NOTICE 'inserted-measurements: %, inserted-from: %, inserted-to: %, rejected-measurements: %, exported-sensor-days: %, process-time-ms: %, insert-time-ms: %, cache-time-ms: %, source: lcs' , __inserted_measurements + , __inserted_start_datetime + , __inserted_end_datetime , __rejected_measurements , __exported_days - , 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + , __process_time_ms + , __insert_time_ms + , __cache_time_ms; + EXCEPTION WHEN OTHERS THEN - RAISE NOTICE 'Failed to export to logs: %', SQLERRM - USING HINT = 'Make sure that the open data module is installed'; + GET STACKED DIAGNOSTICS __error_context = PG_EXCEPTION_CONTEXT; + RAISE NOTICE 'Failed to ingest measurements: %, %', SQLERRM, __error_context; END $$; diff --git a/ingest/lcs_meas_staging.sql b/ingest/lcs_meas_staging.sql deleted file mode 100644 index 3f8caf8..0000000 --- a/ingest/lcs_meas_staging.sql +++ /dev/null @@ -1,10 +0,0 @@ -CREATE TEMP TABLE meas ( - ingest_id text, - sensors_id int, - value float, - datetime timestamptz, - lon float, - lat float, - fetchlogs_id int -); -CREATE TEMP TABLE keys (key text, last_modified timestamptz); diff --git a/ingest/lcs_staging.sql b/ingest/lcs_staging.sql index dcf3067..7b2e7b8 100644 --- a/ingest/lcs_staging.sql +++ b/ingest/lcs_staging.sql @@ -1,15 +1,24 @@ -CREATE TEMP TABLE IF NOT EXISTS ms_sensornodes ( +-- DROP TABLE IF EXISTS +-- ms_sensornodes +-- , ms_sensorsystems +-- , ms_sensors +-- , meas +-- , keys +-- , temp_inserted_measurements; + +CREATE {table} IF NOT EXISTS ms_sensornodes ( sensor_nodes_id int, ingest_id text, site_name text, source_name text, + source_id text, ismobile boolean, geom geometry, metadata jsonb, fetchlogs_id int ); -CREATE TEMP TABLE IF NOT EXISTS ms_sensorsystems ( +CREATE {table} IF NOT EXISTS ms_sensorsystems ( sensor_systems_id int, ingest_id text, ingest_sensor_nodes_id text, @@ -18,8 +27,7 @@ CREATE TEMP TABLE IF NOT EXISTS ms_sensorsystems ( fetchlogs_id int ); - -CREATE TEMP TABLE IF NOT EXISTS ms_sensors ( +CREATE {table} IF NOT EXISTS ms_sensors ( ingest_id text, sensors_id int, sensor_systems_id int, @@ -31,4 +39,31 @@ CREATE TEMP TABLE IF NOT EXISTS ms_sensors ( fetchlogs_id int ); -CREATE TEMP TABLE keys (fetchlogs_id int, key text, last_modified timestamptz); +CREATE {table} IF NOT EXISTS meas ( + ingest_id text, + sensors_id int, + value float, + datetime timestamptz, 
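+    -- lon/lat are optional per-measurement coordinates and may be null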
+ lon float, + lat float, + fetchlogs_id int +); + +CREATE {table} IF NOT EXISTS keys ( + fetchlogs_id int + , key text + , last_modified timestamptz + ); + +-- This table will hold measurements that have +-- actually been inserted into the measurements table +-- this is to deal with the overlap that we see in the +-- incoming files +CREATE {table} IF NOT EXISTS temp_inserted_measurements ( + sensors_id int + , datetime timestamptz + , value double precision + , lat double precision + , lon double precision + , fetchlogs_id int +); diff --git a/ingest/settings.py b/ingest/settings.py index ced8420..f7fa380 100644 --- a/ingest/settings.py +++ b/ingest/settings.py @@ -1,5 +1,12 @@ from typing import Union -from pydantic import BaseSettings, validator + +from pydantic_settings import ( + BaseSettings, + SettingsConfigDict, + ) + +from pydantic import computed_field + from pathlib import Path from os import environ @@ -12,9 +19,6 @@ class Settings(BaseSettings): DATABASE_DB: str DATABASE_HOST: str DATABASE_PORT: int - DATABASE_READ_URL: Union[str, None] - DATABASE_WRITE_URL: Union[str, None] - FASTAPI_URL: str DRYRUN: bool = False FETCH_BUCKET: str ETL_BUCKET: str @@ -24,21 +28,20 @@ class Settings(BaseSettings): METADATA_LIMIT: int = 10 REALTIME_LIMIT: int = 10 LOG_LEVEL: str = 'INFO' + USE_TEMP_TABLES: bool = True + PAUSE_INGESTING: bool = False - @validator('DATABASE_READ_URL', allow_reuse=True) - def get_read_url(cls, v, values): - return v or f"postgresql://{values['DATABASE_READ_USER']}:{values['DATABASE_READ_PASSWORD']}@{values['DATABASE_HOST']}:{values['DATABASE_PORT']}/{values['DATABASE_DB']}" + @computed_field + def DATABASE_READ_URL(self) -> str: + return f"postgresql://{self.DATABASE_READ_USER}:{self.DATABASE_READ_PASSWORD}@{self.DATABASE_HOST}:{self.DATABASE_PORT}/{self.DATABASE_DB}" - @validator('DATABASE_WRITE_URL', allow_reuse=True) - def get_write_url(cls, v, values): - return v or f"postgresql://{values['DATABASE_WRITE_USER']}:{values['DATABASE_WRITE_PASSWORD']}@{values['DATABASE_HOST']}:{values['DATABASE_PORT']}/{values['DATABASE_DB']}" + @computed_field + def DATABASE_WRITE_URL(self) -> str: + return f"postgresql://{self.DATABASE_WRITE_USER}:{self.DATABASE_WRITE_PASSWORD}@{self.DATABASE_HOST}:{self.DATABASE_PORT}/{self.DATABASE_DB}" - class Config: - parent = Path(__file__).resolve().parent.parent - if 'DOTENV' in environ: - env_file = Path.joinpath(parent, environ['DOTENV']) - else: - env_file = Path.joinpath(parent, ".env") + model_config = SettingsConfigDict( + extra="ignore", env_file=f"{environ.get('DOTENV', '.env')}", env_file_encoding="utf-8" + ) settings = Settings() diff --git a/ingest/temp_locations_dump.sql b/ingest/temp_locations_dump.sql new file mode 100644 index 0000000..5cc645a --- /dev/null +++ b/ingest/temp_locations_dump.sql @@ -0,0 +1,74 @@ +DROP TABLE IF EXISTS + staging_sensornodes +, staging_sensorsystems +, staging_sensors +, staging_flags +, staging_keys; + +CREATE {table} IF NOT EXISTS staging_keys ( + fetchlogs_id int, + key text, + last_modified timestamptz +); + +CREATE {table} IF NOT EXISTS staging_sensornodes ( + sensor_nodes_id int, + is_new boolean DEFAULT true, + is_moved boolean DEFAULT false, + ingest_id text NOT NULL UNIQUE, + source_name text NOT NULL, + source_id text NOT NULL, + matching_method text NOT NULL DEFAULT 'ingest-id', + site_name text, + ismobile boolean, + geom geometry, + timezones_id int, + countries_id int, + metadata jsonb, + fetchlogs_id int, + UNIQUE (source_name, source_id) +); + +CREATE {table} IF NOT EXISTS 
staging_sensorsystems ( + sensor_systems_id int, + is_new boolean DEFAULT true, + ingest_id text NOT NULL UNIQUE, + instrument_ingest_id text, + ingest_sensor_nodes_id text, + sensor_nodes_id int, + metadata jsonb, + fetchlogs_id int +); + +CREATE {table} IF NOT EXISTS staging_sensors ( + ingest_id text, + is_new boolean DEFAULT true, + -- source_name text NOT NULL, + -- source_id text NOT NULL, + sensors_id int, + sensor_systems_id int, + ingest_sensor_systems_id text, + status text, + measurand text, + units text, + measurands_id int, + averaging_interval_seconds int, + logging_interval_seconds int, + metadata jsonb, + fetchlogs_id int +); + +CREATE {table} IF NOT EXISTS staging_flags ( + ingest_id text NOT NULL, + sensor_ingest_id text NOT NULL, + flags_id int, + sensor_nodes_id int, + sensors_id int, + flag_types_id int, + datetime_from timestamptz, + datetime_to timestamptz, + period tstzrange, + note text, + metadata jsonb, + fetchlogs_id int +); diff --git a/ingest/temp_measurements_dump.sql b/ingest/temp_measurements_dump.sql new file mode 100644 index 0000000..4840750 --- /dev/null +++ b/ingest/temp_measurements_dump.sql @@ -0,0 +1,64 @@ +DROP TABLE IF EXISTS + staging_sensors +, staging_measurements +, staging_inserted_measurements; + + +CREATE {table} IF NOT EXISTS staging_sensors ( + ingest_id text, + is_new boolean DEFAULT true, + source_name text NOT NULL, + source_id text NOT NULL, + sensors_id int, + sensor_systems_id int, + ingest_sensor_systems_id text, + status text, + measurand text, + units text, + measurands_id int, + averaging_interval_seconds int, + logging_interval_seconds int, + metadata jsonb, + fetchlogs_id int +); + +-- CREATE {table} IF NOT EXISTS staging_sensors ( +-- ingest_id text NOT NULL, +-- is_new boolean DEFAULT true, +-- source_name text NOT NULL, +-- source_id text NOT NULL, +-- measurand text NOT NULL, +-- sensors_id int, +-- sensor_systems_id int, +-- ingest_sensor_systems_id text, +-- units text, +-- measurands_id int, +-- metadata jsonb, +-- fetchlogs_id int +-- ); + +CREATE {table} IF NOT EXISTS staging_measurements ( + ingest_id text NOT NULL, + source_name text NOT NULL, + source_id text NOT NULL, + measurand text NOT NULL, + sensors_id int, + value float, + datetime timestamptz, + lon float, + lat float, + fetchlogs_id int +); + +--This table will hold measurements that have +--actually been inserted into the measurements table +--this is to deal with the overlap that we see in the +--incoming files +CREATE {table} IF NOT EXISTS staging_inserted_measurements ( + sensors_id int + , datetime timestamptz + , value double precision + , lat double precision + , lon double precision + , fetchlogs_id int +); diff --git a/ingest/utils.py b/ingest/utils.py index 32a03df..e4ef524 100644 --- a/ingest/utils.py +++ b/ingest/utils.py @@ -1,11 +1,14 @@ import io import os +import sys from pathlib import Path import logging from urllib.parse import unquote_plus import gzip +import uuid import boto3 +import re from io import StringIO import psycopg2 # import typer @@ -58,6 +61,7 @@ def read(self, n=None): return "".join(line) + def put_metric( namespace, metricname, @@ -107,7 +111,7 @@ def clean_csv_value(value): def get_query(file, **params): - logger.debug("get_query: {file}, params: {params}") + logger.debug(f"get_query: {file}, params: {params}") query = Path(os.path.join(dir_path, file)).read_text() if params is not None and len(params) >= 1: query = query.format(**params) @@ -209,12 +213,87 @@ def check_if_done(cursor, key): return False +def 
deconstruct_path(key: str): + is_local = os.path.isfile(key) + is_s3 = bool(re.match(r"s3://[a-zA-Z]+[a-zA-Z0-9_-]+/[a-zA-Z]+", key)) + is_csv = bool(re.search(r"\.csv(.gz)?$", key)) + is_json = bool(re.search(r"\.(nd)?json(.gz)?$", key)) + is_compressed = bool(re.search(r"\.gz$", key)) + path = {} + if is_local: + path["local"] = True + path["key"] = key + elif is_s3: + # pull out the bucket name + p = key.split("//")[1].split("/") + path["bucket"] = p.pop(0) + path["key"] = "/".join(p) + else: + # use the current bucket from settings + path["bucket"] = settings.FETCH_BUCKET + path["key"] = key + + logger.debug(path) + return path + +def get_data(key: str): + # check to see if we were provided with a path that includes the source + # e.g. + # s3://bucket/key + # local://drive/key + # /key (assume local) + # or no source + # key (no forward slash, assume etl bucket) + if re.match(r"local://[a-zA-Z]+", key): + key = key.replace("local://", "") + + is_local = os.path.isfile(key) + is_s3 = bool(re.match(r"s3://[a-zA-Z]+[a-zA-Z0-9_-]+/[a-zA-Z]+", key)) + #is_csv = bool(re.search(r"\.csv(.gz)?$", key)) + #is_json = bool(re.search(r"\.(nd)?json(.gz)?$", key)) + is_compressed = bool(re.search(r"\.gz$", key)) + logger.debug(f"checking - {key}\ns3: {is_s3}; is_local: {is_local}") + + if is_local: + return get_file(key) + elif is_s3: + # pull out the bucket name + path = key.split("//")[1].split("/") + bucket = path.pop(0) + key = "/".join(path) + else: + # use the current bucket from settings + bucket = settings.FETCH_BUCKET + + # stream the file + logger.debug(f"streaming s3 file data from s3://{bucket}/{key}") + obj = s3.get_object( + Bucket=bucket, + Key=key, + ) + f = obj["Body"] + if is_compressed: + return gzip.GzipFile(fileobj=obj["Body"]) + else: + return obj["Body"] + + +def get_file(filepath: str): + is_compressed = bool(re.search(r"\.gz$", filepath)) + logger.debug(f"streaming local file data from {filepath}") + if is_compressed: + return gzip.open(filepath, 'rb') + else: + return io.open(filepath, "r", encoding="utf-8") + + def get_object( key: str, - bucket: str = settings.ETL_BUCKET + bucket: str = settings.FETCH_BUCKET ): key = unquote_plus(key) text = '' + logger.debug(f"Getting {key} from {bucket}") obj = s3.get_object( Bucket=bucket, Key=key, @@ -227,10 +306,11 @@ def get_object( return text + def put_object( data: str, key: str, - bucket: str = settings.ETL_BUCKET + bucket: str = settings.FETCH_BUCKET ): out = io.BytesIO() with gzip.GzipFile(fileobj=out, mode='wb') as gz: @@ -282,7 +362,7 @@ def select_object(key: str): content = "" logger.debug(f"Getting object: {key}, {output_serialization}") resp = s3.select_object_content( - Bucket=settings.ETL_BUCKET, + Bucket=settings.FETCH_BUCKET, Key=key, ExpressionType="SQL", Expression=""" @@ -392,40 +472,23 @@ def load_errors_list(limit: int = 10): return rows -def load_fail(cursor, key, e): - print("full copy failed", key, e) +def load_fail(cursor, fetchlogsId, e): + logger.warning(f"full copy of {fetchlogsId} failed: {e}") cursor.execute( """ UPDATE fetchlogs - SET - last_message=%s - WHERE - key=%s + SET last_message=%s + , has_error = true + , completed_datetime = clock_timestamp() + WHERE fetchlogs_id=%s """, ( str(e), - key, + fetchlogsId, ), ) -# def load_success(cursor, key): -# cursor.execute( -# """ -# UPDATE fetchlogs -# SET -# last_message=%s, -# loaded_datetime=clock_timestamp() -# WHERE -# key=%s -# """, -# ( -# str(cursor.statusmessage), -# key, -# ), -# ) - - def load_success(cursor, keys, message: str = 'success'): 
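+    # mark the given fetchlog keys as completed: store the message, set
+    # completed_datetime and clear has_error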
cursor.execute( """ @@ -433,6 +496,7 @@ def load_success(cursor, keys, message: str = 'success'): SET last_message=%s , completed_datetime=clock_timestamp() + , has_error = false WHERE key=ANY(%s) """, ( @@ -442,6 +506,50 @@ def load_success(cursor, keys, message: str = 'success'): ) +def load_fetchlogs( + pattern: str, + limit: int = 250, + ascending: bool = False, +): + order = 'ASC' if ascending else 'DESC' + conn = psycopg2.connect(settings.DATABASE_WRITE_URL) + cur = conn.cursor() + batch_uuid = uuid.uuid4().hex + cur.execute( + f""" + UPDATE fetchlogs + SET loaded_datetime = CURRENT_TIMESTAMP + , jobs = jobs + 1 + , batch_uuid = %s + FROM ( + SELECT fetchlogs_id + FROM fetchlogs + WHERE key~E'{pattern}' + AND NOT has_error + AND completed_datetime is null + AND ( + loaded_datetime IS NULL + OR loaded_datetime < now() - '30min'::interval + ) + ORDER BY last_modified {order} nulls last + LIMIT %s + FOR UPDATE SKIP LOCKED + ) as q + WHERE q.fetchlogs_id = fetchlogs.fetchlogs_id + RETURNING fetchlogs.fetchlogs_id + , fetchlogs.key + , fetchlogs.last_modified; + """, + (batch_uuid, limit,), + ) + rows = cur.fetchall() + logger.debug(f'Loaded {len(rows)} from fetchlogs using {pattern}/{order}') + conn.commit() + cur.close() + conn.close() + return rows + + def add_fetchlog(key: str): with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: with connection.cursor() as cursor: @@ -500,6 +608,7 @@ def mark_success( SET last_message=%s , completed_datetime={completed} + , has_error = false WHERE {where} """, ( @@ -553,7 +662,7 @@ def crawl(bucket, prefix): def crawl_lcs(): - crawl(settings.ETL_BUCKET, "lcs-etl-pipeline/") + crawl(settings.FETCH_BUCKET, "lcs-etl-pipeline/") def crawl_fetch(): diff --git a/local.py b/local.py new file mode 100644 index 0000000..7a52adf --- /dev/null +++ b/local.py @@ -0,0 +1,73 @@ +import os +import sys +import orjson +import psycopg2 +import logging +from time import time +import csv + + +from ingest.lcsV2 import ( + IngestClient, + load_measurements, + load_measurements_db, +) + +from ingest.utils import ( + select_object, + get_file, +) + +logger = logging.getLogger('handler') + +logging.basicConfig( + format='[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s', + level='DEBUG', + force=True, +) + +logging.getLogger('boto3').setLevel(logging.WARNING) +logging.getLogger('botocore').setLevel(logging.WARNING) +logging.getLogger('urllib3').setLevel(logging.WARNING) + + +# local files +#load_measurements_db(pattern = '^/home/christian/.*\\.(csv|json)') +# remote files, make sure it can at least read it +#load_measurements_db() + +## client based methods +## get a client +client = IngestClient() +## load all the data into the client +client.load_keys([ + [1, '~/Downloads/openaq-fetches/lcs-etl-pipeline/measures/lovemyair/2024-11-12/1731445632-1snpf.json', '2024-10-23'] +]) + +## dump just the locations +client.dump() + +# rollups and cached tables +#client.process_hourly_data() +#client.process_daily_data() +#client.process_annual_data() +#client.refresh_cached_tables() + +#client.dump_locations(False) +#client.dump_measurements(load=False) +## dump just the measurements +# client.dump_measurements +## Dump both +#client.dump() + +# #client.load(data) +# client.load_metadata(data['meta']) +# client.load_locations(data['locations']) +# client.load_measurements(data['measures']) + +# #client.dump() + + +# print(time() - start_time) +# print(f"measurements: {len(client.measurements)}") +# print(f"locations: {len(client.nodes)}") diff --git 
a/matching_clarity_sensors.sql b/matching_clarity_sensors.sql new file mode 100644 index 0000000..83997b9 --- /dev/null +++ b/matching_clarity_sensors.sql @@ -0,0 +1,80 @@ + + +CREATE TABLE current_clarity_nodes AS +SELECT unnest(ARRAY[1285716,1314366,1373846,1378636,1533718,1533720,1533721,1533915,1533917,1533920,1894630,1894631,1894632,1894634,1894636,1894637,1894638,1894639,1894640,1894641,1894642,1894643,1924313,1949202,1949203,1949206,2152632,2152633,2156118,2402491,290475,290476,290477,290478,290479,290480,290481,290482,290483,290484,290485,290487,290488,290489,290490,290491,290492,290495,290496,290498,290499,290500,290501,290502,290504,290505,290506,290508,290510,290512,290513,290515,290517,290518,290519,290520,290521,290522,290523,290524,290526,290528,290529,290530,290531,290532,290533,290534,290535,290536,290537,290538,290540,290541,290542,290543,290544,290545,290546,290549,290551,290552,290553,290554,290555,290557,290558,290559,290560,290561,290563,290564,290565,290566,290567,290569,290570,290571,290572,290573,290574,290575,290576,290578,290582,290583,290584,290585,290587,290588,290589,290590,290591,290593,290594,290595,290596,290597,290599,290600,290601,290602,290603,290604,290605,290606,290607,290608,290609,290610,290611,290614,290615,290616,290618,290620,290621,290622,290623,290624,290625,290626,290628,290629,290630,290632,290633,290634,290635,290636,290637,290638,290639,290641,290642,290643,290644,290645,290646,290648,290649,290650,290651,290652,290653,290654,290655,290656,290657,290658,290659,290660,290661,290662,290664,290665,290667,290668,290670,290671,290672,290674,290675,290677,290678,290679,290680,290681,290683,290685,290686,290687,300026,300027,300028,300030,301884,301885,308728,310353,310354,310355,310356,310357,310358,310360,351822,351823,351824,351825,351826,367083,367107,367110,367112,367113,367114,367116,367117,367118,370742,370743,370744,370750,370751,370752,815609,923364,923365,929705,938377,947124,947125,947126,947127,947128,947130,947132,947133,947134,947137,947138,947139,947140,947141,947142,947143,947144,947150,947151,947152,947153,947154,947155,947156,947157,947158,947159,947160,947161,947162,947163,947164,947165,947166,947168,947169,947170,947171,947172,947173,947174,947175,947176,947177,947178,947180,947182,947183,947184,947185,947186,947187,947188,947189,947190,947191,947192,947194,947195,947196,947197,947198,947199,947200,947201,947202,947203,947204,947205,947206,947207,947208,947210,947211,947212,947213,947214,947216,947217,947218,947219,947220,947221,947222,947223,947224,947225,947226,947227,947228,947229,947230,947231,947232,947234,947235,947236,947237,947238,947239,947240,947241,947242,947243,947244,947245,947246,947247,947248,947249,947250,947251,947252,947253,947254,947255,947256,947257,947258,947259,947260,947261,947262,947264,947265,947266,947267,947268,947270,947271,947273,947274,947275,947276,947277,947278,947279,947280,947281,947283,947284,947285,947286,947287,947288,947289,947290,947291,947292,947295,947296,947297,947298,947299,947300,947301,947302,947303,947304,947305,947306,947307,947308,947309,947310,947312,947313,947314,947315,947316,947317,947318,947319,947320,947321,947322,947323,947324,947325,947326,947327,947328,947329,947330,947332,947334,947335,947336,947338,947339,947340,947341,947342,947343,947344,947345,947346,947347,947348,947349]) as node; + + + + WITH clarity AS ( + SELECT sensor_nodes_id + , source_id + , site_name + , geom + , added_on + , node IS NOT NULL as is_active + FROM sensor_nodes + JOIN current_clarity_nodes ON 
(sensor_nodes_id = node) + WHERE source_name = 'clarity') + SELECT c.sensor_nodes_id + , c.source_id + , c.site_name + , n.source_id + , n.site_name + , c.geom = n.geom + , is_active + , ROUND(st_distance(c.geom, n.geom)::numeric, 4) as distance + FROM clarity c + LEFT JOIN staging_sensornodes n ON (st_distance(c.geom, n.geom)<0.0001) + -- WHERE n.source_id IS NOT NULL OR is_active + WHERE n.source_id IS NULL + ORDER BY c.sensor_nodes_id DESC NULLS FIRST; + + + + + WITH clarity AS ( + SELECT sensor_nodes_id + , source_id + , site_name + , geom + , added_on + , node IS NOT NULL as is_active + FROM sensor_nodes + LEFT JOIN current_clarity_nodes ON (sensor_nodes_id = node) + WHERE source_name = 'clarity') + SELECT n.source_id + , n.site_name + , c.source_id + , c.site_name + --, c.geom + --, n.geom + --, c.added_on + , c.geom = n.geom + , is_active + , ROUND(st_distance(c.geom, n.geom)::numeric, 4) as distance + , c.sensor_nodes_id + , c.sensor_nodes_id = LAG(c.sensor_nodes_id) OVER (ORDER BY c.sensor_nodes_id) + FROM staging_sensornodes n + --JOIN clarity c ON (n.site_name = c.site_name) + --JOIN clarity c ON (n.geom = c.geom) + JOIN clarity c ON (n.source_id = c.source_id) + --LEFT JOIN clarity c ON (st_distance(c.geom, n.geom)<0.00001) + --WHERE n.source_id IS NOT NULL OR is_active + WHERE is_active + ORDER BY sensor_nodes_id DESC; + + + + + + SELECT * + FROM staging_sensornodes + WHERE source_id = 'DBXRI9190'; + + + -- How many active clarity sensor nodes do we have? + SELECT string_agg(DISTINCT sensor_nodes_id::text, ',') + FROM sensor_nodes_check + WHERE source_name = 'clarity' + AND datetime_last > current_date + ; diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1c86072 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,25 @@ +[tool.poetry] +name = "ingest" +version = "0.1.0" +description = "Data ingestor for OpenAQ Framework" +authors = ["OpenAQ "] +readme = "README.md" + +[tool.poetry.dependencies] +python = "^3.12" +dateparser = "^1.2.0" +orjson = "^3.10.4" +psycopg2-binary = "^2.9.9" +pytz = "^2024.1" +typer = "^0.12.3" +typing-extensions = "^4.12.2" +pydantic = {extras = ["dotenv"], version = "^2.7.3"} +pydantic-settings = "^2.3.2" + +[tool.poetry.group.cdk.dependencies] +aws-cdk-lib = "^2.145.0" +boto3 = "^1.34.124" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 4296f15..0000000 --- a/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -dateparser==1.1.1 -orjson==3.6.8 -psycopg2-binary==2.9.3 -pydantic[dotenv] -pytz==2022.1 -pytz-deprecation-shim==0.1.0.post0 -typer==0.4.1 -typing_extensions==4.2.0 diff --git a/requirements_dev.txt b/requirements_dev.txt deleted file mode 100644 index 30ddf82..0000000 --- a/requirements_dev.txt +++ /dev/null @@ -1 +0,0 @@ -boto3 diff --git a/tests/benchmark.py b/tests/benchmark.py new file mode 100644 index 0000000..d8dfc34 --- /dev/null +++ b/tests/benchmark.py @@ -0,0 +1,43 @@ +import logging +import os +import sys +import argparse + +logger = logging.getLogger(__name__) + +parser = argparse.ArgumentParser( + description=""" +Test benchmarks for ingestion + """) + +parser.add_argument( + '--name', + type=str, + required=False, + default="test", + help='Name to use for the test' + ) +parser.add_argument( + '--env', + type=str, + default='.env', + required=False, + help='The dot env file to use' + ) +parser.add_argument( + '--debug', + action="store_true", + help='Output at DEBUG level' + ) +args = 
parser.parse_args() + +from ingest.settings import settings + +logging.basicConfig( + format='[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s', + level=settings.LOG_LEVEL.upper(), + force=True, +) + + +print(args) diff --git a/tests/benchmarking.r b/tests/benchmarking.r new file mode 100644 index 0000000..3b9a076 --- /dev/null +++ b/tests/benchmarking.r @@ -0,0 +1,157 @@ + +source("~/git/R/ESRfunctions.r") + +stats <- dir('tests/benchmark_output', pattern = "*stats.csv$", full.names=TRUE) + +params <- data.frame( + ram = c( + 2, 1, 0.25, 0.5, .02, .25, 0.25, 5, + 4,8,16,32,64,128,256, + 4,8,16,32,64,128,256, + 4,8,16,32,64,128,256, + 4,8,16,32,64,128,256, + 4,8,16,32,64,128,256, + 4,8,16,32,64,128,256 + ), + cores = c( + 16, 16, 16, 16, 4, 4, + 8, 8, 8, 8, 8, 8, 8, 8, 8, + 16, 16, 16, 16, 16, 16, 16, + 4, 4, 4, 4, 4, 4, 4, + 8, 8, 8, 8, 8, 8, 8, + 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16 + ), + x86 = c( + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, + FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE + ), + v1 = c( + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + ), + ingesting = c( + FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, + FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, + FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, + FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, + FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + ), + row.names = c( + "4xlarge_stats.csv", "4xlarge1g_stats.csv", "4xlarge256mb_stats.csv", "4xlarge500mb_stats.csv", + "prod_stats.csv", "r5xlarge_stats.csv", "r6i2xlarge_stats.csv", "xxlarge-5gb_stats.csv", + "2xlarge-4MB_stats.csv", "2xlarge-8MB_stats.csv","2xlarge-16MB_stats.csv","2xlarge-32MB_stats.csv", + "2xlarge-64MB_stats.csv","2xlarge-128MB_stats.csv","2xlarge-256MB_stats.csv", + "4xlarge-4MB_stats.csv", "4xlarge-8MB_stats.csv","4xlarge-16MB_stats.csv","4xlarge-32MB_stats.csv", + "4xlarge-64MB_stats.csv","4xlarge-128MB_stats.csv","4xlarge-256MB_stats.csv", + "xlarge-4MB_stats.csv", "xlarge-8MB_stats.csv","xlarge-16MB_stats.csv","xlarge-32MB_stats.csv", + "xlarge-64MB_stats.csv","xlarge-128MB_stats.csv","xlarge-256MB_stats.csv", + "2xlargeV2-4MB_stats.csv", "2xlargeV2-8MB_stats.csv","2xlargeV2-16MB_stats.csv","2xlargeV2-32MB_stats.csv", + "2xlargeV2-64MB_stats.csv","2xlargeV2-128MB_stats.csv","2xlargeV2-256MB_stats.csv", + "2xlargeARM-4MB_stats.csv", "2xlargeARM-8MB_stats.csv","2xlargeARM-16MB_stats.csv","2xlargeARM-32MB_stats.csv", + "2xlargeARM-64MB_stats.csv","2xlargeARM-128MB_stats.csv","2xlargeARM-256MB_stats.csv", + "2xlargeX86-4MB_stats.csv", "2xlargeX86-8MB_stats.csv","2xlargeX86-16MB_stats.csv","2xlargeX86-32MB_stats.csv", + "2xlargeX86-64MB_stats.csv","2xlargeX86-128MB_stats.csv","2xlargeX86-256MB_stats.csv" + ) +) + +x <- do.call(rbind, lapply(stats, function(path) { + x <- read.csv(path) + x$path <- basename(path) + x[x$Name == 'Aggregated', ] + #x[x$Name == 'v2/locations/:id', ] + #x[x$Name == 'v2/latest/empty', ] +})) + 
+x$cores <- params[x$path, "cores"] +x$ram <- params[x$path, "ram"] +x$x86 <- params[x$path, "x86"] +x$v1 <- params[x$path, "v1"] +x$ingesting <- params[x$path, "ingesting"] + + +x <- x[x$path != "prod_stats.csv", ] + +x <- x[order(x$ram), ] + +plot(Average.Response.Time ~ cores, x) +plot(Requests.s ~ cores, x) + +plot(Average.Response.Time ~ ram, x) + +ncores <- 16 +plot(Requests.s ~ ram, subset(x, cores == ncores)) +plot(Average.Response.Time ~ ram, subset(x, cores == ncores)) +plot(Average.Response.Time ~ cores, subset(x, cores <= ncores), pch=cores, col=1) + +plot(Average.Response.Time ~ ram, subset(x, cores <= ncores), pch=cores, col=1) +legend('topright', legend=unique(x$cores), pch=unique(x$cores), bty='n', ncol=3) + +plot(Average.Response.Time ~ ram, subset(x, cores <= ncores), pch=19, col=as.numeric(1:nrow(x) %in% grep('V2', x$path))+1) +legend('topright', legend=c('V1', 'V2'), pch=19, col=1:2, bty='n', ncol=3) + +plot(Average.Response.Time ~ ram, subset(x, cores == ncores), pch=19, col=as.numeric(1:nrow(x) %in% grep('ARM', x$path))+1) +legend('topright', legend=c('x86', 'ARM'), pch=19, col=1:2, bty='n', ncol=3) + +plot(Average.Response.Time ~ ram, subset(x, cores == ncores), pch=19) +points(Average.Response.Time ~ ram, x[grep('ARM', x$path), ], pch=19, col='red') +legend('topright', legend=c('x86', 'ARM'), pch=19, col=1:2, bty='n', ncol=3) + +plot(Average.Response.Time ~ ram, subset(x, cores == ncores & ingesting), pch=19, col=x$x86+1) +legend('topright', legend=c('x86', 'ARM'), pch=19, col=1:2, bty='n', ncol=3) + + +points(Average.Response.Time ~ ram, x[grep('ARM', x$path), ], pch=19, col='red') +legend('topright', legend=c('x86', 'ARM'), pch=19, col=1:2, bty='n', ncol=3) + + +plot(X50. ~ ram, x) +plot(X75. ~ ram, subset(x, cores == ncores)) +plot(Request.Count ~ ram, subset(x, cores == ncores)) +plot(Failure.Count ~ ram, subset(x, cores == ncores)) +plot(ram ~ cores, x) + + +exporters <- dir('tests/benchmark_output', pattern = "*export_output*", full.names=TRUE) + +params <- data.frame( + ram = c( + 64, 0.128, .004, .004, + .004, 1, 20, 40, + 5, 8, .004, .004 + ), + cores = c( + 16, 16, 4, 4, + 2, 8, 8, 8, + 8, 8, 4, 4 + ), + row.names = c( + "4xlarge-wm64gb","4xlarge", "prod", "r5", + "small", "xxlarge-wm1g", "xxlarge-wm20g", "xxlarge-wm40g", + "xxlarge-wm5g", "xxlarge-wm8g", "xxlarge", "benchmark_export_output" + ) +) + +x <- do.call(rbind, lapply(exporters, function(path) { + x <- read.csv(path, quote="'") + x$path <- basename(path) + x$test <- gsub("benchmark_export_output_|.csv$", "", basename(path)) + return(x) +})) +x$cores = params[x$test,"cores"] +x$ram = params[x$test,"ram"] + +boxplot(time_ms~cores, x) +plot(I(time_ms/1000)~jitter(ram, 10),x, log="y") diff --git a/tests/check_lcs_file.py b/tests/check_lcs_file.py deleted file mode 100644 index 06fa454..0000000 --- a/tests/check_lcs_file.py +++ /dev/null @@ -1,59 +0,0 @@ -import logging -import sys -import os -import json - -if 'DOTENV' not in os.environ.keys(): - os.environ['DOTENV'] = '.env.testing' - -if 'AWS_PROFILE' not in os.environ.keys(): - os.environ['AWS_PROFILE'] = 'python-user' - -from pandas import DataFrame -from botocore.exceptions import ClientError -from openaq_fastapi.ingest.handler import cronhandler, logger -from openaq_fastapi.settings import settings - -from openaq_fastapi.ingest.lcs import ( - LCSData, - load_metadata_db, - load_measurements_db, - load_measurements_file, - load_measurements, - get_measurements, -) - - -from openaq_fastapi.ingest.utils import ( - load_errors, - select_object, - 
get_object, - get_logs_from_ids, - get_logs_from_pattern, - unquote_plus, -) - - -# load_realtime('realtime-gzipped/2022-02-04/1643994434.ndjson.gz') - -# logs = get_logs_from_pattern('stations/clarity', 2) -# - -# station data -# logs = get_logs_from_ids(ids=[5544399, 4874871]) - -# for each of them lets try and import the data -# contents = [] -# for row in logs: -# contents.append( -# {"Key": unquote_plus(row[1]), "LastModified": row[6], "id": row[0], } -# ) - -# data = LCSData(contents) -# data.get_metadata() - - -# measurement data -logs = get_logs_from_ids(ids=[5609404]) - -load_measurements(logs) diff --git a/tests/check_realtime_file.py b/tests/check_realtime_file.py deleted file mode 100644 index 880417e..0000000 --- a/tests/check_realtime_file.py +++ /dev/null @@ -1,111 +0,0 @@ -import logging -import sys -import os -import json - -if 'DOTENV' not in os.environ.keys(): - os.environ['DOTENV'] = '.env.testing' - -if 'AWS_PROFILE' not in os.environ.keys(): - os.environ['AWS_PROFILE'] = 'python-user' - -from botocore.exceptions import ClientError -from openaq_fastapi.ingest.handler import cronhandler, logger -from openaq_fastapi.settings import settings - -from openaq_fastapi.ingest.lcs import ( - load_metadata_db, - load_measurements_db, - load_measurements_file, - load_measurements, - get_measurements, -) - -from openaq_fastapi.ingest.fetch import ( - load_realtime, - parse_json, -) - -from openaq_fastapi.ingest.utils import ( - load_errors, - select_object, - get_object, - get_logs_from_ids, -) - - -# load_realtime('realtime-gzipped/2022-02-04/1643994434.ndjson.gz') - -logs = get_logs_from_ids(ids=[5634328]) - -# logs = load_errors() - -keys = [log[1] for log in logs] - -#load_realtime(keys) - -print(f"Found {len(keys)} potential errors") - -for idx, key in enumerate(keys): - print(f"\n## Checking #{idx}: {key}") - # get text of object - try: - txt = get_object(key) - except Exception as e: - print(f"\t*** Error getting file: {e}") - continue - # break into lines - lines = txt.split("\n") - # check parse for each line - n = len(lines) - errors = [] - for jdx, line in enumerate(lines): - try: - # first just try and load it - obj = json.loads(line) - except Exception as e: - errors.append(jdx) - print(f"\t*** Loading error on line #{jdx} (of {n}): {e}\n{line}") - try: - # then we can try to parse it - row = parse_json(obj) - except Exception as e: - errors.append(jdx) - print(f"\t*** Parsing rror on line #{jdx} (of {n}): {e}\n{line}") - - - -# load_realtime(keys) - # load_realtime([ - # 'realtime-gzipped/2022-02-05/1644020232.ndjson.gz', - # 'realtime-gzipped/2022-02-05/1644068231.ndjson.gz' - # ]) - -# errors = load_errors(10) - -# print(f"Found {len(errors)} possible error files") - -# for file in errors: -# key = file[3] -# print(f"Checking file {key}") -# try: -# obj = select_object(key) -# except ClientError as e: -# if e.response['Error']['Code'] == 'JSONParsingError': -# print("There was an error parsing the file, fetching as raw file") -# print(e.response['Error']) -# obj = get_object(key) -# else: -# print("Some other error") -# except Exception as e: -# print(f"post-boto error: {e}") -# obj = get_object(key) - -# print(obj[-50:]) -# # save the file locally -# filepath = os.path.join(settings.LOCAL_SAVE_DIRECTORY, key) -# print(f"Writing file to {filepath}") -# os.makedirs(os.path.dirname(filepath), exist_ok=True) -# fle = open(filepath.replace(".gz", ""), 'w') -# fle.write(obj) -# fle.close() diff --git a/tests/test_file1.json b/tests/test_file1.json new file mode 100644 
index 0000000..228fb0b --- /dev/null +++ b/tests/test_file1.json @@ -0,0 +1,120 @@ +{ + "meta": { + "schema": "v0.1", + "source": "local", + "matching_method": "ingest-id" + }, + "measures": [ + { + "sensor_id": "local-test_site_1-co", + "timestamp": "2024-01-01T00:00:00Z", + "measure": 0.01 + }, + { + "sensor_id": "local-test_site_1-co", + "timestamp": "2024-01-02T00:00:00Z", + "measure": 0.02 + }, + { + "sensor_id": "local-test_site_2-wind_speed", + "timestamp": "2024-01-01T00:00:00Z", + "measure": 0.01 + }, + { + "sensor_id": "local-test_site_2-wind_speed", + "timestamp": "2024-01-02T00:00:00Z", + "measure": 0.02 + } + ], + "locations": [ + { + "location": "local-test_site_1", + "label": "Test Site #1", + "lat": "45.56", + "lon": -123.45, + "ismobile": "false", + "systems": [ + { + "system_id": "local-test_site_1-metone:aio2", + "manufacturer_name": "MetOne", + "model_name": "AIO2", + "sensors": [ + { + "sensor_id": "local-test_site_1-wind_speed", + "status": "u", + "parameter": "ws", + "interval_seconds": "3600", + "flags": [ + { + "flag_id": "local-test_site_1-wind_speed-info::2024-01-01", + "datetime_from": "2024-01-01", + "datetime_to": "2024-01-02", + "flag_name": "info", + "note": "initial flag for sensor" + } + ] + } + ] + }, + { + "system_id": "local-test_site_1-ecotech:serinus_30", + "manufacturer_name": "Ecotech", + "model_name": "Serinus 30", + "sensors": [ + { + "sensor_id": "local-test_site_1-co", + "status": "u", + "parameter": "co", + "interval_seconds": "3600", + "flags": [ + { + "flag_id": "local-test_site_1-co-info::2024-01-01", + "datetime_from": "2024-01-01", + "datetime_to": "2024-01-05", + "flag_name": "info" + } + ] + } + ] + } + ] + }, + { + "location": "local-test_site_2", + "label": "Test Site #2", + "lat": "47.56", + "lon": -124.45, + "ismobile": "false", + "systems": [ + { + "system_id": "local-test_site_2-metone:aio2", + "manufacturer_name": "MetOne", + "model_name": "AIO2", + "sensors": [ + { + "sensor_id": "local-test_site_2-wind_speed", + "status": "u", + "parameter": "ws", + "interval_seconds": "3600", + "flags": [] + }, + { + "sensor_id": "local-test_site_2-wind_direction", + "status": "u", + "parameter": "wd", + "interval_seconds": "3600", + "flags": [ + { + "flag_id": "local-test_site_2-wind_direction-error::2024-01-01", + "datetime_from": "2024-01-01", + "datetime_to": "2024-01-02", + "flag_name": "info" + } + ] + } + ] + } + ] + } + ] +} diff --git a/tests/test_file2.json b/tests/test_file2.json new file mode 100644 index 0000000..9a619d5 --- /dev/null +++ b/tests/test_file2.json @@ -0,0 +1,127 @@ +{ + "meta": { + "schema": "v0.1", + "source": "local", + "matching_method": "ingest-id" + }, + "measures": [ + { + "sensor_id": "local-test_site_1-co", + "timestamp": "2024-01-03T00:00:00Z", + "measure": 0.03 + }, + { + "sensor_id": "local-test_site_1-co", + "timestamp": "2024-01-04T00:00:00Z", + "measure": 0.04 + }, + { + "sensor_id": "local-test_site_1-wind_speed", + "timestamp": "2024-01-03T00:00:00Z", + "measure": 0.03 + }, + { + "sensor_id": "local-test_site_1-wind_speed", + "timestamp": "2024-01-04T00:00:00Z", + "measure": 0.04 + } + ], + "locations": [ + { + "location": "local-test_site_1", + "label": "Test Site #1", + "lat": "45.56", + "lon": -123.45, + "ismobile": "false", + "systems": [ + { + "system_id": "local-test_site_1-metone:aio2", + "manufacturer_name": "MetOne", + "model_name": "AIO2", + "sensors": [ + { + "sensor_id": "local-test_site_1-wind_speed", + "status": "u", + "parameter": "ws", + "interval_seconds": "3600", + "flags": [ + { + 
"flag_id": "local-test_site_1-wind_speed-info::2024-01-01", + "datetime_from": "2024-01-02", + "datetime_to": "2024-01-04", + "flag_name": "info", + "note": "initial flag for sensor" + }, + { + "flag_id": "local-test_site_1-wind_speed-info::2024-01-01", + "datetime_from": "2024-01-02", + "datetime_to": "2024-01-04", + "flag_name": "info", + "note": "A new note for this sensor" + } + ] + } + ] + }, + { + "system_id": "local-test_site_1-ecotech:serinus_30", + "manufacturer_name": "Ecotech", + "model_name": "Serinus 30", + "sensors": [ + { + "sensor_id": "local-test_site_1-co", + "status": "u", + "parameter": "co", + "interval_seconds": "3600", + "flags": [ + { + "flag_id": "local-test_site_1-co-info::2024-01-01", + "datetime_from": "2024-01-01", + "datetime_to": "2024-01-05", + "flag_name": "info" + } + ] + } + ] + } + ] + }, + { + "location": "local-test_site_2", + "label": "Test Site #2", + "lat": "47.56", + "lon": -124.45, + "ismobile": "false", + "systems": [ + { + "system_id": "local-test_site_2-metone:aio2", + "manufacturer_name": "MetOne", + "model_name": "AIO2", + "sensors": [ + { + "sensor_id": "local-test_site_2-wind_speed", + "status": "u", + "parameter": "ws", + "interval_seconds": "3600", + "flags": [] + }, + { + "sensor_id": "local-test_site_2-wind_direction", + "status": "u", + "parameter": "wd", + "interval_seconds": "3600", + "flags": [ + { + "flag_id": "local-test_site_2-wind_direction-error::2024-01-03", + "datetime_from": "2024-01-03", + "datetime_to": "2024-01-04", + "flag_name": "info" + } + ] + } + ] + } + ] + } + ] +} diff --git a/tests/test_flags.py b/tests/test_flags.py new file mode 100644 index 0000000..8ac6fd4 --- /dev/null +++ b/tests/test_flags.py @@ -0,0 +1,48 @@ +import os +import sys +import orjson +import psycopg2 +import logging +from time import time +import csv + +os.chdir(os.path.dirname(os.path.dirname(__file__))) + +from ingest.lcsV2 import ( + IngestClient, +) + + +logger = logging.getLogger('handler') + +logging.basicConfig( + format='[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s', + level='DEBUG', + force=True, +) + +logging.getLogger('boto3').setLevel(logging.WARNING) +logging.getLogger('botocore').setLevel(logging.WARNING) +logging.getLogger('urllib3').setLevel(logging.WARNING) + + +## client based methods +## get a client +client = IngestClient() +## load all the data into the client +client.load_keys([[1, './tests/test_file1.json', '2024-01-01']]) +## load the data +client.dump(load=True) +#client.dump_locations(load=False) +#client.dump_measurements(load=True) + +client.reset() + +client.load_keys([[2, './tests/test_file2.json', '2024-01-02']]) +## load the data +client.dump(load=True) + +client.process_hourly_data() +client.process_daily_data() +client.process_annual_data() +client.refresh_cached_tables()