From 0e094159fa21a65c80b85d93695f765969d9b2a4 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Wed, 12 Oct 2022 08:35:54 -0700 Subject: [PATCH 01/42] Improvements to allow nodes to be added from measurements in lcs --- check.py | 2 +- ingest/lcs.py | 2 +- ingest/lcs_meas_ingest.sql | 93 ++++++++++++++++++++++++++++++++------ ingest/lcs_staging.sql | 25 +++++++++- 4 files changed, 106 insertions(+), 16 deletions(-) diff --git a/check.py b/check.py index 22976cf..919ee8c 100644 --- a/check.py +++ b/check.py @@ -154,7 +154,7 @@ def check_realtime_key(key: str, fix: bool = False): else: check_realtime_key(key, args.fix) else: - print(key) + load_measurements([(args.id, key, None)]) if args.download: print(f'downloading: {key}') diff --git a/ingest/lcs.py b/ingest/lcs.py index 43b2da0..0e9596f 100644 --- a/ingest/lcs.py +++ b/ingest/lcs.py @@ -532,7 +532,7 @@ def load_measurements(rows): connection.set_session(autocommit=True) with connection.cursor() as cursor: - cursor.execute(get_query("lcs_meas_staging.sql")) + cursor.execute(get_query("lcs_staging.sql")) start = time() write_csv( cursor, new, "keys", ["key",], diff --git a/ingest/lcs_meas_ingest.sql b/ingest/lcs_meas_ingest.sql index 953791e..0bdabc9 100644 --- a/ingest/lcs_meas_ingest.sql +++ b/ingest/lcs_meas_ingest.sql @@ -5,6 +5,8 @@ __process_start timestamptz := clock_timestamp(); __inserted_measurements int; __rejected_measurements int; __exported_days int; +__inserted_start_datetime timestamptz; +__inserted_end_datetime timestamptz; BEGIN DELETE @@ -28,6 +30,57 @@ SET sensors_id=s.sensors_id FROM sensors s WHERE s.source_id=ingest_id; + +-- first the sensor nodes +WITH nodes AS ( +INSERT INTO sensor_nodes ( + source_name +, source_id ) +SELECT split_part(ingest_id, '-', 1) as source_name +, split_part(ingest_id, '-', 2) as source_id +FROM meas +WHERE sensors_id IS NULL +GROUP BY 1,2 +ON CONFLICT (source_name, source_id) DO UPDATE +SET source_id = EXCLUDED.source_id +RETURNING sensor_nodes_id, source_id) +INSERT INTO sensor_systems ( + sensor_nodes_id +, source_id) +SELECT sensor_nodes_id +, source_id +FROM nodes +ON CONFLICT DO NOTHING; + +-- now create a sensor for each +-- this method depends on us having a match for the parameter +WITH sen AS ( +SELECT ingest_id +, split_part(ingest_id, '-', 1) as source_name +, split_part(ingest_id, '-', 2) as source_id +, split_part(ingest_id, '-', 3) as parameter +FROM meas +WHERE sensors_id IS NULL +GROUP BY 1,2,3,4) +INSERT INTO sensors (sensor_systems_id, measurands_id, source_id) +SELECT sy.sensor_systems_id +, m.measurands_id +, ingest_id +FROM sen s +JOIN measurands_map_view m ON (s.parameter = m.key) +JOIN sensor_nodes n ON (s.source_name = n.source_name AND s.source_id = n.source_id) +JOIN sensor_systems sy ON (sy.sensor_nodes_id = n.sensor_nodes_id AND s.source_id = sy.source_id) +ON CONFLICT DO NOTHING; + +-- try again to find the sensors +UPDATE meas +SET sensors_id=s.sensors_id +FROM sensors s +WHERE s.source_id=ingest_id +AND meas.sensors_id IS NULL; + +-- reject any missing. 
Most likely due to issues +-- with the measurand WITH r AS ( INSERT INTO rejects (t,tbl,r,fetchlogs_id) SELECT @@ -42,9 +95,9 @@ SELECT COUNT(1) INTO __rejected_measurements FROM r; -DELETE -FROM meas -WHERE sensors_id IS NULL; +--DELETE +--FROM meas +--WHERE sensors_id IS NULL; -- --Some fake data to make it easier to test this section -- TRUNCATE meas; @@ -55,7 +108,7 @@ WHERE sensors_id IS NULL; -- , generate_series(now() - '3day'::interval, current_date, '1hour'::interval); -WITH m AS ( +WITH inserts AS ( INSERT INTO measurements ( sensors_id, datetime, @@ -63,7 +116,7 @@ INSERT INTO measurements ( lon, lat ) SELECT - DISTINCT + --DISTINCT sensors_id, datetime, value, @@ -72,13 +125,25 @@ INSERT INTO measurements ( FROM meas WHERE sensors_id IS NOT NULL ON CONFLICT DO NOTHING -RETURNING 1) -SELECT COUNT(1) INTO __inserted_measurements -FROM m; +RETURNING sensors_id, datetime +), inserted as ( + INSERT INTO temp_inserted_measurements (sensors_id, datetime) + SELECT sensors_id + , datetime + FROM inserts + RETURNING sensors_id, datetime +) +SELECT MIN(datetime) +, MAX(datetime) +, COUNT(1) +INTO __inserted_start_datetime +, __inserted_end_datetime +, __inserted_measurements +FROM inserted; --- Update the export queue/logs to export these records --- wrap it in a block just in case the database does not have this module installed --- we subtract the second because the data is assumed to be time ending +--Update the export queue/logs to export these records +--wrap it in a block just in case the database does not have this module installed +--we subtract the second because the data is assumed to be time ending WITH e AS ( INSERT INTO open_data_export_logs (sensor_nodes_id, day, records, measurands, modified_on) SELECT sn.sensor_nodes_id @@ -86,7 +151,7 @@ SELECT sn.sensor_nodes_id , COUNT(1) , COUNT(DISTINCT p.measurands_id) , MAX(now()) -FROM meas m +FROM temp_inserted_measurements m -- meas m JOIN sensors s ON (m.sensors_id = s.sensors_id) JOIN measurands p ON (s.measurands_id = p.measurands_id) JOIN sensor_systems ss ON (s.sensor_systems_id = ss.sensor_systems_id) @@ -101,8 +166,10 @@ RETURNING 1) SELECT COUNT(1) INTO __exported_days FROM e; -RAISE NOTICE 'inserted-measurements: %, rejected-measurements: %, exported-sensor-days: %, process-time-ms: %, source: lcs' +RAISE NOTICE 'inserted-measurements: %, inserted-from: %, inserted-to: %, rejected-measurements: %, exported-sensor-days: %, process-time-ms: %, source: lcs' , __inserted_measurements + , __inserted_start_datetime + , __inserted_end_datetime , __rejected_measurements , __exported_days , 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); diff --git a/ingest/lcs_staging.sql b/ingest/lcs_staging.sql index dcf3067..6beb76c 100644 --- a/ingest/lcs_staging.sql +++ b/ingest/lcs_staging.sql @@ -31,4 +31,27 @@ CREATE TEMP TABLE IF NOT EXISTS ms_sensors ( fetchlogs_id int ); -CREATE TEMP TABLE keys (fetchlogs_id int, key text, last_modified timestamptz); +CREATE TEMP TABLE IF NOT EXISTS meas ( + ingest_id text, + sensors_id int, + value float, + datetime timestamptz, + lon float, + lat float, + fetchlogs_id int +); + +CREATE TEMP TABLE IF NOT EXISTS keys ( + fetchlogs_id int + , key text + , last_modified timestamptz + ); + +-- This table will hold measurements that have +-- actually been inserted into the measurements table +-- this is to deal with the overlap that we see in the +-- incoming files +CREATE TEMP TABLE IF NOT EXISTS temp_inserted_measurements ( + sensors_id int, + datetime timestamptz +); From 
d79becd6f5dc60150c049e7044c3ca64fbaf6fe7 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 25 Oct 2022 15:22:05 -0700 Subject: [PATCH 02/42] Moved to using orjson in the fetch process Old method was not keeping ascii characters as is and instead of updating the old json.dumps method we are just shifting over to orjson like the lcs method is using which has different defaults. --- ingest/fetch.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ingest/fetch.py b/ingest/fetch.py index 450fca3..b751693 100644 --- a/ingest/fetch.py +++ b/ingest/fetch.py @@ -1,10 +1,10 @@ import gzip import io -import json import os import logging import time from datetime import datetime, timedelta +import orjson import boto3 import psycopg2 @@ -64,7 +64,7 @@ def parse_json(j, key: str = None): else: coords = None - data = json.dumps(j) + data = orjson.dumps(j).decode() row = [ location, @@ -105,7 +105,7 @@ def copy_data(cursor, key): with gzip.GzipFile(fileobj=obj.get()["Body"]) as gz: f = io.BufferedReader(gz) iterator = StringIteratorIO( - (parse_json(json.loads(line)) for line in f) + (parse_json(orjson.loads(line)) for line in f) ) query = """ COPY tempfetchdata ( @@ -132,7 +132,7 @@ def copy_file(cursor, file): with gzip.GzipFile(file) as gz: f = io.BufferedReader(gz) iterator = StringIteratorIO( - (parse_json(json.loads(line)) for line in f) + (parse_json(orjson.loads(line)) for line in f) ) try: query = get_query("fetch_copy.sql") From 99365ebb35aeffcd883c89dfe41548fb37945b58 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 25 Oct 2022 15:27:35 -0700 Subject: [PATCH 03/42] Updated check to use orjson, added line check before load --- check.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/check.py b/check.py index 919ee8c..74e241d 100644 --- a/check.py +++ b/check.py @@ -1,7 +1,7 @@ import argparse import logging import os -import json +import orjson logger = logging.getLogger(__name__) @@ -107,18 +107,19 @@ def check_realtime_key(key: str, fix: bool = False): n = len(lines) errors = [] for jdx, line in enumerate(lines): - try: - # first just try and load it - obj = json.loads(line) - except Exception as e: - errors.append(jdx) - print(f"*** Loading error on line #{jdx} (of {n}): {e}\n{line}") - try: - # then we can try to parse it - parse_json(obj) - except Exception as e: - errors.append(jdx) - print(f"*** Parsing error on line #{jdx} (of {n}): {e}\n{line}") + if len(line) > 0: + try: + # first just try and load it + obj = orjson.loads(line) + except Exception as e: + errors.append(jdx) + print(f"*** Loading error on line #{jdx} (of {n}): {e}\n{line}") + try: + # then we can try to parse it + parse_json(obj) + except Exception as e: + errors.append(jdx) + print(f"*** Parsing error on line #{jdx} (of {n}): {e}\n{line}") if len(errors) > 0 and fix: # remove the bad rows and then replace the file @@ -143,6 +144,7 @@ def check_realtime_key(key: str, fix: bool = False): keys = [log[1] for log in logs] # loop through and check each for idx, key in enumerate(keys): + print(key) # if we are resubmiting we dont care # what type of file it is if args.resubmit: @@ -159,7 +161,7 @@ def check_realtime_key(key: str, fix: bool = False): if args.download: print(f'downloading: {key}') txt = get_object(key) - fpath = os.path.expanduser(f'~/{key}') + fpath = os.path.expanduser(f'~/Downloads/{key}') os.makedirs(os.path.dirname(fpath), exist_ok=True) with open(fpath.replace('.gz',''), 'w') as f: f.write(txt) From 
33228c86e0bf0e0489b47d4706f0ab5661ec7fe4 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Wed, 9 Nov 2022 10:16:15 -0800 Subject: [PATCH 04/42] Added sensors_latest update on ingestion --- cdk/app.py | 1 + ingest/fetch_filter.sql | 13 --- ingest/fetch_ingest1.sql | 45 ---------- ingest/fetch_ingest2.sql | 24 ------ ingest/fetch_ingest3.sql | 79 ------------------ ingest/fetch_ingest4.sql | 19 ----- ingest/fetch_ingest5.sql | 35 -------- ingest/fetch_ingest6.sql | 27 ------ ingest/fetch_ingest7.sql | 151 ---------------------------------- ingest/fetch_ingest_full.sql | 64 +++++++++----- ingest/fetch_staging.sql | 6 +- ingest/lcs_ingest_full.sql | 22 +---- ingest/lcs_ingest_nodes.sql | 28 ------- ingest/lcs_ingest_sensors.sql | 114 ------------------------- ingest/lcs_ingest_systems.sql | 53 ------------ ingest/lcs_meas_ingest.sql | 54 +++++++++++- ingest/lcs_staging.sql | 8 +- 17 files changed, 107 insertions(+), 636 deletions(-) delete mode 100644 ingest/fetch_filter.sql delete mode 100644 ingest/fetch_ingest1.sql delete mode 100644 ingest/fetch_ingest2.sql delete mode 100644 ingest/fetch_ingest3.sql delete mode 100644 ingest/fetch_ingest4.sql delete mode 100644 ingest/fetch_ingest5.sql delete mode 100644 ingest/fetch_ingest6.sql delete mode 100644 ingest/fetch_ingest7.sql delete mode 100644 ingest/lcs_ingest_nodes.sql delete mode 100644 ingest/lcs_ingest_sensors.sql delete mode 100644 ingest/lcs_ingest_systems.sql diff --git a/cdk/app.py b/cdk/app.py index 8318018..d71236b 100644 --- a/cdk/app.py +++ b/cdk/app.py @@ -19,6 +19,7 @@ app = aws_cdk.App() + ingest = LambdaIngestStack( app, f"openaq-ingest-{settings.ENV}", diff --git a/ingest/fetch_filter.sql b/ingest/fetch_filter.sql deleted file mode 100644 index 156f6ad..0000000 --- a/ingest/fetch_filter.sql +++ /dev/null @@ -1,13 +0,0 @@ -DELETE FROM tempfetchdata -WHERE -datetime <= ( - SELECT max(range_end) - FROM timescaledb_information.chunks - WHERE - hypertable_name IN ('rollups', 'measurements') - AND is_compressed -); -DELETE FROM tempfetchdata WHERE datetime > now(); -DELETE FROM tempfetchdata WHERE datetime < (SELECT max(datetime) - '2 days'::interval from tempfetchdata) -; -SELECT min(datetime), max(datetime) FROM tempfetchdata; \ No newline at end of file diff --git a/ingest/fetch_ingest1.sql b/ingest/fetch_ingest1.sql deleted file mode 100644 index 12ef519..0000000 --- a/ingest/fetch_ingest1.sql +++ /dev/null @@ -1,45 +0,0 @@ -CREATE TEMP TABLE IF NOT EXISTS tempfetchdata_sensors AS -WITH t AS ( -SELECT DISTINCT - location as site_name, - unit as units, - parameter as measurand, - country, - city, - jsonb_merge_agg(data) as data, - source_name, - coords::geometry as geom, - source_type, - mobile as ismobile, - avpd_unit, - avpd_value, - coords::geometry as cgeom, - NULL::int as sensor_nodes_id, - null::int as sensor_systems_id, - null::int as measurands_id, - null::int as sensors_id, - null::jsonb as node_metadata, - null::jsonb as sensor_metadata, - array_agg(tfdid) as tfdids -FROM tempfetchdata -GROUP BY - location, - unit, - parameter, - country, - city, - coords, - source_type, - source_name, - mobile, - avpd_unit, - avpd_value, - sensor_nodes_id, - sensor_systems_id, - measurands_id, - sensors_id, - node_metadata, - sensor_metadata -) -SELECT row_number() over () as tfsid, * FROM t; -CREATE INDEX ON tempfetchdata_sensors (tfsid); \ No newline at end of file diff --git a/ingest/fetch_ingest2.sql b/ingest/fetch_ingest2.sql deleted file mode 100644 index 23beb0f..0000000 --- a/ingest/fetch_ingest2.sql +++ /dev/null @@ 
-1,24 +0,0 @@ --- Cleanup fields - -UPDATE tempfetchdata_sensors t SET -geom = NULL WHERE st_x(geom) = 0 and st_y(geom) =0; - -UPDATE tempfetchdata_sensors SET units = 'µg/m³' -WHERE units IN ('µg/m��','��g/m³'); - -UPDATE tempfetchdata_sensors SET -node_metadata = - jsonb_strip_nulls( - COALESCE(data, '{}'::jsonb) - || - jsonb_build_object( - 'source_type', - 'government', - 'origin', - 'openaq' - ) - ), -sensor_metadata = jsonb_strip_nulls(jsonb_build_object( - 'data_averaging_period_seconds', avpd_value * 3600 - )) -; \ No newline at end of file diff --git a/ingest/fetch_ingest3.sql b/ingest/fetch_ingest3.sql deleted file mode 100644 index 1a65a4d..0000000 --- a/ingest/fetch_ingest3.sql +++ /dev/null @@ -1,79 +0,0 @@ -/* -CREATE TEMP TABLE tempfetchdata_nodes AS -SELECT * FROM (SELECT - first_notnull(site_name) as site_name, - first_notnull(source_name) as source_name, - first_notnull(country) as country, - first_notnull(city) as city, - --jsonb_merge_agg(node_metadata) as metadata, - first_notnull(ismobile) as ismobile, - null::int as sensor_nodes_id, - null::int as sensor_systems_id, - st_centroid(st_collect(geom)) as geom, - array_agg(tfsid) as tfsids -FROM tempfetchdata_sensors -WHERE geom IS NOT NULL -GROUP BY - sensor_nodes_id,st_snaptogrid(geom, .0001) -) AS wgeom -UNION ALL -SELECT * FROM -(SELECT - site_name, - source_name, - first_notnull(country) as country, - first_notnull(city) as city, - --jsonb_merge_agg(node_metadata) as metadata, - first_notnull(ismobile) as ismobile, - null::int as sensor_nodes_id, - null::int as sensor_systems_id, - null::geometry as geom, - array_agg(tfsid) as tfsids -FROM tempfetchdata_sensors -WHERE geom IS NULL -AND site_name is not null -and source_name is not null -GROUP BY - site_name, source_name, sensor_nodes_id -) as nogeom -; -*/ - -CREATE TEMP TABLE tempfetchdata_nodes AS -SELECT * FROM (SELECT - site_name, - source_name, - country, - city, - node_metadata as metadata, - ismobile, - null::int as sensor_nodes_id, - null::int as sensor_systems_id, - st_centroid(st_collect(geom)) as geom, - array_agg(tfsid) as tfsids -FROM tempfetchdata_sensors -WHERE geom IS NOT NULL -GROUP BY - 1,2,3,4,5,6,7,8,st_snaptogrid(geom, .0001) -) AS wgeom -UNION ALL -SELECT * FROM -(SELECT - site_name, - source_name, - country, - city, - node_metadata as metadata, - ismobile, - null::int as sensor_nodes_id, - null::int as sensor_systems_id, - null::geometry as geom, - array_agg(tfsid) as tfsids -FROM tempfetchdata_sensors -WHERE geom IS NULL -AND site_name is not null -and source_name is not null -GROUP BY - 1,2,3,4,5,6,7,8,9 -) as nogeom -; diff --git a/ingest/fetch_ingest4.sql b/ingest/fetch_ingest4.sql deleted file mode 100644 index 6c6ae00..0000000 --- a/ingest/fetch_ingest4.sql +++ /dev/null @@ -1,19 +0,0 @@ --- Lookup Node Ids - -UPDATE tempfetchdata_nodes t -SET sensor_nodes_id = sn.sensor_nodes_id FROM -sensor_nodes sn -WHERE t.geom is not null -AND st_dwithin(sn.geom, t.geom, .0001) -AND origin='OPENAQ'; - -UPDATE tempfetchdata_nodes t -SET sensor_nodes_id = sn.sensor_nodes_id FROM -sensor_nodes sn -WHERE -t.sensor_nodes_id is null AND -t.site_name is not null -and t.source_name is not null -and t.site_name = sn.site_name -and t.source_name=sn.source_name -and origin='OPENAQ'; \ No newline at end of file diff --git a/ingest/fetch_ingest5.sql b/ingest/fetch_ingest5.sql deleted file mode 100644 index 644dfb5..0000000 --- a/ingest/fetch_ingest5.sql +++ /dev/null @@ -1,35 +0,0 @@ --- Update any records that have changed - -UPDATE sensor_nodes s SET - 
site_name = COALESCE(t.site_name, s.site_name), - source_name = COALESCE(t.source_name, s.source_name), - city = COALESCE(t.city, s.city), - country = COALESCE(t.country, s.country), - ismobile = COALESCE(t.ismobile, s.ismobile), - metadata = COALESCE(s.metadata, '{}'::jsonb) || t.metadata, - geom = COALESCE(t.geom, s.geom) -FROM tempfetchdata_nodes t -WHERE t.sensor_nodes_id = s.sensor_nodes_id AND -( - (s.geom IS NULL and t.geom IS NOT NULL) -OR - - ROW( - t.sensor_nodes_id, - t.ismobile, - t.site_name, - t.source_name, - t.city, - t.country, - t.metadata - ) IS DISTINCT FROM ( - s.sensor_nodes_id, - s.ismobile, - s.site_name, - s.source_name, - s.city, - s.country, - s.metadata - ) -) -; diff --git a/ingest/fetch_ingest6.sql b/ingest/fetch_ingest6.sql deleted file mode 100644 index 2025749..0000000 --- a/ingest/fetch_ingest6.sql +++ /dev/null @@ -1,27 +0,0 @@ --- Create new nodes where they don't exist -WITH sn AS ( -INSERT INTO sensor_nodes ( - site_name, - metadata, - geom, - source_name, - city, - country, - ismobile -) -SELECT - site_name, - metadata, - geom, - source_name, - city, - country, - ismobile -FROM tempfetchdata_nodes t -WHERE t.sensor_nodes_id is NULL -RETURNING * -) -UPDATE tempfetchdata_nodes tf SET sensor_nodes_id = sn.sensor_nodes_id -FROM sn WHERE tf.sensor_nodes_id is null -and row(tf.site_name, tf.geom, tf.source_name) is not distinct -from row(sn.site_name, sn.geom, sn.source_name); \ No newline at end of file diff --git a/ingest/fetch_ingest7.sql b/ingest/fetch_ingest7.sql deleted file mode 100644 index 6df2009..0000000 --- a/ingest/fetch_ingest7.sql +++ /dev/null @@ -1,151 +0,0 @@ --- Get sensor systems - - -UPDATE tempfetchdata_nodes t -SET sensor_systems_id = ss.sensor_systems_id FROM -sensor_systems ss -WHERE t.sensor_nodes_id = ss.sensor_nodes_id; - --- Add any rows that did not get an id --- into the rejects table and then delete -INSERT INTO rejects -SELECT clock_timestamp(), 'sensor_nodes', to_jsonb(tf) FROM -tempfetchdata_nodes tf WHERE sensor_nodes_id IS NULL; -DELETE FROM tempfetchdata_nodes WHERE sensor_nodes_id IS NULL; - --- create sensor systems that don't exist -WITH ss AS ( -INSERT INTO sensor_systems (sensor_nodes_id) -SELECT DISTINCT sensor_nodes_id FROM tempfetchdata_nodes t -WHERE t.sensor_systems_id is NULL AND t.sensor_nodes_id IS NOT NULL -RETURNING * -) UPDATE tempfetchdata_nodes tf -SET sensor_systems_id = ss.sensor_systems_id -FROM ss WHERE tf.sensor_nodes_id=ss.sensor_nodes_id -and tf.sensor_systems_id is null; - --- Add any rows that did not get an id --- into the rejects table and then delete -INSERT INTO rejects -SELECT clock_timestamp(), 'sensor_systems', to_jsonb(tf) FROM -tempfetchdata_nodes tf WHERE sensor_systems_id IS NULL; -DELETE FROM tempfetchdata_nodes WHERE sensor_systems_id IS NULL; - --- merge sensor node / system ids back to sensors table -UPDATE tempfetchdata_sensors ts SET - sensor_nodes_id = tn.sensor_nodes_id, - sensor_systems_id = tn.sensor_systems_id -FROM - tempfetchdata_nodes tn -WHERE - ts.tfsid = ANY(tn.tfsids); - - --- add any measurands that don't exist -UPDATE tempfetchdata_sensors t SET measurands_id= m.measurands_id FROM -measurands m -WHERE t.measurand = m.measurand AND t.units = m.units; - -WITH m AS ( -INSERT INTO measurands (measurand, units) -SELECT DISTINCT measurand, units FROM tempfetchdata_sensors t -WHERE t.measurands_id is NULL -RETURNING * -) UPDATE tempfetchdata_sensors tf SET measurands_id = m.measurands_id -FROM m WHERE tf.measurand=m.measurand -and tf.units=m.units and tf.measurands_id 
is null; - --- get cleaned sensors table -CREATE TEMP TABLE IF NOT EXISTS tempfetchdata_sensors_clean AS -SELECT - null::int as sensors_id, - sensor_nodes_id, - sensor_systems_id, - measurands_id, - jsonb_merge_agg(sensor_metadata) as metadata, - array_merge_agg(tfdids) as tfdids -FROM tempfetchdata_sensors -GROUP BY 1,2,3,4; - - --- get sensor id -UPDATE tempfetchdata_sensors_clean t -SET sensors_id = s.sensors_id -FROM sensors s -WHERE t.sensor_systems_id = s.sensor_systems_id -AND t.measurands_id = s.measurands_id -; - --- Add any rows that did not get an id --- into the rejects table and then delete -INSERT INTO rejects -SELECT clock_timestamp() -, 'sensors' -, to_jsonb(tf) -FROM tempfetchdata_sensors_clean tf -WHERE sensor_systems_id IS NULL -OR measurands_id IS NULL; - -DELETE -FROM tempfetchdata_sensors_clean -WHERE sensor_systems_id IS NULL -OR measurands_id IS NULL; - --- add any sensors that don't exist -WITH s AS ( - INSERT INTO sensors ( - sensor_systems_id, - measurands_id, - metadata - ) - SELECT - sensor_systems_id, - measurands_id, - metadata - FROM - tempfetchdata_sensors_clean tf - WHERE - tf.sensors_id IS NULL - RETURNING * -) UPDATE tempfetchdata_sensors_clean tfc - SET - sensors_id = s.sensors_id - FROM s - WHERE - tfc.sensors_id IS NULL - AND - s.sensor_systems_id = tfc.sensor_systems_id - AND - s.measurands_id = tfc.measurands_id -; - -UPDATE tempfetchdata t -SET sensors_id = ts.sensors_id -FROM tempfetchdata_sensors_clean ts -WHERE t.tfdid = ANY(ts.tfdids); - --- Add any rows that did not get an id into --- the rejects table and then delete -INSERT INTO rejects -SELECT clock_timestamp() -, 'sensors' -, to_jsonb(tf) -FROM tempfetchdata tf -WHERE sensors_id IS NULL; - -DELETE -FROM tempfetchdata -WHERE sensors_id IS NULL; - -INSERT INTO measurements (sensors_id, datetime, value) -SELECT sensors_id, datetime, value -FROM tempfetchdata -ON CONFLICT DO NOTHING; - - -UPDATE fetchlogs -SET completed_datetime=clock_timestamp() -, last_message = NULL -- reset any previous error -WHERE key IN (SELECT key FROM ingestfiles); - -SELECT min(datetime), max(datetime) FROM tempfetchdata; diff --git a/ingest/fetch_ingest_full.sql b/ingest/fetch_ingest_full.sql index 089c647..ad5013c 100644 --- a/ingest/fetch_ingest_full.sql +++ b/ingest/fetch_ingest_full.sql @@ -28,24 +28,6 @@ SELECT now() INTO __process_start; -- File fetch_filter.sql -- --------------------------- --- Note: I am including this because it already existed --- I am not sure why its here --- update: it is likely here because we cannot insert data into --- compressed partitions - -WITH deletes AS ( - DELETE - FROM tempfetchdata - WHERE datetime <= ( - SELECT COALESCE(max(range_end), '1970-01-01'::timestamp) - FROM timescaledb_information.chunks - WHERE hypertable_name IN ('rollups', 'measurements') - AND is_compressed - ) - RETURNING 1) -SELECT COUNT(1) INTO __deleted_timescaledb -FROM deletes; - -- This makes sense though we should track in case its systemic WITH deletes AS ( DELETE @@ -205,6 +187,7 @@ GROUP BY ) as nogeom ; + ------------- -- File #4 -- ------------- @@ -489,6 +472,11 @@ DELETE FROM tempfetchdata WHERE sensors_id IS NULL; +--DELETE +--FROM measurements m +--USING tempfetchdata t +--WHERE m.datetime = t.datetime +--AND m.sensors_id = t.sensors_id; WITH inserts AS ( INSERT INTO measurements (sensors_id, datetime, value) @@ -497,11 +485,12 @@ WITH inserts AS ( , value FROM tempfetchdata ON CONFLICT DO NOTHING - RETURNING sensors_id, datetime + RETURNING sensors_id, datetime, value ), inserted as ( - 
INSERT INTO temp_inserted_measurements (sensors_id, datetime) + INSERT INTO temp_inserted_measurements (sensors_id, datetime, value) SELECT sensors_id , datetime + , value FROM inserts RETURNING sensors_id, datetime ) @@ -514,6 +503,41 @@ INTO __inserted_start_datetime FROM inserted; +-- Now we can use those temp_inserted_measurements to update the cache tables +INSERT INTO sensors_latest ( + sensors_id + , datetime + , value + ) +---- identify the row that has the latest value +WITH numbered AS ( + SELECT sensors_id + , datetime + , value + , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) as rn + FROM temp_inserted_measurements +), latest AS ( +---- only insert those rows + SELECT sensors_id + , datetime + , value + FROM numbered + WHERE rn = 1 +) +SELECT l.sensors_id +, l.datetime +, l.value +FROM latest l +LEFT JOIN sensors_latest sl ON (l.sensors_id = sl.sensors_id) +WHERE sl.sensors_id IS NULL +OR l.datetime > sl.datetime +ON CONFLICT (sensors_id) DO UPDATE +SET datetime = EXCLUDED.datetime +, value = EXCLUDED.value +, modified_on = now() +--, fetchlogs_id = EXCLUDED.fetchlogs_id +; + -- No longer going to manage the fetch log in this way -- WITH updates AS ( -- UPDATE fetchlogs diff --git a/ingest/fetch_staging.sql b/ingest/fetch_staging.sql index 5ea6e43..d6595f6 100644 --- a/ingest/fetch_staging.sql +++ b/ingest/fetch_staging.sql @@ -26,6 +26,8 @@ CREATE TEMP TABLE IF NOT EXISTS ingestfiles( -- this is to deal with the overlap that we see in the -- incoming files CREATE TEMP TABLE IF NOT EXISTS temp_inserted_measurements ( - sensors_id int, - datetime timestamptz + sensors_id int + , datetime timestamptz + , value double precision + , fetchlogs_id int ); diff --git a/ingest/lcs_ingest_full.sql b/ingest/lcs_ingest_full.sql index 612cc36..73d8266 100644 --- a/ingest/lcs_ingest_full.sql +++ b/ingest/lcs_ingest_full.sql @@ -183,27 +183,7 @@ from measurands WHERE ms_sensors.measurand=measurands.measurand and ms_sensors.units=measurands.units; --- Removed the following because it has the ids hard coded in --- if we want to continue to filter these out we should do it at the fetcher -------------------------------------------------------------------------------------------------------------- --- UPDATE ms_sensors -- --- SET measurands_id = 10 -- --- WHERE ms_sensors.measurand='ozone' -- --- AND ms_sensors.units='ppm'; -- --- -- --- UPDATE ms_sensors SET measurands_id = 126 WHERE measurands_id is null and ms_sensors.measurand='um010'; -- --- UPDATE ms_sensors SET measurands_id = 130 WHERE measurands_id is null and ms_sensors.measurand='um025'; -- --- UPDATE ms_sensors SET measurands_id = 135 WHERE measurands_id is null and ms_sensors.measurand='um100'; -- --- UPDATE ms_sensors SET measurands_id = 19 WHERE measurands_id is null and ms_sensors.measurand='pm1'; -- --- UPDATE ms_sensors SET measurands_id = 2 WHERE measurands_id is null and ms_sensors.measurand='pm25'; -- --- UPDATE ms_sensors SET measurands_id = 1 WHERE measurands_id is null and ms_sensors.measurand='pm10'; -- --- -- --- DELETE -- --- FROM ms_sensors -- --- WHERE ingest_id ~* 'purple' -- --- AND measurands_id is null -- --- AND measurand in ('um003','um050','um005'); -- -------------------------------------------------------------------------------------------------------------- + WITH r AS ( INSERT INTO rejects (t, tbl,r,fetchlogs_id) SELECT diff --git a/ingest/lcs_ingest_nodes.sql b/ingest/lcs_ingest_nodes.sql deleted file mode 100644 index f077150..0000000 --- a/ingest/lcs_ingest_nodes.sql +++ /dev/null 
@@ -1,28 +0,0 @@ -DELETE FROM ms_sensornodes WHERE ms_sensornodes.ingest_id IS NULL; -DELETE FROM ms_sensorsystems WHERE ms_sensorsystems.ingest_id is null or ingest_sensor_nodes_id IS NULL; -DELETE FROM ms_sensors WHERE ms_sensors.ingest_id is null OR ingest_sensor_systems_id IS NULL; - -SELECT notify('After Deletes'); - -UPDATE ms_sensornodes -SET sensor_nodes_id = sensor_nodes.sensor_nodes_id -FROM sensor_nodes -WHERE -sensor_nodes.source_name = ms_sensornodes.source_name -AND -sensor_nodes.source_id = ms_sensornodes.ingest_id; - - -INSERT INTO sensor_nodes (site_name, source_name, ismobile, geom, metadata, source_id) -SELECT site_name, source_name, ismobile, geom, metadata, ingest_id FROM -ms_sensornodes -ON CONFLICT (source_name, source_id) DO -UPDATE - SET - site_name=coalesce(EXCLUDED.site_name,sensor_nodes.site_name), - ismobile=coalesce(EXCLUDED.ismobile,sensor_nodes.ismobile), - geom=coalesce(EXCLUDED.geom,sensor_nodes.geom), - metadata=sensor_nodes.metadata || EXCLUDED.metadata -; - -SELECT notify('After nodes'); diff --git a/ingest/lcs_ingest_sensors.sql b/ingest/lcs_ingest_sensors.sql deleted file mode 100644 index f0750e2..0000000 --- a/ingest/lcs_ingest_sensors.sql +++ /dev/null @@ -1,114 +0,0 @@ - -UPDATE ms_sensorsystems -SET sensor_systems_id = sensor_systems.sensor_systems_id -FROM sensor_systems -WHERE ms_sensorsystems.sensor_systems_id IS NULL -AND -ms_sensorsystems.sensor_nodes_id=sensor_systems.sensor_nodes_id -AND -ms_sensorsystems.ingest_id=sensor_systems.source_id -; - -INSERT INTO rejects (t, tbl,r) SELECT - now(), - 'ms_sensorsystems', - to_jsonb(ms_sensorsystems) -FROM ms_sensorsystems WHERE sensor_systems_id IS NULL; - -UPDATE ms_sensors -SET sensor_systems_id = ms_sensorsystems.sensor_systems_id -FROM ms_sensorsystems WHERE -ms_sensors.ingest_sensor_systems_id = ms_sensorsystems.ingest_id; - -INSERT INTO rejects (t, tbl,r) SELECT - now(), - 'ms_sensors', - to_jsonb(ms_sensors) -FROM ms_sensors WHERE sensor_systems_id IS NULL; - - -UPDATE ms_sensors -SET sensors_id = sensors.sensors_id -FROM sensors -WHERE -sensors.sensor_systems_id=ms_sensors.sensor_systems_id -AND -sensors.source_id = ms_sensors.ingest_id; - -SELECT count(*) from measurands; - -/* -INSERT INTO measurands (measurand, units) -SELECT DISTINCT measurand, units FROM ms_sensors -ON CONFLICT DO NOTHING; - -SELECT count(*) from measurands; -*/ - -UPDATE ms_sensors -SET measurands_id = measurands.measurands_id -from measurands -WHERE ms_sensors.measurand=measurands.measurand -and ms_sensors.units=measurands.units; - -UPDATE ms_sensors -SET measurands_id = 10 -WHERE -ms_sensors.measurand='ozone' -AND -ms_sensors.units='ppm'; - -UPDATE ms_sensors SET measurands_id = 126 WHERE measurands_id is null and ms_sensors.measurand='um010'; -UPDATE ms_sensors SET measurands_id = 130 WHERE measurands_id is null and ms_sensors.measurand='um025'; -UPDATE ms_sensors SET measurands_id = 135 WHERE measurands_id is null and ms_sensors.measurand='um100'; -UPDATE ms_sensors SET measurands_id = 19 WHERE measurands_id is null and ms_sensors.measurand='pm1'; -UPDATE ms_sensors SET measurands_id = 2 WHERE measurands_id is null and ms_sensors.measurand='pm25'; -UPDATE ms_sensors SET measurands_id = 1 WHERE measurands_id is null and ms_sensors.measurand='pm10'; - -DELETE FROM ms_sensors WHERE ingest_id ~* 'purple' AND measurands_id is null AND measurand in ('um003','um050','um005'); - -INSERT INTO rejects (t, tbl,r) SELECT - now(), - 'ms_sensors no measurand', - to_jsonb(ms_sensors) -FROM ms_sensors WHERE measurands_id IS 
NULL; - -INSERT INTO sensors ( - source_id -, sensor_systems_id -, measurands_id -, metadata) -SELECT ingest_id -, sensor_systems_id -, measurands_id -, metadata -FROM ms_sensors -WHERE measurands_id is not null -AND sensor_systems_id is not null -GROUP BY ingest_id -, sensor_systems_id -, measurands_id -, metadata -ON CONFLICT (sensor_systems_id, measurands_id, source_id) DO -UPDATE SET - metadata=sensors.metadata || EXCLUDED.metadata -; - - -SELECT notify('After sensors'); - - -UPDATE ms_sensors -SET sensors_id = sensors.sensors_id -FROM sensors -WHERE -sensors.sensor_systems_id=ms_sensors.sensor_systems_id -AND -sensors.source_id = ms_sensors.ingest_id; - - -INSERT INTO rejects (tbl,r) -SELECT - 'ms_sensors', - to_jsonb(ms_sensors) -FROM ms_sensors WHERE sensors_id IS NULL; diff --git a/ingest/lcs_ingest_systems.sql b/ingest/lcs_ingest_systems.sql deleted file mode 100644 index 4d0b592..0000000 --- a/ingest/lcs_ingest_systems.sql +++ /dev/null @@ -1,53 +0,0 @@ - --- fill in any new sensor_nodes_id -UPDATE ms_sensornodes -SET sensor_nodes_id = sensor_nodes.sensor_nodes_id -FROM sensor_nodes -WHERE -ms_sensornodes.sensor_nodes_id is null -AND -sensor_nodes.source_name = ms_sensornodes.source_name -AND -sensor_nodes.source_id = ms_sensornodes.ingest_id; - --- log anything we were not able to get an id for -INSERT INTO rejects (t, tbl,r) SELECT - now(), - 'ms_sensornodes', - to_jsonb(ms_sensornodes) -FROM ms_sensornodes WHERE sensor_nodes_id IS NULL; - - -UPDATE ms_sensorsystems -SET sensor_nodes_id = ms_sensornodes.sensor_nodes_id -FROM ms_sensornodes WHERE -ms_sensorsystems.ingest_sensor_nodes_id = ms_sensornodes.ingest_id; - -UPDATE ms_sensorsystems -SET sensor_systems_id = sensor_systems.sensor_systems_id -FROM sensor_systems -WHERE -sensor_systems.sensor_nodes_id = ms_sensorsystems.sensor_nodes_id -AND -sensor_systems.source_id = ms_sensorsystems.ingest_id; - --- log anything we were not able to get an id for -INSERT INTO rejects (t, tbl,r) SELECT - now(), - 'ms_sensorsystems', - to_jsonb(ms_sensorsystems) -FROM ms_sensorsystems WHERE sensor_nodes_id IS NULL; - -SELECT notify('immediately before insert on systems'); - -INSERT INTO sensor_systems (sensor_nodes_id, source_id, metadata) -SELECT sensor_nodes_id, ingest_id, metadata -FROM ms_sensorsystems -WHERE sensor_nodes_id IS NOT NULL -ON CONFLICT (sensor_nodes_id, source_id) -DO -UPDATE SET - metadata=sensor_systems.metadata || EXCLUDED.metadata -; - -SELECT notify('After systems'); diff --git a/ingest/lcs_meas_ingest.sql b/ingest/lcs_meas_ingest.sql index 0bdabc9..6754e70 100644 --- a/ingest/lcs_meas_ingest.sql +++ b/ingest/lcs_meas_ingest.sql @@ -107,7 +107,6 @@ FROM r; -- , -99 -- , generate_series(now() - '3day'::interval, current_date, '1hour'::interval); - WITH inserts AS ( INSERT INTO measurements ( sensors_id, @@ -125,11 +124,14 @@ INSERT INTO measurements ( FROM meas WHERE sensors_id IS NOT NULL ON CONFLICT DO NOTHING -RETURNING sensors_id, datetime +RETURNING sensors_id, datetime, value, lat, lon ), inserted as ( - INSERT INTO temp_inserted_measurements (sensors_id, datetime) + INSERT INTO temp_inserted_measurements (sensors_id, datetime, value, lat, lon) SELECT sensors_id , datetime + , value + , lat + , lon FROM inserts RETURNING sensors_id, datetime ) @@ -141,6 +143,52 @@ INTO __inserted_start_datetime , __inserted_measurements FROM inserted; +-- Now we can use those temp_inserted_measurements to update the cache tables +INSERT INTO sensors_latest ( + sensors_id + , datetime + , value + , lat + , lon + ) +---- identify 
the row that has the latest value +WITH numbered AS ( + SELECT sensors_id + , datetime + , value + , lat + , lon + , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) as rn + FROM temp_inserted_measurements +), latest AS ( +---- only insert those rows + SELECT sensors_id + , datetime + , value + , lat + , lon + FROM numbered + WHERE rn = 1 +) +SELECT l.sensors_id +, l.datetime +, l.value +, l.lat +, l.lon +FROM latest l +LEFT JOIN sensors_latest sl ON (l.sensors_id = sl.sensors_id) +WHERE sl.sensors_id IS NULL +OR l.datetime > sl.datetime +ON CONFLICT (sensors_id) DO UPDATE +SET datetime = EXCLUDED.datetime +, value = EXCLUDED.value +, lat = EXCLUDED.lat +, lon = EXCLUDED.lon +, modified_on = now() +--, fetchlogs_id = EXCLUDED.fetchlogs_id +; + + --Update the export queue/logs to export these records --wrap it in a block just in case the database does not have this module installed --we subtract the second because the data is assumed to be time ending diff --git a/ingest/lcs_staging.sql b/ingest/lcs_staging.sql index 6beb76c..147fcb6 100644 --- a/ingest/lcs_staging.sql +++ b/ingest/lcs_staging.sql @@ -52,6 +52,10 @@ CREATE TEMP TABLE IF NOT EXISTS keys ( -- this is to deal with the overlap that we see in the -- incoming files CREATE TEMP TABLE IF NOT EXISTS temp_inserted_measurements ( - sensors_id int, - datetime timestamptz + sensors_id int + , datetime timestamptz + , value double precision + , lat double precision + , lon double precision + , fetchlogs_id int ); From 10076d1504c0b84446e701684c93399d0e4b9451 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Thu, 17 Nov 2022 06:46:32 -0800 Subject: [PATCH 05/42] Updated to use new split_ingest_id method to parse ingest_id --- ingest/lcs_meas_ingest.sql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ingest/lcs_meas_ingest.sql b/ingest/lcs_meas_ingest.sql index 6754e70..d5c6442 100644 --- a/ingest/lcs_meas_ingest.sql +++ b/ingest/lcs_meas_ingest.sql @@ -36,8 +36,8 @@ WITH nodes AS ( INSERT INTO sensor_nodes ( source_name , source_id ) -SELECT split_part(ingest_id, '-', 1) as source_name -, split_part(ingest_id, '-', 2) as source_id +SELECT split_ingest_id(ingest_id, 1) as source_name +, split_ingest_id(ingest_id, 2) as source_id FROM meas WHERE sensors_id IS NULL GROUP BY 1,2 @@ -56,9 +56,9 @@ ON CONFLICT DO NOTHING; -- this method depends on us having a match for the parameter WITH sen AS ( SELECT ingest_id -, split_part(ingest_id, '-', 1) as source_name -, split_part(ingest_id, '-', 2) as source_id -, split_part(ingest_id, '-', 3) as parameter +, split_ingest_id(ingest_id, 1) as source_name +, split_ingest_id(ingest_id, 2) as source_id +, split_ingest_id(ingest_id, 3) as parameter FROM meas WHERE sensors_id IS NULL GROUP BY 1,2,3,4) From d9e728b8c5df5bb9f88a72fee02d2e9d33b727e6 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Fri, 18 Nov 2022 17:46:37 -0800 Subject: [PATCH 06/42] Testing out a new way to handle the timestamps on ingest Passing datetime strings to the ingest method is causing it to run about 500-600x slower than if we passed numeric timestamps. This is primarily because of try/catch method we have of determining string vs numeric. I modified the staging tables to accept a string for the datetime and then only convert the numeric timestamps. This makes the string method on par with the numeric method. 
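The gist of the numeric-only conversion is roughly the following (an illustrative sketch; normalize_datetime is a made-up helper name, the real logic lives inline in ingest/lcs.py):

    from datetime import datetime, timezone

    def normalize_datetime(dt: str) -> str:
        # Epoch timestamps (13 digits treated as milliseconds) are converted
        # to ISO strings here; anything else is passed through unchanged and
        # cast to timestamptz inside the ingest SQL.
        if dt.isnumeric():
            ts = int(dt) / 1000.0 if len(dt) == 13 else int(dt)
            return datetime.fromtimestamp(ts, timezone.utc).isoformat()
        return dt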
Since we can control the shape of the data coming to this part I would recommend that we come up with a standard format and use the try/catch correctly, and not as an if/then. --- ingest/lcs.py | 17 ++++++++--------- ingest/lcs_meas_ingest.sql | 10 +++++----- ingest/lcs_meas_staging.sql | 3 ++- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/ingest/lcs.py b/ingest/lcs.py index 0e9596f..824c115 100644 --- a/ingest/lcs.py +++ b/ingest/lcs.py @@ -423,17 +423,16 @@ def get_measurements(key, fetchlogsId): row.insert(4, None) if row[0] == "" or row[0] is None: continue - dt = row[2] + # dt = row[2] try: - dt = datetime.fromtimestamp(int(dt), timezone.utc) + if row[1].isnumeric(): + dt = dateparser.parse(row[2]).replace(tzinfo=timezone.utc) + row[2] = dt.isoformat() except Exception: - try: - dt = dateparser.parse(dt).replace(tzinfo=timezone.utc) - except Exception: - logger.warning(f"Exception in parsing date for {dt} {Exception}") - row[2] = dt.isoformat() - # addd the log id for tracing purposes + pass + + # add the log id for tracing purposes row.insert(5, fetchlogsId) ret.append(row) logger.info("get_measurements:csv: %s; size: %s; rows: %s; fetching: %0.4f; reading: %0.4f", key, len(content)/1000, len(ret), fetch_time, time() - start) @@ -494,7 +493,7 @@ def load_measurements_db(limit=250, ascending: bool = False): , key , last_modified FROM fetchlogs - WHERE key~E'^lcs-etl-pipeline/measures/.*\\.csv' + WHERE key~E'^(lcs-etl-pipeline|uploaded)/measures/.*\\.csv' AND completed_datetime is null ORDER BY last_modified {order} nulls last LIMIT %s diff --git a/ingest/lcs_meas_ingest.sql b/ingest/lcs_meas_ingest.sql index d5c6442..91c7d3d 100644 --- a/ingest/lcs_meas_ingest.sql +++ b/ingest/lcs_meas_ingest.sql @@ -15,10 +15,10 @@ WHERE ingest_id IS NULL OR datetime is NULL OR value IS NULL; -DELETE -FROM meas -WHERE datetime < '2018-01-01'::timestamptz -OR datetime>now(); +--DELETE +--FROM meas +--WHERE datetime < '2018-01-01'::timestamptz +--OR datetime>now(); DELETE FROM rejects @@ -117,7 +117,7 @@ INSERT INTO measurements ( ) SELECT --DISTINCT sensors_id, - datetime, + datetime::timestamptz, value, lon, lat diff --git a/ingest/lcs_meas_staging.sql b/ingest/lcs_meas_staging.sql index 3f8caf8..ce2cc49 100644 --- a/ingest/lcs_meas_staging.sql +++ b/ingest/lcs_meas_staging.sql @@ -2,7 +2,8 @@ CREATE TEMP TABLE meas ( ingest_id text, sensors_id int, value float, - datetime timestamptz, + --datetime timestamptz, + datetime text, lon float, lat float, fetchlogs_id int From 492096bcab2f45b357cd66a7092ccebcc4ded2e2 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 22 Nov 2022 11:29:30 -0800 Subject: [PATCH 07/42] Fixed timestamp 'bug', Added diagnostic data to ingest --- check.py | 18 ++++---- ingest/handler.py | 21 ++++++--- ingest/lcs.py | 87 +++++++++++++++++-------------------- ingest/lcs_meas_ingest.sql | 57 +++++++++++++++++------- ingest/lcs_meas_staging.sql | 11 ----- 5 files changed, 104 insertions(+), 90 deletions(-) delete mode 100644 ingest/lcs_meas_staging.sql diff --git a/check.py b/check.py index 74e241d..f185e1f 100644 --- a/check.py +++ b/check.py @@ -145,9 +145,16 @@ def check_realtime_key(key: str, fix: bool = False): # loop through and check each for idx, key in enumerate(keys): print(key) + if args.download: + print(f'downloading: {key}') + txt = get_object(key) + fpath = os.path.expanduser(f'~/Downloads/{key}') + os.makedirs(os.path.dirname(fpath), exist_ok=True) + with open(fpath.replace('.gz', ''), 'w') as f: + f.write(txt) # if we are resubmiting we 
dont care # what type of file it is - if args.resubmit: + elif args.resubmit: mark_success(key, reset=True, message='resubmitting') # figure out what type of file it is elif 'realtime' in key: @@ -158,15 +165,6 @@ def check_realtime_key(key: str, fix: bool = False): else: load_measurements([(args.id, key, None)]) - if args.download: - print(f'downloading: {key}') - txt = get_object(key) - fpath = os.path.expanduser(f'~/Downloads/{key}') - os.makedirs(os.path.dirname(fpath), exist_ok=True) - with open(fpath.replace('.gz',''), 'w') as f: - f.write(txt) - - # Otherwise if we set the summary flag return a daily summary of errors elif args.summary: rows = load_errors_summary(args.n) diff --git a/ingest/handler.py b/ingest/handler.py index 2bb72cc..34e5baa 100644 --- a/ingest/handler.py +++ b/ingest/handler.py @@ -47,24 +47,29 @@ def handler(event, context): ) try: + file_size = lov2["Contents"][0]["Size"] last_modified = lov2["Contents"][0]["LastModified"] except KeyError: logger.error(""" - could not get last modified time from obj + could not get info from obj """) + file_size = None last_modified = datetime.now().replace( tzinfo=timezone.utc ) cursor.execute( """ - INSERT INTO fetchlogs (key, last_modified) - VALUES(%s, %s) + INSERT INTO fetchlogs (key + , file_size + , last_modified + ) + VALUES(%s, %s, %s) ON CONFLICT (key) DO UPDATE SET last_modified=EXCLUDED.last_modified, completed_datetime=NULL RETURNING *; """, - (key, last_modified,), + (key, file_size, last_modified,), ) row = cursor.fetchone() connection.commit() @@ -123,8 +128,12 @@ def cronhandler(event, context): """ SELECT count(*) FROM fetchlogs - WHERE completed_datetime is null - AND key ~*'measures'; + WHERE key ~*'measures' + AND completed_datetime is null + AND ( + loaded_datetime IS NULL + OR loaded_datetime < now() - '1hour'::interval + ); """, ) pipeline = cursor.fetchone() diff --git a/ingest/lcs.py b/ingest/lcs.py index 824c115..88343fb 100644 --- a/ingest/lcs.py +++ b/ingest/lcs.py @@ -4,6 +4,7 @@ import dateparser import pytz import orjson +import uuid import csv from time import time from urllib.parse import unquote_plus @@ -423,16 +424,23 @@ def get_measurements(key, fetchlogsId): row.insert(4, None) if row[0] == "" or row[0] is None: continue - # dt = row[2] + dt = row[2] try: - if row[1].isnumeric(): - dt = dateparser.parse(row[2]).replace(tzinfo=timezone.utc) + if dt.isnumeric(): + if len(dt) == 13: + dt = datetime.fromtimestamp(int(dt)/1000.0, timezone.utc) + else: + dt = datetime.fromtimestamp(int(dt), timezone.utc) row[2] = dt.isoformat() except Exception: - pass + try: + dt = dateparser.parse(dt).replace(tzinfo=timezone.utc) + except Exception: + logger.warning(f"Exception in parsing date for {dt} {Exception}") - # add the log id for tracing purposes + #row[2] = dt.isoformat() + # addd the log id for tracing purposes row.insert(5, fetchlogsId) ret.append(row) logger.info("get_measurements:csv: %s; size: %s; rows: %s; fetching: %0.4f; reading: %0.4f", key, len(content)/1000, len(ret), fetch_time, time() - start) @@ -487,19 +495,34 @@ def load_measurements_db(limit=250, ascending: bool = False): order = 'ASC' if ascending else 'DESC' conn = psycopg2.connect(settings.DATABASE_WRITE_URL) cur = conn.cursor() + batch_uuid = uuid.uuid4().hex + pattern = '^lcs-etl-pipeline/measures/.*\\.csv' + # pattern = '^uploaded/measures/.*\\.csv' cur.execute( f""" - SELECT fetchlogs_id - , key - , last_modified - FROM fetchlogs - WHERE key~E'^(lcs-etl-pipeline|uploaded)/measures/.*\\.csv' - AND completed_datetime is null - 
ORDER BY last_modified {order} nulls last - LIMIT %s - ; + UPDATE fetchlogs + SET loaded_datetime = CURRENT_TIMESTAMP + , jobs = jobs + 1 + , batch_uuid = %s + FROM ( + SELECT fetchlogs_id + FROM fetchlogs + WHERE key~E'{pattern}' + AND completed_datetime is null + AND ( + loaded_datetime IS NULL + OR loaded_datetime < now() - '1hour'::interval + ) + ORDER BY last_modified {order} nulls last + LIMIT %s + FOR UPDATE SKIP LOCKED + ) as q + WHERE q.fetchlogs_id = fetchlogs.fetchlogs_id + RETURNING fetchlogs.fetchlogs_id + , fetchlogs.key + , fetchlogs.last_modified; """, - (limit,), + (batch_uuid, limit,), ) rows = cur.fetchall() # keys = [r[0] for r in rows] @@ -550,43 +573,11 @@ def load_measurements(rows): mrows = cursor.rowcount status = cursor.statusmessage logger.debug(f"COPY Rows: {mrows} Status: {status}") - cursor.execute( - """ - INSERT INTO fetchlogs( - key, - loaded_datetime - ) SELECT key, clock_timestamp() - FROM keys - ON CONFLICT (key) DO - UPDATE - SET - loaded_datetime=EXCLUDED.loaded_datetime - ; - """ - ) - connection.commit() + cursor.execute(get_query("lcs_meas_ingest.sql")) for notice in connection.notices: print(notice) - #irows = cursor.rowcount - #logger.info("load_measurements:insert: %s rows; %0.4f seconds", irows, time() - start) - #status = cursor.statusmessage - #logger.debug(f"INGEST Rows: {irows} Status: {status}") - cursor.execute( - """ - INSERT INTO fetchlogs( - key, - completed_datetime - ) SELECT key, clock_timestamp() - FROM keys - ON CONFLICT (key) DO - UPDATE - SET - completed_datetime=EXCLUDED.completed_datetime - ; - """ - ) logger.info( "load_measurements: keys: %s; rows: %s; time: %0.4f", len(rows), mrows, time() - start_time) diff --git a/ingest/lcs_meas_ingest.sql b/ingest/lcs_meas_ingest.sql index 91c7d3d..e32340e 100644 --- a/ingest/lcs_meas_ingest.sql +++ b/ingest/lcs_meas_ingest.sql @@ -7,6 +7,9 @@ __rejected_measurements int; __exported_days int; __inserted_start_datetime timestamptz; __inserted_end_datetime timestamptz; +__process_time_ms int; +__insert_time_ms int; +__cache_time_ms int; BEGIN DELETE @@ -79,6 +82,8 @@ FROM sensors s WHERE s.source_id=ingest_id AND meas.sensors_id IS NULL; +__process_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + -- reject any missing. 
Most likely due to issues -- with the measurand WITH r AS ( @@ -94,18 +99,8 @@ RETURNING 1) SELECT COUNT(1) INTO __rejected_measurements FROM r; - ---DELETE ---FROM meas ---WHERE sensors_id IS NULL; - --- --Some fake data to make it easier to test this section --- TRUNCATE meas; --- INSERT INTO meas (ingest_id, sensors_id, value, datetime) --- SELECT 'fake-ingest' --- , (SELECT sensors_id FROM sensors ORDER BY random() LIMIT 1) --- , -99 --- , generate_series(now() - '3day'::interval, current_date, '1hour'::interval); +-- restart the clock to measure just inserts +__process_start := clock_timestamp(); WITH inserts AS ( INSERT INTO measurements ( @@ -117,7 +112,7 @@ INSERT INTO measurements ( ) SELECT --DISTINCT sensors_id, - datetime::timestamptz, + datetime, value, lon, lat @@ -143,6 +138,33 @@ INTO __inserted_start_datetime , __inserted_measurements FROM inserted; +__insert_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + +-- mark the fetchlogs as done +WITH inserted AS ( + SELECT m.fetchlogs_id + , COUNT(m.*) as n_records + , COUNT(t.*) as n_inserted + , MIN(m.datetime) as fr_datetime + , MAX(m.datetime) as lr_datetime + , MIN(t.datetime) as fi_datetime + , MAX(t.datetime) as li_datetime + FROM meas m + LEFT JOIN temp_inserted_measurements t ON (t.sensors_id = m.sensors_id AND t.datetime = m.datetime) + GROUP BY m.fetchlogs_id) +UPDATE fetchlogs +SET completed_datetime = CURRENT_TIMESTAMP +, inserted = COALESCE(n_inserted, 0) +, records = COALESCE(n_records, 0) +, first_recorded_datetime = fr_datetime +, last_recorded_datetime = lr_datetime +, first_inserted_datetime = fi_datetime +, last_inserted_datetime = li_datetime +FROM inserted +WHERE inserted.fetchlogs_id = fetchlogs.fetchlogs_id; + +-- track the time required to update cache tables +__process_start := clock_timestamp(); -- Now we can use those temp_inserted_measurements to update the cache tables INSERT INTO sensors_latest ( sensors_id @@ -214,13 +236,18 @@ RETURNING 1) SELECT COUNT(1) INTO __exported_days FROM e; -RAISE NOTICE 'inserted-measurements: %, inserted-from: %, inserted-to: %, rejected-measurements: %, exported-sensor-days: %, process-time-ms: %, source: lcs' +__cache_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + + +RAISE NOTICE 'inserted-measurements: %, inserted-from: %, inserted-to: %, rejected-measurements: %, exported-sensor-days: %, process-time-ms: %, insert-time-ms: %, cache-time-ms: %, source: lcs' , __inserted_measurements , __inserted_start_datetime , __inserted_end_datetime , __rejected_measurements , __exported_days - , 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + , __process_time_ms + , __insert_time_ms + , __cache_time_ms; EXCEPTION WHEN OTHERS THEN RAISE NOTICE 'Failed to export to logs: %', SQLERRM diff --git a/ingest/lcs_meas_staging.sql b/ingest/lcs_meas_staging.sql deleted file mode 100644 index ce2cc49..0000000 --- a/ingest/lcs_meas_staging.sql +++ /dev/null @@ -1,11 +0,0 @@ -CREATE TEMP TABLE meas ( - ingest_id text, - sensors_id int, - value float, - --datetime timestamptz, - datetime text, - lon float, - lat float, - fetchlogs_id int -); -CREATE TEMP TABLE keys (key text, last_modified timestamptz); From a7d107c94973fbbce3f5c7b30f979e2d323d2234 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Thu, 1 Dec 2022 14:13:46 -0800 Subject: [PATCH 08/42] Added providers and timezones check on insert --- check.py | 13 ++++++++----- ingest/lcs.py | 18 +++++++++++++----- ingest/lcs_ingest_full.sql | 16 
+++++++++++----- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/check.py b/check.py index f185e1f..ced5ee0 100644 --- a/check.py +++ b/check.py @@ -67,11 +67,8 @@ from ingest.settings import settings from ingest.lcs import ( - load_metadata_db, - load_measurements_db, - load_measurements_file, + load_metadata, load_measurements, - get_measurements, ) from ingest.fetch import ( @@ -162,8 +159,14 @@ def check_realtime_key(key: str, fix: bool = False): load_realtime([key]) else: check_realtime_key(key, args.fix) + elif 'stations' in key: + load_metadata([ + {"id": args.id, "Key": key, "LastModified": None} + ]) else: - load_measurements([(args.id, key, None)]) + load_measurements([ + (args.id, key, None) + ]) # Otherwise if we set the summary flag return a daily summary of errors elif args.summary: diff --git a/ingest/lcs.py b/ingest/lcs.py index 88343fb..51ff948 100644 --- a/ingest/lcs.py +++ b/ingest/lcs.py @@ -349,11 +349,17 @@ def load_metadata_db(count=250, ascending: bool = False): for notice in connection.notices: logger.debug(notice) if len(contents) > 0: - data = LCSData(contents) - data.get_metadata() + load_metadata(contents) + # data = LCSData(contents) + # data.get_metadata() return rowcount +def load_metadata(keys): + data = LCSData(keys) + data.get_metadata() + + def select_object(key): key = unquote_plus(key) if str.endswith(key, ".gz"): @@ -449,6 +455,7 @@ def get_measurements(key, fetchlogsId): def submit_file_error(key, e): """Update the log to reflect the error and prevent a retry""" + logger.error(f"{key}: {e}") with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: connection.set_session(autocommit=True) with connection.cursor() as cursor: @@ -458,9 +465,10 @@ def submit_file_error(key, e): SET completed_datetime = clock_timestamp() , last_message = %s WHERE key = %s - """ - ), - (f"ERROR: {e}", key), + """, + (f"ERROR: {e}", key), + ) + def to_tsv(row): tsv = "\t".join(map(clean_csv_value, row)) + "\n" diff --git a/ingest/lcs_ingest_full.sql b/ingest/lcs_ingest_full.sql index 73d8266..153b46b 100644 --- a/ingest/lcs_ingest_full.sql +++ b/ingest/lcs_ingest_full.sql @@ -46,6 +46,8 @@ INSERT INTO sensor_nodes ( , geom , metadata , source_id +, timezones_id +, providers_id ) SELECT site_name , source_name @@ -53,14 +55,18 @@ SELECT site_name , geom , metadata , ingest_id +, get_timezones_id(geom) +, get_providers_id(source_name) FROM ms_sensornodes -GROUP BY site_name, source_name, ismobile, geom, metadata, ingest_id +GROUP BY 1,2,3,4,5,6,7,8 ON CONFLICT (source_name, source_id) DO UPDATE SET - site_name=coalesce(EXCLUDED.site_name,sensor_nodes.site_name), - ismobile=coalesce(EXCLUDED.ismobile,sensor_nodes.ismobile), - geom=coalesce(EXCLUDED.geom,sensor_nodes.geom), - metadata=COALESCE(sensor_nodes.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}') + site_name=coalesce(EXCLUDED.site_name,sensor_nodes.site_name) + , ismobile=coalesce(EXCLUDED.ismobile,sensor_nodes.ismobile) + , geom=coalesce(EXCLUDED.geom,sensor_nodes.geom) + , metadata=COALESCE(sensor_nodes.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}') + , timezones_id = COALESCE(EXCLUDED.timezones_id, sensor_nodes.timezones_id) + , providers_id = COALESCE(EXCLUDED.providers_id, sensor_nodes.providers_id) RETURNING 1) SELECT COUNT(1) INTO __inserted_nodes FROM inserts; From 31bdb281097d9351b7d732da8e27e71104a4471f Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Thu, 19 Jan 2023 20:33:09 -0800 Subject: [PATCH 09/42] More updates to help with ingesting --- check.py | 16 +- 
ingest/fetch.py | 99 +++++----- ingest/fetch_ingest_full.sql | 367 ++++++++++++++++++++++++++++++----- ingest/fetch_staging.sql | 13 +- ingest/handler.py | 82 +++----- ingest/lcs.py | 105 ++++------ ingest/lcs_ingest_full.sql | 27 ++- ingest/lcs_meas_ingest.sql | 270 +++++++++++++++++++++++--- ingest/lcs_staging.sql | 22 ++- ingest/settings.py | 2 + ingest/utils.py | 80 +++++--- 11 files changed, 796 insertions(+), 287 deletions(-) diff --git a/check.py b/check.py index ced5ee0..f277418 100644 --- a/check.py +++ b/check.py @@ -16,6 +16,8 @@ """) parser.add_argument('--id', type=int, required=False, help='The fetchlogs_id value') +parser.add_argument('--batch', type=str, required=False, + help='The batch id value. Loads files based on batch uuid.') parser.add_argument('--env', type=str, required=False, help='The dot env file to use') parser.add_argument('--profile', type=str, required=False, @@ -24,11 +26,11 @@ help="""Either the number of entries to list (sorted by date) or the number of days to go back if using the summary or rejects arguments""") -parser.add_argument('--pipeline', type=int, required=False, default=1, +parser.add_argument('--pipeline', type=int, required=False, default=0, help="""The number of pipeline files to load at a time""") -parser.add_argument('--metadata', type=int, required=False, default=1, +parser.add_argument('--metadata', type=int, required=False, default=0, help="""The number of metadata files to load at a time""") -parser.add_argument('--realtime', type=int, required=False, default=1, +parser.add_argument('--realtime', type=int, required=False, default=0, help="""The number of realtime files to load at a time""") parser.add_argument('--fix', action="store_true", help='Automatically attempt to fix the problem') @@ -69,6 +71,7 @@ from ingest.lcs import ( load_metadata, load_measurements, + load_measurements_batch, ) from ingest.fetch import ( @@ -156,7 +159,9 @@ def check_realtime_key(key: str, fix: bool = False): # figure out what type of file it is elif 'realtime' in key: if args.load: - load_realtime([key]) + load_realtime([ + (args.id, key, None) + ]) else: check_realtime_key(key, args.fix) elif 'stations' in key: @@ -168,6 +173,9 @@ def check_realtime_key(key: str, fix: bool = False): (args.id, key, None) ]) +elif args.batch is not None: + load_measurements_batch(args.batch) + # Otherwise if we set the summary flag return a daily summary of errors elif args.summary: rows = load_errors_summary(args.n) diff --git a/ingest/fetch.py b/ingest/fetch.py index b751693..d0c3f50 100644 --- a/ingest/fetch.py +++ b/ingest/fetch.py @@ -17,6 +17,7 @@ get_query, load_fail, load_success, + load_fetchlogs, ) app = typer.Typer() @@ -87,10 +88,13 @@ def parse_json(j, key: str = None): def create_staging_table(cursor): - cursor.execute(get_query("fetch_staging.sql")) + cursor.execute(get_query( + "fetch_staging.sql", + table="TEMP TABLE" if settings.USE_TEMP_TABLES else 'TABLE' + )) -def copy_data(cursor, key): +def copy_data(cursor, key, fetchlogsId=None): obj = s3.Object(FETCH_BUCKET, key) # This should not be checked here, # if we ask it to copy data it should do that @@ -104,11 +108,13 @@ def copy_data(cursor, key): logger.debug(f"Copying data for {key}") with gzip.GzipFile(fileobj=obj.get()["Body"]) as gz: f = io.BufferedReader(gz) + # make sure that the file is complete iterator = StringIteratorIO( - (parse_json(orjson.loads(line)) for line in f) + (f"{fetchlogsId}\t"+parse_json(orjson.loads(line)) for line in f) ) query = """ COPY tempfetchdata ( + fetchlogs_id, 
location, value, unit, @@ -141,11 +147,17 @@ def copy_file(cursor, file): # load_success(cursor, file) except Exception as e: + logger.warning(f'File copy failed: {e}') load_fail(cursor, file, e) def process_data(cursor): - query = get_query("fetch_ingest_full.sql") + # see file for details on how + # to use the variables + query = get_query( + "fetch_ingest_full.sql", + table="TEMP TABLE" if settings.USE_TEMP_TABLES else 'TABLE' + ) cursor.execute(query) # if results: # mindate, maxdate = results @@ -272,43 +284,23 @@ def submit_file_error(ids, e): @app.command() def load_db(limit: int = 50, ascending: bool = False): - order = 'ASC' if ascending else 'DESC' - with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: - connection.set_session(autocommit=True) - with connection.cursor() as cursor: - cursor.execute( - f""" - SELECT key - ,last_modified - ,fetchlogs_id - FROM fetchlogs - WHERE key~E'^realtime-gzipped/.*\\.ndjson.gz$' - AND completed_datetime is null - ORDER BY last_modified {order} nulls last - LIMIT %s - ; - """, - (limit,), - ) - rows = cursor.fetchall() - keys = [r[0] for r in rows] - if len(keys) > 0: - try: - load_realtime(keys) - except Exception as e: - # catch and continue to next page - ids = [r[2] for r in rows] - logger.error(f""" - Error processing realtime files: {e}, {ids} - """) - submit_file_error(ids, e) - finally: - connection.commit() + pattern = '^realtime-gzipped/.*\\.ndjson.gz$' + rows = load_fetchlogs(pattern, limit, ascending) + if len(rows) > 0: + try: + load_realtime(rows) + except Exception as e: + # catch and continue to next page + ids = [r[2] for r in rows] + logger.error(f""" + Error processing realtime files: {e}, {ids} + """) + submit_file_error(ids, e) - return len(keys) + return len(rows) -def load_realtime(keys): +def load_realtime(rows): # create a connection and share for all keys with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: connection.set_session(autocommit=True) @@ -316,15 +308,30 @@ def load_realtime(keys): # create all the data staging table create_staging_table(cursor) # now copy all the data - for key in keys: - copy_data(cursor, key) + keys = [] + + for row in rows: + key = row[1] + fetchlogsId = row[0] + try: + copy_data(cursor, key, fetchlogsId) + keys.append(key) + except Exception as e: + # all until now is lost + # reset things and try to recover + connection.rollback() + keys = [] + load_fail(cursor, fetchlogsId, e) + break + # finally process the data as one - process_data(cursor) - # we are outputing some stats - for notice in connection.notices: - print(notice) - # mark files as done - load_success(cursor, keys) + if len(keys) > 0: + process_data(cursor) + # we are outputing some stats + for notice in connection.notices: + print(notice) + # mark files as done + load_success(cursor, keys) # close and commit connection.commit() diff --git a/ingest/fetch_ingest_full.sql b/ingest/fetch_ingest_full.sql index ad5013c..9d718d5 100644 --- a/ingest/fetch_ingest_full.sql +++ b/ingest/fetch_ingest_full.sql @@ -1,17 +1,18 @@ --- Get sensor systems +-- fetch_ingest_full DO $$ DECLARE __process_start timestamptz := clock_timestamp(); __total_measurements int; +__total_nodes int; __updated_nodes int; __inserted_nodes int; __inserted_sensors int; __inserted_measurements int; __inserted_measurands int; -__rejected_nodes int; +__rejected_nodes int := 0; __rejected_systems int; __rejected_sensors int; -__rejected_measurements int; +__rejected_measurements int := 0; __start_datetime timestamptz; __end_datetime 
timestamptz; __inserted_start_datetime timestamptz; @@ -20,9 +21,15 @@ __deleted_timescaledb int; __deleted_future_measurements int; __deleted_past_measurements int; __exported_days int; +__process_time_ms int; +__insert_time_ms int; +__cache_time_ms int; +__ingest_method text := 'realtime'; BEGIN -SELECT now() INTO __process_start; +-- REQUIRED +-- {table} should be `TEMP TABLE` in production but could be changed to +-- just `TABLE` if you are debugging and want the temp tables to persist --------------------------- -- File fetch_filter.sql -- @@ -64,7 +71,7 @@ FROM tempfetchdata; ------------- -- File #1 -- ------------- -CREATE TEMP TABLE IF NOT EXISTS tempfetchdata_sensors AS +CREATE {table} IF NOT EXISTS tempfetchdata_sensors AS WITH t AS ( SELECT DISTINCT location as site_name, @@ -79,14 +86,15 @@ SELECT DISTINCT mobile as ismobile, avpd_unit, avpd_value, - coords::geometry as cgeom, - NULL::int as sensor_nodes_id, +-- coords::geometry as cgeom, + null::int as sensor_nodes_id, null::int as sensor_systems_id, null::int as measurands_id, null::int as sensors_id, null::jsonb as node_metadata, null::jsonb as sensor_metadata, - array_agg(tfdid) as tfdids + array_agg(tfdid) as tfdids, + fetchlogs_id FROM tempfetchdata GROUP BY location, @@ -105,10 +113,12 @@ GROUP BY measurands_id, sensors_id, node_metadata, - sensor_metadata + sensor_metadata, + fetchlogs_id ) SELECT row_number() over () as tfsid, * FROM t; + CREATE INDEX ON tempfetchdata_sensors (tfsid); ------------- -- File #2 -- @@ -128,15 +138,16 @@ WHERE units IN ('µg/m��','��g/m³'); UPDATE tempfetchdata_sensors SET node_metadata = jsonb_strip_nulls( - COALESCE(data, '{}'::jsonb) + COALESCE(data, '{{}}'::jsonb) || jsonb_build_object( - 'source_type', - 'government', - 'origin', - 'openaq' + 'source_type', 'government', + 'origin','openaq', + 'fetchlogs_id', fetchlogs_id ) ), + -- the following assumes that avpd_unit is always hours + -- which at the last check (2022-12-07) it was sensor_metadata = jsonb_strip_nulls(jsonb_build_object( 'data_averaging_period_seconds', avpd_value * 3600 )) @@ -146,19 +157,22 @@ sensor_metadata = jsonb_strip_nulls(jsonb_build_object( -- File #3 -- ------------- -CREATE TEMP TABLE IF NOT EXISTS tempfetchdata_nodes AS +CREATE {table} IF NOT EXISTS tempfetchdata_nodes AS SELECT * FROM (SELECT site_name, source_name, country, city, - node_metadata as metadata, + node_metadata::jsonb as metadata, ismobile, null::int as sensor_nodes_id, null::int as sensor_systems_id, null::boolean as added, + null::text as method, st_centroid(st_collect(geom)) as geom, array_agg(tfsid) as tfsids + , array_agg(st_astext(geom)) as points + , COUNT(DISTINCT st_astext(geom)) as n_points FROM tempfetchdata_sensors WHERE geom IS NOT NULL GROUP BY @@ -171,22 +185,28 @@ SELECT * FROM source_name, country, city, - node_metadata as metadata, + node_metadata::jsonb as metadata, ismobile, null::int as sensor_nodes_id, null::int as sensor_systems_id, null::boolean as added, + null::text as method, null::geometry as geom, array_agg(tfsid) as tfsids + , null::text[] as points + , 0 as n_points FROM tempfetchdata_sensors WHERE geom IS NULL -AND site_name is not null -and source_name is not null +AND site_name IS NOT NULL +AND source_name IS NOT NULL GROUP BY 1,2,3,4,5,6,7,8,9,10 ) as nogeom ; +SELECT COUNT(1) +INTO __total_nodes +FROM tempfetchdata_nodes; ------------- -- File #4 -- @@ -197,6 +217,7 @@ GROUP BY UPDATE tempfetchdata_nodes t SET sensor_nodes_id = sn.sensor_nodes_id , added = FALSE +, method = 'spatial' FROM sensor_nodes sn 
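-- a candidate node is treated as the same OPENAQ node when its reported
-- coordinates fall within 0.0001 degrees (roughly 10 m) of an existing
-- geometry; anything left unmatched falls back to the
-- site_name/source_name match in the next statement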
WHERE t.geom IS NOT NULL AND st_dwithin(sn.geom, t.geom, .0001) @@ -205,18 +226,76 @@ AND origin='OPENAQ'; UPDATE tempfetchdata_nodes t SET sensor_nodes_id = sn.sensor_nodes_id , added = FALSE +, method = 'source_id' FROM sensor_nodes sn WHERE t.sensor_nodes_id is null AND t.site_name is not null AND t.source_name is not null AND t.site_name = sn.site_name AND t.source_name=sn.source_name +AND t.geom IS NULL AND origin='OPENAQ'; + +__process_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + ------------- -- File #5 -- ------------- +DROP TABLE IF EXISTS checkrealtime_matched; +-- CREATE TABLE IF NOT EXISTS checkrealtime_matched ( +-- sensor_nodes_id int +-- , site_name text +-- , source_name text +-- , city text +-- , country text +-- , origin text +-- , method text +-- , geom_old geometry +-- , geom_new geometry +-- , added_on timestamptz DEFAULT now() +-- ); + + +-- INSERT INTO checkrealtime_matched +-- SELECT t.sensor_nodes_id +-- , format('%s -> %s', s.site_name, t.site_name) +-- , format('%s -> %s', s.source_name, t.source_name) +-- , format('%s -> %s', s.city, t.city) +-- , format('%s -> %s', s.country, t.country) +-- , origin +-- , method +-- , s.geom +-- , t.geom +-- FROM tempfetchdata_nodes t +-- JOIN sensor_nodes s ON (t.sensor_nodes_id = s.sensor_nodes_id) +-- WHERE ROW( +-- t.site_name, +-- t.source_name, +-- t.city, +-- t.country, +-- t.metadata +-- ) IS DISTINCT FROM ( +-- s.site_name, +-- s.source_name, +-- s.city, +-- s.country, +-- s.metadata - 'timezone' +-- ); + +-- SELECT sensor_nodes_id +-- , method +-- , site_name +-- , source_name +-- , city +-- , country +-- , ROUND(st_distancesphere(geom_new, geom_old)::numeric, 1) as distance +-- FROM checkrealtime_matched +-- WHERE st_distancesphere(geom_new, geom_old) > 0 +-- GROUP BY 1,2,3,4,5,6, 7 +-- LIMIT 100; + -- Update any records that have changed WITH updates AS ( UPDATE sensor_nodes s SET @@ -225,9 +304,11 @@ UPDATE sensor_nodes s SET city = COALESCE(t.city, s.city), country = COALESCE(t.country, s.country), ismobile = COALESCE(t.ismobile, s.ismobile), - metadata = COALESCE(s.metadata, '{}'::jsonb) || t.metadata, + metadata = COALESCE(s.metadata, '{{}}'::jsonb) || t.metadata, geom = COALESCE(t.geom, s.geom) - --, modified_on = now() + --, timezones_id = get_timezones_id(COALESCE(t.geom, s.geom)) + , providers_id = get_providers_id(COALESCE(t.source_name, s.source_name)) + , modified_on = now() FROM tempfetchdata_nodes t WHERE t.sensor_nodes_id = s.sensor_nodes_id AND ( @@ -249,7 +330,7 @@ OR s.source_name, s.city, s.country, - s.metadata + s.metadata - 'timezone' ) ) RETURNING 1) @@ -270,7 +351,10 @@ INSERT INTO sensor_nodes ( city, country, ismobile, - origin + origin, + timezones_id, + providers_id, + countries_id ) SELECT site_name, @@ -281,6 +365,9 @@ SELECT country, ismobile, 'OPENAQ' + , get_timezones_id(geom) + , get_providers_id(source_name) + , get_countries_id(geom) FROM tempfetchdata_nodes t WHERE t.sensor_nodes_id is NULL RETURNING * @@ -378,7 +465,7 @@ SELECT COUNT(1) INTO __inserted_measurands FROM inserts; -- get cleaned sensors table -CREATE TEMP TABLE IF NOT EXISTS tempfetchdata_sensors_clean AS +CREATE {table} IF NOT EXISTS tempfetchdata_sensors_clean AS SELECT null::int as sensors_id, sensor_nodes_id, @@ -478,6 +565,10 @@ WHERE sensors_id IS NULL; --WHERE m.datetime = t.datetime --AND m.sensors_id = t.sensors_id; +-- restart the clock to measure just inserts +__process_start := clock_timestamp(); + + WITH inserts AS ( INSERT INTO measurements (sensors_id, datetime, value) 
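-- ON CONFLICT DO NOTHING skips rows that already exist in measurements;
-- RETURNING captures only the rows that were actually written, so the
-- fetchlog, rollup, exceedance and export updates below operate on new
-- data only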
SELECT sensors_id @@ -502,18 +593,102 @@ INTO __inserted_start_datetime , __inserted_measurements FROM inserted; - --- Now we can use those temp_inserted_measurements to update the cache tables -INSERT INTO sensors_latest ( +__insert_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + +-- mark the fetchlogs as done +WITH inserted AS ( + SELECT m.fetchlogs_id + , COUNT(m.*) as n_records + , COUNT(t.*) as n_inserted + , MIN(m.datetime) as fr_datetime + , MAX(m.datetime) as lr_datetime + , MIN(t.datetime) as fi_datetime + , MAX(t.datetime) as li_datetime + FROM tempfetchdata m + LEFT JOIN temp_inserted_measurements t ON (t.sensors_id = m.sensors_id AND t.datetime = m.datetime) + GROUP BY m.fetchlogs_id) +UPDATE fetchlogs +SET completed_datetime = CURRENT_TIMESTAMP +, inserted = COALESCE(n_inserted, 0) +, records = COALESCE(n_records, 0) +, first_recorded_datetime = fr_datetime +, last_recorded_datetime = lr_datetime +, first_inserted_datetime = fi_datetime +, last_inserted_datetime = li_datetime +FROM inserted +WHERE inserted.fetchlogs_id = fetchlogs.fetchlogs_id; + +-- track the time required to update cache tables +__process_start := clock_timestamp(); + +-- -- Now we can use those temp_inserted_measurements to update the cache tables +-- INSERT INTO sensors_latest ( +-- sensors_id +-- , datetime +-- , value +-- ) +-- ---- identify the row that has the latest value +-- WITH numbered AS ( +-- SELECT sensors_id +-- , datetime +-- , value +-- , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) as rn +-- FROM temp_inserted_measurements +-- ), latest AS ( +-- ---- only insert those rows +-- SELECT sensors_id +-- , datetime +-- , value +-- FROM numbered +-- WHERE rn = 1 +-- ) +-- SELECT l.sensors_id +-- , l.datetime +-- , l.value +-- FROM latest l +-- LEFT JOIN sensors_latest sl ON (l.sensors_id = sl.sensors_id) +-- WHERE sl.sensors_id IS NULL +-- OR l.datetime > sl.datetime +-- ON CONFLICT (sensors_id) DO UPDATE +-- SET datetime = EXCLUDED.datetime +-- , value = EXCLUDED.value +-- , modified_on = now() +-- --, fetchlogs_id = EXCLUDED.fetchlogs_id +-- ; + +-- update the exceedances +INSERT INTO sensor_exceedances (sensors_id, threshold_value, datetime_latest) + SELECT + m.sensors_id + , t.value + , MAX(datetime) + FROM temp_inserted_measurements m + JOIN sensors s ON (m.sensors_id = s.sensors_id) + JOIN thresholds t ON (s.measurands_id = t.measurands_id) + AND m.value > t.value + GROUP BY 1, 2 + ON CONFLICT (sensors_id, threshold_value) DO UPDATE SET + datetime_latest = GREATEST(sensor_exceedances.datetime_latest, EXCLUDED.datetime_latest) + , updated_on = now(); + +INSERT INTO sensors_rollup ( sensors_id - , datetime - , value + , datetime_first + , datetime_last + , value_latest + , value_count + , value_avg + , value_min + , value_max ) ---- identify the row that has the latest value WITH numbered AS ( SELECT sensors_id , datetime , value + , sum(1) OVER (PARTITION BY sensors_id) as value_count + , min(datetime) OVER (PARTITION BY sensors_id) as datetime_min + , avg(value) OVER (PARTITION BY sensors_id) as value_avg , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) as rn FROM temp_inserted_measurements ), latest AS ( @@ -521,34 +696,50 @@ WITH numbered AS ( SELECT sensors_id , datetime , value + , value_count + , value_avg + , datetime_min FROM numbered WHERE rn = 1 ) SELECT l.sensors_id -, l.datetime -, l.value +, l.datetime_min -- first +, l.datetime -- last +, l.value -- last value +, l.value_count +, l.value_avg +, l.value -- 
min +, l.value -- max FROM latest l -LEFT JOIN sensors_latest sl ON (l.sensors_id = sl.sensors_id) -WHERE sl.sensors_id IS NULL -OR l.datetime > sl.datetime +LEFT JOIN sensors_rollup sr ON (l.sensors_id = sr.sensors_id) +WHERE sr.sensors_id IS NULL +OR l.datetime > sr.datetime_last +OR l.datetime_min < sr.datetime_first ON CONFLICT (sensors_id) DO UPDATE -SET datetime = EXCLUDED.datetime -, value = EXCLUDED.value +SET datetime_last = GREATEST(sensors_rollup.datetime_last, EXCLUDED.datetime_last) +, value_latest = CASE WHEN EXCLUDED.datetime_last > sensors_rollup.datetime_last + THEN EXCLUDED.value_latest + ELSE sensors_rollup.value_latest + END +, value_count = sensors_rollup.value_count + EXCLUDED.value_count +, value_min = LEAST(sensors_rollup.value_min, EXCLUDED.value_latest) +, value_max = GREATEST(sensors_rollup.value_max, EXCLUDED.value_latest) +, datetime_first = LEAST(sensors_rollup.datetime_first, EXCLUDED.datetime_first) , modified_on = now() --, fetchlogs_id = EXCLUDED.fetchlogs_id ; --- No longer going to manage the fetch log in this way --- WITH updates AS ( --- UPDATE fetchlogs --- SET completed_datetime = clock_timestamp() --- , last_message = NULL -- reset any previous error --- WHERE key IN (SELECT key FROM ingestfiles) --- RETURNING 1) --- SELECT COUNT(1) INTO __keys --- FROM updates; +-- Update the table that will help to track hourly rollups +INSERT INTO hourly_stats (datetime) + SELECT date_trunc('hour', datetime) + FROM temp_inserted_measurements + GROUP BY 1 +ON CONFLICT (datetime) DO UPDATE +SET modified_on = now(); + +-- update the table that will track the daily exports WITH e AS ( INSERT INTO open_data_export_logs (sensor_nodes_id, day, records, measurands, modified_on) SELECT sn.sensor_nodes_id @@ -572,7 +763,91 @@ SELECT COUNT(1) INTO __exported_days FROM e; -RAISE NOTICE 'total-measurements: %, deleted-timescaledb: %, deleted-future-measurements: %, deleted-past-measurements: %, from: %, to: %, inserted-from: %, inserted-to: %, updated-nodes: %, inserted-measurements: %, inserted-measurands: %, inserted-nodes: %, rejected-nodes: %, rejected-systems: %, rejected-sensors: %, exported-sensor-days: %, process-time-ms: %, source: fetch' +__cache_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + + +INSERT INTO ingest_stats ( + ingest_method + -- total + , total_measurements_processed + , total_measurements_inserted + , total_measurements_rejected + , total_nodes_processed + , total_nodes_inserted + , total_nodes_updated + , total_nodes_rejected + -- total times + , total_process_time_ms + , total_insert_time_ms + , total_cache_time_ms + -- latest + , latest_measurements_processed + , latest_measurements_inserted + , latest_measurements_rejected + , latest_nodes_processed + , latest_nodes_inserted + , latest_nodes_updated + , latest_nodes_rejected + -- times + , latest_process_time_ms + , latest_insert_time_ms + , latest_cache_time_ms + ) VALUES ( + -- totals + __ingest_method + , __total_measurements + , __inserted_measurements + , __rejected_measurements + , __total_nodes + , __inserted_nodes + , __updated_nodes + , __rejected_nodes + -- times + , __process_time_ms + , __insert_time_ms + , __cache_time_ms + -- latest + , __total_measurements + , __inserted_measurements + , __rejected_measurements + , __total_nodes + , __inserted_nodes + , __updated_nodes + , __rejected_nodes + -- times + , __process_time_ms + , __insert_time_ms + , __cache_time_ms +) ON CONFLICT (ingest_method) DO UPDATE SET + -- totals + total_measurements_processed = 
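  -- each run adds its per-run counts to the running total_* columns;
  -- the latest_* columns further down are overwritten with this run's
  -- counts only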
ingest_stats.total_measurements_processed + EXCLUDED.total_measurements_processed + , total_measurements_inserted = ingest_stats.total_measurements_inserted + EXCLUDED.total_measurements_inserted + , total_measurements_rejected = ingest_stats.total_measurements_rejected + EXCLUDED.total_measurements_rejected + , total_nodes_processed = ingest_stats.total_nodes_processed + EXCLUDED.total_nodes_processed + , total_nodes_inserted = ingest_stats.total_nodes_inserted + EXCLUDED.total_nodes_inserted + , total_nodes_updated = ingest_stats.total_nodes_updated + EXCLUDED.total_nodes_updated + , total_nodes_rejected = ingest_stats.total_nodes_rejected + EXCLUDED.total_nodes_rejected + , total_process_time_ms = ingest_stats.total_process_time_ms + EXCLUDED.total_process_time_ms + , total_insert_time_ms = ingest_stats.total_insert_time_ms + EXCLUDED.total_insert_time_ms + , total_cache_time_ms = ingest_stats.total_cache_time_ms + EXCLUDED.total_cache_time_ms + -- latest + , latest_measurements_processed = EXCLUDED.latest_measurements_processed + , latest_measurements_inserted = EXCLUDED.latest_measurements_inserted + , latest_measurements_rejected = EXCLUDED.latest_measurements_rejected + , latest_nodes_processed = EXCLUDED.latest_nodes_processed + , latest_nodes_inserted = EXCLUDED.latest_nodes_inserted + , latest_nodes_updated = EXCLUDED.latest_nodes_updated + , latest_nodes_rejected = EXCLUDED.latest_nodes_rejected + -- times + , latest_process_time_ms = EXCLUDED.latest_process_time_ms + , latest_insert_time_ms = EXCLUDED.latest_insert_time_ms + , latest_cache_time_ms = EXCLUDED.latest_cache_time_ms + , ingest_count = ingest_stats.ingest_count + 1 + , ingested_on = EXCLUDED.ingested_on; + + + +RAISE NOTICE 'inserted-measurements: %, deleted-timescaledb: %, deleted-future-measurements: %, deleted-past-measurements: %, from: %, to: %, inserted-from: %, inserted-to: %, updated-nodes: %, inserted-measurements: %, inserted-measurands: %, inserted-nodes: %, rejected-nodes: %, rejected-systems: %, rejected-sensors: %, exported-sensor-days: %, process-time-ms: %, source: fetch' , __total_measurements , __deleted_timescaledb , __deleted_future_measurements diff --git a/ingest/fetch_staging.sql b/ingest/fetch_staging.sql index d6595f6..d6b717d 100644 --- a/ingest/fetch_staging.sql +++ b/ingest/fetch_staging.sql @@ -1,4 +1,5 @@ -CREATE TEMP TABLE IF NOT EXISTS tempfetchdata ( +CREATE {table} IF NOT EXISTS tempfetchdata ( + fetchlogs_id int, location text, value float, unit text, @@ -17,17 +18,19 @@ CREATE TEMP TABLE IF NOT EXISTS tempfetchdata ( sensors_id int ); -CREATE TEMP TABLE IF NOT EXISTS ingestfiles( - key text -); +--CREATE {table} IF NOT EXISTS ingestfiles( +-- key text +--); -- This table will hold measurements that have -- actually been inserted into the measurements table -- this is to deal with the overlap that we see in the -- incoming files -CREATE TEMP TABLE IF NOT EXISTS temp_inserted_measurements ( +CREATE {table} IF NOT EXISTS temp_inserted_measurements ( sensors_id int , datetime timestamptz , value double precision + , lat double precision + , lon double precision , fetchlogs_id int ); diff --git a/ingest/handler.py b/ingest/handler.py index 34e5baa..7f97dd8 100644 --- a/ingest/handler.py +++ b/ingest/handler.py @@ -104,6 +104,10 @@ def getKeysFromS3Record(record): def cronhandler(event, context): + if settings.PAUSE_INGESTING: + logger.info('Ingesting is paused') + return None + start_time = time() timeout = settings.INGEST_TIMEOUT # manual timeout for testing ascending = 
settings.FETCH_ASCENDING if 'ascending' not in event else event['ascending'] @@ -112,90 +116,58 @@ def cronhandler(event, context): metadata_limit = settings.METADATA_LIMIT if 'metadata_limit' not in event else event['metadata_limit'] logger.info(f"Running cron job: {event['source']}, ascending: {ascending}") - with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: - connection.set_session(autocommit=True) - with connection.cursor() as cursor: - cursor.execute( - """ - SELECT count(*) - FROM fetchlogs - WHERE completed_datetime is null - AND key ~*'stations'; - """, - ) - metadata = cursor.fetchone() - cursor.execute( - """ - SELECT count(*) - FROM fetchlogs - WHERE key ~*'measures' - AND completed_datetime is null - AND ( - loaded_datetime IS NULL - OR loaded_datetime < now() - '1hour'::interval - ); - """, - ) - pipeline = cursor.fetchone() - cursor.execute( - """ - SELECT count(*) - FROM fetchlogs - WHERE completed_datetime is null - AND key ~*'realtime'; - """, - ) - realtime = cursor.fetchone() - for notice in connection.notices: - logger.debug(notice) - - metadata = 0 if metadata is None else metadata[0] - realtime = 0 if realtime is None else realtime[0] - pipeline = 0 if pipeline is None else pipeline[0] - logger.info(f"{metadata_limit}/{metadata} metadata, {realtime_limit}/{realtime} openaq, {pipeline_limit}/{pipeline} pipeline records pending") # these exceptions are just a failsafe so that if something # unaccounted for happens we can still move on to the next # process. In case of this type of exception we will need to # fix it asap try: - if metadata > 0 and metadata_limit > 0: + if metadata_limit > 0: cnt = 0 - while cnt < metadata and (time() - start_time) < timeout: - cnt += load_metadata_db(metadata_limit, ascending) + loaded = 1 + while ( + loaded > 0 + and (time() - start_time) < timeout + ): + loaded = load_metadata_db(metadata_limit, ascending) + cnt += loaded logger.info( - "loaded %s of %s metadata records, timer: %0.4f", - cnt, metadata, time() - start_time + "loaded %s metadata records, timer: %0.4f", + cnt, time() - start_time ) except Exception as e: logger.error(f"load metadata failed: {e}") try: - if realtime > 0 and realtime_limit > 0: + if realtime_limit > 0: cnt = 0 loaded = 1 while ( loaded > 0 - and cnt < realtime and (time() - start_time) < timeout ): loaded = load_db(realtime_limit, ascending) cnt += loaded logger.info( - "loaded %s of %s fetch records, timer: %0.4f", - cnt, realtime, time() - start_time + "loaded %s fetch records, timer: %0.4f", + cnt, time() - start_time ) except Exception as e: logger.error(f"load realtime failed: {e}") try: - if pipeline > 0 and pipeline_limit > 0: + if pipeline_limit > 0: cnt = 0 - while cnt < pipeline and (time() - start_time) < timeout: - cnt += load_measurements_db(pipeline_limit, ascending) + loaded = 1 + while ( + loaded > 0 + and (time() - start_time) < timeout + ): + loaded = load_measurements_db(pipeline_limit, ascending) + cnt += loaded logger.info( - "loaded %s of %s pipeline records, timer: %0.4f", - cnt, pipeline, time() - start_time + "loaded %s pipeline records, timer: %0.4f", + cnt, time() - start_time ) except Exception as e: logger.error(f"load pipeline failed: {e}") diff --git a/ingest/lcs.py b/ingest/lcs.py index 51ff948..fdb0a1c 100644 --- a/ingest/lcs.py +++ b/ingest/lcs.py @@ -15,7 +15,13 @@ import typer from io import StringIO from .settings import settings -from .utils import get_query, clean_csv_value, StringIteratorIO, fix_units +from .utils import ( + get_query, + 
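    # load_fetchlogs (imported below) selects and claims pending fetchlog
    # rows and is shared with the realtime loader in fetch.py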
clean_csv_value, + StringIteratorIO, + fix_units, + load_fetchlogs, +) s3 = boto3.resource("s3") s3c = boto3.client("s3") @@ -92,7 +98,7 @@ def system(self, j, node_id, fetchlogsId): self.systems.append(system) def node(self, j): - node = {} + node = {"fetchlogs_id": None} metadata = {} if "sensor_node_id" in j: id = j["sensor_node_id"] @@ -100,9 +106,7 @@ def node(self, j): return None # if we have passed the fetchlogs_id we should track it if "fetchlogs_id" in j: - fetchlogsId = j["fetchlogs_id"] - else: - fetchlogsId = None + node["fetchlogs_id"] = j["fetchlogs_id"] for key, value in j.items(): key = str.replace(key, "sensor_node_", "") @@ -123,7 +127,7 @@ def node(self, j): except Exception: node["geom"] = None elif key == "sensor_systems": - self.system(value, id, fetchlogsId) + self.system(value, id, node["fetchlogs_id"]) else: metadata[key] = value node["metadata"] = orjson.dumps(metadata).decode() @@ -160,7 +164,7 @@ def get_station(self, key, fetchlogsId): self.node(obj) def load_data(self): - logger.debug(f"load_data: {self.keys}") + logger.debug(f"load_data: {self.keys}, {self.nodes}") with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: connection.set_session(autocommit=True) with connection.cursor() as cursor: @@ -248,17 +252,12 @@ def load_data(self): def process_data(self, cursor): query = get_query("lcs_ingest_full.sql") cursor.execute(query) - # query = get_query("lcs_ingest_nodes.sql") - # cursor.execute(query) - - # query = get_query("lcs_ingest_systems.sql") - # cursor.execute(query) - - # query = get_query("lcs_ingest_sensors.sql") - # cursor.execute(query) def create_staging_table(self, cursor): - cursor.execute(get_query("lcs_staging.sql")) + cursor.execute(get_query( + "lcs_staging.sql", + table="TEMP TABLE" if settings.USE_TEMP_TABLES else 'TABLE' + )) def get_metadata(self): hasnew = False @@ -475,17 +474,15 @@ def to_tsv(row): return tsv return "" + def load_measurements_file(fetchlogs_id: int): with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: connection.set_session(autocommit=True) with connection.cursor() as cursor: cursor.execute( """ - SELECT key - , init_datetime - , loaded_datetime - , completed_datetime - , last_message + SELECT fetchlogs_id + , key FROM fetchlogs WHERE fetchlogs_id = %s LIMIT 1 @@ -494,49 +491,29 @@ def load_measurements_file(fetchlogs_id: int): (fetchlogs_id,), ) rows = cursor.fetchall() - print(rows) - keys = [r[0] for r in rows] - load_measurements(keys) + load_measurements(rows) + + +def load_measurements_batch(batch: str): + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute( + """ + SELECT fetchlogs_id + , key + FROM fetchlogs + WHERE batch_uuid = %s + """, + (batch,), + ) + rows = cursor.fetchall() + load_measurements(rows) def load_measurements_db(limit=250, ascending: bool = False): - order = 'ASC' if ascending else 'DESC' - conn = psycopg2.connect(settings.DATABASE_WRITE_URL) - cur = conn.cursor() - batch_uuid = uuid.uuid4().hex pattern = '^lcs-etl-pipeline/measures/.*\\.csv' - # pattern = '^uploaded/measures/.*\\.csv' - cur.execute( - f""" - UPDATE fetchlogs - SET loaded_datetime = CURRENT_TIMESTAMP - , jobs = jobs + 1 - , batch_uuid = %s - FROM ( - SELECT fetchlogs_id - FROM fetchlogs - WHERE key~E'{pattern}' - AND completed_datetime is null - AND ( - loaded_datetime IS NULL - OR loaded_datetime < now() - '1hour'::interval - ) - ORDER BY last_modified {order} nulls last - LIMIT %s - FOR 
UPDATE SKIP LOCKED - ) as q - WHERE q.fetchlogs_id = fetchlogs.fetchlogs_id - RETURNING fetchlogs.fetchlogs_id - , fetchlogs.key - , fetchlogs.last_modified; - """, - (batch_uuid, limit,), - ) - rows = cur.fetchall() - # keys = [r[0] for r in rows] - conn.commit() - cur.close() - conn.close() + rows = load_fetchlogs(pattern, limit, ascending) load_measurements(rows) return len(rows) @@ -557,13 +534,15 @@ def load_measurements(rows): logger.info("load_measurements:get: %s keys; %s rows; %0.4f seconds", len(rows), len(data), time() - start_time) if len(data) > 0: - with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: connection.set_session(autocommit=True) with connection.cursor() as cursor: - cursor.execute(get_query("lcs_staging.sql")) - start = time() + cursor.execute(get_query( + "lcs_staging.sql", + table="TEMP TABLE" + )) + write_csv( cursor, new, "keys", ["key",], ) diff --git a/ingest/lcs_ingest_full.sql b/ingest/lcs_ingest_full.sql index 153b46b..d186636 100644 --- a/ingest/lcs_ingest_full.sql +++ b/ingest/lcs_ingest_full.sql @@ -1,4 +1,4 @@ --- Get sensor systems +-- lcs_ingest_full DO $$ DECLARE __process_start timestamptz := clock_timestamp(); @@ -29,12 +29,25 @@ FROM ms_sensors WHERE ms_sensors.ingest_id IS NULL OR ingest_sensor_systems_id IS NULL; +-- first thing we want to do is to get the source +-- and the source_id from the ingest id +-- adding the station forces our method to treat the station as the parameter +-- the first section as the source name and then the rest as teh source id +-- this is required for ingest_ids that use `-` in the source_id +-- e.g. something-blah-blah-blah-pm10 +-- where the sensor node ingest id would be +-- something-blah-blah-blah +-- and blah could be read as a paramter value +UPDATE ms_sensornodes +SET source_id = split_ingest_id(ingest_id||'-station', 2); + + -- match the sensor nodes to get the sensor_nodes_id UPDATE ms_sensornodes SET sensor_nodes_id = sensor_nodes.sensor_nodes_id FROM sensor_nodes WHERE sensor_nodes.source_name = ms_sensornodes.source_name -AND sensor_nodes.source_id = ms_sensornodes.ingest_id; +AND sensor_nodes.source_id = ms_sensornodes.source_id; -- And now we insert those into the sensor nodes table -- we are gouping to deal with any duplicates that currently exist @@ -48,15 +61,17 @@ INSERT INTO sensor_nodes ( , source_id , timezones_id , providers_id +, countries_id ) SELECT site_name , source_name , ismobile , geom , metadata -, ingest_id +, source_id , get_timezones_id(geom) , get_providers_id(source_name) +, get_countries_id(geom) FROM ms_sensornodes GROUP BY 1,2,3,4,5,6,7,8 ON CONFLICT (source_name, source_id) DO UPDATE @@ -67,6 +82,7 @@ SET , metadata=COALESCE(sensor_nodes.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}') , timezones_id = COALESCE(EXCLUDED.timezones_id, sensor_nodes.timezones_id) , providers_id = COALESCE(EXCLUDED.providers_id, sensor_nodes.providers_id) + , modified_on = now() RETURNING 1) SELECT COUNT(1) INTO __inserted_nodes FROM inserts; @@ -81,7 +97,7 @@ SET sensor_nodes_id = sensor_nodes.sensor_nodes_id FROM sensor_nodes WHERE ms_sensornodes.sensor_nodes_id is null AND sensor_nodes.source_name = ms_sensornodes.source_name -AND sensor_nodes.source_id = ms_sensornodes.ingest_id; +AND sensor_nodes.source_id = ms_sensornodes.source_id; -- log anything we were not able to get an id for WITH r AS ( @@ -130,7 +146,8 @@ FROM ms_sensorsystems WHERE sensor_nodes_id IS NOT NULL GROUP BY sensor_nodes_id, ingest_id, metadata ON CONFLICT (sensor_nodes_id, source_id) DO UPDATE SET - 
metadata=COALESCE(sensor_systems.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}'); + metadata=COALESCE(sensor_systems.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}') + , modified_on = now(); ---------------------------- -- lcs_ingest_sensors.sql -- diff --git a/ingest/lcs_meas_ingest.sql b/ingest/lcs_meas_ingest.sql index e32340e..2bd6b3a 100644 --- a/ingest/lcs_meas_ingest.sql +++ b/ingest/lcs_meas_ingest.sql @@ -1,15 +1,24 @@ --- Get sensor systems +-- lcs_meas_ingest DO $$ DECLARE __process_start timestamptz := clock_timestamp(); +__total_measurements int; __inserted_measurements int; -__rejected_measurements int; +__rejected_measurements int := 0; +__rejected_nodes int := 0; +__total_nodes int := 0; +__updated_nodes int := 0; +__inserted_nodes int := 0; __exported_days int; +__start_datetime timestamptz; +__end_datetime timestamptz; __inserted_start_datetime timestamptz; __inserted_end_datetime timestamptz; __process_time_ms int; __insert_time_ms int; __cache_time_ms int; +__error_context text; +__ingest_method text := 'lcs'; BEGIN DELETE @@ -28,6 +37,16 @@ FROM rejects WHERE fetchlogs_id IN (SELECT fetchlogs_id FROM meas) AND tbl ~* '^meas'; + +SELECT COUNT(1) +, MIN(datetime) +, MAX(datetime) +INTO __total_measurements +, __start_datetime +, __end_datetime +FROM meas; + + UPDATE meas SET sensors_id=s.sensors_id FROM sensors s @@ -38,14 +57,19 @@ WHERE s.source_id=ingest_id; WITH nodes AS ( INSERT INTO sensor_nodes ( source_name -, source_id ) +, site_name +, source_id +, metadata) SELECT split_ingest_id(ingest_id, 1) as source_name +, split_ingest_id(ingest_id, 2) as site_name , split_ingest_id(ingest_id, 2) as source_id +, jsonb_build_object('fetchlogs_id', MIN(fetchlogs_id)) FROM meas WHERE sensors_id IS NULL -GROUP BY 1,2 +GROUP BY 1,2,3 ON CONFLICT (source_name, source_id) DO UPDATE SET source_id = EXCLUDED.source_id +, metadata = EXCLUDED.metadata||COALESCE(sensor_nodes.metadata, '{}'::jsonb) RETURNING sensor_nodes_id, source_id) INSERT INTO sensor_systems ( sensor_nodes_id @@ -58,13 +82,14 @@ ON CONFLICT DO NOTHING; -- now create a sensor for each -- this method depends on us having a match for the parameter WITH sen AS ( -SELECT ingest_id -, split_ingest_id(ingest_id, 1) as source_name -, split_ingest_id(ingest_id, 2) as source_id -, split_ingest_id(ingest_id, 3) as parameter -FROM meas -WHERE sensors_id IS NULL -GROUP BY 1,2,3,4) + SELECT ingest_id + , split_ingest_id(ingest_id, 1) as source_name + , split_ingest_id(ingest_id, 2) as source_id + , split_ingest_id(ingest_id, 3) as parameter + FROM meas + WHERE sensors_id IS NULL + GROUP BY 1,2,3,4 +), inserts AS ( INSERT INTO sensors (sensor_systems_id, measurands_id, source_id) SELECT sy.sensor_systems_id , m.measurands_id @@ -73,7 +98,10 @@ FROM sen s JOIN measurands_map_view m ON (s.parameter = m.key) JOIN sensor_nodes n ON (s.source_name = n.source_name AND s.source_id = n.source_id) JOIN sensor_systems sy ON (sy.sensor_nodes_id = n.sensor_nodes_id AND s.source_id = sy.source_id) -ON CONFLICT DO NOTHING; +ON CONFLICT DO NOTHING +RETURNING sensor_systems_id) +SELECT COUNT(DISTINCT sensor_systems_id) INTO __inserted_nodes +FROM inserts; -- try again to find the sensors UPDATE meas @@ -82,6 +110,12 @@ FROM sensors s WHERE s.source_id=ingest_id AND meas.sensors_id IS NULL; + +SELECT COUNT(DISTINCT sensors_id) +INTO __total_nodes +FROM meas; + + __process_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); -- reject any missing. 
Most likely due to issues @@ -165,13 +199,77 @@ WHERE inserted.fetchlogs_id = fetchlogs.fetchlogs_id; -- track the time required to update cache tables __process_start := clock_timestamp(); --- Now we can use those temp_inserted_measurements to update the cache tables -INSERT INTO sensors_latest ( + +-- -- Now we can use those temp_inserted_measurements to update the cache tables +-- INSERT INTO sensors_latest ( +-- sensors_id +-- , datetime +-- , value +-- , lat +-- , lon +-- ) +-- ---- identify the row that has the latest value +-- WITH numbered AS ( +-- SELECT sensors_id +-- , datetime +-- , value +-- , lat +-- , lon +-- , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) as rn +-- FROM temp_inserted_measurements +-- ), latest AS ( +-- ---- only insert those rows +-- SELECT sensors_id +-- , datetime +-- , value +-- , lat +-- , lon +-- FROM numbered +-- WHERE rn = 1 +-- ) +-- SELECT l.sensors_id +-- , l.datetime +-- , l.value +-- , l.lat +-- , l.lon +-- FROM latest l +-- LEFT JOIN sensors_latest sl ON (l.sensors_id = sl.sensors_id) +-- WHERE sl.sensors_id IS NULL +-- OR l.datetime > sl.datetime +-- ON CONFLICT (sensors_id) DO UPDATE +-- SET datetime = EXCLUDED.datetime +-- , value = EXCLUDED.value +-- , lat = EXCLUDED.lat +-- , lon = EXCLUDED.lon +-- , modified_on = now() +-- --, fetchlogs_id = EXCLUDED.fetchlogs_id +-- ; +-- update the exceedances +INSERT INTO sensor_exceedances (sensors_id, threshold_value, datetime_latest) + SELECT + m.sensors_id + , t.value + , MAX(datetime) + FROM temp_inserted_measurements m + JOIN sensors s ON (m.sensors_id = s.sensors_id) + JOIN thresholds t ON (s.measurands_id = t.measurands_id) + AND m.value > t.value + GROUP BY 1, 2 + ON CONFLICT (sensors_id, threshold_value) DO UPDATE SET + datetime_latest = GREATEST(sensor_exceedances.datetime_latest, EXCLUDED.datetime_latest) + , updated_on = now(); + + +INSERT INTO sensors_rollup ( sensors_id - , datetime - , value - , lat - , lon + , datetime_first + , datetime_last + , value_latest + , value_count + , value_avg + , value_min + , value_max + , geom_latest ) ---- identify the row that has the latest value WITH numbered AS ( @@ -180,6 +278,9 @@ WITH numbered AS ( , value , lat , lon + , sum(1) OVER (PARTITION BY sensors_id) as value_count + , min(datetime) OVER (PARTITION BY sensors_id) as datetime_min + , avg(value) OVER (PARTITION BY sensors_id) as value_avg , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) as rn FROM temp_inserted_measurements ), latest AS ( @@ -187,30 +288,56 @@ WITH numbered AS ( SELECT sensors_id , datetime , value + , value_count + , value_avg + , datetime_min , lat , lon FROM numbered WHERE rn = 1 ) SELECT l.sensors_id -, l.datetime -, l.value -, l.lat -, l.lon +, l.datetime_min -- first +, l.datetime -- last +, l.value -- last value +, l.value_count +, l.value_avg +, l.value -- min +, l.value -- max +, public.pt3857(lon, lat) FROM latest l -LEFT JOIN sensors_latest sl ON (l.sensors_id = sl.sensors_id) -WHERE sl.sensors_id IS NULL -OR l.datetime > sl.datetime +LEFT JOIN sensors_rollup sr ON (l.sensors_id = sr.sensors_id) +WHERE sr.sensors_id IS NULL +OR l.datetime > sr.datetime_last +OR l.datetime_min < sr.datetime_first ON CONFLICT (sensors_id) DO UPDATE -SET datetime = EXCLUDED.datetime -, value = EXCLUDED.value -, lat = EXCLUDED.lat -, lon = EXCLUDED.lon +SET datetime_last = GREATEST(sensors_rollup.datetime_last, EXCLUDED.datetime_last) +, value_latest = CASE WHEN EXCLUDED.datetime_last > sensors_rollup.datetime_last + THEN 
EXCLUDED.value_latest + ELSE sensors_rollup.value_latest + END +, geom_latest = CASE WHEN EXCLUDED.datetime_last > sensors_rollup.datetime_last + THEN EXCLUDED.geom_latest + ELSE sensors_rollup.geom_latest + END +, value_count = sensors_rollup.value_count + EXCLUDED.value_count +, value_min = LEAST(sensors_rollup.value_min, EXCLUDED.value_latest) +, value_max = GREATEST(sensors_rollup.value_max, EXCLUDED.value_latest) +, datetime_first = LEAST(sensors_rollup.datetime_first, EXCLUDED.datetime_first) , modified_on = now() --, fetchlogs_id = EXCLUDED.fetchlogs_id ; +-- Update the table that will help to track hourly rollups +-- INSERT INTO hourly_stats (datetime) +-- SELECT date_trunc('hour', datetime) +-- FROM temp_inserted_measurements +-- GROUP BY 1 +-- ON CONFLICT (datetime) DO UPDATE +-- SET modified_on = now(); + + --Update the export queue/logs to export these records --wrap it in a block just in case the database does not have this module installed --we subtract the second because the data is assumed to be time ending @@ -236,8 +363,88 @@ RETURNING 1) SELECT COUNT(1) INTO __exported_days FROM e; + __cache_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); +INSERT INTO ingest_stats ( + ingest_method + -- total + , total_measurements_processed + , total_measurements_inserted + , total_measurements_rejected + , total_nodes_processed + , total_nodes_inserted + , total_nodes_updated + , total_nodes_rejected + -- total times + , total_process_time_ms + , total_insert_time_ms + , total_cache_time_ms + -- latest + , latest_measurements_processed + , latest_measurements_inserted + , latest_measurements_rejected + , latest_nodes_processed + , latest_nodes_inserted + , latest_nodes_updated + , latest_nodes_rejected + -- times + , latest_process_time_ms + , latest_insert_time_ms + , latest_cache_time_ms + ) VALUES ( + -- totals + __ingest_method + , __total_measurements + , __inserted_measurements + , __rejected_measurements + , __total_nodes + , __inserted_nodes + , __updated_nodes + , __rejected_nodes + -- times + , __process_time_ms + , __insert_time_ms + , __cache_time_ms + -- latest + , __total_measurements + , __inserted_measurements + , __rejected_measurements + , __total_nodes + , __inserted_nodes + , __updated_nodes + , __rejected_nodes + -- times + , __process_time_ms + , __insert_time_ms + , __cache_time_ms +) ON CONFLICT (ingest_method) DO UPDATE SET + -- totals + total_measurements_processed = ingest_stats.total_measurements_processed + EXCLUDED.total_measurements_processed + , total_measurements_inserted = ingest_stats.total_measurements_inserted + EXCLUDED.total_measurements_inserted + , total_measurements_rejected = ingest_stats.total_measurements_rejected + EXCLUDED.total_measurements_rejected + , total_nodes_processed = ingest_stats.total_nodes_processed + EXCLUDED.total_nodes_processed + , total_nodes_inserted = ingest_stats.total_nodes_inserted + EXCLUDED.total_nodes_inserted + , total_nodes_updated = ingest_stats.total_nodes_updated + EXCLUDED.total_nodes_updated + , total_nodes_rejected = ingest_stats.total_nodes_rejected + EXCLUDED.total_nodes_rejected + , total_process_time_ms = ingest_stats.total_process_time_ms + EXCLUDED.total_process_time_ms + , total_insert_time_ms = ingest_stats.total_insert_time_ms + EXCLUDED.total_insert_time_ms + , total_cache_time_ms = ingest_stats.total_cache_time_ms + EXCLUDED.total_cache_time_ms + -- latest + , latest_measurements_processed = EXCLUDED.latest_measurements_processed + , latest_measurements_inserted = 
EXCLUDED.latest_measurements_inserted + , latest_measurements_rejected = EXCLUDED.latest_measurements_rejected + , latest_nodes_processed = EXCLUDED.latest_nodes_processed + , latest_nodes_inserted = EXCLUDED.latest_nodes_inserted + , latest_nodes_updated = EXCLUDED.latest_nodes_updated + , latest_nodes_rejected = EXCLUDED.latest_nodes_rejected + -- times + , latest_process_time_ms = EXCLUDED.latest_process_time_ms + , latest_insert_time_ms = EXCLUDED.latest_insert_time_ms + , latest_cache_time_ms = EXCLUDED.latest_cache_time_ms + , ingest_count = ingest_stats.ingest_count + 1 + , ingested_on = EXCLUDED.ingested_on; + RAISE NOTICE 'inserted-measurements: %, inserted-from: %, inserted-to: %, rejected-measurements: %, exported-sensor-days: %, process-time-ms: %, insert-time-ms: %, cache-time-ms: %, source: lcs' , __inserted_measurements @@ -249,8 +456,9 @@ RAISE NOTICE 'inserted-measurements: %, inserted-from: %, inserted-to: %, reject , __insert_time_ms , __cache_time_ms; + EXCEPTION WHEN OTHERS THEN - RAISE NOTICE 'Failed to export to logs: %', SQLERRM - USING HINT = 'Make sure that the open data module is installed'; + GET STACKED DIAGNOSTICS __error_context = PG_EXCEPTION_CONTEXT; + RAISE NOTICE 'Failed to ingest measurements: %, %', SQLERRM, __error_context; END $$; diff --git a/ingest/lcs_staging.sql b/ingest/lcs_staging.sql index 147fcb6..7b2e7b8 100644 --- a/ingest/lcs_staging.sql +++ b/ingest/lcs_staging.sql @@ -1,15 +1,24 @@ -CREATE TEMP TABLE IF NOT EXISTS ms_sensornodes ( +-- DROP TABLE IF EXISTS +-- ms_sensornodes +-- , ms_sensorsystems +-- , ms_sensors +-- , meas +-- , keys +-- , temp_inserted_measurements; + +CREATE {table} IF NOT EXISTS ms_sensornodes ( sensor_nodes_id int, ingest_id text, site_name text, source_name text, + source_id text, ismobile boolean, geom geometry, metadata jsonb, fetchlogs_id int ); -CREATE TEMP TABLE IF NOT EXISTS ms_sensorsystems ( +CREATE {table} IF NOT EXISTS ms_sensorsystems ( sensor_systems_id int, ingest_id text, ingest_sensor_nodes_id text, @@ -18,8 +27,7 @@ CREATE TEMP TABLE IF NOT EXISTS ms_sensorsystems ( fetchlogs_id int ); - -CREATE TEMP TABLE IF NOT EXISTS ms_sensors ( +CREATE {table} IF NOT EXISTS ms_sensors ( ingest_id text, sensors_id int, sensor_systems_id int, @@ -31,7 +39,7 @@ CREATE TEMP TABLE IF NOT EXISTS ms_sensors ( fetchlogs_id int ); -CREATE TEMP TABLE IF NOT EXISTS meas ( +CREATE {table} IF NOT EXISTS meas ( ingest_id text, sensors_id int, value float, @@ -41,7 +49,7 @@ CREATE TEMP TABLE IF NOT EXISTS meas ( fetchlogs_id int ); -CREATE TEMP TABLE IF NOT EXISTS keys ( +CREATE {table} IF NOT EXISTS keys ( fetchlogs_id int , key text , last_modified timestamptz @@ -51,7 +59,7 @@ CREATE TEMP TABLE IF NOT EXISTS keys ( -- actually been inserted into the measurements table -- this is to deal with the overlap that we see in the -- incoming files -CREATE TEMP TABLE IF NOT EXISTS temp_inserted_measurements ( +CREATE {table} IF NOT EXISTS temp_inserted_measurements ( sensors_id int , datetime timestamptz , value double precision diff --git a/ingest/settings.py b/ingest/settings.py index ced8420..1cebc47 100644 --- a/ingest/settings.py +++ b/ingest/settings.py @@ -24,6 +24,8 @@ class Settings(BaseSettings): METADATA_LIMIT: int = 10 REALTIME_LIMIT: int = 10 LOG_LEVEL: str = 'INFO' + USE_TEMP_TABLES: bool = True + PAUSE_INGESTING: bool = False @validator('DATABASE_READ_URL', allow_reuse=True) def get_read_url(cls, v, values): diff --git a/ingest/utils.py b/ingest/utils.py index 32a03df..f98982e 100644 --- a/ingest/utils.py +++ 
b/ingest/utils.py @@ -4,6 +4,7 @@ import logging from urllib.parse import unquote_plus import gzip +import uuid import boto3 from io import StringIO @@ -107,7 +108,7 @@ def clean_csv_value(value): def get_query(file, **params): - logger.debug("get_query: {file}, params: {params}") + logger.debug(f"get_query: {file}, params: {params}") query = Path(os.path.join(dir_path, file)).read_text() if params is not None and len(params) >= 1: query = query.format(**params) @@ -392,40 +393,23 @@ def load_errors_list(limit: int = 10): return rows -def load_fail(cursor, key, e): - print("full copy failed", key, e) +def load_fail(cursor, fetchlogsId, e): + logger.warning(f"full copy of {fetchlogsId} failed: {e}") cursor.execute( """ UPDATE fetchlogs - SET - last_message=%s - WHERE - key=%s + SET last_message=%s + , has_error = true + , completed_datetime = clock_timestamp() + WHERE fetchlogs_id=%s """, ( str(e), - key, + fetchlogsId, ), ) -# def load_success(cursor, key): -# cursor.execute( -# """ -# UPDATE fetchlogs -# SET -# last_message=%s, -# loaded_datetime=clock_timestamp() -# WHERE -# key=%s -# """, -# ( -# str(cursor.statusmessage), -# key, -# ), -# ) - - def load_success(cursor, keys, message: str = 'success'): cursor.execute( """ @@ -433,6 +417,7 @@ def load_success(cursor, keys, message: str = 'success'): SET last_message=%s , completed_datetime=clock_timestamp() + , has_error = false WHERE key=ANY(%s) """, ( @@ -442,6 +427,50 @@ def load_success(cursor, keys, message: str = 'success'): ) +def load_fetchlogs( + pattern: str, + limit: int = 250, + ascending: bool = False, +): + order = 'ASC' if ascending else 'DESC' + conn = psycopg2.connect(settings.DATABASE_WRITE_URL) + cur = conn.cursor() + batch_uuid = uuid.uuid4().hex + cur.execute( + f""" + UPDATE fetchlogs + SET loaded_datetime = CURRENT_TIMESTAMP + , jobs = jobs + 1 + , batch_uuid = %s + FROM ( + SELECT fetchlogs_id + FROM fetchlogs + WHERE key~E'{pattern}' + AND NOT has_error + AND completed_datetime is null + AND ( + loaded_datetime IS NULL + OR loaded_datetime < now() - '30min'::interval + ) + ORDER BY last_modified {order} nulls last + LIMIT %s + FOR UPDATE SKIP LOCKED + ) as q + WHERE q.fetchlogs_id = fetchlogs.fetchlogs_id + RETURNING fetchlogs.fetchlogs_id + , fetchlogs.key + , fetchlogs.last_modified; + """, + (batch_uuid, limit,), + ) + rows = cur.fetchall() + logger.debug(f'Loaded {len(rows)} from fetchlogs using {pattern}/{order}') + conn.commit() + cur.close() + conn.close() + return rows + + def add_fetchlog(key: str): with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: with connection.cursor() as cursor: @@ -500,6 +529,7 @@ def mark_success( SET last_message=%s , completed_datetime={completed} + , has_error = false WHERE {where} """, ( From 16769507ba26a64fc6f63be6580abb945feb8385 Mon Sep 17 00:00:00 2001 From: Christian Date: Wed, 25 Jan 2023 09:10:40 -0800 Subject: [PATCH 10/42] Updated ingester --- ingest/lcs.py | 47 +++++++++++++------------------------- ingest/lcs_ingest_full.sql | 5 +++- requirements_dev.txt | 1 + 3 files changed, 21 insertions(+), 32 deletions(-) diff --git a/ingest/lcs.py b/ingest/lcs.py index fdb0a1c..7b922c7 100644 --- a/ingest/lcs.py +++ b/ingest/lcs.py @@ -266,6 +266,7 @@ def get_metadata(self): id = obj["id"] last_modified = obj["LastModified"] try: + logger.debug(f"Loading station file: {id}:{key}") self.get_station(key, id) self.keys.append( { @@ -316,45 +317,29 @@ def load_metadata_bucketscan(count=100): break -def load_metadata_db(count=250, ascending: bool = False): +def 
load_metadata_db(limit=250, ascending: bool = False): order = 'ASC' if ascending else 'DESC' - with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: - connection.set_session(autocommit=True) - with connection.cursor() as cursor: - cursor.execute( - f""" - SELECT key - , last_modified - , fetchlogs_id - FROM fetchlogs - WHERE key~'lcs-etl-pipeline/stations/' - AND completed_datetime is null - ORDER BY last_modified {order} nulls last - LIMIT %s; - """, - (count,), - ) - rows = cursor.fetchall() - rowcount = cursor.rowcount - contents = [] - for row in rows: - contents.append( - { - "Key": unquote_plus(row[0]), - "LastModified": row[1], - "id": row[2], - } - ) - for notice in connection.notices: - logger.debug(notice) + pattern = 'lcs-etl-pipeline/stations/' + rows = load_fetchlogs(pattern, limit, ascending) + contents = [] + for row in rows: + logger.debug(row) + contents.append( + { + "Key": unquote_plus(row[1]), + "LastModified": row[2], + "id": row[0], + } + ) if len(contents) > 0: load_metadata(contents) # data = LCSData(contents) # data.get_metadata() - return rowcount + return len(rows) def load_metadata(keys): + logger.debug(f'Load metadata: {len(keys)}') data = LCSData(keys) data.get_metadata() diff --git a/ingest/lcs_ingest_full.sql b/ingest/lcs_ingest_full.sql index d186636..33cd5a6 100644 --- a/ingest/lcs_ingest_full.sql +++ b/ingest/lcs_ingest_full.sql @@ -39,7 +39,10 @@ OR ingest_sensor_systems_id IS NULL; -- something-blah-blah-blah -- and blah could be read as a paramter value UPDATE ms_sensornodes -SET source_id = split_ingest_id(ingest_id||'-station', 2); +SET source_id = CASE + WHEN source_name ~* 'purpleair|habitatmap' THEN ingest_id + ELSE split_ingest_id(ingest_id||'-station', 2) -- station is a placeholder + END; -- match the sensor nodes to get the sensor_nodes_id diff --git a/requirements_dev.txt b/requirements_dev.txt index 30ddf82..2d3c225 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1 +1,2 @@ +-r requirements.txt boto3 From 8c768d26fd7c81b26dc9f1003242adb341c52f48 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 21 Mar 2023 10:31:42 -0700 Subject: [PATCH 11/42] Adding batch methods for check --- check.py | 4 +++- ingest/fetch_ingest_full.sql | 38 +++++++++++++++++++++++++++++++-- ingest/lcs.py | 41 +++++++++++++++++++++++++++++++++++- 3 files changed, 79 insertions(+), 4 deletions(-) diff --git a/check.py b/check.py index f277418..60cd198 100644 --- a/check.py +++ b/check.py @@ -72,6 +72,7 @@ load_metadata, load_measurements, load_measurements_batch, + load_metadata_batch, ) from ingest.fetch import ( @@ -174,7 +175,8 @@ def check_realtime_key(key: str, fix: bool = False): ]) elif args.batch is not None: - load_measurements_batch(args.batch) + # load_measurements_batch(args.batch) + load_metadata_batch(args.batch) # Otherwise if we set the summary flag return a daily summary of errors elif args.summary: diff --git a/ingest/fetch_ingest_full.sql b/ingest/fetch_ingest_full.sql index 9d718d5..1f0e7e2 100644 --- a/ingest/fetch_ingest_full.sql +++ b/ingest/fetch_ingest_full.sql @@ -25,6 +25,7 @@ __process_time_ms int; __insert_time_ms int; __cache_time_ms int; __ingest_method text := 'realtime'; +__inserted_spatial_rollups int := 0; BEGIN -- REQUIRED @@ -243,7 +244,7 @@ __process_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_st -- File #5 -- ------------- -DROP TABLE IF EXISTS checkrealtime_matched; +--DROP TABLE IF EXISTS checkrealtime_matched; -- CREATE TABLE IF NOT EXISTS checkrealtime_matched ( -- 
sensor_nodes_id int -- , site_name text @@ -729,6 +730,38 @@ SET datetime_last = GREATEST(sensors_rollup.datetime_last, EXCLUDED.datetime_las --, fetchlogs_id = EXCLUDED.fetchlogs_id ; +\set gridsize 250.0 + +WITH spatial_inserts AS ( +INSERT INTO sensor_nodes_spatial_rollup ( +sensor_nodes_id +, geom +, cell_size +, start_datetime +, end_datetime +, measurements_count +, added_on) +SELECT sensor_nodes_id +, st_snaptogrid(s.geom, :gridsize) +, :gridsize +, MIN(datetime) as start_datetime +, MAX(datetime) as end_datetime +, COUNT(DISTINCT datetime) as measurements +, now() +FROM temp_inserted_measurements +JOIN tempfetchdata_sensors s USING (sensors_id) +JOIN sensor_systems ss USING (sensor_systems_id) +WHERE lat IS NOT NULL +AND lon IS NOT NULL +GROUP BY 1,2 +ON CONFLICT (sensor_nodes_id, geom) DO UPDATE SET + start_datetime = LEAST(sensor_nodes_spatial_rollup.start_datetime, EXCLUDED.start_datetime) +, end_datetime = GREATEST(sensor_nodes_spatial_rollup.end_datetime, EXCLUDED.end_datetime) +, measurements_count = sensor_nodes_spatial_rollup.measurements_count + EXCLUDED.measurements_count +, modified_on = now() +RETURNING 1) +SELECT COUNT(1) INTO __inserted_spatial_rollups +FROM spatial_inserts; -- Update the table that will help to track hourly rollups @@ -847,7 +880,7 @@ INSERT INTO ingest_stats ( -RAISE NOTICE 'inserted-measurements: %, deleted-timescaledb: %, deleted-future-measurements: %, deleted-past-measurements: %, from: %, to: %, inserted-from: %, inserted-to: %, updated-nodes: %, inserted-measurements: %, inserted-measurands: %, inserted-nodes: %, rejected-nodes: %, rejected-systems: %, rejected-sensors: %, exported-sensor-days: %, process-time-ms: %, source: fetch' +RAISE NOTICE 'inserted-measurements: %, deleted-timescaledb: %, deleted-future-measurements: %, deleted-past-measurements: %, from: %, to: %, inserted-from: %, inserted-to: %, updated-nodes: %, inserted-measurements: %, inserted-measurands: %, inserted-nodes: %, rejected-nodes: %, rejected-systems: %, rejected-sensors: %, exported-sensor-days: %, process-time-ms: %, inserted-spatial-rollups: %, source: fetch' , __total_measurements , __deleted_timescaledb , __deleted_future_measurements @@ -864,6 +897,7 @@ RAISE NOTICE 'inserted-measurements: %, deleted-timescaledb: %, deleted-future-m , __rejected_systems , __rejected_sensors , __exported_days + , __inserted_spatial_rollups , 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); END $$; diff --git a/ingest/lcs.py b/ingest/lcs.py index 7b922c7..804a418 100644 --- a/ingest/lcs.py +++ b/ingest/lcs.py @@ -338,10 +338,49 @@ def load_metadata_db(limit=250, ascending: bool = False): return len(rows) +def load_metadata_batch(batch: str): + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute( + """ + SELECT key + , last_modified + , fetchlogs_id + FROM fetchlogs + WHERE batch_uuid = %s + """, + (batch,), + ) + rows = cursor.fetchall() + rowcount = cursor.rowcount + contents = [] + for row in rows: + contents.append( + { + "Key": unquote_plus(row[0]), + "LastModified": row[1], + "id": row[2], + } + ) + for notice in connection.notices: + logger.debug(notice) + if len(contents) > 0: + load_metadata(contents) + # data = LCSData(contents) + # data.get_metadata() + return rowcount + + def load_metadata(keys): logger.debug(f'Load metadata: {len(keys)}') data = LCSData(keys) - data.get_metadata() + try: + data.get_metadata() + except Exception as e: + ids = 
','.join([str(k['id']) for k in keys]) + logger.error(f'load error: {e} ids: {ids}') + raise def select_object(key): From d4aecc36b4014f81f6c685c29b9380d4c64914c3 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Thu, 30 Mar 2023 06:28:39 -0700 Subject: [PATCH 12/42] Ingest improvements --- ingest/fetch_ingest_full.sql | 84 ++++++++++++++++++------------------ ingest/lcs_meas_ingest.sql | 12 +++--- ingest/utils.py | 1 + 3 files changed, 49 insertions(+), 48 deletions(-) diff --git a/ingest/fetch_ingest_full.sql b/ingest/fetch_ingest_full.sql index 1f0e7e2..06d1435 100644 --- a/ingest/fetch_ingest_full.sql +++ b/ingest/fetch_ingest_full.sql @@ -36,15 +36,6 @@ BEGIN -- File fetch_filter.sql -- --------------------------- --- This makes sense though we should track in case its systemic -WITH deletes AS ( - DELETE - FROM tempfetchdata - WHERE datetime > now() - RETURNING 1) -SELECT COUNT(1) INTO __deleted_future_measurements -FROM deletes; - -- this seems questionable, I dont want to pass data to this -- process only to have some of it filtered out because its too old -- Commenting this out because it will prevent us from submitting patch @@ -66,7 +57,8 @@ SELECT COUNT(1) INTO __total_measurements , __start_datetime , __end_datetime -FROM tempfetchdata; +FROM tempfetchdata +WHERE datetime <= now(); -- Now we start the old fetch_ingest#.sql files ------------- @@ -570,12 +562,21 @@ WHERE sensors_id IS NULL; __process_start := clock_timestamp(); +-- moved down +-- count the future measurements +SELECT COUNT(1) INTO __deleted_future_measurements +FROM tempfetchdata +WHERE datetime > now() +; + + WITH inserts AS ( INSERT INTO measurements (sensors_id, datetime, value) SELECT sensors_id , datetime , value FROM tempfetchdata + WHERE datetime <= now() ON CONFLICT DO NOTHING RETURNING sensors_id, datetime, value ), inserted as ( @@ -730,38 +731,37 @@ SET datetime_last = GREATEST(sensors_rollup.datetime_last, EXCLUDED.datetime_las --, fetchlogs_id = EXCLUDED.fetchlogs_id ; -\set gridsize 250.0 - -WITH spatial_inserts AS ( -INSERT INTO sensor_nodes_spatial_rollup ( -sensor_nodes_id -, geom -, cell_size -, start_datetime -, end_datetime -, measurements_count -, added_on) -SELECT sensor_nodes_id -, st_snaptogrid(s.geom, :gridsize) -, :gridsize -, MIN(datetime) as start_datetime -, MAX(datetime) as end_datetime -, COUNT(DISTINCT datetime) as measurements -, now() -FROM temp_inserted_measurements -JOIN tempfetchdata_sensors s USING (sensors_id) -JOIN sensor_systems ss USING (sensor_systems_id) -WHERE lat IS NOT NULL -AND lon IS NOT NULL -GROUP BY 1,2 -ON CONFLICT (sensor_nodes_id, geom) DO UPDATE SET - start_datetime = LEAST(sensor_nodes_spatial_rollup.start_datetime, EXCLUDED.start_datetime) -, end_datetime = GREATEST(sensor_nodes_spatial_rollup.end_datetime, EXCLUDED.end_datetime) -, measurements_count = sensor_nodes_spatial_rollup.measurements_count + EXCLUDED.measurements_count -, modified_on = now() -RETURNING 1) -SELECT COUNT(1) INTO __inserted_spatial_rollups -FROM spatial_inserts; + +-- WITH spatial_inserts AS ( +-- INSERT INTO sensor_nodes_spatial_rollup ( +-- sensor_nodes_id +-- , geom +-- , cell_size +-- , start_datetime +-- , end_datetime +-- , measurements_count +-- , added_on) +-- SELECT sensor_nodes_id +-- , st_snaptogrid(s.geom, 250) +-- , 250 +-- , MIN(datetime) as start_datetime +-- , MAX(datetime) as end_datetime +-- , COUNT(DISTINCT datetime) as measurements +-- , now() +-- FROM temp_inserted_measurements +-- JOIN tempfetchdata_sensors s USING (sensors_id) +-- JOIN sensor_systems ss 
USING (sensor_systems_id) +-- WHERE lat IS NOT NULL +-- AND lon IS NOT NULL +-- GROUP BY 1,2 +-- ON CONFLICT (sensor_nodes_id, geom) DO UPDATE SET +-- start_datetime = LEAST(sensor_nodes_spatial_rollup.start_datetime, EXCLUDED.start_datetime) +-- , end_datetime = GREATEST(sensor_nodes_spatial_rollup.end_datetime, EXCLUDED.end_datetime) +-- , measurements_count = sensor_nodes_spatial_rollup.measurements_count + EXCLUDED.measurements_count +-- , modified_on = now() +-- RETURNING 1) +-- SELECT COUNT(1) INTO __inserted_spatial_rollups +-- FROM spatial_inserts; -- Update the table that will help to track hourly rollups diff --git a/ingest/lcs_meas_ingest.sql b/ingest/lcs_meas_ingest.sql index 2bd6b3a..7702c75 100644 --- a/ingest/lcs_meas_ingest.sql +++ b/ingest/lcs_meas_ingest.sql @@ -330,12 +330,12 @@ SET datetime_last = GREATEST(sensors_rollup.datetime_last, EXCLUDED.datetime_las -- Update the table that will help to track hourly rollups --- INSERT INTO hourly_stats (datetime) --- SELECT date_trunc('hour', datetime) --- FROM temp_inserted_measurements --- GROUP BY 1 --- ON CONFLICT (datetime) DO UPDATE --- SET modified_on = now(); +INSERT INTO hourly_stats (datetime) + SELECT date_trunc('hour', datetime) + FROM temp_inserted_measurements + GROUP BY 1 +ON CONFLICT (datetime) DO UPDATE +SET modified_on = now(); --Update the export queue/logs to export these records diff --git a/ingest/utils.py b/ingest/utils.py index f98982e..83979d9 100644 --- a/ingest/utils.py +++ b/ingest/utils.py @@ -228,6 +228,7 @@ def get_object( return text + def put_object( data: str, key: str, From e4ef1f983712095cd45fb17ad5c4bdd08625e8c7 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 18 Jul 2023 12:32:49 -0700 Subject: [PATCH 13/42] Added start/end date lookup to realtime fetcher --- check.py | 6 +++ ingest/fetch_ingest_full.sql | 81 ++++++++++++++++++++++++++++++++---- 2 files changed, 79 insertions(+), 8 deletions(-) diff --git a/check.py b/check.py index 60cd198..2dcb3be 100644 --- a/check.py +++ b/check.py @@ -50,6 +50,8 @@ help='Show list of errors') parser.add_argument('--resubmit', action="store_true", help='Mark the fetchlogs file for resubmittal') +parser.add_argument('--keep', action="store_true", + help='Do not use TEMP tables for the ingest staging tables') args = parser.parse_args() if 'DOTENV' not in os.environ.keys() and args.env is not None: @@ -64,6 +66,9 @@ if args.debug: os.environ['LOG_LEVEL'] = 'DEBUG' +if args.keep: + os.environ['USE_TEMP_TABLES'] = 'False' + from botocore.exceptions import ClientError from ingest.handler import cronhandler, logger from ingest.settings import settings @@ -137,6 +142,7 @@ def check_realtime_key(key: str, fix: bool = False): mark_success(key=key, reset=True) +logger.debug(settings) # If we have passed an id than we check that if args.id is not None: # get the details for that id diff --git a/ingest/fetch_ingest_full.sql b/ingest/fetch_ingest_full.sql index 06d1435..72ef991 100644 --- a/ingest/fetch_ingest_full.sql +++ b/ingest/fetch_ingest_full.sql @@ -2,6 +2,8 @@ DO $$ DECLARE __process_start timestamptz := clock_timestamp(); +__min_measurement_date date := '1970-01-01'::date; +__max_measurement_date date := current_date + 1; __total_measurements int; __total_nodes int; __updated_nodes int; @@ -48,8 +50,15 @@ BEGIN -- SELECT COUNT(1) INTO __deleted_past_measurements -- FROM deletes; ----------------------------------- +-- use the partitions to determine start and end date +SELECT partition_start_date + , partition_end_date +INTO __min_measurement_date 
+ , __max_measurement_date +FROM data_table_stats +WHERE table_name = 'public.measurements'; +--------------------------------- -- start with simple count SELECT COUNT(1) , MIN(datetime) @@ -169,7 +178,7 @@ SELECT * FROM (SELECT FROM tempfetchdata_sensors WHERE geom IS NOT NULL GROUP BY - 1,2,3,4,5,6,7,8,9,st_snaptogrid(geom, .0001) + 1,2,3,4,5,6,7,8,9,st_snaptogrid(geom, .00001) ) AS wgeom UNION ALL SELECT * FROM @@ -303,7 +312,8 @@ UPDATE sensor_nodes s SET , providers_id = get_providers_id(COALESCE(t.source_name, s.source_name)) , modified_on = now() FROM tempfetchdata_nodes t -WHERE t.sensor_nodes_id = s.sensor_nodes_id AND +WHERE t.sensor_nodes_id = s.sensor_nodes_id +AND ( (s.geom IS NULL and t.geom IS NOT NULL) OR @@ -315,7 +325,7 @@ OR t.source_name, t.city, t.country, - t.metadata + t.metadata - ARRAY['imported','fetchlogs_id']::text[] ) IS DISTINCT FROM ( s.sensor_nodes_id, s.ismobile, @@ -323,13 +333,62 @@ OR s.source_name, s.city, s.country, - s.metadata - 'timezone' + s.metadata - ARRAY['imported','fetchlogs_id']::text[] ) ) RETURNING 1) SELECT COUNT(1) INTO __updated_nodes FROM updates; + +-- SELECT s.sensor_nodes_id +-- , t.site_name +-- , s.site_name +-- , t.metadata - ARRAY['imported','fetchlogs_id']::text[] as temp +-- , s.metadata - ARRAY['imported','fetchlogs_id']::text[] as node +-- FROM tempfetchdata_nodes t +-- JOIN sensor_nodes s ON (t.sensor_nodes_id = s.sensor_nodes_id) +-- WHERE (s.geom IS NULL and t.geom IS NOT NULL) +-- OR +-- ROW ( +-- t.sensor_nodes_id, +-- -- t.ismobile, +-- -- t.site_name, +-- -- t.source_name, +-- -- t.city, +-- -- t.country, +-- t.metadata - ARRAY['imported','fetchlogs_id']::text[] +-- ) IS DISTINCT FROM ( +-- s.sensor_nodes_id, +-- -- s.ismobile, +-- -- s.site_name, +-- -- s.source_name, +-- -- s.city, +-- -- s.country, +-- s.metadata - ARRAY['imported','fetchlogs_id']::text[] +-- ) +-- LIMIT 20; + +-- SELECT h.site_name +-- , n.site_name +-- , st_astext(h.geom) +-- , st_astext(n.geom) +-- , h.origin +-- , n.origin +-- , h.metadata - ARRAY['imported','fetchlogs_id']::text[] as history +-- , n.metadata - ARRAY['imported','fetchlogs_id']::text[] as current +-- FROM sensor_nodes_history h +-- JOIN sensor_nodes n USING (sensor_nodes_id) +-- WHERE created > now() - '2min'::interval; + +-- SELECT source_name +-- , COALESCE(jsonb_array_length(metadata->'attribution'), 0) as attributes +-- , COUNT(1) as n +-- FROM sensor_nodes +-- GROUP BY 1,2 +-- ORDER BY 2 DESC +-- LIMIT 500; + ------------- -- File #6 -- ------------- @@ -566,7 +625,12 @@ __process_start := clock_timestamp(); -- count the future measurements SELECT COUNT(1) INTO __deleted_future_measurements FROM tempfetchdata -WHERE datetime > now() +WHERE datetime > __max_measurement_date +; + + SELECT COUNT(1) INTO __deleted_past_measurements +FROM tempfetchdata +WHERE datetime < __min_measurement_date ; @@ -576,7 +640,8 @@ WITH inserts AS ( , datetime , value FROM tempfetchdata - WHERE datetime <= now() + WHERE datetime > __min_measurement_date + AND datetime < __max_measurement_date ON CONFLICT DO NOTHING RETURNING sensors_id, datetime, value ), inserted as ( @@ -880,7 +945,7 @@ INSERT INTO ingest_stats ( -RAISE NOTICE 'inserted-measurements: %, deleted-timescaledb: %, deleted-future-measurements: %, deleted-past-measurements: %, from: %, to: %, inserted-from: %, inserted-to: %, updated-nodes: %, inserted-measurements: %, inserted-measurands: %, inserted-nodes: %, rejected-nodes: %, rejected-systems: %, rejected-sensors: %, exported-sensor-days: %, process-time-ms: %, 
inserted-spatial-rollups: %, source: fetch' +RAISE NOTICE 'total-measurements: %, deleted-timescaledb: %, deleted-future-measurements: %, deleted-past-measurements: %, from: %, to: %, inserted-from: %, inserted-to: %, updated-nodes: %, inserted-measurements: %, inserted-measurands: %, inserted-nodes: %, rejected-nodes: %, rejected-systems: %, rejected-sensors: %, exported-sensor-days: %, process-time-ms: %, inserted-spatial-rollups: %, source: fetch' , __total_measurements , __deleted_timescaledb , __deleted_future_measurements From f6f54c4d593c65959f9527085e91b3cebec79830 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Thu, 7 Sep 2023 05:23:46 -0700 Subject: [PATCH 14/42] Cleaning up --- benchmark.py | 95 +++++++++++++++++++++ cdk/app.py | 7 ++ cdk/cdk.json | 2 +- cdk/config.py | 1 + cdk/lambda_ingest_stack.py | 12 ++- cdk/requirements.txt | 18 +--- cdk/utils.py | 6 +- check.py | 55 +++++++++++- ingest/fetch.py | 38 ++++++--- ingest/fetch_ingest_full.sql | 10 ++- ingest/fetch_staging.sql | 6 ++ ingest/lcs_ingest_full.sql | 4 + ingest/utils.py | 78 +++++++++++++++++ requirements_dev.txt | 1 + tests/benchmark.py | 43 ++++++++++ tests/benchmarking.r | 157 +++++++++++++++++++++++++++++++++++ tests/check_lcs_file.py | 59 ------------- tests/check_realtime_file.py | 111 ------------------------- 18 files changed, 494 insertions(+), 209 deletions(-) create mode 100644 benchmark.py create mode 100644 tests/benchmark.py create mode 100644 tests/benchmarking.r delete mode 100644 tests/check_lcs_file.py delete mode 100644 tests/check_realtime_file.py diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000..e58dbda --- /dev/null +++ b/benchmark.py @@ -0,0 +1,95 @@ +import logging +import os +import sys +import argparse +from time import time +import re + +logger = logging.getLogger(__name__) + +parser = argparse.ArgumentParser( + description=""" +Test benchmarks for ingestion + """) + +parser.add_argument( + '--name', + type=str, + required=False, + default="4xlarge", + help='Name to use for the test' + ) +parser.add_argument( + '--env', + type=str, + default='.env', + required=False, + help='The dot env file to use' + ) +parser.add_argument( + '--debug', + action="store_true", + help='Output at DEBUG level' + ) +args = parser.parse_args() + +if 'DOTENV' not in os.environ.keys() and args.env is not None: + os.environ['DOTENV'] = args.env + +if args.debug: + os.environ['LOG_LEVEL'] = 'DEBUG' + +from ingest.settings import settings +from fake import config, get_locations, as_realtime +from ingest.fetch import load_realtime + +logging.basicConfig( + format='[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s', + level=settings.LOG_LEVEL.upper(), + force=True, +) + +f = open(f"benchmark_ingest_output_{args.name}.csv", "w") +f.writelines("name,key,locations,inserted_nodes,updated_nodes,total_meas,inserted_meas,ingest_time,process_time,log_time,copy_time,load_process_time\n") +n = 10 +locations = [50, 250, 1000] +keys = [] +ii = 1 + +## make a set of files +for r in locations: + for i in range(n): + config(source=f"benchmark-test-{r}-{i+1}", gz=True) + l = get_locations(n=r) + key = as_realtime(l["locations"], l["latitude"], l["longitude"]) + keys.append({ "key": key, "locations": len(l["locations"]) }) + ii=+1 + + +## ingest each of the +for i, k in enumerate(keys): + key = k["key"] + locations = k["locations"] + logger.info(f"Ingesting {i+1} of {len(keys)}: {key} with {locations} locations") + + start_time = time() + copy_time, load_process_time, log_time, notice = 
load_realtime([ + (-1, key, None) + ]) + m = re.findall('([a-z-]+): (.+?),', notice) + + process_time = round(float(m[17][1])) + total_meas = int(m[0][1]) + inserted_meas = int(m[9][1]) + updated_nodes = int(m[8][1]) + inserted_nodes = int(m[11][1]) + ingest_time = round((time() - start_time)*1000) + f.writelines(f"'{args.name}','{key}',{locations},{inserted_nodes},{updated_nodes},{total_meas},{inserted_meas},{ingest_time},{process_time},{log_time},{copy_time},{load_process_time}\n") + + logger.info( + "loaded realtime records, timer: %0.4f, process: %0.4f", + ingest_time, process_time + ) + + +f.close() diff --git a/cdk/app.py b/cdk/app.py index d71236b..9b86548 100644 --- a/cdk/app.py +++ b/cdk/app.py @@ -3,6 +3,7 @@ Environment, Tags, ) +import os from lambda_ingest_stack import LambdaIngestStack @@ -19,6 +20,10 @@ app = aws_cdk.App() +env = Environment( + account=os.environ['CDK_DEFAULT_ACCOUNT'], + region=os.environ['CDK_DEFAULT_REGION'] + ) ingest = LambdaIngestStack( app, @@ -26,10 +31,12 @@ env_name=settings.ENV, lambda_env=lambda_env, fetch_bucket=settings.FETCH_BUCKET, + vpc_id=settings.VPC_ID, ingest_lambda_timeout=settings.INGEST_LAMBDA_TIMEOUT, ingest_lambda_memory_size=settings.INGEST_LAMBDA_MEMORY_SIZE, ingest_rate_minutes=settings.INGEST_RATE_MINUTES, topic_arn=settings.TOPIC_ARN, + env=env, ) Tags.of(ingest).add("project", settings.PROJECT) diff --git a/cdk/cdk.json b/cdk/cdk.json index f1770f9..289cf21 100644 --- a/cdk/cdk.json +++ b/cdk/cdk.json @@ -1,5 +1,5 @@ { - "app": "python3.8 app.py", + "app": "python app.py", "context": { "aws-cdk:enableDiffNoFail": "true", "@aws-cdk/core:stackRelativeExports": "true", diff --git a/cdk/config.py b/cdk/config.py index ccae88d..da05621 100644 --- a/cdk/config.py +++ b/cdk/config.py @@ -13,6 +13,7 @@ class Settings(BaseSettings): INGEST_RATE_MINUTES: int = 15 LOG_LEVEL: str = 'INFO' TOPIC_ARN: str = None + VPC_ID: str = None class Config: parent = Path(__file__).resolve().parent.parent diff --git a/cdk/lambda_ingest_stack.py b/cdk/lambda_ingest_stack.py index 3b2e380..e6b66b6 100644 --- a/cdk/lambda_ingest_stack.py +++ b/cdk/lambda_ingest_stack.py @@ -2,8 +2,10 @@ from typing import Dict from aws_cdk import ( + Environment, aws_lambda, aws_s3, + aws_ec2, Stack, Duration, aws_events, @@ -24,6 +26,7 @@ def __init__( self, scope: Construct, id: str, + env: Environment, env_name: str, lambda_env: Dict, fetch_bucket: str, @@ -31,11 +34,15 @@ def __init__( ingest_lambda_memory_size: int, ingest_rate_minutes: int = 15, topic_arn: str = None, + vpc_id: str = None, **kwargs, ) -> None: """Lambda plus cronjob to ingest metadata, realtime and pipeline data""" - super().__init__(scope, id, *kwargs) + super().__init__(scope, id, env=env,*kwargs) + + if vpc_id is not None: + vpc_id = aws_ec2.Vpc.from_lookup(self, f"{id}-vpc", vpc_id=vpc_id) ingest_function = aws_lambda.Function( self, @@ -58,7 +65,8 @@ def __init__( ], ), handler="ingest.handler.handler", - runtime=aws_lambda.Runtime.PYTHON_3_8, + vpc=vpc_id, + runtime=aws_lambda.Runtime.PYTHON_3_9, allow_public_subnet=True, memory_size=ingest_lambda_memory_size, environment=stringify_settings(lambda_env), diff --git a/cdk/requirements.txt b/cdk/requirements.txt index f44b370..87c952c 100644 --- a/cdk/requirements.txt +++ b/cdk/requirements.txt @@ -1,14 +1,4 @@ -attrs==21.4.0 -aws-cdk-lib==2.3.0 -aws-cdk.aws-apigatewayv2-alpha==2.3.0a0 -aws-cdk.aws-apigatewayv2-integrations-alpha==2.3.0a0 -cattrs==22.1.0 -constructs==10.1.16 -exceptiongroup==1.0.0rc7 -jsii==1.59.0 -publication==0.0.3 
-pydantic==1.9.1 -python-dateutil==2.8.2 -python-dotenv==0.20.0 -six==1.16.0 -typing_extensions==4.2.0 +aws-cdk-lib==2.87.0 +boto3 +pydantic==1.10 +python-dotenv diff --git a/cdk/utils.py b/cdk/utils.py index 42bea63..78318ce 100644 --- a/cdk/utils.py +++ b/cdk/utils.py @@ -26,13 +26,13 @@ def create_dependencies_layer( if not environ.get('SKIP_PIP'): print(f'Building {layer_id} from {requirements_file} into {output_dir}') subprocess.run( - f"""python3.8 -m pip install -qq -r {requirements_file} \ + f"""python -m pip install -qq -r {requirements_file} \ -t {output_dir}/python && \ cd {output_dir}/python && \ find . -type f -name '*.pyc' | \ while read f; do n=$(echo $f | \ sed 's/__pycache__\///' | \ - sed 's/.cpython-[2-3] [0-9]//'); \ + sed 's/.cpython-[2-3][0-9]//'); \ cp $f $n; \ done \ && find . -type d -a -name '__pycache__' -print0 | xargs -0 rm -rf \ @@ -47,5 +47,5 @@ def create_dependencies_layer( self, layer_id, code=layer_code, - compatible_runtimes=[aws_lambda.Runtime.PYTHON_3_8] + compatible_runtimes=[aws_lambda.Runtime.PYTHON_3_9] ) diff --git a/check.py b/check.py index 2dcb3be..0707775 100644 --- a/check.py +++ b/check.py @@ -1,7 +1,10 @@ import argparse import logging import os +import sys import orjson +import psycopg2 + logger = logging.getLogger(__name__) @@ -16,8 +19,12 @@ """) parser.add_argument('--id', type=int, required=False, help='The fetchlogs_id value') +parser.add_argument('--file', type=str, required=False, + help='A local file to load') parser.add_argument('--batch', type=str, required=False, help='The batch id value. Loads files based on batch uuid.') +parser.add_argument('--pattern', type=str, required=False, + help='A reqex to match keys for loading') parser.add_argument('--env', type=str, required=False, help='The dot env file to use') parser.add_argument('--profile', type=str, required=False, @@ -82,18 +89,23 @@ from ingest.fetch import ( load_realtime, + create_staging_table, parse_json, ) from ingest.utils import ( + load_fetchlogs, load_errors_list, load_errors_summary, load_rejects_summary, + get_data, get_object, put_object, get_logs_from_ids, get_logs_from_pattern, mark_success, + StringIteratorIO, + deconstruct_path, ) @@ -143,6 +155,16 @@ def check_realtime_key(key: str, fix: bool = False): logger.debug(settings) + +if args.file is not None: + # check if the files exists + # is it a realtime file or a lcs file? 
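+    # for now we hand the file straight to load_realtime with a
+    # placeholder fetchlogs_id of -1; get_data() accepts a local path,
+    # an s3://bucket/key url or a bare key in the configured ETL bucket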
+ # upload the file + load_realtime([ + (-1, args.file, None) + ]) + sys.exit() + # If we have passed an id than we check that if args.id is not None: # get the details for that id @@ -151,11 +173,12 @@ def check_realtime_key(key: str, fix: bool = False): keys = [log[1] for log in logs] # loop through and check each for idx, key in enumerate(keys): - print(key) if args.download: - print(f'downloading: {key}') - txt = get_object(key) - fpath = os.path.expanduser(f'~/Downloads/{key}') + logger.info(f'downloading: {key}') + # we may be using the new source pat + p = deconstruct_path(key) + txt = get_object(**p) + fpath = os.path.expanduser(f'~/Downloads/{p["bucket"]}/{p["key"]}') os.makedirs(os.path.dirname(fpath), exist_ok=True) with open(fpath.replace('.gz', ''), 'w') as f: f.write(txt) @@ -184,6 +207,30 @@ def check_realtime_key(key: str, fix: bool = False): # load_measurements_batch(args.batch) load_metadata_batch(args.batch) +elif args.pattern is not None: + keys = load_fetchlogs(pattern=args.pattern, limit=25, ascending=True) + # loop through and check each + for row in keys: + id = row[0] + key = row[1] + last = row[2] + logger.debug(f"{key}: {id}") + if args.load: + if 'realtime' in key: + load_realtime([ + (id, key, last) + ]) + elif 'stations' in key: + load_metadata([ + {"id": id, "Key": key, "LastModified": last} + ]) + else: + load_measurements([ + (id, key, last) + ]) + + + # Otherwise if we set the summary flag return a daily summary of errors elif args.summary: rows = load_errors_summary(args.n) diff --git a/ingest/fetch.py b/ingest/fetch.py index d0c3f50..3f70892 100644 --- a/ingest/fetch.py +++ b/ingest/fetch.py @@ -2,7 +2,7 @@ import io import os import logging -import time +from time import time from datetime import datetime, timedelta import orjson @@ -15,6 +15,7 @@ StringIteratorIO, clean_csv_value, get_query, + get_data, load_fail, load_success, load_fetchlogs, @@ -24,7 +25,7 @@ dir_path = os.path.dirname(os.path.realpath(__file__)) -logger = logging.getLogger(__name__) +logger = logging.getLogger('fetch') FETCH_BUCKET = settings.FETCH_BUCKET s3 = boto3.resource("s3") @@ -95,7 +96,7 @@ def create_staging_table(cursor): def copy_data(cursor, key, fetchlogsId=None): - obj = s3.Object(FETCH_BUCKET, key) + #obj = s3.Object(FETCH_BUCKET, key) # This should not be checked here, # if we ask it to copy data it should do that # if we want to prevent duplicate attemps we should @@ -106,12 +107,12 @@ def copy_data(cursor, key, fetchlogsId=None): # we are also removing the try/catch # if it fails we want to deal with it elsewhere logger.debug(f"Copying data for {key}") - with gzip.GzipFile(fileobj=obj.get()["Body"]) as gz: - f = io.BufferedReader(gz) + with get_data(key) as f: # make sure that the file is complete iterator = StringIteratorIO( (f"{fetchlogsId}\t"+parse_json(orjson.loads(line)) for line in f) ) + query = """ COPY tempfetchdata ( fetchlogs_id, @@ -131,6 +132,7 @@ def copy_data(cursor, key, fetchlogsId=None): avpd_value ) FROM STDIN; """ + logger.debug("Loading data from STDIN") cursor.copy_expert(query, iterator) @@ -213,7 +215,7 @@ def load_fetch_file(file: str): @app.command() def load_fetch_day(day: str): - start = time.time() + start = time() conn = boto3.client("s3") prefix = f"realtime-gzipped/{day}" keys = [] @@ -233,7 +235,7 @@ def load_fetch_day(day: str): create_staging_table(cursor) for key in keys: copy_data(cursor, key) - print(f"All data copied {time.time()-start}") + print(f"All data copied {time()-start}") filter_data(cursor) mindate, maxdate = 
process_data(cursor) update_rollups(cursor, mindate=mindate, maxdate=maxdate) @@ -302,20 +304,27 @@ def load_db(limit: int = 50, ascending: bool = False): def load_realtime(rows): # create a connection and share for all keys + logger.debug(f"Loading {len(rows)} keys") + log_time = -1 + process_time = -1 + copy_time = 0 with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: connection.set_session(autocommit=True) with connection.cursor() as cursor: # create all the data staging table create_staging_table(cursor) + logger.debug('Created realtime staging tables') # now copy all the data keys = [] - + start = time() for row in rows: key = row[1] fetchlogsId = row[0] + logger.debug(f"Loading {key}, id: {fetchlogsId}") try: copy_data(cursor, key, fetchlogsId) keys.append(key) + copy_time += (time() - start) except Exception as e: # all until now is lost # reset things and try to recover @@ -326,15 +335,20 @@ def load_realtime(rows): # finally process the data as one if len(keys) > 0: + logger.debug(f"Processing realtime files") + start = time() process_data(cursor) + process_time = time() - start # we are outputing some stats for notice in connection.notices: - print(notice) - # mark files as done - load_success(cursor, keys) + logger.info(notice) + # mark files as done + start = time() + load_success(cursor, keys) + log_time = time() - start # close and commit connection.commit() - + return round(copy_time*1000), round(process_time*1000), round(log_time*1000), notice if __name__ == "__main__": app() diff --git a/ingest/fetch_ingest_full.sql b/ingest/fetch_ingest_full.sql index 72ef991..f504ff7 100644 --- a/ingest/fetch_ingest_full.sql +++ b/ingest/fetch_ingest_full.sql @@ -25,6 +25,7 @@ __deleted_past_measurements int; __exported_days int; __process_time_ms int; __insert_time_ms int; +__node_time_ms int; __cache_time_ms int; __ingest_method text := 'realtime'; __inserted_spatial_rollups int := 0; @@ -135,7 +136,7 @@ AND st_y(geom) = 0; UPDATE tempfetchdata_sensors SET units = 'µg/m³' -WHERE units IN ('µg/m��','��g/m³'); +WHERE units IN ('µg/m��','��g/m³', 'ug/m3'); UPDATE tempfetchdata_sensors SET node_metadata = @@ -617,6 +618,7 @@ WHERE sensors_id IS NULL; --WHERE m.datetime = t.datetime --AND m.sensors_id = t.sensors_id; +__node_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); -- restart the clock to measure just inserts __process_start := clock_timestamp(); @@ -945,7 +947,7 @@ INSERT INTO ingest_stats ( -RAISE NOTICE 'total-measurements: %, deleted-timescaledb: %, deleted-future-measurements: %, deleted-past-measurements: %, from: %, to: %, inserted-from: %, inserted-to: %, updated-nodes: %, inserted-measurements: %, inserted-measurands: %, inserted-nodes: %, rejected-nodes: %, rejected-systems: %, rejected-sensors: %, exported-sensor-days: %, process-time-ms: %, inserted-spatial-rollups: %, source: fetch' +RAISE NOTICE 'total-measurements: %, deleted-timescaledb: %, deleted-future-measurements: %, deleted-past-measurements: %, from: %, to: %, inserted-from: %, inserted-to: %, updated-nodes: %, inserted-measurements: %, inserted-measurands: %, inserted-nodes: %, rejected-nodes: %, rejected-systems: %, rejected-sensors: %, exported-sensor-days: %, inserted-spatial-rollups: %, process-time-ms: %, insert-time-ms: %, cache-time-ms: %, source: fetch' , __total_measurements , __deleted_timescaledb , __deleted_future_measurements @@ -963,7 +965,9 @@ RAISE NOTICE 'total-measurements: %, deleted-timescaledb: %, deleted-future-meas , __rejected_sensors , 
__exported_days , __inserted_spatial_rollups - , 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + , __process_time_ms + , __insert_time_ms + , __cache_time_ms; END $$; diff --git a/ingest/fetch_staging.sql b/ingest/fetch_staging.sql index d6b717d..48ddc93 100644 --- a/ingest/fetch_staging.sql +++ b/ingest/fetch_staging.sql @@ -1,3 +1,9 @@ +-- DROP TABLE IF EXISTS tempfetchdata +-- , temp_inserted_measurements +-- , tempfetchdata_nodes +-- , tempfetchdata_sensors +-- , tempfetchdata_sensors_clean; + CREATE {table} IF NOT EXISTS tempfetchdata ( fetchlogs_id int, location text, diff --git a/ingest/lcs_ingest_full.sql b/ingest/lcs_ingest_full.sql index 33cd5a6..a9523fd 100644 --- a/ingest/lcs_ingest_full.sql +++ b/ingest/lcs_ingest_full.sql @@ -29,6 +29,10 @@ FROM ms_sensors WHERE ms_sensors.ingest_id IS NULL OR ingest_sensor_systems_id IS NULL; +UPDATE ms_sensors +SET units = 'µg/m³' +WHERE units IN ('µg/m��','��g/m³', 'ug/m3'); + -- first thing we want to do is to get the source -- and the source_id from the ingest id -- adding the station forces our method to treat the station as the parameter diff --git a/ingest/utils.py b/ingest/utils.py index 83979d9..b9741b7 100644 --- a/ingest/utils.py +++ b/ingest/utils.py @@ -1,5 +1,6 @@ import io import os +import sys from pathlib import Path import logging from urllib.parse import unquote_plus @@ -7,6 +8,7 @@ import uuid import boto3 +import re from io import StringIO import psycopg2 # import typer @@ -59,6 +61,7 @@ def read(self, n=None): return "".join(line) + def put_metric( namespace, metricname, @@ -210,12 +213,87 @@ def check_if_done(cursor, key): return False +def deconstruct_path(key: str): + is_local = os.path.isfile(key) + is_s3 = bool(re.match(r"s3://[a-zA-Z]+[a-zA-Z0-9_-]+/[a-zA-Z]+", key)) + is_csv = bool(re.search(r"\.csv(.gz)?$", key)) + is_json = bool(re.search(r"\.(nd)?json(.gz)?$", key)) + is_compressed = bool(re.search(r"\.gz$", key)) + path = {} + if is_local: + path["local"] = True + path["key"] = key + elif is_s3: + # pull out the bucket name + p = key.split("//")[1].split("/") + path["bucket"] = p.pop(0) + path["key"] = "/".join(p) + else: + # use the current bucket from settings + path["bucket"] = settings.ETL_BUCKET + path["key"] = key + + logger.debug(path) + return path + +def get_data(key: str): + # check to see if we were provided with a path that includes the source + # e.g. 
+ # s3://bucket/key + # local://drive/key + # /key (assume local) + # or no source + # key (no forward slash, assume etl bucket) + if re.match(r"local://[a-zA-Z]+", key): + key = key.replace("local://", "") + + is_local = os.path.isfile(key) + is_s3 = bool(re.match(r"s3://[a-zA-Z]+[a-zA-Z0-9_-]+/[a-zA-Z]+", key)) + #is_csv = bool(re.search(r"\.csv(.gz)?$", key)) + #is_json = bool(re.search(r"\.(nd)?json(.gz)?$", key)) + is_compressed = bool(re.search(r"\.gz$", key)) + logger.debug(f"checking - {key}\ns3: {is_s3}; is_local: {is_local}") + + if is_local: + return get_file(key) + elif is_s3: + # pull out the bucket name + path = key.split("//")[1].split("/") + bucket = path.pop(0) + key = "/".join(path) + else: + # use the current bucket from settings + bucket = settings.ETL_BUCKET + + # stream the file + logger.debug(f"streaming s3 file data from s3://{bucket}/{key}") + obj = s3.get_object( + Bucket=bucket, + Key=key, + ) + f = obj["Body"] + if is_compressed: + return gzip.GzipFile(fileobj=obj["Body"]) + else: + return obj["Body"] + + +def get_file(filepath: str): + is_compressed = bool(re.search(r"\.gz$", filepath)) + logger.debug(f"streaming local file data from {filepath}") + if is_compressed: + return gzip.open(filepath, 'rb') + else: + return io.open(filepath, "r", encoding="utf-8") + + def get_object( key: str, bucket: str = settings.ETL_BUCKET ): key = unquote_plus(key) text = '' + logger.debug(f"Getting {key} from {bucket}") obj = s3.get_object( Bucket=bucket, Key=key, diff --git a/requirements_dev.txt b/requirements_dev.txt index 2d3c225..7278aca 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,2 +1,3 @@ -r requirements.txt boto3 +numpy diff --git a/tests/benchmark.py b/tests/benchmark.py new file mode 100644 index 0000000..d8dfc34 --- /dev/null +++ b/tests/benchmark.py @@ -0,0 +1,43 @@ +import logging +import os +import sys +import argparse + +logger = logging.getLogger(__name__) + +parser = argparse.ArgumentParser( + description=""" +Test benchmarks for ingestion + """) + +parser.add_argument( + '--name', + type=str, + required=False, + default="test", + help='Name to use for the test' + ) +parser.add_argument( + '--env', + type=str, + default='.env', + required=False, + help='The dot env file to use' + ) +parser.add_argument( + '--debug', + action="store_true", + help='Output at DEBUG level' + ) +args = parser.parse_args() + +from ingest.settings import settings + +logging.basicConfig( + format='[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s', + level=settings.LOG_LEVEL.upper(), + force=True, +) + + +print(args) diff --git a/tests/benchmarking.r b/tests/benchmarking.r new file mode 100644 index 0000000..3b9a076 --- /dev/null +++ b/tests/benchmarking.r @@ -0,0 +1,157 @@ + +source("~/git/R/ESRfunctions.r") + +stats <- dir('tests/benchmark_output', pattern = "*stats.csv$", full.names=TRUE) + +params <- data.frame( + ram = c( + 2, 1, 0.25, 0.5, .02, .25, 0.25, 5, + 4,8,16,32,64,128,256, + 4,8,16,32,64,128,256, + 4,8,16,32,64,128,256, + 4,8,16,32,64,128,256, + 4,8,16,32,64,128,256, + 4,8,16,32,64,128,256 + ), + cores = c( + 16, 16, 16, 16, 4, 4, + 8, 8, 8, 8, 8, 8, 8, 8, 8, + 16, 16, 16, 16, 16, 16, 16, + 4, 4, 4, 4, 4, 4, 4, + 8, 8, 8, 8, 8, 8, 8, + 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16 + ), + x86 = c( + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, + FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE + ), + v1 = c( + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + ), + ingesting = c( + FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, + FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, + FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, + FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, + FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, + TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE + ), + row.names = c( + "4xlarge_stats.csv", "4xlarge1g_stats.csv", "4xlarge256mb_stats.csv", "4xlarge500mb_stats.csv", + "prod_stats.csv", "r5xlarge_stats.csv", "r6i2xlarge_stats.csv", "xxlarge-5gb_stats.csv", + "2xlarge-4MB_stats.csv", "2xlarge-8MB_stats.csv","2xlarge-16MB_stats.csv","2xlarge-32MB_stats.csv", + "2xlarge-64MB_stats.csv","2xlarge-128MB_stats.csv","2xlarge-256MB_stats.csv", + "4xlarge-4MB_stats.csv", "4xlarge-8MB_stats.csv","4xlarge-16MB_stats.csv","4xlarge-32MB_stats.csv", + "4xlarge-64MB_stats.csv","4xlarge-128MB_stats.csv","4xlarge-256MB_stats.csv", + "xlarge-4MB_stats.csv", "xlarge-8MB_stats.csv","xlarge-16MB_stats.csv","xlarge-32MB_stats.csv", + "xlarge-64MB_stats.csv","xlarge-128MB_stats.csv","xlarge-256MB_stats.csv", + "2xlargeV2-4MB_stats.csv", "2xlargeV2-8MB_stats.csv","2xlargeV2-16MB_stats.csv","2xlargeV2-32MB_stats.csv", + "2xlargeV2-64MB_stats.csv","2xlargeV2-128MB_stats.csv","2xlargeV2-256MB_stats.csv", + "2xlargeARM-4MB_stats.csv", "2xlargeARM-8MB_stats.csv","2xlargeARM-16MB_stats.csv","2xlargeARM-32MB_stats.csv", + "2xlargeARM-64MB_stats.csv","2xlargeARM-128MB_stats.csv","2xlargeARM-256MB_stats.csv", + "2xlargeX86-4MB_stats.csv", "2xlargeX86-8MB_stats.csv","2xlargeX86-16MB_stats.csv","2xlargeX86-32MB_stats.csv", + "2xlargeX86-64MB_stats.csv","2xlargeX86-128MB_stats.csv","2xlargeX86-256MB_stats.csv" + ) +) + +x <- do.call(rbind, lapply(stats, function(path) { + x <- read.csv(path) + x$path <- basename(path) + x[x$Name == 'Aggregated', ] + #x[x$Name == 'v2/locations/:id', ] + #x[x$Name == 'v2/latest/empty', ] +})) + +x$cores <- params[x$path, "cores"] +x$ram <- params[x$path, "ram"] +x$x86 <- params[x$path, "x86"] +x$v1 <- params[x$path, "v1"] +x$ingesting <- params[x$path, "ingesting"] + + +x <- x[x$path != "prod_stats.csv", ] + +x <- x[order(x$ram), ] + +plot(Average.Response.Time ~ cores, x) +plot(Requests.s ~ cores, x) + +plot(Average.Response.Time ~ ram, x) + +ncores <- 16 +plot(Requests.s ~ ram, subset(x, cores == ncores)) +plot(Average.Response.Time ~ ram, subset(x, cores == ncores)) +plot(Average.Response.Time ~ cores, subset(x, cores <= ncores), pch=cores, col=1) + +plot(Average.Response.Time ~ ram, subset(x, cores <= ncores), pch=cores, col=1) +legend('topright', legend=unique(x$cores), pch=unique(x$cores), bty='n', ncol=3) + +plot(Average.Response.Time ~ ram, subset(x, cores <= ncores), pch=19, col=as.numeric(1:nrow(x) %in% grep('V2', x$path))+1) +legend('topright', legend=c('V1', 'V2'), pch=19, col=1:2, bty='n', ncol=3) + +plot(Average.Response.Time ~ ram, subset(x, cores == ncores), pch=19, col=as.numeric(1:nrow(x) %in% grep('ARM', x$path))+1) +legend('topright', legend=c('x86', 'ARM'), pch=19, col=1:2, bty='n', ncol=3) + +plot(Average.Response.Time ~ ram, subset(x, 
cores == ncores), pch=19) +points(Average.Response.Time ~ ram, x[grep('ARM', x$path), ], pch=19, col='red') +legend('topright', legend=c('x86', 'ARM'), pch=19, col=1:2, bty='n', ncol=3) + +plot(Average.Response.Time ~ ram, subset(x, cores == ncores & ingesting), pch=19, col=x$x86+1) +legend('topright', legend=c('x86', 'ARM'), pch=19, col=1:2, bty='n', ncol=3) + + +points(Average.Response.Time ~ ram, x[grep('ARM', x$path), ], pch=19, col='red') +legend('topright', legend=c('x86', 'ARM'), pch=19, col=1:2, bty='n', ncol=3) + + +plot(X50. ~ ram, x) +plot(X75. ~ ram, subset(x, cores == ncores)) +plot(Request.Count ~ ram, subset(x, cores == ncores)) +plot(Failure.Count ~ ram, subset(x, cores == ncores)) +plot(ram ~ cores, x) + + +exporters <- dir('tests/benchmark_output', pattern = "*export_output*", full.names=TRUE) + +params <- data.frame( + ram = c( + 64, 0.128, .004, .004, + .004, 1, 20, 40, + 5, 8, .004, .004 + ), + cores = c( + 16, 16, 4, 4, + 2, 8, 8, 8, + 8, 8, 4, 4 + ), + row.names = c( + "4xlarge-wm64gb","4xlarge", "prod", "r5", + "small", "xxlarge-wm1g", "xxlarge-wm20g", "xxlarge-wm40g", + "xxlarge-wm5g", "xxlarge-wm8g", "xxlarge", "benchmark_export_output" + ) +) + +x <- do.call(rbind, lapply(exporters, function(path) { + x <- read.csv(path, quote="'") + x$path <- basename(path) + x$test <- gsub("benchmark_export_output_|.csv$", "", basename(path)) + return(x) +})) +x$cores = params[x$test,"cores"] +x$ram = params[x$test,"ram"] + +boxplot(time_ms~cores, x) +plot(I(time_ms/1000)~jitter(ram, 10),x, log="y") diff --git a/tests/check_lcs_file.py b/tests/check_lcs_file.py deleted file mode 100644 index 06fa454..0000000 --- a/tests/check_lcs_file.py +++ /dev/null @@ -1,59 +0,0 @@ -import logging -import sys -import os -import json - -if 'DOTENV' not in os.environ.keys(): - os.environ['DOTENV'] = '.env.testing' - -if 'AWS_PROFILE' not in os.environ.keys(): - os.environ['AWS_PROFILE'] = 'python-user' - -from pandas import DataFrame -from botocore.exceptions import ClientError -from openaq_fastapi.ingest.handler import cronhandler, logger -from openaq_fastapi.settings import settings - -from openaq_fastapi.ingest.lcs import ( - LCSData, - load_metadata_db, - load_measurements_db, - load_measurements_file, - load_measurements, - get_measurements, -) - - -from openaq_fastapi.ingest.utils import ( - load_errors, - select_object, - get_object, - get_logs_from_ids, - get_logs_from_pattern, - unquote_plus, -) - - -# load_realtime('realtime-gzipped/2022-02-04/1643994434.ndjson.gz') - -# logs = get_logs_from_pattern('stations/clarity', 2) -# - -# station data -# logs = get_logs_from_ids(ids=[5544399, 4874871]) - -# for each of them lets try and import the data -# contents = [] -# for row in logs: -# contents.append( -# {"Key": unquote_plus(row[1]), "LastModified": row[6], "id": row[0], } -# ) - -# data = LCSData(contents) -# data.get_metadata() - - -# measurement data -logs = get_logs_from_ids(ids=[5609404]) - -load_measurements(logs) diff --git a/tests/check_realtime_file.py b/tests/check_realtime_file.py deleted file mode 100644 index 880417e..0000000 --- a/tests/check_realtime_file.py +++ /dev/null @@ -1,111 +0,0 @@ -import logging -import sys -import os -import json - -if 'DOTENV' not in os.environ.keys(): - os.environ['DOTENV'] = '.env.testing' - -if 'AWS_PROFILE' not in os.environ.keys(): - os.environ['AWS_PROFILE'] = 'python-user' - -from botocore.exceptions import ClientError -from openaq_fastapi.ingest.handler import cronhandler, logger -from openaq_fastapi.settings import settings - -from 
openaq_fastapi.ingest.lcs import ( - load_metadata_db, - load_measurements_db, - load_measurements_file, - load_measurements, - get_measurements, -) - -from openaq_fastapi.ingest.fetch import ( - load_realtime, - parse_json, -) - -from openaq_fastapi.ingest.utils import ( - load_errors, - select_object, - get_object, - get_logs_from_ids, -) - - -# load_realtime('realtime-gzipped/2022-02-04/1643994434.ndjson.gz') - -logs = get_logs_from_ids(ids=[5634328]) - -# logs = load_errors() - -keys = [log[1] for log in logs] - -#load_realtime(keys) - -print(f"Found {len(keys)} potential errors") - -for idx, key in enumerate(keys): - print(f"\n## Checking #{idx}: {key}") - # get text of object - try: - txt = get_object(key) - except Exception as e: - print(f"\t*** Error getting file: {e}") - continue - # break into lines - lines = txt.split("\n") - # check parse for each line - n = len(lines) - errors = [] - for jdx, line in enumerate(lines): - try: - # first just try and load it - obj = json.loads(line) - except Exception as e: - errors.append(jdx) - print(f"\t*** Loading error on line #{jdx} (of {n}): {e}\n{line}") - try: - # then we can try to parse it - row = parse_json(obj) - except Exception as e: - errors.append(jdx) - print(f"\t*** Parsing rror on line #{jdx} (of {n}): {e}\n{line}") - - - -# load_realtime(keys) - # load_realtime([ - # 'realtime-gzipped/2022-02-05/1644020232.ndjson.gz', - # 'realtime-gzipped/2022-02-05/1644068231.ndjson.gz' - # ]) - -# errors = load_errors(10) - -# print(f"Found {len(errors)} possible error files") - -# for file in errors: -# key = file[3] -# print(f"Checking file {key}") -# try: -# obj = select_object(key) -# except ClientError as e: -# if e.response['Error']['Code'] == 'JSONParsingError': -# print("There was an error parsing the file, fetching as raw file") -# print(e.response['Error']) -# obj = get_object(key) -# else: -# print("Some other error") -# except Exception as e: -# print(f"post-boto error: {e}") -# obj = get_object(key) - -# print(obj[-50:]) -# # save the file locally -# filepath = os.path.join(settings.LOCAL_SAVE_DIRECTORY, key) -# print(f"Writing file to {filepath}") -# os.makedirs(os.path.dirname(filepath), exist_ok=True) -# fle = open(filepath.replace(".gz", ""), 'w') -# fle.write(obj) -# fle.close() From 2680b3d6d1f93acfd9b50203d199b255b6af7a10 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Sat, 9 Mar 2024 12:03:59 -0800 Subject: [PATCH 15/42] Temporary fix to the airgradient duplication issue --- ingest/lcs.py | 22 +++++++++++----------- ingest/lcs_meas_ingest.sql | 24 ++++++++++++++++++++---- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/ingest/lcs.py b/ingest/lcs.py index 804a418..2389917 100644 --- a/ingest/lcs.py +++ b/ingest/lcs.py @@ -169,7 +169,7 @@ def load_data(self): connection.set_session(autocommit=True) with connection.cursor() as cursor: start_time = time() - self.create_staging_table(cursor) + create_staging_table(cursor) write_csv( cursor, @@ -253,11 +253,6 @@ def process_data(self, cursor): query = get_query("lcs_ingest_full.sql") cursor.execute(query) - def create_staging_table(self, cursor): - cursor.execute(get_query( - "lcs_staging.sql", - table="TEMP TABLE" if settings.USE_TEMP_TABLES else 'TABLE' - )) def get_metadata(self): hasnew = False @@ -287,6 +282,14 @@ def get_metadata(self): self.load_data() + +def create_staging_table(cursor): + # table and batch are used primarily for testing + cursor.execute(get_query( + "lcs_staging.sql", + table="TEMP TABLE" if settings.USE_TEMP_TABLES else 
'TABLE' + )) + def write_csv(cursor, data, table, columns): fields = ",".join(columns) sio = StringIO() @@ -330,7 +333,7 @@ def load_metadata_db(limit=250, ascending: bool = False): "LastModified": row[2], "id": row[0], } - ) + ) if len(contents) > 0: load_metadata(contents) # data = LCSData(contents) @@ -562,10 +565,7 @@ def load_measurements(rows): connection.set_session(autocommit=True) with connection.cursor() as cursor: - cursor.execute(get_query( - "lcs_staging.sql", - table="TEMP TABLE" - )) + create_staging_table(cursor) write_csv( cursor, new, "keys", ["key",], diff --git a/ingest/lcs_meas_ingest.sql b/ingest/lcs_meas_ingest.sql index 7702c75..468b6ab 100644 --- a/ingest/lcs_meas_ingest.sql +++ b/ingest/lcs_meas_ingest.sql @@ -47,10 +47,26 @@ INTO __total_measurements FROM meas; -UPDATE meas -SET sensors_id=s.sensors_id -FROM sensors s -WHERE s.source_id=ingest_id; +-- The ranking is to deal with the current possibility +-- that duplicate sensors with the same ingest/source id are created + -- this is a short term fix + -- a long term fix would not allow duplicate source_id's +WITH ranked_sensors AS ( + SELECT s.sensors_id + , s.source_id + , RANK() OVER (PARTITION BY s.source_id ORDER BY added_on ASC) as rnk + FROM sensors s + JOIN meas m ON (s.source_id = m.ingest_id) + WHERE s.is_active +), active_sensors AS ( + SELECT source_id + , sensors_id + FROM ranked_sensors + WHERE rnk = 1) + UPDATE meas + SET sensors_id=s.sensors_id + FROM active_sensors s + WHERE s.source_id=ingest_id; -- first the sensor nodes From 60c931c0ada8152cae9e880d42f0ee8d5e9a4c46 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Thu, 25 Apr 2024 10:47:49 -0700 Subject: [PATCH 16/42] Updated the lcs class to support the newer data format New class is meant to be more flexible and to work for all file formats. 
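
For local testing the ingest entry points can also be driven by hand. A
minimal sketch (illustrative only; the key below is a placeholder, and a
key may be a plain bucket key, an s3://bucket/key url or a local file
path, see ingest/utils.get_data):

    from ingest.fetch import load_realtime

    # a fetchlogs_id of -1 marks an ad-hoc run that is not tied to a
    # fetchlogs row; the last_modified element is not needed here
    load_realtime([(-1, "path/to/realtime-file.ndjson.gz", None)])

The lcs/measurement path has an equivalent entry point in
ingest.lcs.load_measurements, which takes the same
(fetchlogs_id, key, last_modified) tuples.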
--- check.py | 7 +- ingest/etl_process_measurements.sql | 482 +++++++++++++++++ ingest/etl_process_nodes.sql | 342 ++++++++++++ ingest/lcsV2.py | 770 ++++++++++++++++++++++++++++ ingest/temp_locations_dump.sql | 52 ++ ingest/temp_measurements_dump.sql | 46 ++ local.py | 76 +++ matching_clarity_sensors.sql | 80 +++ 8 files changed, 1852 insertions(+), 3 deletions(-) create mode 100644 ingest/etl_process_measurements.sql create mode 100644 ingest/etl_process_nodes.sql create mode 100644 ingest/lcsV2.py create mode 100644 ingest/temp_locations_dump.sql create mode 100644 ingest/temp_measurements_dump.sql create mode 100644 local.py create mode 100644 matching_clarity_sensors.sql diff --git a/check.py b/check.py index 0707775..aa247df 100644 --- a/check.py +++ b/check.py @@ -174,11 +174,12 @@ def check_realtime_key(key: str, fix: bool = False): # loop through and check each for idx, key in enumerate(keys): if args.download: - logger.info(f'downloading: {key}') - # we may be using the new source pat + # we may be using the new source pat p = deconstruct_path(key) + download_path = f'~/Downloads/{p["bucket"]}/{p["key"]}'; + logger.info(f'downloading to {download_path}') txt = get_object(**p) - fpath = os.path.expanduser(f'~/Downloads/{p["bucket"]}/{p["key"]}') + fpath = os.path.expanduser(download_path) os.makedirs(os.path.dirname(fpath), exist_ok=True) with open(fpath.replace('.gz', ''), 'w') as f: f.write(txt) diff --git a/ingest/etl_process_measurements.sql b/ingest/etl_process_measurements.sql new file mode 100644 index 0000000..256240c --- /dev/null +++ b/ingest/etl_process_measurements.sql @@ -0,0 +1,482 @@ +-- lcs_meas_ingest +DO $$ +DECLARE +__process_start timestamptz := clock_timestamp(); +__total_measurements int; +__inserted_measurements int; +__rejected_measurements int := 0; +__rejected_nodes int := 0; +__total_nodes int := 0; +__updated_nodes int := 0; +__inserted_nodes int := 0; +__exported_days int; +__start_datetime timestamptz; +__end_datetime timestamptz; +__inserted_start_datetime timestamptz; +__inserted_end_datetime timestamptz; +__process_time_ms int; +__insert_time_ms int; +__cache_time_ms int; +__error_context text; +__ingest_method text := 'lcs'; +BEGIN + + +DELETE +FROM staging_measurements +WHERE ingest_id IS NULL +OR datetime is NULL +OR value IS NULL; + +--DELETE +--FROM staging_measurements +--WHERE datetime < '2018-01-01'::timestamptz +--OR datetime>now(); + +DELETE +FROM rejects +WHERE fetchlogs_id IN (SELECT fetchlogs_id FROM staging_measurements) +AND tbl ~* '^meas'; + + +SELECT COUNT(1) +, MIN(datetime) +, MAX(datetime) +INTO __total_measurements +, __start_datetime +, __end_datetime +FROM staging_measurements; + + +-- The ranking is to deal with the current possibility +-- that duplicate sensors with the same ingest/source id are created + -- this is a short term fix + -- a long term fix would not allow duplicate source_id's +WITH ranked_sensors AS ( + SELECT s.sensors_id + , s.source_id + , RANK() OVER (PARTITION BY s.source_id ORDER BY added_on ASC) as rnk + FROM sensors s + JOIN staging_measurements m ON (s.source_id = m.ingest_id) +), active_sensors AS ( + SELECT source_id + , sensors_id + FROM ranked_sensors + WHERE rnk = 1) + UPDATE staging_measurements + SET sensors_id=s.sensors_id + FROM active_sensors s + WHERE s.source_id=ingest_id; + +-- Now we have to fill in any missing information +-- first add the nodes and systems that dont exist +-- add just the bare minimum amount of data to the system +-- we assume that the node information will be added 
later +WITH nodes AS ( +INSERT INTO sensor_nodes ( + source_name +, site_name +, source_id +, metadata) +SELECT source_name +, source_name +, source_id +, jsonb_build_object('fetchlogs_id', MIN(fetchlogs_id)) +FROM staging_measurements +WHERE sensors_id IS NULL +GROUP BY 1,2,3 +ON CONFLICT (source_name, source_id) DO UPDATE +SET source_id = EXCLUDED.source_id +, metadata = EXCLUDED.metadata||COALESCE(sensor_nodes.metadata, '{}'::jsonb) +RETURNING sensor_nodes_id, source_id) +INSERT INTO sensor_systems ( + sensor_nodes_id +, source_id) +SELECT sensor_nodes_id +, source_id +FROM nodes +ON CONFLICT DO NOTHING; + +-- now create a sensor for each +-- this method depends on us having a match for the parameter +WITH sen AS ( + SELECT ingest_id + , source_name + , source_id + , measurand as parameter + FROM staging_measurements + WHERE sensors_id IS NULL + GROUP BY 1,2,3,4 +), inserts AS ( +INSERT INTO sensors (sensor_systems_id, measurands_id, source_id) +SELECT sy.sensor_systems_id +, m.measurands_id +, ingest_id +FROM sen s +JOIN measurands_map_view m ON (s.parameter = m.key) +JOIN sensor_nodes n ON (s.source_name = n.source_name AND s.source_id = n.source_id) +JOIN sensor_systems sy ON (sy.sensor_nodes_id = n.sensor_nodes_id AND s.source_id = sy.source_id) +ON CONFLICT DO NOTHING +RETURNING sensor_systems_id) +SELECT COUNT(DISTINCT sensor_systems_id) INTO __inserted_nodes +FROM inserts; + +-- try again to find the sensors +UPDATE staging_measurements +SET sensors_id=s.sensors_id +FROM sensors s +WHERE s.source_id=ingest_id +AND staging_measurements.sensors_id IS NULL; + + +SELECT COUNT(DISTINCT sensors_id) +INTO __total_nodes +FROM staging_measurements; + + +__process_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + +-- reject any missing. 
Most likely due to issues +-- with the measurand +WITH r AS ( +INSERT INTO rejects (t,tbl,r,fetchlogs_id) +SELECT + current_timestamp + , 'meas-missing-sensors-id' + , to_jsonb(staging_measurements) + , fetchlogs_id +FROM staging_measurements +WHERE sensors_id IS NULL +RETURNING 1) +SELECT COUNT(1) INTO __rejected_measurements +FROM r; + +-- restart the clock to measure just inserts +__process_start := clock_timestamp(); + +WITH inserts AS ( +INSERT INTO measurements ( + sensors_id, + datetime, + value, + lon, + lat +) SELECT + --DISTINCT + sensors_id, + datetime, + value, + lon, + lat +FROM staging_measurements +WHERE sensors_id IS NOT NULL +ON CONFLICT DO NOTHING +RETURNING sensors_id, datetime, value, lat, lon +), inserted as ( + INSERT INTO staging_inserted_measurements (sensors_id, datetime, value, lat, lon) + SELECT sensors_id + , datetime + , value + , lat + , lon + FROM inserts + RETURNING sensors_id, datetime +) +SELECT MIN(datetime) +, MAX(datetime) +, COUNT(1) +INTO __inserted_start_datetime +, __inserted_end_datetime +, __inserted_measurements +FROM inserted; + +__insert_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + +-- mark the fetchlogs as done +WITH inserted AS ( + SELECT m.fetchlogs_id + , COUNT(m.*) as n_records + , COUNT(t.*) as n_inserted + , MIN(m.datetime) as fr_datetime + , MAX(m.datetime) as lr_datetime + , MIN(t.datetime) as fi_datetime + , MAX(t.datetime) as li_datetime + FROM staging_measurements m + LEFT JOIN staging_inserted_measurements t ON (t.sensors_id = m.sensors_id AND t.datetime = m.datetime) + GROUP BY m.fetchlogs_id) +UPDATE fetchlogs +SET completed_datetime = CURRENT_TIMESTAMP +, inserted = COALESCE(n_inserted, 0) +, records = COALESCE(n_records, 0) +, first_recorded_datetime = fr_datetime +, last_recorded_datetime = lr_datetime +, first_inserted_datetime = fi_datetime +, last_inserted_datetime = li_datetime +FROM inserted +WHERE inserted.fetchlogs_id = fetchlogs.fetchlogs_id; + +-- track the time required to update cache tables +__process_start := clock_timestamp(); + +-- -- Now we can use those staging_inserted_measurements to update the cache tables +-- INSERT INTO sensors_latest ( +-- sensors_id +-- , datetime +-- , value +-- , lat +-- , lon +-- ) +-- ---- identify the row that has the latest value +-- WITH numbered AS ( +-- SELECT sensors_id +-- , datetime +-- , value +-- , lat +-- , lon +-- , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) as rn +-- FROM staging_inserted_measurements +-- ), latest AS ( +-- ---- only insert those rows +-- SELECT sensors_id +-- , datetime +-- , value +-- , lat +-- , lon +-- FROM numbered +-- WHERE rn = 1 +-- ) +-- SELECT l.sensors_id +-- , l.datetime +-- , l.value +-- , l.lat +-- , l.lon +-- FROM latest l +-- LEFT JOIN sensors_latest sl ON (l.sensors_id = sl.sensors_id) +-- WHERE sl.sensors_id IS NULL +-- OR l.datetime > sl.datetime +-- ON CONFLICT (sensors_id) DO UPDATE +-- SET datetime = EXCLUDED.datetime +-- , value = EXCLUDED.value +-- , lat = EXCLUDED.lat +-- , lon = EXCLUDED.lon +-- , modified_on = now() +-- --, fetchlogs_id = EXCLUDED.fetchlogs_id +-- ; +-- update the exceedances +INSERT INTO sensor_exceedances (sensors_id, threshold_value, datetime_latest) + SELECT + m.sensors_id + , t.value + , MAX(datetime) + FROM staging_inserted_measurements m + JOIN sensors s ON (m.sensors_id = s.sensors_id) + JOIN thresholds t ON (s.measurands_id = t.measurands_id) + AND m.value > t.value + GROUP BY 1, 2 + ON CONFLICT (sensors_id, threshold_value) DO UPDATE SET + 
datetime_latest = GREATEST(sensor_exceedances.datetime_latest, EXCLUDED.datetime_latest) + , updated_on = now(); + + +INSERT INTO sensors_rollup ( + sensors_id + , datetime_first + , datetime_last + , value_latest + , value_count + , value_avg + , value_min + , value_max + , geom_latest + ) +---- identify the row that has the latest value +WITH numbered AS ( + SELECT sensors_id + , datetime + , value + , lat + , lon + , sum(1) OVER (PARTITION BY sensors_id) as value_count + , min(datetime) OVER (PARTITION BY sensors_id) as datetime_min + , avg(value) OVER (PARTITION BY sensors_id) as value_avg + , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) as rn + FROM staging_inserted_measurements +), latest AS ( +---- only insert those rows + SELECT sensors_id + , datetime + , value + , value_count + , value_avg + , datetime_min + , lat + , lon + FROM numbered + WHERE rn = 1 +) +SELECT l.sensors_id +, l.datetime_min -- first +, l.datetime -- last +, l.value -- last value +, l.value_count +, l.value_avg +, l.value -- min +, l.value -- max +, public.pt3857(lon, lat) +FROM latest l +LEFT JOIN sensors_rollup sr ON (l.sensors_id = sr.sensors_id) +WHERE sr.sensors_id IS NULL +OR l.datetime > sr.datetime_last +OR l.datetime_min < sr.datetime_first +ON CONFLICT (sensors_id) DO UPDATE +SET datetime_last = GREATEST(sensors_rollup.datetime_last, EXCLUDED.datetime_last) +, value_latest = CASE WHEN EXCLUDED.datetime_last > sensors_rollup.datetime_last + THEN EXCLUDED.value_latest + ELSE sensors_rollup.value_latest + END +, geom_latest = CASE WHEN EXCLUDED.datetime_last > sensors_rollup.datetime_last + THEN EXCLUDED.geom_latest + ELSE sensors_rollup.geom_latest + END +, value_count = sensors_rollup.value_count + EXCLUDED.value_count +, value_min = LEAST(sensors_rollup.value_min, EXCLUDED.value_latest) +, value_max = GREATEST(sensors_rollup.value_max, EXCLUDED.value_latest) +, datetime_first = LEAST(sensors_rollup.datetime_first, EXCLUDED.datetime_first) +, modified_on = now() +--, fetchlogs_id = EXCLUDED.fetchlogs_id +; + + +-- Update the table that will help to track hourly rollups +INSERT INTO hourly_stats (datetime) + SELECT date_trunc('hour', datetime) + FROM staging_inserted_measurements + GROUP BY 1 +ON CONFLICT (datetime) DO UPDATE +SET modified_on = now(); + + +--Update the export queue/logs to export these records +--wrap it in a block just in case the database does not have this module installed +--we subtract the second because the data is assumed to be time ending +WITH e AS ( +INSERT INTO open_data_export_logs (sensor_nodes_id, day, records, measurands, modified_on) +SELECT sn.sensor_nodes_id +, ((m.datetime - '1sec'::interval) AT TIME ZONE (COALESCE(sn.metadata->>'timezone', 'UTC'))::text)::date as day +, COUNT(1) +, COUNT(DISTINCT p.measurands_id) +, MAX(now()) +FROM staging_inserted_measurements m -- meas m +JOIN sensors s ON (m.sensors_id = s.sensors_id) +JOIN measurands p ON (s.measurands_id = p.measurands_id) +JOIN sensor_systems ss ON (s.sensor_systems_id = ss.sensor_systems_id) +JOIN sensor_nodes sn ON (ss.sensor_nodes_id = sn.sensor_nodes_id) +GROUP BY sn.sensor_nodes_id +, ((m.datetime - '1sec'::interval) AT TIME ZONE (COALESCE(sn.metadata->>'timezone', 'UTC'))::text)::date +ON CONFLICT (sensor_nodes_id, day) DO UPDATE +SET records = EXCLUDED.records +, measurands = EXCLUDED.measurands +, modified_on = EXCLUDED.modified_on +RETURNING 1) +SELECT COUNT(1) INTO __exported_days +FROM e; + + +__cache_time_ms := 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); 
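+-- The ingest_stats upsert below keeps two sets of statistics per ingest
+-- method: the total_* columns accumulate across runs while the latest_*
+-- columns are overwritten with the values from this run.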
+ +INSERT INTO ingest_stats ( + ingest_method + -- total + , total_measurements_processed + , total_measurements_inserted + , total_measurements_rejected + , total_nodes_processed + , total_nodes_inserted + , total_nodes_updated + , total_nodes_rejected + -- total times + , total_process_time_ms + , total_insert_time_ms + , total_cache_time_ms + -- latest + , latest_measurements_processed + , latest_measurements_inserted + , latest_measurements_rejected + , latest_nodes_processed + , latest_nodes_inserted + , latest_nodes_updated + , latest_nodes_rejected + -- times + , latest_process_time_ms + , latest_insert_time_ms + , latest_cache_time_ms + ) VALUES ( + -- totals + __ingest_method + , __total_measurements + , __inserted_measurements + , __rejected_measurements + , __total_nodes + , __inserted_nodes + , __updated_nodes + , __rejected_nodes + -- times + , __process_time_ms + , __insert_time_ms + , __cache_time_ms + -- latest + , __total_measurements + , __inserted_measurements + , __rejected_measurements + , __total_nodes + , __inserted_nodes + , __updated_nodes + , __rejected_nodes + -- times + , __process_time_ms + , __insert_time_ms + , __cache_time_ms +) ON CONFLICT (ingest_method) DO UPDATE SET + -- totals + total_measurements_processed = ingest_stats.total_measurements_processed + EXCLUDED.total_measurements_processed + , total_measurements_inserted = ingest_stats.total_measurements_inserted + EXCLUDED.total_measurements_inserted + , total_measurements_rejected = ingest_stats.total_measurements_rejected + EXCLUDED.total_measurements_rejected + , total_nodes_processed = ingest_stats.total_nodes_processed + EXCLUDED.total_nodes_processed + , total_nodes_inserted = ingest_stats.total_nodes_inserted + EXCLUDED.total_nodes_inserted + , total_nodes_updated = ingest_stats.total_nodes_updated + EXCLUDED.total_nodes_updated + , total_nodes_rejected = ingest_stats.total_nodes_rejected + EXCLUDED.total_nodes_rejected + , total_process_time_ms = ingest_stats.total_process_time_ms + EXCLUDED.total_process_time_ms + , total_insert_time_ms = ingest_stats.total_insert_time_ms + EXCLUDED.total_insert_time_ms + , total_cache_time_ms = ingest_stats.total_cache_time_ms + EXCLUDED.total_cache_time_ms + -- latest + , latest_measurements_processed = EXCLUDED.latest_measurements_processed + , latest_measurements_inserted = EXCLUDED.latest_measurements_inserted + , latest_measurements_rejected = EXCLUDED.latest_measurements_rejected + , latest_nodes_processed = EXCLUDED.latest_nodes_processed + , latest_nodes_inserted = EXCLUDED.latest_nodes_inserted + , latest_nodes_updated = EXCLUDED.latest_nodes_updated + , latest_nodes_rejected = EXCLUDED.latest_nodes_rejected + -- times + , latest_process_time_ms = EXCLUDED.latest_process_time_ms + , latest_insert_time_ms = EXCLUDED.latest_insert_time_ms + , latest_cache_time_ms = EXCLUDED.latest_cache_time_ms + , ingest_count = ingest_stats.ingest_count + 1 + , ingested_on = EXCLUDED.ingested_on; + + +RAISE NOTICE 'inserted-measurements: %, inserted-from: %, inserted-to: %, rejected-measurements: %, exported-sensor-days: %, process-time-ms: %, insert-time-ms: %, cache-time-ms: %, source: lcs' + , __inserted_measurements + , __inserted_start_datetime + , __inserted_end_datetime + , __rejected_measurements + , __exported_days + , __process_time_ms + , __insert_time_ms + , __cache_time_ms; + + +EXCEPTION WHEN OTHERS THEN + GET STACKED DIAGNOSTICS __error_context = PG_EXCEPTION_CONTEXT; + RAISE NOTICE 'Failed to ingest measurements: %, %', SQLERRM, __error_context; + 
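+  -- note: because this block traps all errors, any rows written above are
+  -- rolled back to the start of the block and the failure is only surfaced
+  -- as a NOTICE to the calling ingest process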
+END $$; diff --git a/ingest/etl_process_nodes.sql b/ingest/etl_process_nodes.sql new file mode 100644 index 0000000..bf4b9cd --- /dev/null +++ b/ingest/etl_process_nodes.sql @@ -0,0 +1,342 @@ +-- lcs_ingest_full +DO $$ +DECLARE +__process_start timestamptz := clock_timestamp(); +__inserted_nodes int; +__inserted_sensors int; +__rejected_nodes int; +__rejected_systems int; +__rejected_sensors int; +__rejected_measurands int; + +BEGIN + +-------------------------- +-- lcs_ingest_nodes.sql -- +-------------------------- + +DELETE +FROM staging_sensornodes +WHERE staging_sensornodes.ingest_id IS NULL; + +DELETE +FROM staging_sensorsystems +WHERE staging_sensorsystems.ingest_id IS NULL +OR ingest_sensor_nodes_id IS NULL; + +DELETE +FROM staging_sensors +WHERE staging_sensors.ingest_id IS NULL +OR ingest_sensor_systems_id IS NULL; + +UPDATE staging_sensors +SET units = 'µg/m³' +WHERE units IN ('µg/m��','��g/m³', 'ug/m3'); + + + +-- match the locations to the nodes using the source_name/id combo +UPDATE staging_sensornodes +SET sensor_nodes_id = s.sensor_nodes_id +, timezones_id = s.timezones_id +, countries_id = s.countries_id +, is_new = false +, is_moved = st_astext(s.geom) != st_astext(staging_sensornodes.geom) +FROM sensor_nodes s +WHERE s.source_name = staging_sensornodes.source_name +AND s.source_id = staging_sensornodes.source_id +AND ( staging_sensornodes.matching_method IS NULL + OR staging_sensornodes.matching_method = 'ingest-id'); + + +-- now update them using the source + spatial method +UPDATE staging_sensornodes +SET sensor_nodes_id = s.sensor_nodes_id +, timezones_id = s.timezones_id +, countries_id = s.countries_id +, is_new = false +, is_moved = st_astext(s.geom) != st_astext(staging_sensornodes.geom) +FROM sensor_nodes s +WHERE s.source_name = staging_sensornodes.source_name +AND st_distance(staging_sensornodes.geom, s.geom) < 0.00001 -- about 1.11 meters difference +AND staging_sensornodes.matching_method = 'source-spatial'; + + +-- only update the nodes where the geom has changed +-- the geom queries are really slow so we dont want to be doing that all the time +-- ~18 locations per second +UPDATE staging_sensornodes SET + timezones_id = get_timezones_id(geom) +, countries_id = get_countries_id(geom) +WHERE is_new + OR is_moved + OR timezones_id IS NULL + OR countries_id IS NULL; + + +-- we are going to update the source_id where we are matching via geometry +-- for ingest-id matches this should not matter. 
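+-- (the COALESCEs below prefer the staged value and fall back to the
+-- existing sensor_nodes value, so nothing is overwritten with NULL)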
+UPDATE sensor_nodes +SET source_id = COALESCE(s.source_id, sensor_nodes.source_id) + , geom = COALESCE(s.geom, sensor_nodes.geom) + , site_name = COALESCE(s.site_name, sensor_nodes.site_name) + , timezones_id = COALESCE(s.timezones_id, sensor_nodes.timezones_id) + , countries_id = COALESCE(s.countries_id, sensor_nodes.countries_id) + , ismobile = COALESCE(s.ismobile, sensor_nodes.ismobile) + , metadata = COALESCE(s.metadata, '{}') || COALESCE(sensor_nodes.metadata, '{}') + , modified_on = now() +FROM staging_sensornodes s +WHERE sensor_nodes.sensor_nodes_id = s.sensor_nodes_id; + + +-- And now we insert those into the sensor nodes table +WITH inserts AS ( +INSERT INTO sensor_nodes ( + site_name +, source_name +, ismobile +, geom +, metadata +, source_id +, timezones_id +, providers_id +, countries_id +) +SELECT site_name +, source_name +, ismobile +, geom +, metadata +, source_id +, timezones_id +, get_providers_id(source_name) +, countries_id +FROM staging_sensornodes +WHERE sensor_nodes_id IS NULL +ON CONFLICT (source_name, source_id) DO UPDATE +SET + site_name=coalesce(EXCLUDED.site_name,sensor_nodes.site_name) + , source_id=COALESCE(EXCLUDED.source_id, sensor_nodes.source_id) + , ismobile=coalesce(EXCLUDED.ismobile,sensor_nodes.ismobile) + , geom=coalesce(EXCLUDED.geom,sensor_nodes.geom) + , metadata=COALESCE(sensor_nodes.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}') + , timezones_id = COALESCE(EXCLUDED.timezones_id, sensor_nodes.timezones_id) + , providers_id = COALESCE(EXCLUDED.providers_id, sensor_nodes.providers_id) + , modified_on = now() +RETURNING 1) +SELECT COUNT(1) INTO __inserted_nodes +FROM inserts; + +---------------------------- +-- lcs_ingest_systems.sql -- +---------------------------- + +-- fill in any new sensor_nodes_id +UPDATE staging_sensornodes +SET sensor_nodes_id = sensor_nodes.sensor_nodes_id +FROM sensor_nodes +WHERE staging_sensornodes.sensor_nodes_id is null +AND sensor_nodes.source_name = staging_sensornodes.source_name +AND sensor_nodes.source_id = staging_sensornodes.source_id; + +-- log anything we were not able to get an id for +WITH r AS ( +INSERT INTO rejects (t, tbl,r,fetchlogs_id) +SELECT now() +, 'staging_sensornodes-missing-nodes-id' +, to_jsonb(staging_sensornodes) +, fetchlogs_id +FROM staging_sensornodes +WHERE sensor_nodes_id IS NULL +RETURNING 1) +SELECT COUNT(1) INTO __rejected_nodes +FROM r; + +-------------------- +-- Sensor Systems -- +-------------------- + +-- make sure that we have a system entry for every ingest_id +-- this is to deal with fetchers that do not add these data +INSERT INTO staging_sensorsystems (sensor_nodes_id, ingest_id, fetchlogs_id, metadata) +SELECT sensor_nodes_id +, source_id -- the ingest_id has the source_name in it and we dont need/want that +, fetchlogs_id +, '{"note":"automatically added for sensor node"}' +FROM staging_sensornodes +WHERE is_new +ON CONFLICT (ingest_id) DO UPDATE + SET sensor_nodes_id = EXCLUDED.sensor_nodes_id; + +-- Now match the sensor nodes to the system +UPDATE staging_sensorsystems +SET sensor_nodes_id = staging_sensornodes.sensor_nodes_id +FROM staging_sensornodes +WHERE staging_sensorsystems.ingest_sensor_nodes_id = staging_sensornodes.ingest_id; + +-- And match to any existing sensor systems +UPDATE staging_sensorsystems +SET sensor_systems_id = sensor_systems.sensor_systems_id +, is_new = false +FROM sensor_systems +WHERE sensor_systems.sensor_nodes_id = staging_sensorsystems.sensor_nodes_id +AND sensor_systems.source_id = staging_sensorsystems.ingest_id; + + +-- log 
anything we were not able to get an id for +WITH r AS ( +INSERT INTO rejects (t,tbl,r,fetchlogs_id) +SELECT now() +, 'staging_sensorsystems-missing-nodes-id' +, to_jsonb(staging_sensorsystems) +, fetchlogs_id +FROM staging_sensorsystems +WHERE sensor_nodes_id IS NULL +RETURNING 1) +SELECT COUNT(1) INTO __rejected_systems +FROM r; + +-- And finally we add/update the sensor systems +INSERT INTO sensor_systems (sensor_nodes_id, source_id, metadata) +SELECT sensor_nodes_id +, ingest_id +, metadata +FROM staging_sensorsystems +WHERE sensor_nodes_id IS NOT NULL +GROUP BY sensor_nodes_id, ingest_id, metadata +ON CONFLICT (sensor_nodes_id, source_id) DO UPDATE SET + metadata=COALESCE(sensor_systems.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}') + , modified_on = now(); + +---------------------------- +-- lcs_ingest_sensors.sql -- +---------------------------- + +-- Match the sensor system data +UPDATE staging_sensorsystems +SET sensor_systems_id = sensor_systems.sensor_systems_id +FROM sensor_systems +WHERE staging_sensorsystems.sensor_systems_id IS NULL +AND staging_sensorsystems.sensor_nodes_id=sensor_systems.sensor_nodes_id +AND staging_sensorsystems.ingest_id=sensor_systems.source_id +; + +WITH r AS ( +INSERT INTO rejects (t, tbl,r,fetchlogs_id) +SELECT + now() +, 'staging_sensorsystems-missing-systems-id' +, to_jsonb(staging_sensorsystems) +, fetchlogs_id +FROM staging_sensorsystems +WHERE sensor_systems_id IS NULL +RETURNING 1) +SELECT COUNT(1) INTO __rejected_systems +FROM r; + +------------- +-- SENSORS -- +------------- + + -- We do not want to create default sensors because we are not dealling with measurements here +UPDATE staging_sensors +SET sensor_systems_id = staging_sensorsystems.sensor_systems_id +FROM staging_sensorsystems +WHERE staging_sensors.ingest_sensor_systems_id = staging_sensorsystems.ingest_id; + +WITH r AS ( +INSERT INTO rejects (t,tbl,r,fetchlogs_id) +SELECT + now() +, 'staging_sensors-missing-systems-id' +, to_jsonb(staging_sensors) +, fetchlogs_id +FROM staging_sensors +WHERE sensor_systems_id IS NULL +RETURNING 1) +SELECT COUNT(1) INTO __rejected_sensors +FROM r; + + +UPDATE staging_sensors +SET sensors_id = sensors.sensors_id +FROM sensors +WHERE sensors.sensor_systems_id=staging_sensors.sensor_systems_id +AND sensors.source_id = staging_sensors.ingest_id; + + +UPDATE staging_sensors +SET measurands_id = measurands.measurands_id +from measurands +WHERE staging_sensors.measurand=measurands.measurand +and staging_sensors.units=measurands.units; + + +WITH r AS ( +INSERT INTO rejects (t, tbl,r,fetchlogs_id) +SELECT + now() +, 'staging_sensors-missing-measurands-id' +, to_jsonb(staging_sensors) +, fetchlogs_id +FROM staging_sensors +WHERE measurands_id IS NULL +RETURNING 1) +SELECT COUNT(1) INTO __rejected_measurands +FROM r; + +WITH inserts AS ( +INSERT INTO sensors ( + source_id +, sensor_systems_id +, measurands_id +, metadata) +SELECT ingest_id +, sensor_systems_id +, measurands_id +, metadata +FROM staging_sensors +WHERE measurands_id is not null +AND sensor_systems_id is not null +GROUP BY ingest_id +, sensor_systems_id +, measurands_id +, metadata +ON CONFLICT (sensor_systems_id, measurands_id, source_id) DO UPDATE +SET metadata = COALESCE(sensors.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}') +RETURNING 1) +SELECT COUNT(1) INTO __inserted_sensors +FROM inserts; + +UPDATE staging_sensors +SET sensors_id = sensors.sensors_id +FROM sensors +WHERE sensors.sensor_systems_id=staging_sensors.sensor_systems_id +AND sensors.source_id = 
staging_sensors.ingest_id; + +WITH r AS ( +INSERT INTO rejects (t,tbl,r,fetchlogs_id) +SELECT + now() + , 'staging_sensors-missing-sensors-id' + , to_jsonb(staging_sensors) + , fetchlogs_id +FROM staging_sensors +WHERE sensors_id IS NULL +RETURNING 1) +SELECT COUNT(1) INTO __rejected_sensors +FROM r; + +------------------ +-- Return stats -- +------------------ + +RAISE NOTICE 'inserted-nodes: %, inserted-sensors: %, rejected-nodes: %, rejected-sensors: %, rejected-measurands: %, process-time-ms: %, source: lcs' + , __inserted_nodes + , __inserted_sensors + , __rejected_nodes + , __rejected_sensors + , __rejected_measurands + , 1000 * (extract(epoch FROM clock_timestamp() - __process_start)); + +END $$; diff --git a/ingest/lcsV2.py b/ingest/lcsV2.py new file mode 100644 index 0000000..f0f6c2d --- /dev/null +++ b/ingest/lcsV2.py @@ -0,0 +1,770 @@ +import os +import logging +from datetime import datetime, timezone +import dateparser +import pytz +import orjson +import uuid +import csv +from time import time +from urllib.parse import unquote_plus +import warnings +import re + +import boto3 +import psycopg2 +import typer +from io import StringIO +from .settings import settings +from .utils import ( + get_query, + clean_csv_value, + StringIteratorIO, + fix_units, + load_fetchlogs, + select_object, + get_file, +) + +s3 = boto3.resource("s3") +s3c = boto3.client("s3") + +app = typer.Typer() +dir_path = os.path.dirname(os.path.realpath(__file__)) + +FETCH_BUCKET = settings.ETL_BUCKET + +logger = logging.getLogger(__name__) + +warnings.filterwarnings( + "ignore", + message="The localize method is no longer necessary, as this time zone supports the fold attribute", +) + + +def to_geometry(key, data): + # could be passed as lat/lng or coordinates + if key in ['lat','lon']: + lat = data.get('lat') + lon = data.get('lon') + elif key == 'coordinates': + lat = data.get('coordinates', {}).get('lat') + lon = data.get('coordinates', {}).get('lon') + if None in [lat, lon]: + raise Exception('Missing value for coordinates') + # could add more checks + return f"SRID=4326;POINT({lon} {lat})" + +def to_timestamp(key, data): + dt = data.get(key) + value = None + if dt in [None, '']: + logger.warning('Passed none type value for timestamp') + # no need for exception, we check for nones later + return None; + if dt.isnumeric(): + if len(dt) == 13: + dt = datetime.fromtimestamp(int(dt)/1000.0, timezone.utc) + else: + dt = datetime.fromtimestamp(int(dt), timezone.utc) + else: + dt = dateparser.parse(dt).replace(tzinfo=timezone.utc) + + return dt.isoformat() + + +class IngestClient: + def __init__( + self, key=None, fetchlogs_id=None, data=None + ): + self.key = key + self.fetchlogs_id = fetchlogs_id + self.keys = [] + self.st = datetime.now().replace(tzinfo=pytz.UTC) + self.sensors = [] + self.systems = [] + self.nodes = [] + self.node_ids = {} + self.measurements = [] + self.matching_method = 'ingest-id' + self.source = None + self.node_map = { + "fetchlogs_id": {}, + "site_name": { "col":"site_name" }, + "source_name": {}, + "ismobile": {}, + "ingest_id": {}, + "matching_method": {}, + "location": {"col":"ingest_id"}, + "sensor_node_id": {"col":"ingest_id"}, + "label": {"col":"site_name"}, + "coordinates": {"col":"geom","func": to_geometry }, + "geometry": {"col":"geom", "func": to_geometry }, + "lat": {"col":"geom","func": to_geometry }, + "lon": {"col":"geom","func": to_geometry }, + } + self.measurement_map = { + "sensor_id": {"col": "ingest_id"}, + "ingest_id": {"col": "ingest_id"}, + "timestamp": {"col": 
"datetime", "func": to_timestamp }, + "datetime": {"col": "datetime", "func": to_timestamp }, + "measure": {"col": "value"}, + "value": {}, + "lat": {}, + "lon": {}, + } + # if fetchlogs_id but no key or data + # get key + # if key, load data + # if data + if data is not None and isinstance(data, dict): + self.load(data) + + def process(self, key, data, mp): + col = None + value = None + m = mp.get(key) + if m is not None: + col = m.get('col', key) + func = m.get('func') + if func is None: + # just return value + value = data.get(key) + else: + # functions require key and data + value = func(key, data) + return col, value + + def dump(self): + """ + Dump any data that is currenly loaded into the database + We will dump if there is data OR if we have loaded any keys + We do this because its possible that a file is empty but we + need to run the dump method to get the file to be marked as finished + """ + logger.debug(f"Dumping data from {len(self.keys)} files") + if len(self.nodes)>0 or len(self.keys)>0: + self.dump_locations() + if len(self.measurements)>0 or len(self.keys)>0: + self.dump_measurements() + + def dump_locations(self): + """ + Dump the nodes into the temporary tables + """ + logger.debug(f"Dumping {len(self.nodes)} nodes") + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + start_time = time() + + cursor.execute(get_query( + "temp_locations_dump.sql", + table="TEMP TABLE" if settings.USE_TEMP_TABLES else "TABLE" + )) + + write_csv( + cursor, + self.keys, + f"staging_keys", + [ + "key", + "last_modified", + "fetchlogs_id", + ], + ) + # update by id instead of key due to matching issue + cursor.execute( + """ + UPDATE fetchlogs + SET loaded_datetime = clock_timestamp() + , last_message = 'load_data' + WHERE fetchlogs_id IN (SELECT fetchlogs_id FROM staging_keys) + """ + ) + connection.commit() + + write_csv( + cursor, + self.nodes, + "staging_sensornodes", + [ + "ingest_id", + "site_name", + "matching_method", + "source_name", + "source_id", + "ismobile", + "geom", + "metadata", + "fetchlogs_id", + ], + ) + + write_csv( + cursor, + self.systems, + "staging_sensorsystems", + [ + "ingest_id", + "ingest_sensor_nodes_id", + "metadata", + "fetchlogs_id", + ], + ) + write_csv( + cursor, + self.sensors, + "staging_sensors", + [ + "ingest_id", + "ingest_sensor_systems_id", + "measurand", + "units", + "metadata", + "fetchlogs_id", + ], + ) + connection.commit() + + # and now we load all the nodes,systems and sensors + query = get_query("etl_process_nodes.sql") + cursor.execute(query) + + for notice in connection.notices: + logger.debug(notice) + + cursor.execute( + """ + UPDATE fetchlogs + SET completed_datetime = clock_timestamp() + , last_message = NULL + WHERE fetchlogs_id IN (SELECT fetchlogs_id FROM staging_keys) + """ + ) + + connection.commit() + logger.info("dump_locations: locations: %s; time: %0.4f", len(self.nodes), time() - start_time) + for notice in connection.notices: + logger.debug(notice) + + + def dump_measurements(self): + logger.debug(f"Dumping {len(self.measurements)} measurements") + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + start_time = time() + + cursor.execute(get_query( + "temp_measurements_dump.sql", + table="TEMP TABLE" if settings.USE_TEMP_TABLES else 'TABLE' + )) + + iterator = StringIteratorIO( + (to_tsv(line) for line in self.measurements) + ) + 
cursor.copy_expert( + """ + COPY staging_measurements (ingest_id, source_name, source_id, measurand, value, datetime, lon, lat, fetchlogs_id) + FROM stdin; + """, + iterator, + ) + + # process the measurements + logger.info(f'processing {len(self.measurements)} measurements'); + query = get_query("etl_process_measurements.sql") + try: + cursor.execute(query) + connection.commit() + logger.info("dump_measurements: measurements: %s; time: %0.4f", len(self.measurements), time() - start_time) + for notice in connection.notices: + logger.debug(notice) + + except Exception as err: + logger.error(err) + + def load(self, data = {}): + if "meta" in data.keys(): + self.load_metadata(data.get('meta')) + if "locations" in data.keys(): + self.load_locations(data.get('locations')) + if "measures" in data.keys(): + self.load_measurements(data.get('measures')) + + def load_keys(self, rows): + # for each fetchlog we need to read and load + for row in rows: + key = row[1] + fetchlogs_id = row[0] + last_modified = row[2] + self.load_key(key, fetchlogs_id, last_modified) + + + def load_key(self, key, fetchlogs_id, last_modified): + logger.debug(f"Loading key: {fetchlogs_id}//:{key}") + is_csv = bool(re.search(r"\.csv(.gz)?$", key)) + is_json = bool(re.search(r"\.(nd)?json(.gz)?$", key)) + + #if file_exists(key): + content = get_file(key).read() + #else: + #content = select_object(key) + self.fetchlogs_id = fetchlogs_id + + if is_csv: + # all csv data will be measurements + for rw in csv.reader(content.split("\n")): + self.add_measurement(rw) + elif is_json: + # all json data should just be parsed and loaded + data = orjson.loads(content) + self.load(data) + else: + raise Exception('No idea what to do') + + # add the key to the table to update + self.keys.append({"key": key, "last_modified": last_modified, "fetchlogs_id": fetchlogs_id}) + + + + def load_metadata(self, meta): + if "source" in meta.keys(): + self.source = meta.get('source') + if "matching_method" in meta.keys(): + self.matching_method = meta.get('matching_method') + if "schema" in meta.keys(): + self.schema = meta.get('schema') + + def load_locations(self, locations): + for loc in locations: + self.add_node(loc) + + def load_measurements(self, measurements): + for meas in measurements: + self.add_measurement(meas) + + def add_sensor(self, j, system_id, fetchlogsId): + for s in j: + sensor = {} + metadata = {} + sensor["ingest_sensor_systems_id"] = system_id + sensor["fetchlogs_id"] = fetchlogsId + for key, value in s.items(): + key = str.replace(key, "sensor_", "") + if key == "id": + sensor["ingest_id"] = value + elif key == "measurand_parameter": + sensor["measurand"] = value + elif key == "measurand_unit": + sensor["units"] = fix_units(value) + else: + metadata[key] = value + sensor["metadata"] = orjson.dumps(metadata).decode() + self.sensors.append(sensor) + + def add_system(self, j, node_id, fetchlogsId): + for s in j: + system = {} + metadata = {} + if "sensor_system_id" in s: + id = s["sensor_system_id"] + else: + id = node_id + system["ingest_sensor_nodes_id"] = node_id + system["ingest_id"] = id + system["fetchlogs_id"] = fetchlogsId + for key, value in s.items(): + key = str.replace(key, "sensor_system_", "") + if key == "sensors": + self.add_sensor(value, id, fetchlogsId) + else: + metadata[key] = value + system["metadata"] = orjson.dumps(metadata).decode() + self.systems.append(system) + + def add_node(self, j): + fetchlogs_id = j.get('fetchlogs_id', self.fetchlogs_id) + node = { "fetchlogs_id": fetchlogs_id } + metadata = {} + mp = 
self.node_map + + for k, v in j.items(): + # pass the whole measure + col, value = self.process(k, j, self.node_map) + if col is not None: + node[col] = value + else: + metadata[k] = v + + # make sure we actually have data to add + if len(node.keys())>0: + # check for id + ingest_id = node.get('ingest_id') + if ingest_id is None: + raise Exception('Missing ingest id') + + ingest_arr = ingest_id.split('-') + # source name could be set explicitly + # or in the ingest id + # or in the metadata + if node.get('source_name') is None: + if len(ingest_arr)>1: + node['source_name'] = ingest_arr[0] + elif self.source is not None: + node['source_name'] = self.source + else: + raise Exception('Could not find source name') + + # support ingest id that is just the source id + if node.get('source_id') is None: + if len(ingest_arr)>1: + node['source_id'] = ingest_arr[1] + else: + node['source_id'] = ingest_arr[0] + + if node.get('matching_method') is None: + node['matching_method'] = self.matching_method + + # prevent adding the node more than once + # this does not save processing time of course + # logger.debug(node) + if ingest_id not in self.node_ids: + node["metadata"] = orjson.dumps(metadata).decode() + self.node_ids[ingest_id] = True + self.nodes.append(node) + # now look for systems + if "sensor_system" in j.keys(): + self.system(j.get('sensor_system'), node.get('ingest_id'), node.get('fetchlogs_id')) + else: + logger.warning('nothing mapped to node') + + + def add_measurement(self, m): + # create a row with + # ingest_id,datetime,value,lon,lat + # where ingest id will be what links to the sensor + meas = {} + lat = None + lon = None + + # csv method + if isinstance(m, list): + if len(m) < 3: + logger.warning(f'Not enough data in list value: {m}') + return + + fetchlogs_id = self.fetchlogs_id + ingest_id = m[0] + value = m[1] + # using the same key/data format as below + datetime = to_timestamp('dt', {"dt": m[2]}) + if len(m) == 5: + lat = m[3] + lon = m[4] + + elif isinstance(m, dict): + for k, v in m.items(): + # pass the whole measure + col, value = self.process(k, m, self.measurement_map) + if col is not None: + meas[col] = value + + ingest_id = meas.get('ingest_id') + datetime = meas.get('datetime') + value = meas.get('value') + lon = meas.get('lon', None) + lat = meas.get('lat', None) + fetchlogs_id = m.get('fetchlogs_id', self.fetchlogs_id) + + # parse the ingest id here + ingest_arr = ingest_id.split('-') + if len(ingest_arr) < 3: + logger.warning(f'Not enough information in ingest-id: `{ingest_id}`') + return + + source_name = ingest_arr[0] + source_id = ingest_arr[1] + measurand = ingest_arr[2] + + if not None in [ingest_id, datetime, source_name, source_id, measurand]: + self.measurements.append([ingest_id, source_name, source_id, measurand, value, datetime, lon, lat, fetchlogs_id]) + + + def get_metadata(self): + hasnew = False + for obj in self.page: + key = obj["Key"] + id = obj["id"] + last_modified = obj["LastModified"] + try: + logger.debug(f"Loading station file: {id}:{key}") + self.get_station(key, id) + self.keys.append( + { + "key": key, + "last_modified": last_modified, + "fetchlogs_id": id + } + ) + hasnew = True + except Exception as e: + # catch and continue to next page + logger.error( + f"Could not process file: {id}: {key}: {e}" + ) + + if hasnew: + logger.debug(f"get_metadata:hasnew - {self.keys}") + self.load_data() + + + + + + +def create_staging_table(cursor): + # table and batch are used primarily for testing + cursor.execute(get_query( + "etl_staging_v2.sql", + 
table="TEMP TABLE" if settings.USE_TEMP_TABLES else 'TABLE' + )) + +def write_csv(cursor, data, table, columns): + fields = ",".join(columns) + sio = StringIO() + writer = csv.DictWriter(sio, columns) + writer.writerows(data) + sio.seek(0) + cursor.copy_expert( + f""" + copy {table} ({fields}) from stdin with csv; + """, + sio, + ) + logger.debug(f"table: {table}; rowcount: {cursor.rowcount}") + + +def load_metadata_bucketscan(count=100): + paginator = s3c.get_paginator("list_objects_v2") + for page in paginator.paginate( + Bucket=FETCH_BUCKET, + Prefix="lcs-etl-pipeline/stations", + PaginationConfig={"PageSize": count}, + ): + try: + contents = page["Contents"] + data = LCSData(contents) + data.get_metadata() + except KeyError: + break + + +def load_metadata_db(limit=250, ascending: bool = False): + order = 'ASC' if ascending else 'DESC' + pattern = 'lcs-etl-pipeline/stations/' + rows = load_fetchlogs(pattern, limit, ascending) + contents = [] + for row in rows: + logger.debug(row) + contents.append( + { + "Key": unquote_plus(row[1]), + "LastModified": row[2], + "id": row[0], + } + ) + if len(contents) > 0: + load_metadata(contents) + # data = LCSData(contents) + # data.get_metadata() + return len(rows) + + +def load_metadata_batch(batch: str): + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute( + """ + SELECT key + , last_modified + , fetchlogs_id + FROM fetchlogs + WHERE batch_uuid = %s + """, + (batch,), + ) + rows = cursor.fetchall() + rowcount = cursor.rowcount + contents = [] + for row in rows: + contents.append( + { + "Key": unquote_plus(row[0]), + "LastModified": row[1], + "id": row[2], + } + ) + for notice in connection.notices: + logger.debug(notice) + if len(contents) > 0: + load_metadata(contents) + # data = LCSData(contents) + # data.get_metadata() + return rowcount + + +def load_metadata(keys): + logger.debug(f'Load metadata: {len(keys)}') + data = LCSData(keys) + try: + data.get_metadata() + except Exception as e: + ids = ','.join([str(k['id']) for k in keys]) + logger.error(f'load error: {e} ids: {ids}') + raise + + +def get_measurements(key, fetchlogsId): + start = time() + content = select_object(key) + fetch_time = time() - start + + ret = [] + start = time() + for row in csv.reader(content.split("\n")): + if len(row) not in [3, 5]: + continue + if len(row) == 5: + try: + lon = float(row[3]) + lat = float(row[4]) + if not ( + lon is None + or lat is None + or lat == "" + or lon == "" + or lon == 0 + or lat == 0 + or lon < -180 + or lon > 180 + or lat < -90 + or lat > 90 + ): + row[3] = lon + row[4] = lat + else: + row[3] = None + row[4] = None + except Exception: + row[3] = None + row[4] = None + else: + row.insert(3, None) + row.insert(4, None) + if row[0] == "" or row[0] is None: + continue + dt = row[2] + + try: + if dt.isnumeric(): + if len(dt) == 13: + dt = datetime.fromtimestamp(int(dt)/1000.0, timezone.utc) + else: + dt = datetime.fromtimestamp(int(dt), timezone.utc) + row[2] = dt.isoformat() + except Exception: + try: + dt = dateparser.parse(dt).replace(tzinfo=timezone.utc) + except Exception: + logger.warning(f"Exception in parsing date for {dt} {Exception}") + + #row[2] = dt.isoformat() + # addd the log id for tracing purposes + row.insert(5, fetchlogsId) + ret.append(row) + logger.info("get_measurements:csv: %s; size: %s; rows: %s; fetching: %0.4f; reading: %0.4f", key, len(content)/1000, len(ret), fetch_time, time() - start) + return ret + + +def 
submit_file_error(key, e): + """Update the log to reflect the error and prevent a retry""" + logger.error(f"{key}: {e}") + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute( + """ + UPDATE fetchlogs + SET completed_datetime = clock_timestamp() + , last_message = %s + WHERE key = %s + """, + (f"ERROR: {e}", key), + ) + + +def to_tsv(row): + tsv = "\t".join(map(clean_csv_value, row)) + "\n" + return tsv + return "" + + +def load_measurements_file(fetchlogs_id: int): + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute( + """ + SELECT fetchlogs_id + , key + FROM fetchlogs + WHERE fetchlogs_id = %s + LIMIT 1 + ; + """, + (fetchlogs_id,), + ) + rows = cursor.fetchall() + load_measurements(rows) + + +def load_measurements_batch(batch: str): + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute( + """ + SELECT fetchlogs_id + , key + FROM fetchlogs + WHERE batch_uuid = %s + """, + (batch,), + ) + rows = cursor.fetchall() + load_measurements(rows) + + +def load_measurements_db(limit=250, ascending: bool = False): + #pattern = '^lcs-etl-pipeline/measures/.*\\.(csv|json)' + pattern = '^/home/christian/.*\\.(csv|json)' + rows = load_fetchlogs(pattern, limit, ascending) + load_measurements(rows) + return len(rows) + + +def load_measurements(rows): + logger.debug(f"loading {len(rows)} measurements") + start_time = time() + # get a client object to hold all the data + client = IngestClient() + # load all the keys + client.load_keys(rows) + # and finally we can dump it all into the db + client.dump() + # write to the log + logger.info("load_measurements:get: %s keys; %s measurements; %s locations; %0.4f seconds", + len(client.keys), len(client.measurements), len(client.nodes), time() - start_time) diff --git a/ingest/temp_locations_dump.sql b/ingest/temp_locations_dump.sql new file mode 100644 index 0000000..71ce88f --- /dev/null +++ b/ingest/temp_locations_dump.sql @@ -0,0 +1,52 @@ +DROP TABLE IF EXISTS + staging_sensornodes +, staging_sensorsystems +, staging_sensors +, staging_keys; + +CREATE {table} IF NOT EXISTS staging_keys ( + fetchlogs_id int, + key text, + last_modified timestamptz +); + +CREATE {table} IF NOT EXISTS staging_sensornodes ( + sensor_nodes_id int, + is_new boolean DEFAULT true, + is_moved boolean DEFAULT false, + ingest_id text NOT NULL UNIQUE, + source_name text NOT NULL, + source_id text NOT NULL, + matching_method text NOT NULL DEFAULT 'ingest-id', + site_name text, + ismobile boolean, + geom geometry, + timezones_id int, + countries_id int, + metadata jsonb, + fetchlogs_id int, + UNIQUE (source_name, source_id) +); + +CREATE {table} IF NOT EXISTS staging_sensorsystems ( + sensor_systems_id int, + is_new boolean DEFAULT true, + ingest_id text NOT NULL UNIQUE, + ingest_sensor_nodes_id text, + sensor_nodes_id int, + metadata jsonb, + fetchlogs_id int +); + +CREATE {table} IF NOT EXISTS staging_sensors ( + ingest_id text, + is_new boolean DEFAULT true, + sensors_id int, + sensor_systems_id int, + ingest_sensor_systems_id text, + measurand text, + units text, + measurands_id int, + metadata jsonb, + fetchlogs_id int +); diff --git a/ingest/temp_measurements_dump.sql b/ingest/temp_measurements_dump.sql new file mode 100644 index 0000000..0d47f76 --- 
/dev/null +++ b/ingest/temp_measurements_dump.sql @@ -0,0 +1,46 @@ +DROP TABLE IF EXISTS + staging_sensors +, staging_measurements +, staging_inserted_measurements; + + +CREATE {table} IF NOT EXISTS staging_sensors ( + ingest_id text NOT NULL, + is_new boolean DEFAULT true, + source_name text NOT NULL, + source_id text NOT NULL, + measurand text NOT NULL, + sensors_id int, + sensor_systems_id int, + ingest_sensor_systems_id text, + units text, + measurands_id int, + metadata jsonb, + fetchlogs_id int +); + +CREATE {table} IF NOT EXISTS staging_measurements ( + ingest_id text NOT NULL, + source_name text NOT NULL, + source_id text NOT NULL, + measurand text NOT NULL, + sensors_id int, + value float, + datetime timestamptz, + lon float, + lat float, + fetchlogs_id int +); + +--This table will hold measurements that have +--actually been inserted into the measurements table +--this is to deal with the overlap that we see in the +--incoming files +CREATE {table} IF NOT EXISTS staging_inserted_measurements ( + sensors_id int + , datetime timestamptz + , value double precision + , lat double precision + , lon double precision + , fetchlogs_id int +); diff --git a/local.py b/local.py new file mode 100644 index 0000000..6315396 --- /dev/null +++ b/local.py @@ -0,0 +1,76 @@ +import os +import sys +import orjson +import psycopg2 +import logging +from time import time +import csv + + +from ingest.lcsV2 import ( + IngestClient, + load_measurements, + load_measurements_db, +) + +from ingest.utils import ( + select_object, + get_file, +) + +logger = logging.getLogger('handler') + +logging.basicConfig( + format='[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s', + level='DEBUG', + force=True, +) + + + +rows = [ + [3, '/home/christian/Downloads/habitatmap-1714036497-h84j.csv', '2024-01-01 00:00:00'], + [4, '/home/christian/Downloads/airgradient-1714003639-h32tu.csv', '2024-01-05'], + [5, '/home/christian/Downloads/senstate-1714007461-ivz5g.csv', '2021-02-01'], + [1, '/home/christian/Downloads/1610335354.csv', '2022-01-01'] + ] + + + + +#print(rows) +#load_measurements(rows) +#load_measurements_db() + +#content = select_object(key) +#content = get_file(file).read() + +#print(type(content)) +#print(len(content)) + +# select object returns a string so we need to convert it +#data = orjson.loads(content) + +#print(type(data)) +#print(len(data)) + + +# # load all the data +# start_time = time() + +client = IngestClient() + +client.load_keys(rows) +client.dump() + +# #client.load(data) +# # client.load_metadata(data['meta']) +# #client.load_locations(data['locations']) +# client.load_measurements(data['measures']) + +# #client.dump() + + +# print(time() - start_time) +# print(f"measurements: {len(client.measurements)}") +# print(f"locations: {len(client.nodes)}") diff --git a/matching_clarity_sensors.sql b/matching_clarity_sensors.sql new file mode 100644 index 0000000..83997b9 --- /dev/null +++ b/matching_clarity_sensors.sql @@ -0,0 +1,80 @@ + + +CREATE TABLE current_clarity_nodes AS +SELECT 
unnest(ARRAY[1285716,1314366,1373846,1378636,1533718,1533720,1533721,1533915,1533917,1533920,1894630,1894631,1894632,1894634,1894636,1894637,1894638,1894639,1894640,1894641,1894642,1894643,1924313,1949202,1949203,1949206,2152632,2152633,2156118,2402491,290475,290476,290477,290478,290479,290480,290481,290482,290483,290484,290485,290487,290488,290489,290490,290491,290492,290495,290496,290498,290499,290500,290501,290502,290504,290505,290506,290508,290510,290512,290513,290515,290517,290518,290519,290520,290521,290522,290523,290524,290526,290528,290529,290530,290531,290532,290533,290534,290535,290536,290537,290538,290540,290541,290542,290543,290544,290545,290546,290549,290551,290552,290553,290554,290555,290557,290558,290559,290560,290561,290563,290564,290565,290566,290567,290569,290570,290571,290572,290573,290574,290575,290576,290578,290582,290583,290584,290585,290587,290588,290589,290590,290591,290593,290594,290595,290596,290597,290599,290600,290601,290602,290603,290604,290605,290606,290607,290608,290609,290610,290611,290614,290615,290616,290618,290620,290621,290622,290623,290624,290625,290626,290628,290629,290630,290632,290633,290634,290635,290636,290637,290638,290639,290641,290642,290643,290644,290645,290646,290648,290649,290650,290651,290652,290653,290654,290655,290656,290657,290658,290659,290660,290661,290662,290664,290665,290667,290668,290670,290671,290672,290674,290675,290677,290678,290679,290680,290681,290683,290685,290686,290687,300026,300027,300028,300030,301884,301885,308728,310353,310354,310355,310356,310357,310358,310360,351822,351823,351824,351825,351826,367083,367107,367110,367112,367113,367114,367116,367117,367118,370742,370743,370744,370750,370751,370752,815609,923364,923365,929705,938377,947124,947125,947126,947127,947128,947130,947132,947133,947134,947137,947138,947139,947140,947141,947142,947143,947144,947150,947151,947152,947153,947154,947155,947156,947157,947158,947159,947160,947161,947162,947163,947164,947165,947166,947168,947169,947170,947171,947172,947173,947174,947175,947176,947177,947178,947180,947182,947183,947184,947185,947186,947187,947188,947189,947190,947191,947192,947194,947195,947196,947197,947198,947199,947200,947201,947202,947203,947204,947205,947206,947207,947208,947210,947211,947212,947213,947214,947216,947217,947218,947219,947220,947221,947222,947223,947224,947225,947226,947227,947228,947229,947230,947231,947232,947234,947235,947236,947237,947238,947239,947240,947241,947242,947243,947244,947245,947246,947247,947248,947249,947250,947251,947252,947253,947254,947255,947256,947257,947258,947259,947260,947261,947262,947264,947265,947266,947267,947268,947270,947271,947273,947274,947275,947276,947277,947278,947279,947280,947281,947283,947284,947285,947286,947287,947288,947289,947290,947291,947292,947295,947296,947297,947298,947299,947300,947301,947302,947303,947304,947305,947306,947307,947308,947309,947310,947312,947313,947314,947315,947316,947317,947318,947319,947320,947321,947322,947323,947324,947325,947326,947327,947328,947329,947330,947332,947334,947335,947336,947338,947339,947340,947341,947342,947343,947344,947345,947346,947347,947348,947349]) as node; + + + + WITH clarity AS ( + SELECT sensor_nodes_id + , source_id + , site_name + , geom + , added_on + , node IS NOT NULL as is_active + FROM sensor_nodes + JOIN current_clarity_nodes ON (sensor_nodes_id = node) + WHERE source_name = 'clarity') + SELECT c.sensor_nodes_id + , c.source_id + , c.site_name + , n.source_id + , n.site_name + , c.geom = n.geom + , is_active + , ROUND(st_distance(c.geom, 
n.geom)::numeric, 4) as distance + FROM clarity c + LEFT JOIN staging_sensornodes n ON (st_distance(c.geom, n.geom)<0.0001) + -- WHERE n.source_id IS NOT NULL OR is_active + WHERE n.source_id IS NULL + ORDER BY c.sensor_nodes_id DESC NULLS FIRST; + + + + + WITH clarity AS ( + SELECT sensor_nodes_id + , source_id + , site_name + , geom + , added_on + , node IS NOT NULL as is_active + FROM sensor_nodes + LEFT JOIN current_clarity_nodes ON (sensor_nodes_id = node) + WHERE source_name = 'clarity') + SELECT n.source_id + , n.site_name + , c.source_id + , c.site_name + --, c.geom + --, n.geom + --, c.added_on + , c.geom = n.geom + , is_active + , ROUND(st_distance(c.geom, n.geom)::numeric, 4) as distance + , c.sensor_nodes_id + , c.sensor_nodes_id = LAG(c.sensor_nodes_id) OVER (ORDER BY c.sensor_nodes_id) + FROM staging_sensornodes n + --JOIN clarity c ON (n.site_name = c.site_name) + --JOIN clarity c ON (n.geom = c.geom) + JOIN clarity c ON (n.source_id = c.source_id) + --LEFT JOIN clarity c ON (st_distance(c.geom, n.geom)<0.00001) + --WHERE n.source_id IS NOT NULL OR is_active + WHERE is_active + ORDER BY sensor_nodes_id DESC; + + + + + + SELECT * + FROM staging_sensornodes + WHERE source_id = 'DBXRI9190'; + + + -- How many active clarity sensor nodes do we have? + SELECT string_agg(DISTINCT sensor_nodes_id::text, ',') + FROM sensor_nodes_check + WHERE source_name = 'clarity' + AND datetime_last > current_date + ; From 6d725202ef3ea00721b8e3a714cba75a855a69ca Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Thu, 25 Apr 2024 11:21:52 -0700 Subject: [PATCH 17/42] Cleaned up to work in production setting --- ingest/lcsV2.py | 23 +++++++++++++++-------- local.py | 42 +++++++++++++++--------------------------- 2 files changed, 30 insertions(+), 35 deletions(-) diff --git a/ingest/lcsV2.py b/ingest/lcsV2.py index f0f6c2d..23c8599 100644 --- a/ingest/lcsV2.py +++ b/ingest/lcsV2.py @@ -306,13 +306,17 @@ def load_key(self, key, fetchlogs_id, last_modified): logger.debug(f"Loading key: {fetchlogs_id}//:{key}") is_csv = bool(re.search(r"\.csv(.gz)?$", key)) is_json = bool(re.search(r"\.(nd)?json(.gz)?$", key)) - - #if file_exists(key): - content = get_file(key).read() - #else: - #content = select_object(key) self.fetchlogs_id = fetchlogs_id + # is it a local file? 
This is used for dev + # but likely fine to leave in + if os.path.exists(key): + content = get_file(key).read() + else: + content = select_object(key) + + logger.debug(f"Read content containing {len(content)} lines") + if is_csv: # all csv data will be measurements for rw in csv.reader(content.split("\n")): @@ -748,14 +752,17 @@ def load_measurements_batch(batch: str): load_measurements(rows) -def load_measurements_db(limit=250, ascending: bool = False): - #pattern = '^lcs-etl-pipeline/measures/.*\\.(csv|json)' - pattern = '^/home/christian/.*\\.(csv|json)' +def load_measurements_db( + limit=250, + ascending: bool = False, + pattern = '^lcs-etl-pipeline/measures/.*\\.(csv|json)' + ): rows = load_fetchlogs(pattern, limit, ascending) load_measurements(rows) return len(rows) +# Keep seperate from above so we can test rows not from the database def load_measurements(rows): logger.debug(f"loading {len(rows)} measurements") start_time = time() diff --git a/local.py b/local.py index 6315396..acaafdd 100644 --- a/local.py +++ b/local.py @@ -26,6 +26,11 @@ force=True, ) +logging.getLogger('boto3').setLevel(logging.WARNING) +logging.getLogger('botocore').setLevel(logging.WARNING) +logging.getLogger('urllib3').setLevel(logging.WARNING) + + rows = [ @@ -36,36 +41,19 @@ ] +# local files +#load_measurements_db(pattern = '^/home/christian/.*\\.(csv|json)') +# remote files, make sure it can at least read it +load_measurements_db() - -#print(rows) -#load_measurements(rows) -#load_measurements_db() - -#content = select_object(key) -#content = get_file(file).read() - -#print(type(content)) -#print(len(content)) - -# select object returns a string so we need to convert it -#data = orjson.loads(content) - -#print(type(data)) -#print(len(data)) - - -# # load all the data -# start_time = time() - -client = IngestClient() - -client.load_keys(rows) -client.dump() +## client based methods +#client = IngestClient() +#client.load_keys(rows) +#client.dump() # #client.load(data) -# # client.load_metadata(data['meta']) -# #client.load_locations(data['locations']) +# client.load_metadata(data['meta']) +# client.load_locations(data['locations']) # client.load_measurements(data['measures']) # #client.dump() From 1a5a82d7dca985686511794fe92731a1974fe173 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Thu, 25 Apr 2024 11:30:17 -0700 Subject: [PATCH 18/42] Swtiched source of loader in the handler --- ingest/handler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ingest/handler.py b/ingest/handler.py index 7f97dd8..463dec0 100644 --- a/ingest/handler.py +++ b/ingest/handler.py @@ -2,7 +2,8 @@ import logging import psycopg2 from .settings import settings -from .lcs import load_measurements_db, load_metadata_db +from .lcs import load_metadata_db +from .lcsV2 import load_measurements_db from .fetch import load_db from time import time import json From feec1b0c454ddd9a53b2b867e557f9fbed6e8e37 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Mon, 10 Jun 2024 16:02:45 -0700 Subject: [PATCH 19/42] Adding git action --- .github/workflows/deploy.yml | 70 ++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 .github/workflows/deploy.yml diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..f4e6576 --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,70 @@ +name: Deploy ingestor + +on: + push: + branches: + - versions/aeolus + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: 
actions/checkout@v4 + + - name: Configure aws credentials + uses: aws-actions/configure-aws-credentials@master + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_PROD }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY_PROD }} + aws-region: ${{ secrets.AWS_REGION }} + + - name: Get envionmental values + uses: aws-actions/aws-secretsmanager-get-secrets@v2 + with: + secret-ids: | + AEOLUS, openaq-env/aeolus + name-transformation: uppercase + parse-json-secrets: true + + - uses: actions/setup-node@v4 + with: + node-version: "20" + + + - name: Install CDK + run: | + npm install -g aws-cdk@2.92.0 + + - uses: actions/setup-python@v3 + with: + python-version: '3.11' + + - name: Deploy stack + env: + ENV: "aeolus" + PROJECT: "openaq" + + ## deployment variables + CDK_ACCOUNT: ${{ secrets.CDK_ACCOUNT }} + CDK_REGION: ${{ secrets.CDK_REGION }} + VPC_ID: ${{ env.AEOLUS_VPC_ID }} + + TOPIC_ARN: ${{ env.AEOLUS_FETCH_OBJECT_TOPIC_ARN }} + + ## application variables + DATABASE_READ_USER: ${{ env.AEOLUS_DATABASE_READ_USER }} + DATABASE_READ_PASSWORD: ${{ env.AEOLUS_DATABASE_READ_PASSWORD }} + DATABASE_WRITE_USER: ${{ env.AEOLUS_DATABASE_WRITE_USER }} + DATABASE_WRITE_PASSWORD: ${{ env.AEOLUS_DATABASE_WRITE_PASSWORD }} + DATABASE_DB: ${{ env.AEOLUS_DATABASE_DB }} + DATABASE_HOST: ${{ env.AEOLUS_DATABASE_HOST }} + DATABASE_PORT: ${{ env.AEOLUS_DATABASE_PORT }} + FETCH_BUCKET: ${{ env.AEOLUS_FETCH_BUCKET }} + ETL_BUCKET: ${{ env.AEOLUS_FETCH_BUCKET }} + + + working-directory: ./cdk + run: | + pip install -r requirements.txt + cdk deploy openaq-ingest-aeolus --require-approval never From 97360fb60bd1f577fefaeddb2d7495aea0ddd0f5 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Mon, 10 Jun 2024 16:09:13 -0700 Subject: [PATCH 20/42] Fix action branch name --- .github/workflows/deploy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index f4e6576..5e9c5f6 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -3,7 +3,7 @@ name: Deploy ingestor on: push: branches: - - versions/aeolus + - version/aeolus jobs: deploy: From a82b13e5f024eb042729e991a50a2b449ca8b821 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 11 Jun 2024 12:23:35 -0700 Subject: [PATCH 21/42] Updated deployment --- .github/workflows/deploy.yml | 5 +++-- cdk/utils.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 5e9c5f6..4aa11bf 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -46,8 +46,9 @@ jobs: PROJECT: "openaq" ## deployment variables - CDK_ACCOUNT: ${{ secrets.CDK_ACCOUNT }} - CDK_REGION: ${{ secrets.CDK_REGION }} + # CDK_ACCOUNT: ${{ secrets.CDK_ACCOUNT }} + # CDK_REGION: ${{ secrets.CDK_REGION }} + VPC_ID: ${{ env.AEOLUS_VPC_ID }} TOPIC_ARN: ${{ env.AEOLUS_FETCH_OBJECT_TOPIC_ARN }} diff --git a/cdk/utils.py b/cdk/utils.py index 78318ce..1e1fab4 100644 --- a/cdk/utils.py +++ b/cdk/utils.py @@ -23,7 +23,7 @@ def create_dependencies_layer( output_dir = f'../.build/{function_name}' layer_id = f'openaq-{function_name}-{env_name}-dependencies' - if not environ.get('SKIP_PIP'): + if not environ.get('SKIP_BUILD'): print(f'Building {layer_id} from {requirements_file} into {output_dir}') subprocess.run( f"""python -m pip install -qq -r {requirements_file} \ From a1211669264771c5a6ca2254fafc72323280bae3 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 11 Jun 2024 12:29:02 -0700 Subject: [PATCH 
22/42] cleaning up the settings --- ingest/lcs.py | 4 ++-- ingest/lcsV2.py | 2 +- ingest/settings.py | 1 - ingest/utils.py | 12 ++++++------ 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/ingest/lcs.py b/ingest/lcs.py index 2389917..c59df22 100644 --- a/ingest/lcs.py +++ b/ingest/lcs.py @@ -29,7 +29,7 @@ app = typer.Typer() dir_path = os.path.dirname(os.path.realpath(__file__)) -FETCH_BUCKET = settings.ETL_BUCKET +FETCH_BUCKET = settings.FETCH_BUCKET logger = logging.getLogger(__name__) @@ -395,7 +395,7 @@ def select_object(key): try: content = "" resp = s3c.select_object_content( - Bucket=settings.ETL_BUCKET, + Bucket=settings.FETCH_BUCKET, Key=key, ExpressionType="SQL", Expression=""" diff --git a/ingest/lcsV2.py b/ingest/lcsV2.py index 23c8599..f162175 100644 --- a/ingest/lcsV2.py +++ b/ingest/lcsV2.py @@ -32,7 +32,7 @@ app = typer.Typer() dir_path = os.path.dirname(os.path.realpath(__file__)) -FETCH_BUCKET = settings.ETL_BUCKET +FETCH_BUCKET = settings.FETCH_BUCKET logger = logging.getLogger(__name__) diff --git a/ingest/settings.py b/ingest/settings.py index 1cebc47..38e74b4 100644 --- a/ingest/settings.py +++ b/ingest/settings.py @@ -14,7 +14,6 @@ class Settings(BaseSettings): DATABASE_PORT: int DATABASE_READ_URL: Union[str, None] DATABASE_WRITE_URL: Union[str, None] - FASTAPI_URL: str DRYRUN: bool = False FETCH_BUCKET: str ETL_BUCKET: str diff --git a/ingest/utils.py b/ingest/utils.py index b9741b7..e4ef524 100644 --- a/ingest/utils.py +++ b/ingest/utils.py @@ -230,7 +230,7 @@ def deconstruct_path(key: str): path["key"] = "/".join(p) else: # use the current bucket from settings - path["bucket"] = settings.ETL_BUCKET + path["bucket"] = settings.FETCH_BUCKET path["key"] = key logger.debug(path) @@ -263,7 +263,7 @@ def get_data(key: str): key = "/".join(path) else: # use the current bucket from settings - bucket = settings.ETL_BUCKET + bucket = settings.FETCH_BUCKET # stream the file logger.debug(f"streaming s3 file data from s3://{bucket}/{key}") @@ -289,7 +289,7 @@ def get_file(filepath: str): def get_object( key: str, - bucket: str = settings.ETL_BUCKET + bucket: str = settings.FETCH_BUCKET ): key = unquote_plus(key) text = '' @@ -310,7 +310,7 @@ def get_object( def put_object( data: str, key: str, - bucket: str = settings.ETL_BUCKET + bucket: str = settings.FETCH_BUCKET ): out = io.BytesIO() with gzip.GzipFile(fileobj=out, mode='wb') as gz: @@ -362,7 +362,7 @@ def select_object(key: str): content = "" logger.debug(f"Getting object: {key}, {output_serialization}") resp = s3.select_object_content( - Bucket=settings.ETL_BUCKET, + Bucket=settings.FETCH_BUCKET, Key=key, ExpressionType="SQL", Expression=""" @@ -662,7 +662,7 @@ def crawl(bucket, prefix): def crawl_lcs(): - crawl(settings.ETL_BUCKET, "lcs-etl-pipeline/") + crawl(settings.FETCH_BUCKET, "lcs-etl-pipeline/") def crawl_fetch(): From 7c3e22beaca66cdcbfa6dc0c44b0ffc8381dfcbc Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 11 Jun 2024 14:22:02 -0700 Subject: [PATCH 23/42] Updated from 3.9 and added poetry --- cdk/cdk.json | 2 +- cdk/lambda_ingest_stack.py | 2 +- cdk/utils.py | 2 +- pyproject.toml | 24 ++++++++++++++++++++++++ requirements.txt | 8 -------- requirements_dev.txt | 3 --- 6 files changed, 27 insertions(+), 14 deletions(-) create mode 100644 pyproject.toml delete mode 100644 requirements.txt delete mode 100644 requirements_dev.txt diff --git a/cdk/cdk.json b/cdk/cdk.json index 289cf21..76af4a2 100644 --- a/cdk/cdk.json +++ b/cdk/cdk.json @@ -1,5 +1,5 @@ { - "app": "python app.py", + "app": 
"poetry run python app.py", "context": { "aws-cdk:enableDiffNoFail": "true", "@aws-cdk/core:stackRelativeExports": "true", diff --git a/cdk/lambda_ingest_stack.py b/cdk/lambda_ingest_stack.py index e6b66b6..70d47d6 100644 --- a/cdk/lambda_ingest_stack.py +++ b/cdk/lambda_ingest_stack.py @@ -66,7 +66,7 @@ def __init__( ), handler="ingest.handler.handler", vpc=vpc_id, - runtime=aws_lambda.Runtime.PYTHON_3_9, + runtime=aws_lambda.Runtime.PYTHON_3_11, allow_public_subnet=True, memory_size=ingest_lambda_memory_size, environment=stringify_settings(lambda_env), diff --git a/cdk/utils.py b/cdk/utils.py index 1e1fab4..77bf57f 100644 --- a/cdk/utils.py +++ b/cdk/utils.py @@ -47,5 +47,5 @@ def create_dependencies_layer( self, layer_id, code=layer_code, - compatible_runtimes=[aws_lambda.Runtime.PYTHON_3_9] + compatible_runtimes=[aws_lambda.Runtime.PYTHON_3_11] ) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4a496f6 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,24 @@ +[tool.poetry] +name = "ingest" +version = "0.1.0" +description = "Data ingestor for OpenAQ Framework" +authors = ["OpenAQ "] +readme = "README.md" + +[tool.poetry.dependencies] +python = "^3.12" +dateparser = "^1.2.0" +orjson = "^3.10.4" +psycopg2-binary = "^2.9.9" +pytz = "^2024.1" +typer = "^0.12.3" +typing-extensions = "^4.12.2" +pydantic = {extras = ["dotenv"], version = "^2.7.3"} + + +[tool.poetry.group.cdk.dependencies] +aws-cdk-lib = "^2.145.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 4296f15..0000000 --- a/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -dateparser==1.1.1 -orjson==3.6.8 -psycopg2-binary==2.9.3 -pydantic[dotenv] -pytz==2022.1 -pytz-deprecation-shim==0.1.0.post0 -typer==0.4.1 -typing_extensions==4.2.0 diff --git a/requirements_dev.txt b/requirements_dev.txt deleted file mode 100644 index 7278aca..0000000 --- a/requirements_dev.txt +++ /dev/null @@ -1,3 +0,0 @@ --r requirements.txt -boto3 -numpy From 36ce602123bd1750e882a437375660cb9230871a Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 11 Jun 2024 14:24:03 -0700 Subject: [PATCH 24/42] fixed deployment code --- .github/workflows/deploy.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 4aa11bf..37e66ac 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -36,9 +36,12 @@ jobs: run: | npm install -g aws-cdk@2.92.0 - - uses: actions/setup-python@v3 + - name: Install Poetry + uses: snok/install-poetry@v1 + + - uses: actions/setup-python@v5 with: - python-version: '3.11' + python-version: '3.12' - name: Deploy stack env: @@ -67,5 +70,5 @@ jobs: working-directory: ./cdk run: | - pip install -r requirements.txt + poetry install cdk deploy openaq-ingest-aeolus --require-approval never From d451de97207319d65eafa79c7fc9627baf2e90bd Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 11 Jun 2024 14:28:37 -0700 Subject: [PATCH 25/42] Changed position of the python install in deployment --- .github/workflows/deploy.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 37e66ac..9e16918 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -36,13 +36,13 @@ jobs: run: | npm install -g aws-cdk@2.92.0 - - name: Install Poetry - uses: snok/install-poetry@v1 - - uses: 
actions/setup-python@v5 with: python-version: '3.12' + - name: Install Poetry + uses: snok/install-poetry@v1 + - name: Deploy stack env: ENV: "aeolus" From 68ecd22156c1d84877abcde81e178ed43f87e77f Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 11 Jun 2024 14:56:43 -0700 Subject: [PATCH 26/42] Updated pydantic settings --- cdk/app.py | 6 +++--- cdk/config.py | 21 +++++++++++---------- cdk/lambda_ingest_stack.py | 14 +++++++------- ingest/settings.py | 30 +++++++++++++++++------------- pyproject.toml | 1 + 5 files changed, 39 insertions(+), 33 deletions(-) diff --git a/cdk/app.py b/cdk/app.py index 9b86548..e0b9a04 100644 --- a/cdk/app.py +++ b/cdk/app.py @@ -32,9 +32,9 @@ lambda_env=lambda_env, fetch_bucket=settings.FETCH_BUCKET, vpc_id=settings.VPC_ID, - ingest_lambda_timeout=settings.INGEST_LAMBDA_TIMEOUT, - ingest_lambda_memory_size=settings.INGEST_LAMBDA_MEMORY_SIZE, - ingest_rate_minutes=settings.INGEST_RATE_MINUTES, + lambda_timeout=settings.LAMBDA_TIMEOUT, + lambda_memory_size=settings.LAMBDA_MEMORY_SIZE, + rate_minutes=settings.RATE_MINUTES, topic_arn=settings.TOPIC_ARN, env=env, ) diff --git a/cdk/config.py b/cdk/config.py index da05621..03cb150 100644 --- a/cdk/config.py +++ b/cdk/config.py @@ -1,5 +1,8 @@ from typing import List -from pydantic import BaseSettings +from pydantic_settings import ( + BaseSettings, + SettingsConfigDict, + ) from pathlib import Path from os import environ @@ -8,19 +11,17 @@ class Settings(BaseSettings): FETCH_BUCKET: str ENV: str = "staging" PROJECT: str = "openaq" - INGEST_LAMBDA_TIMEOUT: int = 900 - INGEST_LAMBDA_MEMORY_SIZE: int = 1536 - INGEST_RATE_MINUTES: int = 15 + LAMBDA_TIMEOUT: int = 900 + LAMBDA_MEMORY_SIZE: int = 1536 + RATE_MINUTES: int = 15 LOG_LEVEL: str = 'INFO' TOPIC_ARN: str = None VPC_ID: str = None - class Config: - parent = Path(__file__).resolve().parent.parent - if 'DOTENV' in environ: - env_file = Path.joinpath(parent, environ['DOTENV']) - else: - env_file = Path.joinpath(parent, ".env") + + model_config = SettingsConfigDict( + extra="ignore", env_file=f"../{environ.get('DOTENV', '.env')}", env_file_encoding="utf-8" + ) settings = Settings() diff --git a/cdk/lambda_ingest_stack.py b/cdk/lambda_ingest_stack.py index 70d47d6..35a27d6 100644 --- a/cdk/lambda_ingest_stack.py +++ b/cdk/lambda_ingest_stack.py @@ -30,9 +30,9 @@ def __init__( env_name: str, lambda_env: Dict, fetch_bucket: str, - ingest_lambda_timeout: int, - ingest_lambda_memory_size: int, - ingest_rate_minutes: int = 15, + lambda_timeout: int, + lambda_memory_size: int, + rate_minutes: int = 15, topic_arn: str = None, vpc_id: str = None, **kwargs, @@ -68,9 +68,9 @@ def __init__( vpc=vpc_id, runtime=aws_lambda.Runtime.PYTHON_3_11, allow_public_subnet=True, - memory_size=ingest_lambda_memory_size, + memory_size=lambda_memory_size, environment=stringify_settings(lambda_env), - timeout=Duration.seconds(ingest_lambda_timeout), + timeout=Duration.seconds(lambda_timeout), layers=[ create_dependencies_layer( self, @@ -89,12 +89,12 @@ def __init__( # Set how often the ingester will run # If 0 the ingester will not run automatically - if ingest_rate_minutes > 0: + if rate_minutes > 0: aws_events.Rule( self, f"{id}-ingest-event-rule", schedule=aws_events.Schedule.cron( - minute=f"0/{ingest_rate_minutes}" + minute=f"0/{rate_minutes}" ), targets=[ aws_events_targets.LambdaFunction(ingest_function), diff --git a/ingest/settings.py b/ingest/settings.py index 38e74b4..d0b94c8 100644 --- a/ingest/settings.py +++ b/ingest/settings.py @@ -1,5 +1,12 @@ from typing import 
Union -from pydantic import BaseSettings, validator + +from pydantic_settings import ( + BaseSettings, + SettingsConfigDict, + ) + +from pydantic import computed_field + from pathlib import Path from os import environ @@ -26,20 +33,17 @@ class Settings(BaseSettings): USE_TEMP_TABLES: bool = True PAUSE_INGESTING: bool = False - @validator('DATABASE_READ_URL', allow_reuse=True) - def get_read_url(cls, v, values): - return v or f"postgresql://{values['DATABASE_READ_USER']}:{values['DATABASE_READ_PASSWORD']}@{values['DATABASE_HOST']}:{values['DATABASE_PORT']}/{values['DATABASE_DB']}" + @computed_field + def DATABASE_READ_URL(self) -> str: + return f"postgresql://{values['DATABASE_READ_USER']}:{values['DATABASE_READ_PASSWORD']}@{values['DATABASE_HOST']}:{values['DATABASE_PORT']}/{values['DATABASE_DB']}" - @validator('DATABASE_WRITE_URL', allow_reuse=True) - def get_write_url(cls, v, values): - return v or f"postgresql://{values['DATABASE_WRITE_USER']}:{values['DATABASE_WRITE_PASSWORD']}@{values['DATABASE_HOST']}:{values['DATABASE_PORT']}/{values['DATABASE_DB']}" + @computed_field + def DATABASE_WRITE_URL(self) -> str: + return f"postgresql://{values['DATABASE_WRITE_USER']}:{values['DATABASE_WRITE_PASSWORD']}@{values['DATABASE_HOST']}:{values['DATABASE_PORT']}/{values['DATABASE_DB']}" - class Config: - parent = Path(__file__).resolve().parent.parent - if 'DOTENV' in environ: - env_file = Path.joinpath(parent, environ['DOTENV']) - else: - env_file = Path.joinpath(parent, ".env") + model_config = SettingsConfigDict( + extra="ignore", env_file=f"../{environ.get('DOTENV', '.env')}", env_file_encoding="utf-8" + ) settings = Settings() diff --git a/pyproject.toml b/pyproject.toml index 4a496f6..98ecf3f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ pytz = "^2024.1" typer = "^0.12.3" typing-extensions = "^4.12.2" pydantic = {extras = ["dotenv"], version = "^2.7.3"} +pydantic-settings = "^2.3.2" [tool.poetry.group.cdk.dependencies] From 0b2144a30553b7b7c7cc387fa19d9f4c4238a236 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 11 Jun 2024 15:03:15 -0700 Subject: [PATCH 27/42] Fixing computed field issue --- ingest/settings.py | 8 +++----- pyproject.toml | 1 + 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ingest/settings.py b/ingest/settings.py index d0b94c8..f7fa380 100644 --- a/ingest/settings.py +++ b/ingest/settings.py @@ -19,8 +19,6 @@ class Settings(BaseSettings): DATABASE_DB: str DATABASE_HOST: str DATABASE_PORT: int - DATABASE_READ_URL: Union[str, None] - DATABASE_WRITE_URL: Union[str, None] DRYRUN: bool = False FETCH_BUCKET: str ETL_BUCKET: str @@ -35,14 +33,14 @@ class Settings(BaseSettings): @computed_field def DATABASE_READ_URL(self) -> str: - return f"postgresql://{values['DATABASE_READ_USER']}:{values['DATABASE_READ_PASSWORD']}@{values['DATABASE_HOST']}:{values['DATABASE_PORT']}/{values['DATABASE_DB']}" + return f"postgresql://{self.DATABASE_READ_USER}:{self.DATABASE_READ_PASSWORD}@{self.DATABASE_HOST}:{self.DATABASE_PORT}/{self.DATABASE_DB}" @computed_field def DATABASE_WRITE_URL(self) -> str: - return f"postgresql://{values['DATABASE_WRITE_USER']}:{values['DATABASE_WRITE_PASSWORD']}@{values['DATABASE_HOST']}:{values['DATABASE_PORT']}/{values['DATABASE_DB']}" + return f"postgresql://{self.DATABASE_WRITE_USER}:{self.DATABASE_WRITE_PASSWORD}@{self.DATABASE_HOST}:{self.DATABASE_PORT}/{self.DATABASE_DB}" model_config = SettingsConfigDict( - extra="ignore", env_file=f"../{environ.get('DOTENV', '.env')}", env_file_encoding="utf-8" + extra="ignore", 
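# A minimal, self-contained sketch of the pydantic v2 pattern these settings
# patches converge on: BaseSettings now comes from the separate
# pydantic-settings package, and derived values are computed_field properties
# that read sibling fields through self. The PATCH 26 version still read the
# v1-style `values` mapping, which is not defined inside a computed_field;
# PATCH 27 switches to attribute access on self. The class, field names and
# defaults below are illustrative only, not the project's real settings module.
from pydantic import computed_field
from pydantic_settings import BaseSettings, SettingsConfigDict


class ExampleSettings(BaseSettings):
    DATABASE_READ_USER: str = "reader"
    DATABASE_READ_PASSWORD: str = "secret"
    DATABASE_HOST: str = "localhost"
    DATABASE_PORT: int = 5432
    DATABASE_DB: str = "openaq"

    model_config = SettingsConfigDict(extra="ignore")

    @computed_field
    @property
    def DATABASE_READ_URL(self) -> str:
        # build the DSN from the other fields via self, not a `values` dict
        return (
            f"postgresql://{self.DATABASE_READ_USER}:{self.DATABASE_READ_PASSWORD}"
            f"@{self.DATABASE_HOST}:{self.DATABASE_PORT}/{self.DATABASE_DB}"
        )


print(ExampleSettings().DATABASE_READ_URL)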
env_file=f"{environ.get('DOTENV', '.env')}", env_file_encoding="utf-8" ) diff --git a/pyproject.toml b/pyproject.toml index 98ecf3f..e198914 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ pydantic-settings = "^2.3.2" [tool.poetry.group.cdk.dependencies] aws-cdk-lib = "^2.145.0" +boto3 = "^1.34.124" [build-system] requires = ["poetry-core"] From 4b12116e0bbc3b555649df0e81e71070547e5dca Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 11 Jun 2024 15:19:17 -0700 Subject: [PATCH 28/42] Adding export plugin --- .github/workflows/deploy.yml | 1 + cdk/utils.py | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 9e16918..28e5444 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -71,4 +71,5 @@ jobs: working-directory: ./cdk run: | poetry install + poetry self add poetry-plugin-export cdk deploy openaq-ingest-aeolus --require-approval never diff --git a/cdk/utils.py b/cdk/utils.py index 77bf57f..9c827c5 100644 --- a/cdk/utils.py +++ b/cdk/utils.py @@ -19,14 +19,16 @@ def create_dependencies_layer( function_name: str, requirements_path: Path ) -> aws_lambda.LayerVersion: - requirements_file = str(requirements_path.resolve()) + #requirements_file = str(requirements_path.resolve()) output_dir = f'../.build/{function_name}' layer_id = f'openaq-{function_name}-{env_name}-dependencies' if not environ.get('SKIP_BUILD'): - print(f'Building {layer_id} from {requirements_file} into {output_dir}') + print(f'Building {layer_id} into {output_dir}') subprocess.run( - f"""python -m pip install -qq -r {requirements_file} \ + f""" + poetry export --without=cdk -o requirements.txt --without-hashes && \ + poetry run python -m pip install -qq -r requirements.txt \ -t {output_dir}/python && \ cd {output_dir}/python && \ find . 
-type f -name '*.pyc' | \ From 509d3a8fd9674ef84c60ff9759a8ad6245b0fd94 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 11 Jun 2024 15:25:29 -0700 Subject: [PATCH 29/42] UPdated cdk version --- .github/workflows/deploy.yml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 28e5444..3df6505 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -70,6 +70,6 @@ jobs: working-directory: ./cdk run: | - poetry install poetry self add poetry-plugin-export + poetry install cdk deploy openaq-ingest-aeolus --require-approval never diff --git a/pyproject.toml b/pyproject.toml index e198914..ae1f644 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ pydantic-settings = "^2.3.2" [tool.poetry.group.cdk.dependencies] -aws-cdk-lib = "^2.145.0" +aws-cdk-lib = "^2.132.0" boto3 = "^1.34.124" [build-system] From b5d0daf29f166cda8f5955f6ca945b0f498ed5cd Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 11 Jun 2024 15:30:38 -0700 Subject: [PATCH 30/42] Removed cdk version from deploy --- .github/workflows/deploy.yml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 3df6505..b078d71 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -34,7 +34,7 @@ jobs: - name: Install CDK run: | - npm install -g aws-cdk@2.92.0 + npm install -g aws-cdk - uses: actions/setup-python@v5 with: diff --git a/pyproject.toml b/pyproject.toml index ae1f644..e198914 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ pydantic-settings = "^2.3.2" [tool.poetry.group.cdk.dependencies] -aws-cdk-lib = "^2.132.0" +aws-cdk-lib = "^2.145.0" boto3 = "^1.34.124" [build-system] From 7fc9e87e48ca0a5ec8052c3a2554a337b80f68c4 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 11 Jun 2024 15:39:50 -0700 Subject: [PATCH 31/42] Updated the python version to 12 --- cdk/lambda_ingest_stack.py | 2 +- cdk/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cdk/lambda_ingest_stack.py b/cdk/lambda_ingest_stack.py index 35a27d6..b12d179 100644 --- a/cdk/lambda_ingest_stack.py +++ b/cdk/lambda_ingest_stack.py @@ -66,7 +66,7 @@ def __init__( ), handler="ingest.handler.handler", vpc=vpc_id, - runtime=aws_lambda.Runtime.PYTHON_3_11, + runtime=aws_lambda.Runtime.PYTHON_3_12, allow_public_subnet=True, memory_size=lambda_memory_size, environment=stringify_settings(lambda_env), diff --git a/cdk/utils.py b/cdk/utils.py index 9c827c5..1e7cec4 100644 --- a/cdk/utils.py +++ b/cdk/utils.py @@ -49,5 +49,5 @@ def create_dependencies_layer( self, layer_id, code=layer_code, - compatible_runtimes=[aws_lambda.Runtime.PYTHON_3_11] + compatible_runtimes=[aws_lambda.Runtime.PYTHON_3_12] ) From ec92f4922e2eb71fc72d7e4da088e8ac17a72d60 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Thu, 13 Jun 2024 16:02:04 -0700 Subject: [PATCH 32/42] Redeploy with ingesting turned off --- .github/workflows/deploy.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index b078d71..cdce764 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -66,6 +66,7 @@ jobs: DATABASE_PORT: ${{ env.AEOLUS_DATABASE_PORT }} FETCH_BUCKET: ${{ env.AEOLUS_FETCH_BUCKET }} ETL_BUCKET: ${{ env.AEOLUS_FETCH_BUCKET }} + PAUSE_INGESTING: True working-directory: ./cdk From 
d59f4f1fe92ad96523599c95e7b8fced8da0f6d3 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Fri, 2 Aug 2024 12:35:42 -0700 Subject: [PATCH 33/42] Clean up --- cdk/requirements.txt | 4 ---- local.py | 10 ++++++---- pyproject.toml | 1 - 3 files changed, 6 insertions(+), 9 deletions(-) delete mode 100644 cdk/requirements.txt diff --git a/cdk/requirements.txt b/cdk/requirements.txt deleted file mode 100644 index 87c952c..0000000 --- a/cdk/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -aws-cdk-lib==2.87.0 -boto3 -pydantic==1.10 -python-dotenv diff --git a/local.py b/local.py index acaafdd..5df871c 100644 --- a/local.py +++ b/local.py @@ -38,18 +38,20 @@ [4, '/home/christian/Downloads/airgradient-1714003639-h32tu.csv', '2024-01-05'], [5, '/home/christian/Downloads/senstate-1714007461-ivz5g.csv', '2021-02-01'], [1, '/home/christian/Downloads/1610335354.csv', '2022-01-01'] + [6, '/home/christian/Downloads/1722384430-2vfvm.json', '2024-07-30'], + [7, '/home/christian/Downloads/1722384430-2vfvm_meas.json', '2024-07-30'] ] # local files #load_measurements_db(pattern = '^/home/christian/.*\\.(csv|json)') # remote files, make sure it can at least read it -load_measurements_db() +#load_measurements_db() ## client based methods -#client = IngestClient() -#client.load_keys(rows) -#client.dump() +client = IngestClient() +client.load_keys(rows) +client.dump() # #client.load(data) # client.load_metadata(data['meta']) diff --git a/pyproject.toml b/pyproject.toml index e198914..1c86072 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,6 @@ typing-extensions = "^4.12.2" pydantic = {extras = ["dotenv"], version = "^2.7.3"} pydantic-settings = "^2.3.2" - [tool.poetry.group.cdk.dependencies] aws-cdk-lib = "^2.145.0" boto3 = "^1.34.124" From b9fab848283634508dc7bfa5ced04b700cd1b24a Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Sun, 4 Aug 2024 07:51:49 -0700 Subject: [PATCH 34/42] Resetting PAUSE_INGESTING to be False --- .github/workflows/deploy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index cdce764..eb50fa2 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -66,7 +66,7 @@ jobs: DATABASE_PORT: ${{ env.AEOLUS_DATABASE_PORT }} FETCH_BUCKET: ${{ env.AEOLUS_FETCH_BUCKET }} ETL_BUCKET: ${{ env.AEOLUS_FETCH_BUCKET }} - PAUSE_INGESTING: True + PAUSE_INGESTING: False working-directory: ./cdk From db082ed583db193344cb93dbb12b5036a7eee6f7 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Wed, 23 Oct 2024 08:29:03 -0700 Subject: [PATCH 35/42] Updates to support the CAC data (#14) * Cleaned up the ingester to work better for CAC data * Added support for ingesting logging intervals * Support for ingesting instrument and status * Added flag ingest method * Fixed bug in the flag ingest process Flags with null notes where not being matched * Added the start of some testing files The test_flags script is not an automated test yet but I thought the data files and process would still be helpful to have committed * Flagging updates and hourly data rollup fixes * Some cleanup and bug fixes Added some methods that help in the testing/dev environments --- ingest/etl_process_measurements.sql | 36 +++- ingest/etl_process_nodes.sql | 106 +++++++++-- ingest/lcsV2.py | 262 +++++++++++++++++++++++----- ingest/temp_locations_dump.sql | 22 +++ ingest/temp_measurements_dump.sql | 22 ++- local.py | 33 ++-- tests/test_file1.json | 120 +++++++++++++ tests/test_file2.json | 127 ++++++++++++++ 
tests/test_flags.py | 48 +++++ 9 files changed, 699 insertions(+), 77 deletions(-) create mode 100644 tests/test_file1.json create mode 100644 tests/test_file2.json create mode 100644 tests/test_flags.py diff --git a/ingest/etl_process_measurements.sql b/ingest/etl_process_measurements.sql index 256240c..f6513b9 100644 --- a/ingest/etl_process_measurements.sql +++ b/ingest/etl_process_measurements.sql @@ -52,12 +52,16 @@ FROM staging_measurements; -- that duplicate sensors with the same ingest/source id are created -- this is a short term fix -- a long term fix would not allow duplicate source_id's -WITH ranked_sensors AS ( +WITH staged_sensors AS ( + -- this first part signficantly speeds it up on slow machines + SELECT DISTINCT ingest_id + FROM staging_measurements +), ranked_sensors AS ( SELECT s.sensors_id , s.source_id , RANK() OVER (PARTITION BY s.source_id ORDER BY added_on ASC) as rnk FROM sensors s - JOIN staging_measurements m ON (s.source_id = m.ingest_id) + JOIN staged_sensors m ON (s.source_id = m.ingest_id) ), active_sensors AS ( SELECT source_id , sensors_id @@ -68,6 +72,7 @@ WITH ranked_sensors AS ( FROM active_sensors s WHERE s.source_id=ingest_id; + -- Now we have to fill in any missing information -- first add the nodes and systems that dont exist -- add just the bare minimum amount of data to the system @@ -285,6 +290,7 @@ INSERT INTO sensors_rollup ( , value_latest , value_count , value_avg + , value_sd , value_min , value_max , geom_latest @@ -299,6 +305,7 @@ WITH numbered AS ( , sum(1) OVER (PARTITION BY sensors_id) as value_count , min(datetime) OVER (PARTITION BY sensors_id) as datetime_min , avg(value) OVER (PARTITION BY sensors_id) as value_avg + , stddev(value) OVER (PARTITION BY sensors_id) as value_sd , row_number() OVER (PARTITION BY sensors_id ORDER BY datetime DESC) as rn FROM staging_inserted_measurements ), latest AS ( @@ -308,6 +315,7 @@ WITH numbered AS ( , value , value_count , value_avg + , value_sd , datetime_min , lat , lon @@ -320,6 +328,7 @@ SELECT l.sensors_id , l.value -- last value , l.value_count , l.value_avg +, l.value_sd , l.value -- min , l.value -- max , public.pt3857(lon, lat) @@ -348,12 +357,23 @@ SET datetime_last = GREATEST(sensors_rollup.datetime_last, EXCLUDED.datetime_las -- Update the table that will help to track hourly rollups -INSERT INTO hourly_stats (datetime) - SELECT date_trunc('hour', datetime) - FROM staging_inserted_measurements - GROUP BY 1 -ON CONFLICT (datetime) DO UPDATE -SET modified_on = now(); +-- this is a replacement to the hourly stats table + WITH inserted_hours AS ( + -- first we group things, adding an hour to make it time-ending after truncating + SELECT datetime + '1h'::interval as datetime + , utc_offset(datetime + '1h'::interval, tz.tzid) as tz_offset + FROM staging_inserted_measurements m + JOIN sensors s ON (s.sensors_id = m.sensors_id) + JOIN sensor_systems sy ON (s.sensor_systems_id = sy.sensor_systems_id) + JOIN sensor_nodes sn ON (sy.sensor_nodes_id = sn.sensor_nodes_id) + JOIN timezones tz ON (sn.timezones_id = tz.timezones_id) + GROUP BY 1, 2 + ) + INSERT INTO hourly_data_queue (datetime, tz_offset) + SELECT as_utc_hour(datetime, tz_offset), tz_offset + FROM inserted_hours + ON CONFLICT (datetime, tz_offset) DO UPDATE + SET modified_on = now(); --Update the export queue/logs to export these records diff --git a/ingest/etl_process_nodes.sql b/ingest/etl_process_nodes.sql index bf4b9cd..5b78e6c 100644 --- a/ingest/etl_process_nodes.sql +++ b/ingest/etl_process_nodes.sql @@ -35,7 +35,7 @@ WHERE 
units IN ('µg/m��','��g/m³', 'ug/m3'); --- match the locations to the nodes using the source_name/id combo +-- match the locations to existing nodes using the source_name/id combo UPDATE staging_sensornodes SET sensor_nodes_id = s.sensor_nodes_id , timezones_id = s.timezones_id @@ -109,7 +109,9 @@ SELECT site_name , metadata , source_id , timezones_id -, get_providers_id(source_name) +-- default to the unknown provider +-- just to make sure we have one set +, COALESCE(get_providers_id(source_name), 1) , countries_id FROM staging_sensornodes WHERE sensor_nodes_id IS NULL @@ -156,17 +158,20 @@ FROM r; -- Sensor Systems -- -------------------- + -- make sure that we have a system entry for every ingest_id -- this is to deal with fetchers that do not add these data INSERT INTO staging_sensorsystems (sensor_nodes_id, ingest_id, fetchlogs_id, metadata) SELECT sensor_nodes_id -, source_id -- the ingest_id has the source_name in it and we dont need/want that +--, source_id -- the ingest_id has the source_name in it and we dont need/want that +, ingest_id , fetchlogs_id , '{"note":"automatically added for sensor node"}' FROM staging_sensornodes -WHERE is_new +WHERE is_new AND ingest_id NOT IN (SELECT ingest_sensor_nodes_id FROM staging_sensorsystems) ON CONFLICT (ingest_id) DO UPDATE - SET sensor_nodes_id = EXCLUDED.sensor_nodes_id; + SET sensor_nodes_id = EXCLUDED.sensor_nodes_id + ; -- Now match the sensor nodes to the system UPDATE staging_sensorsystems @@ -197,15 +202,18 @@ SELECT COUNT(1) INTO __rejected_systems FROM r; -- And finally we add/update the sensor systems -INSERT INTO sensor_systems (sensor_nodes_id, source_id, metadata) +INSERT INTO sensor_systems (sensor_nodes_id, source_id, instruments_id, metadata) SELECT sensor_nodes_id -, ingest_id +, s.ingest_id +, i.instruments_id , metadata -FROM staging_sensorsystems +FROM staging_sensorsystems s +LEFT JOIN instruments i ON (s.instrument_ingest_id = i.ingest_id) WHERE sensor_nodes_id IS NOT NULL -GROUP BY sensor_nodes_id, ingest_id, metadata +GROUP BY sensor_nodes_id, s.ingest_id, instruments_id, metadata ON CONFLICT (sensor_nodes_id, source_id) DO UPDATE SET metadata=COALESCE(sensor_systems.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}') + , instruments_id = EXCLUDED.instruments_id , modified_on = now(); ---------------------------- @@ -266,10 +274,11 @@ AND sensors.source_id = staging_sensors.ingest_id; UPDATE staging_sensors -SET measurands_id = measurands.measurands_id -from measurands -WHERE staging_sensors.measurand=measurands.measurand -and staging_sensors.units=measurands.units; +SET measurands_id = m.measurands_id +FROM (SELECT measurand, MIN(measurands_id) AS measurands_id FROM measurands GROUP BY measurand) as m +WHERE staging_sensors.measurand=m.measurand +--AND staging_sensors.units=measurands.units +; WITH r AS ( @@ -290,20 +299,34 @@ INSERT INTO sensors ( source_id , sensor_systems_id , measurands_id +, data_logging_period_seconds +, data_averaging_period_seconds +, sensor_statuses_id , metadata) SELECT ingest_id , sensor_systems_id , measurands_id +, logging_interval_seconds +, averaging_interval_seconds +, COALESCE(ss.sensor_statuses_id, 1) , metadata -FROM staging_sensors +FROM staging_sensors s +LEFT JOIN sensor_statuses ss ON (ss.short_code = s.status) WHERE measurands_id is not null AND sensor_systems_id is not null GROUP BY ingest_id , sensor_systems_id , measurands_id +, logging_interval_seconds +, averaging_interval_seconds +, ss.sensor_statuses_id , metadata ON CONFLICT (sensor_systems_id, measurands_id, 
source_id) DO UPDATE SET metadata = COALESCE(sensors.metadata, '{}') || COALESCE(EXCLUDED.metadata, '{}') + , data_logging_period_seconds = EXCLUDED.data_logging_period_seconds + , data_averaging_period_seconds = EXCLUDED.data_averaging_period_seconds + , sensor_statuses_id = EXCLUDED.sensor_statuses_id + , modified_on = now() RETURNING 1) SELECT COUNT(1) INTO __inserted_sensors FROM inserts; @@ -327,6 +350,61 @@ RETURNING 1) SELECT COUNT(1) INTO __rejected_sensors FROM r; + +-- update the period so that we dont have to keep doing it later +-- we could do this on import as well if we feel this is slowing us down +UPDATE staging_flags + SET period = tstzrange(COALESCE(datetime_from, '-infinity'::timestamptz),COALESCE(datetime_to, 'infinity'::timestamptz), '[]'); + +-- Now we have to match things +-- get the right node id and sensors id for the flags +UPDATE staging_flags +SET sensors_id = s.sensors_id + , sensor_nodes_id = sy.sensor_nodes_id +FROM sensors s +JOIN sensor_systems sy ON (s.sensor_systems_id = sy.sensor_systems_id) +WHERE staging_flags.sensor_ingest_id = s.source_id; + +-- and then get the right flags_id +UPDATE staging_flags +SET flag_types_id = ft.flag_types_id +FROM flag_types ft +WHERE split_part(staging_flags.ingest_id, '::', 1) = ft.ingest_id; + +-- now we should look to see if we should be just extending a flag +UPDATE staging_flags sf + SET flags_id = fm.flags_id + FROM flags fm + -- where the core information is the same (exactly) + WHERE sf.sensor_nodes_id = fm.sensor_nodes_id + AND sf.flag_types_id = fm.flag_types_id + AND ((sf.note = fm.note) OR (sf.note IS NULL AND fm.note IS NULL)) + -- the periods touch or overlap + AND fm.period && sf.period + -- and the flagged record sensors contains the current sensors + AND fm.sensors_ids @> ARRAY[sf.sensors_id]; + +-- and finally we will insert the new flags +INSERT INTO flags (flag_types_id, sensor_nodes_id, sensors_ids, period, note) + SELECT flag_types_id + , sensor_nodes_id + , CASE WHEN sensors_id IS NOT NULL THEN ARRAY[sensors_id] ELSE NULL END + , period + , note + FROM staging_flags + WHERE flag_types_id IS NOT NULL + AND sensor_nodes_id IS NOT NULL + AND flags_id IS NULL; + +-- And then update any that need to be updated + UPDATE flags fm + SET period = sf.period + fm.period + , note = sf.note + , modified_on = now() + FROM staging_flags sf + WHERE sf.flags_id = fm.flags_id; + + ------------------ -- Return stats -- ------------------ diff --git a/ingest/lcsV2.py b/ingest/lcsV2.py index f162175..5aebd59 100644 --- a/ingest/lcsV2.py +++ b/ingest/lcsV2.py @@ -68,6 +68,7 @@ def to_timestamp(key, data): else: dt = datetime.fromtimestamp(int(dt), timezone.utc) else: + return dt dt = dateparser.parse(dt).replace(tzinfo=timezone.utc) return dt.isoformat() @@ -83,8 +84,11 @@ def __init__( self.st = datetime.now().replace(tzinfo=pytz.UTC) self.sensors = [] self.systems = [] + self.flags = [] self.nodes = [] - self.node_ids = {} + self.node_ids = [] + self.system_ids = [] + self.sensor_ids = [] self.measurements = [] self.matching_method = 'ingest-id' self.source = None @@ -135,7 +139,7 @@ def process(self, key, data, mp): value = func(key, data) return col, value - def dump(self): + def dump(self, load: bool = True): """ Dump any data that is currenly loaded into the database We will dump if there is data OR if we have loaded any keys @@ -144,15 +148,16 @@ def dump(self): """ logger.debug(f"Dumping data from {len(self.keys)} files") if len(self.nodes)>0 or len(self.keys)>0: - self.dump_locations() + 
self.dump_locations(load) if len(self.measurements)>0 or len(self.keys)>0: - self.dump_measurements() + self.dump_measurements(load) - def dump_locations(self): + def dump_locations(self, load: bool = True): """ Dump the nodes into the temporary tables """ - logger.debug(f"Dumping {len(self.nodes)} nodes") + db_table = "TEMP TABLE" if (settings.USE_TEMP_TABLES and load) else "TABLE" + logger.debug(f"Dumping {len(self.nodes)} nodes using {db_table} ({settings.USE_TEMP_TABLES}|{load})") with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: connection.set_session(autocommit=True) with connection.cursor() as cursor: @@ -160,7 +165,7 @@ def dump_locations(self): cursor.execute(get_query( "temp_locations_dump.sql", - table="TEMP TABLE" if settings.USE_TEMP_TABLES else "TABLE" + table=db_table )) write_csv( @@ -207,11 +212,13 @@ def dump_locations(self): "staging_sensorsystems", [ "ingest_id", + "instrument_ingest_id", "ingest_sensor_nodes_id", "metadata", "fetchlogs_id", ], ) + write_csv( cursor, self.sensors, @@ -221,15 +228,35 @@ def dump_locations(self): "ingest_sensor_systems_id", "measurand", "units", + "status", + "logging_interval_seconds", + "averaging_interval_seconds", "metadata", "fetchlogs_id", ], ) + + write_csv( + cursor, + self.flags, + "staging_flags", + [ + "ingest_id", + "sensor_ingest_id", + "datetime_from", + "datetime_to", + "note", + "metadata", + "fetchlogs_id", + ], + ) + connection.commit() # and now we load all the nodes,systems and sensors - query = get_query("etl_process_nodes.sql") - cursor.execute(query) + if load: + query = get_query("etl_process_nodes.sql") + cursor.execute(query) for notice in connection.notices: logger.debug(notice) @@ -249,8 +276,10 @@ def dump_locations(self): logger.debug(notice) - def dump_measurements(self): - logger.debug(f"Dumping {len(self.measurements)} measurements") + + def dump_measurements(self, load: bool = True): + db_table = "TEMP TABLE" if (settings.USE_TEMP_TABLES and load) else "TABLE" + logger.debug(f"Dumping {len(self.measurements)} measurements using {db_table} ({settings.USE_TEMP_TABLES}|{load})") with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: connection.set_session(autocommit=True) with connection.cursor() as cursor: @@ -258,7 +287,7 @@ def dump_measurements(self): cursor.execute(get_query( "temp_measurements_dump.sql", - table="TEMP TABLE" if settings.USE_TEMP_TABLES else 'TABLE' + table=db_table )) iterator = StringIteratorIO( @@ -272,18 +301,19 @@ def dump_measurements(self): iterator, ) - # process the measurements - logger.info(f'processing {len(self.measurements)} measurements'); - query = get_query("etl_process_measurements.sql") - try: - cursor.execute(query) - connection.commit() - logger.info("dump_measurements: measurements: %s; time: %0.4f", len(self.measurements), time() - start_time) - for notice in connection.notices: - logger.debug(notice) + if load: + logger.info(f'processing {len(self.measurements)} measurements'); + query = get_query("etl_process_measurements.sql") + try: + cursor.execute(query) + connection.commit() + logger.info("dump_measurements: measurements: %s; time: %0.4f", len(self.measurements), time() - start_time) + for notice in connection.notices: + logger.debug(notice) + + except Exception as err: + logger.error(err) - except Exception as err: - logger.error(err) def load(self, data = {}): if "meta" in data.keys(): @@ -293,6 +323,25 @@ def load(self, data = {}): if "measures" in data.keys(): self.load_measurements(data.get('measures')) + + def reset(self): + 
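# A short usage sketch of the load flag introduced here, assuming the local
# tests/test_file1.json fixture and a configured database connection: with
# load=False the client only copies the staging tables (created as permanent
# tables, since USE_TEMP_TABLES only applies when load is True) and skips the
# etl_process_*.sql steps, so the staged rows can be inspected before running
# a full ingest.
from ingest.lcsV2 import IngestClient

client = IngestClient()
# rows are [fetchlogs_id, key, last_modified]
client.load_keys([[1, './tests/test_file1.json', '2024-01-01']])
client.dump_locations(load=False)     # stage nodes, systems, sensors and flags
client.dump_measurements(load=False)  # stage measurements only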
""" + Reset the client to the new state. Mostly for testing purposes + """ + logger.debug("Reseting the client data") + self.measurements = [] + self.nodes = [] + self.systems = [] + self.sensors = [] + self.flags = [] + self.keys = [] + self.key = None + self.fetchlogs_id = None + self.node_ids = [] + self.system_ids = [] + self.sensor_ids = [] + + def load_keys(self, rows): # for each fetchlog we need to read and load for row in rows: @@ -310,12 +359,15 @@ def load_key(self, key, fetchlogs_id, last_modified): # is it a local file? This is used for dev # but likely fine to leave in - if os.path.exists(key): - content = get_file(key).read() + if os.path.exists(os.path.expanduser(key)): + content = get_file(os.path.expanduser(key)).read() else: content = select_object(key) - logger.debug(f"Read content containing {len(content)} lines") + if is_json: + logger.debug(f"Read JSON containing {len(content)} characters") + else: + logger.debug(f"Read CSV containing {len(content)} lines") if is_csv: # all csv data will be measurements @@ -332,7 +384,6 @@ def load_key(self, key, fetchlogs_id, last_modified): self.keys.append({"key": key, "last_modified": last_modified, "fetchlogs_id": fetchlogs_id}) - def load_metadata(self, meta): if "source" in meta.keys(): self.source = meta.get('source') @@ -346,8 +397,12 @@ def load_locations(self, locations): self.add_node(loc) def load_measurements(self, measurements): + logger.debug(f'Loading {len(measurements)} measurements') for meas in measurements: self.add_measurement(meas) + logger.debug(f'Loaded measurements') + + def add_sensor(self, j, system_id, fetchlogsId): for s in j: @@ -355,27 +410,87 @@ def add_sensor(self, j, system_id, fetchlogsId): metadata = {} sensor["ingest_sensor_systems_id"] = system_id sensor["fetchlogs_id"] = fetchlogsId + + if "sensor_id" in s: + id = s.get("sensor_id") + elif "id" in s: + id = s.get("id") + else: + id = system_id + + if id in self.sensor_ids: + # would it make more sense to merge or skip or throw error? + # merge and submit a warning maybe? 
+ continue + + sensor["ingest_id"] = id + for key, value in s.items(): key = str.replace(key, "sensor_", "") - if key == "id": - sensor["ingest_id"] = value + if key == "flags": + self.add_flags(value, id, fetchlogsId) elif key == "measurand_parameter": sensor["measurand"] = value elif key == "measurand_unit": sensor["units"] = fix_units(value) + elif key == "status": + sensor["status"] = value + elif key == "interval_seconds": + sensor["logging_interval_seconds"] = value + sensor["averaging_interval_seconds"] = value else: metadata[key] = value + if not sensor.get('measurand'): + # get it from the ingest id + ingest_arr = sensor.get('ingest_id').split('-') + sensor['measurand'] = ingest_arr[-1] sensor["metadata"] = orjson.dumps(metadata).decode() self.sensors.append(sensor) + self.sensor_ids.append(id) + + def add_flags(self, flags, sensor_id, fetchlogsId): + for f in flags: + flag = {} + metadata = {} + flag["sensor_ingest_id"] = sensor_id + flag["fetchlogs_id"] = fetchlogsId + for key, value in f.items(): + key = str.replace(key, "flag_", "") + if key == "id": + v = str.replace(value, f"{sensor_id}-", "") + flag["ingest_id"] = v + + elif key == 'datetime_from': + flag["datetime_from"] = value + elif key == 'datetime_to': + flag["datetime_to"] = value + elif key == 'note': + flag["note"] = value + else: + metadata[key] = value - def add_system(self, j, node_id, fetchlogsId): + flag["metadata"] = orjson.dumps(metadata).decode() + self.flags.append(flag) + + def add_systems(self, j, node_id, fetchlogsId): for s in j: system = {} metadata = {} if "sensor_system_id" in s: - id = s["sensor_system_id"] + id = s.get("sensor_system_id") + elif "system_id" in s: + id = s.get("system_id") else: id = node_id + + if id in self.system_ids: + # would it make more sense to merge or skip or throw error? 
+ continue + + ingest_arr = id.split('-') + if len(ingest_arr) == 3: + system["instrument_ingest_id"] = ingest_arr[-1]; + system["ingest_sensor_nodes_id"] = node_id system["ingest_id"] = id system["fetchlogs_id"] = fetchlogsId @@ -387,6 +502,7 @@ def add_system(self, j, node_id, fetchlogsId): metadata[key] = value system["metadata"] = orjson.dumps(metadata).decode() self.systems.append(system) + self.system_ids.append(id) def add_node(self, j): fetchlogs_id = j.get('fetchlogs_id', self.fetchlogs_id) @@ -400,7 +516,8 @@ def add_node(self, j): if col is not None: node[col] = value else: - metadata[k] = v + if not k in ['systems','sensor_system']: + metadata[k] = v # make sure we actually have data to add if len(node.keys())>0: @@ -433,14 +550,18 @@ def add_node(self, j): # prevent adding the node more than once # this does not save processing time of course - # logger.debug(node) if ingest_id not in self.node_ids: node["metadata"] = orjson.dumps(metadata).decode() - self.node_ids[ingest_id] = True + self.node_ids.append(ingest_id) self.nodes.append(node) # now look for systems if "sensor_system" in j.keys(): - self.system(j.get('sensor_system'), node.get('ingest_id'), node.get('fetchlogs_id')) + self.add_systems(j.get('sensor_system'), node.get('ingest_id'), node.get('fetchlogs_id')) + elif "systems" in j.keys(): + self.add_systems(j.get("systems"), node.get('ingest_id'), node.get('fetchlogs_id')) + else: + # no systems + logger.debug(j.keys()) else: logger.warning('nothing mapped to node') @@ -496,6 +617,70 @@ def add_measurement(self, m): self.measurements.append([ingest_id, source_name, source_id, measurand, value, datetime, lon, lat, fetchlogs_id]) + + def refresh_cached_tables(self): + """ + Refresh the cached tables that we use for most production endpoints. + Right now this is just for testing purposes + """ + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + logger.debug("Refreshing the cached tables") + cursor.execute("REFRESH MATERIALIZED VIEW locations_view_cached;") + cursor.execute("REFRESH MATERIALIZED VIEW locations_manufacturers_cached;") + cursor.execute("REFRESH MATERIALIZED VIEW locations_latest_measurements_cached;") + cursor.execute("REFRESH MATERIALIZED VIEW providers_view_cached;") + cursor.execute("REFRESH MATERIALIZED VIEW countries_view_cached;") + cursor.execute("REFRESH MATERIALIZED VIEW parameters_view_cached;") + + + + def process_hourly_data(self,n: int = 1000): + """ + Process any pending hourly data rollups. + Right now this is just for testing purposes + """ + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute("SELECT datetime, tz_offset FROM fetch_hourly_data_jobs(%s)", (n,)) + rows = cursor.fetchall() + for row in rows: + cursor.execute("SELECT update_hourly_data(%s, %s)", row) + connection.commit() + + + def process_daily_data(self,n: int = 500): + """ + Process any pending daily data rollups. 
+ Right now this is just for testing purposes + """ + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute("SELECT datetime, tz_offset FROM fetch_daily_data_jobs(%s)", (n,)) + rows = cursor.fetchall() + for row in rows: + cursor.execute("SELECT update_daily_data(%s, %s)", row) + connection.commit() + + + def process_annual_data(self,n: int = 25): + """ + Process any pending annual data rollups. + Right now this is just for testing purposes + """ + with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection: + connection.set_session(autocommit=True) + with connection.cursor() as cursor: + cursor.execute("SELECT datetime, tz_offset FROM fetch_annual_data_jobs(%s)", (n,)) + rows = cursor.fetchall() + for row in rows: + cursor.execute("SELECT update_annual_data(%s, %s)", row) + connection.commit() + + def get_metadata(self): hasnew = False for obj in self.page: @@ -523,11 +708,6 @@ def get_metadata(self): logger.debug(f"get_metadata:hasnew - {self.keys}") self.load_data() - - - - - def create_staging_table(cursor): # table and batch are used primarily for testing cursor.execute(get_query( @@ -550,6 +730,8 @@ def write_csv(cursor, data, table, columns): logger.debug(f"table: {table}; rowcount: {cursor.rowcount}") + + def load_metadata_bucketscan(count=100): paginator = s3c.get_paginator("list_objects_v2") for page in paginator.paginate( diff --git a/ingest/temp_locations_dump.sql b/ingest/temp_locations_dump.sql index 71ce88f..5cc645a 100644 --- a/ingest/temp_locations_dump.sql +++ b/ingest/temp_locations_dump.sql @@ -2,6 +2,7 @@ DROP TABLE IF EXISTS staging_sensornodes , staging_sensorsystems , staging_sensors +, staging_flags , staging_keys; CREATE {table} IF NOT EXISTS staging_keys ( @@ -32,6 +33,7 @@ CREATE {table} IF NOT EXISTS staging_sensorsystems ( sensor_systems_id int, is_new boolean DEFAULT true, ingest_id text NOT NULL UNIQUE, + instrument_ingest_id text, ingest_sensor_nodes_id text, sensor_nodes_id int, metadata jsonb, @@ -41,12 +43,32 @@ CREATE {table} IF NOT EXISTS staging_sensorsystems ( CREATE {table} IF NOT EXISTS staging_sensors ( ingest_id text, is_new boolean DEFAULT true, + -- source_name text NOT NULL, + -- source_id text NOT NULL, sensors_id int, sensor_systems_id int, ingest_sensor_systems_id text, + status text, measurand text, units text, measurands_id int, + averaging_interval_seconds int, + logging_interval_seconds int, + metadata jsonb, + fetchlogs_id int +); + +CREATE {table} IF NOT EXISTS staging_flags ( + ingest_id text NOT NULL, + sensor_ingest_id text NOT NULL, + flags_id int, + sensor_nodes_id int, + sensors_id int, + flag_types_id int, + datetime_from timestamptz, + datetime_to timestamptz, + period tstzrange, + note text, metadata jsonb, fetchlogs_id int ); diff --git a/ingest/temp_measurements_dump.sql b/ingest/temp_measurements_dump.sql index 0d47f76..4840750 100644 --- a/ingest/temp_measurements_dump.sql +++ b/ingest/temp_measurements_dump.sql @@ -5,20 +5,38 @@ DROP TABLE IF EXISTS CREATE {table} IF NOT EXISTS staging_sensors ( - ingest_id text NOT NULL, + ingest_id text, is_new boolean DEFAULT true, source_name text NOT NULL, source_id text NOT NULL, - measurand text NOT NULL, sensors_id int, sensor_systems_id int, ingest_sensor_systems_id text, + status text, + measurand text, units text, measurands_id int, + averaging_interval_seconds int, + logging_interval_seconds int, metadata jsonb, fetchlogs_id int ); +-- CREATE {table} IF NOT EXISTS 
staging_sensors ( +-- ingest_id text NOT NULL, +-- is_new boolean DEFAULT true, +-- source_name text NOT NULL, +-- source_id text NOT NULL, +-- measurand text NOT NULL, +-- sensors_id int, +-- sensor_systems_id int, +-- ingest_sensor_systems_id text, +-- units text, +-- measurands_id int, +-- metadata jsonb, +-- fetchlogs_id int +-- ); + CREATE {table} IF NOT EXISTS staging_measurements ( ingest_id text NOT NULL, source_name text NOT NULL, diff --git a/local.py b/local.py index 5df871c..ce645d5 100644 --- a/local.py +++ b/local.py @@ -31,28 +31,35 @@ logging.getLogger('urllib3').setLevel(logging.WARNING) - - -rows = [ - [3, '/home/christian/Downloads/habitatmap-1714036497-h84j.csv', '2024-01-01 00:00:00'], - [4, '/home/christian/Downloads/airgradient-1714003639-h32tu.csv', '2024-01-05'], - [5, '/home/christian/Downloads/senstate-1714007461-ivz5g.csv', '2021-02-01'], - [1, '/home/christian/Downloads/1610335354.csv', '2022-01-01'] - [6, '/home/christian/Downloads/1722384430-2vfvm.json', '2024-07-30'], - [7, '/home/christian/Downloads/1722384430-2vfvm_meas.json', '2024-07-30'] - ] - - # local files #load_measurements_db(pattern = '^/home/christian/.*\\.(csv|json)') # remote files, make sure it can at least read it #load_measurements_db() ## client based methods +## get a client client = IngestClient() -client.load_keys(rows) +## load all the data into the client +client.load_keys([ + [8, '~/Downloads/cac-pipeline/measures/cac/2024-10-18/test_data.json.gz', '2024-10-09'] +]) + +## dump just the locations client.dump() +# rollups and cached tables +client.process_hourly_data() +client.process_daily_data() +client.process_annual_data() +client.refresh_cached_tables() + +#client.dump_locations(load=False) +#client.dump_measurements(load=False) +## dump just the measurements +# client.dump_measurements +## Dump both +#client.dump() + # #client.load(data) # client.load_metadata(data['meta']) # client.load_locations(data['locations']) diff --git a/tests/test_file1.json b/tests/test_file1.json new file mode 100644 index 0000000..228fb0b --- /dev/null +++ b/tests/test_file1.json @@ -0,0 +1,120 @@ +{ + "meta": { + "schema": "v0.1", + "source": "local", + "matching_method": "ingest-id" + }, + "measures": [ + { + "sensor_id": "local-test_site_1-co", + "timestamp": "2024-01-01T00:00:00Z", + "measure": 0.01 + }, + { + "sensor_id": "local-test_site_1-co", + "timestamp": "2024-01-02T00:00:00Z", + "measure": 0.02 + }, + { + "sensor_id": "local-test_site_2-wind_speed", + "timestamp": "2024-01-01T00:00:00Z", + "measure": 0.01 + }, + { + "sensor_id": "local-test_site_2-wind_speed", + "timestamp": "2024-01-02T00:00:00Z", + "measure": 0.02 + } + ], + "locations": [ + { + "location": "local-test_site_1", + "label": "Test Site #1", + "lat": "45.56", + "lon": -123.45, + "ismobile": "false", + "systems": [ + { + "system_id": "local-test_site_1-metone:aio2", + "manufacturer_name": "MetOne", + "model_name": "AIO2", + "sensors": [ + { + "sensor_id": "local-test_site_1-wind_speed", + "status": "u", + "parameter": "ws", + "interval_seconds": "3600", + "flags": [ + { + "flag_id": "local-test_site_1-wind_speed-info::2024-01-01", + "datetime_from": "2024-01-01", + "datetime_to": "2024-01-02", + "flag_name": "info", + "note": "initial flag for sensor" + } + ] + } + ] + }, + { + "system_id": "local-test_site_1-ecotech:serinus_30", + "manufacturer_name": "Ecotech", + "model_name": "Serinus 30", + "sensors": [ + { + "sensor_id": "local-test_site_1-co", + "status": "u", + "parameter": "co", + "interval_seconds": "3600", + "flags": 
[ + { + "flag_id": "local-test_site_1-co-info::2024-01-01", + "datetime_from": "2024-01-01", + "datetime_to": "2024-01-05", + "flag_name": "info" + } + ] + } + ] + } + ] + }, + { + "location": "local-test_site_2", + "label": "Test Site #2", + "lat": "47.56", + "lon": -124.45, + "ismobile": "false", + "systems": [ + { + "system_id": "local-test_site_2-metone:aio2", + "manufacturer_name": "MetOne", + "model_name": "AIO2", + "sensors": [ + { + "sensor_id": "local-test_site_2-wind_speed", + "status": "u", + "parameter": "ws", + "interval_seconds": "3600", + "flags": [] + }, + { + "sensor_id": "local-test_site_2-wind_direction", + "status": "u", + "parameter": "wd", + "interval_seconds": "3600", + "flags": [ + { + "flag_id": "local-test_site_2-wind_direction-error::2024-01-01", + "datetime_from": "2024-01-01", + "datetime_to": "2024-01-02", + "flag_name": "info" + } + ] + } + ] + } + ] + } + ] +} diff --git a/tests/test_file2.json b/tests/test_file2.json new file mode 100644 index 0000000..9a619d5 --- /dev/null +++ b/tests/test_file2.json @@ -0,0 +1,127 @@ +{ + "meta": { + "schema": "v0.1", + "source": "local", + "matching_method": "ingest-id" + }, + "measures": [ + { + "sensor_id": "local-test_site_1-co", + "timestamp": "2024-01-03T00:00:00Z", + "measure": 0.03 + }, + { + "sensor_id": "local-test_site_1-co", + "timestamp": "2024-01-04T00:00:00Z", + "measure": 0.04 + }, + { + "sensor_id": "local-test_site_1-wind_speed", + "timestamp": "2024-01-03T00:00:00Z", + "measure": 0.03 + }, + { + "sensor_id": "local-test_site_1-wind_speed", + "timestamp": "2024-01-04T00:00:00Z", + "measure": 0.04 + } + ], + "locations": [ + { + "location": "local-test_site_1", + "label": "Test Site #1", + "lat": "45.56", + "lon": -123.45, + "ismobile": "false", + "systems": [ + { + "system_id": "local-test_site_1-metone:aio2", + "manufacturer_name": "MetOne", + "model_name": "AIO2", + "sensors": [ + { + "sensor_id": "local-test_site_1-wind_speed", + "status": "u", + "parameter": "ws", + "interval_seconds": "3600", + "flags": [ + { + "flag_id": "local-test_site_1-wind_speed-info::2024-01-01", + "datetime_from": "2024-01-02", + "datetime_to": "2024-01-04", + "flag_name": "info", + "note": "initial flag for sensor" + }, + { + "flag_id": "local-test_site_1-wind_speed-info::2024-01-01", + "datetime_from": "2024-01-02", + "datetime_to": "2024-01-04", + "flag_name": "info", + "note": "A new note for this sensor" + } + ] + } + ] + }, + { + "system_id": "local-test_site_1-ecotech:serinus_30", + "manufacturer_name": "Ecotech", + "model_name": "Serinus 30", + "sensors": [ + { + "sensor_id": "local-test_site_1-co", + "status": "u", + "parameter": "co", + "interval_seconds": "3600", + "flags": [ + { + "flag_id": "local-test_site_1-co-info::2024-01-01", + "datetime_from": "2024-01-01", + "datetime_to": "2024-01-05", + "flag_name": "info" + } + ] + } + ] + } + ] + }, + { + "location": "local-test_site_2", + "label": "Test Site #2", + "lat": "47.56", + "lon": -124.45, + "ismobile": "false", + "systems": [ + { + "system_id": "local-test_site_2-metone:aio2", + "manufacturer_name": "MetOne", + "model_name": "AIO2", + "sensors": [ + { + "sensor_id": "local-test_site_2-wind_speed", + "status": "u", + "parameter": "ws", + "interval_seconds": "3600", + "flags": [] + }, + { + "sensor_id": "local-test_site_2-wind_direction", + "status": "u", + "parameter": "wd", + "interval_seconds": "3600", + "flags": [ + { + "flag_id": "local-test_site_2-wind_direction-error::2024-01-03", + "datetime_from": "2024-01-03", + "datetime_to": "2024-01-04", + 
"flag_name": "info" + } + ] + } + ] + } + ] + } + ] +} diff --git a/tests/test_flags.py b/tests/test_flags.py new file mode 100644 index 0000000..8ac6fd4 --- /dev/null +++ b/tests/test_flags.py @@ -0,0 +1,48 @@ +import os +import sys +import orjson +import psycopg2 +import logging +from time import time +import csv + +os.chdir(os.path.dirname(os.path.dirname(__file__))) + +from ingest.lcsV2 import ( + IngestClient, +) + + +logger = logging.getLogger('handler') + +logging.basicConfig( + format='[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s', + level='DEBUG', + force=True, +) + +logging.getLogger('boto3').setLevel(logging.WARNING) +logging.getLogger('botocore').setLevel(logging.WARNING) +logging.getLogger('urllib3').setLevel(logging.WARNING) + + +## client based methods +## get a client +client = IngestClient() +## load all the data into the client +client.load_keys([[1, './tests/test_file1.json', '2024-01-01']]) +## load the data +client.dump(load=True) +#client.dump_locations(load=False) +#client.dump_measurements(load=True) + +client.reset() + +client.load_keys([[2, './tests/test_file2.json', '2024-01-02']]) +## load the data +client.dump(load=True) + +client.process_hourly_data() +client.process_daily_data() +client.process_annual_data() +client.refresh_cached_tables() From 9a9f3cf9639577cf6348a8ab0967816ba7c078d0 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Wed, 23 Oct 2024 09:58:54 -0700 Subject: [PATCH 36/42] Fixed bug with inserted new hourly data --- ingest/etl_process_measurements.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/ingest/etl_process_measurements.sql b/ingest/etl_process_measurements.sql index f6513b9..39029eb 100644 --- a/ingest/etl_process_measurements.sql +++ b/ingest/etl_process_measurements.sql @@ -372,6 +372,7 @@ SET datetime_last = GREATEST(sensors_rollup.datetime_last, EXCLUDED.datetime_las INSERT INTO hourly_data_queue (datetime, tz_offset) SELECT as_utc_hour(datetime, tz_offset), tz_offset FROM inserted_hours + GROUP BY 1, 2 ON CONFLICT (datetime, tz_offset) DO UPDATE SET modified_on = now(); From fb0ee0ccdf025dbbf48817a9c0fe64e9ed157ea3 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Thu, 24 Oct 2024 13:26:07 -0700 Subject: [PATCH 37/42] Updated the insterted_hours method --- check.py | 5 ++++- ingest/etl_process_measurements.sql | 24 ++++++++++++++++++++++- ingest/lcs_meas_ingest.sql | 30 ++++++++++++++++++++++++++--- 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/check.py b/check.py index aa247df..5744fb5 100644 --- a/check.py +++ b/check.py @@ -82,9 +82,12 @@ from ingest.lcs import ( load_metadata, + load_metadata_batch, +) + +from ingest.lcsV2 import ( load_measurements, load_measurements_batch, - load_metadata_batch, ) from ingest.fetch import ( diff --git a/ingest/etl_process_measurements.sql b/ingest/etl_process_measurements.sql index 39029eb..ee38adc 100644 --- a/ingest/etl_process_measurements.sql +++ b/ingest/etl_process_measurements.sql @@ -53,7 +53,7 @@ FROM staging_measurements; -- this is a short term fix -- a long term fix would not allow duplicate source_id's WITH staged_sensors AS ( - -- this first part signficantly speeds it up on slow machines + -- this first part significantly speeds it up on slow machines SELECT DISTINCT ingest_id FROM staging_measurements ), ranked_sensors AS ( @@ -377,6 +377,28 @@ SET datetime_last = GREATEST(sensors_rollup.datetime_last, EXCLUDED.datetime_las SET modified_on = now(); + + WITH inserted_hours AS ( + -- first we group things, adding an hour to make it 
time-ending after truncating + SELECT datetime + '1h'::interval as datetime + , utc_offset(datetime + '1h'::interval, tz.tzid) as tz_offset + FROM measurements m + JOIN sensors s ON (s.sensors_id = m.sensors_id) + JOIN sensor_systems sy ON (s.sensor_systems_id = sy.sensor_systems_id) + JOIN sensor_nodes sn ON (sy.sensor_nodes_id = sn.sensor_nodes_id) + JOIN timezones tz ON (sn.timezones_id = tz.timezones_id) + WHERE m.added_on > now() - '1h'::interval + GROUP BY 1, 2 + ) + INSERT INTO hourly_data_queue (datetime, tz_offset) + SELECT as_utc_hour(datetime, tz_offset), tz_offset + FROM inserted_hours + GROUP BY 1, 2 + ON CONFLICT (datetime, tz_offset) DO UPDATE + SET modified_on = now(); + + + --Update the export queue/logs to export these records --wrap it in a block just in case the database does not have this module installed --we subtract the second because the data is assumed to be time ending diff --git a/ingest/lcs_meas_ingest.sql b/ingest/lcs_meas_ingest.sql index 468b6ab..8ab06a7 100644 --- a/ingest/lcs_meas_ingest.sql +++ b/ingest/lcs_meas_ingest.sql @@ -47,17 +47,41 @@ INTO __total_measurements FROM meas; +-- -- The ranking is to deal with the current possibility +-- -- that duplicate sensors with the same ingest/source id are created +-- -- this is a short term fix +-- -- a long term fix would not allow duplicate source_id's +-- WITH ranked_sensors AS ( +-- SELECT s.sensors_id +-- , s.source_id +-- , RANK() OVER (PARTITION BY s.source_id ORDER BY added_on ASC) as rnk +-- FROM sensors s +-- JOIN meas m ON (s.source_id = m.ingest_id) +-- WHERE s.is_active +-- ), active_sensors AS ( +-- SELECT source_id +-- , sensors_id +-- FROM ranked_sensors +-- WHERE rnk = 1) +-- UPDATE meas +-- SET sensors_id=s.sensors_id +-- FROM active_sensors s +-- WHERE s.source_id=ingest_id; + -- The ranking is to deal with the current possibility -- that duplicate sensors with the same ingest/source id are created -- this is a short term fix -- a long term fix would not allow duplicate source_id's -WITH ranked_sensors AS ( +WITH staged_sensors AS ( + -- this first part signficantly speeds it up on slow machines + SELECT DISTINCT ingest_id + FROM meas +), ranked_sensors AS ( SELECT s.sensors_id , s.source_id , RANK() OVER (PARTITION BY s.source_id ORDER BY added_on ASC) as rnk FROM sensors s - JOIN meas m ON (s.source_id = m.ingest_id) - WHERE s.is_active + JOIN staged_sensors m ON (s.source_id = m.ingest_id) ), active_sensors AS ( SELECT source_id , sensors_id From 0016d3d6802c1637a1c3f10524ab73b35d73bc0c Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Thu, 24 Oct 2024 13:30:20 -0700 Subject: [PATCH 38/42] Removed query --- ingest/etl_process_measurements.sql | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/ingest/etl_process_measurements.sql b/ingest/etl_process_measurements.sql index ee38adc..5d911f7 100644 --- a/ingest/etl_process_measurements.sql +++ b/ingest/etl_process_measurements.sql @@ -377,28 +377,6 @@ SET datetime_last = GREATEST(sensors_rollup.datetime_last, EXCLUDED.datetime_las SET modified_on = now(); - - WITH inserted_hours AS ( - -- first we group things, adding an hour to make it time-ending after truncating - SELECT datetime + '1h'::interval as datetime - , utc_offset(datetime + '1h'::interval, tz.tzid) as tz_offset - FROM measurements m - JOIN sensors s ON (s.sensors_id = m.sensors_id) - JOIN sensor_systems sy ON (s.sensor_systems_id = sy.sensor_systems_id) - JOIN sensor_nodes sn ON (sy.sensor_nodes_id = sn.sensor_nodes_id) - JOIN timezones tz ON 
(sn.timezones_id = tz.timezones_id) - WHERE m.added_on > now() - '1h'::interval - GROUP BY 1, 2 - ) - INSERT INTO hourly_data_queue (datetime, tz_offset) - SELECT as_utc_hour(datetime, tz_offset), tz_offset - FROM inserted_hours - GROUP BY 1, 2 - ON CONFLICT (datetime, tz_offset) DO UPDATE - SET modified_on = now(); - - - --Update the export queue/logs to export these records --wrap it in a block just in case the database does not have this module installed --we subtract the second because the data is assumed to be time ending From 37ba6176e142f065154c5a23f9f0bd093a528ff3 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Tue, 12 Nov 2024 10:08:02 -0800 Subject: [PATCH 39/42] Redirected the realtime hourly queue update to the new table --- ingest/fetch_ingest_full.sql | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/ingest/fetch_ingest_full.sql b/ingest/fetch_ingest_full.sql index f504ff7..3234d53 100644 --- a/ingest/fetch_ingest_full.sql +++ b/ingest/fetch_ingest_full.sql @@ -832,12 +832,31 @@ SET datetime_last = GREATEST(sensors_rollup.datetime_last, EXCLUDED.datetime_las -- Update the table that will help to track hourly rollups -INSERT INTO hourly_stats (datetime) - SELECT date_trunc('hour', datetime) - FROM temp_inserted_measurements - GROUP BY 1 -ON CONFLICT (datetime) DO UPDATE -SET modified_on = now(); +--INSERT INTO hourly_stats (datetime) +-- SELECT date_trunc('hour', datetime) +-- FROM temp_inserted_measurements +-- GROUP BY 1 +--ON CONFLICT (datetime) DO UPDATE +--SET modified_on = now(); + + WITH inserted_hours AS ( + -- first we group things, adding an hour to make it time-ending after truncating + SELECT datetime + '1h'::interval as datetime + , utc_offset(datetime + '1h'::interval, tz.tzid) as tz_offset + FROM temp_inserted_measurements m + JOIN sensors s ON (s.sensors_id = m.sensors_id) + JOIN sensor_systems sy ON (s.sensor_systems_id = sy.sensor_systems_id) + JOIN sensor_nodes sn ON (sy.sensor_nodes_id = sn.sensor_nodes_id) + JOIN timezones tz ON (sn.timezones_id = tz.timezones_id) + GROUP BY 1, 2 + ) + INSERT INTO hourly_data_queue (datetime, tz_offset) + SELECT as_utc_hour(datetime, tz_offset), tz_offset + FROM inserted_hours + GROUP BY 1, 2 + ON CONFLICT (datetime, tz_offset) DO UPDATE + SET modified_on = now(); + -- update the table that will track the daily exports WITH e AS ( From 398c0f5dac08d4838ce5eaafb460a9a438615212 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Wed, 13 Nov 2024 10:57:52 -0800 Subject: [PATCH 40/42] Updated to support uuid --- README.md | 2 ++ check.py | 8 +++----- ingest/handler.py | 1 - ingest/lcsV2.py | 12 ++++++------ local.py | 12 ++++++------ 5 files changed, 17 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index d90d3e4..a5a4068 100644 --- a/README.md +++ b/README.md @@ -2,3 +2,5 @@ # Testing a realtime file + +# Testing files diff --git a/check.py b/check.py index 5744fb5..71020fc 100644 --- a/check.py +++ b/check.py @@ -5,8 +5,7 @@ import orjson import psycopg2 - -logger = logging.getLogger(__name__) +logger = logging.getLogger('check.py') #os.chdir('/home/christian/git/caparker/openaq-ingestor/ingest') #print(os.getcwd()) @@ -77,7 +76,7 @@ os.environ['USE_TEMP_TABLES'] = 'False' from botocore.exceptions import ClientError -from ingest.handler import cronhandler, logger +from ingest.handler import cronhandler from ingest.settings import settings from ingest.lcs import ( @@ -157,8 +156,6 @@ def check_realtime_key(key: str, fix: bool = False): 
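# A small illustration of the bucketing convention behind the
# hourly_data_queue inserts above: a raw timestamp is pushed forward one hour
# and then truncated, so the queue is keyed by the time-ending hour that
# contains the measurement. The real conversion happens in SQL through the
# database's utc_offset()/as_utc_hour() helpers, which also fold in the
# location's timezone offset; the pure-Python sketch below only shows the
# add-then-truncate step.
from datetime import datetime, timedelta, timezone

def time_ending_hour(ts: datetime) -> datetime:
    return (ts + timedelta(hours=1)).replace(minute=0, second=0, microsecond=0)

# a measurement at 13:05 UTC lands in the hour labelled 14:00 UTC
assert time_ending_hour(
    datetime(2024, 1, 1, 13, 5, tzinfo=timezone.utc)
) == datetime(2024, 1, 1, 14, 0, tzinfo=timezone.utc)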
mark_success(key=key, reset=True) -logger.debug(settings) - if args.file is not None: # check if the files exists # is it a realtime file or a lcs file? @@ -175,6 +172,7 @@ def check_realtime_key(key: str, fix: bool = False): # get just the keys keys = [log[1] for log in logs] # loop through and check each + logger.info(f"Downloading {len(keys)} files") for idx, key in enumerate(keys): if args.download: # we may be using the new source pat diff --git a/ingest/handler.py b/ingest/handler.py index 463dec0..f827c3b 100644 --- a/ingest/handler.py +++ b/ingest/handler.py @@ -39,7 +39,6 @@ def handler(event, context): else: keys = getKeysFromS3Record(record) - logger.debug(keys) for obj in keys: bucket = obj['bucket'] key = obj['key'] diff --git a/ingest/lcsV2.py b/ingest/lcsV2.py index 5aebd59..f3fecae 100644 --- a/ingest/lcsV2.py +++ b/ingest/lcsV2.py @@ -400,8 +400,6 @@ def load_measurements(self, measurements): logger.debug(f'Loading {len(measurements)} measurements') for meas in measurements: self.add_measurement(meas) - logger.debug(f'Loaded measurements') - def add_sensor(self, j, system_id, fetchlogsId): @@ -443,7 +441,7 @@ def add_sensor(self, j, system_id, fetchlogsId): if not sensor.get('measurand'): # get it from the ingest id ingest_arr = sensor.get('ingest_id').split('-') - sensor['measurand'] = ingest_arr[-1] + sensor['measurand'] = ingest_arr[-1] # take the last one sensor["metadata"] = orjson.dumps(metadata).decode() self.sensors.append(sensor) self.sensor_ids.append(id) @@ -488,6 +486,7 @@ def add_systems(self, j, node_id, fetchlogsId): continue ingest_arr = id.split('-') + # this will not work with a uuid passed as a site id if len(ingest_arr) == 3: system["instrument_ingest_id"] = ingest_arr[-1]; @@ -541,7 +540,8 @@ def add_node(self, j): # support ingest id that is just the source id if node.get('source_id') is None: if len(ingest_arr)>1: - node['source_id'] = ingest_arr[1] + # updated to handle uuid + node['source_id'] = '-'.join(ingest_arr[1:len(ingest_arr)]) else: node['source_id'] = ingest_arr[0] @@ -610,8 +610,8 @@ def add_measurement(self, m): return source_name = ingest_arr[0] - source_id = ingest_arr[1] - measurand = ingest_arr[2] + source_id = '-'.join(ingest_arr[1:len(ingest_arr)-1]) + measurand = ingest_arr[-1] if not None in [ingest_id, datetime, source_name, source_id, measurand]: self.measurements.append([ingest_id, source_name, source_id, measurand, value, datetime, lon, lat, fetchlogs_id]) diff --git a/local.py b/local.py index ce645d5..3e770c1 100644 --- a/local.py +++ b/local.py @@ -41,19 +41,19 @@ client = IngestClient() ## load all the data into the client client.load_keys([ - [8, '~/Downloads/cac-pipeline/measures/cac/2024-10-18/test_data.json.gz', '2024-10-09'] + [4264878, '~/Downloads/openaq-fetches/lcs-etl-pipeline/measures/lovemyair/2024-11-12/1731445632-1snpf.json', '2024-10-23'] ]) ## dump just the locations client.dump() # rollups and cached tables -client.process_hourly_data() -client.process_daily_data() -client.process_annual_data() -client.refresh_cached_tables() +#client.process_hourly_data() +#client.process_daily_data() +#client.process_annual_data() +#client.refresh_cached_tables() -#client.dump_locations(load=False) +#client.dump_locations(False) #client.dump_measurements(load=False) ## dump just the measurements # client.dump_measurements From 62f017ca0e25eb61a417f6da17397966744d8253 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Wed, 13 Nov 2024 11:05:24 -0800 Subject: [PATCH 41/42] Changed fake fetchlogs id --- local.py | 2 +- 1 
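The UUID support in PATCH 40 comes down to how ingest ids are split: the first hyphen-separated token is still the source name and the last is the measurand, but everything in between is rejoined so a source id may itself contain hyphens, such as a UUID. Node ingest ids, which carry no trailing measurand, instead rejoin everything after the source name. A minimal sketch of the measurement-side split, using a made-up ingest id:

# hypothetical ingest id; only the split logic mirrors ingest/lcsV2.py
ingest_id = "lovemyair-3fa85f64-5717-4562-b3fc-2c963f66afa6-pm25"
parts = ingest_id.split("-")
source_name = parts[0]                         # "lovemyair"
source_id = "-".join(parts[1:len(parts) - 1])  # the full UUID, hyphens intact
measurand = parts[-1]                          # "pm25"
assert source_id == "3fa85f64-5717-4562-b3fc-2c963f66afa6"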
file changed, 1 insertion(+), 1 deletion(-) diff --git a/local.py b/local.py index 3e770c1..7a52adf 100644 --- a/local.py +++ b/local.py @@ -41,7 +41,7 @@ client = IngestClient() ## load all the data into the client client.load_keys([ - [4264878, '~/Downloads/openaq-fetches/lcs-etl-pipeline/measures/lovemyair/2024-11-12/1731445632-1snpf.json', '2024-10-23'] + [1, '~/Downloads/openaq-fetches/lcs-etl-pipeline/measures/lovemyair/2024-11-12/1731445632-1snpf.json', '2024-10-23'] ]) ## dump just the locations From d1ca447ac2f390858c8f9206a94aed3bf2596822 Mon Sep 17 00:00:00 2001 From: Christian Parker Date: Wed, 13 Nov 2024 12:18:31 -0800 Subject: [PATCH 42/42] Added backup sd value of zero --- ingest/etl_process_measurements.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/etl_process_measurements.sql b/ingest/etl_process_measurements.sql index 5d911f7..7c7a44a 100644 --- a/ingest/etl_process_measurements.sql +++ b/ingest/etl_process_measurements.sql @@ -328,7 +328,7 @@ SELECT l.sensors_id , l.value -- last value , l.value_count , l.value_avg -, l.value_sd +, COALESCE(l.value_sd, 0) , l.value -- min , l.value -- max , public.pt3857(lon, lat)
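The final change guards the sensors_rollup standard deviation: PostgreSQL's stddev() aggregate returns NULL when it only sees a single row, which happens, for instance, when a sensor's first ingest contains a single measurement, so the rollup now falls back to zero. A quick way to see that behaviour, reusing the project's own connection settings (a sketch, not part of the ingest path):

import psycopg2
from ingest.settings import settings

with psycopg2.connect(settings.DATABASE_WRITE_URL) as connection:
    with connection.cursor() as cursor:
        # stddev over one value is NULL; COALESCE turns it into 0
        cursor.execute(
            "SELECT stddev(v), COALESCE(stddev(v), 0) FROM (VALUES (1.0)) AS t(v)"
        )
        print(cursor.fetchone())  # (None, Decimal('0'))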