Skip to content

Commit

Permalink
Merge pull request #16 from EJOOSTEROP/feature_mapping
Browse files Browse the repository at this point in the history
Add Feature mapping, additional test and documentation.
  • Loading branch information
EJOOSTEROP authored Apr 18, 2023
2 parents 27d53cb + fa9433c commit 5f35ccc
Show file tree
Hide file tree
Showing 17 changed files with 211 additions and 663 deletions.
File renamed without changes.
3 changes: 2 additions & 1 deletion dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ RUN mkdir -p /${MELTANO_PROJ_ROOT}/data/dev/ \
&& mkdir -p /${MELTANO_PROJ_ROOT}/data/prod/ \
&& /${MELTANO_PROJ_ROOT}/${DUCKDB_CLI_FOLDER}/duckdb /${MELTANO_PROJ_ROOT}/data/dev/data.duckdb "select * from pg_tables;" \
&& /${MELTANO_PROJ_ROOT}/${DUCKDB_CLI_FOLDER}/duckdb /${MELTANO_PROJ_ROOT}/data/test/data.duckdb "select * from pg_tables;" \
&& /${MELTANO_PROJ_ROOT}/${DUCKDB_CLI_FOLDER}/duckdb /${MELTANO_PROJ_ROOT}/data/prod/data.duckdb "select * from pg_tables;"
&& /${MELTANO_PROJ_ROOT}/${DUCKDB_CLI_FOLDER}/duckdb /${MELTANO_PROJ_ROOT}/data/prod/data.duckdb "select * from pg_tables;" \
&& meltano invoke dbt-duckdb:deps


###RUN chmod -R u+x /project/data/
Expand Down
27 changes: 12 additions & 15 deletions meltano.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,18 @@ plugins:
params:
date: $ENV_DATE_GIE
#date: '2023-02-15'
stream_maps:
stg_gie_storage:
key_hash: md5(config['hash_seed'] + (gasDayStart + code))
#__alias__: stg_gie_storage_vX
stg_gie_company:
key_hash: md5(config['hash_seed'] + (gasDayStart + code))
stg_gie_country:
key_hash: md5(config['hash_seed'] + (gasDayStart + code))
stg_gie_region:
key_hash: md5(config['hash_seed'] + (gasDayStart + code))
stream_map_config:
hash_seed: 01AWZh7A6DzGm6iJZZ2T
streams:
- name: stg_gie_storage
path: /api
Expand Down Expand Up @@ -132,15 +144,13 @@ plugins:
pip_url: target-duckdb~=0.4
config:
add_metadata_columns: true
#default_target_schema: gie_stage
default_target_schema: main
filepath: $DB_LOCATION
data_flattening_max_level: 10
- name: target-duckdb-usgs
inherit_from: target-duckdb
config:
add_metadata_columns: true
#default_target_schema: usgs_stage
default_target_schema: main
filepath: $DB_LOCATION
data_flattening_max_level: 10
Expand All @@ -149,7 +159,6 @@ plugins:
inherit_from: target-duckdb
config:
add_metadata_columns: true
#default_target_schema: gie
default_target_schema: main
filepath: $DB_LOCATION
data_flattening_max_level: 10
Expand All @@ -158,24 +167,18 @@ plugins:
- name: airflow
variant: apache
pip_url: apache-airflow==2.1.2 --constraint https://raw.githubusercontent.com/apache/airflow/constraints-2.1.2/constraints-${MELTANO__PYTHON_VERSION}.txt
# transformers:
# - name: dbt-duckdb
# variant: jwills
# pip_url: dbt-core~=1.2.0 dbt-duckdb~=1.2.0
files:
- name: files-airflow
variant: meltano
pip_url: git+https://github.com/meltano/files-airflow.git --constraint https://raw.githubusercontent.com/apache/airflow/constraints-2.1.2/constraints-${MELTANO__PYTHON_VERSION}.txt
utilities:
- name: superset
variant: apache
#pip_url: apache-superset==1.5.0 markupsafe==2.0.1 duckdb-engine==0.6.4
pip_url: apache-superset==2.0.0 flask==2.0.3 werkzeug==2.0.3 jinja2==3.0.1 wtforms==2.3.3
git+https://github.com/meltano/superset-ext.git@main cryptography==3.4.7 markupsafe==2.0.1
duckdb-engine==0.7.0
- name: dbt-duckdb
variant: jwills
#pip_url: dbt-core~=1.3.0 dbt-duckdb~=1.3.0 git+https://github.com/meltano/dbt-ext.git@main
pip_url: dbt-core~=1.4.0 dbt-duckdb~=1.4.0 git+https://github.com/meltano/dbt-ext.git@main
commands:
usgs:
Expand All @@ -188,12 +191,6 @@ jobs:
tasks:
- stg_usgs target-duckdb-usgs dbt-duckdb:usgs
schedules:
#- name: USGS-Earthquake
# interval: 35 */1 * * *
# extractor: stg_usgs
# loader: target-duckdb-usgs
# transform: skip
# start_date: 2023-01-01 15:40:21.295936
- name: USGS-Earthquake
interval: 35 */1 * * *
job: usgs-to-duckdb-rpt
Expand Down
2 changes: 2 additions & 0 deletions meltano_transform/models/gie_rpt/rpt_gie_storage.sql
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ SSO AS
gasdaystart::DATE gasdaystart,
split_part(url, '/', 2) as country,
split_part(url, '/', 3) as company_eic,
key_hash,
code as sso_eic,
name as sso_name,
status,
Expand Down Expand Up @@ -36,6 +37,7 @@ FROM
select
_sdc_batched_at,
_sdc_extracted_at,
key_hash,
sso.gasdaystart,
country,
SSO.company_eic,
Expand Down
12 changes: 12 additions & 0 deletions meltano_transform/models/gie_rpt/rpt_gie_storage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,25 @@ models:
+tags:
- gie

tests:
- dbt_utils.unique_combination_of_columns:
combination_of_columns:
- sso_eic
- gasdaystart

columns:
- name: _sdc_batched_at
description: Timestamp when the data was captured in the database.

- name: _sdc_extracted_at
description: Timestamp when the data was retrieved from the REST API.

- name: key_hash
description: Hash of sso_eic and gasdaystart.
tests:
- unique
- not_null

- name: gasdaystart
description: Date of the observation. For example, the injection field refers to the injection on this date, while gasinstorage is reported as of the end of gasdaystart.
tests:
Expand Down
4 changes: 4 additions & 0 deletions meltano_transform/models/overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

![mimodast Logo](https://github.com/EJOOSTEROP/mimodast/blob/master/assets/hatchful/logo_transparent.png)

<a href="https://github.com/EJOOSTEROP/mimodast">
<img src="assets/hatchful/logo_transparent.png" alt="Logo" width="180" height="180">
</a>

# Mimodast dbt Project
Mimodast is a minimal modern data stack with working data pipelines in a single Docker container.

Expand Down
3 changes: 3 additions & 0 deletions meltano_transform/packages.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# dbt package dependencies: declares dbt-labs/dbt_utils, pinned to version 1.0.0.
packages:
- package: dbt-labs/dbt_utils
version: 1.0.0
3 changes: 3 additions & 0 deletions meltano_transform/tests/richter_max.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- Selects every row of rpt_usgs_events whose magnitude is greater than 10.
-- NOTE(review): presumably a dbt singular test (the file sits under tests/), so any
-- returned row would count as a failure — confirm.
SELECT *
FROM {{ ref('rpt_usgs_events')}}
WHERE magnitude > 10
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SELECT *
FROM {{ ref('rpt_gie_storage')}}
WHERE _sdc_batched_at < _sdc_extracted_at
WHERE gasinstorage - workinggasvolume > 1
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ version: 2

# NOTE: THIS DOES NOT WORK. UNCLEAR IF YOU CAN DOCUMENT A TEST AT THIS STAGE.
tests:
- name: stage_timing
- name: storage_max
description: Gas in storage must not exceed the working gas volume (a difference of at most 1 is tolerated).
Loading

0 comments on commit 5f35ccc

Please sign in to comment.