[Feature] Add Spark support (#279)
* Add Spark CI support

* Update macros to support Spark

* Update tests

* Update CircleCI dbt profiles

* Update test configs to warn for Spark only

* Fix CI typo
clausherther authored Sep 20, 2023
1 parent f22c7e3 commit af7ed5c
Showing 15 changed files with 233 additions and 27 deletions.
91 changes: 71 additions & 20 deletions .circleci/config.yml
@@ -2,7 +2,8 @@ version: 2.1

jobs:

- integration-tests:
+ integration-tests-core:

docker:
- image: cimg/python:3.9.9
- image: cimg/postgres:14.0
@@ -13,38 +14,32 @@ jobs:
DBT_PROFILES_DIR: ./integration_tests/ci
DBT_PROJECT_DIR: ./integration_tests
BIGQUERY_SERVICE_KEY_PATH: "/home/circleci/bigquery-service-key.json"
- DBT_VERSION: 1.6.0
+ DBT_VERSION: 1.6.*

steps:
- checkout
- - run:
-     name: Install Python packages
+ - run: &pip-install-core
+     name: Install core Python packages & dbt-core
command: |
python3 -m venv venv
. venv/bin/activate
pip install -U pip setuptools wheel
- pip install dbt-core==$DBT_VERSION dbt-postgres==$DBT_VERSION dbt-bigquery==$DBT_VERSION dbt-snowflake==$DBT_VERSION dbt-duckdb==$DBT_VERSION
+ pip install "dbt-core==$DBT_VERSION"
- run:
name: Install dbt adapter packages
command: |
python3 -m venv venv
. venv/bin/activate
pip install "dbt-postgres==$DBT_VERSION" "dbt-bigquery==$DBT_VERSION" "dbt-snowflake==$DBT_VERSION"
pip install "dbt-duckdb==$DBT_VERSION"
- run: &dbt-deps
name: Install dbt dependencies
command: |
. venv/bin/activate
dbt deps --project-dir $DBT_PROJECT_DIR
# - run:
# name: "Run SQLFluff"
# environment:
# POSTGRES_HOST: localhost
# POSTGRES_TEST_USER: postgres
# POSTGRES_TEST_PASSWORD: ''
# POSTGRES_TEST_PORT: 5432
# POSTGRES_TEST_DATABASE: circle_test
# POSTGRES_TEST_SCHEMA: dbt_expectations_integration_tests
# command: |
# . venv/bin/activate
# cd $DBT_PROJECT_DIR
# sqlfluff lint models

- run:
name: "Run Tests - Postgres"
environment:
@@ -63,6 +58,9 @@
command: |
echo "Writing to $BIGQUERY_SERVICE_KEY_PATH"
echo $BIGQUERY_SERVICE_KEY > $BIGQUERY_SERVICE_KEY_PATH
FILESIZE=$(stat -c%s "$BIGQUERY_SERVICE_KEY_PATH")
echo "Size of $BIGQUERY_SERVICE_KEY_PATH = $FILESIZE bytes."
echo "BIGQUERY_TEST_DATABASE = $BIGQUERY_TEST_DATABASE"
- run:
name: "Run Tests - BigQuery"
@@ -85,12 +83,65 @@
- store_artifacts:
path: ./logs

integration-tests-spark-thrift:

docker:
- image: cimg/python:3.9.9
- image: godatadriven/spark:3.1.1
environment:
WAIT_FOR: localhost:5432
command: >
--class org.apache.spark.sql.hive.thriftserver.HiveThriftServer2
--name Thrift JDBC/ODBC Server
- image: postgres:9.6.17-alpine
environment:
POSTGRES_USER: dbt
POSTGRES_PASSWORD: dbt
POSTGRES_DB: metastore

resource_class: small

environment:
DBT_PROFILES_DIR: ./integration_tests/ci
DBT_PROJECT_DIR: ./integration_tests
DBT_VERSION: 1.6.*

steps:
- checkout
- run:
name: Install Ubuntu packages
command: |
sudo apt-get update
sudo apt-get install libsasl2-dev libsasl2-2
- run: *pip-install-core
- run:
name: Install dbt adapter packages
command: |
python3 -m venv venv
. venv/bin/activate
pip install dbt-spark "dbt-spark[PyHive]"
- run: *dbt-deps
- run:
name: Wait for Spark-Thrift
command: dockerize -wait tcp://localhost:10000 -timeout 15m -wait-retry-interval 5s
- run:
name: "Run Tests - Spark"
command: |
. venv/bin/activate
dbt build -t spark --project-dir $DBT_PROJECT_DIR
- store_artifacts:
path: ./logs

workflows:
version: 2
test-all:
jobs:
- hold:
type: approval
- - integration-tests:
+ - integration-tests-core:
requires:
- hold
- integration-tests-spark-thrift:
requires:
- hold
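The duplication between the two jobs is kept down with YAML anchors: &pip-install-core and &dbt-deps name step definitions in integration-tests-core, and the Spark job replays them via *pip-install-core and *dbt-deps. A minimal sketch of the pattern, with illustrative job and step names rather than the ones above:

jobs:
  job-a:
    steps:
      - run: &shared-step   # anchor: define the step once
          name: Shared setup
          command: echo "shared setup"
  job-b:
    steps:
      - run: *shared-step   # alias: replay the exact same step definition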
2 changes: 2 additions & 0 deletions .gitignore
@@ -3,3 +3,5 @@ target/
dbt_packages/
logs/
.python-version
integration_tests/.spark-warehouse
integration_tests/.hive-metastore
13 changes: 13 additions & 0 deletions integration_tests/ci/profiles.yml
@@ -37,4 +37,17 @@ integration_tests:
type: duckdb
path: ":memory:"

spark:
type: spark
method: thrift
host: 127.0.0.1
port: 10000
user: dbt
schema: analytics
connect_retries: 5
connect_timeout: 60
retry_all: true
server_side_parameters:
"spark.sql.parser.escapedStringLiterals": true

target: postgres
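The server_side_parameters entry mirrors the commented-out line at the bottom of spark-defaults.conf: setting spark.sql.parser.escapedStringLiterals to true restores pre-Spark-2.0 string-literal parsing, so the single-backslash regex patterns used by the is_raw tests in schema.yml reach Spark's regex engine intact. An illustrative Spark SQL comparison, not part of the commit:

-- with spark.sql.parser.escapedStringLiterals=true, the literal is kept as typed:
select 'zip 12345' rlike '\d{5}';   -- pattern \d{5}, matches
-- with the default (false), the backslash must be doubled to survive parsing:
select 'zip 12345' rlike '\\d{5}';  -- same pattern after unescaping, matches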
28 changes: 28 additions & 0 deletions integration_tests/docker-compose.yml
@@ -0,0 +1,28 @@
version: "3.7"
services:

dbt-spark3-thrift:
image: godatadriven/spark:3.1.3
ports:
- "10000:10000"
- "4040:4040"
depends_on:
- dbt-hive-metastore
command: >
--class org.apache.spark.sql.hive.thriftserver.HiveThriftServer2
--name Thrift JDBC/ODBC Server
volumes:
- ./.spark-warehouse/:/spark-warehouse/
- ./docker/hive-site.xml:/usr/spark/conf/hive-site.xml
- ./docker/spark-defaults.conf:/usr/spark/conf/spark-defaults.conf
environment:
- WAIT_FOR=dbt-hive-metastore:5432

dbt-hive-metastore:
image: postgres:9.6.17-alpine
volumes:
- ./.hive-metastore/:/var/lib/postgresql/data
environment:
- POSTGRES_USER=dbt
- POSTGRES_PASSWORD=dbt
- POSTGRES_DB=metastore
1 change: 1 addition & 0 deletions integration_tests/docker-start.sh
@@ -0,0 +1 @@
docker-compose up -d
1 change: 1 addition & 0 deletions integration_tests/docker-stop.sh
@@ -0,0 +1 @@
docker-compose down && rm -rf ./.hive-metastore/ && rm -rf ./.spark-warehouse/
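Together the two scripts bracket a local run against the same Spark target CI uses; a plausible loop (the dbt invocation mirrors the CI step above, and is not itself a script in this commit):

sh docker-start.sh
dbt build -t spark --project-dir integration_tests
sh docker-stop.sh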
46 changes: 46 additions & 0 deletions integration_tests/docker/hive-site.xml
@@ -0,0 +1,46 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<configuration>

<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:postgresql://dbt-hive-metastore/metastore</value>
</property>

<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>org.postgresql.Driver</value>
</property>

<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>dbt</value>
</property>

<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>dbt</value>
</property>

<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
</configuration>
10 changes: 10 additions & 0 deletions integration_tests/docker/spark-defaults.conf
@@ -0,0 +1,10 @@
spark.driver.memory 2g
spark.executor.memory 2g
spark.hadoop.datanucleus.autoCreateTables true
spark.hadoop.datanucleus.schema.autoCreateTables true
spark.hadoop.datanucleus.fixedDatastore false
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.jars.packages org.apache.hudi:hudi-spark3-bundle_2.12:0.10.0
spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
spark.driver.userClassPathFirst true
# spark.sql.parser.escapedStringLiterals true
27 changes: 25 additions & 2 deletions integration_tests/models/schema_tests/schema.yml
@@ -17,6 +17,8 @@ models:
- dbt_expectations.expect_column_values_to_match_regex:
regex: "[A-Z]"
flags: i
config:
enabled: "{{ target.type not in ['bigquery', 'spark' ] }}"
# match all uppercase with inline case-insensitive flag and case-insensitive flag parameter (where implemented)
# check that adapters handling flags by inlining them don't break because of the flag duplication
- dbt_expectations.expect_column_values_to_match_regex:
@@ -29,7 +31,7 @@
regex: "[A-Z]"
flags: c
config:
- enabled: "{{ target.type in ['postgres', 'snowflake', 'redshift' ] }}"
+ enabled: "{{ target.type not in ['bigquery', 'spark' ] }}"
error_if: "=0"
warn_if: "<4"
# do not match other non-email string, should pass
@@ -63,12 +65,14 @@
- dbt_expectations.expect_column_values_to_match_regex_list:
regex_list: ["[A-G]", "[H-Z]"]
flags: i
config:
enabled: "{{ target.type not in ['bigquery', 'spark' ] }}"
# match all uppercase, but match case-sensitive (where implemented), should fail
- dbt_expectations.expect_column_values_to_match_regex_list:
regex_list: ["[A-G]", "[H-Z]"]
flags: c
config:
- enabled: "{{ target.type in ['postgres', 'snowflake', 'redshift' ] }}"
+ enabled: "{{ target.type not in ['bigquery', 'spark' ] }}"
error_if: "=0"
warn_if: "<4"
# match email address or other string
@@ -98,12 +102,15 @@
- dbt_expectations.expect_column_values_to_not_match_regex_list:
regex_list: ["[A-Z]", "[0-9]"]
flags: i
config:
enabled: "{{ target.type not in ['bigquery', 'spark' ] }}"
# do not match all uppercase and numbers, case-insensitive
- dbt_expectations.expect_column_values_to_not_match_regex_list:
regex_list: ["[A-Z]", "[0-9]"]
flags: i
match_on: all
config:
enabled: "{{ target.type not in ['bigquery', 'spark' ] }}"
error_if: "=0"
warn_if: "<4"
# match '@' anywhere in string
@@ -144,15 +151,19 @@
config:
error_if: "=0"
warn_if: "<4"

- name: postal_code_5
tests:
- dbt_expectations.expect_column_values_to_match_regex:
regex: "^\\d{5}"
is_raw: True
config:
severity: "{{ 'warn' if target.type in ['spark' ] else 'error' }}"
- dbt_expectations.expect_column_values_to_match_regex:
regex: "^\\d{55}"
is_raw: True
config:
severity: "{{ 'warn' if target.type in ['spark' ] else 'error' }}"
error_if: "=0"
warn_if: "<4"
- dbt_expectations.expect_column_values_to_not_match_regex:
@@ -162,19 +173,25 @@
regex: "^\\d{5}"
is_raw: True
config:
severity: "{{ 'warn' if target.type in ['spark' ] else 'error' }}"
error_if: "=0"
warn_if: "<4"
- dbt_expectations.expect_column_values_to_match_regex_list:
regex_list: ["^\\d{5}"]
is_raw: True
config:
severity: "{{ 'warn' if target.type in ['spark' ] else 'error' }}"
- dbt_expectations.expect_column_values_to_match_regex_list:
regex_list: ["^\\d{5}", "@[^.]*"]
is_raw: True
config:
severity: "{{ 'warn' if target.type in ['spark' ] else 'error' }}"
- dbt_expectations.expect_column_values_to_match_regex_list:
regex_list: ["^\\d{5}", "@[^.]*"]
is_raw: True
match_on: all
config:
severity: "{{ 'warn' if target.type in ['spark' ] else 'error' }}"
error_if: "=0"
warn_if: "<4"
- dbt_expectations.expect_column_values_to_not_match_regex_list:
@@ -183,22 +200,28 @@
- dbt_expectations.expect_column_values_to_not_match_regex_list:
regex_list: ["^\\d{5}", "@[^.]*"]
is_raw: True
config:
severity: "{{ 'warn' if target.type in ['spark' ] else 'error' }}"
- dbt_expectations.expect_column_values_to_not_match_regex_list:
regex_list: ["^\\d{5}", "@[^.]*"]
is_raw: True
match_on: all
config:
severity: "{{ 'warn' if target.type in ['spark' ] else 'error' }}"
error_if: "=0"
warn_if: "<4"
- name: postal_code_5_3
tests:
- dbt_expectations.expect_column_values_to_match_regex:
regex: "^\\d{5}-\\d{3}"
is_raw: True
config:
severity: "{{ 'warn' if target.type in ['spark' ] else 'error' }}"
- dbt_expectations.expect_column_values_to_match_regex:
regex: "^\\d{5}-\\d{9}"
is_raw: True
config:
severity: "{{ 'warn' if target.type in ['spark' ] else 'error' }}"
error_if: "=0"
warn_if: "<4"

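Two conditional-config idioms carry the Spark-specific work in this file: enabled skips a test outright on adapters that cannot support it, while severity keeps a test running but downgrades failures to warnings on Spark (the "warn for Spark only" commit note). A condensed illustration of both knobs rather than a literal excerpt:

tests:
  - dbt_expectations.expect_column_values_to_match_regex:
      regex: "[A-Z]"
      flags: i
      config:
        # skip entirely on adapters that can't run this variant
        enabled: "{{ target.type not in ['bigquery', 'spark'] }}"
        # alternatively: run everywhere, but only warn on Spark
        severity: "{{ 'warn' if target.type in ['spark'] else 'error' }}"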
6 changes: 6 additions & 0 deletions macros/math/log_natural.sql
@@ -19,3 +19,9 @@
ln({{ x }})

{%- endmacro -%}

{% macro spark__log_natural(x) -%}

ln({{ x }})

{%- endmacro -%}
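The spark__ prefix is dbt's adapter-dispatch convention: on a Spark target, the dispatching log_natural macro resolves to spark__log_natural, which here simply mirrors the ln() of the default implementation and makes the Spark resolution explicit. A sketch of calling the dispatched macro from a model, with hypothetical column and model names:

select
    {{ dbt_expectations.log_natural('amount') }} as ln_amount  -- renders as ln(amount)
from {{ ref('orders') }}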
5 changes: 5 additions & 0 deletions macros/math/percentile_cont.sql
@@ -11,3 +11,8 @@
percentile_cont({{ field }}, {{ quantile }})
over({%- if partition %}partition by {{ partition }}{% endif -%})
{% endmacro %}

{% macro spark__quantile(field, quantile, partition) -%}
percentile({{ field }}, {{ quantile }})
over({%- if partition %}partition by {{ partition }}{% endif -%})
{% endmacro %}
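Spark 3.1 does not ship a percentile_cont function (it arrived in Spark 3.3), so the spark__quantile override substitutes Spark's exact percentile aggregate over the same window. For a hypothetical column amount at quantile 0.5, the two branches render roughly as:

-- rendering from the implementation above, on engines with percentile_cont:
percentile_cont(amount, 0.5) over()
-- rendering from the spark__quantile override:
percentile(amount, 0.5) over()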