Merge pull request #14 from sfc-gh-vraudszus/main

Cover new CoAs, PythonAPI and CLI features

sfc-gh-vraudszus authored Nov 5, 2024
2 parents c1cb092 + 901c32d commit 945c056
Showing 15 changed files with 337 additions and 172 deletions.
30 changes: 21 additions & 9 deletions .devcontainer/devcontainer.json
@@ -1,12 +1,24 @@
 // For format details, see https://aka.ms/devcontainer.json.
 {
-    "name": "Snowflake Demo Codespace",
-    // Configure tool-specific properties.
-    "customizations": {
-        "vscode": {
-            "extensions": [
-                "snowflake.snowflake-vsc"
-            ]
-        }
-    }
+    "name": "Snowflake Demo Codespace",
+    "features": {
+        "ghcr.io/devcontainers/features/python:1": {
+            "version": "3.11"
+        }
+    },
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "ms-python.python",
+                "snowflake.snowflake-vsc"
+            ],
+            "settings": {
+                "snowflake.snowsqlConfigPath": "${containerWorkspaceFolder}/.snowflake/config.toml"
+            }
+        }
+    },
+    "updateContentCommand": "bash .devcontainer/install-dependencies.sh",
+    "postCreateCommand": "chmod 0600 \"${containerWorkspaceFolder}/.snowflake/config.toml\"",
+    "containerEnv": {
+        "SNOWFLAKE_HOME": "${containerWorkspaceFolder}/.snowflake"
+    }
 }
4 changes: 4 additions & 0 deletions .devcontainer/install-dependencies.sh
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
pip install snowflake-snowpark-python
pip install snowflake.core
pipx install snowflake-cli --python 3.11
9 changes: 5 additions & 4 deletions .github/workflows/deploy_pipeline.yml
@@ -27,10 +27,11 @@ jobs:
         uses: actions/checkout@v4
 
       # Install Snowflake CLI GitHub Action and point to config file
-      - uses: Snowflake-Labs/snowflake-cli-action@v1
+      - name: Install snowflake-cli
+        uses: Snowflake-Labs/[email protected]
         with:
           cli-version: "latest"
-          default-config-file-path: "config.toml"
+          default-config-file-path: ".snowflake/config.toml"

# Update Snowflake's copy of the repository
- name: Fetch repository changes
@@ -42,10 +43,10 @@ jobs:
           BRANCH_NAME=${{ github.ref_name }}
           if [ "${BRANCH_NAME}" == "main" ]; then
             RETENTION_TIME=1
-          elif [ "${BRANCH_NAME}" == "dev" ]; then
+          else
             RETENTION_TIME=0
           fi
           snow git execute \
-            "@${REPO_NAME}/branches/${BRANCH_NAME}/deploy_parametrized_pipeline.sql" \
+            "@${REPO_NAME}/branches/${BRANCH_NAME}/steps/0[134]_*" \
             -D "environment='${BRANCH_NAME}'" \
             -D "retention_time=${RETENTION_TIME}"
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
.snowflake/logs
8 changes: 8 additions & 0 deletions .snowflake/config.toml
@@ -0,0 +1,8 @@
[connections.default]
warehouse = "QUICKSTART_WH"
role = "ACCOUNTADMIN"
database = "QUICKSTART_COMMON"
schema = "PUBLIC"
account = "<INSERT YOUR ACCOUNT IDENTIFIER>"
user = "<INSERT YOUR USERNAME>"
password = "<INSERT YOUR PASSWORD>"
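
As a quick way to exercise the new connection file (a sketch, not part of the quickstart itself: it assumes SNOWFLAKE_HOME points at the .snowflake directory, as devcontainer.json sets it, and that the placeholders above have been filled in):

    # check that [connections.default] resolves and the session comes up
    from snowflake.snowpark import Session

    session = Session.builder.getOrCreate()
    print(session.sql("select current_user(), current_role()").collect())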
3 changes: 0 additions & 3 deletions config.toml

This file was deleted.

3 changes: 0 additions & 3 deletions deploy_parametrized_pipeline.sql

This file was deleted.

3 changes: 0 additions & 3 deletions deploy_pipeline.sql

This file was deleted.

17 changes: 10 additions & 7 deletions steps/01_setup_snowflake.sql
@@ -1,10 +1,13 @@
 USE ROLE ACCOUNTADMIN;
 
-CREATE WAREHOUSE IF NOT EXISTS QUICKSTART_WH WAREHOUSE_SIZE = XSMALL, AUTO_SUSPEND = 300, AUTO_RESUME= TRUE;
+CREATE OR ALTER WAREHOUSE QUICKSTART_WH
+    WAREHOUSE_SIZE = XSMALL
+    AUTO_SUSPEND = 300
+    AUTO_RESUME = TRUE;
 
 
 -- Separate database for git repository
-CREATE DATABASE IF NOT EXISTS QUICKSTART_COMMON;
+CREATE OR ALTER DATABASE QUICKSTART_COMMON;


-- API integration is needed for GitHub integration
@@ -20,7 +23,7 @@ CREATE OR REPLACE GIT REPOSITORY quickstart_common.public.quickstart_repo
     ORIGIN = '<insert URL of forked GitHub repo>'; -- INSERT URL OF FORKED REPO HERE
 
 
-CREATE OR REPLACE DATABASE QUICKSTART_PROD;
+CREATE OR ALTER DATABASE QUICKSTART_PROD;


-- To monitor data pipeline's completion
@@ -30,14 +33,14 @@ CREATE OR REPLACE NOTIFICATION INTEGRATION email_integration
 
 
 -- Database level objects
-CREATE SCHEMA IF NOT EXISTS bronze;
-CREATE SCHEMA IF NOT EXISTS silver;
-CREATE SCHEMA IF NOT EXISTS gold;
+CREATE OR ALTER SCHEMA bronze;
+CREATE OR ALTER SCHEMA silver;
+CREATE OR ALTER SCHEMA gold;
 
 
 -- Schema level objects
 CREATE OR REPLACE FILE FORMAT bronze.json_format TYPE = 'json';
-CREATE OR REPLACE STAGE bronze.raw;
+CREATE OR ALTER STAGE bronze.raw;


-- Copy file from GitHub to internal stage
4 changes: 2 additions & 2 deletions steps/02_access_marketplace_data.sql
@@ -4,5 +4,5 @@ USE ROLE ACCOUNTADMIN;
 SELECT * FROM oag_flight_emissions_data_sample.public.estimated_emissions_schedules_sample LIMIT 100;
 SELECT * FROM oag_flight_status_data_sample.public.flight_status_latest_sample LIMIT 100;
 SELECT * FROM global_weather__climate_data_for_bi.standard_tile.forecast_day LIMIT 100;
-SELECT * FROM government_essentials.cybersyn.datacommons_timeseries LIMIT 100;
-SELECT * FROM us_points_of_interest__addresses.cybersyn.point_of_interest_index LIMIT 100;
+SELECT * FROM global_government.cybersyn.datacommons_timeseries LIMIT 100;
+SELECT * FROM us_addresses__poi.cybersyn.point_of_interest_index LIMIT 100;
247 changes: 247 additions & 0 deletions steps/03_harmonize_data.py
@@ -0,0 +1,247 @@
# Views to transform marketplace data in pipeline

import os

from snowflake.core import Root, CreateMode
from snowflake.snowpark import Session
from snowflake.core.user_defined_function import (
    Argument,
    ReturnDataType,
    PythonFunction,
    UserDefinedFunction,
)
from snowflake.core.view import View, ViewColumn


"""
To join the flight and location focused tables
we need to cross the gap between the airport and cities domains.
For this we make use of a Snowpark Python UDF.
What's really cool is that Snowpark allows us to define a vectorized UDF
making the processing super efficient as we don’t have to invoke the
function on each row individually!
To compute the mapping between airports and cities,
we use SnowflakeFile to read a JSON list from the pyairports package.
The SnowflakeFile class provides dynamic file access, to stream files of any size.
"""
map_city_to_airport = UserDefinedFunction(
    name="get_city_for_airport",
    arguments=[Argument(name="iata", datatype="VARCHAR")],
    return_type=ReturnDataType(datatype="VARCHAR"),
    language_config=PythonFunction(
        runtime_version="3.11", packages=["snowflake-snowpark-python"], handler="main"
    ),
    body="""
from snowflake.snowpark.files import SnowflakeFile
from _snowflake import vectorized
import pandas
import json


@vectorized(input=pandas.DataFrame)
def main(df):
    airport_list = json.loads(
        SnowflakeFile.open("@bronze.raw/airport_list.json", 'r', require_scoped_url=False).read()
    )
    airports = {airport[3]: airport[1] for airport in airport_list}
    return df[0].apply(lambda iata: airports.get(iata.upper()))
""",
)
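
# A quick smoke test for the UDF, as a sketch (assumption: run ad hoc after the
# deployment at the bottom of this file has created the function in the silver
# schema of quickstart_prod; it is not part of the pipeline itself):
#
#   session = Session.builder.getOrCreate()
#   session.sql("select quickstart_prod.silver.get_city_for_airport('JFK')").show()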


"""
To mangle the data into a more usable form,
we make use of views to not materialize the marketplace data
and avoid the corresponding storage costs.
"""

pipeline = [
    # We are interested in the per seat carbon emissions.
    # To obtain these, we need to divide the emission data by the number of seats in the airplane.
    View(
        name="flight_emissions",
        columns=[
            ViewColumn(name="departure_airport"),
            ViewColumn(name="arrival_airport"),
            ViewColumn(name="co2_emissions_kg_per_person"),
        ],
        query="""
            select
                departure_airport,
                arrival_airport,
                avg(estimated_co2_total_tonnes / seats) * 1000 as co2_emissions_kg_per_person
            from oag_flight_emissions_data_sample.public.estimated_emissions_schedules_sample
            where seats != 0 and estimated_co2_total_tonnes is not null
            group by departure_airport, arrival_airport
        """,
    ),
    # To avoid unreliable flight connections, we compute the fraction of flights that arrive
    # early or on time from the flight status data provided by OAG.
    View(
        name="flight_punctuality",
        columns=[
            ViewColumn(name="departure_iata_airport_code"),
            ViewColumn(name="arrival_iata_airport_code"),
            ViewColumn(name="punctual_pct"),
        ],
        query="""
            select
                departure_iata_airport_code,
                arrival_iata_airport_code,
                count(
                    case when arrival_actual_ingate_timeliness IN ('OnTime', 'Early') THEN 1 END
                ) / COUNT(*) * 100 as punctual_pct
            from oag_flight_status_data_sample.public.flight_status_latest_sample
            where arrival_actual_ingate_timeliness is not null
            group by departure_iata_airport_code, arrival_iata_airport_code
        """,
    ),
    # When joining the flight emissions with the punctuality view,
    # we filter for flights starting from the airport closest to where we live.
    # This information is provided in the tiny JSON file data/home.json which we query directly in the view.
    View(
        name="flights_from_home",
        columns=[
            ViewColumn(name="departure_airport"),
            ViewColumn(name="arrival_airport"),
            ViewColumn(name="arrival_city"),
            ViewColumn(name="co2_emissions_kg_per_person"),
            ViewColumn(name="punctual_pct"),
        ],
        query="""
            select
                departure_airport,
                arrival_airport,
                get_city_for_airport(arrival_airport) arrival_city,
                co2_emissions_kg_per_person,
                punctual_pct
            from flight_emissions
            join flight_punctuality
                on departure_airport = departure_iata_airport_code
                and arrival_airport = arrival_iata_airport_code
            where departure_airport = (
                select $1:airport
                from @quickstart_common.public.quickstart_repo/branches/main/data/home.json
                (FILE_FORMAT => bronze.json_format))
        """,
    ),
    # Weather Source provides a weather forecast for the upcoming two weeks.
    # As the free versions of the data sets we use do not cover the entire globe,
    # we limit our pipeline to zip codes inside the US and compute the average
    # temperature, humidity, precipitation probability and cloud coverage.
    View(
        name="weather_forecast",
        columns=[
            ViewColumn(name="postal_code"),
            ViewColumn(name="avg_temperature_air_f"),
            ViewColumn(name="avg_relative_humidity_pct"),
            ViewColumn(name="avg_cloud_cover_pct"),
            ViewColumn(name="precipitation_probability_pct"),
        ],
        query="""
            select
                postal_code,
                avg(avg_temperature_air_2m_f) avg_temperature_air_f,
                avg(avg_humidity_relative_2m_pct) avg_relative_humidity_pct,
                avg(avg_cloud_cover_tot_pct) avg_cloud_cover_pct,
                avg(probability_of_precipitation_pct) precipitation_probability_pct
            from global_weather__climate_data_for_bi.standard_tile.forecast_day
            where country = 'US'
            group by postal_code
        """,
    ),
    # We use the data provided by Cybersyn to limit our pipeline to US cities with at least
    # 100k residents to enjoy all the benefits a big city provides during our vacation.
    View(
        name="major_us_cities",
        columns=[
            ViewColumn(name="geo_id"),
            ViewColumn(name="geo_name"),
            ViewColumn(name="total_population"),
        ],
        query="""
            select
                geo.geo_id,
                geo.geo_name,
                max(ts.value) total_population
            from global_government.cybersyn.datacommons_timeseries ts
            join global_government.cybersyn.geography_index geo
                on ts.geo_id = geo.geo_id
            join global_government.cybersyn.geography_relationships geo_rel
                on geo_rel.related_geo_id = geo.geo_id
            where true
                and ts.variable_name = 'Total Population, census.gov'
                and date >= '2020-01-01'
                and geo.level = 'City'
                and geo_rel.geo_id = 'country/USA'
                and value > 100000
            group by geo.geo_id, geo.geo_name
            order by total_population desc
        """,
    ),
    # Using the geography relationships provided by Cybersyn we collect all the
    # zip codes belonging to a city.
    View(
        name="zip_codes_in_city",
        columns=[
            ViewColumn(name="city_geo_id"),
            ViewColumn(name="city_geo_name"),
            ViewColumn(name="zip_geo_id"),
            ViewColumn(name="zip_geo_name"),
        ],
        query="""
            select
                city.geo_id city_geo_id,
                city.geo_name city_geo_name,
                city.related_geo_id zip_geo_id,
                city.related_geo_name zip_geo_name
            from us_addresses__poi.cybersyn.geography_relationships country
            join us_addresses__poi.cybersyn.geography_relationships city
                on country.related_geo_id = city.geo_id
            where true
                and country.geo_id = 'country/USA'
                and city.level = 'City'
                and city.related_level = 'CensusZipCodeTabulationArea'
            order by city_geo_id
        """,
    ),
    View(
        name="weather_joined_with_major_cities",
        columns=[
            ViewColumn(name="geo_id"),
            ViewColumn(name="geo_name"),
            ViewColumn(name="total_population"),
            ViewColumn(name="avg_temperature_air_f"),
            ViewColumn(name="avg_relative_humidity_pct"),
            ViewColumn(name="avg_cloud_cover_pct"),
            ViewColumn(name="precipitation_probability_pct"),
        ],
        query="""
            select
                city.geo_id,
                city.geo_name,
                city.total_population,
                avg(avg_temperature_air_f) avg_temperature_air_f,
                avg(avg_relative_humidity_pct) avg_relative_humidity_pct,
                avg(avg_cloud_cover_pct) avg_cloud_cover_pct,
                avg(precipitation_probability_pct) precipitation_probability_pct
            from major_us_cities city
            join zip_codes_in_city zip on city.geo_id = zip.city_geo_id
            join weather_forecast weather on zip.zip_geo_name = weather.postal_code
            group by city.geo_id, city.geo_name, city.total_population
        """,
    ),
    # Placeholder: Add new view definition here
]


# entry point for PythonAPI
root = Root(Session.builder.getOrCreate())

# create views in Snowflake
silver_schema = root.databases["quickstart_prod"].schemas["silver"]
silver_schema.user_defined_functions.create(
    map_city_to_airport, mode=CreateMode.or_replace
)
for view in pipeline:
    silver_schema.views.create(view, mode=CreateMode.or_replace)
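
# Optional sanity check, as a sketch (assumption: the installed snowflake.core
# version exposes ViewCollection.iter(); run ad hoc, not as part of the pipeline):
#
#   for v in silver_schema.views.iter():
#       print(v.name)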