-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #24 from Datateer/dat-494-data-source-setup
dat-494: surrogate key macros
- Loading branch information
Showing
3 changed files
with
122 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
{% macro find_surrogate_key(table_name, schema_name, database_name="RAW", sample_size=None) %} | ||
{{ log('Starting to find surrogate key for table: ' ~ database_name ~ '.' ~ schema_name ~ '.' ~ table_name, info=True) }} | ||
|
||
-- Handle sampling if specified | ||
{% set sample_clause = "" %} | ||
{% if sample_size %} | ||
{% if sample_size < 1 %} | ||
{% set sample_clause = "TABLESAMPLE BERNOULLI(" ~ (sample_size * 100) ~ ")" %} | ||
{% endif %} | ||
{% endif %} | ||
|
||
-- Check if the table has records | ||
{% set records_check_query %} | ||
SELECT COUNT(*) AS total_count | ||
FROM {{ database_name }}.{{ schema_name }}.{{ table_name }} | ||
{% endset %} | ||
|
||
{% set records_result = run_query(records_check_query) %} | ||
{% set total_records = records_result.columns[0].values()[0] %} | ||
|
||
-- If no records, log and return message | ||
{% if total_records == 0 %} | ||
{{ print('No records in raw table so cannot evaluate potential surrogate keys') }} | ||
{{ return('No records in raw table so cannot evaluate potential surrogate keys') }} | ||
{% endif %} | ||
|
||
|
||
{% set column_query = get_columns_query(database_name, table_name, schema_name) %} | ||
{% set result = run_query(column_query) %} | ||
{% set columns = result.columns[0].values() %} | ||
{% set potential_keys = [] %} | ||
|
||
-- Evaluate uniqueness for each column | ||
{% for column in columns %} | ||
{{ log('Checking column: ' ~ column, info=True) }} | ||
{% set unique_count_query %} | ||
SELECT COUNT(DISTINCT "{{ column }}") AS unique_count, | ||
COUNT(*) AS total_count | ||
FROM {{ database_name }}.{{ schema_name }}.{{ table_name }} | ||
{{ sample_clause }} | ||
{% endset %} | ||
{{ log('Column check query: ' ~ unique_count_query, info=False) }} | ||
|
||
{% set results = run_query(unique_count_query) %} | ||
{% if results.columns[0].values()[0] == results.columns[1].values()[0] %} | ||
{% do potential_keys.append(column) %} | ||
{% endif %} | ||
{% endfor %} | ||
|
||
-- Check combinations of two and three columns if no single column is a definitive surrogate key | ||
{% if potential_keys | length == 0 %} | ||
{% set all_combinations = generate_column_combinations(columns) %} | ||
|
||
-- Iterate through each combination to execute the uniqueness query | ||
{% for combined_columns in all_combinations %} | ||
{% set combo_query %} | ||
SELECT COUNT(DISTINCT CONCAT({{ combined_columns }})) AS unique_count, | ||
COUNT(*) AS total_count | ||
FROM {{ database_name }}.{{ schema_name }}.{{ table_name }} | ||
WHERE _sdc_batched_at = (SELECT MAX(_sdc_batched_at) FROM {{ database_name }}.{{ schema_name }}.{{ table_name }}) | ||
{{ sample_clause }} | ||
{% endset %} | ||
{{ log('Combination check query: ' ~ combo_query, info=True) }} | ||
{% set combo_results = run_query(combo_query) %} | ||
|
||
-- Check if the combination provides a unique set of values | ||
{% if combo_results.columns[0].values()[0] == combo_results.columns[1].values()[0] %} | ||
{% do potential_keys.append(combined_columns) %} | ||
{% endif %} | ||
{% endfor %} | ||
{% endif %} | ||
|
||
-- Return the list of potential keys or combinations | ||
{% if potential_keys | length > 0 %} | ||
{{ log('Potential keys found: ' ~ potential_keys | join(' -and- '), info=True) }} | ||
|
||
-- Sort keys by the number of columns (count of commas + 1) and select the first one | ||
{% set potential_keys_sorted = potential_keys | sort(attribute='length') %} | ||
{% set shortest_key = potential_keys_sorted[0] | replace('"', '') | lower %} | ||
|
||
{{ log('Chosen surrogate key with the fewest columns: ' ~ shortest_key, info=True) }} | ||
{{ print(shortest_key) }} | ||
{% else %} | ||
{{ log('No definitive surrogate key found.', info=True) }} | ||
{{ print('No definitive surrogate key found.') }} | ||
{% endif %} | ||
|
||
{% endmacro %} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
{% macro generate_column_combinations(columns) %} | ||
{% set combinations = [] %} | ||
|
||
-- Generate all pairs of columns | ||
{% for i in range(columns | length) %} | ||
{% for j in range(i + 1, columns | length) %} | ||
{% set pair = '"' ~ columns[i] ~ '", "' ~ columns[j] ~ '"' %} | ||
{% do combinations.append(pair) %} | ||
{% endfor %} | ||
{% endfor %} | ||
|
||
{{ log('combinations: ' ~ combinations, info=False) }} | ||
{{ return(combinations) }} | ||
{% endmacro %} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
{% macro get_columns_query(database_name, table_name, schema_name) %} | ||
|
||
{% set column_query %} | ||
SELECT COLUMN_NAME, DATA_TYPE | ||
FROM {{ database_name }}.INFORMATION_SCHEMA.COLUMNS | ||
WHERE TABLE_NAME ilike '{{ table_name }}' | ||
AND TABLE_SCHEMA ilike '{{ schema_name }}' | ||
AND TABLE_CATALOG ilike '{{ database_name }}' | ||
AND COLUMN_NAME not ilike '_sdc_%' | ||
ORDER BY CASE | ||
WHEN DATA_TYPE IN ('UNIQUEIDENTIFIER', 'GUID') THEN 0 | ||
WHEN COLUMN_NAME LIKE '%ID%' OR COLUMN_NAME LIKE '%KEY%' THEN 1 | ||
ELSE 2 | ||
END, COLUMN_NAME | ||
{% endset %} | ||
|
||
{{ log('Generated column query: ' ~ column_query, info=True) }} | ||
{% do return(column_query) %} | ||
|
||
{% endmacro %} |