Skip to content

Commit

Permalink
Merge pull request #24 from Datateer/dat-494-data-source-setup
Browse files Browse the repository at this point in the history
dat-494: surrogate key macros
  • Loading branch information
aroder authored May 17, 2024
2 parents 8777735 + 56260e1 commit 4837dad
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 0 deletions.
88 changes: 88 additions & 0 deletions macros/find_surrogate_key.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{% macro find_surrogate_key(table_name, schema_name, database_name="RAW", sample_size=None) %}
{{ log('Starting to find surrogate key for table: ' ~ database_name ~ '.' ~ schema_name ~ '.' ~ table_name, info=True) }}

-- Handle sampling if specified
{% set sample_clause = "" %}
{% if sample_size %}
{% if sample_size < 1 %}
{% set sample_clause = "TABLESAMPLE BERNOULLI(" ~ (sample_size * 100) ~ ")" %}
{% endif %}
{% endif %}

-- Check if the table has records
{% set records_check_query %}
SELECT COUNT(*) AS total_count
FROM {{ database_name }}.{{ schema_name }}.{{ table_name }}
{% endset %}

{% set records_result = run_query(records_check_query) %}
{% set total_records = records_result.columns[0].values()[0] %}

-- If no records, log and return message
{% if total_records == 0 %}
{{ print('No records in raw table so cannot evaluate potential surrogate keys') }}
{{ return('No records in raw table so cannot evaluate potential surrogate keys') }}
{% endif %}


{% set column_query = get_columns_query(database_name, table_name, schema_name) %}
{% set result = run_query(column_query) %}
{% set columns = result.columns[0].values() %}
{% set potential_keys = [] %}

-- Evaluate uniqueness for each column
{% for column in columns %}
{{ log('Checking column: ' ~ column, info=True) }}
{% set unique_count_query %}
SELECT COUNT(DISTINCT "{{ column }}") AS unique_count,
COUNT(*) AS total_count
FROM {{ database_name }}.{{ schema_name }}.{{ table_name }}
{{ sample_clause }}
{% endset %}
{{ log('Column check query: ' ~ unique_count_query, info=False) }}

{% set results = run_query(unique_count_query) %}
{% if results.columns[0].values()[0] == results.columns[1].values()[0] %}
{% do potential_keys.append(column) %}
{% endif %}
{% endfor %}

-- Check combinations of two and three columns if no single column is a definitive surrogate key
{% if potential_keys | length == 0 %}
{% set all_combinations = generate_column_combinations(columns) %}

-- Iterate through each combination to execute the uniqueness query
{% for combined_columns in all_combinations %}
{% set combo_query %}
SELECT COUNT(DISTINCT CONCAT({{ combined_columns }})) AS unique_count,
COUNT(*) AS total_count
FROM {{ database_name }}.{{ schema_name }}.{{ table_name }}
WHERE _sdc_batched_at = (SELECT MAX(_sdc_batched_at) FROM {{ database_name }}.{{ schema_name }}.{{ table_name }})
{{ sample_clause }}
{% endset %}
{{ log('Combination check query: ' ~ combo_query, info=True) }}
{% set combo_results = run_query(combo_query) %}

-- Check if the combination provides a unique set of values
{% if combo_results.columns[0].values()[0] == combo_results.columns[1].values()[0] %}
{% do potential_keys.append(combined_columns) %}
{% endif %}
{% endfor %}
{% endif %}

-- Return the list of potential keys or combinations
{% if potential_keys | length > 0 %}
{{ log('Potential keys found: ' ~ potential_keys | join(' -and- '), info=True) }}

-- Sort keys by the number of columns (count of commas + 1) and select the first one
{% set potential_keys_sorted = potential_keys | sort(attribute='length') %}
{% set shortest_key = potential_keys_sorted[0] | replace('"', '') | lower %}

{{ log('Chosen surrogate key with the fewest columns: ' ~ shortest_key, info=True) }}
{{ print(shortest_key) }}
{% else %}
{{ log('No definitive surrogate key found.', info=True) }}
{{ print('No definitive surrogate key found.') }}
{% endif %}

{% endmacro %}
14 changes: 14 additions & 0 deletions macros/generate_column_combinations.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{% macro generate_column_combinations(columns) %}
{% set combinations = [] %}

-- Generate all pairs of columns
{% for i in range(columns | length) %}
{% for j in range(i + 1, columns | length) %}
{% set pair = '"' ~ columns[i] ~ '", "' ~ columns[j] ~ '"' %}
{% do combinations.append(pair) %}
{% endfor %}
{% endfor %}

{{ log('combinations: ' ~ combinations, info=False) }}
{{ return(combinations) }}
{% endmacro %}
20 changes: 20 additions & 0 deletions macros/get_surrogate_key_candidate_columns.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{% macro get_columns_query(database_name, table_name, schema_name) %}

{% set column_query %}
SELECT COLUMN_NAME, DATA_TYPE
FROM {{ database_name }}.INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME ilike '{{ table_name }}'
AND TABLE_SCHEMA ilike '{{ schema_name }}'
AND TABLE_CATALOG ilike '{{ database_name }}'
AND COLUMN_NAME not ilike '_sdc_%'
ORDER BY CASE
WHEN DATA_TYPE IN ('UNIQUEIDENTIFIER', 'GUID') THEN 0
WHEN COLUMN_NAME LIKE '%ID%' OR COLUMN_NAME LIKE '%KEY%' THEN 1
ELSE 2
END, COLUMN_NAME
{% endset %}

{{ log('Generated column query: ' ~ column_query, info=True) }}
{% do return(column_query) %}

{% endmacro %}

0 comments on commit 4837dad

Please sign in to comment.