From d84ad5d0d10a81c2483029ef5252954d49045a8a Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 12 Sep 2024 11:29:43 +0100 Subject: [PATCH 1/5] test unlinkables --- tests/test_basic_functionality.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_basic_functionality.py b/tests/test_basic_functionality.py index 0a8f3c2..714563f 100644 --- a/tests/test_basic_functionality.py +++ b/tests/test_basic_functionality.py @@ -117,6 +117,14 @@ def test_m_u_chart(api_info, fake_1000_factory, fake_1000_settings): linker.visualisations.m_u_parameters_chart() +def test_unlinkables_chart(api_info, fake_1000_factory, fake_1000_settings): + db_api = api_info["db_api"] + df = fake_1000_factory(api_info["version"]) + linker = Linker(df, fake_1000_settings, db_api) + + linker.evaluation.unlinkables_chart() + + def test_comparison_viewer_dashboard( api_info, fake_1000_factory, fake_1000_settings, tmp_path ): From b6ae603cfd337878c3158d20e9bad0d364dab64e Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 12 Sep 2024 11:31:19 +0100 Subject: [PATCH 2/5] Rename workflow --- .github/workflows/test-clickhouse.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-clickhouse.yaml b/.github/workflows/test-clickhouse.yaml index 9d4e677..f4b0887 100644 --- a/.github/workflows/test-clickhouse.yaml +++ b/.github/workflows/test-clickhouse.yaml @@ -1,4 +1,4 @@ -name: chDB tests +name: Clickhouse tests on: pull_request: branches: From 231a07d1d234e984b60aa123064f487d7a255a96 Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 12 Sep 2024 13:45:07 +0100 Subject: [PATCH 3/5] settings fixture -> factory we need different versions for chdb and clickhouse, to workaround an issue where NULL values are not interpreted as such on import to chdb --- tests/conftest.py | 70 +++++++++++++++++++++---------- 
tests/test_basic_functionality.py | 49 ++++++++++++++++------ 2 files changed, 85 insertions(+), 34 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 6903c1e..54b1f3a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,7 +2,7 @@ import splink.comparison_library as cl from chdb import dbapi from pytest import fixture, mark, param -from splink import SettingsCreator, block_on, splink_datasets +from splink import ColumnExpression, SettingsCreator, block_on, splink_datasets from splinkclickhouse import ChDBAPI, ClickhouseAPI @@ -84,23 +84,51 @@ def fake_1000(version): @fixture -def fake_1000_settings(): - return SettingsCreator( - link_type="dedupe_only", - comparisons=[ - cl.JaroWinklerAtThresholds("first_name"), - cl.JaroAtThresholds("surname"), - cl.DateOfBirthComparison( - "dob", - input_is_string=True, - ), - cl.DamerauLevenshteinAtThresholds("city").configure( - term_frequency_adjustments=True - ), - cl.JaccardAtThresholds("email"), - ], - blocking_rules_to_generate_predictions=[ - block_on("first_name", "dob"), - block_on("surname"), - ], - ) +def fake_1000_settings_factory(): + def fake_1000_settings(version): + if version == "clickhouse": + return SettingsCreator( + link_type="dedupe_only", + comparisons=[ + cl.JaroWinklerAtThresholds("first_name"), + cl.JaroAtThresholds("surname"), + cl.DateOfBirthComparison( + "dob", + input_is_string=True, + ), + cl.DamerauLevenshteinAtThresholds("city").configure( + term_frequency_adjustments=True + ), + cl.JaccardAtThresholds("email"), + ], + blocking_rules_to_generate_predictions=[ + block_on("first_name", "dob"), + block_on("surname"), + ], + ) + # for chdb we wrap all columns in regex_extract, which also includes a nullif + # this circumvents issue where string column NULL values are parsed as empty + # string instead of NULL when we import them into chdb + return SettingsCreator( + link_type="dedupe_only", + comparisons=[ + cl.JaroWinklerAtThresholds( + 
ColumnExpression("first_name").regex_extract(".*") + ), + cl.JaroAtThresholds(ColumnExpression("surname").regex_extract(".*")), + cl.DateOfBirthComparison( + ColumnExpression("dob").regex_extract(".*"), + input_is_string=True, + ), + cl.DamerauLevenshteinAtThresholds( + ColumnExpression("city").regex_extract(".*") + ).configure(term_frequency_adjustments=True), + cl.JaccardAtThresholds(ColumnExpression("email").regex_extract(".*")), + ], + blocking_rules_to_generate_predictions=[ + block_on("first_name", "dob"), + block_on("surname"), + ], + ) + + return fake_1000_settings diff --git a/tests/test_basic_functionality.py b/tests/test_basic_functionality.py index 714563f..fe1239d 100644 --- a/tests/test_basic_functionality.py +++ b/tests/test_basic_functionality.py @@ -5,31 +5,35 @@ from splink.exploratory import completeness_chart, profile_columns -def test_make_linker(api_info, fake_1000_factory, fake_1000_settings): +def test_make_linker(api_info, fake_1000_factory, fake_1000_settings_factory): db_api = api_info["db_api"] df = fake_1000_factory(api_info["version"]) + fake_1000_settings = fake_1000_settings_factory(api_info["version"]) Linker(df, fake_1000_settings, db_api) -def test_train_u(api_info, fake_1000_factory, fake_1000_settings): +def test_train_u(api_info, fake_1000_factory, fake_1000_settings_factory): db_api = api_info["db_api"] df = fake_1000_factory(api_info["version"]) + fake_1000_settings = fake_1000_settings_factory(api_info["version"]) linker = Linker(df, fake_1000_settings, db_api) linker.training.estimate_u_using_random_sampling(max_pairs=3e4) -def test_train_lambda(api_info, fake_1000_factory, fake_1000_settings): +def test_train_lambda(api_info, fake_1000_factory, fake_1000_settings_factory): db_api = api_info["db_api"] df = fake_1000_factory(api_info["version"]) + fake_1000_settings = fake_1000_settings_factory(api_info["version"]) linker = Linker(df, fake_1000_settings, db_api) linker.training.estimate_probability_two_random_records_match( 
[block_on("dob"), block_on("first_name", "surname")], recall=0.8 ) -def test_em_training(api_info, fake_1000_factory, fake_1000_settings): +def test_em_training(api_info, fake_1000_factory, fake_1000_settings_factory): db_api = api_info["db_api"] df = fake_1000_factory(api_info["version"]) + fake_1000_settings = fake_1000_settings_factory(api_info["version"]) linker = Linker(df, fake_1000_settings, db_api) linker.training.estimate_parameters_using_expectation_maximisation( block_on("dob"), @@ -39,16 +43,18 @@ def test_em_training(api_info, fake_1000_factory, fake_1000_settings): ) -def test_predict(api_info, fake_1000_factory, fake_1000_settings): +def test_predict(api_info, fake_1000_factory, fake_1000_settings_factory): db_api = api_info["db_api"] df = fake_1000_factory(api_info["version"]) + fake_1000_settings = fake_1000_settings_factory(api_info["version"]) linker = Linker(df, fake_1000_settings, db_api) linker.inference.predict() -def test_clustering(api_info, fake_1000_factory, fake_1000_settings): +def test_clustering(api_info, fake_1000_factory, fake_1000_settings_factory): db_api = api_info["db_api"] df = fake_1000_factory(api_info["version"]) + fake_1000_settings = fake_1000_settings_factory(api_info["version"]) linker = Linker(df, fake_1000_settings, db_api) df_predict = linker.inference.predict() linker.clustering.cluster_pairwise_predictions_at_threshold( @@ -57,9 +63,12 @@ def test_clustering(api_info, fake_1000_factory, fake_1000_settings): ) -def test_cumulative_comparisons(api_info, fake_1000_factory, fake_1000_settings): +def test_cumulative_comparisons( + api_info, fake_1000_factory, fake_1000_settings_factory +): db_api = api_info["db_api"] df = fake_1000_factory(api_info["version"]) + fake_1000_settings = fake_1000_settings_factory(api_info["version"]) blocking_rules = fake_1000_settings.blocking_rules_to_generate_predictions @@ -89,16 +98,20 @@ def test_completeness(api_info, fake_1000_factory): completeness_chart(df, db_api=db_api) -def 
test_match_weights_chart(api_info, fake_1000_factory, fake_1000_settings): +def test_match_weights_chart(api_info, fake_1000_factory, fake_1000_settings_factory): db_api = api_info["db_api"] df = fake_1000_factory(api_info["version"]) + fake_1000_settings = fake_1000_settings_factory(api_info["version"]) linker = Linker(df, fake_1000_settings, db_api) linker.visualisations.match_weights_chart() -def test_parameter_estimates_chart(api_info, fake_1000_factory, fake_1000_settings): +def test_parameter_estimates_chart( + api_info, fake_1000_factory, fake_1000_settings_factory +): db_api = api_info["db_api"] df = fake_1000_factory(api_info["version"]) + fake_1000_settings = fake_1000_settings_factory(api_info["version"]) linker = Linker(df, fake_1000_settings, db_api) linker.training.estimate_parameters_using_expectation_maximisation( block_on("dob"), @@ -109,27 +122,36 @@ def test_parameter_estimates_chart(api_info, fake_1000_factory, fake_1000_settin linker.visualisations.parameter_estimate_comparisons_chart() -def test_m_u_chart(api_info, fake_1000_factory, fake_1000_settings): +def test_m_u_chart(api_info, fake_1000_factory, fake_1000_settings_factory): db_api = api_info["db_api"] df = fake_1000_factory(api_info["version"]) + fake_1000_settings = fake_1000_settings_factory(api_info["version"]) linker = Linker(df, fake_1000_settings, db_api) linker.visualisations.m_u_parameters_chart() -def test_unlinkables_chart(api_info, fake_1000_factory, fake_1000_settings): +def test_unlinkables_chart(api_info, fake_1000_factory, fake_1000_settings_factory): db_api = api_info["db_api"] df = fake_1000_factory(api_info["version"]) + fake_1000_settings = fake_1000_settings_factory(api_info["version"]) + linker = Linker(df, fake_1000_settings, db_api) + # db_api.debug_mode = True linker.evaluation.unlinkables_chart() + # import json + # with open(f"tmp_{api_info['version']}.json", "w+") as f: + # json.dump(ch, f) + # raise TypeError() def test_comparison_viewer_dashboard( - 
api_info, fake_1000_factory, fake_1000_settings, tmp_path + api_info, fake_1000_factory, fake_1000_settings_factory, tmp_path ): db_api = api_info["db_api"] df = fake_1000_factory(api_info["version"]) + fake_1000_settings = fake_1000_settings_factory(api_info["version"]) fake_1000_settings.retain_intermediate_calculation_columns = True linker = Linker(df, fake_1000_settings, db_api) @@ -138,10 +160,11 @@ def test_comparison_viewer_dashboard( def test_cluster_studio_dashboard( - api_info, fake_1000_factory, fake_1000_settings, tmp_path + api_info, fake_1000_factory, fake_1000_settings_factory, tmp_path ): db_api = api_info["db_api"] df = fake_1000_factory(api_info["version"]) + fake_1000_settings = fake_1000_settings_factory(api_info["version"]) fake_1000_settings.retain_intermediate_calculation_columns = True linker = Linker(df, fake_1000_settings, db_api) From 928b3464f74c92f90b54ce45c38c930f8a38feab Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 12 Sep 2024 13:47:22 +0100 Subject: [PATCH 4/5] update full run test to use new settings factory --- tests/test_full_run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_full_run.py b/tests/test_full_run.py index eb3dab3..a714bde 100644 --- a/tests/test_full_run.py +++ b/tests/test_full_run.py @@ -3,9 +3,10 @@ # this tests similar steps to test_basic_functionality.py, but alltogether # this should catch issues we may have in building up cache/other state -def test_full_basic_run(api_info, fake_1000_factory, fake_1000_settings): +def test_full_basic_run(api_info, fake_1000_factory, fake_1000_settings_factory): db_api = api_info["db_api"] df = fake_1000_factory(api_info["version"]) + fake_1000_settings = fake_1000_settings_factory(api_info["version"]) linker = Linker(df, fake_1000_settings, db_api) # training From bfa33989e25bfdda93b28d6c4619ba0b451475e7 Mon Sep 17 00:00:00 2001 From: ADBond <48208438+ADBond@users.noreply.github.com> Date: Thu, 12 
Sep 2024 13:57:19 +0100 Subject: [PATCH 5/5] Explain `NULL` issue in README + workaround --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 7fa6e93..64ddd7a 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,18 @@ If you require different behaviour (for instance if you have an unusual date for There is not currently a way in Clickhouse to deal directly with date values before 1900 - if you require such values you will have to manually process these to a different type, and construct the relevant SQL logic. +### `NULL` values in `chDB` + +When passing data into `chdb` from pandas or pyarrow tables, `NULL` values in `String` columns are converted into empty strings, instead of remaining `NULL`. + +For now this is not handled within the package. You can work around the issue by wrapping column names in `NULLIF`: + +```python +import splink.comparison_library as cl + +fn_comparison = cl.DamerauLevenshteinAtThresholds("NULLIF(first_name, '')") +``` + ### Term-frequency adjustments Currently at most one term frequency adjustment can be used with `ClickhouseAPI`.