From de3765d2b947f4f39a5c063d094fdc36e67186de Mon Sep 17 00:00:00 2001 From: Michael Terry Date: Wed, 8 Jan 2025 13:12:17 -0300 Subject: [PATCH] Cut out any cube result less than 10 by default. With an option (--option min-bucket-size:3) to change it. --- README.md | 12 +++++++ cumulus_library_data_metrics/base.py | 20 +++++++++++ cumulus_library_data_metrics/utils.jinja | 33 +++++++++++-------- tests/data/c_pt_count/min-bucket/expected.csv | 4 +++ .../c_pt_count/min-bucket/patients.ndjson | 13 ++++++++ tests/test_metrics.py | 15 ++++++++- 6 files changed, 82 insertions(+), 15 deletions(-) create mode 100644 tests/data/c_pt_count/min-bucket/expected.csv create mode 100644 tests/data/c_pt_count/min-bucket/patients.ndjson diff --git a/README.md b/README.md index 68a144b..82ca23e 100644 --- a/README.md +++ b/README.md @@ -99,3 +99,15 @@ That is, run it like: ```sh cumulus-library build --option output-mode:aggregate ... ``` + +#### Bucket sizes + +To help preserve privacy, this study ignores any count results of less than ten. + +For example, if there are only two male patients that died at age 55, +that combination of male & 55 will be dropped from the `c_pt_deceased_count` table. + +This makes it easier to share the count results with other institutions. +But if that's not a concern and you want the fine-grained details, +you can run the build step with `--option min-bucket-size:0` to turn this feature off. +Or use another value to change the bucket threshold (the default value is 10). diff --git a/cumulus_library_data_metrics/base.py b/cumulus_library_data_metrics/base.py index 9a3f4e6..2f86bd2 100644 --- a/cumulus_library_data_metrics/base.py +++ b/cumulus_library_data_metrics/base.py @@ -110,6 +110,7 @@ def prepare_queries( ) -> None: self.study_prefix = manifest.get_study_prefix() self.output_mode = self.get_output_mode(config) + self.min_bucket_size = self.get_min_bucket_size(config) self._query_schema(config) self.extra_schema_checks(config) self.add_metric_queries() @@ -129,6 +130,21 @@ def get_output_mode(self, config: cumulus_library.StudyConfig) -> str: output_mode = "cube" return output_mode + def get_min_bucket_size(self, config: cumulus_library.StudyConfig) -> int: + orig_min_bucket_size = config.options.get("min-bucket-size") + if orig_min_bucket_size is None: + return 10 + + try: + min_bucket_size = int(orig_min_bucket_size) + except ValueError: + min_bucket_size = -1 + + if min_bucket_size < 0: + sys.exit(f"Did not understand minimum bucket size '{orig_min_bucket_size}'.") + + return min_bucket_size + def render_sql(self, template: str, **kwargs) -> str: path = os.path.dirname(__file__) @@ -147,6 +163,10 @@ def render_sql(self, template: str, **kwargs) -> str: template = file.read() loader = jinja2.FileSystemLoader(path) env = jinja2.Environment(loader=loader).from_string(template) # noqa: S701 + + # Set global variables that we want easily accessible even in macros + env.globals["min_bucket_size"] = self.min_bucket_size + sql = env.render(**kwargs) # print(sql) return sql diff --git a/cumulus_library_data_metrics/utils.jinja b/cumulus_library_data_metrics/utils.jinja index 558eb01..fac2bde 100644 --- a/cumulus_library_data_metrics/utils.jinja +++ b/cumulus_library_data_metrics/utils.jinja @@ -219,21 +219,26 @@ {% macro make_counts(src_table, mode, unique_ids=null) -%} {% set cols = caller() %} {% set unique_ids = unique_ids or ['id'] %} -SELECT -{% if unique_ids|length > 1 %} - COUNT(DISTINCT CONCAT({{ ", ':', ".join(unique_ids) }})) AS cnt, -{% else %} - COUNT(DISTINCT {{ unique_ids[0] }}) AS cnt, -{% endif %} - {{ cols }} -FROM {{ src_table }} -{% if mode == "cube" %} -GROUP BY CUBE( -{% else %} -GROUP BY ( -{% endif %} - {{ cols }} +SELECT cnt, {{ cols }} +FROM ( + SELECT + {% if unique_ids|length > 1 %} + COUNT(DISTINCT CONCAT({{ ", ':', ".join(unique_ids) }})) AS cnt, + {% else %} + COUNT(DISTINCT {{ unique_ids[0] }}) AS cnt, + {% endif %} + {{ cols }} + FROM {{ src_table }} + {% if mode == "cube" %} + GROUP BY CUBE( + {% else %} + GROUP BY ( + {% endif %} + {{ cols }} + ) ) +-- Chop off any rows that don't have enough patients to meet privacy needs +WHERE cnt >= {{ min_bucket_size }} {%- endmacro %} diff --git a/tests/data/c_pt_count/min-bucket/expected.csv b/tests/data/c_pt_count/min-bucket/expected.csv new file mode 100644 index 0000000..6b51ecb --- /dev/null +++ b/tests/data/c_pt_count/min-bucket/expected.csv @@ -0,0 +1,4 @@ +cnt,birth_year,administrative_gender,ethnicity,race,deceased,status +13,,,,,, +11,,,,,false, +10,,,,cumulus__none,, diff --git a/tests/data/c_pt_count/min-bucket/patients.ndjson b/tests/data/c_pt_count/min-bucket/patients.ndjson new file mode 100644 index 0000000..add6c0f --- /dev/null +++ b/tests/data/c_pt_count/min-bucket/patients.ndjson @@ -0,0 +1,13 @@ +{"resourceType": "Patient", "id": "all-values", "deceasedBoolean": false, "active": true, "gender": "female", "birthDate": "2000-04-04", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2028-9", "system": "urn:oid:2.16.840.1.113883.6.238"}}, {"url": "ombCategory", "valueCoding": {"code": "2106-3", "system": "urn:oid:2.16.840.1.113883.6.238"}}, {"url": "ombCategory", "valueCoding": {"code": "2076-8", "system": "urn:oid:2.16.840.1.113883.6.238"}}, {"url": "ombCategory", "valueCoding": {"code": "2054-5", "system": "urn:oid:2.16.840.1.113883.6.238"}}, {"url": "ombCategory", "valueCoding": {"code": "1002-5", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}, {"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2135-2", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}]} +{"resourceType": "Patient", "id": "unk-race", "active": true, "gender": "other", "birthDate": "2001-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"code": "UNK", "system": "http://terminology.hl7.org/CodeSystem/v3-NullFlavor"}}]}]} +{"resourceType": "Patient", "id": "asku-race", "active": true, "gender": "other", "birthDate": "2001-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"code": "ASKU", "system": "http://terminology.hl7.org/CodeSystem/v3-NullFlavor"}}]}]} +{"resourceType": "Patient", "id": "unexpected-race", "active": true, "gender": "other", "birthDate": "2001-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"code": "race", "system": "weird-system"}}]}]} +{"resourceType": "Patient", "id": "non-hispanic-ethnicity", "active": true, "gender": "other", "birthDate": "2003-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2186-5", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}]} +{"resourceType": "Patient", "id": "unexpected-ethnicity", "active": true, "gender": "other", "birthDate": "2001-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "eth", "system": "weird-system"}}]}]} +{"resourceType": "Patient", "id": "no-extensions", "active": true, "gender": "other", "birthDate": "2001-01-02"} +{"resourceType": "Patient", "id": "no-birthdate", "active": true, "gender": "other", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2135-2", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}]} +{"resourceType": "Patient", "id": "no-gender", "active": true, "birthDate": "1990-04-04", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2135-2", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}]} +{"resourceType": "Patient", "id": "deceased-bool", "deceasedBoolean": true} +{"resourceType": "Patient", "id": "deceased-datetime", "deceasedDateTime": "2022-09-21"} +{"resourceType": "Patient", "id": "inactive", "active": false} +{"resourceType": "Patient", "id": "nothing"} diff --git a/tests/test_metrics.py b/tests/test_metrics.py index ddf1835..4990a6f 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -47,6 +47,12 @@ def test_cubed_output_mode(self): # Now do the same test but without any input, to confirm the default is cube self.run_study("c_pt_count", test="cubed", prefix="count_", output=None) + def test_min_bucket_size(self): + """Test that the default is 10 and that it cuts out small buckets.""" + self.run_study( + "c_pt_count", test="min-bucket", prefix="count_", output="cube", min_bucket_size=None + ) + def test_c_pt_deceased_count(self): self.run_study("c_pt_deceased_count", prefix="count_") @@ -106,7 +112,12 @@ def setUp(self): self.maxDiff = None def run_study( - self, metric: str, test: str = "general", prefix: str = "", output="aggregate" + self, + metric: str, + test: str = "general", + prefix: str = "", + output: str = "aggregate", + min_bucket_size: int = 0, ) -> None: """Runs a single test case""" test_dir = os.path.dirname(__file__) @@ -160,6 +171,8 @@ def run_study( f"--database={tmpdir}/duck.db", f"--load-ndjson-dir={data_dir}", ] + if min_bucket_size is not None: + args.append(f"--option=min-bucket-size:{min_bucket_size}") if output: args.append(f"--option=output-mode:{output}") cli.main(args)