From de3765d2b947f4f39a5c063d094fdc36e67186de Mon Sep 17 00:00:00 2001
From: Michael Terry <michael.terry@childrens.harvard.edu>
Date: Wed, 8 Jan 2025 13:12:17 -0300
Subject: [PATCH] Cut out any cube result less than 10 by default.

With an option (--option min-bucket-size:3) to change it.
---
 README.md                                     | 12 +++++++
 cumulus_library_data_metrics/base.py          | 20 +++++++++++
 cumulus_library_data_metrics/utils.jinja      | 33 +++++++++++--------
 tests/data/c_pt_count/min-bucket/expected.csv |  4 +++
 .../c_pt_count/min-bucket/patients.ndjson     | 13 ++++++++
 tests/test_metrics.py                         | 15 ++++++++-
 6 files changed, 82 insertions(+), 15 deletions(-)
 create mode 100644 tests/data/c_pt_count/min-bucket/expected.csv
 create mode 100644 tests/data/c_pt_count/min-bucket/patients.ndjson

diff --git a/README.md b/README.md
index 68a144b..82ca23e 100644
--- a/README.md
+++ b/README.md
@@ -99,3 +99,15 @@ That is, run it like:
 ```sh
 cumulus-library build --option output-mode:aggregate ...
 ```
+
+#### Bucket sizes
+
+To help preserve privacy, this study ignores any count results of less than ten.
+
+For example, if there are only two male patients that died at age 55,
+that combination of male & 55 will be dropped from the `c_pt_deceased_count` table.
+
+This makes it easier to share the count results with other institutions.
+But if that's not a concern and you want the fine-grained details,
+you can run the build step with `--option min-bucket-size:0` to turn this feature off.
+Or use another value to change the bucket threshold (the default value is 10).
diff --git a/cumulus_library_data_metrics/base.py b/cumulus_library_data_metrics/base.py
index 9a3f4e6..2f86bd2 100644
--- a/cumulus_library_data_metrics/base.py
+++ b/cumulus_library_data_metrics/base.py
@@ -110,6 +110,7 @@ def prepare_queries(
     ) -> None:
         self.study_prefix = manifest.get_study_prefix()
         self.output_mode = self.get_output_mode(config)
+        self.min_bucket_size = self.get_min_bucket_size(config)
         self._query_schema(config)
         self.extra_schema_checks(config)
         self.add_metric_queries()
@@ -129,6 +130,21 @@ def get_output_mode(self, config: cumulus_library.StudyConfig) -> str:
             output_mode = "cube"
         return output_mode
 
+    def get_min_bucket_size(self, config: cumulus_library.StudyConfig) -> int:
+        orig_min_bucket_size = config.options.get("min-bucket-size")
+        if orig_min_bucket_size is None:
+            return 10
+
+        try:
+            min_bucket_size = int(orig_min_bucket_size)
+        except ValueError:
+            min_bucket_size = -1
+
+        if min_bucket_size < 0:
+            sys.exit(f"Did not understand minimum bucket size '{orig_min_bucket_size}'.")
+
+        return min_bucket_size
+
     def render_sql(self, template: str, **kwargs) -> str:
         path = os.path.dirname(__file__)
 
@@ -147,6 +163,10 @@ def render_sql(self, template: str, **kwargs) -> str:
             template = file.read()
             loader = jinja2.FileSystemLoader(path)
             env = jinja2.Environment(loader=loader).from_string(template)  # noqa: S701
+
+            # Set global variables that we want easily accessible even in macros
+            env.globals["min_bucket_size"] = self.min_bucket_size
+
             sql = env.render(**kwargs)
             # print(sql)
             return sql
diff --git a/cumulus_library_data_metrics/utils.jinja b/cumulus_library_data_metrics/utils.jinja
index 558eb01..fac2bde 100644
--- a/cumulus_library_data_metrics/utils.jinja
+++ b/cumulus_library_data_metrics/utils.jinja
@@ -219,21 +219,26 @@
 {% macro make_counts(src_table, mode, unique_ids=null) -%}
 {% set cols = caller() %}
 {% set unique_ids = unique_ids or ['id'] %}
-SELECT
-{% if unique_ids|length > 1 %}
-    COUNT(DISTINCT CONCAT({{ ", ':', ".join(unique_ids) }})) AS cnt,
-{% else %}
-    COUNT(DISTINCT {{ unique_ids[0] }}) AS cnt,
-{% endif %}
-    {{ cols }}
-FROM {{ src_table }}
-{% if mode == "cube" %}
-GROUP BY CUBE(
-{% else %}
-GROUP BY (
-{% endif %}
-    {{ cols }}
+SELECT cnt, {{ cols }}
+FROM (
+    SELECT
+    {% if unique_ids|length > 1 %}
+        COUNT(DISTINCT CONCAT({{ ", ':', ".join(unique_ids) }})) AS cnt,
+    {% else %}
+        COUNT(DISTINCT {{ unique_ids[0] }}) AS cnt,
+    {% endif %}
+        {{ cols }}
+    FROM {{ src_table }}
+    {% if mode == "cube" %}
+    GROUP BY CUBE(
+    {% else %}
+    GROUP BY (
+    {% endif %}
+        {{ cols }}
+    )
 )
+-- Chop off any rows that don't have enough patients to meet privacy needs
+WHERE cnt >= {{ min_bucket_size }}
 {%- endmacro %}
 
 
diff --git a/tests/data/c_pt_count/min-bucket/expected.csv b/tests/data/c_pt_count/min-bucket/expected.csv
new file mode 100644
index 0000000..6b51ecb
--- /dev/null
+++ b/tests/data/c_pt_count/min-bucket/expected.csv
@@ -0,0 +1,4 @@
+cnt,birth_year,administrative_gender,ethnicity,race,deceased,status
+13,,,,,,
+11,,,,,false,
+10,,,,cumulus__none,,
diff --git a/tests/data/c_pt_count/min-bucket/patients.ndjson b/tests/data/c_pt_count/min-bucket/patients.ndjson
new file mode 100644
index 0000000..add6c0f
--- /dev/null
+++ b/tests/data/c_pt_count/min-bucket/patients.ndjson
@@ -0,0 +1,13 @@
+{"resourceType": "Patient", "id": "all-values", "deceasedBoolean": false, "active": true, "gender": "female", "birthDate": "2000-04-04", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2028-9", "system": "urn:oid:2.16.840.1.113883.6.238"}}, {"url": "ombCategory", "valueCoding": {"code": "2106-3", "system": "urn:oid:2.16.840.1.113883.6.238"}}, {"url": "ombCategory", "valueCoding": {"code": "2076-8", "system": "urn:oid:2.16.840.1.113883.6.238"}}, {"url": "ombCategory", "valueCoding": {"code": "2054-5", "system": "urn:oid:2.16.840.1.113883.6.238"}}, {"url": "ombCategory", "valueCoding": {"code": "1002-5", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}, {"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2135-2", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}]}
+{"resourceType": "Patient", "id": "unk-race", "active": true, "gender": "other", "birthDate": "2001-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"code": "UNK", "system": "http://terminology.hl7.org/CodeSystem/v3-NullFlavor"}}]}]}
+{"resourceType": "Patient", "id": "asku-race", "active": true, "gender": "other", "birthDate": "2001-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"code": "ASKU", "system": "http://terminology.hl7.org/CodeSystem/v3-NullFlavor"}}]}]}
+{"resourceType": "Patient", "id": "unexpected-race", "active": true, "gender": "other", "birthDate": "2001-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"code": "race", "system": "weird-system"}}]}]}
+{"resourceType": "Patient", "id": "non-hispanic-ethnicity", "active": true, "gender": "other", "birthDate": "2003-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2186-5", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}]}
+{"resourceType": "Patient", "id": "unexpected-ethnicity", "active": true, "gender": "other", "birthDate": "2001-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "eth", "system": "weird-system"}}]}]}
+{"resourceType": "Patient", "id": "no-extensions", "active": true, "gender": "other", "birthDate": "2001-01-02"}
+{"resourceType": "Patient", "id": "no-birthdate", "active": true, "gender": "other", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2135-2", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}]}
+{"resourceType": "Patient", "id": "no-gender", "active": true, "birthDate": "1990-04-04", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2135-2", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}]}
+{"resourceType": "Patient", "id": "deceased-bool", "deceasedBoolean": true}
+{"resourceType": "Patient", "id": "deceased-datetime", "deceasedDateTime": "2022-09-21"}
+{"resourceType": "Patient", "id": "inactive", "active": false}
+{"resourceType": "Patient", "id": "nothing"}
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index ddf1835..4990a6f 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -47,6 +47,12 @@ def test_cubed_output_mode(self):
         # Now do the same test but without any input, to confirm the default is cube
         self.run_study("c_pt_count", test="cubed", prefix="count_", output=None)
 
+    def test_min_bucket_size(self):
+        """Test that the default is 10 and that it cuts out small buckets."""
+        self.run_study(
+            "c_pt_count", test="min-bucket", prefix="count_", output="cube", min_bucket_size=None
+        )
+
     def test_c_pt_deceased_count(self):
         self.run_study("c_pt_deceased_count", prefix="count_")
 
@@ -106,7 +112,12 @@ def setUp(self):
         self.maxDiff = None
 
     def run_study(
-        self, metric: str, test: str = "general", prefix: str = "", output="aggregate"
+        self,
+        metric: str,
+        test: str = "general",
+        prefix: str = "",
+        output: str = "aggregate",
+        min_bucket_size: int = 0,
     ) -> None:
         """Runs a single test case"""
         test_dir = os.path.dirname(__file__)
@@ -160,6 +171,8 @@ def run_study(
                 f"--database={tmpdir}/duck.db",
                 f"--load-ndjson-dir={data_dir}",
             ]
+            if min_bucket_size is not None:
+                args.append(f"--option=min-bucket-size:{min_bucket_size}")
             if output:
                 args.append(f"--option=output-mode:{output}")
             cli.main(args)