Skip to content

Commit

Permalink
Cut out any cube result less than 10 by default.
Browse files Browse the repository at this point in the history
With an option (--option min-bucket-size:3) to change it.
  • Loading branch information
mikix committed Jan 8, 2025
1 parent a09709f commit de3765d
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 15 deletions.
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,15 @@ That is, run it like:
```sh
cumulus-library build --option output-mode:aggregate ...
```

#### Bucket sizes

To help preserve privacy, this study ignores any count results of less than ten.

For example, if there are only two male patients that died at age 55,
that combination of male & 55 will be dropped from the `c_pt_deceased_count` table.

This makes it easier to share the count results with other institutions.
But if that's not a concern and you want the fine-grained details,
you can run the build step with `--option min-bucket-size:0` to turn this feature off.
Or use another value to change the bucket threshold (the default value is 10).
20 changes: 20 additions & 0 deletions cumulus_library_data_metrics/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def prepare_queries(
) -> None:
self.study_prefix = manifest.get_study_prefix()
self.output_mode = self.get_output_mode(config)
self.min_bucket_size = self.get_min_bucket_size(config)
self._query_schema(config)
self.extra_schema_checks(config)
self.add_metric_queries()
Expand All @@ -129,6 +130,21 @@ def get_output_mode(self, config: cumulus_library.StudyConfig) -> str:
output_mode = "cube"
return output_mode

def get_min_bucket_size(self, config: cumulus_library.StudyConfig) -> int:
orig_min_bucket_size = config.options.get("min-bucket-size")
if orig_min_bucket_size is None:
return 10

try:
min_bucket_size = int(orig_min_bucket_size)
except ValueError:
min_bucket_size = -1

if min_bucket_size < 0:
sys.exit(f"Did not understand minimum bucket size '{orig_min_bucket_size}'.")

return min_bucket_size

def render_sql(self, template: str, **kwargs) -> str:
path = os.path.dirname(__file__)

Expand All @@ -147,6 +163,10 @@ def render_sql(self, template: str, **kwargs) -> str:
template = file.read()
loader = jinja2.FileSystemLoader(path)
env = jinja2.Environment(loader=loader).from_string(template) # noqa: S701

# Set global variables that we want easily accessible even in macros
env.globals["min_bucket_size"] = self.min_bucket_size

sql = env.render(**kwargs)
# print(sql)
return sql
33 changes: 19 additions & 14 deletions cumulus_library_data_metrics/utils.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -219,21 +219,26 @@
{% macro make_counts(src_table, mode, unique_ids=null) -%}
{% set cols = caller() %}
{% set unique_ids = unique_ids or ['id'] %}
SELECT
{% if unique_ids|length > 1 %}
COUNT(DISTINCT CONCAT({{ ", ':', ".join(unique_ids) }})) AS cnt,
{% else %}
COUNT(DISTINCT {{ unique_ids[0] }}) AS cnt,
{% endif %}
{{ cols }}
FROM {{ src_table }}
{% if mode == "cube" %}
GROUP BY CUBE(
{% else %}
GROUP BY (
{% endif %}
{{ cols }}
SELECT cnt, {{ cols }}
FROM (
SELECT
{% if unique_ids|length > 1 %}
COUNT(DISTINCT CONCAT({{ ", ':', ".join(unique_ids) }})) AS cnt,
{% else %}
COUNT(DISTINCT {{ unique_ids[0] }}) AS cnt,
{% endif %}
{{ cols }}
FROM {{ src_table }}
{% if mode == "cube" %}
GROUP BY CUBE(
{% else %}
GROUP BY (
{% endif %}
{{ cols }}
)
)
-- Chop off any rows that don't have enough patients to meet privacy needs
WHERE cnt >= {{ min_bucket_size }}
{%- endmacro %}


Expand Down
4 changes: 4 additions & 0 deletions tests/data/c_pt_count/min-bucket/expected.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
cnt,birth_year,administrative_gender,ethnicity,race,deceased,status
13,,,,,,
11,,,,,false,
10,,,,cumulus__none,,
13 changes: 13 additions & 0 deletions tests/data/c_pt_count/min-bucket/patients.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{"resourceType": "Patient", "id": "all-values", "deceasedBoolean": false, "active": true, "gender": "female", "birthDate": "2000-04-04", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2028-9", "system": "urn:oid:2.16.840.1.113883.6.238"}}, {"url": "ombCategory", "valueCoding": {"code": "2106-3", "system": "urn:oid:2.16.840.1.113883.6.238"}}, {"url": "ombCategory", "valueCoding": {"code": "2076-8", "system": "urn:oid:2.16.840.1.113883.6.238"}}, {"url": "ombCategory", "valueCoding": {"code": "2054-5", "system": "urn:oid:2.16.840.1.113883.6.238"}}, {"url": "ombCategory", "valueCoding": {"code": "1002-5", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}, {"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2135-2", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}]}
{"resourceType": "Patient", "id": "unk-race", "active": true, "gender": "other", "birthDate": "2001-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"code": "UNK", "system": "http://terminology.hl7.org/CodeSystem/v3-NullFlavor"}}]}]}
{"resourceType": "Patient", "id": "asku-race", "active": true, "gender": "other", "birthDate": "2001-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"code": "ASKU", "system": "http://terminology.hl7.org/CodeSystem/v3-NullFlavor"}}]}]}
{"resourceType": "Patient", "id": "unexpected-race", "active": true, "gender": "other", "birthDate": "2001-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"code": "race", "system": "weird-system"}}]}]}
{"resourceType": "Patient", "id": "non-hispanic-ethnicity", "active": true, "gender": "other", "birthDate": "2003-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2186-5", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}]}
{"resourceType": "Patient", "id": "unexpected-ethnicity", "active": true, "gender": "other", "birthDate": "2001-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "eth", "system": "weird-system"}}]}]}
{"resourceType": "Patient", "id": "no-extensions", "active": true, "gender": "other", "birthDate": "2001-01-02"}
{"resourceType": "Patient", "id": "no-birthdate", "active": true, "gender": "other", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2135-2", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}]}
{"resourceType": "Patient", "id": "no-gender", "active": true, "birthDate": "1990-04-04", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2135-2", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}]}
{"resourceType": "Patient", "id": "deceased-bool", "deceasedBoolean": true}
{"resourceType": "Patient", "id": "deceased-datetime", "deceasedDateTime": "2022-09-21"}
{"resourceType": "Patient", "id": "inactive", "active": false}
{"resourceType": "Patient", "id": "nothing"}
15 changes: 14 additions & 1 deletion tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ def test_cubed_output_mode(self):
# Now do the same test but without any input, to confirm the default is cube
self.run_study("c_pt_count", test="cubed", prefix="count_", output=None)

def test_min_bucket_size(self):
"""Test that the default is 10 and that it cuts out small buckets."""
self.run_study(
"c_pt_count", test="min-bucket", prefix="count_", output="cube", min_bucket_size=None
)

def test_c_pt_deceased_count(self):
self.run_study("c_pt_deceased_count", prefix="count_")

Expand Down Expand Up @@ -106,7 +112,12 @@ def setUp(self):
self.maxDiff = None

def run_study(
self, metric: str, test: str = "general", prefix: str = "", output="aggregate"
self,
metric: str,
test: str = "general",
prefix: str = "",
output: str = "aggregate",
min_bucket_size: int = 0,
) -> None:
"""Runs a single test case"""
test_dir = os.path.dirname(__file__)
Expand Down Expand Up @@ -160,6 +171,8 @@ def run_study(
f"--database={tmpdir}/duck.db",
f"--load-ndjson-dir={data_dir}",
]
if min_bucket_size is not None:
args.append(f"--option=min-bucket-size:{min_bucket_size}")
if output:
args.append(f"--option=output-mode:{output}")
cli.main(args)
Expand Down

0 comments on commit de3765d

Please sign in to comment.