Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cut out any cube result less than 10 by default. #69

Merged
merged 1 commit into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,15 @@ That is, run it like:
```sh
cumulus-library build --option output-mode:aggregate ...
```

#### Bucket sizes

To help preserve privacy, this study ignores any count results of less than ten.

For example, if there are only two male patients that died at age 55,
that combination of male & 55 will be dropped from the `c_pt_deceased_count` table.

This makes it easier to share the count results with other institutions.
But if that's not a concern and you want the fine-grained details,
you can run the build step with `--option min-bucket-size:0` to turn this feature off.
Or use another value to change the bucket threshold (the default value is 10).
20 changes: 20 additions & 0 deletions cumulus_library_data_metrics/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def prepare_queries(
) -> None:
self.study_prefix = manifest.get_study_prefix()
self.output_mode = self.get_output_mode(config)
self.min_bucket_size = self.get_min_bucket_size(config)
self._query_schema(config)
self.extra_schema_checks(config)
self.add_metric_queries()
Expand All @@ -129,6 +130,21 @@ def get_output_mode(self, config: cumulus_library.StudyConfig) -> str:
output_mode = "cube"
return output_mode

def get_min_bucket_size(self, config: cumulus_library.StudyConfig) -> int:
orig_min_bucket_size = config.options.get("min-bucket-size")
if orig_min_bucket_size is None:
return 10

try:
min_bucket_size = int(orig_min_bucket_size)
except ValueError:
min_bucket_size = -1

if min_bucket_size < 0:
sys.exit(f"Did not understand minimum bucket size '{orig_min_bucket_size}'.")

return min_bucket_size

def render_sql(self, template: str, **kwargs) -> str:
path = os.path.dirname(__file__)

Expand All @@ -147,6 +163,10 @@ def render_sql(self, template: str, **kwargs) -> str:
template = file.read()
loader = jinja2.FileSystemLoader(path)
env = jinja2.Environment(loader=loader).from_string(template) # noqa: S701

# Set global variables that we want easily accessible even in macros
env.globals["min_bucket_size"] = self.min_bucket_size

sql = env.render(**kwargs)
# print(sql)
return sql
33 changes: 19 additions & 14 deletions cumulus_library_data_metrics/utils.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -219,21 +219,26 @@
{% macro make_counts(src_table, mode, unique_ids=null) -%}
{% set cols = caller() %}
{% set unique_ids = unique_ids or ['id'] %}
SELECT
{% if unique_ids|length > 1 %}
COUNT(DISTINCT CONCAT({{ ", ':', ".join(unique_ids) }})) AS cnt,
{% else %}
COUNT(DISTINCT {{ unique_ids[0] }}) AS cnt,
{% endif %}
{{ cols }}
FROM {{ src_table }}
{% if mode == "cube" %}
GROUP BY CUBE(
{% else %}
GROUP BY (
{% endif %}
{{ cols }}
SELECT cnt, {{ cols }}
FROM (
SELECT
{% if unique_ids|length > 1 %}
COUNT(DISTINCT CONCAT({{ ", ':', ".join(unique_ids) }})) AS cnt,
{% else %}
COUNT(DISTINCT {{ unique_ids[0] }}) AS cnt,
{% endif %}
{{ cols }}
FROM {{ src_table }}
{% if mode == "cube" %}
GROUP BY CUBE(
{% else %}
GROUP BY (
{% endif %}
{{ cols }}
)
)
-- Chop off any rows that don't have enough patients to meet privacy needs
WHERE cnt >= {{ min_bucket_size }}
{%- endmacro %}


Expand Down
4 changes: 4 additions & 0 deletions tests/data/c_pt_count/min-bucket/expected.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
cnt,birth_year,administrative_gender,ethnicity,race,deceased,status
13,,,,,,
11,,,,,false,
10,,,,cumulus__none,,
13 changes: 13 additions & 0 deletions tests/data/c_pt_count/min-bucket/patients.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{"resourceType": "Patient", "id": "all-values", "deceasedBoolean": false, "active": true, "gender": "female", "birthDate": "2000-04-04", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2028-9", "system": "urn:oid:2.16.840.1.113883.6.238"}}, {"url": "ombCategory", "valueCoding": {"code": "2106-3", "system": "urn:oid:2.16.840.1.113883.6.238"}}, {"url": "ombCategory", "valueCoding": {"code": "2076-8", "system": "urn:oid:2.16.840.1.113883.6.238"}}, {"url": "ombCategory", "valueCoding": {"code": "2054-5", "system": "urn:oid:2.16.840.1.113883.6.238"}}, {"url": "ombCategory", "valueCoding": {"code": "1002-5", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}, {"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2135-2", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}]}
{"resourceType": "Patient", "id": "unk-race", "active": true, "gender": "other", "birthDate": "2001-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"code": "UNK", "system": "http://terminology.hl7.org/CodeSystem/v3-NullFlavor"}}]}]}
{"resourceType": "Patient", "id": "asku-race", "active": true, "gender": "other", "birthDate": "2001-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"code": "ASKU", "system": "http://terminology.hl7.org/CodeSystem/v3-NullFlavor"}}]}]}
{"resourceType": "Patient", "id": "unexpected-race", "active": true, "gender": "other", "birthDate": "2001-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"code": "race", "system": "weird-system"}}]}]}
{"resourceType": "Patient", "id": "non-hispanic-ethnicity", "active": true, "gender": "other", "birthDate": "2003-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2186-5", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}]}
{"resourceType": "Patient", "id": "unexpected-ethnicity", "active": true, "gender": "other", "birthDate": "2001-01-02", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "eth", "system": "weird-system"}}]}]}
{"resourceType": "Patient", "id": "no-extensions", "active": true, "gender": "other", "birthDate": "2001-01-02"}
{"resourceType": "Patient", "id": "no-birthdate", "active": true, "gender": "other", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2135-2", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}]}
{"resourceType": "Patient", "id": "no-gender", "active": true, "birthDate": "1990-04-04", "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"code": "2135-2", "system": "urn:oid:2.16.840.1.113883.6.238"}}]}]}
{"resourceType": "Patient", "id": "deceased-bool", "deceasedBoolean": true}
{"resourceType": "Patient", "id": "deceased-datetime", "deceasedDateTime": "2022-09-21"}
{"resourceType": "Patient", "id": "inactive", "active": false}
{"resourceType": "Patient", "id": "nothing"}
18 changes: 17 additions & 1 deletion tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,15 @@ def test_cubed_output_mode(self):
# Now do the same test but without any input, to confirm the default is cube
self.run_study("c_pt_count", test="cubed", prefix="count_", output=None)

def test_min_bucket_size(self):
"""Test that the default is 10 and that it cuts out small buckets."""
self.run_study(
"c_pt_count", test="min-bucket", prefix="count_", output="cube", min_bucket_size=None
)
with self.assertRaises(SystemExit) as cm:
self.run_study("c_pt_count", test="min-bucket", min_bucket_size="NaN")
self.assertEqual(cm.exception.code, "Did not understand minimum bucket size 'NaN'.")

def test_c_pt_deceased_count(self):
self.run_study("c_pt_deceased_count", prefix="count_")

Expand Down Expand Up @@ -106,7 +115,12 @@ def setUp(self):
self.maxDiff = None

def run_study(
self, metric: str, test: str = "general", prefix: str = "", output="aggregate"
self,
metric: str,
test: str = "general",
prefix: str = "",
output: str = "aggregate",
min_bucket_size: int = 0,
) -> None:
"""Runs a single test case"""
test_dir = os.path.dirname(__file__)
Expand Down Expand Up @@ -160,6 +174,8 @@ def run_study(
f"--database={tmpdir}/duck.db",
f"--load-ndjson-dir={data_dir}",
]
if min_bucket_size is not None:
args.append(f"--option=min-bucket-size:{min_bucket_size}")
if output:
args.append(f"--option=output-mode:{output}")
cli.main(args)
Expand Down
Loading