Scale up step 3 (#145)

* Updated agendas * Rerun pipelines * Updated version * update data export
UMCU-Digital-Health · Dec 4, 2024 · 47ecd9e · 47ecd9e
1 parent cd5975e
commit 47ecd9e
Show file tree

Hide file tree

Showing 14 changed files with 157 additions and 36 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [1.4.10] - 2024-12-03
+
+### Changed
+- Updated agendas to the next scale-up step and reran pipelines
+- Updated data export script to use data from 2016 instead of 2015 and use the PUB publication.
+
+### Added
+- Added a script to export data for model training
+
 ## [1.4.9] - 2024-11-18
 
 ### Changed

diff --git a/data/raw/poliafspraken_no_show.csv.dvc b/data/raw/poliafspraken_no_show.csv.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: 46c186d0fd0d69609b08131997cba338
-  size: 507224267
+- md5: 31983d8a20d487417a9da7cf024c7ecc
+  size: 487004530
   path: poliafspraken_no_show.csv
   hash: md5
diff --git a/data/sql/data_export.sql b/data/sql/data_export.sql
@@ -19,17 +19,17 @@ SELECT APP.identifier_value AS APP_ID
     ,ADDR.[address_postalCodeNumbersNL]
     ,LOC.[name]
     ,LOC.[description]
-FROM [DWH].[models].[HealthcareService] SUBAGENDA JOIN [DWH].[models].[HealthcareService] HOOFDAGENDA
+FROM [PUB].[no_show].[HealthcareService] SUBAGENDA JOIN [PUB].[no_show].[HealthcareService] HOOFDAGENDA
         ON SUBAGENDA.partOf_HealthcareService_value = HOOFDAGENDA.identifier_value AND SUBAGENDA.partOf_HealthcareService_system = HOOFDAGENDA.identifier_system
-    JOIN [DWH].[models].[Appointment] APP 
+    JOIN [PUB].[no_show].[Appointment] APP 
         ON APP.participant_actor_HealthcareService_value = SUBAGENDA.identifier_value AND APP.participant_actor_HealthcareService_system = SUBAGENDA.identifier_system
-    JOIN [DWH].[models].Encounter ENC
+    JOIN [PUB].[no_show].Encounter ENC
         ON ENC.appointment_Appointment_system = APP.identifier_system AND ENC.appointment_Appointment_value = APP.identifier_value
-    LEFT JOIN [DWH].[models].Location LOC
+    LEFT JOIN [PUB].[no_show].Location LOC
         ON ENC.location_Location_system = LOC.identifier_system AND ENC.location_Location_value = LOC.identifier_value
-    JOIN [DWH].[models].[Patient] PAT
+    JOIN [PUB].[no_show].[Patient] PAT
         ON APP.[participant_actor_Patient_value] = PAT.identifier_value
-    LEFT JOIN [DWH].[models].[Patient_Address] ADDR
+    LEFT JOIN [PUB].[no_show].[Patient_Address] ADDR
         ON ADDR.[parent_identifier_value] = PAT.identifier_value 
 WHERE 1=1
     AND SUBAGENDA.active = 1
@@ -115,11 +115,25 @@ WHERE 1=1
             -- Zorglijn Acute en intensieve zorg
             'ZH0297', -- PSY Acute en intensieve zorg
             -- Oncologische urologie
-            'ZH0033' -- B&O Urologische oncologie
+            'ZH0033', -- B&O Urologische oncologie
+            -- Hartfunctie
+            'ZH0116', -- Functie Hart
+            -- Medische oncologie
+            'ZH0028', -- B&O Medische oncologie
+            -- Hematologie
+            'ZH0025', -- B&O Hematologie
+            -- Gynaecologische oncologie
+            'ZH0024', -- B&O Gynaecologische oncologie
+            -- Chirurgische oncologie
+            'ZH0020', -- B&O Chirurgische oncologie
+            -- Hoofd Hals oncologie
+            'ZH0027', -- B&O KNH oncologie
+            -- Functie KNF
+            'ZH0175' -- Functie KNF
         )
     AND APP.identifier_system = 'https://metadata.umcutrecht.nl/ids/HixAgendaAfspraak'
-    AND APP.created >= '2015-01-01'
-    AND APP.created <= '2024-10-01'
+    AND APP.created >= '2016-01-01'
+    AND APP.created <= '2024-11-01'
     AND APP.status <> 'booked'
     AND ENC.identifier_system = 'https://metadata.umcutrecht.nl/ids/HixAgendaAfspraak'
     AND ENC.type2_code NOT IN ('T', 'S', 'M')

diff --git a/data/sql/data_prediction.sql b/data/sql/data_prediction.sql
@@ -137,7 +137,21 @@ WHERE 1=1
         -- Zorglijn Acute en intensieve zorg
         'ZH0297', -- PSY Acute en intensieve zorg
         -- Oncologische urologie
-        'ZH0033' -- B&O Urologische oncologie
+        'ZH0033', -- B&O Urologische oncologie
+        -- Hartfunctie
+        'ZH0116', -- Functie Hart
+        -- Medische oncologie
+        'ZH0028', -- B&O Medische oncologie
+        -- Hematologie
+        'ZH0025', -- B&O Hematologie
+        -- Gynaecologische oncologie
+        'ZH0024', -- B&O Gynaecologische oncologie
+        -- Chirurgische oncologie
+        'ZH0020', -- B&O Chirurgische oncologie
+        -- Hoofd Hals oncologie
+        'ZH0027', -- B&O KNH oncologie
+        -- Functie KNF
+        'ZH0175' -- Functie KNF
     )
     AND APP.identifier_system = 'https://metadata.umcutrecht.nl/ids/HixAgendaAfspraak'
     AND APP.[created] >= '2015-01-01'
@@ -240,7 +254,21 @@ WHERE 1=1
                 -- Zorglijn Acute en intensieve zorg
                 'ZH0297', -- PSY Acute en intensieve zorg
                 -- Oncologische urologie
-                'ZH0033' -- B&O Urologische oncologie
+                'ZH0033', -- B&O Urologische oncologie
+                -- Hartfunctie
+                'ZH0116', -- Functie Hart
+                -- Medische oncologie
+                'ZH0028', -- B&O Medische oncologie
+                -- Hematologie
+                'ZH0025', -- B&O Hematologie
+                -- Gynaecologische oncologie
+                'ZH0024', -- B&O Gynaecologische oncologie
+                -- Chirurgische oncologie
+                'ZH0020', -- B&O Chirurgische oncologie
+                -- Hoofd Hals oncologie
+                'ZH0027', -- B&O KNH oncologie
+                -- Functie KNF
+                'ZH0175' -- Functie KNF
             )
             AND APP2.identifier_system = 'https://metadata.umcutrecht.nl/ids/HixAgendaAfspraak'
             AND CONVERT(DATE, APP2.[start]) = @start_date

diff --git a/dvc.lock b/dvc.lock
@@ -5,17 +5,17 @@ stages:
     deps:
     - path: data/processed/featuretable.parquet
       hash: md5
-      md5: 86589d93563f0a2fdc2373a7bf487732
-      size: 94648247
+      md5: 4e754146b5f7ea804ab79691e19038d4
+      size: 100497640
     - path: src/noshow/model/train_model.py
       hash: md5
-      md5: 00964a947199825f721ebbbe0bb23da6
-      size: 3708
+      md5: 12587629eddb081d94bc997c8d100dd4
+      size: 3713
     outs:
     - path: output/models/no_show_model_cv.pickle
       hash: md5
-      md5: 6517c5261b1eb56b2297ae827c37262f
-      size: 1156419
+      md5: dfc2275999ae50e74a9ab8459372ddfa
+      size: 1939776
   feature_building:
     cmd: python src/noshow/features/feature_pipeline.py
     deps:
@@ -24,14 +24,14 @@ stages:
       size: 279455
     - path: data/raw/poliafspraken_no_show.csv
       hash: md5
-      md5: 46c186d0fd0d69609b08131997cba338
-      size: 507224267
+      md5: 31983d8a20d487417a9da7cf024c7ecc
+      size: 487004530
     - path: src/noshow/features/feature_pipeline.py
       hash: md5
       md5: 71ffb7a162976bde11e0aed72ea19f98
       size: 2889
     outs:
     - path: data/processed/featuretable.parquet
       hash: md5
-      md5: 86589d93563f0a2fdc2373a7bf487732
-      size: 94648247
+      md5: 4e754146b5f7ea804ab79691e19038d4
+      size: 100497640
diff --git a/output/dvclive/metrics.json b/output/dvclive/metrics.json
@@ -1,7 +1,7 @@
 {
-    "best_score": 0.7424015243279327,
-    "mean_roc_auc": 0.7424015243279327,
-    "std_roc_auc": 0.007613920544359392,
-    "mean_precision": 0.562214765652999,
-    "mean_recall": 0.01428090066737743
+    "best_score": 0.7480471226772029,
+    "mean_roc_auc": 0.7480471226772029,
+    "std_roc_auc": 0.005831461640562374,
+    "mean_precision": 0.5541110092343222,
+    "mean_recall": 0.014451354691335214
 }
diff --git a/output/dvclive/plots/metrics/best_score.tsv b/output/dvclive/plots/metrics/best_score.tsv
@@ -1,2 +1,2 @@
 step	best_score
-0	0.7424015243279327
+0	0.7480471226772029
diff --git a/output/dvclive/plots/metrics/mean_precision.tsv b/output/dvclive/plots/metrics/mean_precision.tsv
@@ -1,2 +1,2 @@
 step	mean_precision
-0	0.562214765652999
+0	0.5541110092343222
diff --git a/output/dvclive/plots/metrics/mean_recall.tsv b/output/dvclive/plots/metrics/mean_recall.tsv
@@ -1,2 +1,2 @@
 step	mean_recall
-0	0.01428090066737743
+0	0.014451354691335214
diff --git a/output/dvclive/plots/metrics/mean_roc_auc.tsv b/output/dvclive/plots/metrics/mean_roc_auc.tsv
@@ -1,2 +1,2 @@
 step	mean_roc_auc
-0	0.7424015243279327
+0	0.7480471226772029
diff --git a/output/dvclive/plots/metrics/std_roc_auc.tsv b/output/dvclive/plots/metrics/std_roc_auc.tsv
@@ -1,2 +1,2 @@
 step	std_roc_auc
-0	0.007613920544359392
+0	0.005831461640562374
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "noshow"
-version = "1.4.9"
+version = "1.4.10"
 authors = [
   { name="Ruben Peters", email="[email protected]" },
   { name="Eric Wolters", email="[email protected]" }

diff --git a/run/config/config.toml.dvc b/run/config/config.toml.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: 6ade00b7f8bece964bde000c8e982f9e
-  size: 14933
+- md5: 922735dc7dbcb39f8ab28e8cb7bb1297
+  size: 18980
   hash: md5
   path: config.toml
diff --git a/src/noshow/database/export.py b/src/noshow/database/export.py
@@ -0,0 +1,70 @@
+# Module to export data from dataplatform to a csv file to train the model
+# uses the export query in data/sql/data_export.sql
+import csv
+import logging
+from pathlib import Path
+
+from sqlalchemy import text
+
+from noshow.database.connection import get_connection_string, get_engine
+
+logger = logging.getLogger(__name__)
+
+
+def export_data(
+    db_host: str = "dataplatform",
+    db_database: str = "PUB",
+    output_path: str = "poliafspraken_no_show.csv",
+    batch_size: int = 10_000,
+):
+    """Function to efficiently export data from the dataplatform to a csv file
+
+    Used to export data to train the model. The data is exported in batches to
+    avoid memory issues.
+
+    Parameters
+    ----------
+    db_host : str, optional
+        hostname of the database server, by default "dataplatform"
+    db_database : str, optional
+        Name of the database, by default "PUB"
+    output_path : str, optional
+        Name of the output file, located in the data/raw folder,
+        by default "poliafspraken_no_show.csv"
+    batch_size : int, optional
+        batch size for reading from query result and writing to csv, by default 10_000
+    """
+    connection_string = get_connection_string(db_database=db_database, db_host=db_host)
+    with open(Path(__file__).parents[3] / "data/sql/data_export.sql") as f:
+        sql_query = f.read()
+
+    output_csv = Path(__file__).parents[3] / "data/raw" / output_path
+
+    db_engine = get_engine(connection_string)
+    with db_engine.connect() as conn:
+        logger.info("Executing export query...")
+        result = conn.execution_options(stream_results=True).execute(text(sql_query))
+        logger.info("Export query executed successfully")
+
+        with open(output_csv, "w", newline="") as csvfile:
+            writer = csv.writer(csvfile)
+
+            # Write the header row
+            writer.writerow(result.keys())
+
+            # Write data in batches
+            while True:
+                rows = result.fetchmany(batch_size)
+                if not rows:
+                    break
+                writer.writerows(rows)
+
+    logger.info(f"Data exported to {output_csv}")
+
+
+if __name__ == "__main__":
+    logging.basicConfig(
+        level=logging.DEBUG,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    )
+    export_data()