Skip to content

Commit

Permalink
Scale up step 3 (#145)
Browse files Browse the repository at this point in the history
* Updated agendas

* Rerun pipelines

* Updated version

* update data export
  • Loading branch information
rubenpeters91 authored Dec 4, 2024
1 parent cd5975e commit 47ecd9e
Show file tree
Hide file tree
Showing 14 changed files with 157 additions and 36 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [1.4.10] - 2024-12-03

### Changed
- Updated agendas to next scale up step and rerun pipelines
- Updated data export script to use data from 2016 instead of 2015 and use the PUB publication.

### Added
- Added a script to export data for model training

## [1.4.9] - 2024-11-18

### Changed
Expand Down
4 changes: 2 additions & 2 deletions data/raw/poliafspraken_no_show.csv.dvc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
outs:
- md5: 46c186d0fd0d69609b08131997cba338
size: 507224267
- md5: 31983d8a20d487417a9da7cf024c7ecc
size: 487004530
path: poliafspraken_no_show.csv
hash: md5
32 changes: 23 additions & 9 deletions data/sql/data_export.sql
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,17 @@ SELECT APP.identifier_value AS APP_ID
,ADDR.[address_postalCodeNumbersNL]
,LOC.[name]
,LOC.[description]
FROM [DWH].[models].[HealthcareService] SUBAGENDA JOIN [DWH].[models].[HealthcareService] HOOFDAGENDA
FROM [PUB].[no_show].[HealthcareService] SUBAGENDA JOIN [PUB].[no_show].[HealthcareService] HOOFDAGENDA
ON SUBAGENDA.partOf_HealthcareService_value = HOOFDAGENDA.identifier_value AND SUBAGENDA.partOf_HealthcareService_system = HOOFDAGENDA.identifier_system
JOIN [DWH].[models].[Appointment] APP
JOIN [PUB].[no_show].[Appointment] APP
ON APP.participant_actor_HealthcareService_value = SUBAGENDA.identifier_value AND APP.participant_actor_HealthcareService_system = SUBAGENDA.identifier_system
JOIN [DWH].[models].Encounter ENC
JOIN [PUB].[no_show].Encounter ENC
ON ENC.appointment_Appointment_system = APP.identifier_system AND ENC.appointment_Appointment_value = APP.identifier_value
LEFT JOIN [DWH].[models].Location LOC
LEFT JOIN [PUB].[no_show].Location LOC
ON ENC.location_Location_system = LOC.identifier_system AND ENC.location_Location_value = LOC.identifier_value
JOIN [DWH].[models].[Patient] PAT
JOIN [PUB].[no_show].[Patient] PAT
ON APP.[participant_actor_Patient_value] = PAT.identifier_value
LEFT JOIN [DWH].[models].[Patient_Address] ADDR
LEFT JOIN [PUB].[no_show].[Patient_Address] ADDR
ON ADDR.[parent_identifier_value] = PAT.identifier_value
WHERE 1=1
AND SUBAGENDA.active = 1
Expand Down Expand Up @@ -115,11 +115,25 @@ WHERE 1=1
-- Zorglijn Acute en intensieve zorg
'ZH0297', -- PSY Acute en intensieve zorg
-- Oncologische urologie
'ZH0033' -- B&O Urologische oncologie
'ZH0033', -- B&O Urologische oncologie
-- Hartfunctie
'ZH0116', -- Functie Hart
-- Medische oncologie
'ZH0028', -- B&O Medische oncologie
-- Hematologie
'ZH0025', -- B&O Hematologie
-- Gynaecologische oncologie
'ZH0024', -- B&O Gynaecologische oncologie
-- Chirurgische oncologie
'ZH0020', -- B&O Chirurgische oncologie
-- Hoofd Hals oncologie
'ZH0027', -- B&O KNH oncologie
-- Functie KNF
'ZH0175' -- Functie KNF
)
AND APP.identifier_system = 'https://metadata.umcutrecht.nl/ids/HixAgendaAfspraak'
AND APP.created >= '2015-01-01'
AND APP.created <= '2024-10-01'
AND APP.created >= '2016-01-01'
AND APP.created <= '2024-11-01'
AND APP.status <> 'booked'
AND ENC.identifier_system = 'https://metadata.umcutrecht.nl/ids/HixAgendaAfspraak'
AND ENC.type2_code NOT IN ('T', 'S', 'M')
Expand Down
32 changes: 30 additions & 2 deletions data/sql/data_prediction.sql
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,21 @@ WHERE 1=1
-- Zorglijn Acute en intensieve zorg
'ZH0297', -- PSY Acute en intensieve zorg
-- Oncologische urologie
'ZH0033' -- B&O Urologische oncologie
'ZH0033', -- B&O Urologische oncologie
-- Hartfunctie
'ZH0116', -- Functie Hart
-- Medische oncologie
'ZH0028', -- B&O Medische oncologie
-- Hematologie
'ZH0025', -- B&O Hematologie
-- Gynaecologische oncologie
'ZH0024', -- B&O Gynaecologische oncologie
-- Chirurgische oncologie
'ZH0020', -- B&O Chirurgische oncologie
-- Hoofd Hals oncologie
'ZH0027', -- B&O KNH oncologie
-- Functie KNF
'ZH0175' -- Functie KNF
)
AND APP.identifier_system = 'https://metadata.umcutrecht.nl/ids/HixAgendaAfspraak'
AND APP.[created] >= '2015-01-01'
Expand Down Expand Up @@ -240,7 +254,21 @@ WHERE 1=1
-- Zorglijn Acute en intensieve zorg
'ZH0297', -- PSY Acute en intensieve zorg
-- Oncologische urologie
'ZH0033' -- B&O Urologische oncologie
'ZH0033', -- B&O Urologische oncologie
-- Hartfunctie
'ZH0116', -- Functie Hart
-- Medische oncologie
'ZH0028', -- B&O Medische oncologie
-- Hematologie
'ZH0025', -- B&O Hematologie
-- Gynaecologische oncologie
'ZH0024', -- B&O Gynaecologische oncologie
-- Chirurgische oncologie
'ZH0020', -- B&O Chirurgische oncologie
-- Hoofd Hals oncologie
'ZH0027', -- B&O KNH oncologie
-- Functie KNF
'ZH0175' -- Functie KNF
)
AND APP2.identifier_system = 'https://metadata.umcutrecht.nl/ids/HixAgendaAfspraak'
AND CONVERT(DATE, APP2.[start]) = @start_date
Expand Down
20 changes: 10 additions & 10 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@ stages:
deps:
- path: data/processed/featuretable.parquet
hash: md5
md5: 86589d93563f0a2fdc2373a7bf487732
size: 94648247
md5: 4e754146b5f7ea804ab79691e19038d4
size: 100497640
- path: src/noshow/model/train_model.py
hash: md5
md5: 00964a947199825f721ebbbe0bb23da6
size: 3708
md5: 12587629eddb081d94bc997c8d100dd4
size: 3713
outs:
- path: output/models/no_show_model_cv.pickle
hash: md5
md5: 6517c5261b1eb56b2297ae827c37262f
size: 1156419
md5: dfc2275999ae50e74a9ab8459372ddfa
size: 1939776
feature_building:
cmd: python src/noshow/features/feature_pipeline.py
deps:
Expand All @@ -24,14 +24,14 @@ stages:
size: 279455
- path: data/raw/poliafspraken_no_show.csv
hash: md5
md5: 46c186d0fd0d69609b08131997cba338
size: 507224267
md5: 31983d8a20d487417a9da7cf024c7ecc
size: 487004530
- path: src/noshow/features/feature_pipeline.py
hash: md5
md5: 71ffb7a162976bde11e0aed72ea19f98
size: 2889
outs:
- path: data/processed/featuretable.parquet
hash: md5
md5: 86589d93563f0a2fdc2373a7bf487732
size: 94648247
md5: 4e754146b5f7ea804ab79691e19038d4
size: 100497640
10 changes: 5 additions & 5 deletions output/dvclive/metrics.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"best_score": 0.7424015243279327,
"mean_roc_auc": 0.7424015243279327,
"std_roc_auc": 0.007613920544359392,
"mean_precision": 0.562214765652999,
"mean_recall": 0.01428090066737743
"best_score": 0.7480471226772029,
"mean_roc_auc": 0.7480471226772029,
"std_roc_auc": 0.005831461640562374,
"mean_precision": 0.5541110092343222,
"mean_recall": 0.014451354691335214
}
2 changes: 1 addition & 1 deletion output/dvclive/plots/metrics/best_score.tsv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
step best_score
0 0.7424015243279327
0 0.7480471226772029
2 changes: 1 addition & 1 deletion output/dvclive/plots/metrics/mean_precision.tsv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
step mean_precision
0 0.562214765652999
0 0.5541110092343222
2 changes: 1 addition & 1 deletion output/dvclive/plots/metrics/mean_recall.tsv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
step mean_recall
0 0.01428090066737743
0 0.014451354691335214
2 changes: 1 addition & 1 deletion output/dvclive/plots/metrics/mean_roc_auc.tsv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
step mean_roc_auc
0 0.7424015243279327
0 0.7480471226772029
2 changes: 1 addition & 1 deletion output/dvclive/plots/metrics/std_roc_auc.tsv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
step std_roc_auc
0 0.007613920544359392
0 0.005831461640562374
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "noshow"
version = "1.4.9"
version = "1.4.10"
authors = [
{ name="Ruben Peters", email="[email protected]" },
{ name="Eric Wolters", email="[email protected]" }
Expand Down
4 changes: 2 additions & 2 deletions run/config/config.toml.dvc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
outs:
- md5: 6ade00b7f8bece964bde000c8e982f9e
size: 14933
- md5: 922735dc7dbcb39f8ab28e8cb7bb1297
size: 18980
hash: md5
path: config.toml
70 changes: 70 additions & 0 deletions src/noshow/database/export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Module to export data from dataplatform to a csv file to train the model
# uses the export query in data/sql/data_export.sql
import csv
import logging
from pathlib import Path

from sqlalchemy import text

from noshow.database.connection import get_connection_string, get_engine

logger = logging.getLogger(__name__)


def export_data(
    db_host: str = "dataplatform",
    db_database: str = "PUB",
    output_path: str = "poliafspraken_no_show.csv",
    batch_size: int = 10_000,
) -> None:
    """Export the model training data from the dataplatform to a csv file.

    Runs the query stored in ``data/sql/data_export.sql`` and writes the
    result (header row first) to ``data/raw/<output_path>``. Rows are fetched
    and written in batches so the full result set never has to be held in
    memory at once.

    Parameters
    ----------
    db_host : str, optional
        hostname of the database server, by default "dataplatform"
    db_database : str, optional
        Name of the database, by default "PUB"
    output_path : str, optional
        Name of the output file, located in the data/raw folder,
        by default "poliafspraken_no_show.csv"
    batch_size : int, optional
        batch size for reading from query result and writing to csv,
        by default 10_000

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    # Fail early, before any connection is opened, on a batch size that
    # cannot be used to page through the result.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # This module lives in src/noshow/database/, so three levels above its
    # directory is the repository root that holds the data/ folder.
    project_root = Path(__file__).parents[3]

    connection_string = get_connection_string(db_database=db_database, db_host=db_host)
    with open(project_root / "data/sql/data_export.sql") as f:
        sql_query = f.read()

    output_csv = project_root / "data/raw" / output_path

    db_engine = get_engine(connection_string)
    with db_engine.connect() as conn:
        logger.info("Executing export query...")
        # stream_results asks the driver for a streaming (server-side) result
        # instead of buffering every row on the client.
        result = conn.execution_options(stream_results=True).execute(text(sql_query))
        logger.info("Export query executed successfully")

        # The result has to be consumed while the connection is still open.
        # newline="" is what the csv module expects for files it writes to.
        with open(output_csv, "w", newline="") as csvfile:
            writer = csv.writer(csvfile)

            # Write the header row
            writer.writerow(result.keys())

            # Write data in batches until the result is exhausted
            while True:
                rows = result.fetchmany(batch_size)
                if not rows:
                    break
                writer.writerows(rows)

    logger.info("Data exported to %s", output_csv)


if __name__ == "__main__":
    # Script entry point: log everything from DEBUG upwards, prefixed with
    # timestamp, logger name and level, then run the export with its defaults.
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(format=log_format, level=logging.DEBUG)
    export_data()

0 comments on commit 47ecd9e

Please sign in to comment.