dotimplement · jenniferjiangkells · Jun 17, 2024 · Jun 11, 2024 · Jun 12, 2024 · Jun 12, 2024
diff --git a/docs/quickstart.md b/docs/quickstart.md
@@ -45,21 +45,24 @@ A client is a healthcare system object that requests information and processing
 
 A client is typically an EHR system, but we may also support other health objects in the future such as a CPOE (Computerized Ohysician Order Entry).
 
-We can mark a client by using the decorator `@hc.ehr`. You **must** declare a **workflow** for EHR clients, which informs the sandbox how your data will be formatted (See [Use Cases](usecases.md)).
+We can mark a client by using the decorator `@hc.ehr`. You must declare a particular **workflow** for the EHR client, which informs the sandbox how your data will be formatted (See [Use Cases](usecases.md)).
+
+Data returned from the client should be wrapped in a [Pydantic](https://docs.pydantic.dev/latest/) model depending on use case, e.g. `CdsFhirData`.
 
 You can optionally specify if you want more than 1 request generated with the `num` parameter.
 
 ```python
 import healthchain as hc
 from healthchain.use_cases import ClinicalDecisionSupport
+from healthchain.models import CdsFhirData
 
 @hc.sandbox
 class MyCoolSandbox(ClinicalDecisionSupport):
     def __init__(self) -> None:
         pass
 
     @hc.ehr(workflow="patient-view", num=10)
-    def load_data_in_client(self):
+    def load_data_in_client(self) -> CdsFhirData:
         # Do things here to load in your data
         pass
 
@@ -69,9 +72,9 @@ class MyCoolSandbox(ClinicalDecisionSupport):
 ### Data Generator
 Healthcare data is interoperable, but not composable - every deployment site will have different ways of configuring data and terminology. This matters when you develop applications that need to integrate into these systems, especially when you need to reliably extract data for your model to consume.
 
-The aim of the Data Generator is not to generate realistic data suitable for use cases such as patient population studies, but rather to generate data that is structurally compliant with what is expected of EHR configurations, and to be able to test and handle variations in this.
+The aim of the data generator is not to generate realistic data suitable for use cases such as patient population studies, but rather to generate data that is structurally compliant with what is expected of EHR configurations, and to be able to test and handle variations in this.
 
-For this reason the data generator is opiniated by use case and workflow. See [Use Cases](usecases.md).
+For this reason the data generator is opinionated by use case and workflow. See [Use Cases](usecases.md) for more information.
 
 !!! note
     We're aware we may not cover everyone's use cases, so if you have strong opinions about this, please [reach out](https://discord.gg/jG4UWCUh)!
@@ -80,43 +83,54 @@ On the synthetic data spectrum defined by [this UK ONS methodology working paper
 
 ![Synthetic data](assets/synthetic_data_ons.png)
 
-You can use the data generator within a Client function or on its own. The `.data` attribute contains a Pydantic class containing `context` and `resources`.
+You can use the data generator within a client function or on its own. The output of `.generate()` depends on the workflow. For CDS use cases, it returns a `CdsFhirData` model with the `prefetch` field populated with a [Bundle](https://www.hl7.org/fhir/bundle.html) of generated structural synthetic FHIR data.
 
 === "Within client"
     ```python
     import healthchain as hc
-    from healthchain.data_generator import DataGenerator
     from healthchain.use_cases import ClinicalDecisionSupport
+    from healthchain.models import CdsFhirData
+    from healthchain.data_generator import CdsDataGenerator
 
     @hc.sandbox
     class MyCoolSandbox(ClinicalDecisionSupport):
         def __init__(self) -> None:
-            self.data_generator = DataGenerator()
+            self.data_generator = CdsDataGenerator()
 
         @hc.ehr(workflow="patient-view")
-        def load_data_in_client(self):
-            self.data_generator.generate()
-            return self.data_generator.data
+        def load_data_in_client(self) -> CdsFhirData:
+            data = self.data_generator.generate()
+            return data
 
         @hc.api
-        def my_server(self, text):
+        def my_server(self, request) -> None:
             pass
     ```
 
 
 === "On its own"
     ```python
-    from healthchain.data_generator import DataGenerator
+    from healthchain.data_generator import CdsDataGenerator
     from healthchain.base import Workflow
 
     # Initialise data generator
-    data_generator = DataGenerator()
+    data_generator = CdsDataGenerator()
 
     # Generate FHIR resources for use case workflow
     data_generator.set_workflow(Workflow.encounter_discharge)
-    data_generator.generate()
-
-    print(data_generator.data.resources.model_dump(by_alias=True, exclude_unset=True))
+    data = data_generator.generate()
+
+    print(data.model_dump())
+
+    # {
+    #    "prefetch": {
+    #        "entry": [
+    #            {
+    #                "resource": ...
+    #            }
+    #        ]
+    #    }
+    # }
     ```
 
 <!-- You can pass in parameters in `contraint` argument to limit the general form of the FHIR resources you get back, but this feature is experimental. Arguments supported are:
@@ -147,10 +161,12 @@ data_generator.generate(free_text_csv="./dir/to/csv/file")
 
 
 ### Service API
-A service is typically an API of an external AI/NLP system that returns data to the client. This is where you define your application logic - it can be anything from a simple regex to a highly sophisticated LLM agentic workflow. The only constraint is that you have to return your data as a `Dict` that your workflow expects.
+A service is typically an API of an external AI/NLP system that returns data to the client. This is where you define your application logic - it can be anything from a simple regex to a highly sophisticated LLM agentic workflow.
 
 When you decorate a function with `@hc.api` in a sandbox, the function is mounted to a HL7-compliant service endpoint an EHR client can make requests to. This is usually a set of standardised API routes depending on the use case. HealthChain will start a [FastAPI](https://fastapi.tiangolo.com/) server with these APIs pre-defined for you.
 
+Your service function must accept and return models appropriate for your use case. Typically the service function should accept a `Request` model and return a use case specific model, such as a list of `Card` for CDS.
+
 If you are using a model that requires initialisation steps, we recommend you initialise this in your class `__init__`.
 
 === "Transformers"
@@ -161,35 +177,34 @@ If you are using a model that requires initialisation steps, we recommend you in
     import healthchain as hc
 
     from healthchain.use_cases import ClinicalDecisionSupport
-    from healthchain.data_generator import DataGenerator
+    from healthchain.data_generator import CdsDataGenerator
+    from healthchain.models import Card, CDSRequest, CdsFhirData
     from transformers import pipeline
 
-    from typing import Dict
+    from typing import List
 
     @hc.sandbox
     class MyCoolSandbox(ClinicalDecisionSupport):
         def __init__(self):
-            self.data_generator = DataGenerator()
+            self.data_generator = CdsDataGenerator()
             self.pipeline = pipeline('summarization')
 
-        @hc.ehr(workflow="patient-view")
-        def load_data_in_client(self):
-            self.data_generator.generate()
-            return self.data_generator.data
+        @hc.ehr(workflow="patient-view")
+        def load_data_in_client(self) -> CdsFhirData:
+            data = self.data_generator.generate()
+            return data
 
         @hc.api
-        def my_service(self, text: str):
-            results = self.pipeline(text)
-            return {
-                "cards": [
-                    {
-                        "summary": "Patient summary",
-                        "indicator": "info",
-                        "source": {"label": "transformer"},
-                        "detail": results[0]['summary_text']
-                    }
-                ]
-            }
+        def my_service(self, request: CDSRequest) -> List[Card]:
+            results = self.pipeline(str(request.prefetch))
+            return [
+                Card(
+                    summary="Patient summary",
+                    indicator="info",
+                    source={"label": "transformers"},
+                    detail=results[0]['summary_text'],
+                )
+            ]
 
     if __name__ == "__main__":
         cds = MyCoolSandbox()
@@ -203,49 +218,47 @@ If you are using a model that requires initialisation steps, we recommend you in
     import healthchain as hc
 
     from healthchain.use_cases import ClinicalDecisionSupport
-    from healthchain.data_generator import DataGenerator
+    from healthchain.data_generator import CdsDataGenerator
+    from healthchain.models import Card, CdsFhirData, CDSRequest
 
     from langchain_openai import ChatOpenAI
     from langchain_core.prompts import PromptTemplate
     from langchain_core.output_parsers import StrOutputParser
 
-    from typing import Dict
+    from typing import List
 
     @hc.sandbox
     class MyCoolSandbox(ClinicalDecisionSupport):
         def __init__(self):
             self.chain = self._init_llm_chain()
-            self.data_generator = DataGenerator()
+            self.data_generator = CdsDataGenerator()
 
         def _init_llm_chain(self):
             prompt = PromptTemplate.from_template(
                 "Summarize the text below {text}"
-                )
+            )
             model = ChatOpenAI(model="gpt-4o")
             parser = StrOutputParser()
 
             chain = prompt | model | parser
-
             return chain
 
         @hc.ehr(workflow="patient-view")
-        def load_data_in_client(self):
-            self.data_generator.generate()
-            return self.data_generator.data
+        def load_data_in_client(self) -> CdsFhirData:
+            data = self.data_generator.generate()
+            return data
 
         @hc.api
-        def my_service(self, text: str) -> Dict:
-            result = self.chain.invoke(text)
-            return {
-                "cards": [
-                    {
-                        "summary": "Patient summary",
-                        "indicator": "info",
-                        "source": {"label": "openai"},
-                        "detail": result
-                    }
-                ]
-            }
+        def my_service(self, request: CDSRequest) -> List[Card]:
+            result = self.chain.invoke(str(request.prefetch))
+            return [
+                Card(
+                    summary="Patient summary",
+                    indicator="info",
+                    source={"label": "openai"},
+                    detail=result,
+                )
+            ]
 
     if __name__ == "__main__":
         cds = MyCoolSandbox()

diff --git a/healthchain/__init__.py b/healthchain/__init__.py
@@ -1,12 +1,12 @@
 import logging
 from .utils.logger import add_handlers
 from healthchain.decorators import ehr, api, sandbox
-from healthchain.data_generator.data_generator import DataGenerator
+from healthchain.data_generator.data_generator import CdsDataGenerator
 from healthchain.models.requests.cdsrequest import CDSRequest
 
 logger = logging.getLogger(__name__)
 add_handlers(logger)
 logger.setLevel(logging.INFO)
 
 # Export them at the top level
-__all__ = ["ehr", "api", "sandbox", "DataGenerator", "CDSRequest"]
+__all__ = ["ehr", "api", "sandbox", "CdsDataGenerator", "CDSRequest"]
diff --git a/healthchain/data_generator/__init__.py b/healthchain/data_generator/__init__.py
@@ -5,7 +5,7 @@
 from .procedure_generators import ProcedureGenerator
 from .medication_administration_generators import MedicationAdministrationGenerator
 from .medication_request_generators import MedicationRequestGenerator
-from .data_generator import DataGenerator
+from .data_generator import CdsDataGenerator
 
 __all__ = [
     "EncounterGenerator",
@@ -15,5 +15,5 @@
     "ProcedureGenerator",
     "MedicationAdministrationGenerator",
     "MedicationRequestGenerator",
-    "DataGenerator",
+    "CdsDataGenerator",
 ]
diff --git a/healthchain/data_generator/data_generator.py b/healthchain/data_generator/data_generator.py
@@ -1,15 +1,17 @@
+import random
+import json
+
+from pydantic import BaseModel
 from typing import Callable, Optional
+
+from healthchain.base import Workflow
 from healthchain.fhir_resources.bundle_resources import BundleModel, Bundle_EntryModel
 from healthchain.data_generator.base_generators import generator_registry
 from healthchain.fhir_resources.document_reference_resources import (
     DocumentReferenceModel,
 )
 from healthchain.fhir_resources.general_purpose_resources import NarrativeModel
-from healthchain.base import Workflow
-from pydantic import BaseModel
-
-import random
-import json
+from healthchain.models.data.cdsfhirdata import CdsFhirData
 
 
 workflow_mappings = {
@@ -30,16 +32,11 @@
 # TODO: Some of the resources should be allowed to be multiplied
 
 
-class OutputDataModel(BaseModel):
-    context: dict = {}
-    resources: BundleModel
-
-
-class DataGenerator:
+class CdsDataGenerator:
     def __init__(self):
         self.registry = generator_registry
         self.mappings = workflow_mappings
-        self.data = []
+        self.data: CdsFhirData = None
 
     def fetch_generator(self, generator_name: str) -> Callable:
         return self.registry.get(generator_name)
@@ -76,7 +73,7 @@ def generate(
                     resource=random.choice(parsed_free_text[self.workflow.value])
                 )
             )
-        output = OutputDataModel(context={}, resources=BundleModel(entry=results))
+        output = CdsFhirData(prefetch=BundleModel(entry=results))
         self.data = output
         return output
 

diff --git a/healthchain/decorators.py b/healthchain/decorators.py
@@ -14,7 +14,7 @@
 from .base import BaseUseCase, Workflow, UseCaseType
 from .clients import EHRClient
 from .service.service import Service
-from .data_generator.data_generator import DataGenerator
+from .data_generator.data_generator import CdsDataGenerator
 from .utils.apimethod import APIMethod
 from .utils.urlbuilder import UrlBuilder
 
@@ -25,13 +25,13 @@
 
 
 def generate_filename(prefix: str, unique_id: str, index: int):
-    timestamp = datetime.now().strftime("%Y-%m-%d")
-    filename = f"{timestamp}_sandbox_{unique_id}_{prefix}_{index}.json"
+    timestamp = datetime.now().strftime("%Y%m%d%H%M")
+    filename = f"{timestamp}_sandbox_{unique_id[:8]}_{prefix}_{index}.json"
     return filename
 
 
 def save_as_json(data, prefix, sandbox_id, index, save_dir):
-    save_name = generate_filename(prefix, sandbox_id, index)
+    save_name = generate_filename(prefix, str(sandbox_id), index)
     file_path = save_dir / save_name
     with open(file_path, "w") as outfile:
         json.dump(data, outfile, indent=4)
@@ -131,7 +131,7 @@ def wrapper(self, *args: Any, **kwargs: Any) -> EHRClient:
                 )
 
             # Set workflow in data generator if configured
-            data_generator_attributes = find_attributes_of_type(self, DataGenerator)
+            data_generator_attributes = find_attributes_of_type(self, CdsDataGenerator)
             for i in range(len(data_generator_attributes)):
                 attribute_name = data_generator_attributes[i]
                 try:

diff --git a/healthchain/models/__init__.py b/healthchain/models/__init__.py
@@ -0,0 +1,7 @@
+from .requests.cdsrequest import CDSRequest
+from .responses.cdsresponse import Card
+from .responses.cdsresponse import CDSResponse
+from .responses.cdsdiscovery import CDSService
+from .data.cdsfhirdata import CdsFhirData
+
+__all__ = ["CDSRequest", "Card", "CDSResponse", "CDSService", "CdsFhirData"]
diff --git a/healthchain/models/data/cdsfhirdata.py b/healthchain/models/data/cdsfhirdata.py
@@ -0,0 +1,16 @@
+from pydantic import BaseModel, Field
+from typing import Dict
+
+from ...fhir_resources.bundle_resources import BundleModel
+
+
+class CdsFhirData(BaseModel):
+    context: Dict = Field(default={})
+    prefetch: BundleModel
+
+    def model_dump(self, *args, **kwargs):
+        kwargs.setdefault("exclude_unset", True)
+        kwargs.setdefault("exclude_none", True)
+        kwargs.setdefault("by_alias", True)
+
+        return super().model_dump(*args, **kwargs)