From 9355eb9d0d0dc4aecccac95280c0cc956098c633 Mon Sep 17 00:00:00 2001
From: "Luke W. Johnston" <lwjohnst@gmail.com>
Date: Thu, 24 Oct 2024 22:29:40 -0400
Subject: [PATCH 1/5] docs: :memo: pseudo code and docstring for
 `write_resource_parquet()`

Closes #642
---
 .../implementation/python-functions.qmd       |  6 +-
 sprout/core/write_resource_parquet.py         | 79 +++++++++++++++++++
 2 files changed, 80 insertions(+), 5 deletions(-)
 create mode 100644 sprout/core/write_resource_parquet.py

diff --git a/docs/design/implementation/python-functions.qmd b/docs/design/implementation/python-functions.qmd
index 47418b075..56c4e19dc 100644
--- a/docs/design/implementation/python-functions.qmd
+++ b/docs/design/implementation/python-functions.qmd
@@ -215,11 +215,7 @@ fig-alt="A Plant UML schematic of the detailed code flow within the `write_resou
 ::: {.callout-note collapse="true"}
 ### `write_resource_parquet(raw_files, path)`
 
-This function takes the files provided by `raw_files` and merges them
-into a `data.parquet` file provided by `path`. Use
-`path_resource_data()` to provide the correct path location for `path`
-and `path_resource_raw_files()` for the `raw_files` argument. Outputs
-the path object of the created file.
+See `help(write_resource_parquet)` for more details.
 :::
 
 ::: {.callout-note collapse="true"}
diff --git a/sprout/core/write_resource_parquet.py b/sprout/core/write_resource_parquet.py
new file mode 100644
index 000000000..be85333d2
--- /dev/null
+++ b/sprout/core/write_resource_parquet.py
@@ -0,0 +1,79 @@
+# ruff: noqa
+def write_resource_parquet(
+    raw_files_path: list[Path], parquet_path: Path, properties_path: Path
+) -> Path:
+    """Merge all raw resource file(s) and write into a Parquet file.
+
+    This function takes the file(s) provided by `raw_files_path` and merges them into
+    a `data.parquet` file provided by `parquet_path`. While Sprout generally assumes
+    that the files stored in the `resources/raw/` folder have already been
+    verified and validated, this function does some quick verification checks
+    of the data after reading it into Python from the raw file(s) by comparing
+    with the current properties given by the `properties_path`. All data in the
+    `resources/raw/` folder will be merged into one single data object and then
+    written back to the Parquet file. The Parquet file will be overwritten.
+
+    Examples:
+
+        ``` python
+        import seedcase_sprout.core as sp
+
+        sp.write_resource_parquet(
+            raw_files_path=sp.path_resource_raw_files(1, 1),
+            parquet_path=sp.path_resource_data(1, 1),
+            properties_path=sp.path_package_properties(1, 1),
+        )
+        ```
+
+    Args:
+        raw_files_path: A list of paths for all the raw files, mostly commonly stored in the
+            `.csv.gz` format. Use `path_resource_raw_files()` to help provide the
+            correct paths to the raw files.
+        parquet_path: The path to the Use `path_resource_data()` to help provide
+            the correct path location for the resource's `data.parquet` file.
+        properties_path: The path to the properties for this resource. Use
+            `path_package_properties()` to help give the correct location.
+
+    Returns:
+        Outputs the path object of the created Parquet file.
+    """
+    # Not sure if this is the correct way to verify multiple files.
+    [verify_is_file(path) for path in raw_files_path]
+
+    data = read_raw_files(raw_file)
+
+    # Confirms that the data matches the resource properties found in `datapackage.json`.
+    # Not sure if this is the best solution here to load in the properties file.
+    verify_is_file(properties_path)
+    properties = read_json(properties_path)
+
+    # This function could be several, e.g. verify_data_column_types(), verify_data_column_names()?
+    verify_data(data, properties)
+
+    # Could include validation here?
+    # validate_data(data, properties)
+
+    return write_parquet(data, parquet_path)
+
+
+def write_parquet(data: DataFrame, path: Path) -> Path:
+    return path
+
+
+def read_raw_files(paths: list[Path]) -> DataFrame:
+    # Can read gzip files.
+    data_list = [polars.read_csv(path) for path in paths]
+    # Merge them all together.
+    data = polars.concat(data_list)
+    return data
+
+
+def verify_data(data: DataFrame, properties: dict) -> Path:
+    # Compare against the properties for:
+    # - Header names
+    # - Data types
+    # - Number of rows?
+    # - Others?
+
+    # Error if fails, data if pass
+    return data

From b316ae5bcf0674ab7b7783c474fe1b7c0d7e7094 Mon Sep 17 00:00:00 2001
From: "Luke W. Johnston" <lwjohnst@gmail.com>
Date: Mon, 11 Nov 2024 13:48:53 +0100
Subject: [PATCH 2/5] chore: :truck: move file into pseudocode folder

---
 .../design/implementation/pseudocode}/write_resource_parquet.py   | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename {seedcase_sprout/core => docs/design/implementation/pseudocode}/write_resource_parquet.py (100%)

diff --git a/seedcase_sprout/core/write_resource_parquet.py b/docs/design/implementation/pseudocode/write_resource_parquet.py
similarity index 100%
rename from seedcase_sprout/core/write_resource_parquet.py
rename to docs/design/implementation/pseudocode/write_resource_parquet.py

From 63a8ed42d2143401f9e47bd57abe31dd0a07af93 Mon Sep 17 00:00:00 2001
From: "Luke W. Johnston" <lwjohnst@gmail.com>
Date: Thu, 13 Feb 2025 09:21:17 +0100
Subject: [PATCH 3/5] docs: :memo: add Mermaid diagram and rename to `build_`

---
 docs/design/interface/functions.qmd | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/docs/design/interface/functions.qmd b/docs/design/interface/functions.qmd
index 2d340a130..51ffaaf3c 100644
--- a/docs/design/interface/functions.qmd
+++ b/docs/design/interface/functions.qmd
@@ -101,7 +101,7 @@ more details.
 
 ## Data resource functions
 
-### {{< var done >}}`create_resource_structure(path)`
+### {{< var done >}} `create_resource_structure(path)`
 
 See the help documentation with `help(create_resource_structure)` for
 more details.
@@ -127,13 +127,21 @@ flowchart
     function --> out
 ```
 
-### {{< var wip >}} `write_resource_parquet(raw_files, path)`
+### {{< var wip >}} `build_resource_parquet(raw_files_path, resource_properties)`
 
-This function takes the files provided by `raw_files` and merges them
-into a `data.parquet` file provided by `path`. Use
-`path_resource_data()` to provide the correct path location for `path`
-and `path_resource_raw_files()` for the `raw_files` argument. Outputs
-the path object of the created file.
+See the help documentation with `help(build_resource_parquet)` for more
+details.
+
+```{mermaid}
+flowchart
+    in_raw_files_path[/raw_files_path/]
+    in_properties[/resource_properties/]
+    function("build_resource_parquet()")
+    out[("./resources/{id}/data.parquet")]
+    in_raw_files_path --> function
+    in_properties --> function
+    function --> out
+```
 
 ### {{< var wip >}} `edit_resource_properties(path, properties)`
 

From fa28b985e984a0a1bdbc4a208a513cb776d74ec7 Mon Sep 17 00:00:00 2001
From: "Luke W. Johnston" <lwjohnst@gmail.com>
Date: Thu, 13 Feb 2025 09:22:09 +0100
Subject: [PATCH 4/5] docs: :building_construction: updated and finished
 pseudocode for `build_resource_parquet()`

---
 .../pseudocode/write_resource_parquet.py      | 54 +++++++++----------
 1 file changed, 24 insertions(+), 30 deletions(-)

diff --git a/docs/design/implementation/pseudocode/write_resource_parquet.py b/docs/design/implementation/pseudocode/write_resource_parquet.py
index be85333d2..ac30d5d4c 100644
--- a/docs/design/implementation/pseudocode/write_resource_parquet.py
+++ b/docs/design/implementation/pseudocode/write_resource_parquet.py
@@ -1,18 +1,24 @@
 # ruff: noqa
-def write_resource_parquet(
-    raw_files_path: list[Path], parquet_path: Path, properties_path: Path
+def build_resource_parquet(
+    raw_files_path: list[Path], resource_properties: ResourceProperties
 ) -> Path:
     """Merge all raw resource file(s) and write into a Parquet file.
 
     This function takes the file(s) provided by `raw_files_path` and merges them into
-    a `data.parquet` file provided by `parquet_path`. While Sprout generally assumes
+    a `data.parquet` file. The Parquet file will be stored at the path found in `ResourceProperties.path`.
+    While Sprout generally assumes
     that the files stored in the `resources/raw/` folder have already been
     verified and validated, this function does some quick verification checks
     of the data after reading it into Python from the raw file(s) by comparing
-    with the current properties given by the `properties_path`. All data in the
+    with the current properties given by the `resource_properties`. All data in the
     `resources/raw/` folder will be merged into one single data object and then
     written back to the Parquet file. The Parquet file will be overwritten.
 
+    If there are any duplicate observation units in the data, only the most recent
+    observation unit will be kept. This way, if there are any errors or mistakes
+    in older raw files that have been corrected in later files, the mistake can still
+    be kept, but won't impact the data that will actually be used.
+
     Examples:
 
         ``` python
@@ -29,31 +35,23 @@ def write_resource_parquet(
         raw_files_path: A list of paths for all the raw files, mostly commonly stored in the
             `.csv.gz` format. Use `path_resource_raw_files()` to help provide the
             correct paths to the raw files.
-        parquet_path: The path to the Use `path_resource_data()` to help provide
-            the correct path location for the resource's `data.parquet` file.
-        properties_path: The path to the properties for this resource. Use
-            `path_package_properties()` to help give the correct location.
+        resource_properties: The `ResourceProperties` object that contains the properties
+            of the resource you want to create the Parquet file for.
 
     Returns:
         Outputs the path object of the created Parquet file.
     """
     # Not sure if this is the correct way to verify multiple files.
-    [verify_is_file(path) for path in raw_files_path]
-
-    data = read_raw_files(raw_file)
-
-    # Confirms that the data matches the resource properties found in `datapackage.json`.
-    # Not sure if this is the best solution here to load in the properties file.
-    verify_is_file(properties_path)
-    properties = read_json(properties_path)
+    [check_is_file(path) for path in raw_files_path]
+    check_resource_properties(resource_properties)
 
-    # This function could be several, e.g. verify_data_column_types(), verify_data_column_names()?
-    verify_data(data, properties)
+    data = read_raw_files(raw_files_path)
+    data = drop_duplicate_obs_units(data)
 
-    # Could include validation here?
-    # validate_data(data, properties)
+    # This function could be several functions or the one full function.
+    check_data(data, resource_properties)
 
-    return write_parquet(data, parquet_path)
+    return write_parquet(data, resource_properties["path"])
 
 
 def write_parquet(data: DataFrame, path: Path) -> Path:
@@ -68,12 +66,8 @@ def read_raw_files(paths: list[Path]) -> DataFrame:
     return data
 
 
-def verify_data(data: DataFrame, properties: dict) -> Path:
-    # Compare against the properties for:
-    # - Header names
-    # - Data types
-    # - Number of rows?
-    # - Others?
-
-    # Error if fails, data if pass
-    return data
+def drop_duplicate_obs_units(data: DataFrame) -> DataFrame:
+    # Drop duplicated rows, keeping the last occurrence so corrections in later
+    # raw files win (assumes files are read oldest-first -- TODO confirm). Polars
+    # has no `drop_duplicates()`; `unique()` is used. TODO: key on the obs unit.
+    return data.unique(keep="last", maintain_order=True)

From fa590b6c85c08ddeb0d38fe6a66c4b5fb62cffce Mon Sep 17 00:00:00 2001
From: "Luke W. Johnston" <lwjohnst@gmail.com>
Date: Thu, 13 Feb 2025 09:23:02 +0100
Subject: [PATCH 5/5] chore: :truck: rename and move to `interface/`

---
 .../pseudocode/build_resource_parquet.py}                         | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename docs/design/{implementation/pseudocode/write_resource_parquet.py => interface/pseudocode/build_resource_parquet.py} (100%)

diff --git a/docs/design/implementation/pseudocode/write_resource_parquet.py b/docs/design/interface/pseudocode/build_resource_parquet.py
similarity index 100%
rename from docs/design/implementation/pseudocode/write_resource_parquet.py
rename to docs/design/interface/pseudocode/build_resource_parquet.py