From 9355eb9d0d0dc4aecccac95280c0cc956098c633 Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Thu, 24 Oct 2024 22:29:40 -0400 Subject: [PATCH 1/5] docs: :memo: pseudo code and docstring for `write_resource_parquet()` Closes #642 --- .../implementation/python-functions.qmd | 6 +- sprout/core/write_resource_parquet.py | 79 +++++++++++++++++++ 2 files changed, 80 insertions(+), 5 deletions(-) create mode 100644 sprout/core/write_resource_parquet.py diff --git a/docs/design/implementation/python-functions.qmd b/docs/design/implementation/python-functions.qmd index 47418b075..56c4e19dc 100644 --- a/docs/design/implementation/python-functions.qmd +++ b/docs/design/implementation/python-functions.qmd @@ -215,11 +215,7 @@ fig-alt="A Plant UML schematic of the detailed code flow within the `write_resou ::: {.callout-note collapse="true"} ### `write_resource_parquet(raw_files, path)` -This function takes the files provided by `raw_files` and merges them -into a `data.parquet` file provided by `path`. Use -`path_resource_data()` to provide the correct path location for `path` -and `path_resource_raw_files()` for the `raw_files` argument. Outputs -the path object of the created file. +See `help(write_resource_parquet)` for more details. ::: ::: {.callout-note collapse="true"} diff --git a/sprout/core/write_resource_parquet.py b/sprout/core/write_resource_parquet.py new file mode 100644 index 000000000..be85333d2 --- /dev/null +++ b/sprout/core/write_resource_parquet.py @@ -0,0 +1,79 @@ +# ruff: noqa +def write_resource_parquet( + raw_files_path: list[Path], parquet_path: Path, properties_path: Path +) -> Path: + """Merge all raw resource file(s) and write into a Parquet file. + + This function takes the file(s) provided by `raw_files_path` and merges them into + a `data.parquet` file provided by `parquet_path`. 
While Sprout generally assumes + that the files stored in the `resources/raw/` folder have already been + verified and validated, this function does some quick verification checks + of the data after reading it into Python from the raw file(s) by comparing + with the current properties given by the `properties_path`. All data in the + `resources/raw/` folder will be merged into one single data object and then + written back to the Parquet file. The Parquet file will be overwritten. + + Examples: + + ``` python + import seedcase_sprout.core as sp + + sp.write_resource_parquet( + raw_files_path=sp.path_resources_raw_files(1, 1), + parquet_path=sp.path_resource_data(1, 1), + properties_path=sp.path_package_properties(1, 1), + ) + ``` + + Args: + raw_files_path: A list of paths for all the raw files, mostly commonly stored in the + `.csv.gz` format. Use `path_resource_raw_files()` to help provide the + correct paths to the raw files. + parquet_path: The path to the Use `path_resource_data()` to help provide + the correct path location for the resource's `data.parquet` file. + properties_path: The path to the properties for this resource. Use + `path_package_properties()` to help give the correct location. + + Returns: + Outputs the path object of the created Parquet file. + """ + # Not sure if this is the correct way to verify multiple files. + [verify_is_file(path) for path in raw_files_path] + + data = read_raw_files(raw_file) + + # Confirms that the data matches the resource properties found in `datapackage.json`. + # Not sure if this is the best solution here to load in the properties file. + verify_is_file(properties_path) + properties = read_json(properties_path) + + # This function could be several, e.g. verify_data_column_types(), verify_data_column_names()? + verify_data(data, properties) + + # Could include validation here? 
+ # validate_data(data, properties) + + return write_parquet(data, parquet_path) + + +def write_parquet(data: DataFrame, path: Path) -> Path: + return path + + +def read_raw_files(paths: list[Path]) -> DataFrame: + # Can read gzip files. + data_list = [polars.read_csv(path) for path in paths] + # Merge them all together. + data = polars.concat(data_list) + return data + + +def verify_data(data: DataFrame, properties: dict) -> Path: + # Compare against the properties for: + # - Header names + # - Data types + # - Number of rows? + # - Others? + + # Error if fails, data if pass + return data From b316ae5bcf0674ab7b7783c474fe1b7c0d7e7094 Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Mon, 11 Nov 2024 13:48:53 +0100 Subject: [PATCH 2/5] chore: :truck: move file into pseudocode folder --- .../design/implementation/pseudocode}/write_resource_parquet.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {seedcase_sprout/core => docs/design/implementation/pseudocode}/write_resource_parquet.py (100%) diff --git a/seedcase_sprout/core/write_resource_parquet.py b/docs/design/implementation/pseudocode/write_resource_parquet.py similarity index 100% rename from seedcase_sprout/core/write_resource_parquet.py rename to docs/design/implementation/pseudocode/write_resource_parquet.py From 63a8ed42d2143401f9e47bd57abe31dd0a07af93 Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Thu, 13 Feb 2025 09:21:17 +0100 Subject: [PATCH 3/5] docs: :memo: add Mermaid diagram and rename to `build_` --- docs/design/interface/functions.qmd | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/docs/design/interface/functions.qmd b/docs/design/interface/functions.qmd index 2d340a130..51ffaaf3c 100644 --- a/docs/design/interface/functions.qmd +++ b/docs/design/interface/functions.qmd @@ -101,7 +101,7 @@ more details. 
## Data resource functions -### {{< var done >}}`create_resource_structure(path)` +### {{< var done >}} `create_resource_structure(path)` See the help documentation with `help(create_resource_structure)` for more details. @@ -127,13 +127,21 @@ flowchart function --> out ``` -### {{< var wip >}} `write_resource_parquet(raw_files, path)` +### {{< var wip >}} `build_resource_parquet(raw_files_path, resource_properties)` -This function takes the files provided by `raw_files` and merges them -into a `data.parquet` file provided by `path`. Use -`path_resource_data()` to provide the correct path location for `path` -and `path_resource_raw_files()` for the `raw_files` argument. Outputs -the path object of the created file. +See the help documentation with `help(build_resource_parquet)` for more +details. + +```{mermaid} +flowchart + in_raw_files_path[/raw_files_path/] + in_properties[/resource_properties/] + function("build_resource_parquet()") + out[("./resources/{id}/data.parquet")] + in_raw_files_path --> function + in_properties --> function + function --> out +``` ### {{< var wip >}} `edit_resource_properties(path, properties)` From fa28b985e984a0a1bdbc4a208a513cb776d74ec7 Mon Sep 17 00:00:00 2001 From: "Luke W. 
Johnston" Date: Thu, 13 Feb 2025 09:22:09 +0100 Subject: [PATCH 4/5] docs: :building_construction: updated and finished pseudocode for `build_resource_parquet()` --- .../pseudocode/write_resource_parquet.py | 54 +++++++++---------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/docs/design/implementation/pseudocode/write_resource_parquet.py b/docs/design/implementation/pseudocode/write_resource_parquet.py index be85333d2..ac30d5d4c 100644 --- a/docs/design/implementation/pseudocode/write_resource_parquet.py +++ b/docs/design/implementation/pseudocode/write_resource_parquet.py @@ -1,18 +1,24 @@ # ruff: noqa -def write_resource_parquet( - raw_files_path: list[Path], parquet_path: Path, properties_path: Path +def build_resource_parquet( + raw_files_path: list[Path], resource_properties: ResourceProperties ) -> Path: """Merge all raw resource file(s) and write into a Parquet file. This function takes the file(s) provided by `raw_files_path` and merges them into - a `data.parquet` file provided by `parquet_path`. While Sprout generally assumes + a `data.parquet` file. The Parquet file will be stored at the path found in `ResourceProperties.path`. + While Sprout generally assumes that the files stored in the `resources/raw/` folder have already been verified and validated, this function does some quick verification checks of the data after reading it into Python from the raw file(s) by comparing - with the current properties given by the `properties_path`. All data in the + with the current properties given by the `resource_properties`. All data in the `resources/raw/` folder will be merged into one single data object and then written back to the Parquet file. The Parquet file will be overwritten. + If there are any duplicate observation units in the data, only the most recent + observation unit will be kept. 
This way, if there are any errors or mistakes + in older raw files that have been corrected in later files, the mistake can still + be kept, but won't impact the data that will actually be used. + Examples: ``` python @@ -29,31 +35,23 @@ def write_resource_parquet( raw_files_path: A list of paths for all the raw files, mostly commonly stored in the `.csv.gz` format. Use `path_resource_raw_files()` to help provide the correct paths to the raw files. - parquet_path: The path to the Use `path_resource_data()` to help provide - the correct path location for the resource's `data.parquet` file. - properties_path: The path to the properties for this resource. Use - `path_package_properties()` to help give the correct location. + resource_properties: The `ResourceProperties` object that contains the properties + of the resource you want to create the Parquet file for. Returns: Outputs the path object of the created Parquet file. """ # Not sure if this is the correct way to verify multiple files. - [verify_is_file(path) for path in raw_files_path] - - data = read_raw_files(raw_file) - - # Confirms that the data matches the resource properties found in `datapackage.json`. - # Not sure if this is the best solution here to load in the properties file. - verify_is_file(properties_path) - properties = read_json(properties_path) + [check_is_file(path) for path in raw_files_path] + check_resource_properties(resource_properties) - # This function could be several, e.g. verify_data_column_types(), verify_data_column_names()? - verify_data(data, properties) + data = read_raw_files(raw_files_path) + data = drop_duplicate_obs_units(data) - # Could include validation here? - # validate_data(data, properties) + # This function could be several functions or the one full function. 
+ check_data(data, resource_properties) - return write_parquet(data, parquet_path) + return write_parquet(data, resource_properties["path"]) def write_parquet(data: DataFrame, path: Path) -> Path: @@ -68,12 +66,8 @@ def read_raw_files(paths: list[Path]) -> DataFrame: return data -def verify_data(data: DataFrame, properties: dict) -> Path: - # Compare against the properties for: - # - Header names - # - Data types - # - Number of rows? - # - Others? - - # Error if fails, data if pass - return data +def drop_duplicate_obs_units(data: DataFrame) -> DataFrame: + # Drop duplicates based on the observation unit, keeping only the most + # recent one. This allows older raw files to contain potentially wrong + # data that was corrected in the most recent file. + return data.drop_duplicates() From fa590b6c85c08ddeb0d38fe6a66c4b5fb62cffce Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Thu, 13 Feb 2025 09:23:02 +0100 Subject: [PATCH 5/5] chore: :truck: rename and move to `interface/` --- .../pseudocode/build_resource_parquet.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/design/{implementation/pseudocode/write_resource_parquet.py => interface/pseudocode/build_resource_parquet.py} (100%) diff --git a/docs/design/implementation/pseudocode/write_resource_parquet.py b/docs/design/interface/pseudocode/build_resource_parquet.py similarity index 100% rename from docs/design/implementation/pseudocode/write_resource_parquet.py rename to docs/design/interface/pseudocode/build_resource_parquet.py