simplify archive_dataset

NYCPlanning · Feb 27, 2025 · 2d90da1 · 2d90da1
1 parent eb86aa1
commit 2d90da1
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 23 deletions.
diff --git a/dcpy/connectors/edm/recipes.py b/dcpy/connectors/edm/recipes.py
@@ -57,12 +57,19 @@ def exists(ds: Dataset) -> bool:
     return s3.folder_exists(BUCKET, s3_folder_path(ds))
 
 
-def _archive_dataset(config: ingest.Config, file_path: Path, s3_path: str) -> None:
+def archive_dataset(
+    config: ingest.Config, file_path: Path, raw: bool = False, latest: bool = False
+) -> None:
     """
     Given a config and a path to a file, archive it in edm-recipes.
     The s3 path is derived from the config: the raw dataset folder if `raw` is set,
     otherwise the dataset folder. If `latest` is set, the dataset is also set as latest.
     """
+    s3_path = (
+        s3_raw_folder_path(config.raw_dataset_key)
+        if raw
+        else s3_folder_path(config.dataset_key)
+    )
     if s3.folder_exists(BUCKET, s3_path):
         raise Exception(
             f"Archived dataset at {s3_path} already exists, cannot overwrite"
@@ -81,14 +88,9 @@ def _archive_dataset(config: ingest.Config, file_path: Path, s3_path: str) -> No
             acl=config.archival.acl,
             contents_only=True,
         )
-
-
-def archive_raw_dataset(config: ingest.Config, file_path: Path):
-    """
-    Given a config and a path to a 'raw' input dataset, archive it in edm-recipes
-    Unique identifier of a raw dataset is its name and the timestamp of archival
-    """
-    _archive_dataset(config, file_path, s3_raw_folder_path(config.raw_dataset_key))
+    if latest:
+        assert not raw, "Cannot set raw dataset to 'latest'"
+        set_latest(config.dataset_key, config.archival.acl)
 
 
 def set_latest(key: DatasetKey, acl):
@@ -100,17 +102,6 @@ def set_latest(key: DatasetKey, acl):
     )
 
 
-def archive_dataset(config: ingest.Config, file_path: Path, *, latest: bool = False):
-    """
-    Given a config and a path to a processed parquet file, archive it in edm-recipes
-    Unique identifier of a raw dataset is its name and its version
-    """
-    s3_path = s3_folder_path(config.dataset_key)
-    _archive_dataset(config, file_path, s3_path)
-    if latest:
-        set_latest(config.dataset_key, config.archival.acl)
-
-
 def update_freshness(ds: DatasetKey, timestamp: datetime) -> datetime:
     path = f"{DATASET_FOLDER}/{ds.id}/{ds.version}/config.json"
     config = get_config(ds.id, ds.version)

diff --git a/dcpy/lifecycle/ingest/run.py b/dcpy/lifecycle/ingest/run.py
@@ -1,7 +1,6 @@
 import json
 from pathlib import Path
 import shutil
-
 from dcpy.models.lifecycle.ingest import Config
 from dcpy.connectors.edm import recipes
 from dcpy.lifecycle import BASE_PATH
@@ -45,6 +44,9 @@ def ingest(
     with open(staging_dir / "config.json", "w") as f:
         json.dump(config.model_dump(mode="json"), f, indent=4)
 
+    with open(staging_dir / CONFIG_FILENAME, "w") as f:
+        json.dump(config.model_dump(mode="json"), f, indent=4)
+
     # download dataset
     extract.download_file_from_source(
         config.ingestion.source,
@@ -56,7 +58,7 @@ def ingest(
 
     if not skip_archival:
         # archive to edm-recipes/raw_datasets
-        recipes.archive_raw_dataset(config, file_path)
+        recipes.archive_dataset(config, file_path, raw=True)
 
     init_parquet = "init.parquet"
     transform.to_parquet(

diff --git a/dcpy/test/connectors/edm/test_recipes.py b/dcpy/test/connectors/edm/test_recipes.py
@@ -89,7 +89,7 @@ class TestArchiveDataset:
     def test_archive_raw_dataset(self, create_buckets, create_temp_filesystem: Path):
         tmp_file = create_temp_filesystem / self.raw_file_name
         tmp_file.touch()
-        recipes.archive_raw_dataset(self.config, tmp_file)
+        recipes.archive_dataset(self.config, tmp_file, raw=True)
         assert s3.folder_exists(
             RECIPES_BUCKET, recipes.s3_raw_folder_path(self.config.raw_dataset_key)
         )