Skip to content

Commit

Permalink
simplify archive_dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
fvankrieken committed Feb 27, 2025
1 parent eb86aa1 commit 2d90da1
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 23 deletions.
31 changes: 11 additions & 20 deletions dcpy/connectors/edm/recipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,19 @@ def exists(ds: Dataset) -> bool:
return s3.folder_exists(BUCKET, s3_folder_path(ds))


def _archive_dataset(config: ingest.Config, file_path: Path, s3_path: str) -> None:
def archive_dataset(
config: ingest.Config, file_path: Path, raw: bool = False, latest: bool = False
) -> None:
"""
Given a config and a path to a file, archive it in edm-recipes.
The destination folder is derived from the config: the raw dataset key when
`raw` is set, otherwise the dataset key. When `latest` is set, `set_latest`
is also called for the dataset key; this is not allowed for raw datasets.
"""
s3_path = (
s3_raw_folder_path(config.raw_dataset_key)
if raw
else s3_folder_path(config.dataset_key)
)
if s3.folder_exists(BUCKET, s3_path):
raise Exception(
f"Archived dataset at {s3_path} already exists, cannot overwrite"
Expand All @@ -81,14 +88,9 @@ def _archive_dataset(config: ingest.Config, file_path: Path, s3_path: str) -> No
acl=config.archival.acl,
contents_only=True,
)


def archive_raw_dataset(config: ingest.Config, file_path: Path):
"""
Given a config and a path to a 'raw' input dataset, archive it in edm-recipes
Unique identifier of a raw dataset is its name and the timestamp of archival
"""
_archive_dataset(config, file_path, s3_raw_folder_path(config.raw_dataset_key))
if latest:
assert not raw, "Cannot set raw dataset to 'latest'"
set_latest(config.dataset_key, config.archival.acl)


def set_latest(key: DatasetKey, acl):
Expand All @@ -100,17 +102,6 @@ def set_latest(key: DatasetKey, acl):
)


def archive_dataset(config: ingest.Config, file_path: Path, *, latest: bool = False):
"""
Given a config and a path to a processed parquet file, archive it in edm-recipes
Unique identifier of a processed dataset is its name and its version
"""
s3_path = s3_folder_path(config.dataset_key)
_archive_dataset(config, file_path, s3_path)
if latest:
set_latest(config.dataset_key, config.archival.acl)


def update_freshness(ds: DatasetKey, timestamp: datetime) -> datetime:
path = f"{DATASET_FOLDER}/{ds.id}/{ds.version}/config.json"
config = get_config(ds.id, ds.version)
Expand Down
6 changes: 4 additions & 2 deletions dcpy/lifecycle/ingest/run.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import json
from pathlib import Path
import shutil

from dcpy.models.lifecycle.ingest import Config
from dcpy.connectors.edm import recipes
from dcpy.lifecycle import BASE_PATH
Expand Down Expand Up @@ -45,6 +44,9 @@ def ingest(
with open(staging_dir / "config.json", "w") as f:
json.dump(config.model_dump(mode="json"), f, indent=4)

with open(staging_dir / CONFIG_FILENAME, "w") as f:
json.dump(config.model_dump(mode="json"), f, indent=4)

# download dataset
extract.download_file_from_source(
config.ingestion.source,
Expand All @@ -56,7 +58,7 @@ def ingest(

if not skip_archival:
# archive to edm-recipes/raw_datasets
recipes.archive_raw_dataset(config, file_path)
recipes.archive_dataset(config, file_path, raw=True)

init_parquet = "init.parquet"
transform.to_parquet(
Expand Down
2 changes: 1 addition & 1 deletion dcpy/test/connectors/edm/test_recipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ class TestArchiveDataset:
def test_archive_raw_dataset(self, create_buckets, create_temp_filesystem: Path):
tmp_file = create_temp_filesystem / self.raw_file_name
tmp_file.touch()
recipes.archive_raw_dataset(self.config, tmp_file)
recipes.archive_dataset(self.config, tmp_file, raw=True)
assert s3.folder_exists(
RECIPES_BUCKET, recipes.s3_raw_folder_path(self.config.raw_dataset_key)
)
Expand Down

0 comments on commit 2d90da1

Please sign in to comment.