BREAKING CHANGE: Auxiliary Files stage out (#415)
* fix: check by id first to see if the granule exists before updating it

* breaking: stage out for ancillary file

* feat: add checksum in stac

* fix: granule id needs to be the full URN, not just the name

* fix: adding checksum values

* fix: need stac_extensions type update

* fix: no need to add extra fields

* chore: add sample docker file
wphyojpl authored Sep 9, 2024
1 parent ff7f4d0 commit 8af0ce1
Showing 14 changed files with 412 additions and 56 deletions.
3 changes: 3 additions & 0 deletions ci.cd/Makefile
@@ -20,6 +20,9 @@ build_lambda_public:
upload_lambda:
aws --profile saml-pub s3 cp cumulus_lambda_functions_deployment.zip s3://am-uds-dev-cumulus-tf-state/unity_cumulus_lambda/

move:
mv /Users/wphyo/Downloads/cumulus_lambda_functions-*.zip tf-module/unity-cumulus/build/cumulus_lambda_functions_deployment.zip

upload_lambda_mcp_dev:
aws s3 cp tf-module/unity-cumulus/build/cumulus_lambda_functions_deployment.zip s3://uds-dev-cumulus-public/unity_cumulus_lambda/
update_lambda_function_mcp_dev_6:
7 changes: 7 additions & 0 deletions cumulus_lambda_functions/cumulus_stac/item_transformer.py
@@ -344,12 +344,18 @@ def __get_asset_obj(self, input_dict):
:param input_dict:
:return:
"""
# https://github.com/stac-extensions/file
# https://github.com/stac-extensions/file/blob/main/examples/item.json
description_keys = ['size', 'checksumType', 'checksum']
descriptions = [f'{k}={input_dict[k]};' for k in description_keys if k in input_dict]
asset = Asset(
href=f"s3://{input_dict['bucket']}/{input_dict['key']}",
title=input_dict['fileName'],
description=''.join(descriptions),
extra_fields={
'file:size': input_dict['size'] if 'size' in input_dict else -1,
'file:checksum': input_dict['checksum'] if 'checksum' in input_dict else -1,
},
roles=[input_dict['type']]
)
return asset
@@ -472,6 +478,7 @@ def to_stac(self, source: dict) -> dict:
}
stac_item = Item(
id=source['granuleId'],
stac_extensions=["https://stac-extensions.github.io/file/v2.1.0/schema.json"],
bbox=[-180.0, -90.0, 180.0, 90.0],
properties={
**custom_metadata,
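For reference, a minimal sketch of the asset the updated __get_asset_obj builds once the file-extension fields are populated; the Cumulus file record below is hypothetical and its values are placeholders.

from pystac import Asset

sample_file = {  # hypothetical Cumulus granule file record
    'bucket': 'uds-staging-bucket',
    'key': 'SAMPLE_COLLECTION___1/granule1.nc',
    'fileName': 'granule1.nc',
    'type': 'data',
    'size': 1024,
    'checksumType': 'md5',
    'checksum': 'abcdef0123456789',  # dummy checksum value
}
asset = Asset(
    href=f"s3://{sample_file['bucket']}/{sample_file['key']}",
    title=sample_file['fileName'],
    description='size=1024;checksumType=md5;checksum=abcdef0123456789;',
    extra_fields={'file:size': sample_file['size'], 'file:checksum': sample_file['checksum']},
    roles=[sample_file['type']],
)
print(asset.to_dict())  # includes file:size and file:checksum alongside href, title, description, roles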
@@ -47,7 +47,7 @@ class GranulesIndexMapping:
"title": {"type": "text"}
}
},
"stac_extensions": {"type": "object"},
"stac_extensions": {"type": "keyword"},
"properties": {
"dynamic": "false",
"properties": {
6 changes: 3 additions & 3 deletions cumulus_lambda_functions/lib/utils/file_utils.py
@@ -67,10 +67,10 @@ def gunzip_file_os(zipped_file_path, output_file_path=None):
return output_file_path

@staticmethod
def get_checksum(file_path):
def get_checksum(file_path, is_md5=False, chunk_size=25 * 2**20):
with open(file_path, mode='rb') as f:
d = hashlib.sha512()
for buf in iter(partial(f.read, 512 * 2**10), b''):
d = hashlib.md5() if is_md5 else hashlib.sha512()
for buf in iter(partial(f.read, chunk_size), b''):
d.update(buf)
return d.hexdigest()
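The refactored get_checksum keeps SHA-512 as the default while adding an MD5 option and a configurable read chunk size (25 MiB by default, up from 512 KiB); a usage sketch with a placeholder path:

from cumulus_lambda_functions.lib.utils.file_utils import FileUtils

md5_hex = FileUtils.get_checksum('/tmp/sample.dat', is_md5=True)    # MD5, streamed in 25 MiB chunks
sha512_hex = FileUtils.get_checksum('/tmp/sample.dat')              # unchanged default: SHA-512
md5_small = FileUtils.get_checksum('/tmp/sample.dat', is_md5=True, chunk_size=512 * 2**10)  # original 512 KiB chunks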

170 changes: 170 additions & 0 deletions cumulus_lambda_functions/stage_in_out/upload_arbitrary_files_as_granules.py
@@ -0,0 +1,170 @@
import json
import logging
import os.path
from glob import glob
from multiprocessing import Manager

from cumulus_lambda_functions.cumulus_stac.granules_catalog import GranulesCatalog

from cumulus_lambda_functions.lib.aws.aws_s3 import AwsS3
from cumulus_lambda_functions.lib.processing_jobs.job_manager_memory import JobManagerMemory
from cumulus_lambda_functions.lib.processing_jobs.multithread_processor import MultiThreadProcessorProps, MultiThreadProcessor
from cumulus_lambda_functions.lib.processing_jobs.job_manager_abstract import JobManagerProps
from cumulus_lambda_functions.lib.utils.file_utils import FileUtils
from cumulus_lambda_functions.lib.processing_jobs.job_executor_abstract import JobExecutorAbstract
from cumulus_lambda_functions.lib.time_utils import TimeUtils
from cumulus_lambda_functions.stage_in_out.upload_granules_abstract import UploadGranulesAbstract
from pystac import Item, Asset, ItemCollection, Catalog, Link

LOGGER = logging.getLogger(__name__)


class UploadItemExecutor(JobExecutorAbstract):
def __init__(self, result_list, error_list, collection_id, staging_bucket, retry_wait_time_sec, retry_times, delete_files: bool) -> None:
super().__init__()
self.__collection_id = collection_id
self.__staging_bucket = staging_bucket
self.__delete_files = delete_files

self.__gc = GranulesCatalog()
self.__result_list = result_list
self.__error_list = error_list
# self.__gc = GranulesCatalog()
self.__s3 = AwsS3()
self.__retry_wait_time_sec = retry_wait_time_sec
self.__retry_times = retry_times

def validate_job(self, job_obj):
return True

def generate_sample_stac(self, filepath: str):
filename = os.path.basename(filepath)
file_checksum = FileUtils.get_checksum(filepath, True)
# https://github.com/stac-extensions/file
# https://github.com/stac-extensions/file/blob/main/examples/item.json
sample_stac_item = Item(
id=f'{self.__collection_id}:{os.path.splitext(filename)[0]}',
stac_extensions=["https://stac-extensions.github.io/file/v2.1.0/schema.json"],
geometry={
"type": "Point",
"coordinates": [0.0, 0.0]
},
bbox=[0.0, 0.0, 0.0, 0.0],
datetime=TimeUtils().parse_from_unix(0, True).get_datetime_obj(),
properties={
"start_datetime": TimeUtils.get_current_time(),
"end_datetime": TimeUtils.get_current_time(),
"created": TimeUtils.get_current_time(),
"updated": TimeUtils.get_current_time(),
},
collection=self.__collection_id,
assets={
filename: Asset(
href=filepath,
roles=['data'],
title=os.path.basename(filename),
extra_fields={
'file:size': FileUtils.get_size(filepath),
'file:checksum': file_checksum,
},
description=f'size={FileUtils.get_size(filepath)};checksumType=md5;checksum={file_checksum}'),
f'{filename}.stac.json': Asset(href=f'{filepath}.stac.json', roles=['metadata'], description='desc=metadata stac;size=-1;checksumType=md5;checksum=unknown'), # How to update this? It's a circular dependency
})

return sample_stac_item

def execute_job(self, job_obj, lock) -> bool:
sample_stac_item = self.generate_sample_stac(job_obj)
updating_assets = {}
try:
s3_url = self.__s3.upload(job_obj, self.__staging_bucket, f'{self.__collection_id}/{self.__collection_id}:{sample_stac_item.id}', self.__delete_files)
updating_assets[os.path.basename(s3_url)] = s3_url
uploading_current_granule_stac = f'{s3_url}.stac.json'
self.__s3.set_s3_url(uploading_current_granule_stac)
self.__s3.upload_bytes(json.dumps(sample_stac_item.to_dict(False, False),indent=4).encode())
updating_assets[os.path.basename(uploading_current_granule_stac)] = uploading_current_granule_stac
self.__gc.update_assets_href(sample_stac_item, updating_assets)
self.__result_list.put(sample_stac_item.to_dict(False, False))
except Exception as e:
sample_stac_item.properties['upload_error'] = str(e)
LOGGER.exception(f'error while processing: {job_obj}')
self.__error_list.put(sample_stac_item.to_dict(False, False))
return True


class UploadArbitraryFilesAsGranules(UploadGranulesAbstract):
BASE_DIRECTORY = 'BASE_DIRECTORY'

def __init__(self):
super().__init__()
self.__s3 = AwsS3()

def upload(self, **kwargs) -> str:

"""
1. Use Glob to find files
2. Create stac.json for each file.
3. Need collection ID which has tenant + venue.
4. Create successful features.json
:param kwargs:
:return:
"""
self._set_props_from_env()
output_dir = os.environ.get(self.OUTPUT_DIRECTORY)
if not FileUtils.dir_exist(output_dir):
raise ValueError(f'OUTPUT_DIRECTORY: {output_dir} does not exist')
missing_keys = [k for k in [self.BASE_DIRECTORY] if k not in os.environ]
if len(missing_keys) > 0:
raise ValueError(f'missing environment keys: {missing_keys}')
base_dir = os.environ.get(self.BASE_DIRECTORY)
possible_files = [k for k in glob(os.path.join(base_dir, '**'), recursive=True) if os.path.isfile(k)]

local_items = Manager().Queue()
error_list = Manager().Queue()

if self._parallel_count == 1:
for each_child in possible_files:
temp_job = UploadItemExecutor(local_items, error_list, self._collection_id, self._staging_bucket, self._retry_wait_time_sec, self._retry_times, self._delete_files)
temp_job.execute_job(each_child, None)
else:
job_manager_props = JobManagerProps()
for each_child in possible_files:
job_manager_props.memory_job_dict[each_child] = each_child
# https://www.infoworld.com/article/3542595/6-python-libraries-for-parallel-processing.html
multithread_processor_props = MultiThreadProcessorProps(self._parallel_count)
multithread_processor_props.job_manager = JobManagerMemory(job_manager_props)
multithread_processor_props.job_executor = UploadItemExecutor(local_items, error_list, self._collection_id, self._staging_bucket, self._retry_wait_time_sec, self._retry_times, self._delete_files)
multithread_processor = MultiThreadProcessor(multithread_processor_props)
multithread_processor.start()

LOGGER.debug(f'finished uploading all granules')
dapa_body_granules = []
while not local_items.empty():
dapa_body_granules.append(local_items.get())

errors = []
while not error_list.empty():
errors.append(error_list.get())
LOGGER.debug(f'successful count: {len(dapa_body_granules)}. failed count: {len(errors)}')
successful_item_collections = ItemCollection(items=dapa_body_granules)
failed_item_collections = ItemCollection(items=errors)
successful_features_file = os.path.join(output_dir, 'successful_features.json')

failed_features_file = os.path.join(output_dir, 'failed_features.json')
LOGGER.debug(f'writing results: {successful_features_file} && {failed_features_file}')
FileUtils.write_json(successful_features_file, successful_item_collections.to_dict(False))
FileUtils.write_json(failed_features_file, failed_item_collections.to_dict(False))
s3_url = self.__s3.upload(successful_features_file, self._staging_bucket,
self._result_path_prefix,
s3_name=f'successful_features_{TimeUtils.get_current_time()}.json',
delete_files=self._delete_files)
LOGGER.debug(f'uploaded successful features to S3: {s3_url}')
LOGGER.debug(f'creating response catalog')
catalog = Catalog(
id='NA',
description='NA')
catalog.add_link(Link('item', successful_features_file, 'application/json'))
catalog.add_link(Link('item', failed_features_file, 'application/json'))
catalog_json = catalog.to_dict(False, False)
LOGGER.debug(f'catalog_json: {catalog_json}')
return json.dumps(catalog_json)
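A sketch of how the new auxiliary-file stage-out might be driven, based on the environment keys read by this class and its base class. The bucket, collection, and directory values are placeholders; AWS credentials and bucket access are assumed, and the PARALLEL_COUNT key name is assumed from Constants.PARALLEL_COUNT rather than shown in this diff.

import os
from cumulus_lambda_functions.stage_in_out.upload_arbitrary_files_as_granules import UploadArbitraryFilesAsGranules

os.environ['COLLECTION_ID'] = 'URN:NASA:UNITY:TENANT:VENUE:SAMPLE___1'  # placeholder collection URN
os.environ['STAGING_BUCKET'] = 'uds-staging-bucket'                     # placeholder bucket
os.environ['BASE_DIRECTORY'] = '/tmp/aux_files'                         # directory scanned recursively with glob
os.environ['OUTPUT_DIRECTORY'] = '/tmp/stage_out_results'               # must exist; receives successful/failed features json
# os.environ['PARALLEL_COUNT'] = '1'  # assumed key name behind Constants.PARALLEL_COUNT; '1' forces the serial path

catalog_json_str = UploadArbitraryFilesAsGranules().upload()            # returns the response catalog as a JSON string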
cumulus_lambda_functions/stage_in_out/upload_granules_abstract.py
@@ -1,7 +1,44 @@
import os
from abc import ABC, abstractmethod

from cumulus_lambda_functions.lib.constants import Constants


class UploadGranulesAbstract(ABC):
RESULT_PATH_PREFIX = 'RESULT_PATH_PREFIX' # s3 prefix
DEFAULT_RESULT_PATH_PREFIX = 'stage_out' # default s3 prefix
OUTPUT_DIRECTORY = 'OUTPUT_DIRECTORY' # To store successful & failed features json
COLLECTION_ID_KEY = 'COLLECTION_ID' # Need this
STAGING_BUCKET_KEY = 'STAGING_BUCKET' # S3 Bucket
VERIFY_SSL_KEY = 'VERIFY_SSL'
DELETE_FILES_KEY = 'DELETE_FILES'

def __init__(self) -> None:
super().__init__()
self._collection_id = ''
self._staging_bucket = ''
self._result_path_prefix = ''
self._parallel_count = int(os.environ.get(Constants.PARALLEL_COUNT, '-1'))
self._retry_wait_time_sec = int(os.environ.get('UPLOAD_RETRY_WAIT_TIME', '30'))
self._retry_times = int(os.environ.get('UPLOAD_RETRY_TIMES', '5'))
self._verify_ssl = True
self._delete_files = False

def _set_props_from_env(self):
missing_keys = [k for k in [self.COLLECTION_ID_KEY, self.STAGING_BUCKET_KEY] if k not in os.environ]
if len(missing_keys) > 0:
raise ValueError(f'missing environment keys: {missing_keys}')

self._collection_id = os.environ.get(self.COLLECTION_ID_KEY)
self._staging_bucket = os.environ.get(self.STAGING_BUCKET_KEY)
self._result_path_prefix = os.environ.get(self.RESULT_PATH_PREFIX, self.DEFAULT_RESULT_PATH_PREFIX)
self._result_path_prefix = self._result_path_prefix[:-1] if self._result_path_prefix.endswith('/') else self._result_path_prefix
self._result_path_prefix = self._result_path_prefix[1:] if self._result_path_prefix.startswith('/') else self._result_path_prefix

self._verify_ssl = os.environ.get(self.VERIFY_SSL_KEY, 'TRUE').strip().upper() == 'TRUE'
self._delete_files = os.environ.get(self.DELETE_FILES_KEY, 'FALSE').strip().upper() == 'TRUE'
return self

@abstractmethod
def upload(self, **kwargs) -> list:
def upload(self, **kwargs) -> str:
raise NotImplementedError()
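With the shared configuration pulled up into the base class, a concrete uploader only needs to call _set_props_from_env before using the protected attributes; a minimal, hypothetical subclass for illustration:

from cumulus_lambda_functions.stage_in_out.upload_granules_abstract import UploadGranulesAbstract

class UploadNoop(UploadGranulesAbstract):  # hypothetical subclass, for illustration only
    def upload(self, **kwargs) -> str:
        self._set_props_from_env()  # requires COLLECTION_ID & STAGING_BUCKET; strips leading/trailing '/' from RESULT_PATH_PREFIX
        # self._collection_id, self._staging_bucket, self._result_path_prefix, self._delete_files are now populated
        return '{}'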
cumulus_lambda_functions/stage_in_out/upload_granules_by_complete_catalog_s3.py
@@ -89,48 +89,21 @@ def execute_job(self, each_child, lock) -> bool:


class UploadGranulesByCompleteCatalogS3(UploadGranulesAbstract):
RESULT_PATH_PREFIX = 'RESULT_PATH_PREFIX'
DEFAULT_RESULT_PATH_PREFIX = 'stage_out'
CATALOG_FILE = 'CATALOG_FILE'
OUTPUT_DIRECTORY = 'OUTPUT_DIRECTORY'
COLLECTION_ID_KEY = 'COLLECTION_ID'
STAGING_BUCKET_KEY = 'STAGING_BUCKET'

VERIFY_SSL_KEY = 'VERIFY_SSL'
DELETE_FILES_KEY = 'DELETE_FILES'

def __init__(self) -> None:
super().__init__()
self.__gc = GranulesCatalog()
self.__collection_id = ''
self.__staging_bucket = ''
self.__result_path_prefix = ''
self.__verify_ssl = True
self.__delete_files = False
self.__s3 = AwsS3()
self._parallel_count = int(os.environ.get(Constants.PARALLEL_COUNT, '-1'))
self.__retry_wait_time_sec = int(os.environ.get('UPLOAD_RETRY_WAIT_TIME', '30'))
self.__retry_times = int(os.environ.get('UPLOAD_RETRY_TIMES', '5'))

def __set_props_from_env(self):
missing_keys = [k for k in [self.CATALOG_FILE, self.COLLECTION_ID_KEY, self.STAGING_BUCKET_KEY] if k not in os.environ]
if len(missing_keys) > 0:
raise ValueError(f'missing environment keys: {missing_keys}')

self.__collection_id = os.environ.get(self.COLLECTION_ID_KEY)
self.__staging_bucket = os.environ.get(self.STAGING_BUCKET_KEY)
self.__result_path_prefix = os.environ.get(self.RESULT_PATH_PREFIX, self.DEFAULT_RESULT_PATH_PREFIX)
self.__result_path_prefix = self.__result_path_prefix[:-1] if self.__result_path_prefix.endswith('/') else self.__result_path_prefix
self.__result_path_prefix = self.__result_path_prefix[1:] if self.__result_path_prefix.startswith('/') else self.__result_path_prefix
self.__verify_ssl = os.environ.get(self.VERIFY_SSL_KEY, 'TRUE').strip().upper() == 'TRUE'
self.__delete_files = os.environ.get(self.DELETE_FILES_KEY, 'FALSE').strip().upper() == 'TRUE'
return self

def upload(self, **kwargs) -> str:
self.__set_props_from_env()
self._set_props_from_env()
output_dir = os.environ.get(self.OUTPUT_DIRECTORY)
if not FileUtils.dir_exist(output_dir):
raise ValueError(f'OUTPUT_DIRECTORY: {output_dir} does not exist')
missing_keys = [k for k in [self.CATALOG_FILE] if k not in os.environ]
if len(missing_keys) > 0:
raise ValueError(f'missing environment keys: {missing_keys}')
catalog_file_path = os.environ.get(self.CATALOG_FILE)
child_links = self.__gc.get_child_link_hrefs(catalog_file_path)
local_items = Manager().Queue()
@@ -142,7 +115,7 @@ def upload(self, **kwargs) -> str:
# https://www.infoworld.com/article/3542595/6-python-libraries-for-parallel-processing.html
multithread_processor_props = MultiThreadProcessorProps(self._parallel_count)
multithread_processor_props.job_manager = JobManagerMemory(job_manager_props)
multithread_processor_props.job_executor = UploadItemExecutor(local_items, error_list, self.__collection_id, self.__staging_bucket, self.__retry_wait_time_sec, self.__retry_times, self.__delete_files)
multithread_processor_props.job_executor = UploadItemExecutor(local_items, error_list, self._collection_id, self._staging_bucket, self._retry_wait_time_sec, self._retry_times, self._delete_files)
multithread_processor = MultiThreadProcessor(multithread_processor_props)
multithread_processor.start()

@@ -165,10 +138,10 @@ def upload(self, **kwargs) -> str:
LOGGER.debug(f'writing results: {successful_features_file} && {failed_features_file}')
FileUtils.write_json(successful_features_file, successful_item_collections.to_dict(False))
FileUtils.write_json(failed_features_file, failed_item_collections.to_dict(False))
s3_url = self.__s3.upload(successful_features_file, self.__staging_bucket,
self.__result_path_prefix,
s3_url = self.__s3.upload(successful_features_file, self._staging_bucket,
self._result_path_prefix,
s3_name=f'successful_features_{TimeUtils.get_current_time()}.json',
delete_files=self.__delete_files)
delete_files=self._delete_files)
LOGGER.debug(f'uploaded successful features to S3: {s3_url}')
LOGGER.debug(f'creating response catalog')
catalog_json = GranulesCatalog().update_catalog(catalog_file_path, [successful_features_file, failed_features_file])
@@ -1,8 +1,14 @@


class UploadGranulesFactory:
UPLOAD_S3_BY_STAC_CATALOG = 'UPLOAD_S3_BY_STAC_CATALOG'
UPLOAD_AUXILIARY_FILE_AS_GRANULE = 'UPLOAD_AUXILIARY_FILE_AS_GRANULE'

def get_class(self, upload_type):
if upload_type == UploadGranulesFactory.UPLOAD_S3_BY_STAC_CATALOG:
from cumulus_lambda_functions.stage_in_out.upload_granules_by_complete_catalog_s3 import UploadGranulesByCompleteCatalogS3
return UploadGranulesByCompleteCatalogS3()
if upload_type == UploadGranulesFactory.UPLOAD_AUXILIARY_FILE_AS_GRANULE:
from cumulus_lambda_functions.stage_in_out.upload_arbitrary_files_as_granules import UploadArbitraryFilesAsGranules
return UploadArbitraryFilesAsGranules()
raise ValueError(f'unknown upload_type: {upload_type}')
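A sketch of the factory dispatch with the new upload type; the factory's module path is assumed from its sibling imports in this package.

from cumulus_lambda_functions.stage_in_out.upload_granules_factory import UploadGranulesFactory  # assumed module path

uploader = UploadGranulesFactory().get_class(UploadGranulesFactory.UPLOAD_AUXILIARY_FILE_AS_GRANULE)
# -> UploadArbitraryFilesAsGranules; UPLOAD_S3_BY_STAC_CATALOG returns UploadGranulesByCompleteCatalogS3 as before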