BREAKING CHANGE: Auxiliary Files stage out (#415)
* fix: check by id first to see if the granule exists before updating it

* breaking: stage out for ancillary file

* feat: add checksum in stac

* fix: granule id needs to be the full URN, not just the name

* fix: adding checksum values

* fix: need stac_extensions type update

* fix: no need to add extra fields

* chore: add sample docker file
wphyojpl authored Sep 9, 2024
1 parent ff7f4d0 commit 8af0ce1
Showing 14 changed files with 412 additions and 56 deletions.
3 changes: 3 additions & 0 deletions ci.cd/Makefile
@@ -20,6 +20,9 @@ build_lambda_public:
upload_lambda:
aws --profile saml-pub s3 cp cumulus_lambda_functions_deployment.zip s3://am-uds-dev-cumulus-tf-state/unity_cumulus_lambda/

move:
mv /Users/wphyo/Downloads/cumulus_lambda_functions-*.zip tf-module/unity-cumulus/build/cumulus_lambda_functions_deployment.zip

upload_lambda_mcp_dev:
aws s3 cp tf-module/unity-cumulus/build/cumulus_lambda_functions_deployment.zip s3://uds-dev-cumulus-public/unity_cumulus_lambda/
update_lambda_function_mcp_dev_6:
7 changes: 7 additions & 0 deletions cumulus_lambda_functions/cumulus_stac/item_transformer.py
@@ -344,12 +344,18 @@ def __get_asset_obj(self, input_dict):
:param input_dict:
:return:
"""
# https://github.com/stac-extensions/file
# https://github.com/stac-extensions/file/blob/main/examples/item.json
description_keys = ['size', 'checksumType', 'checksum']
descriptions = [f'{k}={input_dict[k]};' for k in description_keys if k in input_dict]
asset = Asset(
href=f"s3://{input_dict['bucket']}/{input_dict['key']}",
title=input_dict['fileName'],
description=''.join(descriptions),
extra_fields={
'file:size': input_dict['size'] if 'size' in input_dict else -1,
'file:checksum': input_dict['checksum'] if 'checksum' in input_dict else -1,
},
roles=[input_dict['type']]
)
return asset
@@ -472,6 +478,7 @@ def to_stac(self, source: dict) -> dict:
}
stac_item = Item(
id=source['granuleId'],
stac_extensions=["https://stac-extensions.github.io/file/v2.1.0/schema.json"],
bbox=[-180.0, -90.0, 180.0, 90.0],
properties={
**custom_metadata,
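For reference, a minimal sketch of the asset the updated __get_asset_obj builds once the file-extension fields are populated; the Cumulus file record below is hypothetical and its values are placeholders.

from pystac import Asset

sample_file = {  # hypothetical Cumulus granule file record
    'bucket': 'uds-staging-bucket',
    'key': 'SAMPLE_COLLECTION___1/granule1.nc',
    'fileName': 'granule1.nc',
    'type': 'data',
    'size': 1024,
    'checksumType': 'md5',
    'checksum': 'abcdef0123456789',  # dummy checksum value
}
asset = Asset(
    href=f"s3://{sample_file['bucket']}/{sample_file['key']}",
    title=sample_file['fileName'],
    description='size=1024;checksumType=md5;checksum=abcdef0123456789;',
    extra_fields={'file:size': sample_file['size'], 'file:checksum': sample_file['checksum']},
    roles=[sample_file['type']],
)
print(asset.to_dict())  # includes file:size and file:checksum alongside href, title, description, roles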
@@ -47,7 +47,7 @@ class GranulesIndexMapping:
"title": {"type": "text"}
}
},
"stac_extensions": {"type": "object"},
"stac_extensions": {"type": "keyword"},
"properties": {
"dynamic": "false",
"properties": {
6 changes: 3 additions & 3 deletions cumulus_lambda_functions/lib/utils/file_utils.py
@@ -67,10 +67,10 @@ def gunzip_file_os(zipped_file_path, output_file_path=None):
return output_file_path

@staticmethod
def get_checksum(file_path):
def get_checksum(file_path, is_md5=False, chunk_size=25 * 2**20):
with open(file_path, mode='rb') as f:
d = hashlib.sha512()
for buf in iter(partial(f.read, 512 * 2**10), b''):
d = hashlib.md5() if is_md5 else hashlib.sha512()
for buf in iter(partial(f.read, chunk_size), b''):
d.update(buf)
return d.hexdigest()
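The refactored get_checksum keeps SHA-512 as the default while adding an MD5 option and a configurable read chunk size (25 MiB by default, up from 512 KiB); a usage sketch with a placeholder path:

from cumulus_lambda_functions.lib.utils.file_utils import FileUtils

md5_hex = FileUtils.get_checksum('/tmp/sample.dat', is_md5=True)    # MD5, streamed in 25 MiB chunks
sha512_hex = FileUtils.get_checksum('/tmp/sample.dat')              # unchanged default: SHA-512
md5_small = FileUtils.get_checksum('/tmp/sample.dat', is_md5=True, chunk_size=512 * 2**10)  # original 512 KiB chunks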

170 changes: 170 additions & 0 deletions cumulus_lambda_functions/stage_in_out/upload_arbitrary_files_as_granules.py
@@ -0,0 +1,170 @@
import json
import logging
import os.path
from glob import glob
from multiprocessing import Manager

from cumulus_lambda_functions.cumulus_stac.granules_catalog import GranulesCatalog

from cumulus_lambda_functions.lib.aws.aws_s3 import AwsS3
from cumulus_lambda_functions.lib.processing_jobs.job_manager_memory import JobManagerMemory
from cumulus_lambda_functions.lib.processing_jobs.multithread_processor import MultiThreadProcessorProps, MultiThreadProcessor
from cumulus_lambda_functions.lib.processing_jobs.job_manager_abstract import JobManagerProps
from cumulus_lambda_functions.lib.utils.file_utils import FileUtils
from cumulus_lambda_functions.lib.processing_jobs.job_executor_abstract import JobExecutorAbstract
from cumulus_lambda_functions.lib.time_utils import TimeUtils
from cumulus_lambda_functions.stage_in_out.upload_granules_abstract import UploadGranulesAbstract
from pystac import Item, Asset, ItemCollection, Catalog, Link

LOGGER = logging.getLogger(__name__)


class UploadItemExecutor(JobExecutorAbstract):
def __init__(self, result_list, error_list, collection_id, staging_bucket, retry_wait_time_sec, retry_times, delete_files: bool) -> None:
super().__init__()
self.__collection_id = collection_id
self.__staging_bucket = staging_bucket
self.__delete_files = delete_files

self.__gc = GranulesCatalog()
self.__result_list = result_list
self.__error_list = error_list
# self.__gc = GranulesCatalog()
self.__s3 = AwsS3()
self.__retry_wait_time_sec = retry_wait_time_sec
self.__retry_times = retry_times

def validate_job(self, job_obj):
return True

def generate_sample_stac(self, filepath: str):
filename = os.path.basename(filepath)
file_checksum = FileUtils.get_checksum(filepath, True)
# https://github.com/stac-extensions/file
# https://github.com/stac-extensions/file/blob/main/examples/item.json
sample_stac_item = Item(
id=f'{self.__collection_id}:{os.path.splitext(filename)[0]}',
stac_extensions=["https://stac-extensions.github.io/file/v2.1.0/schema.json"],
geometry={
"type": "Point",
"coordinates": [0.0, 0.0]
},
bbox=[0.0, 0.0, 0.0, 0.0],
datetime=TimeUtils().parse_from_unix(0, True).get_datetime_obj(),
properties={
"start_datetime": TimeUtils.get_current_time(),
"end_datetime": TimeUtils.get_current_time(),
"created": TimeUtils.get_current_time(),
"updated": TimeUtils.get_current_time(),
},
collection=self.__collection_id,
assets={
filename: Asset(
href=filepath,
roles=['data'],
title=os.path.basename(filename),
extra_fields={
'file:size': FileUtils.get_size(filepath),
'file:checksum': file_checksum,
},
description=f'size={FileUtils.get_size(filepath)};checksumType=md5;checksum={file_checksum}'),
f'{filename}.stac.json': Asset(href=f'{filepath}.stac.json', roles=['metadata'], description='desc=metadata stac;size=-1;checksumType=md5;checksum=unknown'), # How to update this? It's a circular dependency
})

return sample_stac_item

def execute_job(self, job_obj, lock) -> bool:
sample_stac_item = self.generate_sample_stac(job_obj)
updating_assets = {}
try:
s3_url = self.__s3.upload(job_obj, self.__staging_bucket, f'{self.__collection_id}/{self.__collection_id}:{sample_stac_item.id}', self.__delete_files)
updating_assets[os.path.basename(s3_url)] = s3_url
uploading_current_granule_stac = f'{s3_url}.stac.json'
self.__s3.set_s3_url(uploading_current_granule_stac)
self.__s3.upload_bytes(json.dumps(sample_stac_item.to_dict(False, False),indent=4).encode())
updating_assets[os.path.basename(uploading_current_granule_stac)] = uploading_current_granule_stac
self.__gc.update_assets_href(sample_stac_item, updating_assets)
self.__result_list.put(sample_stac_item.to_dict(False, False))
except Exception as e:
sample_stac_item.properties['upload_error'] = str(e)
LOGGER.exception(f'error while processing: {job_obj}')
self.__error_list.put(sample_stac_item.to_dict(False, False))
return True


class UploadArbitraryFilesAsGranules(UploadGranulesAbstract):
BASE_DIRECTORY = 'BASE_DIRECTORY'

def __init__(self):
super().__init__()
self.__s3 = AwsS3()

def upload(self, **kwargs) -> str:

"""
1. Use Glob to find files
2. Create stac.json for each file.
3. Need collection ID which has tenant + venue.
4. Create successful features.json
:param kwargs:
:return:
"""
self._set_props_from_env()
output_dir = os.environ.get(self.OUTPUT_DIRECTORY)
if not FileUtils.dir_exist(output_dir):
raise ValueError(f'OUTPUT_DIRECTORY: {output_dir} does not exist')
missing_keys = [k for k in [self.BASE_DIRECTORY] if k not in os.environ]
if len(missing_keys) > 0:
raise ValueError(f'missing environment keys: {missing_keys}')
base_dir = os.environ.get(self.BASE_DIRECTORY)
possible_files = [k for k in glob(os.path.join(base_dir, '**'), recursive=True) if os.path.isfile(k)]

local_items = Manager().Queue()
error_list = Manager().Queue()

if self._parallel_count == 1:
for each_child in possible_files:
temp_job = UploadItemExecutor(local_items, error_list, self._collection_id, self._staging_bucket, self._retry_wait_time_sec, self._retry_times, self._delete_files)
temp_job.execute_job(each_child, None)
else:
job_manager_props = JobManagerProps()
for each_child in possible_files:
job_manager_props.memory_job_dict[each_child] = each_child
# https://www.infoworld.com/article/3542595/6-python-libraries-for-parallel-processing.html
multithread_processor_props = MultiThreadProcessorProps(self._parallel_count)
multithread_processor_props.job_manager = JobManagerMemory(job_manager_props)
multithread_processor_props.job_executor = UploadItemExecutor(local_items, error_list, self._collection_id, self._staging_bucket, self._retry_wait_time_sec, self._retry_times, self._delete_files)
multithread_processor = MultiThreadProcessor(multithread_processor_props)
multithread_processor.start()

LOGGER.debug(f'finished uploading all granules')
dapa_body_granules = []
while not local_items.empty():
dapa_body_granules.append(local_items.get())

errors = []
while not error_list.empty():
errors.append(error_list.get())
LOGGER.debug(f'successful count: {len(dapa_body_granules)}. failed count: {len(errors)}')
successful_item_collections = ItemCollection(items=dapa_body_granules)
failed_item_collections = ItemCollection(items=errors)
successful_features_file = os.path.join(output_dir, 'successful_features.json')

failed_features_file = os.path.join(output_dir, 'failed_features.json')
LOGGER.debug(f'writing results: {successful_features_file} && {failed_features_file}')
FileUtils.write_json(successful_features_file, successful_item_collections.to_dict(False))
FileUtils.write_json(failed_features_file, failed_item_collections.to_dict(False))
s3_url = self.__s3.upload(successful_features_file, self._staging_bucket,
self._result_path_prefix,
s3_name=f'successful_features_{TimeUtils.get_current_time()}.json',
delete_files=self._delete_files)
LOGGER.debug(f'uploaded successful features to S3: {s3_url}')
LOGGER.debug(f'creating response catalog')
catalog = Catalog(
id='NA',
description='NA')
catalog.add_link(Link('item', successful_features_file, 'application/json'))
catalog.add_link(Link('item', failed_features_file, 'application/json'))
catalog_json = catalog.to_dict(False, False)
LOGGER.debug(f'catalog_json: {catalog_json}')
return json.dumps(catalog_json)
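A sketch of how the new auxiliary-file stage-out might be driven, based on the environment keys read by this class and its base class. The bucket, collection, and directory values are placeholders; AWS credentials and bucket access are assumed, and the PARALLEL_COUNT key name is assumed from Constants.PARALLEL_COUNT rather than shown in this diff.

import os
from cumulus_lambda_functions.stage_in_out.upload_arbitrary_files_as_granules import UploadArbitraryFilesAsGranules

os.environ['COLLECTION_ID'] = 'URN:NASA:UNITY:TENANT:VENUE:SAMPLE___1'  # placeholder collection URN
os.environ['STAGING_BUCKET'] = 'uds-staging-bucket'                     # placeholder bucket
os.environ['BASE_DIRECTORY'] = '/tmp/aux_files'                         # directory scanned recursively with glob
os.environ['OUTPUT_DIRECTORY'] = '/tmp/stage_out_results'               # must exist; receives successful/failed features json
# os.environ['PARALLEL_COUNT'] = '1'  # assumed key name behind Constants.PARALLEL_COUNT; '1' forces the serial path

catalog_json_str = UploadArbitraryFilesAsGranules().upload()            # returns the response catalog as a JSON string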
cumulus_lambda_functions/stage_in_out/upload_granules_abstract.py
@@ -1,7 +1,44 @@
import os
from abc import ABC, abstractmethod

from cumulus_lambda_functions.lib.constants import Constants


class UploadGranulesAbstract(ABC):
RESULT_PATH_PREFIX = 'RESULT_PATH_PREFIX' # s3 prefix
DEFAULT_RESULT_PATH_PREFIX = 'stage_out' # default s3 prefix
OUTPUT_DIRECTORY = 'OUTPUT_DIRECTORY' # To store successful & failed features json
COLLECTION_ID_KEY = 'COLLECTION_ID' # Need this
STAGING_BUCKET_KEY = 'STAGING_BUCKET' # S3 Bucket
VERIFY_SSL_KEY = 'VERIFY_SSL'
DELETE_FILES_KEY = 'DELETE_FILES'

def __init__(self) -> None:
super().__init__()
self._collection_id = ''
self._staging_bucket = ''
self._result_path_prefix = ''
self._parallel_count = int(os.environ.get(Constants.PARALLEL_COUNT, '-1'))
self._retry_wait_time_sec = int(os.environ.get('UPLOAD_RETRY_WAIT_TIME', '30'))
self._retry_times = int(os.environ.get('UPLOAD_RETRY_TIMES', '5'))
self._verify_ssl = True
self._delete_files = False

def _set_props_from_env(self):
missing_keys = [k for k in [self.COLLECTION_ID_KEY, self.STAGING_BUCKET_KEY] if k not in os.environ]
if len(missing_keys) > 0:
raise ValueError(f'missing environment keys: {missing_keys}')

self._collection_id = os.environ.get(self.COLLECTION_ID_KEY)
self._staging_bucket = os.environ.get(self.STAGING_BUCKET_KEY)
self._result_path_prefix = os.environ.get(self.RESULT_PATH_PREFIX, self.DEFAULT_RESULT_PATH_PREFIX)
self._result_path_prefix = self._result_path_prefix[:-1] if self._result_path_prefix.endswith('/') else self._result_path_prefix
self._result_path_prefix = self._result_path_prefix[1:] if self._result_path_prefix.startswith('/') else self._result_path_prefix

self._verify_ssl = os.environ.get(self.VERIFY_SSL_KEY, 'TRUE').strip().upper() == 'TRUE'
self._delete_files = os.environ.get(self.DELETE_FILES_KEY, 'FALSE').strip().upper() == 'TRUE'
return self

@abstractmethod
def upload(self, **kwargs) -> list:
def upload(self, **kwargs) -> str:
raise NotImplementedError()
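With the shared configuration pulled up into the base class, a concrete uploader only needs to call _set_props_from_env before using the protected attributes; a minimal, hypothetical subclass for illustration:

from cumulus_lambda_functions.stage_in_out.upload_granules_abstract import UploadGranulesAbstract

class UploadNoop(UploadGranulesAbstract):  # hypothetical subclass, for illustration only
    def upload(self, **kwargs) -> str:
        self._set_props_from_env()  # requires COLLECTION_ID & STAGING_BUCKET; strips leading/trailing '/' from RESULT_PATH_PREFIX
        # self._collection_id, self._staging_bucket, self._result_path_prefix, self._delete_files are now populated
        return '{}'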
cumulus_lambda_functions/stage_in_out/upload_granules_by_complete_catalog_s3.py
@@ -89,48 +89,21 @@ def execute_job(self, each_child, lock) -> bool:


class UploadGranulesByCompleteCatalogS3(UploadGranulesAbstract):
RESULT_PATH_PREFIX = 'RESULT_PATH_PREFIX'
DEFAULT_RESULT_PATH_PREFIX = 'stage_out'
CATALOG_FILE = 'CATALOG_FILE'
OUTPUT_DIRECTORY = 'OUTPUT_DIRECTORY'
COLLECTION_ID_KEY = 'COLLECTION_ID'
STAGING_BUCKET_KEY = 'STAGING_BUCKET'

VERIFY_SSL_KEY = 'VERIFY_SSL'
DELETE_FILES_KEY = 'DELETE_FILES'

def __init__(self) -> None:
super().__init__()
self.__gc = GranulesCatalog()
self.__collection_id = ''
self.__staging_bucket = ''
self.__result_path_prefix = ''
self.__verify_ssl = True
self.__delete_files = False
self.__s3 = AwsS3()
self._parallel_count = int(os.environ.get(Constants.PARALLEL_COUNT, '-1'))
self.__retry_wait_time_sec = int(os.environ.get('UPLOAD_RETRY_WAIT_TIME', '30'))
self.__retry_times = int(os.environ.get('UPLOAD_RETRY_TIMES', '5'))

def __set_props_from_env(self):
missing_keys = [k for k in [self.CATALOG_FILE, self.COLLECTION_ID_KEY, self.STAGING_BUCKET_KEY] if k not in os.environ]
if len(missing_keys) > 0:
raise ValueError(f'missing environment keys: {missing_keys}')

self.__collection_id = os.environ.get(self.COLLECTION_ID_KEY)
self.__staging_bucket = os.environ.get(self.STAGING_BUCKET_KEY)
self.__result_path_prefix = os.environ.get(self.RESULT_PATH_PREFIX, self.DEFAULT_RESULT_PATH_PREFIX)
self.__result_path_prefix = self.__result_path_prefix[:-1] if self.__result_path_prefix.endswith('/') else self.__result_path_prefix
self.__result_path_prefix = self.__result_path_prefix[1:] if self.__result_path_prefix.startswith('/') else self.__result_path_prefix
self.__verify_ssl = os.environ.get(self.VERIFY_SSL_KEY, 'TRUE').strip().upper() == 'TRUE'
self.__delete_files = os.environ.get(self.DELETE_FILES_KEY, 'FALSE').strip().upper() == 'TRUE'
return self

def upload(self, **kwargs) -> str:
self.__set_props_from_env()
self._set_props_from_env()
output_dir = os.environ.get(self.OUTPUT_DIRECTORY)
if not FileUtils.dir_exist(output_dir):
raise ValueError(f'OUTPUT_DIRECTORY: {output_dir} does not exist')
missing_keys = [k for k in [self.CATALOG_FILE] if k not in os.environ]
if len(missing_keys) > 0:
raise ValueError(f'missing environment keys: {missing_keys}')
catalog_file_path = os.environ.get(self.CATALOG_FILE)
child_links = self.__gc.get_child_link_hrefs(catalog_file_path)
local_items = Manager().Queue()
@@ -142,7 +115,7 @@ def upload(self, **kwargs) -> str:
# https://www.infoworld.com/article/3542595/6-python-libraries-for-parallel-processing.html
multithread_processor_props = MultiThreadProcessorProps(self._parallel_count)
multithread_processor_props.job_manager = JobManagerMemory(job_manager_props)
multithread_processor_props.job_executor = UploadItemExecutor(local_items, error_list, self.__collection_id, self.__staging_bucket, self.__retry_wait_time_sec, self.__retry_times, self.__delete_files)
multithread_processor_props.job_executor = UploadItemExecutor(local_items, error_list, self._collection_id, self._staging_bucket, self._retry_wait_time_sec, self._retry_times, self._delete_files)
multithread_processor = MultiThreadProcessor(multithread_processor_props)
multithread_processor.start()

@@ -165,10 +138,10 @@ def upload(self, **kwargs) -> str:
LOGGER.debug(f'writing results: {successful_features_file} && {failed_features_file}')
FileUtils.write_json(successful_features_file, successful_item_collections.to_dict(False))
FileUtils.write_json(failed_features_file, failed_item_collections.to_dict(False))
s3_url = self.__s3.upload(successful_features_file, self.__staging_bucket,
self.__result_path_prefix,
s3_url = self.__s3.upload(successful_features_file, self._staging_bucket,
self._result_path_prefix,
s3_name=f'successful_features_{TimeUtils.get_current_time()}.json',
delete_files=self.__delete_files)
delete_files=self._delete_files)
LOGGER.debug(f'uploaded successful features to S3: {s3_url}')
LOGGER.debug(f'creating response catalog')
catalog_json = GranulesCatalog().update_catalog(catalog_file_path, [successful_features_file, failed_features_file])
@@ -1,8 +1,14 @@


class UploadGranulesFactory:
UPLOAD_S3_BY_STAC_CATALOG = 'UPLOAD_S3_BY_STAC_CATALOG'
UPLOAD_AUXILIARY_FILE_AS_GRANULE = 'UPLOAD_AUXILIARY_FILE_AS_GRANULE'

def get_class(self, upload_type):
if upload_type == UploadGranulesFactory.UPLOAD_S3_BY_STAC_CATALOG:
from cumulus_lambda_functions.stage_in_out.upload_granules_by_complete_catalog_s3 import UploadGranulesByCompleteCatalogS3
return UploadGranulesByCompleteCatalogS3()
if upload_type == UploadGranulesFactory.UPLOAD_AUXILIARY_FILE_AS_GRANULE:
from cumulus_lambda_functions.stage_in_out.upload_arbitrary_files_as_granules import UploadArbitraryFilesAsGranules
return UploadArbitraryFilesAsGranules()
raise ValueError(f'unknown upload_type: {upload_type}')
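A sketch of the factory dispatch with the new upload type; the factory's module path is assumed from its sibling imports in this package.

from cumulus_lambda_functions.stage_in_out.upload_granules_factory import UploadGranulesFactory  # assumed module path

uploader = UploadGranulesFactory().get_class(UploadGranulesFactory.UPLOAD_AUXILIARY_FILE_AS_GRANULE)
# -> UploadArbitraryFilesAsGranules; UPLOAD_S3_BY_STAC_CATALOG returns UploadGranulesByCompleteCatalogS3 as before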