import json
import logging
import os.path
from glob import glob
from multiprocessing import Manager

from cumulus_lambda_functions.cumulus_stac.granules_catalog import GranulesCatalog

from cumulus_lambda_functions.lib.aws.aws_s3 import AwsS3
from cumulus_lambda_functions.lib.processing_jobs.job_manager_memory import JobManagerMemory
from cumulus_lambda_functions.lib.processing_jobs.multithread_processor import MultiThreadProcessorProps, MultiThreadProcessor
from cumulus_lambda_functions.lib.processing_jobs.job_manager_abstract import JobManagerProps
from cumulus_lambda_functions.lib.utils.file_utils import FileUtils
from cumulus_lambda_functions.lib.processing_jobs.job_executor_abstract import JobExecutorAbstract
from cumulus_lambda_functions.lib.time_utils import TimeUtils
from cumulus_lambda_functions.stage_in_out.upload_granules_abstract import UploadGranulesAbstract
from pystac import Item, Asset, ItemCollection, Catalog, Link

LOGGER = logging.getLogger(__name__)


class UploadItemExecutor(JobExecutorAbstract):
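    """Job executor that uploads a single local file to S3 as a granule and records the resulting STAC item."""
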
    def __init__(self, result_list, error_list, collection_id, staging_bucket, retry_wait_time_sec, retry_times, delete_files: bool) -> None:
        super().__init__()
        self.__collection_id = collection_id
        self.__staging_bucket = staging_bucket
        self.__delete_files = delete_files

        self.__gc = GranulesCatalog()
        self.__result_list = result_list
        self.__error_list = error_list
        self.__s3 = AwsS3()
        self.__retry_wait_time_sec = retry_wait_time_sec
        self.__retry_times = retry_times

    def validate_job(self, job_obj):
        return True

    def generate_sample_stac(self, filepath: str):
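        """
        Create a placeholder STAC item for an arbitrary file.

        Geometry, bbox, and datetime are dummy values; the real metadata is limited to file size and
        checksum recorded via the STAC file extension (file:size, file:checksum). Two assets are
        declared: the data file itself and its companion <file>.stac.json metadata file, whose own
        size/checksum cannot be known yet.
        """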
        filename = os.path.basename(filepath)
        file_checksum = FileUtils.get_checksum(filepath, True)
        # https://github.com/stac-extensions/file
        # https://github.com/stac-extensions/file/blob/main/examples/item.json
        sample_stac_item = Item(
            id=f'{self.__collection_id}:{os.path.splitext(filename)[0]}',
            stac_extensions=["https://stac-extensions.github.io/file/v2.1.0/schema.json"],
            geometry={
                "type": "Point",
                "coordinates": [0.0, 0.0]
            },
            bbox=[0.0, 0.0, 0.0, 0.0],
            datetime=TimeUtils().parse_from_unix(0, True).get_datetime_obj(),
            properties={
                "start_datetime": TimeUtils.get_current_time(),
                "end_datetime": TimeUtils.get_current_time(),
                "created": TimeUtils.get_current_time(),
                "updated": TimeUtils.get_current_time(),
            },
            collection=self.__collection_id,
            assets={
                filename: Asset(
                    href=filepath,
                    roles=['data'],
                    title=os.path.basename(filename),
                    extra_fields={
                        'file:size': FileUtils.get_size(filepath),
                        'file:checksum': file_checksum,
                    },
                    description=f'size={FileUtils.get_size(filepath)};checksumType=md5;checksum={file_checksum}'),
                f'{filename}.stac.json': Asset(href=f'{filepath}.stac.json', roles=['metadata'], description='desc=metadata stac;size=-1;checksumType=md5;checksum=unknown'),  # How to update this? It's a circular dependency
            })

        return sample_stac_item

    def execute_job(self, job_obj, lock) -> bool:
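        """
        Upload one file (job_obj is the local file path) plus its generated .stac.json to the staging
        bucket and queue the updated STAC item on the result list. On any failure the error message is
        recorded in an 'upload_error' property and the item is queued on the error list instead.
        Always returns True.
        """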
        sample_stac_item = self.generate_sample_stac(job_obj)
        updating_assets = {}
        try:
            s3_url = self.__s3.upload(job_obj, self.__staging_bucket, f'{self.__collection_id}/{self.__collection_id}:{sample_stac_item.id}', self.__delete_files)
            updating_assets[os.path.basename(s3_url)] = s3_url
            uploading_current_granule_stac = f'{s3_url}.stac.json'
            self.__s3.set_s3_url(uploading_current_granule_stac)
            self.__s3.upload_bytes(json.dumps(sample_stac_item.to_dict(False, False), indent=4).encode())
            updating_assets[os.path.basename(uploading_current_granule_stac)] = uploading_current_granule_stac
            self.__gc.update_assets_href(sample_stac_item, updating_assets)
            self.__result_list.put(sample_stac_item.to_dict(False, False))
        except Exception as e:
            sample_stac_item.properties['upload_error'] = str(e)
            LOGGER.exception(f'error while processing: {job_obj}')
            self.__error_list.put(sample_stac_item.to_dict(False, False))
        return True


class UploadArbitraryFilesAsGranules(UploadGranulesAbstract):
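    """
    Stage-in/out implementation that uploads every file found under BASE_DIRECTORY as its own granule,
    generating a minimal STAC item per file and writing successful/failed feature collections to the
    output directory.
    """
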
    BASE_DIRECTORY = 'BASE_DIRECTORY'

    def __init__(self):
        super().__init__()
        self.__s3 = AwsS3()

    def upload(self, **kwargs) -> str:
        """
        1. Use glob to find every file under BASE_DIRECTORY (recursively).
        2. Upload each file to the staging bucket and create a <file>.stac.json for it.
        3. The collection ID (which encodes tenant + venue) and other settings come from the environment.
        4. Write successful_features.json and failed_features.json, then return a response catalog.
        :param kwargs:
        :return: JSON string of the response STAC catalog
        """
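        # Rough usage sketch (hypothetical values; besides BASE_DIRECTORY, the remaining settings such
        # as collection ID, staging bucket, and parallel count are read from the environment by the
        # base class's _set_props_from_env, and the exact key names come from UploadGranulesAbstract):
        #   os.environ['BASE_DIRECTORY'] = '/tmp/granules'
        #   catalog_json_str = UploadArbitraryFilesAsGranules().upload()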
        self._set_props_from_env()
        output_dir = os.environ.get(self.OUTPUT_DIRECTORY)
        if not FileUtils.dir_exist(output_dir):
            raise ValueError(f'OUTPUT_DIRECTORY: {output_dir} does not exist')
        missing_keys = [k for k in [self.BASE_DIRECTORY] if k not in os.environ]
        if len(missing_keys) > 0:
            raise ValueError(f'missing environment keys: {missing_keys}')
        base_dir = os.environ.get(self.BASE_DIRECTORY)
        possible_files = [k for k in glob(os.path.join(base_dir, '**'), recursive=True) if os.path.isfile(k)]

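        # Process-safe queues that collect successful and failed STAC items from the executors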
        local_items = Manager().Queue()
        error_list = Manager().Queue()

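        # With a single worker, run the uploads inline; otherwise fan the files out to worker threads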
        if self._parallel_count == 1:
            for each_child in possible_files:
                temp_job = UploadItemExecutor(local_items, error_list, self._collection_id, self._staging_bucket, self._retry_wait_time_sec, self._retry_times, self._delete_files)
                temp_job.execute_job(each_child, None)
        else:
            job_manager_props = JobManagerProps()
            for each_child in possible_files:
                job_manager_props.memory_job_dict[each_child] = each_child
            # https://www.infoworld.com/article/3542595/6-python-libraries-for-parallel-processing.html
            multithread_processor_props = MultiThreadProcessorProps(self._parallel_count)
            multithread_processor_props.job_manager = JobManagerMemory(job_manager_props)
            multithread_processor_props.job_executor = UploadItemExecutor(local_items, error_list, self._collection_id, self._staging_bucket, self._retry_wait_time_sec, self._retry_times, self._delete_files)
            multithread_processor = MultiThreadProcessor(multithread_processor_props)
            multithread_processor.start()

        LOGGER.debug('finished uploading all granules')
        dapa_body_granules = []
        while not local_items.empty():
            dapa_body_granules.append(local_items.get())

        errors = []
        while not error_list.empty():
            errors.append(error_list.get())
        LOGGER.debug(f'successful count: {len(dapa_body_granules)}. failed count: {len(errors)}')
        successful_item_collections = ItemCollection(items=dapa_body_granules)
        failed_item_collections = ItemCollection(items=errors)
        successful_features_file = os.path.join(output_dir, 'successful_features.json')
        failed_features_file = os.path.join(output_dir, 'failed_features.json')
        LOGGER.debug(f'writing results: {successful_features_file} && {failed_features_file}')
        FileUtils.write_json(successful_features_file, successful_item_collections.to_dict(False))
        FileUtils.write_json(failed_features_file, failed_item_collections.to_dict(False))
        s3_url = self.__s3.upload(successful_features_file, self._staging_bucket,
                                  self._result_path_prefix,
                                  s3_name=f'successful_features_{TimeUtils.get_current_time()}.json',
                                  delete_files=self._delete_files)
        LOGGER.debug(f'uploaded successful features to S3: {s3_url}')
        LOGGER.debug('creating response catalog')
        catalog = Catalog(
            id='NA',
            description='NA')
        catalog.add_link(Link('item', successful_features_file, 'application/json'))
        catalog.add_link(Link('item', failed_features_file, 'application/json'))
        catalog_json = catalog.to_dict(False, False)
        LOGGER.debug(f'catalog_json: {catalog_json}')
        return json.dumps(catalog_json)