Skip to content

Commit

Permalink
HGA - Further code clean-up.
Browse files Browse the repository at this point in the history
  • Loading branch information
owenlittlejohns authored Jan 3, 2023
1 parent 57cd778 commit cee4eb6
Show file tree
Hide file tree
Showing 7 changed files with 347 additions and 142 deletions.
11 changes: 11 additions & 0 deletions CHANGE.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
[version 1.2.11] 2022-12-23
* Refactor `HarmonyAdapter.process_item` to move staging and STAC record
generation to a separate class method.
* Refactor `HarmonyAdapter.process_zip` and `HarmonyAdapter.unpack_zipfile` to
remove redundant variables.
* Refactor `get_files_from_unzipfiles` to capture all NetCDF-4 and GeoTIFF
extension options, while implementing list comprehensions for increased
readability.
* Attempt to add more type hints and documentation strings, to enable better
understanding of the code.

[version 1.2.10] 2022-12-16
* Migrate `calc_subset_envelope_window` and `box_to_shapefile` out of HarmonyAdapter.
* Increased unit test coverage.
Expand Down
11 changes: 10 additions & 1 deletion gdal_subsetter/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,19 @@ def __init__(self, file_format):


class IncompatibleVariablesError(HGAException):
    """ This exception is raised when the dataset variables requested are not
        compatible, i.e. they have different projections, geotransforms, sizes
        or data types.

        The supplied `message` is appended to a fixed
        "Incompatible variables: " prefix before being passed to the parent
        exception class.

    """
    def __init__(self, message):
        super().__init__(f'Incompatible variables: {message}')


class MultipleZippedNetCDF4FilesError(HGAException):
    """ Raised when the zip file given to HGA as input holds more than one
        NetCDF-4 file, because such files cannot be aggregated.

        `zip_file` is interpolated into the message passed to the parent
        exception class.

    """
    def __init__(self, zip_file):
        error_message = f'Multiple NetCDF-4 files within input: {zip_file}.'
        super().__init__(error_message)
258 changes: 152 additions & 106 deletions gdal_subsetter/transform.py

Large diffs are not rendered by default.

83 changes: 55 additions & 28 deletions gdal_subsetter/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from os import rename
from os.path import dirname, exists, join as path_join, splitext
from typing import List, Optional
import re

from harmony.util import generate_output_filename
from osgeo import gdal
Expand Down Expand Up @@ -57,41 +56,69 @@ def get_version() -> str:
return version


def get_files_from_unzipfiles(extract_dir: str, filetype: str,
variables=None) -> List[str]:
def has_world_file(file_mime_type: str) -> bool:
    """ Determine if the given MIME type for a transformed output is expected
        to have an accompanying ESRI world file.

        The check is a case-insensitive substring match: a MIME type that
        contains "png" or "jpeg" returns `True`, anything else returns
        `False`.

    """
    # Lower-case once, rather than once per candidate substring.
    lower_case_mime_type = file_mime_type.lower()

    return any(world_mime in lower_case_mime_type
               for world_mime in ('png', 'jpeg'))


def is_geotiff(file_name: str) -> bool:
    """ Check whether the first line of the `gdal.Info` report for the given
        file is exactly 'Driver: GTiff/GeoTIFF'.

    """
    first_info_line = gdal.Info(file_name).splitlines()[0]
    return first_info_line == 'Driver: GTiff/GeoTIFF'


def get_files_from_unzipfiles(
        extract_dir: str, file_type: str,
        variable_names: Optional[List[str]] = None) -> List[str]:
    """ Retrieve a filtered list of files that have been extracted from an
        input zip file based on a specific file type ('nc' or 'tif').

        Every extension in the module-level `known_file_types` mapping whose
        value equals `file_type` is used to glob `extract_dir`. The combined
        matches are returned in sorted order.

        If a list of variables names is specified, and the first variable name
        does not include "Band", the list of extracted files with the expected
        extension will be further filtered to only return those file names that
        include one of the requested variable names. Hyphens are converted to
        underscores in both the variable names and the file paths before they
        are compared.

        As currently called, requests to determine NetCDF-4 files will not
        include a filtering list of variable names.

        `variable_names` defaults to `None` (treated the same as an empty
        list) to avoid a mutable default argument.

    """
    matching_extensions = [file_extension for file_extension, known_file_type
                           in known_file_types.items()
                           if known_file_type == file_type]

    files_with_type = sorted(
        file_name
        for file_extension in matching_extensions
        for file_name in glob(path_join(extract_dir, f'*{file_extension}'))
    )

    # No variables, or "Band1"-style names: there is nothing to filter on.
    if not variable_names or 'Band' in variable_names[0]:
        return files_with_type

    formatted_variable_names = [variable_name.replace('-', '_')
                                for variable_name in variable_names]

    # NOTE(review): the comparison is against the full path, so a variable
    # name that also appears in `extract_dir` would match every file.
    return [
        file_name for file_name in files_with_type
        if any(variable_name in file_name.replace('-', '_')
               for variable_name in formatted_variable_names)
    ]


def rename_file(input_filename: str, stac_asset_href: str) -> str:
""" Rename a given file to a name determined for the input STAC Asset
by the harmony-service-lib Python library.
TODO: `generate_output_filename` should be called with appropriate
values for `variable_subset`, `is_regridded` and `is_subsetted`.
These kwargs allow the function to determine any required
suffixes for the file, e.g., `<input_file>_subsetted.nc4`.
This function is used to rename the input file downloaded to the
Docker container from a randomly generated temporary file name, to one
that pertains to the original STAC asset URL.
"""
output_filename = path_join(dirname(input_filename),
Expand Down
2 changes: 1 addition & 1 deletion requirements_dev.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
pylint~=2.12.2
pylint~=2.15.9
pytest==7.1.1
python-dotenv
pytest-cov
122 changes: 117 additions & 5 deletions tests/unit/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,28 @@
from tempfile import mkdtemp
from unittest import TestCase

from gdal_subsetter.utilities import get_file_type, OpenGDAL, rename_file
from gdal_subsetter.utilities import (get_file_type, get_files_from_unzipfiles,
has_world_file, is_geotiff, OpenGDAL,
rename_file)


class TestUtilities(TestCase):
""" A class to test the functions in the utilities.py module. """
@classmethod
def setUpClass(cls):
    """ Set granule paths once on the class, for use by every test. """
    granule_dir = 'tests/data/granules'
    uavsar_file = 'gulfco_32010_09045_001_090617_L090_CX_01_pauli.tif'
    sentinel_file = 'S1-GUNW-D-R-083-tops-20141116_20141023-095646-360325S_38126S-PP-24b3-v2_0_2.nc'

    cls.granule_dir = granule_dir
    cls.uavsar_granule = path_join(granule_dir, 'uavsar', uavsar_file)
    cls.sentinel_granule = path_join(granule_dir, 'gfrn', sentinel_file)

def setUp(self):
    """ Create a new temporary directory before each individual test. """
    scratch_directory = mkdtemp()
    self.temp_dir = scratch_directory

def tearDown(self):
Expand Down Expand Up @@ -76,14 +92,110 @@ def test_open_gdal(self):
access to a GeoTIFF file.
"""
uavsar_granule = ('tests/data/granules/uavsar/'
'gulfco_32010_09045_001_090617_L090_CX_01_pauli.tif')

with OpenGDAL(uavsar_granule) as uavsar_gdal_object:
with OpenGDAL(self.uavsar_granule) as uavsar_gdal_object:
gdal_metadata = uavsar_gdal_object.GetMetadata()

self.assertDictEqual(gdal_metadata,
{'AREA_OR_POINT': 'Area',
'TIFFTAG_RESOLUTIONUNIT': '1 (unitless)',
'TIFFTAG_XRESOLUTION': '1',
'TIFFTAG_YRESOLUTION': '1'})

def test_is_geotiff(self):
    """ `is_geotiff` should be true for the GeoTIFF test granule and false
        for the NetCDF-4 test granule.

    """
    with self.subTest('A GeoTIFF granule returns True'):
        geotiff_result = is_geotiff(self.uavsar_granule)
        self.assertTrue(geotiff_result)

    with self.subTest('A NetCDF-4 granules returns False'):
        netcdf4_result = is_geotiff(self.sentinel_granule)
        self.assertFalse(netcdf4_result)

def test_get_files_from_unzipfiles(self):
    """ Files extracted from a zip file should be selectable by extension
        and, when variables other than "Band1", "Band2", etc are given,
        narrowed further to the paths that match a variable name.

        Both ".nc" and ".nc4" files should count as type "nc", and both
        ".tif" and ".tiff" files should count as GeoTIFFs.

    """
    netcdf4_files = [path_join(self.temp_dir, base_name)
                     for base_name in ('granule_amplitude.nc',
                                       'granule_coherence.nc4',
                                       'granule_variable-one.nc',
                                       'granule_variable_two.nc')]
    geotiff_files = [path_join(self.temp_dir, base_name)
                     for base_name in ('granule_amplitude.tif',
                                       'granule_coherence.tiff')]

    # The files only need to exist, so each one just contains its own path.
    for placeholder_file in netcdf4_files + geotiff_files:
        with open(placeholder_file, 'a', encoding='utf-8') as file_handler:
            file_handler.write(placeholder_file)

    with self.subTest('No variables, all GeoTIFF files are retrieved.'):
        all_geotiffs = get_files_from_unzipfiles(self.temp_dir, 'tif',
                                                 variable_names=[])
        self.assertListEqual(all_geotiffs, geotiff_files)

    with self.subTest('No variables, all NetCDF-4 files are retrieved.'):
        all_netcdf4 = get_files_from_unzipfiles(self.temp_dir, 'nc')
        self.assertListEqual(all_netcdf4, netcdf4_files)

    with self.subTest('Only files matching variable names are retrieved.'):
        amplitude_only = get_files_from_unzipfiles(
            self.temp_dir, 'nc', variable_names=['amplitude']
        )
        self.assertListEqual(amplitude_only, [netcdf4_files[0]])

    with self.subTest('No files matching variables returns empty list.'):
        no_matches = get_files_from_unzipfiles(
            self.temp_dir, 'nc', variable_names=['wind_speed']
        )
        self.assertListEqual(no_matches, [])

    with self.subTest('Variable names are ignored if they contain "Band"'):
        band_results = get_files_from_unzipfiles(
            self.temp_dir, 'tif', variable_names=['Band1', 'Band2']
        )
        self.assertListEqual(band_results, geotiff_files)

    with self.subTest('Variable name hyphens converted to underscores.'):
        hyphenated_variable = get_files_from_unzipfiles(
            self.temp_dir, 'nc', variable_names=['variable-two']
        )
        self.assertListEqual(hyphenated_variable, [netcdf4_files[3]])

    with self.subTest('File name hyphens converted to underscores.'):
        hyphenated_file = get_files_from_unzipfiles(
            self.temp_dir, 'nc', variable_names=['variable_one']
        )
        self.assertListEqual(hyphenated_file, [netcdf4_files[2]])

def test_has_world_file(self):
    """ MIME types should be correctly classified as having, or not having,
        an associated ESRI world file.

    """
    expectations = (
        ('PNG returns True.', 'image/png', True),
        ('JPEG returns True.', 'image/jpeg', True),
        ('GeoTIFF returns False.', 'image/tiff', False),
        ('NetCDF-4 returns False.', 'application/x-netcdf4', False),
    )

    for description, mime_type, expects_world_file in expectations:
        with self.subTest(description):
            if expects_world_file:
                self.assertTrue(has_world_file(mime_type))
            else:
                self.assertFalse(has_world_file(mime_type))
2 changes: 1 addition & 1 deletion version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.2.10
1.2.11

0 comments on commit cee4eb6

Please sign in to comment.