diff --git a/.gitignore b/.gitignore index 2bbc3b6..f878a30 100644 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,7 @@ coverage.xml .pytest_cache/ coverage test-reports +tests/reports # Translations *.mo diff --git a/Dockerfile b/Dockerfile index 15e2a6a..f4cb090 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,15 +7,17 @@ FROM continuumio/miniconda3 WORKDIR "/home" -# Copy requirements into the container +# Copy Conda requirements into the container COPY ./conda_requirements.txt conda_requirements.txt -COPY ./pip_requirements.txt pip_requirements.txt # Create Conda environment RUN conda create -y --name subsetter --file conda_requirements.txt python=3.7 -q \ --channel conda-forge \ --channel defaults +# Copy additional Pip dependencies into the container +COPY ./pip_requirements.txt pip_requirements.txt + # Install additional Pip dependencies RUN conda run --name subsetter pip install --no-input -r pip_requirements.txt diff --git a/conda_requirements.txt b/conda_requirements.txt index 2f38271..1a67b24 100644 --- a/conda_requirements.txt +++ b/conda_requirements.txt @@ -1 +1,2 @@ # This file should contain any requirements installed via Conda +hdf5==1.12.0 diff --git a/pip_requirements.txt b/pip_requirements.txt index c3183f1..04e34ca 100644 --- a/pip_requirements.txt +++ b/pip_requirements.txt @@ -1,9 +1,10 @@ # This file should contain requirements to be installed via Pip. --extra-index-url https://maven.earthdata.nasa.gov/repository/python-repo/simple # Packages available only from Maven -sds-varinfo ~= 0.1.0 +sds-varinfo ~= 0.1.1 # Open source packages available from PyPI +netCDF4 ~= 1.5.6 numpy ~= 1.18.4 pyyaml ~= 5.4.0 harmony-service-lib ~= 1.0.4 diff --git a/pymods/geo_grid.py b/pymods/geo_grid.py new file mode 100644 index 0000000..1dd7fbd --- /dev/null +++ b/pymods/geo_grid.py @@ -0,0 +1,376 @@ +""" This module includes functions that support bounding box spatial subsets of + geographically gridded data. + + For a given list of required variables, as determined by the `sds-varinfo` + package, dimensions are first identified. Those dimensions that are + geographic are determined by checking their `units` metadata attribute. + The geographic dimension variables are fetched, in full, from the OPeNDAP + server, and the index ranges that correspond to the regions of these + variables within the specified bounding box are identified. All required + variables are then requested from the OPeNDAP server, only in the range of + the bounding box. + + If the bounding box crosses the longitudinal edge of the grid, the full + longitudinal range of each variable is retrieved. The ranges of data for + each variable outside of the bounding box are set to the variable fill + value. + + An example of this would be for the RSSMIF16D data which have a + grid with 0 ≤ longitude (degrees) < 360. The Harmony message will specify + a bounding box within -180 ≤ longitude (degrees) < 180. If the western edge + is west of the Prime Meridian and the eastern edge is east of it, then the + box will cross the RSSMIF16D grid edge. + + For example: [W, S, E, N] = [-20, -90, 20, 90] + +""" +from logging import Logger +from typing import Dict, List, Set + +from netCDF4 import Dataset, Variable +import numpy as np + +from harmony.util import Config +from varinfo import VarInfoFromDmr, VariableFromDmr + +from pymods.utilities import get_opendap_nc4 + + +def get_geo_bounding_box_subset(required_variables: Set[str], + dataset: VarInfoFromDmr, + bounding_box: List[float], url: str, + output_dir: str, logger: Logger, + access_token: str, config: Config) -> str: + """ Inspect the list of required variables to determine the geographic + dimensions. Then perform a pre-fetch of data to retrieve the full range + of those variables. From the full range, determine the index ranges for + the coordinate dimensions. + + Iterate through each variable and determine the index ranges for each. + If a dimension is a geographic coordinate, then use the index range + determined from the previous steps, otherwise retrieve the full range. + + If a request includes a bounding box crossing a longitude discontinuity + at either the Prime Meridian or Antimeridian, the western and eastern + extents of the bounding box will seem to be in the reverse order. In + this case, the full range of longitudes is requested for all variables + and the data outside of the bounding box are filled. + + """ + dimension_variables = get_required_dimensions(dataset, required_variables) + + geographic_dimensions = set( + dimension for dimension in dimension_variables + if is_geographic_coordinate(dataset.get_variable(dimension)) + ) + + if len(geographic_dimensions) > 0: + # Prefetch geographic dimension data from OPeNDAP + dimensions_path = get_opendap_nc4(url, geographic_dimensions, + output_dir, logger, access_token, + config) + + # Derive index ranges of the bounding box for each geographic variable + index_ranges = get_dimension_index_ranges(dimensions_path, dataset, + geographic_dimensions, + bounding_box) + else: + # There are no geographic dimensions for the required variables, so + # none of the variables have index ranges constraints. + index_ranges = {} + + # Cycle through all variables and map to correct dimension index ranges + variables_with_ranges = set( + add_index_range(variable, dataset, index_ranges) + for variable in required_variables + ) + + # Get full variable subset using derived index ranges: + output_path = get_opendap_nc4(url, variables_with_ranges, output_dir, + logger, access_token, config) + + # Filter the geographic index ranges to only include those to be filled + fill_ranges = {dimension: index_range + for dimension, index_range + in index_ranges.items() + if index_range[0] > index_range[1]} + + # If variables need filling (due to crossing longitude discontinuity) + # fill the gaps between the ranges in the bounding box + if len(fill_ranges) > 0: + fill_variables(output_path, dataset, required_variables, fill_ranges) + + return output_path + + +def get_required_dimensions(dataset: VarInfoFromDmr, + required_variables: Set[str]) -> Set[str]: + """ Return a single set of all variables that used as dimensions of any + variables being requested from OPeNDAP. + + """ + return set(dimension for variable in required_variables + for dimension in dataset.get_variable(variable).dimensions) + + +def get_dimension_index_ranges(dimensions_path: str, dataset: VarInfoFromDmr, + geographic_dimensions: Set[str], + bounding_box: List[float]) -> Dict[str, List[float]]: + """ Iterate through all geographic dimensions and extract the indices that + correspond to the minimum and maximum extents in that dimension. For + longitudes, it is assumed that the western extent should be considered + the minimum extent. If the bounding box crosses a longitude + discontinuity this will be later identified by the minimum extent index + being larger than the maximum extent index. + + The return value from this function is a dictionary that contains the + index ranges for each geographic dimension, such as: + + index_range = {'/latitude': [12, 34], '/longitude': [56, 78]} + + """ + index_ranges = {} + + with Dataset(dimensions_path, 'r') as dimensions_file: + for dimension in geographic_dimensions: + variable = dataset.get_variable(dimension) + if is_latitude(variable): + index_ranges[dimension] = get_dimension_index_range( + dimensions_file[dimension][:], bounding_box[1], + bounding_box[3] + ) + else: + # First, convert the bounding box western and eastern extents + # to match the valid range of the dimension data + west_extent, east_extent = get_bounding_box_longitudes( + bounding_box, dimensions_file[dimension] + ) + index_ranges[dimension] = get_dimension_index_range( + dimensions_file[dimension][:], west_extent, east_extent + ) + + return index_ranges + + +def get_dimension_index_range(dimension: np.ma.core.MaskedArray, + minimum_extent: float, + maximum_extent: float) -> List[int]: + """ Find the indices closest to the interpolated values of the minimum and + maximum extents in that dimension. + + Use of `numpy.interp` maps the dimension scale values to their index + values and then computes an interpolated index value that best matches + the bounding value (minimum_extent, maximum_extent) to a "fractional" + index value. Rounding that value gives the starting index value for the + cell that contains that bound. + + For a latitude dimension: + + * `minimum_extent` is the southern extent of the bounding box. + * `maximum_extent` is the northern extent of the bounding box. + + For a longitude dimension: + + * `minimum_extent` is the western extent of the bounding box. + * `maximum_extent` is the eastern extent of the bounding box. + + The input longitude extent values must conform to the valid range of + the native dimension data. + + """ + dimension_range = [minimum_extent, maximum_extent] + raw_indices = np.interp(dimension_range, dimension, + np.arange(dimension.size)) + minimum_index = int(np.rint(raw_indices[0])) + maximum_index = int(np.rint(raw_indices[1])) + + return [minimum_index, maximum_index] + + +def get_bounding_box_longitudes(bounding_box: List[float], + longitude: Variable) -> List[float]: + """ Ensure the bounding box longitude extents are in the valid range for + the longitude variable. The bounding box values are expected to range + from -180 ≤ longitude (degrees) < 180, whereas some collections have + grids with discontinuities at the Prime Meridian. + + The bounding box from the Harmony message is ordered: [W, S, E, N] + + """ + valid_range = get_valid_longitude_range(longitude) + + if valid_range[1] > 180: + # Discontinuity at Prime Meridian: 0 ≤ longitude (degrees) < 360 + western_box_extent = unwrap_longitude(bounding_box[0]) + eastern_box_extent = unwrap_longitude(bounding_box[2]) + else: + # Discontinuity at Antimeridian: -180 ≤ longitude (degrees) < 180 + western_box_extent = bounding_box[0] + eastern_box_extent = bounding_box[2] + + return [western_box_extent, eastern_box_extent] + + +def wrap_longitude(longitude: float) -> float: + """ Wrap longitude to be in the -180 ≤ longitude (degrees) < 180 range. + For longitudes already in this range, this is a no-op. + + """ + return ((longitude + 180) % 360) - 180 + + +def unwrap_longitude(wrapped_longitude: float) -> float: + """ Convert longitude from the -180 ≤ longitude (degrees) < 180 range to + 0 ≤ longitude (degrees) < 360. This allows that bounding box to be + converted from its native range to match that of collections in this + latter format (e.g., RSSMIF16D). The bounding box needs to be evaluated + in the same range as the collection's grid, to ensure the longitude + discontinuity is preserved and discontinuous array indices can be + identified. + + """ + return ((wrapped_longitude % 360) + 360) % 360 + + +def get_valid_longitude_range(longitude: Variable) -> List[float]: + """ Check the variable metadata for `valid_range` or `valid_min` and + `valid_max`. If no metadata data attributes indicating the valid range + are present, check if the data contain a value in the range + 180 < longitude < 360 to determine the adopted convention. + + The expected options are: + + * Discontinuity at Antimeridian: -180 ≤ longitude (degrees) < 180 + * Discontinuity at Prime Meridian: 0 ≤ longitude (degrees) < 360 + + """ + if hasattr(longitude, 'valid_range'): + valid_range = [longitude.valid_range[0], longitude.valid_range[1]] + elif hasattr(longitude, 'valid_min') and hasattr(longitude, 'valid_max'): + valid_range = [longitude.valid_min, longitude.valid_max] + elif np.max(longitude) > 180.0: + valid_range = [0.0, 360.0] + else: + valid_range = [-180.0, 180.0] + + return valid_range + + +def add_index_range(variable_name: str, dataset: VarInfoFromDmr, + index_ranges: Dict[str, List[float]]) -> str: + """ Append the index ranges of each dimension for the specified variable. + If there are no dimensions with listed index ranges, then the full + variable should be requested, and no index notation is required. + A variable with a bounding box crossing the edge of the grid (e.g., at + the Antimeridian or Prime Meridian) will have a minimum index greater + than the maximum index. In this case the full dimension range should be + requested, as the related values will be masked before returning the + output to the user. + + """ + variable = dataset.get_variable(variable_name) + + range_strings = [] + + for dimension in variable.dimensions: + dimension_range = index_ranges.get(dimension) + + if dimension_range is not None and dimension_range[0] <= dimension_range[1]: + range_strings.append(f'[{dimension_range[0]}:{dimension_range[1]}]') + else: + range_strings.append('[]') + + if all(range_string == '[]' for range_string in range_strings): + indices_string = '' + else: + indices_string = ''.join(range_strings) + + return f'{variable_name}{indices_string}' + + +def fill_variables(output_path: str, dataset: VarInfoFromDmr, + required_variables: Set[str], + fill_ranges: Dict[str, List[float]]) -> None: + """ Cycle through the output NetCDF-4 file and check the dimensions of + each variable. If the minimum index is greater than the maximum index + in the subset range, then the requested bounding box crossed an edge of + the grid, and must be filled in between those values. + + This function will only be called if there are variables that require + filling. + + """ + dimensions_to_fill = set(fill_ranges) + + with Dataset(output_path, 'a', format='NETCDF4') as output_dataset: + for variable_path in required_variables: + variable = dataset.get_variable(variable_path) + if len(dimensions_to_fill.intersection(variable.dimensions)) > 0: + fill_index_tuple = tuple( + get_fill_slice(variable, dimension, fill_ranges) + for dimension in variable.dimensions + ) + + output_dataset[variable_path][fill_index_tuple] = np.ma.masked + + +def get_fill_slice(variable: VariableFromDmr, dimension: str, + fill_ranges: Dict[str, List[float]]) -> slice: + """ Check the dictionary of dimensions that need to be filled for the + given dimension. If present, the minimum index will be greater than the + maximum index (the eastern edge of the bounding box will seem to be to + the west of the western edge due to crossing the grid edge). The region + to be filled is between these indices: + + * Start index = maximum index + 1. + * Stop index = minimum index. (As Python index slices go up to, but not + including, the stop index). + + If the dimension is not to be filled, return a `slice` with unspecified + start and stop. This is the equivalent of the full range in this + dimension. Slices for all variable dimensions will be combined to + identify the region of the variable to be filled, e.g.: + + * variable[(slice(None), slice(None), slice(start_lon, stop_lon))] = fill + + This is equivalent to: + + * science_variable[:][:][start_lon:stop_lon] = fill + + Note - longitude variables themselves will not be filled, to ensure + valid grid coordinates at all points of the science variables. + + """ + if dimension in fill_ranges and not is_longitude(variable): + fill_slice = slice(fill_ranges[dimension][1] + 1, + fill_ranges[dimension][0]) + else: + fill_slice = slice(None) + + return fill_slice + + +def is_geographic_coordinate(variable: VariableFromDmr) -> bool: + """ Use heuristics to determine if a variable is a geographic coordinate + based on its units. A latitude variable will have units 'degrees_north' + and a longitude variable will have units 'degrees_east'. + + + """ + return is_longitude(variable) or is_latitude(variable) + + +def is_latitude(variable: VariableFromDmr) -> bool: + """ Determine if the variable is a latitude based on the `units` metadata + attribute being 'degrees_north'. + + """ + return variable.attributes.get('units') == 'degrees_north' + + +def is_longitude(variable: VariableFromDmr) -> bool: + """ Determine if the variable is a longitude based on the `units` metadata + attribute being 'degrees_east'. + + """ + return variable.attributes.get('units') == 'degrees_east' diff --git a/pymods/subset.py b/pymods/subset.py index 21b59a7..926c755 100644 --- a/pymods/subset.py +++ b/pymods/subset.py @@ -5,27 +5,28 @@ """ from logging import Logger from typing import List -from urllib.parse import quote -from harmony.message import Variable +from harmony.message import Variable as HarmonyVariable from harmony.util import Config from varinfo import VarInfoFromDmr -from pymods.utilities import download_url +from pymods.geo_grid import get_geo_bounding_box_subset +from pymods.utilities import download_url, get_opendap_nc4 -def subset_granule( - url: str, - variables: List[Variable], - output_dir: str, - logger: Logger, - access_token: str = None, - config: Config = None) -> str: +def subset_granule(url: str, variables: List[HarmonyVariable], output_dir: str, + logger: Logger, access_token: str = None, + config: Config = None, bounding_box: List[float] = None) -> str: """ This function takes a granule's OPeNDAP URL and extracts the requested variables, and those sub-variables they depend upon (such as coordinates), to produce an output file with only those variables. The path of this output file is returned. + The optional `bounding_box` argument can be supplied for geographically + gridded data. In this case a bounding-box spatial subset will be + applied to the retrieved variables, in addition to only retrieving the + required variables. + """ granule_filename = url.split('?')[0].rstrip('/').split('/')[-1] logger.info(f'Performing variable subsetting on: {granule_filename}') @@ -53,12 +54,18 @@ def subset_granule( # TODO: Add switch mechanism for including (or not including) all metadata # variables in every subset request to OPeNDAP. - # Build the DAP4 format constraint expression, which is a semi-colon - # separated list of variable names. - constraint_expression = quote(';'.join(required_variables), safe='') - request_data = {'dap4.ce': constraint_expression} + if bounding_box is not None: + # Retrieve OPeNDAP data for a geographically gridded collection, that + # includes only the specified variables, and only in the ranges defined + # by the bounding box. + output_path = get_geo_bounding_box_subset(required_variables, dataset, + bounding_box, url, + output_dir, logger, + access_token, config) + else: + # Retrieve OPeNDAP data including only the specified variables (but in + # their full ranges). + output_path = get_opendap_nc4(url, required_variables, output_dir, + logger, access_token, config) - # Note: The empty string `data` argument ensures a POST request is used. - return download_url(f'{url}.dap.nc4', output_dir, logger, - access_token=access_token, config=config, - data=request_data) + return output_path diff --git a/pymods/utilities.py b/pymods/utilities.py index 11e6c70..4d823b1 100644 --- a/pymods/utilities.py +++ b/pymods/utilities.py @@ -4,8 +4,13 @@ """ from logging import Logger -from typing import Optional, Tuple +from os import sep +from os.path import splitext +from shutil import copy +from typing import Optional, Set, Tuple from urllib.error import HTTPError +from urllib.parse import quote +from uuid import uuid4 import mimetypes from harmony.util import Config, download as util_download @@ -39,13 +44,54 @@ def get_file_mimetype(file_name: str) -> Tuple[Optional[str]]: return mimetype -def download_url( - url: str, - destination: str, - logger: Logger, - access_token: str = None, - config: Config = None, - data=None) -> str: +def get_opendap_nc4(url: str, required_variables: Set[str], output_dir: str, + logger: Logger, access_token: str, config: Config) -> str: + """ Construct a semi-colon separated string of the required variables and + use as a constraint expression to retrieve those variables from + OPeNDAP. + + Returns the path of the downloaded granule containing those variables. + + """ + constraint_expression = get_constraint_expression(required_variables) + request_data = {'dap4.ce': constraint_expression} + + downloaded_nc4 = download_url(f'{url}.dap.nc4', output_dir, logger, + access_token=access_token, config=config, + data=request_data) + + # Rename output file, to ensure repeated data downloads to OPeNDAP will be + # respected by `harmony-service-lib-py`. + return move_downloaded_nc4(output_dir, downloaded_nc4) + + +def get_constraint_expression(variables: Set[str]) -> str: + """ Take a set of variables and return a URL encoded, semi-colon separated + DAP4 constraint expression to retrieve those variables. Each variable + may or may not specify their index ranges. + + """ + return quote(';'.join(variables), safe='') + + +def move_downloaded_nc4(output_dir: str, downloaded_file: str) -> str: + """ Change the basename of a NetCDF-4 file downloaded from OPeNDAP. The + `harmony-service-lib-py` produces a local filename that is a hex digest + of the requested URL only. If this filename is already present in the + local file system, `harmony-service-lib-py` assumes it does not need to + make another HTTP request, and just returns the constructed file path, + even if a POST request is being made with different parameters. + + """ + extension = splitext(downloaded_file)[1] + new_filename = sep.join([output_dir, f'{uuid4().hex}{extension}']) + copy(downloaded_file, new_filename) + return new_filename + + +def download_url(url: str, destination: str, logger: Logger, + access_token: str = None, config: Config = None, + data=None) -> str: """ Use built-in Harmony functionality to download from a URL. This is expected to be used for obtaining the granule `.dmr` and the granule itself (only the required variables). @@ -59,6 +105,10 @@ def download_url( """ logger.info(f'Downloading: {url}') + + if data is not None: + logger.info(f'POST request data: "{data}"') + request_completed = False attempts = 0 @@ -76,7 +126,6 @@ def download_url( ) request_completed = True except HTTPError as http_exception: - logger.info('In HTTPError except\n\n\n\n') if http_exception.code == 500 and attempts < HTTP_REQUEST_ATTEMPTS: logger.info('500 error returned, retrying request.') elif http_exception.code == 500: diff --git a/tests/data/f16_ssmis_20200102v7.nc b/tests/data/f16_ssmis_20200102v7.nc new file mode 100644 index 0000000..4e9bc62 Binary files /dev/null and b/tests/data/f16_ssmis_20200102v7.nc differ diff --git a/tests/data/rssmif16d_example.dmr b/tests/data/rssmif16d_example.dmr new file mode 100644 index 0000000..4ef4e86 --- /dev/null +++ b/tests/data/rssmif16d_example.dmr @@ -0,0 +1,272 @@ + + + + + + + + + degrees_north + + + latitude + + + Y + + + -89.875 + 89.875 + + + lat + + + + + + degrees_east + + + longitude + + + X + + + 0.125 + 359.875 + + + + + + hours since 2020-01-02T00:00:00Z + + + time + + + T + + + 1. + 2. + + + + + + + + hours since 2020-01-02T00:00:00Z + + + sst_dtime + + + SST_DTime + + + 0. + 24. + + + 0.1000000015 + + + 0. + + + + + + + + m/s + + + wind_speed + + + 10 meter Surface Wind Speed + + + 0. + 50. + + + 251 + 252 + 253 + 254 + 255 + + + missing_wind_speed_due_to_rain sea_ice bad_data no_observations land_mass + + + 0.200000003 + + + 0. + + + + + + + + kg m-2 + + + atmosphere_water_vapor_content + + + Columnar Water Vapor + + + 0. + 75. + + + 251 + 252 + 253 + 254 + 255 + + + missing_wind_speed_due_to_rain sea_ice bad_data no_observations land_mass + + + 0.3000000119 + + + 0. + + + + + + + + kg m-2 + + + atmosphere_cloud_liquid_water_content + + + Columnar Cloud Liquid Water + + + -0.05000000075 + 2.450000048 + + + 251 + 252 + 253 + 254 + 255 + + + missing_wind_speed_due_to_rain sea_ice bad_data no_observations land_mass + + + 0.009999999776 + + + -0.05000000075 + + + + + + + + mm/hr + + + rainfall_rate + + + Rain Rate + + + 0. + 25. + + + 251 + 252 + 253 + 254 + 255 + + + missing_wind_speed_due_to_rain sea_ice bad_data no_observations land_mass + + + 0.1000000015 + + + 0. + + + + 90 + + + Version-7 algorithm processing was performed during August 2011 and continues forward for active instruments. Use data files with a date of August 2011 or newer. The RSS SSMIS geophysical dataset consists of data derived from observations collected by SSMIS instruments carried onboard the DMSP series of polar orbiting satellites. These satellites are numbered: F16 SSMIS Oct 2003 to present, F17 SSMIS Dec 2006 to present, F18 SSMIS Nov 2009 to present. Daily files of SSMIS geophysical parameters consist of data mapped to a regular grid complete with data gaps between orbits. Two maps exist for each parameter, one of ascending orbit segments and the other of descending orbit segments. Data on each of the maps are overwritten at both the high latitudes (where successive orbits cross) and at the 'seam' or the region where the last orbit of the day overlaps the first orbit of the day. Water vapor and cloud liquid water are given in kg/m2 (which is equivalent to 1 mm or 0.1 g/cm2). + + + CF-1.6 + + + Starting in August 2011, all SSMIS data were reprocessed from a preliminary beta version to the official Version-7. Use data files that have a creation date of August 2011 or later. At GHRC, RSS binary files are translated to netCDF-4 for archive, and for distribution to netCDF users. RSS Provenance: processing location = Remote Sensing Systems, processing host = Ops1p, input files = SSMIS Level 2B data files (fSS_rRRRRR.dat, SS= sat number, RRRRR=orbital rev number), processing code name = ssmis_processor.exe, processing code version = 2011-08-08T02:55:51Z. GHRC Provenance: processing location = GHRC, processing host = breeze.nsstc.nasa.gov, processing environment = ops, binary input file name = f16_20200102v7.gz (daily), processing script name = RSSbinary2netCDF.jar, processing script version = 2012-07-01T23:55:36Z, configuration file name = SSMIS_Daily.xml, configuration file version = 2012-06-29T21:06:34Z + + + Remote Sensing Systems, Santa Rosa, California. Sponsored by the NASA Earth Science MEaSUREs DISCOVER Project. +Global Hydrology Resource Center, Huntsville, Alabama, a NASA earth science data center. + + + 2 + + + 1:ascending orbit segments. 2:descending orbit segments. + + + http://www.ssmi.com/ssmi/ssmi_description.html + + + DMSP-F16 + + + SSMIS data are scaled for storage and affect the resolution of the data: time: x0.1 hrs, wind: x0.2 m/s, vapor: x0.3 kg/m2, cloud: x0.01 kg/m2, rain: x0.1 mm + + + SSMIS + + + Data derived from observations collected by SSMIS instruments carried onboard the DMSP series of polar orbiting satellites(F16-F18) + + + RSS SSMIS Ocean Product Grids Daily from DMSP F16 netCDF + + + 251(1506 for time)=missing wind speed due to rain, or missing water vapor due to heavy rain. 252(1512 for time)=sea ice. 253(1518 for time)=SSMIS observations exist,but are bad. 254(1524 for time)=no SSMIS observations. 255(1530 for time)=land mass. + + + 2020-01-02T00:00:00Z + + + 2020-01-02T23:59:59Z + + + 10.5067/MEASURES/DMSP-F16/SSMIS/DATA301 + + + http://dx.doi.org + + diff --git a/tests/data/test_subsetter_config.yml b/tests/data/test_subsetter_config.yml new file mode 100644 index 0000000..d9b2a65 --- /dev/null +++ b/tests/data/test_subsetter_config.yml @@ -0,0 +1,94 @@ +Identification: var_subsetter_config +Version: 13 + +Collection_ShortName_Path: + - '/HDF5_GLOBAL/short_name' + - '/NC_GLOBAL/short_name' + - '/Metadata/DatasetIdentification/shortName' + - '/METADATA/DatasetIdentification/shortName' + - '/Metadata/SeriesIdentification/shortName' + - '/METADATA/SeriesIdentification/shortName' + - '/HDF5_GLOBAL/id' + - '/NC_GLOBAL/id' + - 'short_name' + +Mission: + 'ATL\d{2}': 'ICESat2' + +Excluded_Science_Variables: + - Applicability: + Mission: 'ICESat2' + Variable_Pattern: + - '/quality_assessment/.*' + - '/orbit_info/.*' + - '/atlas_impulse_response/.*' + +ProductEpochs: + - Applicability: + Mission: ICESat2 + Epoch: '2005-01-01T00:00:00.000000' + +CF_Overrides: + - Applicability: + +CF_Supplements: + - Applicability: + Mission: 'ICESat2' + ShortNamePath: 'ATL0[3-9]|ATL1[023]' # L1/L2 + Global_Attributes: + - Name: 'Data_Organization' + Value: 'h5_trajectory' + Applicability_Group: + - Applicability: + Variable_Pattern: '/gt[123][lr]/geolocation/.*' + Attributes: + - Name: 'ancillary_variables' + Value: 'podppd_flag' + - Applicability: + ShortNamePath: 'ATL03' + Variable_Pattern: '/gt[123][lr]/geophys_corr/.*' + Attributes: + - Name: 'subset_control_variables' + Value: '../geolocation/delta_time, ../geolocation/reference_photon_lat, ../geolocation/reference_photon_lon' + - Name: 'subset_control_type' + Value: 'coordinates' + - Applicability: + ShortNamePath: 'ATL03' + Variable_Pattern: '/gt[123][lr]/heights/.*' + Attributes: + - Name: 'subset_control_variables' + Value: '../geolocation/ph_index_beg, ../geolocation/segment_ph_cnt' + - Name: 'subset_control_type' + Value: 'fwd_segment_index' + - Applicability: + ShortNamePath: 'ATL03' + Variable_Pattern: '/gt[123][lr]/geolocation/ph_index_beg' + Attributes: + - Name: 'subset_control_variable_type' + Value: 'segment_index_beg' + - Applicability: + ShortNamePath: 'ATL03' + Variable_Pattern: '/gt[123][lr]/geolocation/ph_ind' + Attributes: + - Name: 'subset_control_variable_type' + Value: 'segment_index_cnt' + - Applicability: + ShortNamePath: 'ATL08' + Variable_Pattern: '/gt[123][lr]/signal_photons/.*' + Attributes: + - Name: 'subset_control_variables' + Value: '../land_segments/ph_ndx_beg, ../land_segments/n_seg_ph' + - Name: 'subset_control_type' + Value: 'fwd_segment_index' + - Applicability: + ShortNamePath: 'ATL08' + Variable_Pattern: '/gt[123][lr]/land_segments/ph_ndx_beg' + Attributes: + - Name: 'subset_control_variable_type' + Value: 'segment_index_beg' + - Applicability: + ShortNamePath: 'ATL08' + Variable_Pattern: '/gt[123][lr]/land_segments/n_seg_ph' + Attributes: + - Name: 'subset_control_variable_type' + Value: 'segment_index_cnt' diff --git a/tests/run_tests.sh b/tests/run_tests.sh index 3f9bfe3..7578ecf 100755 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -30,7 +30,7 @@ coverage report --omit="*tests/*" coverage html --omit="*tests/*" -d /home/tests/coverage # Run pylint -pylint pymods subsetter.py --disable=E0401 +pylint pymods subsetter.py --disable=E0401 --extension-pkg-whitelist=netCDF4 RESULT=$? RESULT=$((3 & $RESULT)) diff --git a/tests/test_subsetter.py b/tests/test_subsetter.py index b3e3313..1c60310 100755 --- a/tests/test_subsetter.py +++ b/tests/test_subsetter.py @@ -1,7 +1,7 @@ from shutil import rmtree from tempfile import mkdtemp from unittest import TestCase -from unittest.mock import ANY, patch +from unittest.mock import ANY, Mock, patch from urllib.parse import unquote import json @@ -37,19 +37,26 @@ def setUp(self): def tearDown(self): rmtree(self.tmp_dir) + @patch('pymods.utilities.uuid4') + @patch('pymods.utilities.copy') @patch('subsetter.mkdtemp') @patch('shutil.rmtree') + @patch('pymods.utilities.download_url') @patch('pymods.subset.download_url') @patch('harmony.util.stage') - def test_dmr_end_to_end(self, mock_stage, mock_download_subset, - mock_rmtree, mock_mkdtemp): + def test_non_geo_end_to_end(self, mock_stage, mock_download_dmr, + mock_download_data, mock_rmtree, mock_mkdtemp, + mock_copy, mock_uuid): """ Ensure the subsetter will run end-to-end, only mocking the HTTP responses, and the output interactions with Harmony. """ + mock_uuid.return_value = Mock(hex='uuid') mock_mkdtemp.return_value = self.tmp_dir dmr_path = write_dmr(self.tmp_dir, self.atl03_dmr) - mock_download_subset.side_effect = [dmr_path, 'opendap_url_subset.nc4'] + mock_download_dmr.return_value = dmr_path + mock_download_data.return_value = 'opendap_url_subset.nc4' + mock_copy.return_value = 'moved_url_subset.nc4' message_data = { 'sources': [{ @@ -68,7 +75,7 @@ def test_dmr_end_to_end(self, mock_stage, mock_download_subset, 'callback': 'https://example.com/', 'stagingLocation': 's3://example-bucket/', 'user': 'fhaise', - 'accessToken': 'fake-token', + 'accessToken': None } message = Message(json.dumps(message_data)) @@ -77,21 +84,25 @@ def test_dmr_end_to_end(self, mock_stage, mock_download_subset, mock_mkdtemp.assert_called_once() - self.assertEqual(mock_download_subset.call_count, 2) - mock_download_subset.assert_any_call(f'{self.granule_url}.dmr', - self.tmp_dir, - subsetter.logger, - access_token=message_data['accessToken'], - config=subsetter.config) - - mock_download_subset.assert_any_call(f'{self.granule_url}.dap.nc4', - self.tmp_dir, - subsetter.logger, - access_token=message_data['accessToken'], - config=subsetter.config, - data=ANY) - - post_data = mock_download_subset.call_args[1].get('data', {}) + mock_download_dmr.assert_called_once_with( + f'{self.granule_url}.dmr', + self.tmp_dir, + subsetter.logger, + access_token=message_data['accessToken'], + config=subsetter.config + ) + mock_download_data.assert_called_once_with( + f'{self.granule_url}.dap.nc4', + self.tmp_dir, + subsetter.logger, + access_token=message_data['accessToken'], + config=subsetter.config, + data=ANY + ) + mock_copy.assert_called_once_with('opendap_url_subset.nc4', + f'{self.tmp_dir}/uuid.nc4') + + post_data = mock_download_data.call_args[1].get('data', {}) self.assertIn('dap4.ce', post_data) decoded_constraint_expression = unquote(post_data['dap4.ce']) @@ -99,12 +110,20 @@ def test_dmr_end_to_end(self, mock_stage, mock_download_subset, self.assertCountEqual(requested_variables, self.expected_variables) mock_stage.assert_called_once_with( - 'opendap_url_subset.nc4', + f'{self.tmp_dir}/uuid.nc4', 'opendap_url__gt1r_geophys_corr_geoid.', 'application/x-netcdf4', location='s3://example-bucket/', logger=subsetter.logger) + def test_geo_end_to_end(self): + """ A placeholder test for DAS-1084, in which an end-to-end test should + be placed, ensuring a full run of the new HOSS functionality is + successful. + + """ + pass + @patch('subsetter.mkdtemp') @patch('shutil.rmtree') @patch('pymods.subset.download_url') @@ -117,7 +136,6 @@ def test_exception_handling(self, mock_stage, mock_download_subset, """ mock_mkdtemp.return_value = self.tmp_dir - dmr_path = write_dmr(self.tmp_dir, self.atl03_dmr) mock_download_subset.side_effect = Exception('Random error') message_data = { diff --git a/tests/unit/test_geo_grid.py b/tests/unit/test_geo_grid.py new file mode 100644 index 0000000..bff4a41 --- /dev/null +++ b/tests/unit/test_geo_grid.py @@ -0,0 +1,560 @@ +from logging import getLogger +from os import makedirs +from os.path import exists +from shutil import copy, rmtree +from unittest import TestCase +from unittest.mock import patch + +from netCDF4 import Dataset +import numpy as np + +from harmony.util import config +from varinfo import VarInfoFromDmr + +from pymods.geo_grid import (add_index_range, fill_variables, + get_bounding_box_longitudes, + get_dimension_index_ranges, + get_dimension_index_range, get_fill_slice, + get_geo_bounding_box_subset, + get_required_dimensions, + get_valid_longitude_range, + is_geographic_coordinate, is_latitude, + is_longitude, unwrap_longitude, wrap_longitude) + + +class TestGeoGrid(TestCase): + """ A class for testing functions in the pymods.utilities module. """ + @classmethod + def setUpClass(cls): + cls.config = config(validate=False) + cls.logger = getLogger('tests') + cls.dataset = VarInfoFromDmr('tests/data/rssmif16d_example.dmr', + cls.logger, + 'tests/data/test_subsetter_config.yml') + cls.test_dir = 'tests/output' + + def tearDown(self): + if exists(self.test_dir): + rmtree(self.test_dir) + + @patch('pymods.geo_grid.fill_variables') + @patch('pymods.geo_grid.get_dimension_index_ranges') + @patch('pymods.geo_grid.get_opendap_nc4') + def test_get_geo_bounding_box_subset(self, mock_get_opendap_nc4, + mock_get_dimension_index_ranges, + mock_fill_variables): + """ Ensure a request to get a geographic bounding box will make the + expected requests to retrieve OPeNDAP data. The filling should not + be called as the bounding box does not cross the longitude + discontinuity at the edge of the grid. + + This test is primarily a unit test to ensure the correct branches + of code are exercised, calling other functions with the expected + arguments. For an end-to-end test see: `tests/test_subsetter.py`. + + """ + access_token = '0p3n5354m3!' + url = 'https://opendap.earthdata.nasa.gov/path/to/granule' + test_file = 'tests/data/f16_ssmis_20200102v7.nc' + makedirs(self.test_dir) + copy(test_file, self.test_dir) + + bounding_box = [20, 20, 40, 40] + required_variables = {'/rainfall_rate', '/latitude', '/longitude'} + mock_get_opendap_nc4.side_effect = ['dimensions.nc', 'final.nc'] + mock_get_dimension_index_ranges.return_value = {'/longitude': [80, 160], + '/latitude': [440, 520]} + + self.assertEqual( + get_geo_bounding_box_subset(required_variables, self.dataset, + bounding_box, url, self.test_dir, + self.logger, access_token, + self.config), + 'final.nc' + ) + + self.assertEqual(mock_get_opendap_nc4.call_count, 2) + mock_get_opendap_nc4.assert_any_call( + url, {'/latitude', '/longitude'}, self.test_dir, self.logger, + access_token, self.config + ) + mock_get_opendap_nc4.assert_any_call( + url, + {'/rainfall_rate[][440:520][80:160]', '/longitude[80:160]', + '/latitude[440:520]'}, + self.test_dir, self.logger, access_token, self.config + ) + mock_get_dimension_index_ranges.assert_called_once_with( + 'dimensions.nc', self.dataset, {'/latitude', '/longitude'}, + bounding_box + ) + mock_fill_variables.assert_not_called() + + @patch('pymods.geo_grid.fill_variables') + @patch('pymods.geo_grid.get_dimension_index_ranges') + @patch('pymods.geo_grid.get_opendap_nc4') + def test_get_geo_bounding_box_grid_edge(self, mock_get_opendap_nc4, + mock_get_dimension_index_ranges, + mock_fill_variables): + """ Ensure a request to get a geographic bounding box will make the + expected requests to retrieve OPeNDAP data. Also ensure that + because the bounding box crosses the edge of a grid, filling is + attempted. + + This test is primarily a unit test to ensure the correct branches + of code are exercised, calling other functions with the expected + arguments. For an end-to-end test see: `tests/test_subsetter.py`. + + """ + access_token = '0p3n5354m3!' + url = 'https://opendap.earthdata.nasa.gov/path/to/granule' + test_file = 'tests/data/f16_ssmis_20200102v7.nc' + makedirs(self.test_dir) + copy(test_file, self.test_dir) + + bounding_box = [-20, 0, 10, 30] + required_variables = {'/rainfall_rate', '/latitude', '/longitude'} + mock_get_opendap_nc4.side_effect = ['dimensions.nc', 'final.nc'] + mock_get_dimension_index_ranges.return_value = { + '/longitude': [1360, 40], + '/latitude': [360, 480] + } + + self.assertEqual( + get_geo_bounding_box_subset(required_variables, self.dataset, + bounding_box, url, self.test_dir, + self.logger, access_token, + self.config), + 'final.nc' + ) + + self.assertEqual(mock_get_opendap_nc4.call_count, 2) + mock_get_opendap_nc4.assert_any_call( + url, {'/latitude', '/longitude'}, self.test_dir, self.logger, + access_token, self.config + ) + mock_get_opendap_nc4.assert_any_call( + url, + {'/rainfall_rate[][360:480][]', '/longitude', + '/latitude[360:480]'}, + self.test_dir, self.logger, access_token, self.config + ) + mock_get_dimension_index_ranges.assert_called_once_with( + 'dimensions.nc', self.dataset, {'/latitude', '/longitude'}, + bounding_box + ) + mock_fill_variables.assert_called_once_with('final.nc', + self.dataset, + required_variables, + {'/longitude': [1360, 40]}) + + @patch('pymods.geo_grid.fill_variables') + @patch('pymods.geo_grid.get_dimension_index_ranges') + @patch('pymods.geo_grid.get_opendap_nc4') + def test_get_geo_bounding_box_no_geo(self, mock_get_opendap_nc4, + mock_get_dimension_index_ranges, + mock_fill_variables): + """ Ensure a request to get a geographic bounding box will make the + expected requests to retrieve OPeNDAP data. In this request there + are no requested geographic variables, so the function should + request the full range for all requested variables. + + This test is primarily a unit test to ensure the correct branches + of code are exercised, calling other functions with the expected + arguments. For an end-to-end test see: `tests/test_subsetter.py`. + + """ + access_token = '0p3n5354m3!' + url = 'https://opendap.earthdata.nasa.gov/path/to/granule' + test_file = 'tests/data/f16_ssmis_20200102v7.nc' + makedirs(self.test_dir) + copy(test_file, self.test_dir) + + bounding_box = [-20, 0, 10, 30] + required_variables = {'/time'} + + mock_get_opendap_nc4.return_value = 'final.nc' + mock_get_dimension_index_ranges.return_value = {} + + self.assertEqual( + get_geo_bounding_box_subset(required_variables, self.dataset, + bounding_box, url, self.test_dir, + self.logger, access_token, + self.config), + 'final.nc' + ) + + mock_get_opendap_nc4.assert_called_once_with( + url, {'/time'}, self.test_dir, self.logger, access_token, + self.config + ) + mock_get_dimension_index_ranges.assert_not_called() + mock_fill_variables.assert_not_called() + + def test_get_required_dimensions(self): + """ Ensure a set of required variables can be parsed and a set of + dimensions used by all of those variables is returned. + + """ + with self.subTest('Combines different dimensions'): + self.assertSetEqual( + get_required_dimensions(self.dataset, + {'/latitude', '/longitude'}), + {'/latitude', '/longitude'} + ) + + with self.subTest('Has a single entry for repeated dimensions'): + self.assertSetEqual( + get_required_dimensions(self.dataset, + {'/rainfall_rate', '/sst_dtime'}), + {'/latitude', '/longitude', '/time'} + ) + + def test_get_dimension_index_ranges(self): + """ Ensure that correct index ranges can be calculated for: + + - Latitude dimensions + - Longitude dimensions (continuous ranges) + - Longitude dimensions (bounding box crossing grid edge) + + """ + makedirs(self.test_dir) + test_file_name = f'{self.test_dir}/test.nc' + bounding_box = [-20, 45, 20, 85] + + with Dataset(test_file_name, 'w', format='NETCDF4') as test_file: + test_file.createDimension('latitude', size=180) + test_file.createDimension('longitude', size=360) + + test_file.createVariable('latitude', float, + dimensions=('latitude', )) + test_file['latitude'][:] = np.linspace(-89.5, 89.5, 180) + test_file['latitude'].setncatts({'units': 'degrees_north'}) + + test_file.createVariable('longitude', float, + dimensions=('longitude', )) + test_file['longitude'][:] = np.linspace(-179.5, 179.5, 360) + test_file['longitude'].setncatts({'units': 'degrees_east'}) + + with self.subTest('Latitude dimension'): + # latitude[134] = 44.5, latitude[174] = 84.5 + self.assertDictEqual( + get_dimension_index_ranges(test_file_name, self.dataset, + {'/latitude'}, bounding_box), + {'/latitude': [134, 174]} + ) + + with self.subTest('Longitude dimension, bounding box within grid'): + self.assertDictEqual( + get_dimension_index_ranges(test_file_name, self.dataset, + {'/longitude'}, bounding_box), + {'/longitude': [160, 200]} + ) + + with Dataset(test_file_name, 'a', format='NETCDF4') as test_file: + test_file['longitude'][:] = np.linspace(0.5, 359.5, 360) + + with self.subTest('Longitude, bounding box crosses grid edge'): + self.assertDictEqual( + get_dimension_index_ranges(test_file_name, self.dataset, + {'/longitude'}, bounding_box), + {'/longitude': [340, 20]} + ) + + def test_get_dimension_index_range(self): + """ Ensure the expected index values are retrieved for the minimum and + maximum values of an expected range. This should correspond to the + nearest integer, to ensure partial pixels are included in a + bounding box spatial subset. List elements must be integers for + later array slicing. + + """ + data = np.linspace(0, 200, 101) + dimension_data = np.ma.masked_array(data=data, mask=False) + minimum_extent = 39 + maximum_extent = 174.3 + + # data[20] = 40.0, data[87] = 174.0 + expected_results = [20, 87] + + results = get_dimension_index_range(dimension_data, minimum_extent, + maximum_extent) + + self.assertIsInstance(results[0], int) + self.assertIsInstance(results[1], int) + self.assertListEqual(results, expected_results) + + def test_get_bounding_box_longitudes(self): + """ Ensure the western and eastern extents of a bounding box are + converted to the correct range according to the range of the + longitude variable. + + If the variable range is -180 ≤ longitude (degrees) < 180, then the + bounding box values should remain un converted. If the variable + range is 0 ≤ longitude (degrees) < 360, then the bounding box + values should be converted to this range. + + """ + bounding_box = [-150, -15, -120, 15] + + dataset = Dataset('test.nc', 'w', diskless=True) + dataset.createDimension('longitude', size=361) + + test_args = [ + ['-180 ≤ lon (deg) < 180', 'lon_wrap', -180, 180, [-150, -120]], + ['0 ≤ lon (deg) < 360', 'lon_unwrap', 0, 360, [210, 240]], + ] + + for description, var_name, valid_min, valid_max, results in test_args: + with self.subTest(description): + data = np.linspace(valid_min, valid_max, 361) + longitude = dataset.createVariable(var_name, data.dtype, + dimensions=('longitude', )) + longitude[:] = data + longitude.setncatts({'valid_min': valid_min, + 'valid_max': valid_max}) + + self.assertListEqual( + get_bounding_box_longitudes(bounding_box, dataset[var_name]), + results + ) + + dataset.close() + + def test_wrap_longitude(self): + """ Ensure that longitudes are correctly mapped to the + -180 ≤ longitude (degrees) < 180 range. + + `TestCase.assertAlmostEqual` rounds to 7 decimal places. + + """ + longitudes = [['Needs wrapping', 190.0, -170.0], + ['Already wrapped', 123.45, 123.45]] + + for description, longitude, expected_longitude in longitudes: + with self.subTest(description): + self.assertAlmostEqual(wrap_longitude(longitude), + expected_longitude) + + def test_unwrap_longitudes(self): + """ Ensure that longitudes are correctly mapped to the + 0 ≤ longitude (degrees) < 360 range. + + """ + longitudes = [['Needs unwrapping', -160.5, 199.5], + ['Already unwrapped', 12.34, 12.34]] + + for description, longitude, expected_longitude in longitudes: + with self.subTest(description): + self.assertAlmostEqual(unwrap_longitude(longitude), + expected_longitude) + + def test_get_valid_longitude_range(self): + """ Ensure the valid longitude can be extracted from either the + valid_range or valid_min and valid_max metadata attributes. Ensure + that, if these metadata attributes are absent, the longitude range + can be identified from the data themselves. + + """ + unwrapped_data = np.linspace(0, 360, 361) + wrapped_data = np.linspace(-180, 180, 361) + + with Dataset('tests.nc', 'w', diskless=True) as test_file: + test_file.createDimension('longitude', size=361) + + with self.subTest('valid_range metadata attribute is present'): + variable = test_file.createVariable('valid_range', + unwrapped_data.dtype, + dimensions=('longitude', )) + variable.setncatts({'valid_range': [0, 360]}) + self.assertListEqual(get_valid_longitude_range(variable), + [0, 360]) + + with self.subTest('valid_min and valid_max metadata attributes'): + variable = test_file.createVariable('valid_min_max', + wrapped_data.dtype, + dimensions=('longitude', )) + variable.setncatts({'valid_min': -180, 'valid_max': 180}) + self.assertListEqual(get_valid_longitude_range(variable), + [-180, 180]) + + with self.subTest('No metadata attributes, data > 180 degrees'): + variable = test_file.createVariable('from_data_0_360', + unwrapped_data.dtype, + dimensions=('longitude', )) + variable[:] = unwrapped_data + self.assertListEqual(get_valid_longitude_range(variable), + [0, 360]) + + with self.subTest('No metadata attributes, data ≤ 180 degrees'): + variable = test_file.createVariable('from_data_180_180', + wrapped_data.dtype, + dimensions=('longitude', )) + variable[:] = wrapped_data + self.assertListEqual(get_valid_longitude_range(variable), + [-180, 180]) + + with self.subTest('Only valid_min resorts to data range'): + variable = test_file.createVariable('only_valid_min', + unwrapped_data.dtype, + dimensions=('longitude', )) + variable.setncatts({'valid_min': 0}) + variable[:] = unwrapped_data + self.assertListEqual(get_valid_longitude_range(variable), + [0, 360]) + + with self.subTest('Only valid_max resorts to data range'): + variable = test_file.createVariable('only_valid_max', + unwrapped_data.dtype, + dimensions=('longitude', )) + variable.setncatts({'valid_max': 360}) + variable[:] = unwrapped_data + self.assertListEqual(get_valid_longitude_range(variable), + [0, 360]) + + def test_add_index_range(self): + """ Ensure the correct combinations of index ranges are added as + suffixes to the input variable based upon that variable's + dimensions. + + If a dimension range has the lower index > upper index, that + indicates the bounding box crosses the edge of the grid. In this + instance, the full range of the variable should be retrieved. + + The order of indices in RSSMIF16D is: (time, latitude, longitude) + + """ + with self.subTest('No index constraints'): + index_ranges = {} + self.assertEqual(add_index_range('/sst_dtime', self.dataset, + index_ranges), + '/sst_dtime') + + with self.subTest('With index constraints'): + index_ranges = {'/latitude': [12, 34], '/longitude': [45, 56]} + self.assertEqual(add_index_range('/sst_dtime', self.dataset, + index_ranges), + '/sst_dtime[][12:34][45:56]') + + with self.subTest('With a longitude crossing discontinuity'): + index_ranges = {'/latitude': [12, 34], '/longitude': [56, 5]} + self.assertEqual(add_index_range('/sst_dtime', self.dataset, + index_ranges), + '/sst_dtime[][12:34][]') + + def test_fill_variables(self): + """ Ensure only the expected variables are filled (e.g., those with + a longitude crossing the grid edge). Longitude variables should not + themselves be filled. + + """ + makedirs(self.test_dir) + input_file = 'tests/data/f16_ssmis_20200102v7.nc' + test_file = copy(input_file, self.test_dir) + fill_ranges = {'/longitude': [1400, 10]} + required_variables = {'/sst_dtime', '/wind_speed'} + + fill_variables(test_file, self.dataset, required_variables, + fill_ranges) + + with Dataset(test_file, 'r') as test_output, Dataset(input_file, 'r') as test_input: + # Assert none of the dimension variables are filled at any pixel + for variable_dimension in ['/time', '/latitude', '/longitude']: + data = test_output[variable_dimension][:] + self.assertFalse(np.any(data.mask)) + np.testing.assert_array_equal(test_input[variable_dimension], + test_output[variable_dimension]) + + # Assert the expected range of wind_speed and sst_dtime are filled + # but that rest of the variable matches the input file. + for variable in required_variables: + input_data = test_input[variable][:] + output_data = test_output[variable][:] + self.assertTrue(np.all(output_data[:][:][11:1400].mask)) + np.testing.assert_array_equal(output_data[:][:][:11], + input_data[:][:][:11]) + np.testing.assert_array_equal(output_data[:][:][1400:], + input_data[:][:][1400:]) + + # Assert a variable that wasn't to be filled isn't + rainfall_rate_in = test_input['/rainfall_rate'][:] + rainfall_rate_out = test_output['/rainfall_rate'][:] + np.testing.assert_array_equal(rainfall_rate_in, rainfall_rate_out) + + def test_get_fill_slice(self): + """ Ensure that a slice object is correctly formed for a requested + dimension. + + Longitude variables should not be filled, to ensure that there are + valid grid coordinates for science variables (sub-test 3). + + """ + fill_ranges = {'/longitude': [200, 15]} + longitude = self.dataset.get_variable('/longitude') + wind_speed = self.dataset.get_variable('/wind_speed') + + with self.subTest('An unfilled dimension returns slice(None).'): + self.assertEqual( + get_fill_slice(wind_speed, '/time', fill_ranges), + slice(None) + ) + + with self.subTest('A filled dimension returns slice(start, stop).'): + self.assertEqual( + get_fill_slice(wind_speed, '/longitude', fill_ranges), + slice(16, 200) + ) + + with self.subTest('A longitude variable returns slice(None).'): + self.assertEqual( + get_fill_slice(longitude, '/longitude', fill_ranges), + slice(None) + ) + + def test_is_geographic_coordinate(self): + """ Ensure a latitude and longitude can be correctly determined as + geographic. + + """ + with self.subTest('A latitude variable returns True'): + self.assertTrue( + is_geographic_coordinate(self.dataset.get_variable('/latitude')) + ) + + with self.subTest('A longitude variable returns True'): + self.assertTrue( + is_geographic_coordinate(self.dataset.get_variable('/longitude')) + ) + + with self.subTest('A non geographic dimension returns False'): + self.assertFalse( + is_geographic_coordinate(self.dataset.get_variable('/sst_dtime')) + ) + + def test_is_latitude(self): + """ Ensure a latitude variable is correctly identified, but that + longitudes or other non-latitudes are not misattributed. + + """ + with self.subTest('A latitude variable returns True'): + self.assertTrue(is_latitude(self.dataset.get_variable('/latitude'))) + + with self.subTest('A longitude variable returns False'): + self.assertFalse(is_latitude(self.dataset.get_variable('/longitude'))) + + with self.subTest('A non geographic dimension returns False'): + self.assertFalse(is_latitude(self.dataset.get_variable('/sst_dtime'))) + + def test_is_longitude(self): + """ Ensure a longitude variable is correctly identified, but that + latitudes or other non-longitudes are not misattributed. + + """ + with self.subTest('A latitude variable returns False'): + self.assertFalse(is_longitude(self.dataset.get_variable('/latitude'))) + + with self.subTest('A longitude variable returns True'): + self.assertTrue(is_longitude(self.dataset.get_variable('/longitude'))) + + with self.subTest('A non geographic dimension returns False'): + self.assertFalse(is_longitude(self.dataset.get_variable('/sst_dtime'))) diff --git a/tests/unit/test_subset.py b/tests/unit/test_subset.py index d12ec47..d42aed6 100644 --- a/tests/unit/test_subset.py +++ b/tests/unit/test_subset.py @@ -1,6 +1,6 @@ from logging import Logger from unittest import TestCase -from unittest.mock import patch, ANY +from unittest.mock import ANY, Mock, patch import json import shutil from tempfile import mkdtemp @@ -35,36 +35,89 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.output_dir) + @patch('pymods.subset.get_geo_bounding_box_subset') @patch('pymods.subset.VarInfoFromDmr') + @patch('pymods.subset.get_opendap_nc4') @patch('pymods.subset.download_url') - def test_subset_granule(self, mock_download_url, mock_var_info_dmr): + def test_subset_granule_no_geo(self, mock_download_dmr, mock_get_opendap_data, + mock_var_info_dmr, mock_get_geo_subset): """ Ensure valid request does not raise exception, raise appropriate exception otherwise. Note: %2F is a URL encoded slash and %3B is a URL encoded semi-colon. + Because no bounding box is specified in this request, the HOSS + functionality in `pymods.geo_grid.py` should not be called. + """ - url = self.__class__.granule_url - mock_download_url.return_value = 'africa_subset.nc4' + url = self.granule_url + mock_download_dmr.return_value = 'africa_subset.dmr' + mock_get_opendap_data.return_value = 'africa_subset.nc4' - # Note: return value below is a list, not a set, so the order can be - # guaranteed in the assertions that the request to OPeNDAP was made - # with all required variables. - mock_var_info_dmr.return_value.get_required_variables.return_value = [ + mock_var_info_dmr.return_value.get_required_variables.return_value = { '/alpha_var', '/blue_var' - ] + } variables = self.message.sources[0].variables - with self.subTest('Succesful calls to OPeNDAP'): - output_path = subset_granule(url, variables, self.output_dir, self.logger) - self.assertEqual(mock_download_url.call_count, 2) - mock_download_url.assert_any_call(f'{url}.dmr', ANY, self.logger, - access_token=None, config=None) - mock_download_url.assert_any_call( - f'{url}.dap.nc4', - ANY, - self.logger, - access_token=None, - config=None, - data={'dap4.ce': '%2Falpha_var%3B%2Fblue_var'} - ) - self.assertIn('africa_subset.nc4', output_path) + output_path = subset_granule(url, variables, self.output_dir, + self.logger, access_token='access', + config=self.config) + + mock_download_dmr.assert_called_once_with(f'{url}.dmr', + ANY, + self.logger, + access_token='access', + config=self.config) + mock_get_opendap_data.assert_called_once_with(url, + {'/alpha_var', '/blue_var'}, + self.output_dir, + self.logger, + 'access', + self.config) + self.assertIn('africa_subset.nc4', output_path) + mock_get_geo_subset.assert_not_called() + + @patch('pymods.subset.get_geo_bounding_box_subset') + @patch('pymods.subset.VarInfoFromDmr') + @patch('pymods.subset.get_opendap_nc4') + @patch('pymods.subset.download_url') + def test_subset_granule_geo(self, mock_download_dmr, mock_get_opendap_data, + mock_var_info_dmr, mock_get_geo_subset): + """ Ensure valid request does not raise exception, + raise appropriate exception otherwise. + Note: %2F is a URL encoded slash and %3B is a URL encoded semi-colon. + + Because a bounding box is specified in this request, the HOSS + functionality in `pymods.geo_grid.py` should be called instead of + the `pymods.utilities.get_opendap_nc4` function directly. + + """ + url = self.granule_url + required_variables = {'/alpha_var', '/blue_var'} + bounding_box = [40, -30, 50, -20] + mock_download_dmr.return_value = 'africa_subset.dmr' + mock_get_geo_subset.return_value = 'africa_geo_subset.nc4' + + mock_var_info = Mock() + mock_var_info.get_required_variables.return_value = required_variables + mock_var_info_dmr.return_value = mock_var_info + + variables = self.message.sources[0].variables + + output_path = subset_granule(url, variables, self.output_dir, + self.logger, access_token='access', + config=self.config, + bounding_box=bounding_box) + + mock_download_dmr.assert_called_once_with(f'{url}.dmr', + self.output_dir, + self.logger, + access_token='access', + config=self.config) + mock_get_geo_subset.assert_called_once_with({'/alpha_var', '/blue_var'}, + mock_var_info, + bounding_box, url, + self.output_dir, + self.logger, 'access', + self.config) + self.assertIn('africa_geo_subset.nc4', output_path) + mock_get_opendap_data.assert_not_called() diff --git a/tests/unit/test_utilities.py b/tests/unit/test_utilities.py index bb6c35d..74e070f 100644 --- a/tests/unit/test_utilities.py +++ b/tests/unit/test_utilities.py @@ -1,13 +1,14 @@ -from logging import Logger +from logging import getLogger from unittest import TestCase -from unittest.mock import patch +from unittest.mock import Mock, patch from urllib.error import HTTPError from harmony.util import config from pymods.exceptions import UrlAccessFailed, UrlAccessFailedWithRetries -from pymods.utilities import (download_url, get_file_mimetype, - HTTP_REQUEST_ATTEMPTS) +from pymods.utilities import (download_url, get_constraint_expression, + get_file_mimetype, get_opendap_nc4, + HTTP_REQUEST_ATTEMPTS, move_downloaded_nc4) class TestUtilities(TestCase): @@ -15,11 +16,8 @@ class TestUtilities(TestCase): @classmethod def setUpClass(cls): - cls.namespace = 'namespace_string' - - def setUp(self): - self.logger = Logger('tests') - self.config = config(validate=False) + cls.config = config(validate=False) + cls.logger = getLogger('tests') def test_get_file_mimetype(self): """ Ensure a mimetype can be retrieved for a valid file path or, if @@ -48,6 +46,7 @@ def test_download_url(self, mock_util_download): """ output_directory = 'output/dir' test_url = 'test.org' + test_data = {'dap4.ce': '%2Flatitude%3B%2Flongitude'} access_token = 'xyzzy' message_retry = 'Internal Server Error' message_other = 'Authentication Error' @@ -72,6 +71,23 @@ def test_download_url(self, mock_util_download): mock_util_download.reset_mock() + with self.subTest('A request with data passes the data to Harmony.'): + mock_util_download.return_value = http_response + response = download_url(test_url, output_directory, self.logger, + access_token, self.config, data=test_data) + + self.assertEqual(response, http_response) + mock_util_download.assert_called_once_with( + test_url, + output_directory, + self.logger, + access_token=access_token, + data=test_data, + cfg=self.config + ) + + mock_util_download.reset_mock() + with self.subTest('500 error triggers a retry.'): mock_util_download.side_effect = [http_error_retry, http_response] @@ -95,3 +111,78 @@ def test_download_url(self, mock_util_download): download_url(test_url, output_directory, self.logger) self.assertEqual(mock_util_download.call_count, HTTP_REQUEST_ATTEMPTS) + + @patch('pymods.utilities.move_downloaded_nc4') + @patch('pymods.utilities.util_download') + def test_get_opendap_nc4(self, mock_download, mock_move_download): + """ Ensure a request is sent to OPeNDAP that combines the URL of the + granule with a constraint expression. + + Once the request is completed, the output file should be moved to + ensure a second request to the same URL is still performed. + + """ + downloaded_file_name = 'output_file.nc4' + moved_file_name = 'moved_file.nc4' + mock_download.return_value = downloaded_file_name + mock_move_download.return_value = moved_file_name + + url = 'https://opendap.earthdata.nasa.gov/granule' + required_variables = {'variable'} + output_dir = '/path/to/temporary/folder/' + access_token = 'secret_token!!!' + expected_data = {'dap4.ce': 'variable'} + + output_file = get_opendap_nc4(url, required_variables, output_dir, + self.logger, access_token, self.config) + + self.assertEqual(output_file, moved_file_name) + mock_download.assert_called_once_with( + f'{url}.dap.nc4', output_dir, self.logger, + access_token=access_token, data=expected_data, cfg=self.config + ) + mock_move_download.assert_called_once_with(output_dir, + downloaded_file_name) + + def test_get_constraint_expression(self): + """ Ensure a correctly encoded DAP4 constraint expression is + constructed for the given input. + + URL encoding: + + - %2F = '/' + - %3A = ':' + - %3B = ';' + - %5B = '[' + - %5D = ']' + + Note - with sets, the order can't be guaranteed, so there are two + options for the combined constraint expression. + + """ + with self.subTest('No index ranges specified'): + self.assertIn( + get_constraint_expression({'/alpha_var', '/blue_var'}), + ['%2Falpha_var%3B%2Fblue_var', '%2Fblue_var%3B%2Falpha_var'] + ) + + with self.subTest('Variables with index ranges'): + self.assertIn( + get_constraint_expression({'/alpha_var[1:2]', '/blue_var[3:4]'}), + ['%2Falpha_var%5B1%3A2%5D%3B%2Fblue_var%5B3%3A4%5D', + '%2Fblue_var%5B3%3A4%5D%3B%2Falpha_var%5B1%3A2%5D'] + ) + + @patch('pymods.utilities.copy') + @patch('pymods.utilities.uuid4') + def test_move_downloaded_nc4(self, mock_uuid4, mock_copy): + """ Ensure a specified file is moved to the specified location. """ + mock_uuid4.return_value = Mock(hex='uuid4') + output_dir = '/tmp/path/to' + old_path = '/tmp/path/to/file.nc4' + + self.assertEqual(move_downloaded_nc4(output_dir, old_path), + '/tmp/path/to/uuid4.nc4') + + mock_copy.assert_called_once_with('/tmp/path/to/file.nc4', + '/tmp/path/to/uuid4.nc4')