From 4209e02a5865f6ed62380605bbfcaec45faf936b Mon Sep 17 00:00:00 2001 From: Russell Desiderio Date: Tue, 16 May 2017 17:54:26 -0700 Subject: [PATCH] BOTPT: Added coverage thresholds for calculating L2 products --- ion_functions/data/prs_functions.py | 419 +++++++++++------- ion_functions/data/test/test_prs_functions.py | 116 ++++- 2 files changed, 341 insertions(+), 194 deletions(-) diff --git a/ion_functions/data/prs_functions.py b/ion_functions/data/prs_functions.py index 007cba6..20e4dcb 100755 --- a/ion_functions/data/prs_functions.py +++ b/ion_functions/data/prs_functions.py @@ -12,9 +12,8 @@ import scipy.io as spio from scipy import signal - """ - Listing of functions, in order encountered. + Listing of BOTPT functions, in order encountered. Functions calculating data products. @@ -41,7 +40,8 @@ BOTSFLU: - anchor_bin + anchor_bin_raw_data_to_15s + anchor_bin_detided_data_to_24h calc_meandepth_plus calculate_sliding_means calculate_sliding_slopes @@ -53,7 +53,6 @@ prs_tsunami_detection -- event notification specified by DPS prs_eruption_imminent -- event notification specified by DPS prs_eruption_occurred -- event notification specified by DPS - calculate_sliding_slopes """ @@ -247,8 +246,9 @@ def prs_bottilt_tdir(x_tilt, y_tilt, ccmp): # #return np.round(tdir) - # The preceding calculation is faster and simpler if the arctan2 function is used. - # Use 450 as an addend in the first argument to the mod function to make sure the result is positive. + # The calculation is faster and simpler if the arctan2 function is used. + # Use 450=90+360 as the addend in the first argument to the mod(x,360) + # function to make sure the result is positive. return np.round(np.mod(450 - np.degrees(np.arctan2(y_tilt, x_tilt)) + ccmp, 360)) @@ -266,15 +266,18 @@ def prs_botsflu_time15s(timestamp, botpres): Implemented by: 2015-01-13: Russell Desiderio. Initial code. + 2017-05-15: Russell Desiderio. Included botpres as an input argument so that + timestamps associated with bad rawdata values could be deleted. Usage - time15s = prs_botsflu_time15s(timestamp) + time15s = prs_botsflu_time15s(timestamp, botpres) where time15s = BOTSFLU-TIME15S-AUX [sec since 01-01-1900] timestamp = OOI system timestamps [sec since 01-01-1900] + botpres = BOTPRES_L1 [psia] Notes: @@ -290,8 +293,8 @@ def prs_botsflu_time15s(timestamp, botpres): OOI (2015). Data Product Specification for Seafloor Uplift and Subsidence (BOTSFLU) from the BOTPT instrument. Document Control Number 1341-00080. """ - ###### [in development]: the second calling argument is a placeholder - time15s, _, _ = anchor_bin_rawdata_to_15s(timestamp, botpres) + # botpres is required to eliminate timestamps of bad input values + time15s, _, _ = anchor_bin_raw_data_to_15s(timestamp, botpres) return time15s @@ -325,7 +328,7 @@ def prs_botsflu_meanpres(timestamp, botpres): OOI (2015). Data Product Specification for Seafloor Uplift and Subsidence (BOTSFLU) from the BOTPT instrument. Document Control Number 1341-00080. """ - _, meanpres, _ = anchor_bin_rawdata_to_15s(timestamp, botpres) + _, meanpres, _ = anchor_bin_raw_data_to_15s(timestamp, botpres) return meanpres @@ -560,14 +563,16 @@ def prs_botsflu_time24h(time15s): Calculates the auxiliary BOTSFLU data product TIME24H-AUX. These are timestamps anchored at midnight which correspond to the time base for - the BOTSFLU data products which are binned on a day's worth of data. + the BOTSFLU data products which are binned on a day's worth of data + (from noon to noon). Implemented by: 2015-01-14: Russell Desiderio. Initial code. 2017-05-05: Russell Desiderio. Changed time24h time base to span the entire dataset including data gaps. This change is - made in the function anchor_bin_detided_to_24h. + made in the function call to + anchor_bin_detided_data_to_24h. Usage @@ -590,8 +595,8 @@ def prs_botsflu_time24h(time15s): OOI (2015). Data Product Specification for Seafloor Uplift and Subsidence (BOTSFLU) from the BOTPT instrument. Document Control Number 1341-00080. """ - # the second calling argument is a placeholder - time24h, _ = anchor_bin_detided_to_24h(time15s, None, None) + # the second and third calling arguments are placeholders + time24h, _, _ = anchor_bin_detided_data_to_24h(time15s, None, None) return time24h @@ -608,22 +613,30 @@ def prs_botsflu_daydepth(timestamp, botpres, dday_coverage=0.90): 2015-01-14: Russell Desiderio. Initial code. 2017-05-05: Russell Desiderio. Changed time24h time base to span the entire dataset including data gaps. + 2017-05-11: Russell Desiderio. Incorporated daydepth coverage threshold. Usage - daydepth = prs_botsflu_daydepth(timestamp, botpres) + daydepth = prs_botsflu_daydepth(timestamp, botpres, dday_coverage) where daydepth = BOTSFLU-DAYDEPTH_L2 [m] timestamp = OOI system timestamps [sec since 01-01-1900] botpres = BOTPRES_L1 [psia] + dday_coverage = fractional coverage threshold, below which daydepth values + are assigned Nan values. Notes: The timebase data product associated with this data product is TIME24H. + Fractional coverage is calculated as the number of non-Nan depth values + within a bin divided by the maximum number of possible bin values. For + 15-second data points within 24-hour bins, this maximum number is + necessarily 86400/15 = 5760. + References: OOI (2015). Data Product Specification for Seafloor Uplift and Subsidence @@ -635,15 +648,11 @@ def prs_botsflu_daydepth(timestamp, botpres, dday_coverage=0.90): # bin the 15sec data into 24 hour bins so that the timestamps are at midnight. # to calculate daydepth, don't need the time24h timestamps. - _, daydepth = anchor_bin_detided_to_24h(time15s, meandepth, dday_coverage) + _, daydepth, _ = anchor_bin_detided_data_to_24h(time15s, meandepth, dday_coverage) # downstream data products no longer require the mask_nonzero variable return daydepth - #daydepth = calc_daydepth_plus(timestamp, botpres) - # - #return daydepth - def prs_botsflu_4wkrate(timestamp, botpres, dday_coverage=0.9, rate_coverage=0.75): """ @@ -659,16 +668,21 @@ def prs_botsflu_4wkrate(timestamp, botpres, dday_coverage=0.9, rate_coverage=0.7 dataset including data gaps. Therefore removed the last masking operation that removed the values at bins that had zero data. + 2017-05-12: Russell Desiderio. Incorporated daydepth and rate coverage thresholds. Usage - botsflu_4wkrate = pprs_botsflu_4wkrate(timestamp, botpres) + botsflu_4wkrate = pprs_botsflu_4wkrate(timestamp, botpres, dday_coverage, rate_coverage) where botsflu_4wkrate = BOTSFLU-4WKRATE_L2 [cm/yr] timestamp = CI system timestamps [sec since 01-01-1900] botpres = BOTPRES_L1 [psia] + dday_coverage = fractional daydepth coverage threshold, below which daydepth + values are assigned Nan values. + rate_coverage = fractional rate coverage threshold, below which 4wkrate data + product values are assigned Nan values. Notes: @@ -707,16 +721,21 @@ def prs_botsflu_8wkrate(timestamp, botpres, dday_coverage=0.9, rate_coverage=0.7 dataset including data gaps. Therefore removed the last masking operation that removed the values at bins that had zero data. + 2017-05-12: Russell Desiderio. Incorporated daydepth and rate coverage thresholds. Usage - botsflu_8wkrate = pprs_botsflu_8wkrate(timestamp, botpres) + botsflu_8wkrate = pprs_botsflu_8wkrate(timestamp, botpres, dday_coverage, rate_coverage) where botsflu_8wkrate = BOTSFLU-8WKRATE_L2 [cm/yr] timestamp = OOI system timestamps [sec since 01-01-1900] botpres = BOTPRES_L1 [psia] + dday_coverage = fractional daydepth coverage threshold, below which daydepth + values are assigned Nan values. + rate_coverage = fractional rate coverage threshold, below which 8wkrate data + product values are assigned Nan values. Notes: @@ -744,65 +763,34 @@ def prs_botsflu_8wkrate(timestamp, botpres, dday_coverage=0.9, rate_coverage=0.7 #********************************************************************** #.. BOTSFLU: Worker functions called by the data product functions #********************************************************************** -def anchor_bin_rawdata_to_15s(time, data): +def anchor_bin_raw_data_to_15s(time, data): """ Description: - Calculates 'anchored' timestamps (see Notes) and binned data based on timestamps - in units of seconds since midnight. Written explicitly for the BOTSFLU DPA which - requires two stages of binning: 20hz data on 15 seconds, then the 15sec data on 24 hours. + Calculates 'anchored' timestamps (see Notes) and bins data based on system + timestamps in units of seconds. This routine is hard-coded to bin BOTPT 20hz + raw pressure data into 15 second bins. Implemented by: - 2015-01-13: Russell Desiderio. Initial code. - 2015-01-14: Russell Desiderio. Changed output arguments and incorporated conditionals - to improve program efficiency. - - Usage (1): - - bin_timestamps = anchor_bin(time, None, bin_duration, 'time') - - where - - bin_timestamps = 1D array of centered timestamps for non-empty bins - time = 1D array of timestamps, units of sec since 01-01-1900 - None = not used; python placeholder object - bin_duration = size of bin [s] - mode = the string 'time' - - Usage (2): - - binned_data, mask_nonzero = anchor_bin(time, data, bin_duration, 'data') - - where + 2017-05-05: Russell Desiderio. Initial code from modifying the function anchor_bin. + This function now traps out input nans and unphysical + raw pressure readings <= 0. - binned_data = 1D array of binned data; no empty bins are represented - mask_nonzero = boolean where True values represent locations of non-empty bins - time = 1D array of timestamps, units of sec since 01-01-1900 - data = data to be binned - bin_duration = size of bin [s] - mode = the string 'data' - - Usage (3): + Usage: - bin_timestamps, binned_data, mask_nonzero = anchor_bin(time, data, bin_duration, 'both') + bin_timestamps, binned_data, mask_nonzero = anchor_bin(time, data) where bin_timestamps = 1D array of centered timestamps for non-empty bins binned_data = 1D array of binned data; no empty bins are represented mask_nonzero = boolean where True values represent locations of non-empty bins - time = 1D array of timestamps, units of sec since 01-01-1900 + time = 1D array of (system) timestamps, units of sec since 01-01-1900 data = data to be binned - bin_duration = size of bin [s] - mode = the string 'both' Notes: - The conditional construction is used so that only necessary statements are executed; - when multiple years' worth of 20 Hz data is operated on, each np.bincount operation - may take multiple tens of seconds to execute. - The np.bincount routine is used in the same way accumarray in matlab is used to bin data. The key to the routine is to convert the timestamps into elapsed time in units of bin_duration and to construct bins based on the floored @@ -810,15 +798,15 @@ def anchor_bin_rawdata_to_15s(time, data): feature of the np.bincount function, as described in the example in the numpy.bincount documentation as listed in the References. - The BOTSFLU data products require binning at two stages. Bin results both with - and without empty bins are required. The output arguments have been selected to - provide this flexibility (in particular mask_nonzero). + Empty bins are not represented in the 15s timestamps nor in the data products + associated with these timestamps. The boolean array mask_nonzero spans the + entire time range (in units of 15s) of data into the DPA and has True values + according to the locations of the non-empty bins. - This routine has been constructed to supply 'anchored' timestamps. For example, - if the bin_duration is 86400 (the number of seconds in a day) then the start time - will be half a bin earlier than the first day of data (at noon) and all timestamps - will be 'anchored' at midnight. Similarly, if the bin_duration is 15 sec, all - timestamps will be at 00, 15, 30, and 45 seconds past the minute. + This routine has been constructed to supply centered timestamps 'anchored' at + each quadrant of the minute. With a hard-coded bin_duration of 15 sec, all + timestamps will be at 00, 15, 30, and 45 seconds past the minute. Each bin will + encompass data 7.5 seconds on either side of it. References: @@ -827,6 +815,14 @@ def anchor_bin_rawdata_to_15s(time, data): bin_duration = 15.0 # seconds half_bin = bin_duration/2.0 + # one nan value in a bin will nan out the sum of the values of that bin. + # throw out nan values and their associated timestamps. + # also throw out non-physical data values <= 0. + data[np.isnan(data)] = -999.0 # use any negative value + mask = (data > 0.0) + time = time[mask] + data = data[mask] + # anchor time-centered bins by determining the start time to be half a bin # before the first 'anchor timestamp', which will be an integral number of # bin_durations after midnight. @@ -854,65 +850,32 @@ def anchor_bin_rawdata_to_15s(time, data): return bin_timestamps, binned_data, mask_nonzero -def anchor_bin_detided_to_24h(time, data, dday_coverage): +def anchor_bin_detided_data_to_24h(time, data, dday_coverage): """ - Description: - - Calculates 'anchored' timestamps (see Notes) and binned data based on timestamps - in units of seconds since midnight. Written explicitly for the BOTSFLU DPA which - requires two stages of binning: 20hz data on 15 seconds, then the 15sec data on 24 hours. + Calculates 'anchored' timestamps (see Notes) and bins data based on system + timestamps in units of seconds. This routine is hard-coded to bin BOTPT 15s + pressure data into 24 hour bins (data product daydepth). Implemented by: - 2015-01-13: Russell Desiderio. Initial code. - 2015-01-14: Russell Desiderio. Changed output arguments and incorporated conditionals - to improve program efficiency. - - Usage (1): - - bin_timestamps = anchor_bin(time, None, bin_duration, 'time') - - where - - bin_timestamps = 1D array of centered timestamps for non-empty bins - time = 1D array of timestamps, units of sec since 01-01-1900 - None = not used; python placeholder object - bin_duration = size of bin [s] - mode = the string 'time' - - Usage (2): - - binned_data, mask_nonzero = anchor_bin(time, data, bin_duration, 'data') - - where - - binned_data = 1D array of binned data; no empty bins are represented - mask_nonzero = boolean where True values represent locations of non-empty bins - time = 1D array of timestamps, units of sec since 01-01-1900 - data = data to be binned - bin_duration = size of bin [s] - mode = the string 'data' + 2017-05-05: Russell Desiderio. Initial code from modifying the function anchor_bin. - Usage (3): + Usage: - bin_timestamps, binned_data, mask_nonzero = anchor_bin(time, data, bin_duration, 'both') + bin_timestamps, binned_data, bincount = anchor_bin(time, data, dday_coverage) where - bin_timestamps = 1D array of centered timestamps for non-empty bins - binned_data = 1D array of binned data; no empty bins are represented - mask_nonzero = boolean where True values represent locations of non-empty bins + bin_timestamps = 1D array of centered timestamps + binned_data = 1D array of binned data (data product daydepth) + bincount = 1D array of the number of data values in each bin time = 1D array of timestamps, units of sec since 01-01-1900 data = data to be binned - bin_duration = size of bin [s] - mode = the string 'both' + dday_coverage = fractional coverage threshold, below which daydepth values + are assigned Nan values. Notes: - The conditional construction is used so that only necessary statements are executed; - when multiple years' worth of 20 Hz data is operated on, each np.bincount operation - may take multiple tens of seconds to execute. - The np.bincount routine is used in the same way accumarray in matlab is used to bin data. The key to the routine is to convert the timestamps into elapsed time in units of bin_duration and to construct bins based on the floored @@ -920,15 +883,20 @@ def anchor_bin_detided_to_24h(time, data, dday_coverage): feature of the np.bincount function, as described in the example in the numpy.bincount documentation as listed in the References. - The BOTSFLU data products require binning at two stages. Bin results both with - and without empty bins are required. The output arguments have been selected to - provide this flexibility (in particular mask_nonzero). + Within a set of input data, all days will be represented by a timestamp, + including days with less than the number of data values required to trigger + the dday_coverage threshold. - This routine has been constructed to supply 'anchored' timestamps. For example, - if the bin_duration is 86400 (the number of seconds in a day) then the start time - will be half a bin earlier than the first day of data (at noon) and all timestamps - will be 'anchored' at midnight. Similarly, if the bin_duration is 15 sec, all - timestamps will be at 00, 15, 30, and 45 seconds past the minute. + This routine has been constructed to supply centered timestamps 'anchored' at mid- + night. With the bin_duration hard-coded at 86400 (the number of seconds in a day) + all the bins will encompass data from noon-to-noon. + + Fractional coverage is calculated as the number of non-Nan depth values + within a bin divided by the maximum number of possible bin values. For + 15-second data points within 24-hour bins, this maximum number is + necessarily 86400/15 = 5760. + + The 3rd output variable, bincount, is used as a diagnostic in the unit tests. References: @@ -936,7 +904,7 @@ def anchor_bin_detided_to_24h(time, data, dday_coverage): """ bin_duration = 86400.0 # number of seconds in a day half_bin = bin_duration/2.0 - max_count = 86400.0/15.0 # maximum number of values in a day's bin + max_count = bin_duration/15.0 # maximum number of values in a day's bin # anchor time-centered bins by determining the start time to be half a bin # before the first 'anchor timestamp', which will an integral number of @@ -948,10 +916,13 @@ def anchor_bin_detided_to_24h(time, data, dday_coverage): bin_number = np.floor(time_elapsed).astype(int) # the number of elements in each bin is given by bin_count = np.bincount(bin_number).astype(float) + raw_bincount = np.copy(bin_count) # for unit test + # bin_count is used as a divisor to calculate mean values at each bin - # bins with bincounts below the threshold value will have a nan value + # bins with bincounts below the threshold value will have a nan value. + # bins with bincounts equal to or above the threshold value are non_Nan. bin_count[bin_count/max_count < dday_coverage] = np.nan - # replace 0 with nan + # use nans to prevent dividing by zero in case dday_coverage=0 bin_count[bin_count == 0] = np.nan # directly calculate bin timestamp, units of [sec]: @@ -963,7 +934,7 @@ def anchor_bin_detided_to_24h(time, data, dday_coverage): # divide the values in each bin by the number of values in each bin daydepth = binned_data/bin_count - return bin_timestamps, daydepth + return bin_timestamps, daydepth, raw_bincount def calc_meandepth_plus(timestamp, botpres): @@ -1016,7 +987,7 @@ def calc_meandepth_plus(timestamp, botpres): psi_2_depth = -0.67 # psi to depth in meters bin_duration = 15.0 # seconds - time15s, meanpres, mask_nonzero = anchor_bin_rawdata_to_15s(timestamp, botpres) + time15s, meanpres, mask_nonzero = anchor_bin_raw_data_to_15s(timestamp, botpres) # look up tide data tide = prs_botsflu_predtide(time15s) # de-tide @@ -1037,7 +1008,7 @@ def calculate_sliding_means(data, window_size): Implemented by: 2015-01-13: Russell Desiderio. Initial code. - 2017-05-06: Russell Desiderio. Updated to make work with odd window_size. + 2017-05-06: Russell Desiderio. Updated to make work with odd window_sizes. Usage @@ -1051,8 +1022,9 @@ def calculate_sliding_means(data, window_size): Notes - The unit test values were calculated in Matlab, so that the python convolution - result for even sized windows is shifted by 1 element to match the matlab result. + The botsflu unit test values were calculated in Matlab, so that the python + convolution result for even sized windows is shifted by 1 element to match + the matlab result. """ kk = np.ones(window_size) / window_size @@ -1080,22 +1052,22 @@ def calculate_sliding_slopes(data, window_size, coverage_threshold): Implemented by: 2017-05-03: Russell Desiderio. Initial code. Replaces the much faster Moore-Penrose - pseudo-inverse method so that nan-masking can be - incorporated. - 2017-05-08: Russell Desiderio. Added the 70% coverage criterion: if the number of non-Nan - data points in a window is greater than or equal to 70% of - the maximum possible window points calculate the slope. + pseudo-inverse method so that nans can be trapped out. + 2017-05-08: Russell Desiderio. Added the fractional coverage criterion: if the number of + non-Nan data points in a window is greater than or equal + to the fractional coverage multiplied by the maximum + possible number of window points, the slope is calculated. Usage - slopes = calculate_sliding_slopes(data, window_size) + slopes = calculate_sliding_slopes(data, window_size, coverage_threshold) where slopes = 1D array of sliding slopes data = 1D array of data window_size = integer - coverage = fractional window fill threshold for calculation of slope values + coverage_threshold = fractional window fill threshold for calculation of slope values Notes @@ -1104,10 +1076,10 @@ def calculate_sliding_slopes(data, window_size, coverage_threshold): Before the May 2017 modifications, just one Nan within a window would result in a Nan value for the slope. The routine now will calculate slopes by ignoring Nans, and if the coverage - is 70% or better, a calculated value will result. If the coverage is less than 70%, then the - output value is Nan. + is better than that specified by the coverage_threshold value, a calculated value will result. + If the coverage is less than 70%, then the output value is Nan. - The data vector is padded so that bins within a window of the beginning of the data record + The data vector is padded so that points within a window of the beginning of the data record that satisfy the coverage criterion will have non-Nan data product values. References: @@ -1119,45 +1091,57 @@ def calculate_sliding_slopes(data, window_size, coverage_threshold): window_size = window_size + 1 - np.mod(window_size, 2) half_window = window_size / 2 # this will 'floor' when window_size is odd as desired - # pad front end of data with Nans (because for loop is backwards-looking) + # pad front end of data with Nans (because data product and 'for loop' are backwards-looking) npts = 2 * half_window + data.size padded_data = np.zeros(npts) + np.nan padded_data[window_size-1:] = data - # first calculate values for all sliding windows - slopes = np.zeros(npts) + np.nan - abscissa = np.arange(npts).astype('float') - for ii in range(window_size-1, npts): - x = abscissa[(ii-2*half_window):(ii+1)] # rather than np.arange-ing in each iteration + # first calculate coverage and compare it to threshold so that only windows containing the + # prescribed number of 'good' data points will have slope values calculated in the for loop. + + y = np.copy(padded_data) + # determine the fraction of good values per window (vectorized): + # first change non-nan values to 1, then nan values to 0, and take the average + y[~np.isnan(y)] = 1.0 + y[np.isnan(y)] = 0.0 + fraction_good = calculate_sliding_means(y, window_size) # centered windows + # convert to backwards-looking + fraction_good = np.roll(fraction_good, half_window) + + # deal with round-off error and boundary considerations at threshold values + machine_epsilon = np.finfo(np.float).eps + eps_x_100 = machine_epsilon * 100 + + # rather than deleting padding, keep it for index registration with padded data; + # the data padding will be deleted at the end of the routine anyway. + # set nans to no coverage (avoids warning message in conditional statements) + fraction_good[np.isnan(fraction_good)] = -1.0 + # indices with fraction_good near 0 are to be disregarded + fraction_good[fraction_good <= eps_x_100] = -1.0 + + # there can be issues involving roundoff error when considering 100% coverage, so: + fraction_good = fraction_good + eps_x_100 + # find indices that satisfy coverage criterion, + # which is the first element of the 'np.where' tuple + indices = np.where(fraction_good >= coverage_threshold)[0] + + # actual slope calculation + + slopes = np.zeros(npts) + np.nan # initialize + abscissa = np.arange(npts).astype('float') # unity spacing + for ii in indices: + x = abscissa[(ii-2*half_window):(ii+1)] y = padded_data[(ii-2*half_window):(ii+1)] x = x[~np.isnan(y)] y = y[~np.isnan(y)] if x.size < 2: - continue # trap out cases of either 0 or 1 valid datapoint in window + continue # trap out windows with only 0 or 1 valid datapoint tti = x - x.mean(axis=0) stt = np.sum(tti * tti) slopes[ii] = np.sum(tti * y) / stt # get rid of padding slopes = slopes[2*half_window:] - # now determine the fraction of good values per window (vectorized): - # first change non-nan values to 1, then nan values to 0, and take the average - padded_data[~np.isnan(padded_data)] = 1.0 - padded_data[np.isnan(padded_data)] = 0.0 - fraction_good = calculate_sliding_means(padded_data, window_size) - # convert to backwards-looking - fraction_good = np.roll(fraction_good, half_window) - # get rid of padding - fraction_good = fraction_good[2*half_window:] - - # nan out fractional values (means) in sliding windows less than coverage threshold - # there can be issues involving roundoff error when considering 100% coverage, so: - machine_epsilon = np.finfo(np.float).eps - fraction_good = fraction_good + 100 * machine_epsilon - # avoid a python warning message by trapping out nans in the conditional - fraction_good[np.isnan(fraction_good)] = -999.0 # any negative number will work as intended - slopes[fraction_good < coverage_threshold] = np.nan - return slopes @@ -1252,11 +1236,12 @@ def prs_eruption_occurred(botsflu_10minrate, eruption_occurred_threshold=-5.0): #********************************************************************** -#.. BOTSFLU functions deprecated May 2017 but retained. +#.. BOTSFLU functions deprecated in May 2017 but retained +#.. for future re-use and/or documentation. #********************************************************************** -def anchor_bin_DEPRECATED(time, data, bin_duration, mode): +def anchor_bin(time, data, bin_duration, mode): """ Description: @@ -1383,7 +1368,7 @@ def anchor_bin_DEPRECATED(time, data, bin_duration, mode): return bin_timestamps, binned_data, mask_nonzero -def calc_daydepth_plus_deprecated(timestamp, botpres): +def calc_daydepth_plus(timestamp, botpres): """ Description: @@ -1419,12 +1404,102 @@ def calc_daydepth_plus_deprecated(timestamp, botpres): # bin the 15sec data into 24 hour bins so that the timestamps are at midnight. # to calculate daydepth, don't need the time24h timestamps. - _, daydepth = anchor_bin_detided_to_24h(time15s, meandepth) + _, daydepth = anchor_bin(time15s, meandepth) # downstream data products no longer require the mask_nonzero variable return daydepth +def calculate_all_sliding_slopes_then_Nan(data, window_size, coverage_threshold): + """ + Description: + + Calculates backwards-looking sliding slopes using the normal linear + regression equations rewritten to be less susceptible to round-off + error; required for the BOTSFLU data products 4WKRATE and 8WKRATE. + + Implemented by: + + 2017-05-03: Russell Desiderio. Initial code. Replaces the much faster Moore-Penrose + pseudo-inverse method so that nan-masking can be + incorporated. + 2017-05-08: Russell Desiderio. Added the coverage criterion. + + Usage + + slopes = calculate_sliding_slopes(data, window_size, coverage_threshold) + + where + + slopes = 1D array of sliding slopes + data = 1D array of data + window_size = integer + coverage = fractional window fill threshold for calculation of slope values + + Notes + + The robust regression equations are taken from equations 14.2.15-14.2.17 in the Numerical + Recipes reference below. + + Before the May 2017 modifications, just one Nan within a window would result in a Nan value + for the slope. The routine now will calculate slopes by ignoring Nans, and if the coverage + is 70% or better, a calculated value will result. If the coverage is less than 70%, then the + output value is Nan. + + The data vector is padded so that bins within a window of the beginning of the data record + that satisfy the coverage criterion will have non-Nan data product values. + + References: + + Press, Flannery, Teukolsky and Vetterling. Numerical Recipes, 1986; 1987 reprint. + Cambridge University Press. page 507. + """ + # ODD WINDOW SIZES are expected; if even, increment by one + window_size = window_size + 1 - np.mod(window_size, 2) + half_window = window_size / 2 # this will 'floor' when window_size is odd as desired + + # pad front end of data with Nans (because 'for loop' is backwards-looking) + npts = 2 * half_window + data.size + padded_data = np.zeros(npts) + np.nan + padded_data[window_size-1:] = data + + # first calculate values for all sliding windows + slopes = np.zeros(npts) + np.nan + abscissa = np.arange(npts).astype('float') + for ii in range(window_size-1, npts): + x = abscissa[(ii-2*half_window):(ii+1)] # rather than np.arange-ing in each iteration + y = padded_data[(ii-2*half_window):(ii+1)] + x = x[~np.isnan(y)] + y = y[~np.isnan(y)] + if x.size < 2: + continue # trap out windows with only 0 or 1 valid datapoint + tti = x - x.mean(axis=0) + stt = np.sum(tti * tti) + slopes[ii] = np.sum(tti * y) / stt + # get rid of padding + slopes = slopes[2*half_window:] + + # now determine the fraction of good values per window (vectorized): + # first change non-nan values to 1, then nan values to 0, and take the average + padded_data[~np.isnan(padded_data)] = 1.0 + padded_data[np.isnan(padded_data)] = 0.0 + fraction_good = calculate_sliding_means(padded_data, window_size) # centered windows + # convert to backwards-looking + fraction_good = np.roll(fraction_good, half_window) + # get rid of padding + fraction_good = fraction_good[2*half_window:] + + # nan out fractional values (means) less than coverage threshold + # there can be issues involving roundoff error when considering 100% coverage, so: + machine_epsilon = np.finfo(np.float).eps + fraction_good = fraction_good + 100 * machine_epsilon + # avoid a python warning message by trapping out nans in the conditional + fraction_good[np.isnan(fraction_good)] = -999.0 # any negative number will work as intended + slopes[fraction_good < coverage_threshold] = np.nan + + return slopes + + def calculate_sliding_slopes__MoorePenrose(data, window_size): # DEPRECATED because nan-masking cannot be implemented with this algorithm # """ @@ -1441,7 +1516,7 @@ def calculate_sliding_slopes__MoorePenrose(data, window_size): be implemented with this algorithm. Usage - + slopes = calculate_sliding_slopes(data, window_size) where diff --git a/ion_functions/data/test/test_prs_functions.py b/ion_functions/data/test/test_prs_functions.py index 7677688..ca03b39 100755 --- a/ion_functions/data/test/test_prs_functions.py +++ b/ion_functions/data/test/test_prs_functions.py @@ -145,7 +145,7 @@ class TestPRSFunctionsUnit(BaseUnitTestCase): botpres = botpres / -0.67 # pressure values were formerly imported as unsigned integers in units of # 0.0001 psi. if the rounding operation below is omitted, one of the unit - # tests will fail. + # tests (10minrate) will fail. True in Jan 2015 and May 2017. botpres = np.around(botpres, decimals=4) # generate OOI 20 Hz time stamps. @@ -156,11 +156,14 @@ class TestPRSFunctionsUnit(BaseUnitTestCase): ss1900 = np.arange(botpres.size) * 0.05 # each step is 1/20 sec ss1900 = ss1900 + starttime - # now find where the nan values are, and delete in both the - # pressure record and in the timestamps. - nan_position_mask = np.isnan(botpres) - botpres = botpres[~nan_position_mask] - ss1900 = ss1900[~nan_position_mask] + #### May 2017 + #### Input Nans are now trapped out by the DPA, so the next 3 executable + #### statements are commented out. Nevertheless, DO NOT DELETE THEM. + ## find where the nan values are, and delete in both the + ## pressure record and in the timestamps. + #nan_position_mask = np.isnan(botpres) + #botpres = botpres[~nan_position_mask] + #ss1900 = ss1900[~nan_position_mask] """ END generating 20 hz unit test data for BOTSFLU @@ -278,6 +281,60 @@ def test_prs_bottilt_tdir(self): ### ########################## ################################################### ### BOTSFLU DATA PRODUCT TESTS ################################################### ### ########################## ################################################### + def test_anchor_bin_detided_data_to_24h(self): + """ + Test the function anchor_bin_detided_data_to_24h. + (1) the 24hr timestamps now include values for empty bins (bin values are nan). + (2) the 90% threshold coverage criterion is tested. + + Even though there are only 2 24-hour gaps of missing data, 4 consecutive 24-hr bins + will have no data because (a) there is a 90% coverage threshold and (b) the missing + data are not aligned with the boundaries of the midnight-centered bins (which would + be noons). + + The pattern of the data gaps is taken directly from actual data (February 2011, + Axial Seamount) supplied by RSN. + + Russell Desiderio, May 15, 2017. Original code. + """ + # generate 8 days of 15sec timestamps with the following gaps: + # 24 hrs of missing data from day 3 at 03:00:00 to day 4 at 02:59:45 + # 24 hrs of missing data from day 4 at 21:00:00 to day 5 at 20:59:45 + time15s = np.hstack((1 * 86400.0 + np.arange(0, 86386, 15), + 2 * 86400.0 + np.arange(0, 86386, 15), + 3 * 86400.0 + np.arange(0, 10786, 15), + 4 * 86400.0 + np.arange(10800, 75586, 15), + 5 * 86400.0 + np.arange(75600, 86386, 15), + 6 * 86400.0 + np.arange(0, 86386, 15), + 7 * 86400.0 + np.arange(0, 86386, 15), + 8 * 86400.0 + np.arange(0, 86386, 15))) + data = time15s * 0 + 1 # therefore all the non-Nan bin values will be 1.0 + dday_coverage = 0.90 + # expected values + xpctd_timestamps = np.arange(1.0, 10.0) * 86400 # 9 days centered at midnight + nan = np.nan + xpctd_data = np.array([nan, 1.0, nan, nan, nan, nan, 1.0, 1.0, nan]) + # there are 86400/15 = 5760 15-sec bins in 1 day + xpctd_bincount = np.array([2880, 5760, 3600, 2160, 2160, 3600, 5760, 5760, 2880]) + # calc is a tuple of three elements + calc = prsfunc.anchor_bin_detided_data_to_24h(time15s, data, dday_coverage) + # test + np.testing.assert_array_equal(calc[0], xpctd_timestamps) + np.testing.assert_array_equal(calc[1], xpctd_data) + np.testing.assert_array_equal(calc[2], xpctd_bincount) + + # also test potentially pathological case mimicked with an unphysical coverage: + dday_coverage = 1.1 # 110% + xpctd_timestamps = np.arange(1.0, 10.0) * 86400 # same as above + xpctd_data = np.zeros(9.0) + np.nan + xpctd_bincount = np.array([2880, 5760, 3600, 2160, 2160, 3600, 5760, 5760, 2880]) # as above + # calc is a tuple of three elements + calc = prsfunc.anchor_bin_detided_data_to_24h(time15s, data, dday_coverage) + # test + np.testing.assert_array_equal(calc[0], xpctd_timestamps) + np.testing.assert_array_equal(calc[1], xpctd_data) + np.testing.assert_array_equal(calc[2], xpctd_bincount) + def test_calculate_sliding_means(self): """ Test the calculation of window-centered rolling means. @@ -287,7 +344,8 @@ def test_calculate_sliding_means(self): appropriately. Russell Desiderio, January 2015. Original code. - May 2017: Added case of odd window-size for use in sliding_slope with Nanmasking + May 2017: Added case of odd window-size for use in sliding_slope algorithms + implementing threshold coverage criteria. """ xpctd = np.array([np.nan, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, np.nan, np.nan]) data = np.arange(10.0) @@ -307,7 +365,8 @@ def test_calculate_sliding_slopes(self): Verify the placement of nans in the output array. - Implemented by Russell Desiderio, January 2015. + Russell Desiderio, January 2015. Original code. + Russell Desiderio, May 2017: updated function call by adding rate_coverage argument """ # xpctd and calc are not equal, presumably due to roundoff error xpctd = np.array([np.nan, np.nan, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]) @@ -316,18 +375,16 @@ def test_calculate_sliding_slopes(self): rate_coverage = 1.0 calc = prsfunc.calculate_sliding_slopes(data, window_size, rate_coverage) np.testing.assert_allclose(calc, xpctd, rtol=0.0, atol=1.e-12) - """ - Additional test, RDesiderio, May 12, 2017 - Verify nan placement after routine has been altered to accept a 75% fill criterion. - """ + + # Verify nan placement after routine has been altered to accept a 75% fill criterion. # set up test data with slope=2 data = 2 * np.arange(150.0) # nan out middle section data[50:100] = np.nan # when the sliding slope routine is applied, the result will be sets of contiguous nans - # and non-Nan values whose indices depend on the window size and on the location of nans - # in the test data. + # and contiguous non-Nan values whose indices depend on the window size and on the + # location of nans in the test data. # 29 point window (4weekrate case) xpctd_w29 = np.ones(150) + 1.0 xpctd_w29[0:21] = np.nan @@ -345,6 +402,20 @@ def test_calculate_sliding_slopes(self): calc = prsfunc.calculate_sliding_slopes(data, window_size, rate_coverage) np.testing.assert_array_equal(calc, xpctd_w57) + # make sure that routine doesn't crash if no windows satisfy the + # rate_coverage criterion + rate_coverage = 1.1 # 110%; that ought to do it + xpctd = np.ones(150) + np.nan + calc = prsfunc.calculate_sliding_slopes(data, window_size, rate_coverage) + np.testing.assert_array_equal(calc, xpctd) + + # one last potentially pathological case + rate_coverage = 0.0 + xpctd = np.ones(150) + np.nan + allnans = np.copy(xpctd) + calc = prsfunc.calculate_sliding_slopes(allnans, window_size, rate_coverage) + np.testing.assert_array_equal(calc, xpctd) + def test_prs_botsflu_auxiliary_timestamps(self): """ Test the calculation of the auxiliary timestamp data products @@ -353,6 +424,7 @@ def test_prs_botsflu_auxiliary_timestamps(self): Uses matplotlib.dates as mdates. Implemented by Russell Desiderio, January 2015. + No change after May 2017 modifications. """ xpctd_time15s = ['2011 02 01 00 00 00', '2011 02 05 13 53 45', @@ -514,30 +586,31 @@ def test_prs_botsflu_daydepth(self): """ Test the calculation of DAYDEPTH. - Implemented by Russell Desiderio, January 2015. - RDesiderio. May 12, 2015. Added 0.9 and default coverage test + Russell Desiderio, January 2015. Original code. + Russell Desiderio, May 12, 2017. Added 0.9 coverage and default coverage test. """ # case(1) January 2015 unit test values xpctd_daydepth = np.array([-1510.8705, -1510.9055, -1510.9119, -1510.8491, -1510.8596, -1510.8354, -1510.8791, -1510.8807, -1510.8378, -1510.8279, -1510.8530, -1512.2859, -1513.2018, -1513.1660, -1513.1128, -1513.0478]) - dday_coverage = 0.0 # if a bin contains a non-Nan value, its datum will exist + dday_coverage = 0.0 # if a bin contains at least one non-Nan value, bin value!=nan daydepth = prsfunc.prs_botsflu_daydepth(self.ss1900, self.botpres, dday_coverage) calc = daydepth[self.idx_24h] np.testing.assert_allclose(calc, xpctd_daydepth, rtol=0, atol=1.e-4) - # case(2) May 2017 unit test values, coverage = 0.90 + # case(2) May 2017 unit test values, coverage = 0.9 xpctd_daydepth = np.array([np.nan, -1510.9055, np.nan, -1510.8491, -1510.8596, -1510.8354, -1510.8791, -1510.8807, -1510.8378, -1510.8279, -1510.8530, -1512.2859, -1513.2018, -1513.1660, -1513.1128, np.nan]) - dday_coverage = 0.9 + dday_coverage = 0.9 # binvalue is nan unless >= 90% of the bin values are good. daydepth = prsfunc.prs_botsflu_daydepth(self.ss1900, self.botpres, dday_coverage) calc = daydepth[self.idx_24h] np.testing.assert_allclose(calc, xpctd_daydepth, rtol=0, atol=1.e-4) # case(3) May 2017 default, no coverage specified in argument list + # default coverage value is 0.90, so expected values are same as for case (2) xpctd_daydepth = np.array([np.nan, -1510.9055, np.nan, -1510.8491, -1510.8596, -1510.8354, -1510.8791, -1510.8807, -1510.8378, -1510.8279, -1510.8530, -1512.2859, @@ -553,7 +626,7 @@ def test_prs_botsflu_4wkrate(self): Implemented by Russell Desiderio, January 2015. RDesiderio. May 12, 2015. 3 additional tests for coverage parameters; 4 total (1) dday_coverage = 0.0; rate_coverage = 1.0 (January 2015 unit test values) - (2) dday_coverage = 0.0; rate_coverage = 0.75 + (2) dday_coverage = 0.0; rate_coverage = 0.75 (intermediate case) (3) dday_coverage = 0.9; rate_coverage = 0.75 (4) default (no argument) default (no argument) (should be same as (3)) """ @@ -609,7 +682,7 @@ def test_prs_botsflu_8wkrate(self): Implemented by Russell Desiderio, January 2015. RDesiderio. May 12, 2015. 3 additional tests for coverage parameters; 4 total (1) dday_coverage = 0.0; rate_coverage = 1.0 (January 2015 unit test values) - (2) dday_coverage = 0.0; rate_coverage = 0.75 + (2) dday_coverage = 0.0; rate_coverage = 0.75 (intermediate case) (3) dday_coverage = 0.9; rate_coverage = 0.75 (4) default (no argument) default (no argument) (should be same as (3)) """ @@ -702,4 +775,3 @@ def test_prs_eruption_occurred(self): for ii in range(3): tf[ii] = prsfunc.prs_eruption_occurred(data[ii, :]) np.testing.assert_array_equal(tf, xpctd) -