Skip to content

Commit

Permalink
Merge pull request #189 from GEUS-Glaciology-and-Climate/features/aut…
Browse files Browse the repository at this point in the history
…o_qc

Renamed static_qc -> persistent_qc
  • Loading branch information
ladsmund authored Oct 4, 2023
2 parents 21d8bdd + 9ba2f82 commit 1491856
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unit_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@ jobs:
shell: bash
run: |
cd $GITHUB_WORKSPACE/src/pypromice
python3 -m unittest -v process/aws.py get.py tx/tx.py qc/static_qc_test.py
python3 -m unittest -v process/aws.py get.py tx/tx.py qc/persistence_test.py
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="pypromice",
version="1.2.1",
version="1.3.0",
author="GEUS Glaciology and Climate",
description="PROMICE/GC-Net data processing toolbox",
long_description=long_description,
Expand Down
4 changes: 2 additions & 2 deletions src/pypromice/process/L1toL2.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import os
import xarray as xr

from pypromice.qc.static_qc import apply_static_qc
from pypromice.qc.persistence import persistence_qc
from pypromice.process.value_clipping import clip_values

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -62,7 +62,7 @@ def toL2(
except Exception:
logger.exception('Flagging and fixing failed:')
if ds.attrs['format'] == 'TX':
ds = apply_static_qc(ds) # Detect and filter data points that seems to be static
ds = persistence_qc(ds) # Detect and filter data points that seem to be persistent

T_100 = _getTempK(T_0)
ds['rh_u_cor'] = correctHumidity(ds['rh_u'], ds['t_u'],
Expand Down
30 changes: 15 additions & 15 deletions src/pypromice/qc/static_qc.py → src/pypromice/qc/persistence.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
from typing import Mapping, Optional, Union

__all__ = [
"apply_static_qc",
"find_static_regions",
"count_consecutive_static_values",
"persistence_qc",
"find_persistent_regions",
"count_consecutive_persistent_values",
"count_consecutive_true",
]

Expand All @@ -22,12 +22,12 @@
}


def apply_static_qc(
def persistence_qc(
ds: xr.Dataset,
variable_thresholds: Optional[Mapping] = None,
) -> xr.Dataset:
"""
Detect and filter data points that seems to be static within a certain period.
Detect and filter data points that seem to be persistent within a certain period.
TODO: It could be nice to have a reference to the logger or description of the behaviour here.
The AWS logger program is known to return the last successfully read value if it fails reading from the sensor.
Expand Down Expand Up @@ -58,24 +58,24 @@ def apply_static_qc(
if variable_thresholds is None:
variable_thresholds = DEFAULT_VARIABLE_THRESHOLDS

logger.debug(f"Running apply_static_qc using {variable_thresholds}")
logger.debug(f"Running persistence_qc using {variable_thresholds}")

for k in variable_thresholds.keys():
var_all = [
k + "_u",
k + "_l",
k + "_i",
] # apply to upper, lower boom, and instant
max_diff = variable_thresholds[k]["max_diff"] # loading static limit
max_diff = variable_thresholds[k]["max_diff"] # loading persistent limit
period = variable_thresholds[k]["period"] # loading diff period

for v in var_all:
if v in df:
mask = find_static_regions(df[v], period, max_diff)
mask = find_persistent_regions(df[v], period, max_diff)
n_masked = mask.sum()
n_samples = len(mask)
logger.debug(
f"Applying static QC in {v}. Filtering {n_masked}/{n_samples} samples"
f"Applying persistent QC in {v}. Filtering {n_masked}/{n_samples} samples"
)
# setting outliers to NaN
df.loc[mask, v] = np.nan
Expand All @@ -89,22 +89,22 @@ def apply_static_qc(
return ds_out


def find_static_regions(
def find_persistent_regions(
data: pd.Series,
min_repeats: int,
max_diff: float,
) -> pd.Series:
"""
Algorithm that ensures values can stay the same within the outliers_mask
"""
consecutive_true_df = count_consecutive_static_values(data, max_diff)
static_regions = consecutive_true_df >= min_repeats
consecutive_true_df = count_consecutive_persistent_values(data, max_diff)
persistent_regions = consecutive_true_df >= min_repeats
# Ignore entries which are already NaN in the input data
static_regions[data.isna()] = False
return static_regions
persistent_regions[data.isna()] = False
return persistent_regions


def count_consecutive_static_values(
def count_consecutive_persistent_values(
data: pd.Series,
max_diff: float,
) -> pd.Series:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
import numpy.testing
import pandas as pd

from pypromice.qc.static_qc import find_static_regions
from pypromice.qc.persistence import find_persistent_regions


class StaticQATestCase(unittest.TestCase):
def test_1_hour_static(self):
class PersistenceQATestCase(unittest.TestCase):
def test_1_hour_persistent(self):
self._test_1_hour_repeat(10)

def test_1_hour_second_index(self):
Expand All @@ -28,27 +28,27 @@ def _test_1_hour_repeat(self, index: int):
expected_output = input_series.map(lambda _: False)
expected_output[index + 1] = True

static_mask = find_static_regions(
persistent_mask = find_persistent_regions(
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, static_mask, check_names=False)
pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)

def test_no_static_period(self):
def test_no_persistent_period(self):
time_range = pd.date_range(
start="2023-01-26", end="2023-01-27", freq="h", tz="utc", inclusive="left"
)
input_series = pd.Series(index=time_range, data=np.arange(0, len(time_range)))
min_repeats = 1
expected_output = input_series.map(lambda _: False)

static_mask = find_static_regions(
persistent_mask = find_persistent_regions(
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, static_mask, check_names=False)
pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)

def test_static_period_longer_than_period_threshold(self):
def test_persistent_period_longer_than_period_threshold(self):
time_range = pd.date_range(
start="2023-01-26", end="2023-01-28", freq="h", tz="utc", inclusive="left"
)
Expand All @@ -62,13 +62,13 @@ def test_static_period_longer_than_period_threshold(self):
expected_output = input_series.map(lambda _: False)
expected_output[expected_filter_start:expected_filter_end] = True

static_mask = find_static_regions(
persistent_mask = find_persistent_regions(
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, static_mask, check_names=False)
pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)

def test_period_threshold_longer_than_static_period(self):
def test_period_threshold_longer_than_persistent_period(self):
time_range = pd.date_range(
start="2023-01-26", end="2023-01-28", freq="h", tz="utc", inclusive="left"
)
Expand All @@ -79,13 +79,13 @@ def test_period_threshold_longer_than_static_period(self):
input_series[index_start:index_end] = input_series[index_start]
expected_output = input_series.map(lambda _: False)

static_mask = find_static_regions(
persistent_mask = find_persistent_regions(
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, static_mask, check_names=False)
pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)

def test_static_period_at_the_end(self):
def test_persistent_period_at_the_end(self):
time_range = pd.date_range(
start="2023-01-26", end="2023-01-28", freq="h", tz="utc", inclusive="left"
)
Expand All @@ -97,11 +97,11 @@ def test_static_period_at_the_end(self):
expected_output = input_series.map(lambda _: False)
expected_output[expected_filter_start:] = True

static_mask = find_static_regions(
persistent_mask = find_persistent_regions(
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, static_mask, check_names=False)
pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)

def test_dont_filter_nan_values(self):
time_range = pd.date_range(
Expand All @@ -119,13 +119,13 @@ def test_dont_filter_nan_values(self):
# The output mask shouldn't filter nan values.
expected_output = input_series.map(lambda _: False)

static_mask = find_static_regions(
persistent_mask = find_persistent_regions(
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, static_mask, check_names=False)
pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)

def test_series_with_nan_values_between_static_values(self):
def test_series_with_nan_values_between_persistent_values(self):
time_range = pd.date_range(
start="2023-01-26", end="2023-01-27", freq="h", tz="utc", inclusive="left"
)
Expand All @@ -141,6 +141,10 @@ def test_series_with_nan_values_between_static_values(self):
# Note: The persistent region mask shall not filter NaN values
expected_mask[16] = True

output_mask = find_static_regions(series, min_repeats=period, max_diff=0.01)
output_mask = find_persistent_regions(series, min_repeats=period, max_diff=0.01)

np.testing.assert_equal(expected_mask, output_mask)


if __name__ == "__main__":
unittest.main()

0 comments on commit 1491856

Please sign in to comment.