Moved percentileQC to separate module

GEUS-Glaciology-and-Climate · Sep 20, 2023 · 6b45719 · 6b45719
1 parent 3d9d743
commit 6b45719
Show file tree

Hide file tree

Showing 2 changed files with 224 additions and 238 deletions.
diff --git a/src/pypromice/process/L1toL2.py b/src/pypromice/process/L1toL2.py
@@ -6,11 +6,11 @@
 import urllib.request
 from urllib.error import HTTPError, URLError
 import pandas as pd
-import subprocess
 import os
 import xarray as xr
 
 from pypromice.qc.static_qc import apply_static_qc
+from pypromice.qc.percentile import percentileQC
 
 
 # from IPython import embed
@@ -159,199 +159,6 @@ def toL2(L1, T_0=273.15, ews=1013.246, ei0=6.1071, eps_overcast=1.,
                                              T_0, T_100, ews, ei0)                   
     return ds
 
-def percentileQC(ds):
-    """
-    Parameters
-    ----------
-    ds : xr.Dataset
-        Level 1 dataset
-
-    Returns
-    -------
-    ds_out : xr.Dataset
-        Level 1 dataset with percentile outliers set to NaN
-    """
-    # Check if on-disk sqlite db exists
-    # If it not exists, then run compute_percentiles.py
-
-    base_path = os.getcwd()
-
-    file_path1 = base_path + "/main/src/pypromice/qc/percentiles.db"
-    file_path2 = base_path + "/qc/percentiles.db"
-
-    script_path1 = base_path + "/main/src/pypromice/qc/compute_percentiles.py"
-    script_path2 = base_path + "/qc/compute_percentiles.py"
-
-    script_exist1 = os.path.isfile(script_path1)
-
-    db_exist1 = os.path.isfile(file_path1)
-    db_exist2 = os.path.isfile(file_path2)
-
-    print(f"Does percentiles.db exist {db_exist2} in this path {file_path2}")
-
-    if not db_exist1 and not db_exist2:
-        if script_exist1:
-            print(f"percentiles.db does not exist running {script_path1}")
-            subprocess.call(["python", script_path1])
-            file_path = file_path1
-        else:
-            print(f"percentiles.db does not exist running {script_path2}")
-            subprocess.call(["python", script_path2])
-            file_path = file_path2
-
-    # Optionally examine flagged data by setting make_plots to True
-    # This is best done by running aws.py directly and setting 'test_station'
-    # Plots will be shown before and after flag removal for each var
-    make_plots = False
-
-    stid = ds.station_id
-    stid_type = type(stid)
-
-    print(f"station id {stid}")
-    print(f"station id type {stid_type}")
-    df = ds.to_dataframe()  # Switch to pandas
-
-    # Define threshold dict to hold limit values, and 'hi' and 'lo' percentile.
-    # Limit values indicate how far we will go beyond the hi and lo percentiles to flag outliers.
-    # *_u are used to calculate and define all limits, which are then applied to *_u, *_l and *_i
-    var_threshold = {
-        "t_u": {"limit": 9},  # 'hi' and 'lo' will be held in 'seasons' dict
-        "p_u": {"limit": 12},
-        "rh_u": {"limit": 12},
-        "wspd_u": {"limit": 12},
-    }
-
-    # Query from the on-disk sqlite db for specified percentiles
-    if db_exist1:
-        file_path = file_path1
-    else:
-        file_path = file_path2
-
-    con = sqlite3.connect(file_path)
-    cur = con.cursor()
-    for k in var_threshold.keys():
-        if k == "t_u":
-            # Different pattern for t_u, which considers seasons
-            # 1: winter (DecJanFeb), 2: spring (MarAprMay), 3: summer (JunJulAug), 4: fall (SepOctNov)
-            seasons = {1: {}, 2: {}, 3: {}, 4: {}}
-            sql = f"SELECT p0p5,p99p5,season FROM {k} WHERE season in (1,2,3,4) and stid = ?"
-            cur.execute(sql, [stid])
-            result = cur.fetchall()
-            if result:
-                for row in result:
-                    # row[0] is p0p5, row[1] is p99p5, row[2] is the season integer
-                    seasons[row[2]]["lo"] = row[0]  # 0.005
-                    seasons[row[2]]["hi"] = row[1]  # 0.995
-                    var_threshold[k]["seasons"] = seasons
-            else:
-                print(f"{stid} has no {k} data")
-        else:
-            sql = f"SELECT p0p5,p99p5 FROM {k} WHERE stid = ?"
-            cur.execute(sql, [stid])
-            result = cur.fetchone()  # we only expect one row back per station
-            if result:
-                var_threshold[k]["lo"] = result[0]  # 0.005
-                var_threshold[k]["hi"] = result[1]  # 0.995
-            else:
-                print(f"{stid} has no {k} data")
-
-    con.close()  # close the database connection (and cursor)
-
-    # Set flagged data to NaN
-    for k in var_threshold.keys():
-        if k == "t_u":
-            # use t_u thresholds to flag t_u, t_l, t_i
-            base_var = k.split("_")[0]
-            vars_all = [k, base_var + "_l", base_var + "_i"]
-            for t in vars_all:
-                if t in df:
-                    winter = df[t][df.index.month.isin([12, 1, 2])]
-                    spring = df[t][df.index.month.isin([3, 4, 5])]
-                    summer = df[t][df.index.month.isin([6, 7, 8])]
-                    fall = df[t][df.index.month.isin([9, 10, 11])]
-                    season_dfs = [winter, spring, summer, fall]
-
-                    if make_plots:
-                        _plot_percentiles_t(
-                            k, t, df, season_dfs, var_threshold, stid
-                        )  # BEFORE OUTLIER REMOVAL
-                    for x1, x2 in zip([1, 2, 3, 4], season_dfs):
-                        try:
-                            print(f"percentile flagging {t} {x1}")
-                            lower_thresh = (
-                                var_threshold[k]["seasons"][x1]["lo"]
-                                - var_threshold[k]["limit"]
-                            )
-                            upper_thresh = (
-                                var_threshold[k]["seasons"][x1]["hi"]
-                                + var_threshold[k]["limit"]
-                            )
-                            outliers_upper = x2[x2.values > upper_thresh]
-                            outliers_lower = x2[x2.values < lower_thresh]
-                            outliers = pd.concat([outliers_upper, outliers_lower])
-                            df.loc[outliers.index, t] = np.nan
-                            df.loc[outliers.index, t] = np.nan
-                        except Exception as e:
-                            print(
-                                f"{t} Season {x1} is not computed due to lack of data"
-                            )
-                            print(e)
-                    if make_plots:
-                        _plot_percentiles_t(
-                            k, t, df, season_dfs, var_threshold, stid
-                        )  # AFTER OUTLIER REMOVAL
-        else:
-            # use *_u thresholds to flag *_u, *_l, *_i
-            base_var = k.split("_")[0]
-            vars_all = [k, base_var + "_l", base_var + "_i"]
-            for t in vars_all:
-                if t in df:
-                    try:
-                        print(f"percentile flagging {t}")
-                        upper_thresh = (
-                            var_threshold[k]["hi"] + var_threshold[k]["limit"]
-                        )
-                        lower_thresh = (
-                            var_threshold[k]["lo"] - var_threshold[k]["limit"]
-                        )
-                        if make_plots:
-                            _plot_percentiles(
-                                k,
-                                t,
-                                df,
-                                var_threshold,
-                                upper_thresh,
-                                lower_thresh,
-                                stid,
-                            )  # BEFORE OUTLIER REMOVAL
-                        if t == "p_i":
-                            # shift p_i so we can use the p_u thresholds
-                            shift_p = df[t] + 1000.0
-                            outliers_upper = shift_p[shift_p.values > upper_thresh]
-                            outliers_lower = shift_p[shift_p.values < lower_thresh]
-                        else:
-                            outliers_upper = df[t][df[t].values > upper_thresh]
-                            outliers_lower = df[t][df[t].values < lower_thresh]
-                        outliers = pd.concat([outliers_upper, outliers_lower])
-                        df.loc[outliers.index, t] = np.nan
-                        df.loc[outliers.index, t] = np.nan
-                    except Exception as e:
-                        print(f"{t} is not flagged due to lack of data")
-                        print(e)
-                    if make_plots:
-                        _plot_percentiles(
-                            k, t, df, var_threshold, upper_thresh, lower_thresh, stid
-                        )  # AFTER OUTLIER REMOVAL
-
-    # Back to xarray, and re-assign the original attrs
-    ds_out = df.to_xarray()
-    ds_out = ds_out.assign_attrs(ds.attrs)  # Dataset attrs
-    for x in ds_out.data_vars:  # variable-specific attrs
-        ds_out[x].attrs = ds[x].attrs
-    # equivalent to above:
-    # vals = [xr.DataArray(data=df_out[c], dims=['time'], coords={'time':df_out.index}, attrs=ds[c].attrs) for c in df_out.columns]
-    # ds_out = xr.Dataset(dict(zip(df_out.columns, vals)), attrs=ds.attrs)
-    return ds_out
 
 def flagNAN(ds_in,
             flag_url='https://raw.githubusercontent.com/GEUS-Glaciology-and-Climate/PROMICE-AWS-data-issues/master/flags/',
@@ -1224,50 +1031,6 @@ def _getRotation():                                                            #
     rad2deg = 1 / deg2rad
     return deg2rad, rad2deg
 
-def _plot_percentiles_t(k, t, df, season_dfs, var_threshold, stid):
-    '''Plot data and percentile thresholds for air temp (seasonal)'''
-    import matplotlib.pyplot as plt
-    plt.figure(figsize=(20,12))
-    inst_var = t.split('_')[0] + '_i'
-    if inst_var in df:
-        i_plot = df[inst_var]
-        plt.scatter(df.index,i_plot, color='orange', s=3, label='t_i instantaneuous')
-    if t in ('t_u','t_l'):
-        plt.scatter(df.index,df[t], color='b', s=3, label=f'{t} hourly ave')
-    for x1,x2 in zip([1,2,3,4], season_dfs):
-        y1 = np.full(len(x2.index), (var_threshold[k]['seasons'][x1]['lo'] - var_threshold[k]['limit']))
-        y2 = np.full(len(x2.index), (var_threshold[k]['seasons'][x1]['hi'] + var_threshold[k]['limit']))
-        y11 = np.full(len(x2.index), (var_threshold[k]['seasons'][x1]['lo'] ))
-        y22 = np.full(len(x2.index), (var_threshold[k]['seasons'][x1]['hi'] ))
-        plt.scatter(x2.index, y1, color='r',s=1)
-        plt.scatter(x2.index, y2, color='r', s=1)
-        plt.scatter(x2.index, y11, color='k', s=1)
-        plt.scatter(x2.index, y22, color='k', s=1)
-    plt.title('{} {}'.format(stid, t))
-    plt.legend(loc="lower left")
-    plt.show()
-
-def _plot_percentiles(k, t, df, var_threshold, upper_thresh, lower_thresh, stid):
-    '''Plot data and percentile thresholds'''
-    import matplotlib.pyplot as plt
-    plt.figure(figsize=(20,12))
-    inst_var = t.split('_')[0] + '_i'
-    if inst_var in df:
-        if k == 'p_u':
-            i_plot = (df[inst_var]+1000.)
-        else:
-            i_plot = df[inst_var]
-        plt.scatter(df.index,i_plot, color='orange', s=3, label='instantaneuous')
-    if t != inst_var:
-        plt.scatter(df.index,df[t], color='b', s=3, label=f' {t} hourly ave')
-    plt.axhline(y=upper_thresh, color='r', linestyle='-')
-    plt.axhline(y=lower_thresh, color='r', linestyle='-')
-    plt.axhline(y=var_threshold[k]['hi'], color='k', linestyle='--')
-    plt.axhline(y=var_threshold[k]['lo'], color='k', linestyle='--')
-    plt.title('{} {}'.format(stid, t))
-    plt.legend(loc="lower left")
-    plt.show()
-
 
 if __name__ == "__main__": 
     # unittest.main()