Merge pull request #28 from WenjieDu/dev

Release v0.4
WenjieDu · Dec 16, 2023 · 30bf360 · 30bf360
2 parents 9460d8c + 377b36f
commit 30bf360
Show file tree

Hide file tree

Showing 8 changed files with 303 additions and 526 deletions.
diff --git a/README.md b/README.md
@@ -47,7 +47,7 @@
 </p>
 
 <a href='https://github.com/WenjieDu/PyPOTS'><img src='https://pypots.com/figs/pypots_logos/PyPOTS_logo_FFBG.svg?sanitize=true' width='160' align='left' /></a>
-PyGrinder is a part of 
+PyGrinder is a part of
 <a href="https://github.com/WenjieDu/PyPOTS">
 PyPOTS <img align="center" src="https://img.shields.io/github/stars/WenjieDu/PyPOTS?style=social">
 </a>
@@ -74,20 +74,20 @@ or install from source code:
 
 ```python
 import numpy as np
-import pygrinder
+from pygrinder import mcar, mar_logistic, mnar_x, mnar_t
 
 # given a time-series dataset with 128 samples, each sample with 10 time steps and 36 data features
 ts_dataset = np.random.randn(128, 10, 36)
 
 # grind the dataset with MCAR pattern and 10% missing probability
-X_intact, X, missing_mask, indicating_mask = pygrinder.mcar(ts_dataset, p=0.1, nan=0)
+X_with_mcar_data = mcar(ts_dataset, p=0.1)
 
 # grind the dataset with MAR pattern
-X_intact, X, missing_mask, indicating_mask = pygrinder.mar_logistic(ts_dataset[:, 0, :], obs_rate=0.1, missing_rate=0.1, nan=0)
+X_with_mar_data = mar_logistic(ts_dataset[:, 0, :], obs_rate=0.1, missing_rate=0.1)
 
 # grind the dataset with MNAR pattern
-X_intact, X, missing_mask, indicating_mask = pygrinder.mnar_x(ts_dataset, offset=0.1, nan=0)
-X_intact, X, missing_mask, indicating_mask = pygrinder.mnar_t(ts_dataset, cycle=20, pos = 10, scale = 3, nan=0)
+X_with_mnar_x_data = mnar_x(ts_dataset, offset=0.1)
+X_with_mnar_t_data = mnar_t(ts_dataset, cycle=20, pos = 10, scale = 3)
 ```
 
 

diff --git a/pygrinder/__init__.py b/pygrinder/__init__.py
@@ -21,14 +21,17 @@
 #
 # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
 # 'X.Y.dev0' is the canonical version of 'X.Y.dev'
-__version__ = "0.3"
+__version__ = "0.4"
 
-from .missing_completely_at_random import mcar, mcar_little_test
 from .missing_at_random import mar_logistic
+from .missing_completely_at_random import mcar, mcar_little_test
 from .missing_not_at_random import mnar_x, mnar_t
 from .utils import (
-    cal_missing_rate,
+    calc_missing_rate,
     masked_fill,
+    fill_and_get_mask,
+    fill_and_get_mask_torch,
+    fill_and_get_mask_numpy,
 )
 
 __all__ = [
@@ -38,6 +41,9 @@
     "mar_logistic",
     "mnar_x",
     "mnar_t",
-    "cal_missing_rate",
+    "calc_missing_rate",
     "masked_fill",
+    "fill_and_get_mask",
+    "fill_and_get_mask_torch",
+    "fill_and_get_mask_numpy",
 ]
diff --git a/pygrinder/missing_at_random/mar_logistic.py b/pygrinder/missing_at_random/mar_logistic.py
@@ -5,121 +5,18 @@
 # Created by Wenjie Du <[email protected]>
 # License: BSD-3-Clause
 
-from typing import Union, Tuple, overload
+from typing import Union
 
 import numpy as np
 import torch
 from scipy import optimize
 
 
-@overload
-def mar_logistic(
-    X: Union[torch.Tensor, np.ndarray],
-    obs_rate: float,
-    missing_rate: float,
-    return_masks: bool = True,
-    nan: Union[float, int] = 0,
-) -> Union[Tuple[np.ndarray, ...], Tuple[torch.Tensor, ...], np.ndarray, torch.Tensor]:
-    raise NotImplementedError()
-
-
-@overload
-def mar_logistic(
-    X: Union[torch.Tensor, np.ndarray],
-    obs_rate: float,
-    missing_rate: float,
-    return_masks: bool = False,
-    nan: Union[float, int] = 0,
-) -> Union[Tuple[np.ndarray, ...], Tuple[torch.Tensor, ...], np.ndarray, torch.Tensor]:
-    raise NotImplementedError()
-
-
-def mar_logistic(
-    X: Union[torch.Tensor, np.ndarray],
-    obs_rate: float,
-    missing_rate: float,
-    return_masks: bool = True,
-    nan: Union[float, int] = 0,
-) -> Union[Tuple[np.ndarray, ...], Tuple[torch.Tensor, ...], np.ndarray, torch.Tensor]:
-    """Create random missing values (MAR case) with a logistic model.
-    First, a subset of the variables without missing values is randomly selected.
-    Missing values will be introduced into the remaining variables according to a logistic model with random weights.
-    This implementation is inspired by the tutorial
-    https://rmisstastic.netlify.app/how-to/python/generate_html/how%20to%20generate%20missing%20values
-
-    Parameters
-    ----------
-    X : shape of [n_steps, n_features]
-        A time series data vector without any missing data.
-
-    obs_rate :
-        The proportion of variables without missing values that will be used for fitting the logistic masking model.
-
-    missing_rate:
-        The proportion of missing values to generate for variables which will have missing values.
-
-    return_masks : bool, optional, default=True
-        Whether to return the masks indicating missing values in X and indicating artificially-missing values in X.
-        If True, return X_intact, X, missing_mask, and indicating_mask (refer to Returns for more details).
-        If False, only return X with added missing at random values.
-
-    nan : int/float, optional, default=0
-        Value used to fill NaN values. Only valid when return_masks is True.
-        If return_masks is False, the NaN values will be kept as NaN.
-
-    Returns
-    -------
-    If return_masks is True:
-
-        X_intact : array,
-            Original data with missing values (nan) filled with given parameter `nan`, with observed values intact.
-            X_intact is for loss calculation in the masked imputation task.
-
-        X : array,
-            Original X with artificial missing values. X is for model input.
-            Both originally-missing and artificially-missing values are filled with given parameter `nan`.
-
-        missing_mask : array,
-            The mask indicates all missing values in X.
-            In it, 1 indicates observed values, and 0 indicates missing values.
-
-        indicating_mask : array,
-            The mask indicates the artificially-missing values in X, namely missing parts different from X_intact.
-            In it, 1 indicates artificially missing values,
-            and the other values (including originally observed/missing values) are indicated as 0.
-
-    If return_masks is False:
-
-        X : array-like
-            Original X with artificial missing values.
-            Both originally-missing and artificially-missing values are left as NaN.
-
-    """
-    if isinstance(X, list):
-        X = np.asarray(X)
-
-    if isinstance(X, np.ndarray) or isinstance(X, torch.Tensor):
-        results = _mar_logistic_torch(X, obs_rate, missing_rate, return_masks, nan)
-    else:
-        raise TypeError(
-            "X must be type of list/numpy.ndarray/torch.Tensor, " f"but got {type(X)}"
-        )
-
-    if not return_masks:
-        X = results
-        return X
-
-    X_intact, X, missing_mask, indicating_mask = results
-    return X_intact, X, missing_mask, indicating_mask
-
-
 def _mar_logistic_torch(
     X: Union[np.ndarray, torch.Tensor],
-    rate: float,
     rate_obs: float,
-    return_masks: bool,
-    nan: Union[float, int] = 0,
-):
+    rate_missing: float,
+) -> Union[np.ndarray, torch.Tensor]:
     def pick_coefficients(X, idxs_obs=None, idxs_nas=None, self_mask=False):
         n, d = X.shape
         if self_mask:
@@ -168,7 +65,6 @@ def f(x):
         torch.isnan(X).sum() == 0
     ), "the input X of the mar_logistic() shouldn't containing originally missing data"
 
-    X_intact = torch.clone(X)
     mask = torch.zeros(n, d).bool()
 
     # number of variables that will have no missing values (at least one variable)
@@ -182,23 +78,54 @@ def f(x):
     # Pick coefficients so that W^Tx has unit variance (avoids shrinking)
     coeffs = pick_coefficients(X, idxs_obs, idxs_nas)
     # Pick the intercepts to have a desired amount of missing values
-    intercepts = fit_intercepts(X[:, idxs_obs], coeffs, rate)
+    intercepts = fit_intercepts(X[:, idxs_obs], coeffs, rate_missing)
 
     ps = torch.sigmoid(X[:, idxs_obs].mm(coeffs) + intercepts)
     ber = torch.rand(n, d_na)
     mask[:, idxs_nas] = ber < ps  # True means missing
 
     X[mask] = torch.nan
 
-    if not return_masks:  # return X with MCAR values only if not return masks
-        return X.numpy() if ori_type_is_np else X
+    return X.numpy() if ori_type_is_np else X
 
-    X = torch.nan_to_num(X, nan)
-    missing_mask = (~mask).to(torch.float32)
-    indicating_mask = torch.clone(missing_mask)
 
-    if ori_type_is_np:
-        X_intact, X, missing_mask, indicating_mask = [
-            i.numpy() for i in (X_intact, X, missing_mask, indicating_mask)
-        ]
-    return tuple((X_intact, X, missing_mask, indicating_mask))
+def mar_logistic(
+    X: Union[torch.Tensor, np.ndarray],
+    obs_rate: float,
+    missing_rate: float,
+) -> Union[np.ndarray, torch.Tensor]:
+    """Create random missing values (MAR case) with a logistic model.
+    First, a subset of the variables without missing values is randomly selected.
+    Missing values will be introduced into the remaining variables according to a logistic model with random weights.
+    This implementation is inspired by the tutorial
+    https://rmisstastic.netlify.app/how-to/python/generate_html/how%20to%20generate%20missing%20values
+
+    Parameters
+    ----------
+    X : shape of [n_steps, n_features]
+        A time series data vector without any missing data.
+
+    obs_rate :
+        The proportion of variables without missing values that will be used for fitting the logistic masking model.
+
+    missing_rate:
+        The proportion of missing values to generate for variables which will have missing values.
+
+    Returns
+    -------
+    corrupted_X : array-like
+        Original X with artificial missing values.
+        Both originally-missing and artificially-missing values are left as NaN.
+
+    """
+    if isinstance(X, list):
+        X = np.asarray(X)
+
+    if isinstance(X, np.ndarray) or isinstance(X, torch.Tensor):
+        corrupted_X = _mar_logistic_torch(X, missing_rate, obs_rate)
+    else:
+        raise TypeError(
+            "X must be type of list/numpy.ndarray/torch.Tensor, " f"but got {type(X)}"
+        )
+
+    return corrupted_X