Skip to content

Commit

Permalink
Merge pull request #28 from WenjieDu/dev
Browse files Browse the repository at this point in the history
Release v0.4
  • Loading branch information
WenjieDu authored Dec 16, 2023
2 parents 9460d8c + 377b36f commit 30bf360
Show file tree
Hide file tree
Showing 8 changed files with 303 additions and 526 deletions.
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
</p>

<a href='https://github.com/WenjieDu/PyPOTS'><img src='https://pypots.com/figs/pypots_logos/PyPOTS_logo_FFBG.svg?sanitize=true' width='160' align='left' /></a>
PyGrinder is a part of
PyGrinder is a part of
<a href="https://github.com/WenjieDu/PyPOTS">
PyPOTS <img align="center" src="https://img.shields.io/github/stars/WenjieDu/PyPOTS?style=social">
</a>
Expand All @@ -74,20 +74,20 @@ or install from source code:
```python
import numpy as np
import pygrinder
from pygrinder import mcar, mar_logistic, mnar_x, mnar_t

# given a time-series dataset with 128 samples, each sample with 10 time steps and 36 data features
ts_dataset = np.random.randn(128, 10, 36)

# grind the dataset with MCAR pattern, 10% missing probability, and using 0 to fill missing values
X_intact, X, missing_mask, indicating_mask = pygrinder.mcar(ts_dataset, p=0.1, nan=0)
X_with_mcar_data = mcar(ts_dataset, p=0.1)

# grind the dataset with MAR pattern
X_intact, X, missing_mask, indicating_mask = pygrinder.mar_logistic(ts_dataset[:, 0, :], obs_rate=0.1, missing_rate=0.1, nan=0)
X_with_mar_data = mar_logistic(ts_dataset[:, 0, :], obs_rate=0.1, missing_rate=0.1)

# grind the dataset with MNAR pattern
X_intact, X, missing_mask, indicating_mask = pygrinder.mnar_x(ts_dataset, offset=0.1, nan=0)
X_intact, X, missing_mask, indicating_mask = pygrinder.mnar_t(ts_dataset, cycle=20, pos = 10, scale = 3, nan=0)
X_with_mnar_x_data = mnar_x(ts_dataset, offset=0.1)
X_with_mnar_t_data = mnar_t(ts_dataset, cycle=20, pos = 10, scale = 3)
```


Expand Down
14 changes: 10 additions & 4 deletions pygrinder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,17 @@
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
__version__ = "0.3"
__version__ = "0.4"

from .missing_completely_at_random import mcar, mcar_little_test
from .missing_at_random import mar_logistic
from .missing_completely_at_random import mcar, mcar_little_test
from .missing_not_at_random import mnar_x, mnar_t
from .utils import (
cal_missing_rate,
calc_missing_rate,
masked_fill,
fill_and_get_mask,
fill_and_get_mask_torch,
fill_and_get_mask_numpy,
)

__all__ = [
Expand All @@ -38,6 +41,9 @@
"mar_logistic",
"mnar_x",
"mnar_t",
"cal_missing_rate",
"calc_missing_rate",
"masked_fill",
"fill_and_get_mask",
"fill_and_get_mask_torch",
"fill_and_get_mask_numpy",
]
163 changes: 45 additions & 118 deletions pygrinder/missing_at_random/mar_logistic.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,121 +5,18 @@
# Created by Wenjie Du <[email protected]>
# License: BSD-3-Clause

from typing import Union, Tuple, overload
from typing import Union

import numpy as np
import torch
from scipy import optimize


@overload
def mar_logistic(
X: Union[torch.Tensor, np.ndarray],
obs_rate: float,
missing_rate: float,
return_masks: bool = True,
nan: Union[float, int] = 0,
) -> Union[Tuple[np.ndarray, ...], Tuple[torch.Tensor, ...], np.ndarray, torch.Tensor]:
raise NotImplementedError()


@overload
def mar_logistic(
X: Union[torch.Tensor, np.ndarray],
obs_rate: float,
missing_rate: float,
return_masks: bool = False,
nan: Union[float, int] = 0,
) -> Union[Tuple[np.ndarray, ...], Tuple[torch.Tensor, ...], np.ndarray, torch.Tensor]:
raise NotImplementedError()


def mar_logistic(
X: Union[torch.Tensor, np.ndarray],
obs_rate: float,
missing_rate: float,
return_masks: bool = True,
nan: Union[float, int] = 0,
) -> Union[Tuple[np.ndarray, ...], Tuple[torch.Tensor, ...], np.ndarray, torch.Tensor]:
"""Create random missing values (MAR case) with a logistic model.
First, a subset of the variables without missing values is randomly selected.
Missing values will be introduced into the remaining variables according to a logistic model with random weights.
This implementation is inspired by the tutorial
https://rmisstastic.netlify.app/how-to/python/generate_html/how%20to%20generate%20missing%20values
Parameters
----------
X : shape of [n_steps, n_features]
A time series data vector without any missing data.
obs_rate :
The proportion of variables without missing values that will be used for fitting the logistic masking model.
missing_rate:
The proportion of missing values to generate for variables which will have missing values.
return_masks : bool, optional, default=True
Whether to return the masks indicating missing values in X and indicating artificially-missing values in X.
If True, return X_intact, X, missing_mask, and indicating_mask (refer to Returns for more details).
If False, only return X with added missing at random values.
nan : int/float, optional, default=0
Value used to fill NaN values. Only valid when return_masks is True.
If return_masks is False, the NaN values will be kept as NaN.
Returns
-------
If return_masks is True:
X_intact : array,
Original data with missing values (nan) filled with given parameter `nan`, with observed values intact.
X_intact is for loss calculation in the masked imputation task.
X : array,
Original X with artificial missing values. X is for model input.
Both originally-missing and artificially-missing values are filled with given parameter `nan`.
missing_mask : array,
The mask indicates all missing values in X.
In it, 1 indicates observed values, and 0 indicates missing values.
indicating_mask : array,
The mask indicates the artificially-missing values in X, namely missing parts different from X_intact.
In it, 1 indicates artificially missing values,
and the other values (including originally observed/missing values) are indicated as 0.
If return_masks is False:
X : array-like
Original X with artificial missing values.
Both originally-missing and artificially-missing values are left as NaN.
"""
if isinstance(X, list):
X = np.asarray(X)

if isinstance(X, np.ndarray) or isinstance(X, torch.Tensor):
results = _mar_logistic_torch(X, obs_rate, missing_rate, return_masks, nan)
else:
raise TypeError(
"X must be type of list/numpy.ndarray/torch.Tensor, " f"but got {type(X)}"
)

if not return_masks:
X = results
return X

X_intact, X, missing_mask, indicating_mask = results
return X_intact, X, missing_mask, indicating_mask


def _mar_logistic_torch(
X: Union[np.ndarray, torch.Tensor],
rate: float,
rate_obs: float,
return_masks: bool,
nan: Union[float, int] = 0,
):
rate_missing: float,
) -> Union[np.ndarray, torch.Tensor]:
def pick_coefficients(X, idxs_obs=None, idxs_nas=None, self_mask=False):
n, d = X.shape
if self_mask:
Expand Down Expand Up @@ -168,7 +65,6 @@ def f(x):
torch.isnan(X).sum() == 0
), "the input X of the mar_logistic() shouldn't containing originally missing data"

X_intact = torch.clone(X)
mask = torch.zeros(n, d).bool()

# number of variables that will have no missing values (at least one variable)
Expand All @@ -182,23 +78,54 @@ def f(x):
# Pick coefficients so that W^Tx has unit variance (avoids shrinking)
coeffs = pick_coefficients(X, idxs_obs, idxs_nas)
# Pick the intercepts to have a desired amount of missing values
intercepts = fit_intercepts(X[:, idxs_obs], coeffs, rate)
intercepts = fit_intercepts(X[:, idxs_obs], coeffs, rate_missing)

ps = torch.sigmoid(X[:, idxs_obs].mm(coeffs) + intercepts)
ber = torch.rand(n, d_na)
mask[:, idxs_nas] = ber < ps # True means missing

X[mask] = torch.nan

if not return_masks: # return X with MCAR values only if not return masks
return X.numpy() if ori_type_is_np else X
return X.numpy() if ori_type_is_np else X

X = torch.nan_to_num(X, nan)
missing_mask = (~mask).to(torch.float32)
indicating_mask = torch.clone(missing_mask)

if ori_type_is_np:
X_intact, X, missing_mask, indicating_mask = [
i.numpy() for i in (X_intact, X, missing_mask, indicating_mask)
]
return tuple((X_intact, X, missing_mask, indicating_mask))
def mar_logistic(
    X: Union[torch.Tensor, np.ndarray],
    obs_rate: float,
    missing_rate: float,
) -> Union[np.ndarray, torch.Tensor]:
    """Create random missing values (MAR case) with a logistic model.

    First, a subset of the variables without missing values is randomly selected.
    Missing values will be introduced into the remaining variables according to
    a logistic model with random weights.

    This implementation is inspired by the tutorial
    https://rmisstastic.netlify.app/how-to/python/generate_html/how%20to%20generate%20missing%20values

    Parameters
    ----------
    X : shape of [n_steps, n_features]
        A time series data vector without any missing data. A plain list is
        converted to a numpy array first.

    obs_rate :
        The proportion of variables without missing values that will be used
        for fitting the logistic masking model.

    missing_rate :
        The proportion of missing values to generate for variables which will
        have missing values.

    Returns
    -------
    corrupted_X : numpy.ndarray or torch.Tensor
        X with artificial missing values, which are set to NaN.

    Raises
    ------
    TypeError
        If X is not a list, numpy.ndarray, or torch.Tensor.
    """
    if isinstance(X, list):
        X = np.asarray(X)

    if not isinstance(X, (np.ndarray, torch.Tensor)):
        raise TypeError(
            "X must be type of list/numpy.ndarray/torch.Tensor, " f"but got {type(X)}"
        )

    # The helper takes (X, rate_obs, rate_missing) in that order. The previous
    # call passed (X, missing_rate, obs_rate), which silently swapped the two
    # rates: the fraction of fully observed variables was driven by
    # `missing_rate` and the intercept fit by `obs_rate`.
    return _mar_logistic_torch(X, obs_rate, missing_rate)
Loading

0 comments on commit 30bf360

Please sign in to comment.