Skip to content

Fixed issue where rolling.kurt() calculations would be effected by values outside of scope #61481

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -830,6 +830,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
- Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`)
- Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`)
- Bug in :meth:`Rolling.kurt` where kurtosis calculations would unintentionally be influenced by values outside of scope. (:issue:`61416`)
- Bug in :meth:`Series.resample` could raise when the the date range ended shortly before a non-existent time. (:issue:`58380`)

Reshaping
Expand Down
122 changes: 70 additions & 52 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
# cython: boundscheck=False, wraparound=False, cdivision=True

from libc.math cimport (
abs,
isfinite,
log10,
pow,
round,
signbit,
sqrt,
)
from libcpp cimport bool
from libcpp.deque cimport deque
from libcpp.stack cimport stack
from libcpp.unordered_map cimport unordered_map
Expand Down Expand Up @@ -724,6 +728,49 @@ cdef float64_t calc_kurt(int64_t minp, int64_t nobs,

return result

cdef void update_sum_of_window(float64_t val,
float64_t **x_value,
float64_t **comp_value,
int power_of_element,
bool add_mode, # 1 for add_kurt, 0 for remove_kurt
) noexcept nogil:

cdef:
float64_t val_raised
double val_length, x_length
bool val_length_flag, x_length_flag

if add_mode:
val_raised = pow(val, power_of_element)
else:
val_raised = -pow(val, power_of_element)

x_length = abs(log10(abs(x_value[0][0])))
val_length = abs(log10(abs(val_raised)))

x_length_flag = x_length > 15 and isfinite(x_length)
val_length_flag = val_length > 15 and isfinite(val_length)

# We'll try to maintain comp_value as the counter for
# numbers <1e15 to keep it from getting rounded out.
if x_length_flag and val_length_flag:
# Both > 1e15 or < 1-e15
x_value[0][0] += val_raised

elif x_length_flag:
comp_value[0][0] += val_raised

elif val_length_flag:
comp_value[0][0] += x_value[0][0]
x_value[0][0] = val_raised

else:
# Neither are >1e15/<1e-15, safe to proceed
x_value[0][0] += val_raised

if comp_value[0][0] != 0:
x_value[0][0] += comp_value[0][0]
comp_value[0][0] = 0

cdef void add_kurt(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
Expand All @@ -736,29 +783,15 @@ cdef void add_kurt(float64_t val, int64_t *nobs,
float64_t *prev_value
) noexcept nogil:
""" add a value from the kurotic calc """
cdef:
float64_t y, t

# Not NaN
if val == val:
nobs[0] = nobs[0] + 1

y = val - compensation_x[0]
t = x[0] + y
compensation_x[0] = t - x[0] - y
x[0] = t
y = val * val - compensation_xx[0]
t = xx[0] + y
compensation_xx[0] = t - xx[0] - y
xx[0] = t
y = val * val * val - compensation_xxx[0]
t = xxx[0] + y
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t
y = val * val * val * val - compensation_xxxx[0]
t = xxxx[0] + y
compensation_xxxx[0] = t - xxxx[0] - y
xxxx[0] = t
update_sum_of_window(val, &x, &compensation_x, 1, 1)
update_sum_of_window(val, &xx, &compensation_xx, 2, 1)
update_sum_of_window(val, &xxx, &compensation_xxx, 3, 1)
update_sum_of_window(val, &xxxx, &compensation_xxxx, 4, 1)

# GH#42064, record num of same values to remove floating point artifacts
if val == prev_value[0]:
Expand All @@ -768,7 +801,6 @@ cdef void add_kurt(float64_t val, int64_t *nobs,
num_consecutive_same_value[0] = 1
prev_value[0] = val


cdef void remove_kurt(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
float64_t *xxx, float64_t *xxxx,
Expand All @@ -777,40 +809,26 @@ cdef void remove_kurt(float64_t val, int64_t *nobs,
float64_t *compensation_xxx,
float64_t *compensation_xxxx) noexcept nogil:
""" remove a value from the kurotic calc """
cdef:
float64_t y, t

# Not NaN
if val == val:
nobs[0] = nobs[0] - 1

y = - val - compensation_x[0]
t = x[0] + y
compensation_x[0] = t - x[0] - y
x[0] = t
y = - val * val - compensation_xx[0]
t = xx[0] + y
compensation_xx[0] = t - xx[0] - y
xx[0] = t
y = - val * val * val - compensation_xxx[0]
t = xxx[0] + y
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t
y = - val * val * val * val - compensation_xxxx[0]
t = xxxx[0] + y
compensation_xxxx[0] = t - xxxx[0] - y
xxxx[0] = t
update_sum_of_window(val, &x, &compensation_x, 1, 0)
update_sum_of_window(val, &xx, &compensation_xx, 2, 0)
update_sum_of_window(val, &xxx, &compensation_xxx, 3, 0)
update_sum_of_window(val, &xxxx, &compensation_xxxx, 4, 0)


def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
cdef:
Py_ssize_t i, j
float64_t val, mean_val, min_val, sum_val = 0
float64_t compensation_xxxx_add, compensation_xxxx_remove
float64_t compensation_xxx_remove, compensation_xxx_add
float64_t compensation_xx_remove, compensation_xx_add
float64_t compensation_x_remove, compensation_x_add
float64_t compensation_xxxx
float64_t compensation_xxx
float64_t compensation_xx
float64_t compensation_x
float64_t x, xx, xxx, xxxx
float64_t prev_value
int64_t nobs, s, e, num_consecutive_same_value
Expand Down Expand Up @@ -851,16 +869,16 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
prev_value = values[s]
num_consecutive_same_value = 0

compensation_xxxx_add = compensation_xxxx_remove = 0
compensation_xxx_remove = compensation_xxx_add = 0
compensation_xx_remove = compensation_xx_add = 0
compensation_x_remove = compensation_x_add = 0
compensation_xxxx = 0
compensation_xxx = 0
compensation_xx = 0
compensation_x = 0
x = xx = xxx = xxxx = 0
nobs = 0
for j in range(s, e):
add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_add, &compensation_xx_add,
&compensation_xxx_add, &compensation_xxxx_add,
&compensation_x, &compensation_xx,
&compensation_xxx, &compensation_xxxx,
&num_consecutive_same_value, &prev_value)

else:
Expand All @@ -870,14 +888,14 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
# calculate deletes
for j in range(start[i - 1], s):
remove_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_remove, &compensation_xx_remove,
&compensation_xxx_remove, &compensation_xxxx_remove)
&compensation_x, &compensation_xx,
&compensation_xxx, &compensation_xxxx)

# calculate adds
for j in range(end[i - 1], e):
add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_add, &compensation_xx_add,
&compensation_xxx_add, &compensation_xxxx_add,
&compensation_x, &compensation_xx,
&compensation_xxx, &compensation_xxxx,
&num_consecutive_same_value, &prev_value)

output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx,
Expand Down
58 changes: 58 additions & 0 deletions pandas/tests/window/test_rolling_skew_kurt.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,3 +225,61 @@ def test_rolling_kurt_eq_value_fperr(step):
a = Series([1.1] * 15).rolling(window=10, step=step).kurt()
assert (a[a.index >= 9] == -3).all()
assert a[a.index < 9].isna().all()


@pytest.mark.parametrize(
"test_len, window_size, modifiers",
[
([0, 10], 5, [[0, 1e6], [3, -1e6]]),
([0, 10], 5, [[0, 1e-6], [3, 1e6]]),
([10, 100], 20, [[40, -1e10], [59, -9e9]]),
([10500, 11000], 200, [[10581, 0], [109900, -1e6], [10999, 0]]),
],
)
def test_rolling_kurt_outlier_influence(test_len, window_size, modifiers):
# #61416 Extreme values causes kurtosis value to become incorrect
test_series = Series(
range(test_len[0], test_len[1]), index=range(test_len[0], test_len[1])
)
for ind, number in modifiers:
test_series = test_series.replace(ind, number)

# minimum elements needed for "window_size" number of kurts
test_len_diff = test_len[1] - test_len[0]
min_elements_needed = test_len_diff - 2 * window_size + 1
expected_series = test_series[min_elements_needed:].reindex(
range(test_len[0], test_len[1])
)

actual = test_series.rolling(window_size, min_periods=1).kurt()
expected = expected_series.rolling(window_size, min_periods=1).kurt()

tm.assert_series_equal(actual.tail(window_size), expected.tail(window_size))


@pytest.mark.parametrize(
"array_param, window_size, modifiers",
[
([10, 10, 10], 5, [[0, 1e6], [3, -1e6]]),
([-15, 10, 10], 5, [[0, 1e2], [3, 1e6]]),
([1e4, 1e3, 100], 20, [[90, -1e7], [0, 1e7]]),
([1e-3, 3e-3, 100], 20, [[90, 100], [20, 1e4]]),
],
)
def test_rolling_kurt_outlier_influence_rand(array_param, window_size, modifiers):
# #61416 Extreme values causes kurtosis value to become incorrect
rand_array = np.random.default_rng(5).normal(
array_param[0], array_param[1], array_param[2]
)
test_series = Series(rand_array)
for ind, number in modifiers:
test_series = test_series.replace(ind, number)

# minimum elements needed for "window_size" number of kurts
min_elements_needed = array_param[2] - 2 * window_size + 1
expected_series = test_series[min_elements_needed:]

actual = test_series.rolling(window_size, min_periods=1).kurt()
expected = expected_series.rolling(window_size, min_periods=1).kurt()

tm.assert_series_equal(actual.tail(window_size), expected.tail(window_size))
Loading