From e2e72600468440ed43c99033307e1cf0182d55ee Mon Sep 17 00:00:00 2001 From: eicchen Date: Thu, 22 May 2025 17:19:47 -0500 Subject: [PATCH 1/3] Merged changes from branch BUG_df.rolling_#61416 to be included in PR --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/window/aggregations.pyx | 123 ++++++++++-------- pandas/tests/window/test_rolling_skew_kurt.py | 50 +++++++ 3 files changed, 121 insertions(+), 53 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 099e5bc48353a..d060da4bc8743 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -830,6 +830,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`) - Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`) - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`) +- Bug in :meth:`Rolling.kurt` where kurtosis calculations would unintentionally be influenced by values outside of scope. (:issue:`61416`) - Bug in :meth:`Series.resample` could raise when the the date range ended shortly before a non-existent time. (:issue:`58380`) Reshaping diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 0c8ea28b60ce8..0595ca40bad60 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1,13 +1,18 @@ # cython: boundscheck=False, wraparound=False, cdivision=True - from libc.math cimport ( round, signbit, sqrt, + pow, + log10, + abs, + isfinite, ) from libcpp.deque cimport deque from libcpp.stack cimport stack from libcpp.unordered_map cimport unordered_map +from libcpp cimport bool + from pandas._libs.algos cimport TiebreakEnumType @@ -21,6 +26,8 @@ from numpy cimport ( ndarray, ) + + cnp.import_array() import cython @@ -724,6 +731,46 @@ cdef float64_t calc_kurt(int64_t minp, int64_t nobs, return result +cdef void update_sum_of_window( float64_t val, + float64_t **x_value, + float64_t **comp_value, + int power_of_element, + bool add_mode, #1 for add_kurt, 0 for remove_kurt + ) noexcept nogil: + + cdef: + float64_t val_raised, new_sum + bool val_length_flag, x_length_flag + + if add_mode: + val_raised = pow(val, power_of_element) + else: + val_raised = -pow(val, power_of_element) + + x_length_flag = abs(log10(abs(x_value[0][0]))) > 15 and isfinite(abs(log10(abs(x_value[0][0])))) == 1 + val_length_flag = abs(log10(abs(val_raised))) > 15 and isfinite(abs(log10(abs(val_raised)))) == 1 + + # We'll try to maintain comp_value as the counter for + # numbers <1e15 to keep it from getting rounded out. + if x_length_flag and val_length_flag: + #Both > 1e15 or < 1-e15 + x_value[0][0] += val_raised + + elif x_length_flag: + comp_value[0][0] += val_raised + + + elif val_length_flag: + comp_value[0][0] += x_value[0][0] + x_value[0][0] = val_raised + + else: + #Neither are >1e15/<1e-15, safe to proceed + x_value[0][0] += val_raised + + if comp_value[0][0] != 0: + x_value[0][0] += comp_value[0][0] + comp_value[0][0] = 0 cdef void add_kurt(float64_t val, int64_t *nobs, float64_t *x, float64_t *xx, @@ -736,29 +783,15 @@ cdef void add_kurt(float64_t val, int64_t *nobs, float64_t *prev_value ) noexcept nogil: """ add a value from the kurotic calc """ - cdef: - float64_t y, t # Not NaN if val == val: nobs[0] = nobs[0] + 1 - y = val - compensation_x[0] - t = x[0] + y - compensation_x[0] = t - x[0] - y - x[0] = t - y = val * val - compensation_xx[0] - t = xx[0] + y - compensation_xx[0] = t - xx[0] - y - xx[0] = t - y = val * val * val - compensation_xxx[0] - t = xxx[0] + y - compensation_xxx[0] = t - xxx[0] - y - xxx[0] = t - y = val * val * val * val - compensation_xxxx[0] - t = xxxx[0] + y - compensation_xxxx[0] = t - xxxx[0] - y - xxxx[0] = t + update_sum_of_window(val, &x, &compensation_x, 1, 1) + update_sum_of_window(val, &xx, &compensation_xx, 2, 1) + update_sum_of_window(val, &xxx, &compensation_xxx, 3, 1) + update_sum_of_window(val, &xxxx, &compensation_xxxx, 4, 1) # GH#42064, record num of same values to remove floating point artifacts if val == prev_value[0]: @@ -768,7 +801,6 @@ cdef void add_kurt(float64_t val, int64_t *nobs, num_consecutive_same_value[0] = 1 prev_value[0] = val - cdef void remove_kurt(float64_t val, int64_t *nobs, float64_t *x, float64_t *xx, float64_t *xxx, float64_t *xxxx, @@ -777,40 +809,25 @@ cdef void remove_kurt(float64_t val, int64_t *nobs, float64_t *compensation_xxx, float64_t *compensation_xxxx) noexcept nogil: """ remove a value from the kurotic calc """ - cdef: - float64_t y, t # Not NaN if val == val: nobs[0] = nobs[0] - 1 - y = - val - compensation_x[0] - t = x[0] + y - compensation_x[0] = t - x[0] - y - x[0] = t - y = - val * val - compensation_xx[0] - t = xx[0] + y - compensation_xx[0] = t - xx[0] - y - xx[0] = t - y = - val * val * val - compensation_xxx[0] - t = xxx[0] + y - compensation_xxx[0] = t - xxx[0] - y - xxx[0] = t - y = - val * val * val * val - compensation_xxxx[0] - t = xxxx[0] + y - compensation_xxxx[0] = t - xxxx[0] - y - xxxx[0] = t - + update_sum_of_window(val, &x, &compensation_x, 1, 0) + update_sum_of_window(val, &xx, &compensation_xx, 2, 0) + update_sum_of_window(val, &xxx, &compensation_xxx, 3, 0) + update_sum_of_window(val, &xxxx, &compensation_xxxx, 4, 0) def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp) -> np.ndarray: cdef: Py_ssize_t i, j float64_t val, mean_val, min_val, sum_val = 0 - float64_t compensation_xxxx_add, compensation_xxxx_remove - float64_t compensation_xxx_remove, compensation_xxx_add - float64_t compensation_xx_remove, compensation_xx_add - float64_t compensation_x_remove, compensation_x_add + float64_t compensation_xxxx + float64_t compensation_xxx + float64_t compensation_xx + float64_t compensation_x float64_t x, xx, xxx, xxxx float64_t prev_value int64_t nobs, s, e, num_consecutive_same_value @@ -851,16 +868,16 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, prev_value = values[s] num_consecutive_same_value = 0 - compensation_xxxx_add = compensation_xxxx_remove = 0 - compensation_xxx_remove = compensation_xxx_add = 0 - compensation_xx_remove = compensation_xx_add = 0 - compensation_x_remove = compensation_x_add = 0 + compensation_xxxx = 0 + compensation_xxx = 0 + compensation_xx = 0 + compensation_x = 0 x = xx = xxx = xxxx = 0 nobs = 0 for j in range(s, e): add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx, - &compensation_x_add, &compensation_xx_add, - &compensation_xxx_add, &compensation_xxxx_add, + &compensation_x, &compensation_xx, + &compensation_xxx, &compensation_xxxx, &num_consecutive_same_value, &prev_value) else: @@ -870,14 +887,14 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, # calculate deletes for j in range(start[i - 1], s): remove_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx, - &compensation_x_remove, &compensation_xx_remove, - &compensation_xxx_remove, &compensation_xxxx_remove) + &compensation_x, &compensation_xx, + &compensation_xxx, &compensation_xxxx) # calculate adds for j in range(end[i - 1], e): add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx, - &compensation_x_add, &compensation_xx_add, - &compensation_xxx_add, &compensation_xxxx_add, + &compensation_x, &compensation_xx, + &compensation_xxx, &compensation_xxxx, &num_consecutive_same_value, &prev_value) output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx, diff --git a/pandas/tests/window/test_rolling_skew_kurt.py b/pandas/tests/window/test_rolling_skew_kurt.py index 79c14f243e7cc..5ec7218e4d399 100644 --- a/pandas/tests/window/test_rolling_skew_kurt.py +++ b/pandas/tests/window/test_rolling_skew_kurt.py @@ -225,3 +225,53 @@ def test_rolling_kurt_eq_value_fperr(step): a = Series([1.1] * 15).rolling(window=10, step=step).kurt() assert (a[a.index >= 9] == -3).all() assert a[a.index < 9].isna().all() + +@pytest.mark.parametrize("test_len, window_size, modifiers", + [([0, 10], 5, [[0,1e6], [3, -1e6]]), + ([0, 10], 5, [[0,1e-6], [3, 1e6]]), + ([10, 100], 20,[[40, -1e10], [59, -9e9]]), + ([10500, 11000], 200,[[10581, 0], [109900, -1e6], [10999, 0]]), + ] + ) +def test_rolling_kurt_outlier_influence(test_len, window_size, modifiers): + # #61416 Extreme values causes kurtosis value to become incorrect + test_series = Series(range(test_len[0], test_len[1]), index = range(test_len[0], test_len[1])) + for ind, number in modifiers: + test_series = test_series.replace(ind, number) + + #minimum elements needed for "window_size" number of kurts + test_len_diff = test_len[1] - test_len[0] + min_elements_needed = test_len_diff - 2*window_size + 1 + expected_series = (test_series[min_elements_needed:].reindex(range(test_len[0], test_len[1]))) + + actual = test_series.rolling(window_size,min_periods=1).kurt() + expected = expected_series.rolling(window_size,min_periods=1).kurt() + + tm.assert_series_equal(actual.tail(window_size), + expected.tail(window_size) + ) + +@pytest.mark.parametrize("array_param, window_size, modifiers", + [([10, 10, 10], 5, [[0,1e6], [3, -1e6]]), + ([-15, 10, 10], 5, [[0,1e2], [3, 1e6]]), + ([1e4, 1e3, 100], 20, [[90,-1e7], [0, 1e7]]), + ([1e-3, 3e-3, 100], 20, [[90,100], [20, 1e4]]), + ] + ) +def test_rolling_kurt_outlier_influence_rand(array_param, window_size, modifiers): + # #61416 Extreme values causes kurtosis value to become incorrect + rand_array = np.random.default_rng(5).normal(array_param[0], array_param[1], array_param[2]) + test_series = Series(rand_array) + for ind, number in modifiers: + test_series = test_series.replace(ind, number) + + #minimum elements needed for "window_size" number of kurts + min_elements_needed = array_param[2] - 2*window_size + 1 + expected_series = (test_series[min_elements_needed:]) + + actual = test_series.rolling(window_size,min_periods=1).kurt() + expected = expected_series.rolling(window_size,min_periods=1).kurt() + + tm.assert_series_equal(actual.tail(window_size), + expected.tail(window_size) + ) \ No newline at end of file From 76cb380389b94fdf70065f1675c45c233f0bdd56 Mon Sep 17 00:00:00 2001 From: eicchen Date: Thu, 22 May 2025 17:51:11 -0500 Subject: [PATCH 2/3] pre-commit checks I forgot to run :/ --- pandas/_libs/window/aggregations.pyx | 47 +++++----- pandas/tests/window/test_rolling_skew_kurt.py | 86 ++++++++++--------- 2 files changed, 71 insertions(+), 62 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 0595ca40bad60..df9983f688c07 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1,18 +1,17 @@ # cython: boundscheck=False, wraparound=False, cdivision=True from libc.math cimport ( + abs, + isfinite, + log10, + pow, round, signbit, sqrt, - pow, - log10, - abs, - isfinite, ) +from libcpp cimport bool from libcpp.deque cimport deque from libcpp.stack cimport stack from libcpp.unordered_map cimport unordered_map -from libcpp cimport bool - from pandas._libs.algos cimport TiebreakEnumType @@ -26,8 +25,6 @@ from numpy cimport ( ndarray, ) - - cnp.import_array() import cython @@ -731,41 +728,44 @@ cdef float64_t calc_kurt(int64_t minp, int64_t nobs, return result -cdef void update_sum_of_window( float64_t val, - float64_t **x_value, - float64_t **comp_value, - int power_of_element, - bool add_mode, #1 for add_kurt, 0 for remove_kurt - ) noexcept nogil: +cdef void update_sum_of_window(float64_t val, + float64_t **x_value, + float64_t **comp_value, + int power_of_element, + bool add_mode, # 1 for add_kurt, 0 for remove_kurt + ) noexcept nogil: cdef: - float64_t val_raised, new_sum + float64_t val_raised + int val_length, x_length bool val_length_flag, x_length_flag - + if add_mode: val_raised = pow(val, power_of_element) else: val_raised = -pow(val, power_of_element) - x_length_flag = abs(log10(abs(x_value[0][0]))) > 15 and isfinite(abs(log10(abs(x_value[0][0])))) == 1 - val_length_flag = abs(log10(abs(val_raised))) > 15 and isfinite(abs(log10(abs(val_raised)))) == 1 + x_length = abs(log10(abs(x_value[0][0]))) + val_length = abs(log10(abs(val_raised))) - # We'll try to maintain comp_value as the counter for + x_length_flag = x_length > 15 and isfinite(x_length) + val_length_flag = val_length > 15 and isfinite(val_length) + + # We'll try to maintain comp_value as the counter for # numbers <1e15 to keep it from getting rounded out. if x_length_flag and val_length_flag: - #Both > 1e15 or < 1-e15 + # Both > 1e15 or < 1-e15 x_value[0][0] += val_raised elif x_length_flag: comp_value[0][0] += val_raised - elif val_length_flag: comp_value[0][0] += x_value[0][0] x_value[0][0] = val_raised - + else: - #Neither are >1e15/<1e-15, safe to proceed + # Neither are >1e15/<1e-15, safe to proceed x_value[0][0] += val_raised if comp_value[0][0] != 0: @@ -819,6 +819,7 @@ cdef void remove_kurt(float64_t val, int64_t *nobs, update_sum_of_window(val, &xxx, &compensation_xxx, 3, 0) update_sum_of_window(val, &xxxx, &compensation_xxxx, 4, 0) + def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp) -> np.ndarray: cdef: diff --git a/pandas/tests/window/test_rolling_skew_kurt.py b/pandas/tests/window/test_rolling_skew_kurt.py index 5ec7218e4d399..5faba6d9103e7 100644 --- a/pandas/tests/window/test_rolling_skew_kurt.py +++ b/pandas/tests/window/test_rolling_skew_kurt.py @@ -226,52 +226,60 @@ def test_rolling_kurt_eq_value_fperr(step): assert (a[a.index >= 9] == -3).all() assert a[a.index < 9].isna().all() -@pytest.mark.parametrize("test_len, window_size, modifiers", - [([0, 10], 5, [[0,1e6], [3, -1e6]]), - ([0, 10], 5, [[0,1e-6], [3, 1e6]]), - ([10, 100], 20,[[40, -1e10], [59, -9e9]]), - ([10500, 11000], 200,[[10581, 0], [109900, -1e6], [10999, 0]]), - ] - ) + +@pytest.mark.parametrize( + "test_len, window_size, modifiers", + [ + ([0, 10], 5, [[0, 1e6], [3, -1e6]]), + ([0, 10], 5, [[0, 1e-6], [3, 1e6]]), + ([10, 100], 20, [[40, -1e10], [59, -9e9]]), + ([10500, 11000], 200, [[10581, 0], [109900, -1e6], [10999, 0]]), + ], +) def test_rolling_kurt_outlier_influence(test_len, window_size, modifiers): # #61416 Extreme values causes kurtosis value to become incorrect - test_series = Series(range(test_len[0], test_len[1]), index = range(test_len[0], test_len[1])) + test_series = Series( + range(test_len[0], test_len[1]), index=range(test_len[0], test_len[1]) + ) for ind, number in modifiers: test_series = test_series.replace(ind, number) - - #minimum elements needed for "window_size" number of kurts + + # minimum elements needed for "window_size" number of kurts test_len_diff = test_len[1] - test_len[0] - min_elements_needed = test_len_diff - 2*window_size + 1 - expected_series = (test_series[min_elements_needed:].reindex(range(test_len[0], test_len[1]))) - - actual = test_series.rolling(window_size,min_periods=1).kurt() - expected = expected_series.rolling(window_size,min_periods=1).kurt() - - tm.assert_series_equal(actual.tail(window_size), - expected.tail(window_size) - ) - -@pytest.mark.parametrize("array_param, window_size, modifiers", - [([10, 10, 10], 5, [[0,1e6], [3, -1e6]]), - ([-15, 10, 10], 5, [[0,1e2], [3, 1e6]]), - ([1e4, 1e3, 100], 20, [[90,-1e7], [0, 1e7]]), - ([1e-3, 3e-3, 100], 20, [[90,100], [20, 1e4]]), - ] - ) + min_elements_needed = test_len_diff - 2 * window_size + 1 + expected_series = test_series[min_elements_needed:].reindex( + range(test_len[0], test_len[1]) + ) + + actual = test_series.rolling(window_size, min_periods=1).kurt() + expected = expected_series.rolling(window_size, min_periods=1).kurt() + + tm.assert_series_equal(actual.tail(window_size), expected.tail(window_size)) + + +@pytest.mark.parametrize( + "array_param, window_size, modifiers", + [ + ([10, 10, 10], 5, [[0, 1e6], [3, -1e6]]), + ([-15, 10, 10], 5, [[0, 1e2], [3, 1e6]]), + ([1e4, 1e3, 100], 20, [[90, -1e7], [0, 1e7]]), + ([1e-3, 3e-3, 100], 20, [[90, 100], [20, 1e4]]), + ], +) def test_rolling_kurt_outlier_influence_rand(array_param, window_size, modifiers): # #61416 Extreme values causes kurtosis value to become incorrect - rand_array = np.random.default_rng(5).normal(array_param[0], array_param[1], array_param[2]) + rand_array = np.random.default_rng(5).normal( + array_param[0], array_param[1], array_param[2] + ) test_series = Series(rand_array) for ind, number in modifiers: test_series = test_series.replace(ind, number) - - #minimum elements needed for "window_size" number of kurts - min_elements_needed = array_param[2] - 2*window_size + 1 - expected_series = (test_series[min_elements_needed:]) - - actual = test_series.rolling(window_size,min_periods=1).kurt() - expected = expected_series.rolling(window_size,min_periods=1).kurt() - - tm.assert_series_equal(actual.tail(window_size), - expected.tail(window_size) - ) \ No newline at end of file + + # minimum elements needed for "window_size" number of kurts + min_elements_needed = array_param[2] - 2 * window_size + 1 + expected_series = test_series[min_elements_needed:] + + actual = test_series.rolling(window_size, min_periods=1).kurt() + expected = expected_series.rolling(window_size, min_periods=1).kurt() + + tm.assert_series_equal(actual.tail(window_size), expected.tail(window_size)) From 7763a832f4713572d68f79d30b7341f7d12bb519 Mon Sep 17 00:00:00 2001 From: eicchen Date: Thu, 22 May 2025 17:52:37 -0500 Subject: [PATCH 3/3] Errant type error --- pandas/_libs/window/aggregations.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index df9983f688c07..bf787a799293c 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -737,7 +737,7 @@ cdef void update_sum_of_window(float64_t val, cdef: float64_t val_raised - int val_length, x_length + double val_length, x_length bool val_length_flag, x_length_flag if add_mode: