From 9163039071fb3ce74b6757b5f6637f0a2299a13e Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Fri, 15 Aug 2025 09:53:35 -0700 Subject: [PATCH 1/3] nest accessor masking --- src/nested_pandas/nestedframe/core.py | 5 +++++ src/nested_pandas/series/accessor.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index 8eb7e8c2..81572bee 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -226,6 +226,11 @@ def __getitem__(self, item): return super().__getitem__(item) def _getitem_str(self, item): + + #if item in self.nested_columns: + # # If the item is a nested column, return a flat dataframe + # return super().__getitem__(item).nest.to_flat() + # Preempt the nested check if the item is a base column, with or without # dots and backticks. if item in self.columns: diff --git a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py index f69d84d2..6a7d3bbd 100644 --- a/src/nested_pandas/series/accessor.py +++ b/src/nested_pandas/series/accessor.py @@ -13,6 +13,7 @@ from nested_pandas.series.dtype import NestedDtype from nested_pandas.series.packer import pack_flat, pack_sorted_df_into_struct from nested_pandas.series.utils import nested_types_mapper +from nested_pandas.nestedframe.core import NestedFrame __all__ = ["NestSeriesAccessor"] @@ -479,6 +480,19 @@ def get_list_series(self, field: str) -> pd.Series: ) def __getitem__(self, key: str | list[str]) -> pd.Series: + + #import pdb;pdb.set_trace() + if isinstance(key, pd.Series) and pd.api.types.is_bool_dtype(key.dtype): + # Boolean masking + flat_df = self.to_flat() + if not key.index.equals(flat_df.index): + raise ValueError("Boolean mask must have the same index as the series") + # Apply the mask to the series + masked_df = flat_df[key] + + nested_idx = NestedFrame(index=self._series.index) + return nested_idx.add_nested(masked_df, name=self._series.name) + 
if isinstance(key, list): new_array = self._series.array.view_fields(key) return pd.Series(new_array, index=self._series.index, name=self._series.name) From 2e583e3cefff7c1ad473d3470c89449d29dbf6e2 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Fri, 15 Aug 2025 11:27:27 -0700 Subject: [PATCH 2/3] tidy implementation --- src/nested_pandas/nestedframe/core.py | 5 ----- src/nested_pandas/series/accessor.py | 18 +++++++----------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index 81572bee..8eb7e8c2 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -226,11 +226,6 @@ def __getitem__(self, item): return super().__getitem__(item) def _getitem_str(self, item): - - #if item in self.nested_columns: - # # If the item is a nested column, return a flat dataframe - # return super().__getitem__(item).nest.to_flat() - # Preempt the nested check if the item is a base column, with or without # dots and backticks. 
if item in self.columns: diff --git a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py index 6a7d3bbd..53901cda 100644 --- a/src/nested_pandas/series/accessor.py +++ b/src/nested_pandas/series/accessor.py @@ -10,10 +10,10 @@ from numpy.typing import ArrayLike from pandas.api.extensions import register_series_accessor +from nested_pandas.nestedframe.core import NestedFrame from nested_pandas.series.dtype import NestedDtype from nested_pandas.series.packer import pack_flat, pack_sorted_df_into_struct from nested_pandas.series.utils import nested_types_mapper -from nested_pandas.nestedframe.core import NestedFrame __all__ = ["NestSeriesAccessor"] @@ -480,19 +480,15 @@ def get_list_series(self, field: str) -> pd.Series: ) def __getitem__(self, key: str | list[str]) -> pd.Series: - - #import pdb;pdb.set_trace() + # Allow boolean masking given a Series of booleans if isinstance(key, pd.Series) and pd.api.types.is_bool_dtype(key.dtype): - # Boolean masking - flat_df = self.to_flat() + flat_df = self.to_flat() # Use the flat representation if not key.index.equals(flat_df.index): - raise ValueError("Boolean mask must have the same index as the series") - # Apply the mask to the series - masked_df = flat_df[key] - - nested_idx = NestedFrame(index=self._series.index) - return nested_idx.add_nested(masked_df, name=self._series.name) + raise ValueError("Boolean mask must have the same index as the flattened nested dataframe.") + # Apply the mask to the flat dataframe, return it repacked in a new NestedFrame + return NestedFrame(index=self._series.index).add_nested(flat_df[key], name=self._series.name) + # If the key is a list of fields, return a series built from a view of those fields if isinstance(key, list): new_array = self._series.array.view_fields(key) return pd.Series(new_array, index=self._series.index, name=self._series.name) From 91dffbed66b418eddc40edd2ccee16b651efe9f3 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Fri, 15 Aug 2025 12:08:19 -0700 Subject: [PATCH 3/3] add 
tests and docs --- docs/tutorials/low_level.ipynb | 64 ++++++++++++++++++++- tests/nested_pandas/series/test_accessor.py | 29 ++++++++++ 2 files changed, 91 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/low_level.ipynb b/docs/tutorials/low_level.ipynb index e0004a56..ff9279fa 100644 --- a/docs/tutorials/low_level.ipynb +++ b/docs/tutorials/low_level.ipynb @@ -295,6 +295,66 @@ "new_series.nest.to_flat()" ] }, + { + "cell_type": "markdown", + "id": "7308a0a5", + "metadata": {}, + "source": [ + "## Use familiar pandas masking operations through the `.nest` accessor\n", + "\n", + "A popular usage pattern within pandas is the ability to filter `DataFrames`/`Series` using boolean masks. For example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a0a700f", + "metadata": {}, + "outputs": [], + "source": [ + "nf = generate_data(5, 5, seed=1)\n", + "nf[nf[\"a\"] > 0.3]" + ] + }, + { + "cell_type": "markdown", + "id": "21395561", + "metadata": {}, + "source": [ + "In `nested-pandas`, the ability to do this masking is contained within the `.nest` accessor, which looks like this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "156d0d71", + "metadata": {}, + "outputs": [], + "source": [ + "nf = generate_data(5, 5, seed=1)\n", + "nf[\"nested.flag\"] = True # Add an extra flag column\n", + "nf[\"nested\"].nest[(nf[\"nested.t\"] < 5) & nf[\"nested.flag\"]]" + ] + }, + { + "cell_type": "markdown", + "id": "3ff4ee06", + "metadata": {}, + "source": [ + "The result here is a new nested column, with the masking applied, which we can assign back to the `nf` `NestedFrame` if we wish." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0f8dac3", + "metadata": {}, + "outputs": [], + "source": [ + "nf[\"nested\"] = nf[\"nested\"].nest[(nf[\"nested.t\"] < 5) & nf[\"nested.flag\"]]\n", + "nf" + ] + }, { "cell_type": "markdown", "id": "93f73bf28d48bfdc", @@ -735,7 +795,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "lsdb", "language": "python", "name": "python3" }, @@ -749,7 +809,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.6" + "version": "3.12.8" } }, "nbformat": 4, diff --git a/tests/nested_pandas/series/test_accessor.py b/tests/nested_pandas/series/test_accessor.py index 915c03c2..caefc9bf 100644 --- a/tests/nested_pandas/series/test_accessor.py +++ b/tests/nested_pandas/series/test_accessor.py @@ -736,6 +736,35 @@ def test___getitem___multiple_fields(): ) +def test___getitem___series_masking(): + """Test that series masking works through the accessor.""" + nf = generate_data(5, 5, seed=1) + nf["nested.flag"] = True # Add an additional boolean column + + # Test a simple mask + result = nf["nested"].nest[nf["nested.t"] < 5.0] + new_nf = nf.copy() + new_nf["nested"] = result + + expected = nf["nested"].nest.to_flat().query("t < 5.0") + + assert_frame_equal(new_nf["nested"].nest.to_flat(), expected) + + # Test a two column mask + result = nf["nested"].nest[(nf["nested.t"] < 5.0) & (nf["nested.flag"])] + new_nf = nf.copy() + new_nf["nested"] = result + + expected = nf["nested"].nest.to_flat().query("t < 5.0 and flag == True") + + assert_frame_equal(new_nf["nested"].nest.to_flat(), expected) + + # Test for misaligned index ValueError + with pytest.raises(ValueError): + mask = nf["nested.t"] < 5.0 + _ = nf["nested"].nest[mask[0:23]] + + def test___setitem__(): """Test that the .nest["field"] = ... works for a single field.""" struct_array = pa.StructArray.from_arrays(