Skip to content

Enable boolean masking through nest accessor #329

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Aug 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 62 additions & 2 deletions docs/tutorials/low_level.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,66 @@
"new_series.nest.to_flat()"
]
},
{
"cell_type": "markdown",
"id": "7308a0a5",
"metadata": {},
"source": [
"## Use familiar pandas masking operations through the `.nest` accessor\n",
"\n",
"A common usage pattern in pandas is filtering `DataFrames`/`Series` with boolean masks. For example:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a0a700f",
"metadata": {},
"outputs": [],
"source": [
"nf = generate_data(5, 5, seed=1)\n",
"nf[nf[\"a\"] > 0.3]"
]
},
{
"cell_type": "markdown",
"id": "21395561",
"metadata": {},
"source": [
"In `nested-pandas`, the same kind of masking can be applied to a nested column through the `.nest` accessor, like this:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "156d0d71",
"metadata": {},
"outputs": [],
"source": [
"nf = generate_data(5, 5, seed=1)\n",
"nf[\"nested.flag\"] = True # Add an extra flag column\n",
"nf[\"nested\"].nest[(nf[\"nested.t\"] < 5) & nf[\"nested.flag\"]]"
]
},
{
"cell_type": "markdown",
"id": "3ff4ee06",
"metadata": {},
"source": [
"The result here is a new nested column, with the masking applied, which we can assign back to the `nf` `NestedFrame` if we wish."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0f8dac3",
"metadata": {},
"outputs": [],
"source": [
"nf[\"nested\"] = nf[\"nested\"].nest[(nf[\"nested.t\"] < 5) & nf[\"nested.flag\"]]\n",
"nf"
]
},
{
"cell_type": "markdown",
"id": "93f73bf28d48bfdc",
Expand Down Expand Up @@ -735,7 +795,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "lsdb",
"language": "python",
"name": "python3"
},
Expand All @@ -749,7 +809,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
"version": "3.12.8"
}
},
"nbformat": 4,
Expand Down
10 changes: 10 additions & 0 deletions src/nested_pandas/series/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from numpy.typing import ArrayLike
from pandas.api.extensions import register_series_accessor

from nested_pandas.nestedframe.core import NestedFrame
from nested_pandas.series.dtype import NestedDtype
from nested_pandas.series.packer import pack_flat, pack_sorted_df_into_struct
from nested_pandas.series.utils import nested_types_mapper
Expand Down Expand Up @@ -479,6 +480,15 @@ def get_list_series(self, field: str) -> pd.Series:
)

def __getitem__(self, key: str | list[str]) -> pd.Series:
# Allow boolean masking given a Series of booleans
if isinstance(key, pd.Series) and pd.api.types.is_bool_dtype(key.dtype):
flat_df = self.to_flat() # Use the flat representation
if not key.index.equals(flat_df.index):
raise ValueError("Boolean mask must have the same index as the flattened nested dataframe.")
# Apply the mask to the series, return a new NestedFrame
return NestedFrame(index=self._series.index).add_nested(flat_df[key], name=self._series.name)

# If the key is a single string, return the flat series for that field
if isinstance(key, list):
new_array = self._series.array.view_fields(key)
return pd.Series(new_array, index=self._series.index, name=self._series.name)
Expand Down
29 changes: 29 additions & 0 deletions tests/nested_pandas/series/test_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -736,6 +736,35 @@ def test___getitem___multiple_fields():
)


def test___getitem___series_masking():
    """Test boolean-mask indexing through the ``.nest`` accessor.

    Covers three cases:
    a single-condition mask, a mask combining two nested columns, and a
    mask whose index does not match the flattened nested data (which must
    raise ``ValueError``).
    """
    nf = generate_data(5, 5, seed=1)
    nf["nested.flag"] = True  # Add an additional boolean column

    # Test a simple mask
    result = nf["nested"].nest[nf["nested.t"] < 5.0]
    new_nf = nf.copy()
    new_nf["nested"] = result

    expected = nf["nested"].nest.to_flat().query("t < 5.0")

    assert_frame_equal(new_nf["nested"].nest.to_flat(), expected)

    # Test a two column mask
    result = nf["nested"].nest[(nf["nested.t"] < 5.0) & (nf["nested.flag"])]
    new_nf = nf.copy()
    new_nf["nested"] = result

    expected = nf["nested"].nest.to_flat().query("t < 5.0 and flag == True")

    assert_frame_equal(new_nf["nested"].nest.to_flat(), expected)

    # Test for misaligned index ValueError.
    # Build the truncated mask outside the ``raises`` block so that only the
    # accessor call itself can satisfy the expectation; a ValueError raised
    # during setup must fail the test rather than pass it.
    mask = nf["nested.t"] < 5.0
    truncated_mask = mask[0:23]
    # ``match`` pins the failure to the accessor's own index check instead of
    # accepting any ValueError raised further down the call chain.
    with pytest.raises(ValueError, match="Boolean mask must have the same index"):
        _ = nf["nested"].nest[truncated_mask]


def test___setitem__():
"""Test that the .nest["field"] = ... works for a single field."""
struct_array = pa.StructArray.from_arrays(
Expand Down