diff --git a/docs/tutorials/low_level.ipynb b/docs/tutorials/low_level.ipynb index e0004a56..ff9279fa 100644 --- a/docs/tutorials/low_level.ipynb +++ b/docs/tutorials/low_level.ipynb @@ -295,6 +295,66 @@ "new_series.nest.to_flat()" ] }, + { + "cell_type": "markdown", + "id": "7308a0a5", + "metadata": {}, + "source": [ + "## Use familiar pandas masking operations through the `.nest` accessor\n", + "\n", + "A common usage pattern in pandas is filtering a `DataFrame` or `Series` with a boolean mask. For example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a0a700f", + "metadata": {}, + "outputs": [], + "source": [ + "nf = generate_data(5, 5, seed=1)\n", + "nf[nf[\"a\"] > 0.3]" + ] + }, + { + "cell_type": "markdown", + "id": "21395561", + "metadata": {}, + "source": [ + "In `nested-pandas`, the same kind of boolean masking can be applied to the rows of a nested column through the `.nest` accessor:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "156d0d71", + "metadata": {}, + "outputs": [], + "source": [ + "nf = generate_data(5, 5, seed=1)\n", + "nf[\"nested.flag\"] = True # Add an extra flag column\n", + "nf[\"nested\"].nest[(nf[\"nested.t\"] < 5) & nf[\"nested.flag\"]]" + ] + }, + { + "cell_type": "markdown", + "id": "3ff4ee06", + "metadata": {}, + "source": [ + "The result holds the nested column with the mask applied, and can be assigned back to the `nf` `NestedFrame` if we wish." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0f8dac3", + "metadata": {}, + "outputs": [], + "source": [ + "nf[\"nested\"] = nf[\"nested\"].nest[(nf[\"nested.t\"] < 5) & nf[\"nested.flag\"]]\n", + "nf" + ] + }, { "cell_type": "markdown", "id": "93f73bf28d48bfdc", @@ -735,7 +795,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "lsdb", "language": "python", "name": "python3" }, @@ -749,7 +809,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.6" + "version": "3.12.8" } }, "nbformat": 4, diff --git a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py index f69d84d2..53901cda 100644 --- a/src/nested_pandas/series/accessor.py +++ b/src/nested_pandas/series/accessor.py @@ -10,6 +10,7 @@ from numpy.typing import ArrayLike from pandas.api.extensions import register_series_accessor +from nested_pandas.nestedframe.core import NestedFrame from nested_pandas.series.dtype import NestedDtype from nested_pandas.series.packer import pack_flat, pack_sorted_df_into_struct from nested_pandas.series.utils import nested_types_mapper @@ -479,6 +480,15 @@ def get_list_series(self, field: str) -> pd.Series: ) def __getitem__(self, key: str | list[str]) -> pd.Series: + # Allow boolean masking given a Series of booleans + if isinstance(key, pd.Series) and pd.api.types.is_bool_dtype(key.dtype): + flat_df = self.to_flat() # Use the flat representation + if not key.index.equals(flat_df.index): + raise ValueError("Boolean mask must have the same index as the flattened nested dataframe.") + # Apply the mask to the series, return a new NestedFrame + return NestedFrame(index=self._series.index).add_nested(flat_df[key], name=self._series.name) + + # If the key is a single string, return the flat series for that field if isinstance(key, list): new_array = self._series.array.view_fields(key) return pd.Series(new_array, index=self._series.index, 
name=self._series.name) diff --git a/tests/nested_pandas/series/test_accessor.py b/tests/nested_pandas/series/test_accessor.py index 915c03c2..caefc9bf 100644 --- a/tests/nested_pandas/series/test_accessor.py +++ b/tests/nested_pandas/series/test_accessor.py @@ -736,6 +736,35 @@ def test___getitem___multiple_fields(): ) +def test___getitem___series_masking(): + """Test that series masking works through the accessor.""" + nf = generate_data(5, 5, seed=1) + nf["nested.flag"] = True # Add an additional boolean column + + # Test a simple mask built from a single nested field + result = nf["nested"].nest[nf["nested.t"] < 5.0] + new_nf = nf.copy() + new_nf["nested"] = result + + expected = nf["nested"].nest.to_flat().query("t < 5.0") + + assert_frame_equal(new_nf["nested"].nest.to_flat(), expected) + + # Test a mask that combines conditions on two nested columns + result = nf["nested"].nest[(nf["nested.t"] < 5.0) & (nf["nested.flag"])] + new_nf = nf.copy() + new_nf["nested"] = result + + expected = nf["nested"].nest.to_flat().query("t < 5.0 and flag == True") + + assert_frame_equal(new_nf["nested"].nest.to_flat(), expected) + + # Test for misaligned index ValueError: the sliced mask is expected not to match the flat index + with pytest.raises(ValueError): + mask = nf["nested.t"] < 5.0 + _ = nf["nested"].nest[mask[0:23]] + + def test___setitem__(): """Test that the .nest["field"] = ... works for a single field.""" struct_array = pa.StructArray.from_arrays(