Skip to content
Merged
1 change: 1 addition & 0 deletions docs/reference/nestedframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ Extended Pandas.DataFrame Interface
NestedFrame.min
NestedFrame.max
NestedFrame.describe
NestedFrame.explode

I/O
~~~~~~~~~
Expand Down
88 changes: 88 additions & 0 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -974,6 +974,94 @@ def describe(self, exclude_nest: bool = False, percentiles=None, include=None, e

return NestedFrame(pd.concat(result, axis=1))

def explode(self, column: IndexLabel, ignore_index: bool = False):
    """Turn list-like base column(s), or one nested column, into rows.

    For base columns, every element of a list-like entry becomes its own
    row and the index value is repeated. For a nested column, the nested
    data is unnested and the remaining columns are replicated alongside it;
    the unnested columns are placed to the right of the rest of the frame.

    Parameters
    ----------
    column : IndexLabel
        Base column(s) or nested column to explode.
        To explode several base columns, pass a non-empty list whose
        elements are strings or tuples; on any given row, the list-like
        data of all those columns must have matching length.
        Only one nested column can be exploded per call. Refer to the
        nested column by its name as a string.
    ignore_index : bool, default False
        If True, the resulting index will be labeled 0, 1, ..., n - 1.

    Returns
    -------
    NestedFrame
        A new NestedFrame with the specified column(s) exploded.

    Raises
    ------
    ValueError
        If the requested columns include more than one nested column,
        or mix a nested column with base columns.

    See Also
    --------
    :meth:`pandas.DataFrame.explode`

    Examples
    --------
    >>> from nested_pandas.datasets.generation import generate_data
    >>> nf = generate_data(3,3, seed=1)

    >>> nf_explode = nf.explode(column="nested")
    >>> nf_explode
              a         b          t       flux band
    0  0.417022  0.604665   3.725204  67.046751    g
    0  0.417022  0.604665  10.776335  14.038694    g
    0  0.417022  0.604665   4.089045  96.826158    g
    1  0.720324  0.293512   6.911215   41.73048    r
    1  0.720324  0.293512    8.38389  19.810149    r
    1  0.720324  0.293512  17.562349  31.342418    g
    2  0.000114  0.184677   7.935349  55.868983    r
    2  0.000114  0.184677   13.70439  80.074457    r
    2  0.000114  0.184677   0.547752  69.232262    g

    """

    if isinstance(column, list):
        nested_hits = [name for name in column if name in self.nested_columns]
        hit_count = len(nested_hits)

        # more than one nested column was requested
        if hit_count > 1:
            raise ValueError(
                "Exploding multiple nested columns at once is not supported.\n"
                f"Nested columns: {nested_hits}"
            )

        # one nested column was requested together with base columns
        if hit_count == 1 and len(column) > 1:
            raise ValueError(
                "Exploding nested column together with base columns is not supported.\n"
                f"Nested column: {nested_hits[0]}"
            )

        # a one-element list is treated exactly like its bare element
        if len(column) == 1:
            column = column[0]

    targets_nested = isinstance(column, str) and column in self.nested_columns

    # anything that is not a single nested column goes straight to pandas
    if not targets_nested:
        return NestedFrame(super().explode(column=column, ignore_index=ignore_index))

    # single nested column: flatten it and join it (on the index) onto the
    # remaining columns so they are repeated for every flattened row
    flat_nested = self[column].nest.to_flat()
    remaining = self[[name for name in self.columns if name != column]]
    exploded = remaining.join(flat_nested)

    if ignore_index:
        exploded = exploded.reset_index(drop=True)

    return NestedFrame(exploded)

def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
"""Evaluate a string describing operations on NestedFrame columns.

Expand Down
57 changes: 56 additions & 1 deletion tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1403,7 +1403,7 @@ def test_min():


def test_max():
"""Test max function return correct result with an without the nested columns"""
"""Test max function return correct result with and without the nested columns"""
base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6], "c": ["x", "y", "z"]}, index=[0, 1, 2])
nested = pd.DataFrame(
data={"d": [10, 11, 20, 21, 3, 31, 32], "y": [1, 10, 20, 30, 40, 50, 60]}, index=[0, 0, 1, 1, 1, 2, 2]
Expand Down Expand Up @@ -1638,6 +1638,61 @@ def test_describe():
assert "top" not in r18.index


def test_explode():
    """Test NestedFrame.explode on base columns, on a nested column, and on unsupported column mixes"""
    # Base frame: "a" and "c" hold list-like entries of matching per-row length.
    frame = NestedFrame(
        data={
            "a": [[1, 2, 3], 4, [5, 6]],
            "b": ["2", "4", "6"],
            "c": [["x1", "x2", "x3"], "y", ["z1", "z2"]],
        },
        index=[0, 1, 2],
    )

    numeric_flat = pd.DataFrame(
        data={"d": [10, 11, 20, 21, 30, 31, 32], "e": [1, 2, 3, 4, 5, 6, 7]},
        index=[0, 0, 1, 1, 1, 2, 2],
    )
    mixed_flat = pd.DataFrame(
        data={"f": ["A", "B", "C", "D", "E", "A", "A", "B"], "g": [5, 4, 7, 5, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 2, 2, 2],
    )
    frame = frame.add_nested(numeric_flat, "nested_num")
    frame = frame.add_nested(mixed_flat, "nested_mix")

    # Index labels repeated once per exploded element of "a" / "c".
    exploded_index = [0, 0, 0, 1, 2, 2]
    c_values = ["x1", "x2", "x3", "y", "z1", "z2"]

    # --- base columns -------------------------------------------------
    # single base column given as a one-element list
    by_a = frame.explode(column=["a"])
    assert by_a.shape == (6, 5)
    want_a = pd.Series([1, 2, 3, 4, 5, 6], index=exploded_index)
    assert (by_a["a"] == want_a).all()

    # single base column given as a string, with a fresh 0..n-1 index
    by_c = frame.explode(column="c", ignore_index=True)
    want_c_reset = pd.Series(c_values)
    assert (by_c["c"] == want_c_reset).all()

    # two base columns exploded together
    by_a_and_c = frame.explode(column=["a", "c"])
    assert (by_a_and_c["a"] == want_a).all()
    want_c = pd.Series(c_values, index=exploded_index)
    assert (by_a_and_c["c"] == want_c).all()

    # --- unsupported requests -----------------------------------------
    # two nested columns at once
    with pytest.raises(ValueError):
        frame.explode(column=["nested_num", "nested_mix"])

    # a nested column mixed with a base column
    with pytest.raises(ValueError):
        frame.explode(column=["nested_num", "a"])

    # --- nested columns -----------------------------------------------
    by_num = frame.explode(column="nested_num")
    assert by_num.shape[1] == 6
    want_e = pd.Series([1, 2, 3, 4, 5, 6, 7], index=[0, 0, 1, 1, 1, 2, 2])
    assert (by_num["e"] == want_e).all()

    by_mix = frame.explode(column="nested_mix", ignore_index=True)
    assert by_mix.shape[1] == 6
    want_f = pd.Series(["A", "B", "C", "D", "E", "A", "A", "B"])
    assert (by_mix["f"] == want_f).all()


def test_eval():
"""
Test basic behavior of NestedFrame.eval, and that it can handle nested references
Expand Down