Skip to content

Commit 6158e29

Browse files
authored
adding explode (#327)
* adding explode * update docstring * adding explode unit tests * adding reference and example docstring * Revert "adding explode unit tests" recover describe function tests This reverts commit ac37d6f. * update explode tests * docstring typo * adding coverage tests * change naming convention and update docstring * docstring format * docstring format again * docstring format again x2 * docstring grammar
1 parent 4186d56 commit 6158e29

File tree

3 files changed

+145
-1
lines changed

3 files changed

+145
-1
lines changed

docs/reference/nestedframe.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ Extended Pandas.DataFrame Interface
4242
NestedFrame.min
4343
NestedFrame.max
4444
NestedFrame.describe
45+
NestedFrame.explode
4546

4647
I/O
4748
~~~~~~~~~

src/nested_pandas/nestedframe/core.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -974,6 +974,94 @@ def describe(self, exclude_nest: bool = False, percentiles=None, include=None, e
974974

975975
return NestedFrame(pd.concat(result, axis=1))
976976

977+
def explode(self, column: IndexLabel, ignore_index: bool = False):
    """Explode list-like base column(s), or one nested column, into rows.

    For base columns, each element of a list-like value becomes its own row and
    the index value is replicated. For a nested column, the nested sub-columns
    are unnested into rows while the other columns are replicated; the exploded
    sub-columns are added to the right of the rest of the frame.

    Parameters
    ----------
    column : IndexLabel
        Base column(s) or nested column to explode.
        For multiple base columns, specify a non-empty list with each element being a string or tuple.
        For all specified base columns, their list-like data on the same row of the frame
        must have matching length.
        Only a single nested column can be exploded at a time. Indicate the nested column
        as a string (a one-element list holding that string is also accepted).
    ignore_index : bool, default False
        If True, the resulting index will be labeled 0, 1, ..., n - 1.

    Returns
    -------
    NestedFrame
        A new NestedFrame with the specified column(s) exploded.

    Raises
    ------
    ValueError
        If specified columns to explode have more than one nested column,
        or contain a mix of nested and base columns.

    See Also
    --------
    :meth:`pandas.DataFrame.explode`

    Examples
    --------
    >>> from nested_pandas.datasets.generation import generate_data
    >>> nf = generate_data(3,3, seed=1)

    >>> nf_explode = nf.explode(column="nested")
    >>> nf_explode
              a         b          t       flux band
    0  0.417022  0.604665   3.725204  67.046751    g
    0  0.417022  0.604665  10.776335  14.038694    g
    0  0.417022  0.604665   4.089045  96.826158    g
    1  0.720324  0.293512   6.911215   41.73048    r
    1  0.720324  0.293512    8.38389  19.810149    r
    1  0.720324  0.293512  17.562349  31.342418    g
    2  0.000114  0.184677   7.935349  55.868983    r
    2  0.000114  0.184677    13.70439  80.074457    r
    2  0.000114  0.184677   0.547752  69.232262    g

    """
    if isinstance(column, list):
        nested_in_list = [col for col in column if col in self.nested_columns]

        # Only one nested column may be exploded per call.
        if len(nested_in_list) > 1:
            raise ValueError(
                "Exploding multiple nested columns at once is not supported.\n"
                f"Nested columns: {nested_in_list}"
            )

        # A nested column cannot be combined with base columns in one call.
        if nested_in_list and len(column) > 1:
            raise ValueError(
                "Exploding nested column together with base columns is not supported.\n"
                f"Nested column: {nested_in_list[0]}"
            )

        # Unwrap a one-element list so that ``["nested"]`` takes the same
        # path as ``"nested"`` below.
        if len(column) == 1:
            column = column[0]

    # Anything that is not a single nested column is delegated to pandas.
    if not (isinstance(column, str) and column in self.nested_columns):
        return NestedFrame(super().explode(column=column, ignore_index=ignore_index))

    # Flatten the nested column and attach its sub-columns to the right of
    # every remaining column of the frame.
    # NOTE(review): ``join`` aligns on index labels, so this presumably relies
    # on the base index labels being unique and on the nested sub-column names
    # not clashing with the remaining column names -- confirm.
    flat_nested = self[column].nest.to_flat()
    remaining_columns = [col for col in self.columns if col != column]
    result = self[remaining_columns].join(flat_nested)

    if ignore_index:
        result = result.reset_index(drop=True)

    return NestedFrame(result)
1064+
9771065
def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
9781066
"""Evaluate a string describing operations on NestedFrame columns.
9791067

tests/nested_pandas/nestedframe/test_nestedframe.py

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1403,7 +1403,7 @@ def test_min():
14031403

14041404

14051405
def test_max():
1406-
"""Test max function return correct result with an without the nested columns"""
1406+
"""Test max function return correct result with and without the nested columns"""
14071407
base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6], "c": ["x", "y", "z"]}, index=[0, 1, 2])
14081408
nested = pd.DataFrame(
14091409
data={"d": [10, 11, 20, 21, 3, 31, 32], "y": [1, 10, 20, 30, 40, 50, 60]}, index=[0, 0, 1, 1, 1, 2, 2]
@@ -1638,6 +1638,61 @@ def test_describe():
16381638
assert "top" not in r18.index
16391639

16401640

1641+
def test_explode():
    """Check NestedFrame.explode on base columns, on a single nested column, and on rejected inputs"""
    frame = NestedFrame(
        data={
            "a": [[1, 2, 3], 4, [5, 6]],
            "b": ["2", "4", "6"],
            "c": [["x1", "x2", "x3"], "y", ["z1", "z2"]],
        },
        index=[0, 1, 2],
    )

    numeric_nest = pd.DataFrame(
        data={"d": [10, 11, 20, 21, 30, 31, 32], "e": [1, 2, 3, 4, 5, 6, 7]}, index=[0, 0, 1, 1, 1, 2, 2]
    )
    mixed_nest = pd.DataFrame(
        data={"f": ["A", "B", "C", "D", "E", "A", "A", "B"], "g": [5, 4, 7, 5, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 2, 2, 2],
    )
    frame = frame.add_nested(numeric_nest, "nested_num").add_nested(mixed_nest, "nested_mix")

    # Index labels and values expected after exploding the base columns "a" / "c".
    exploded_index = [0, 0, 0, 1, 2, 2]
    c_values = ["x1", "x2", "x3", "y", "z1", "z2"]
    want_a = pd.Series([1, 2, 3, 4, 5, 6], index=exploded_index)

    # A one-element list of a base column.
    single_base = frame.explode(column=["a"])
    assert single_base.shape == (6, 5)
    assert (single_base["a"] == want_a).all()

    # A base column given as a string, with the index reset.
    reindexed_base = frame.explode(column="c", ignore_index=True)
    assert (reindexed_base["c"] == pd.Series(c_values)).all()

    # Two base columns exploded together.
    both_base = frame.explode(column=["a", "c"])
    assert (both_base["a"] == want_a).all()
    assert (both_base["c"] == pd.Series(c_values, index=exploded_index)).all()

    # Two nested columns, or a nested column mixed with a base column, are rejected.
    for unsupported in (["nested_num", "nested_mix"], ["nested_num", "a"]):
        with pytest.raises(ValueError):
            frame.explode(column=unsupported)

    # A single nested column, keeping the original index labels.
    nested_kept_index = frame.explode(column="nested_num")
    assert nested_kept_index.shape[1] == 6
    want_e = pd.Series([1, 2, 3, 4, 5, 6, 7], index=[0, 0, 1, 1, 1, 2, 2])
    assert (nested_kept_index["e"] == want_e).all()

    # A single nested column, with the index reset.
    nested_reindexed = frame.explode(column="nested_mix", ignore_index=True)
    assert nested_reindexed.shape[1] == 6
    want_f = pd.Series(["A", "B", "C", "D", "E", "A", "A", "B"])
    assert (nested_reindexed["f"] == want_f).all()
1694+
1695+
16411696
def test_eval():
16421697
"""
16431698
Test basic behavior of NestedFrame.eval, and that it can handle nested references

0 commit comments

Comments
 (0)