def explode(self, column: IndexLabel, ignore_index: bool = False):
    """Explode list-like base column(s) or unnest a single nested column.

    For base columns, each element of a list-like value becomes its own row and
    the index value is replicated, as in :meth:`pandas.DataFrame.explode`.
    For a nested column, the nest is flattened and the other columns are
    replicated alongside it. The exploded columns will be added to the right of
    the rest of the frame.

    Parameters
    ----------
    column : IndexLabel
        Base column(s) or nested column to explode.
        For multiple base columns, specify a non-empty list with each element being a string or tuple.
        For all specified base columns, their list-like data on same row of the frame
        must have matching length.
        Only a single nested column can be exploded at a time. Indicate the nested column as a string.
    ignore_index : bool, default False
        If True, the resulting index will be labeled 0, 1, ..., n - 1.

    Returns
    -------
    NestedFrame
        A new NestedFrame with the specified column(s) exploded.

    Raises
    ------
    ValueError
        If specified columns to explode have more than one nested column,
        or contain a mix of nested and base columns.

    Notes
    -----
    When a nested column is exploded, base rows and flattened nest rows are
    matched by row position rather than by index label, so a frame whose index
    contains repeated labels is exploded row by row instead of producing a
    many-to-many join.

    See Also
    --------
    :meth:`pandas.DataFrame.explode`

    Examples
    --------
    >>> from nested_pandas.datasets.generation import generate_data
    >>> nf = generate_data(3,3, seed=1)

    >>> nf_explode = nf.explode(column="nested")
    >>> nf_explode
              a         b          t       flux band
    0  0.417022  0.604665   3.725204  67.046751    g
    0  0.417022  0.604665  10.776335  14.038694    g
    0  0.417022  0.604665   4.089045  96.826158    g
    1  0.720324  0.293512   6.911215   41.73048    r
    1  0.720324  0.293512    8.38389  19.810149    r
    1  0.720324  0.293512  17.562349  31.342418    g
    2  0.000114  0.184677   7.935349  55.868983    r
    2  0.000114  0.184677   13.70439  80.074457    r
    2  0.000114  0.184677   0.547752  69.232262    g

    """
    if isinstance(column, list):
        nested_in_list = [col for col in column if col in self.nested_columns]

        # list contains more than 1 nested columns
        if len(nested_in_list) > 1:
            raise ValueError(
                "Exploding multiple nested columns at once is not supported.\n"
                f"Nested columns: {nested_in_list}"
            )

        # list contains mixing nested & base columns
        if len(nested_in_list) == 1 and len(column) > 1:
            raise ValueError(
                "Exploding nested column together with base columns is not supported.\n"
                f"Nested column: {nested_in_list[0]}"
            )

        # normalize a single-element list to its only label
        if len(column) == 1:
            column = column[0]

    # handle single nested column explode
    if isinstance(column, str) and column in self.nested_columns:
        # Join on a positional index instead of the original labels: joining on
        # labels is many-to-many when the index has duplicates and would
        # multiply rows. The flattened nest repeats each (positional) label once
        # per nest element, so the left join replicates each base row correctly.
        flat_by_position = self[column].reset_index(drop=True).nest.to_flat()
        other_cols = [col for col in self.columns if col != column]
        result = self[other_cols].reset_index(drop=True).join(flat_by_position)

        if ignore_index:
            result = result.reset_index(drop=True)
        else:
            # result.index currently holds row positions; map them back to the
            # original index labels (this also restores the index name).
            result.index = self.index.take(result.index)

        return NestedFrame(result)

    # otherwise just use pandas' explode
    return NestedFrame(super().explode(column=column, ignore_index=ignore_index))
def test_explode():
    """Check NestedFrame.explode on base columns, on nested columns, and on invalid column combinations"""
    frame = NestedFrame(
        data={
            "a": [[1, 2, 3], 4, [5, 6]],
            "b": ["2", "4", "6"],
            "c": [["x1", "x2", "x3"], "y", ["z1", "z2"]],
        },
        index=[0, 1, 2],
    )

    # Two nests packed onto the same three base rows.
    numeric_flat = pd.DataFrame(
        data={"d": [10, 11, 20, 21, 30, 31, 32], "e": [1, 2, 3, 4, 5, 6, 7]},
        index=[0, 0, 1, 1, 1, 2, 2],
    )
    mixed_flat = pd.DataFrame(
        data={"f": ["A", "B", "C", "D", "E", "A", "A", "B"], "g": [5, 4, 7, 5, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 2, 2, 2],
    )
    frame = frame.add_nested(numeric_flat, "nested_num")
    frame = frame.add_nested(mixed_flat, "nested_mix")

    # Index labels after exploding the list-like base columns "a" / "c".
    base_exploded_index = [0, 0, 0, 1, 2, 2]
    c_values = ["x1", "x2", "x3", "y", "z1", "z2"]
    want_a = pd.Series([1, 2, 3, 4, 5, 6], index=base_exploded_index)

    # --- base columns ---
    by_a = frame.explode(column=["a"])
    assert by_a.shape == (6, 5)
    assert (by_a["a"] == want_a).all()

    by_c = frame.explode(column="c", ignore_index=True)
    assert (by_c["c"] == pd.Series(c_values)).all()

    by_a_and_c = frame.explode(column=["a", "c"])
    assert (by_a_and_c["a"] == want_a).all()
    assert (by_a_and_c["c"] == pd.Series(c_values, index=base_exploded_index)).all()

    # --- invalid requests: two nested columns, then nested mixed with base ---
    for bad_columns in (["nested_num", "nested_mix"], ["nested_num", "a"]):
        with pytest.raises(ValueError):
            frame.explode(column=bad_columns)

    # --- nested columns ---
    by_num = frame.explode(column="nested_num")
    assert by_num.shape[1] == 6
    want_e = pd.Series([1, 2, 3, 4, 5, 6, 7], index=[0, 0, 1, 1, 1, 2, 2])
    assert (by_num["e"] == want_e).all()

    by_mix = frame.explode(column="nested_mix", ignore_index=True)
    assert by_mix.shape[1] == 6
    want_f = pd.Series(["A", "B", "C", "D", "E", "A", "A", "B"])
    assert (by_mix["f"] == want_f).all()