import re import numpy as np import pytest import pandas as pd import pandas._testing as tm def test_error(): df = pd.DataFrame( {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} ) with pytest.raises( ValueError, match="column must be a scalar, tuple, or list thereof" ): df.explode([list("AA")]) with pytest.raises(ValueError, match="column must be unique"): df.explode(list("AA")) df.columns = list("AA") with pytest.raises( ValueError, match=re.escape("DataFrame columns must be unique. Duplicate columns: ['A']"), ): df.explode("A") @pytest.mark.parametrize( "input_subset, error_message", [ ( list("AC"), "columns must have matching element counts", ), ( [], "column must be nonempty", ), ( list("AC"), "columns must have matching element counts", ), ], ) def test_error_multi_columns(input_subset, error_message): # GH 39240 df = pd.DataFrame( { "A": [[0, 1, 2], np.nan, [], (3, 4)], "B": 1, "C": [["a", "b", "c"], "foo", [], ["d", "e", "f"]], }, index=list("abcd"), ) with pytest.raises(ValueError, match=error_message): df.explode(input_subset) @pytest.mark.parametrize( "scalar", ["a", 0, 1.5, pd.Timedelta("1 days"), pd.Timestamp("2019-12-31")], ) def test_basic(scalar): df = pd.DataFrame( {scalar: pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} ) result = df.explode(scalar) expected = pd.DataFrame( { scalar: pd.Series( [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object ), "B": 1, } ) tm.assert_frame_equal(result, expected) def test_multi_index_rows(): df = pd.DataFrame( {"A": np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), "B": 1}, index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]), ) result = df.explode("A") expected = pd.DataFrame( { "A": pd.Series( [0, 1, 2, np.nan, np.nan, 3, 4], index=pd.MultiIndex.from_tuples( [ ("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b", 1), ("b", 2), ("b", 2), ] ), dtype=object, ), "B": 1, } ) tm.assert_frame_equal(result, expected) def test_multi_index_columns(): df = pd.DataFrame( {("A", 1): np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), ("A", 2): 1} ) result = df.explode(("A", 1)) expected = pd.DataFrame( { ("A", 1): pd.Series( [0, 1, 2, np.nan, np.nan, 3, 4], index=pd.Index([0, 0, 0, 1, 2, 3, 3]), dtype=object, ), ("A", 2): 1, } ) tm.assert_frame_equal(result, expected) def test_usecase(): # explode a single column # gh-10511 df = pd.DataFrame( [[11, range(5), 10], [22, range(3), 20]], columns=list("ABC") ).set_index("C") result = df.explode("B") expected = pd.DataFrame( { "A": [11, 11, 11, 11, 11, 22, 22, 22], "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object), "C": [10, 10, 10, 10, 10, 20, 20, 20], }, columns=list("ABC"), ).set_index("C") tm.assert_frame_equal(result, expected) # gh-8517 df = pd.DataFrame( [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]], columns=["dt", "name", "text"], ) result = df.assign(text=df.text.str.split(" ")).explode("text") expected = pd.DataFrame( [ ["2014-01-01", "Alice", "A"], ["2014-01-01", "Alice", "B"], ["2014-01-02", "Bob", "C"], ["2014-01-02", "Bob", "D"], ], columns=["dt", "name", "text"], index=[0, 0, 1, 1], ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "input_dict, input_index, expected_dict, expected_index", [ ( {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, [0, 0], {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, [0, 0, 0, 0], ), ( {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, pd.Index([0, 0], name="my_index"), {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, pd.Index([0, 0, 0, 0], name="my_index"), ), ( {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, pd.MultiIndex.from_arrays( [[0, 0], [1, 1]], names=["my_first_index", "my_second_index"] ), {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, pd.MultiIndex.from_arrays( [[0, 0, 0, 0], [1, 1, 1, 1]], names=["my_first_index", "my_second_index"], ), ), ( {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, pd.MultiIndex.from_arrays([[0, 0], [1, 1]], names=["my_index", None]), {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, pd.MultiIndex.from_arrays( [[0, 0, 0, 0], [1, 1, 1, 1]], names=["my_index", None] ), ), ], ) def test_duplicate_index(input_dict, input_index, expected_dict, expected_index): # GH 28005 df = pd.DataFrame(input_dict, index=input_index, dtype=object) result = df.explode("col1") expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object) tm.assert_frame_equal(result, expected) def test_ignore_index(): # GH 34932 df = pd.DataFrame({"id": range(0, 20, 10), "values": [list("ab"), list("cd")]}) result = df.explode("values", ignore_index=True) expected = pd.DataFrame( {"id": [0, 0, 10, 10], "values": list("abcd")}, index=[0, 1, 2, 3] ) tm.assert_frame_equal(result, expected) def test_explode_sets(): # https://github.com/pandas-dev/pandas/issues/35614 df = pd.DataFrame({"a": [{"x", "y"}], "b": [1]}, index=[1]) result = df.explode(column="a").sort_values(by="a") expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "input_subset, expected_dict, expected_index", [ ( list("AC"), { "A": pd.Series( [0, 1, 2, np.nan, np.nan, 3, 4, np.nan], index=list("aaabcdde"), dtype=object, ), "B": 1, "C": ["a", "b", "c", "foo", np.nan, "d", "e", np.nan], }, list("aaabcdde"), ), ( list("A"), { "A": pd.Series( [0, 1, 2, np.nan, np.nan, 3, 4, np.nan], index=list("aaabcdde"), dtype=object, ), "B": 1, "C": [ ["a", "b", "c"], ["a", "b", "c"], ["a", "b", "c"], "foo", [], ["d", "e"], ["d", "e"], np.nan, ], }, list("aaabcdde"), ), ], ) def test_multi_columns(input_subset, expected_dict, expected_index): # GH 39240 df = pd.DataFrame( { "A": [[0, 1, 2], np.nan, [], (3, 4), np.nan], "B": 1, "C": [["a", "b", "c"], "foo", [], ["d", "e"], np.nan], }, index=list("abcde"), ) result = df.explode(input_subset) expected = pd.DataFrame(expected_dict, expected_index) tm.assert_frame_equal(result, expected) def test_multi_columns_nan_empty(): # GH 46084 df = pd.DataFrame( { "A": [[0, 1], [5], [], [2, 3]], "B": [9, 8, 7, 6], "C": [[1, 2], np.nan, [], [3, 4]], } ) result = df.explode(["A", "C"]) expected = pd.DataFrame( { "A": np.array([0, 1, 5, np.nan, 2, 3], dtype=object), "B": [9, 9, 8, 7, 6, 6], "C": np.array([1, 2, np.nan, np.nan, 3, 4], dtype=object), }, index=[0, 0, 1, 2, 3, 3], ) tm.assert_frame_equal(result, expected)