"""Tests for ``describe`` on grouped Series and DataFrame objects."""
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Series,
    Timestamp,
    date_range,
)
import pandas._testing as tm


def test_apply_describe_bug(multiindex_dataframe_random_data):
    """Smoke test: ``describe`` on a groupby over level "first" must not raise."""
    grouped = multiindex_dataframe_random_data.groupby(level="first")
    grouped.describe()  # it works!


def test_series_describe_multikey():
    """``describe`` columns agree with the direct aggregations for two keys."""
    ts = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()
    # Names are not compared: only the values and index of each column matter.
    tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False)
    tm.assert_series_equal(result["std"], grouped.std(), check_names=False)
    tm.assert_series_equal(result["min"], grouped.min(), check_names=False)


def test_series_describe_single():
    """``apply(describe)`` equals the stacked result of ``describe``."""
    ts = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )
    grouped = ts.groupby(lambda x: x.month)
    result = grouped.apply(lambda x: x.describe())
    expected = grouped.describe().stack(future_stack=True)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
def test_series_describe_as_index(as_index, keys):
    """SeriesGroupBy.describe honours ``as_index`` for one or two keys."""
    # GH#49256
    df = DataFrame(
        {
            "key1": ["one", "two", "two", "three", "two"],
            "key2": ["one", "two", "two", "three", "two"],
            "foo2": [1, 2, 4, 4, 6],
        }
    )
    gb = df.groupby(keys, as_index=as_index)["foo2"]
    result = gb.describe()
    expected = DataFrame(
        {
            "key1": ["one", "three", "two"],
            "count": [1.0, 1.0, 3.0],
            "mean": [1.0, 4.0, 4.0],
            "std": [np.nan, np.nan, 2.0],
            "min": [1.0, 4.0, 2.0],
            "25%": [1.0, 4.0, 3.0],
            "50%": [1.0, 4.0, 4.0],
            "75%": [1.0, 4.0, 5.0],
            "max": [1.0, 4.0, 6.0],
        }
    )
    # ``keys`` is either the string "key1" (length 4) or the two-element
    # list, so a length of 2 identifies the two-key case.
    if len(keys) == 2:
        # "key2" mirrors "key1" in the input, so it does in the output too.
        expected.insert(1, "key2", expected["key1"])
    if as_index:
        expected = expected.set_index(keys)
    tm.assert_frame_equal(result, expected)


def test_frame_describe_multikey(tsframe):
    """DataFrameGroupBy.describe matches per-column describes, incl. axis=1."""
    grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()
    desc_groups = []
    for col in tsframe:
        group = grouped[col].describe()
        # GH 17464 - Remove duplicate MultiIndex levels
        group_col = MultiIndex(
            levels=[[col], group.columns],
            codes=[[0] * len(group.columns), range(len(group.columns))],
        )
        group = DataFrame(group.values, columns=group_col, index=group.index)
        desc_groups.append(group)
    expected = pd.concat(desc_groups, axis=1)
    tm.assert_frame_equal(result, expected)

    # Grouping the columns (axis=1) is expected to emit a FutureWarning.
    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
    result = groupedT.describe()
    expected = tsframe.describe().T
    # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/
    expected.index = MultiIndex(
        levels=[[0, 1], expected.index],
        codes=[[0, 0, 1, 1], range(len(expected.index))],
    )
    tm.assert_frame_equal(result, expected)


def test_frame_describe_tupleindex():
    """Grouping by a column of tuples and describing raises ValueError."""
    # GH 14848 - regression from 0.19.0 to 0.19.1
    df1 = DataFrame(
        {
            "x": [1, 2, 3, 4, 5] * 3,
            "y": [10, 20, 30, 40, 50] * 3,
            "z": [100, 200, 300, 400, 500] * 3,
        }
    )
    df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
    # Same data under a different key name; the same error is expected.
    df2 = df1.rename(columns={"k": "key"})
    msg = "Names should be list-like for a MultiIndex"
    with pytest.raises(ValueError, match=msg):
        df1.groupby("k").describe()
    with pytest.raises(ValueError, match=msg):
        df2.groupby("key").describe()


def test_frame_describe_unstacked_format():
    """SeriesGroupBy.describe returns one row per group, one column per stat."""
    # GH 4792
    prices = {
        Timestamp("2011-01-06 10:59:05", tz=None): 24990,
        Timestamp("2011-01-06 12:43:33", tz=None): 25499,
        Timestamp("2011-01-06 12:54:09", tz=None): 25499,
    }
    volumes = {
        Timestamp("2011-01-06 10:59:05", tz=None): 1500000000,
        Timestamp("2011-01-06 12:43:33", tz=None): 5000000000,
        Timestamp("2011-01-06 12:54:09", tz=None): 100000000,
    }
    df = DataFrame({"PRICE": prices, "VOLUME": volumes})
    result = df.groupby("PRICE").VOLUME.describe()
    # Build the expectation by describing each price group by hand.
    data = [
        df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
        df[df.PRICE == 25499].VOLUME.describe().values.tolist(),
    ]
    expected = DataFrame(
        data,
        index=Index([24990, 25499], name="PRICE"),
        columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings(
    "ignore:"
    "indexing past lexsort depth may impact performance:"
    "pandas.errors.PerformanceWarning"
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_describe_with_duplicate_output_column_names(as_index, keys):
    """``describe`` keeps both copies of a duplicated column label."""
    # GH 35314
    df = DataFrame(
        {
            "a1": [99, 99, 99, 88, 88, 88],
            "a2": [99, 99, 99, 88, 88, 88],
            "b": [1, 2, 3, 4, 5, 6],
            "c": [10, 20, 30, 40, 50, 60],
        },
        # "b" is requested twice and "c" is not requested at all.
        columns=["a1", "a2", "b", "b"],
        copy=False,
    )
    if keys == ["a1"]:
        df = df.drop(columns="a2")

    # Each record is (column label, statistic, value for 88, value for 99);
    # after ``set_index([0, 1]).T`` the two value fields become the rows.
    expected = (
        DataFrame.from_records(
            [
                ("b", "count", 3.0, 3.0),
                ("b", "mean", 5.0, 2.0),
                ("b", "std", 1.0, 1.0),
                ("b", "min", 4.0, 1.0),
                ("b", "25%", 4.5, 1.5),
                ("b", "50%", 5.0, 2.0),
                ("b", "75%", 5.5, 2.5),
                ("b", "max", 6.0, 3.0),
                ("b", "count", 3.0, 3.0),
                ("b", "mean", 5.0, 2.0),
                ("b", "std", 1.0, 1.0),
                ("b", "min", 4.0, 1.0),
                ("b", "25%", 4.5, 1.5),
                ("b", "50%", 5.0, 2.0),
                ("b", "75%", 5.5, 2.5),
                ("b", "max", 6.0, 3.0),
            ],
        )
        .set_index([0, 1])
        .T
    )
    expected.columns.names = [None, None]
    if len(keys) == 2:
        expected.index = MultiIndex(
            levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"]
        )
    else:
        expected.index = Index([88, 99], name="a1")

    if not as_index:
        expected = expected.reset_index()

    result = df.groupby(keys, as_index=as_index).describe()

    tm.assert_frame_equal(result, expected)


def test_describe_duplicate_columns():
    """``describe`` handles duplicate column labels with a Series grouper."""
    # GH#50806
    df = DataFrame([[0, 1, 2, 3]])
    df.columns = [0, 1, 2, 0]
    gb = df.groupby(df[1])
    result = gb.describe(percentiles=[])

    columns = ["count", "mean", "std", "min", "50%", "max"]
    # One single-row frame per described column (values 0, 2 and 3).
    frames = [
        DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns)
        for val in (0.0, 2.0, 3.0)
    ]
    expected = pd.concat(frames, axis=1)
    # Outer level follows the column labels 0, 2, 0; the label 0 repeats.
    expected.columns = MultiIndex(
        levels=[[0, 2], columns],
        codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))],
    )
    expected.index.names = [1]
    tm.assert_frame_equal(result, expected)


class TestGroupByNonCythonPaths:
    """Checks ``describe`` on a small frame grouped by column "A"."""

    # GH#5610 non-cython calls should not include the grouper
    # Tests for code not expected to go through cython paths.

    @pytest.fixture
    def df(self):
        """Three-row frame with group key "A", numeric "B" and string "C"."""
        df = DataFrame(
            [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
            columns=["A", "B", "C"],
        )
        return df

    @pytest.fixture
    def gb(self, df):
        """``df`` grouped by "A" with the default ``as_index``."""
        gb = df.groupby("A")
        return gb

    @pytest.fixture
    def gni(self, df):
        """``df`` grouped by "A" with ``as_index=False``."""
        gni = df.groupby("A", as_index=False)
        return gni

    def test_describe(self, df, gb, gni):
        """Only "B" is described; ``as_index=False`` resets the group index."""
        # describe
        expected_index = Index([1, 3], name="A")
        expected_col = MultiIndex(
            levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
            codes=[[0] * 8, list(range(8))],
        )
        expected = DataFrame(
            [
                [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
                [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
            ],
            index=expected_index,
            columns=expected_col,
        )
        result = gb.describe()
        tm.assert_frame_equal(result, expected)

        expected = expected.reset_index()
        result = gni.describe()
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", [int, float, object])
@pytest.mark.parametrize(
    "kwargs",
    [
        {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None},
        {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]},
        {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None},
    ],
)
def test_groupby_empty_dataset(dtype, kwargs):
    """``describe`` on an empty groupby matches the emptied non-empty result."""
    # GH#41575
    df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
    df["B"] = df["B"].astype(int)
    df["C"] = df["C"].astype(float)

    # DataFrameGroupBy: compare against the one-row result sliced to zero rows.
    result = df.iloc[:0].groupby("A").describe(**kwargs)
    expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0]
    tm.assert_frame_equal(result, expected)

    # SeriesGroupBy: same comparison, with the index replaced by an empty Index.
    result = df.iloc[:0].groupby("A").B.describe(**kwargs)
    expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0]
    expected.index = Index([])
    tm.assert_frame_equal(result, expected)