import re

import numpy as np
import pytest

from pandas._libs import lib

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    Series,
    Timestamp,
    date_range,
)
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args


class TestNumericOnly:
    """Check that ``numeric_only`` is passed through to the groupby agg functions."""

    @pytest.fixture
    def df(self):
        """Return a three-row frame with one column per dtype family under test."""
        # GH3668
        # GH5724
        df = DataFrame(
            {
                "group": [1, 1, 2],
                "int": [1, 2, 3],
                "float": [4.0, 5.0, 6.0],
                "string": list("abc"),
                "category_string": Series(list("abc")).astype("category"),
                "category_int": [7, 8, 9],
                "datetime": date_range("20130101", periods=3),
                "datetimetz": date_range("20130101", periods=3, tz="US/Eastern"),
                "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
            },
            columns=[
                "group",
                "int",
                "float",
                "string",
                "category_string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ],
        )
        return df

    @pytest.mark.parametrize("method", ["mean", "median"])
    def test_averages(self, df, method):
        """mean / median with ``numeric_only=True`` keep only the numeric columns."""
        # mean / median
        expected_columns_numeric = Index(["int", "float", "category_int"])

        gb = df.groupby("group")
        # NOTE: ``columns=`` below restricts the expected frame to the three
        # numeric columns; the datetime/timedelta entries are not selected.
        expected = DataFrame(
            {
                "category_int": [7.5, 9],
                "float": [4.5, 6.0],
                "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")],
                "int": [1.5, 3],
                "datetime": [
                    Timestamp("2013-01-01 12:00:00"),
                    Timestamp("2013-01-03 00:00:00"),
                ],
                "datetimetz": [
                    Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
                    Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
                ],
            },
            index=Index([1, 2], name="group"),
            columns=[
                "int",
                "float",
                "category_int",
            ],
        )

        result = getattr(gb, method)(numeric_only=True)
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        expected_columns = expected.columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["min", "max"])
    def test_extrema(self, df, method):
        """Run the shared ``_check`` assertions for min / max."""
        # TODO: min, max *should* handle
        # categorical (ordered) dtype

        expected_columns = Index(
            [
                "int",
                "float",
                "string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ]
        )
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["first", "last"])
    def test_first_last(self, df, method):
        """first / last keep every non-grouping column, including categoricals."""
        expected_columns = Index(
            [
                "int",
                "float",
                "string",
                "category_string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ]
        )
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["sum", "cumsum"])
    def test_sum_cumsum(self, df, method):
        """Run the shared ``_check`` assertions for sum / cumsum."""
        expected_columns_numeric = Index(["int", "float", "category_int"])
        expected_columns = Index(
            ["int", "float", "string", "category_int", "timedelta"]
        )
        if method == "cumsum":
            # cumsum loses string
            expected_columns = Index(["int", "float", "category_int", "timedelta"])

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["prod", "cumprod"])
    def test_prod_cumprod(self, df, method):
        """Run the shared ``_check`` assertions for prod / cumprod."""
        expected_columns = Index(["int", "float", "category_int"])
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["cummin", "cummax"])
    def test_cummin_cummax(self, df, method):
        """Run the shared ``_check`` assertions for cummin / cummax."""
        # like min, max, but don't include strings
        expected_columns = Index(
            ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
        )

        # GH#15561: numeric_only=False set by default like min/max
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    def _check(self, df, method, expected_columns, expected_columns_numeric):
        """
        Assert the behavior of ``method`` with default and explicit numeric_only.

        Parameters
        ----------
        df : DataFrame
            Frame to group by its "group" column.
        method : str
            Name of the groupby method to call.
        expected_columns : Index
            Expected result columns with ``numeric_only=False`` (only compared
            for "first" / "last"; every other method is expected to raise).
        expected_columns_numeric : Index
            Expected result columns when calling ``method`` with no arguments
            (only compared for methods that are not expected to raise).
        """
        gb = df.groupby("group")

        # object dtypes for transformations are not implemented in Cython and
        # have no Python fallback
        exception = NotImplementedError if method.startswith("cum") else TypeError

        if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
            # These methods default to numeric_only=False and raise on the
            # non-numeric columns (NotImplementedError for the cum* transforms,
            # TypeError otherwise)
            msg = "|".join(
                [
                    "Categorical is not ordered",
                    f"Cannot perform {method} with non-ordered Categorical",
                    re.escape(f"agg function failed [how->{method},dtype->object]"),
                    # cumsum/cummin/cummax/cumprod
                    "function is not implemented for this dtype",
                ]
            )
            with pytest.raises(exception, match=msg):
                getattr(gb, method)()
        elif method in ("sum", "mean", "median", "prod"):
            msg = "|".join(
                [
                    "category type does not support sum operations",
                    re.escape(f"agg function failed [how->{method},dtype->object]"),
                    re.escape(f"agg function failed [how->{method},dtype->string]"),
                ]
            )
            with pytest.raises(exception, match=msg):
                getattr(gb, method)()
        else:
            result = getattr(gb, method)()
            tm.assert_index_equal(result.columns, expected_columns_numeric)

        if method not in ("first", "last"):
            msg = "|".join(
                [
                    "Categorical is not ordered",
                    "category type does not support",
                    "function is not implemented for this dtype",
                    f"Cannot perform {method} with non-ordered Categorical",
                    re.escape(f"agg function failed [how->{method},dtype->object]"),
                    re.escape(f"agg function failed [how->{method},dtype->string]"),
                ]
            )
            with pytest.raises(exception, match=msg):
                getattr(gb, method)(numeric_only=False)
        else:
            result = getattr(gb, method)(numeric_only=False)
            tm.assert_index_equal(result.columns, expected_columns)


@pytest.mark.parametrize("numeric_only", [True, False, None])
def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_string):
    """Check each groupby method when called with ``axis=1`` and ``numeric_only``."""
    if groupby_func in ("idxmax", "idxmin"):
        pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1")
    if groupby_func in ("corrwith", "skew"):
        msg = "GH#47723 groupby.corrwith and skew do not correctly implement axis=1"
        request.applymarker(pytest.mark.xfail(reason=msg))

    df = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"]
    )
    # Non-numeric column that numeric_only=True is expected to drop
    df["E"] = "x"
    groups = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4]
    gb = df.groupby(groups)
    method = getattr(gb, groupby_func)
    args = get_groupby_method_args(groupby_func, df)
    kwargs = {"axis": 1}
    if numeric_only is not None:
        # when numeric_only is None we don't pass any argument
        kwargs["numeric_only"] = numeric_only

    # Functions without numeric_only and axis args
    no_args = ("cumprod", "cumsum", "diff", "fillna", "pct_change", "rank", "shift")
    # Functions with axis args
    has_axis = (
        "cumprod",
        "cumsum",
        "diff",
        "pct_change",
        "rank",
        "shift",
        "cummax",
        "cummin",
        "idxmin",
        "idxmax",
        "fillna",
    )
    warn_msg = f"DataFrameGroupBy.{groupby_func} with axis=1 is deprecated"
    if numeric_only is not None and groupby_func in no_args:
        msg = "got an unexpected keyword argument 'numeric_only'"
        if groupby_func in ("cumprod", "cumsum"):
            with pytest.raises(TypeError, match=msg):
                with tm.assert_produces_warning(FutureWarning, match=warn_msg):
                    method(*args, **kwargs)
        else:
            with pytest.raises(TypeError, match=msg):
                method(*args, **kwargs)
    elif groupby_func not in has_axis:
        msg = "got an unexpected keyword argument 'axis'"
        with pytest.raises(TypeError, match=msg):
            method(*args, **kwargs)
    # fillna and shift are successful even on object dtypes
    elif not numeric_only and groupby_func not in ("fillna", "shift"):
        # numeric_only is False or None here (None is falsy), so the object
        # column "E" takes part in the operation and is expected to fail
        msgs = (
            # cummax, cummin, rank
            "not supported between instances of",
            # cumprod
            "can't multiply sequence by non-int of type 'float'",
            # cumsum, diff, pct_change
            "unsupported operand type",
            "has no kernel",
        )
        if using_infer_string:
            import pyarrow as pa

            errs = (TypeError, pa.lib.ArrowNotImplementedError)
        else:
            errs = TypeError
        with pytest.raises(errs, match=f"({'|'.join(msgs)})"):
            with tm.assert_produces_warning(FutureWarning, match=warn_msg):
                method(*args, **kwargs)
    else:
        with tm.assert_produces_warning(FutureWarning, match=warn_msg):
            result = method(*args, **kwargs)

        # Build the expectation by applying the same-named DataFrame method to
        # the transposed frame and transposing back
        df_expected = df.drop(columns="E").T if numeric_only else df.T
        expected = getattr(df_expected, groupby_func)(*args).T
        if groupby_func == "shift" and not numeric_only:
            # shift with axis=1 leaves the leftmost column as numeric
            # but transposing for expected gives us object dtype
            expected = expected.astype(float)

        tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "kernel, has_arg",
    [
        ("all", False),
        ("any", False),
        ("bfill", False),
        ("corr", True),
        ("corrwith", True),
        ("cov", True),
        ("cummax", True),
        ("cummin", True),
        ("cumprod", True),
        ("cumsum", True),
        ("diff", False),
        ("ffill", False),
        ("fillna", False),
        ("first", True),
        ("idxmax", True),
        ("idxmin", True),
        ("last", True),
        ("max", True),
        ("mean", True),
        ("median", True),
        ("min", True),
        ("nth", False),
        ("nunique", False),
        ("pct_change", False),
        ("prod", True),
        ("quantile", True),
        ("sem", True),
        ("skew", True),
        ("std", True),
        ("sum", True),
        ("var", True),
    ],
)
@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_numeric_only(kernel, has_arg, numeric_only, keys):
    """Check each kernel's handling of an object column under ``numeric_only``."""
    # GH#46072
    # has_arg: Whether the op has a numeric_only arg
    df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": 2 * [object]})

    args = get_groupby_method_args(kernel, df)
    # lib.no_default means "do not pass numeric_only at all"
    kwargs = {} if numeric_only is lib.no_default else {"numeric_only": numeric_only}

    gb = df.groupby(keys)
    method = getattr(gb, kernel)
    if has_arg and numeric_only is True:
        # Cases where b does not appear in the result
        result = method(*args, **kwargs)
        assert "b" not in result.columns
    elif (
        # kernels that work on any dtype and have numeric_only arg
        kernel in ("first", "last")
        or (
            # kernels that work on any dtype and don't have numeric_only arg
            kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique")
            and numeric_only is lib.no_default
        )
    ):
        warn = FutureWarning if kernel == "fillna" else None
        msg = "DataFrameGroupBy.fillna is deprecated"
        with tm.assert_produces_warning(warn, match=msg):
            result = method(*args, **kwargs)
        assert "b" in result.columns
    elif has_arg:
        assert numeric_only is not True
        # kernels that are successful on any dtype were above; this will fail

        # object dtypes for transformations are not implemented in Cython and
        # have no Python fallback
        exception = NotImplementedError if kernel.startswith("cum") else TypeError

        msg = "|".join(
            [
                "not allowed for this dtype",
                "cannot be performed against 'object' dtypes",
                # On PY39 message is "a number"; on PY310 and after is "a real number"
                "must be a string or a.* number",
                "unsupported operand type",
                "function is not implemented for this dtype",
                re.escape(f"agg function failed [how->{kernel},dtype->object]"),
            ]
        )
        if kernel == "idxmin":
            msg = "'<' not supported between instances of 'type' and 'type'"
        elif kernel == "idxmax":
            msg = "'>' not supported between instances of 'type' and 'type'"
        with pytest.raises(exception, match=msg):
            method(*args, **kwargs)
    elif not has_arg and numeric_only is not lib.no_default:
        with pytest.raises(
            TypeError, match="got an unexpected keyword argument 'numeric_only'"
        ):
            method(*args, **kwargs)
    else:
        assert kernel in ("diff", "pct_change")
        assert numeric_only is lib.no_default
        # Doesn't have numeric_only argument and fails on nuisance columns
        with pytest.raises(TypeError, match=r"unsupported operand type"):
            method(*args, **kwargs)


@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning")
@pytest.mark.parametrize("dtype", [bool, int, float, object])
def test_deprecate_numeric_only_series(dtype, groupby_func, request):
    """Check SeriesGroupBy methods by default and with ``numeric_only=True``."""
    # GH#46560
    grouper = [0, 0, 1]

    ser = Series([1, 0, 0], dtype=dtype)
    gb = ser.groupby(grouper)

    if groupby_func == "corrwith":
        # corrwith is not implemented on SeriesGroupBy
        assert not hasattr(gb, groupby_func)
        return

    method = getattr(gb, groupby_func)

    # Reference computation on the same values without the dtype override
    expected_ser = Series([1, 0, 0])
    expected_gb = expected_ser.groupby(grouper)
    expected_method = getattr(expected_gb, groupby_func)

    args = get_groupby_method_args(groupby_func, ser)

    fails_on_numeric_object = (
        "corr",
        "cov",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
        "quantile",
    )
    # ops that give an object result on object input
    obj_result = (
        "first",
        "last",
        "nth",
        "bfill",
        "ffill",
        "shift",
        "sum",
        "diff",
        "pct_change",
        "var",
        "mean",
        "median",
        "min",
        "max",
        "prod",
        "skew",
    )

    # Test default behavior; kernels that fail may be enabled in the future but kernels
    # that succeed should not be allowed to fail (without deprecation, at least)
    if groupby_func in fails_on_numeric_object and dtype is object:
        if groupby_func == "quantile":
            msg = "cannot be performed against 'object' dtypes"
        else:
            msg = "is not supported for object dtype"
        # "fillna" is not in fails_on_numeric_object, so no deprecation warning
        # can be expected on this path; only assert that none is emitted
        with tm.assert_produces_warning(None):
            with pytest.raises(TypeError, match=msg):
                method(*args)
    elif dtype is object:
        warn = FutureWarning if groupby_func == "fillna" else None
        warn_msg = "SeriesGroupBy.fillna is deprecated"
        with tm.assert_produces_warning(warn, match=warn_msg):
            result = method(*args)
        with tm.assert_produces_warning(warn, match=warn_msg):
            expected = expected_method(*args)
        if groupby_func in obj_result:
            expected = expected.astype(object)
        tm.assert_series_equal(result, expected)

    has_numeric_only = (
        "first",
        "last",
        "max",
        "mean",
        "median",
        "min",
        "prod",
        "quantile",
        "sem",
        "skew",
        "std",
        "sum",
        "var",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
    )
    if groupby_func not in has_numeric_only:
        msg = "got an unexpected keyword argument 'numeric_only'"
        with pytest.raises(TypeError, match=msg):
            method(*args, numeric_only=True)
    elif dtype is object:
        msg = "|".join(
            [
                "SeriesGroupBy.sem called with numeric_only=True and dtype object",
                "Series.skew does not allow numeric_only=True with non-numeric",
                "cum(sum|prod|min|max) is not supported for object dtype",
                r"Cannot use numeric_only=True with SeriesGroupBy\..* and non-numeric",
            ]
        )
        with pytest.raises(TypeError, match=msg):
            method(*args, numeric_only=True)
    elif dtype == bool and groupby_func == "quantile":
        msg = "Allowing bool dtype in SeriesGroupBy.quantile"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            # GH#51424
            result = method(*args, numeric_only=True)
            expected = method(*args, numeric_only=False)
        tm.assert_series_equal(result, expected)
    else:
        # On a numeric Series numeric_only should make no difference
        result = method(*args, numeric_only=True)
        expected = method(*args, numeric_only=False)
        tm.assert_series_equal(result, expected)