""" Generic data algorithms. This module is experimental at the moment and not intended for public consumption """ from __future__ import annotations import decimal import operator from textwrap import dedent from typing import ( TYPE_CHECKING, Literal, cast, ) import warnings import numpy as np from pandas._libs import ( algos, hashtable as htable, iNaT, lib, ) from pandas._typing import ( AnyArrayLike, ArrayLike, AxisInt, DtypeObj, TakeIndexer, npt, ) from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, np_find_common_type, ) from pandas.core.dtypes.common import ( ensure_float64, ensure_object, ensure_platform_int, is_array_like, is_bool_dtype, is_complex_dtype, is_dict_like, is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, is_list_like, is_object_dtype, is_signed_integer_dtype, needs_i8_conversion, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ( BaseMaskedDtype, CategoricalDtype, ExtensionDtype, NumpyEADtype, ) from pandas.core.dtypes.generic import ( ABCDatetimeArray, ABCExtensionArray, ABCIndex, ABCMultiIndex, ABCSeries, ABCTimedeltaArray, ) from pandas.core.dtypes.missing import ( isna, na_value_for_dtype, ) from pandas.core.array_algos.take import take_nd from pandas.core.construction import ( array as pd_array, ensure_wrapped_if_datetimelike, extract_array, ) from pandas.core.indexers import validate_indices if TYPE_CHECKING: from pandas._typing import ( ListLike, NumpySorter, NumpyValueArrayLike, ) from pandas import ( Categorical, Index, Series, ) from pandas.core.arrays import ( BaseMaskedArray, ExtensionArray, ) # --------------- # # dtype access # # --------------- # def _ensure_data(values: ArrayLike) -> np.ndarray: """ routine to ensure that our data is of the correct input dtype for lower-level routines This will coerce: - ints -> int64 - uint -> uint64 - bool -> uint8 - datetimelike -> i8 - datetime64tz -> i8 (in local tz) - categorical -> codes Parameters ---------- values : np.ndarray or ExtensionArray Returns ------- np.ndarray """ if not isinstance(values, ABCMultiIndex): # extract_array would raise values = extract_array(values, extract_numpy=True) if is_object_dtype(values.dtype): return ensure_object(np.asarray(values)) elif isinstance(values.dtype, BaseMaskedDtype): # i.e. BooleanArray, FloatingArray, IntegerArray values = cast("BaseMaskedArray", values) if not values._hasna: # No pd.NAs -> We can avoid an object-dtype cast (and copy) GH#41816 # recurse to avoid re-implementing logic for eg bool->uint8 return _ensure_data(values._data) return np.asarray(values) elif isinstance(values.dtype, CategoricalDtype): # NB: cases that go through here should NOT be using _reconstruct_data # on the back-end. values = cast("Categorical", values) return values.codes elif is_bool_dtype(values.dtype): if isinstance(values, np.ndarray): # i.e. actually dtype == np.dtype("bool") return np.asarray(values).view("uint8") else: # e.g. 
Sparse[bool, False] # TODO: no test cases get here return np.asarray(values).astype("uint8", copy=False) elif is_integer_dtype(values.dtype): return np.asarray(values) elif is_float_dtype(values.dtype): # Note: checking `values.dtype == "float128"` raises on Windows and 32bit # error: Item "ExtensionDtype" of "Union[Any, ExtensionDtype, dtype[Any]]" # has no attribute "itemsize" if values.dtype.itemsize in [2, 12, 16]: # type: ignore[union-attr] # we dont (yet) have float128 hashtable support return ensure_float64(values) return np.asarray(values) elif is_complex_dtype(values.dtype): return cast(np.ndarray, values) # datetimelike elif needs_i8_conversion(values.dtype): npvalues = values.view("i8") npvalues = cast(np.ndarray, npvalues) return npvalues # we have failed, return object values = np.asarray(values, dtype=object) return ensure_object(values) def _reconstruct_data( values: ArrayLike, dtype: DtypeObj, original: AnyArrayLike ) -> ArrayLike: """ reverse of _ensure_data Parameters ---------- values : np.ndarray or ExtensionArray dtype : np.dtype or ExtensionDtype original : AnyArrayLike Returns ------- ExtensionArray or np.ndarray """ if isinstance(values, ABCExtensionArray) and values.dtype == dtype: # Catch DatetimeArray/TimedeltaArray return values if not isinstance(dtype, np.dtype): # i.e. ExtensionDtype; note we have ruled out above the possibility # that values.dtype == dtype cls = dtype.construct_array_type() values = cls._from_sequence(values, dtype=dtype) else: values = values.astype(dtype, copy=False) return values def _ensure_arraylike(values, func_name: str) -> ArrayLike: """ ensure that we are arraylike if not already """ if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)): # GH#52986 if func_name != "isin-targets": # Make an exception for the comps argument in isin. warnings.warn( f"{func_name} with argument that is not not a Series, Index, " "ExtensionArray, or np.ndarray is deprecated and will raise in a " "future version.", FutureWarning, stacklevel=find_stack_level(), ) inferred = lib.infer_dtype(values, skipna=False) if inferred in ["mixed", "string", "mixed-integer"]: # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160 if isinstance(values, tuple): values = list(values) values = construct_1d_object_array_from_listlike(values) else: values = np.asarray(values) return values _hashtables = { "complex128": htable.Complex128HashTable, "complex64": htable.Complex64HashTable, "float64": htable.Float64HashTable, "float32": htable.Float32HashTable, "uint64": htable.UInt64HashTable, "uint32": htable.UInt32HashTable, "uint16": htable.UInt16HashTable, "uint8": htable.UInt8HashTable, "int64": htable.Int64HashTable, "int32": htable.Int32HashTable, "int16": htable.Int16HashTable, "int8": htable.Int8HashTable, "string": htable.StringHashTable, "object": htable.PyObjectHashTable, } def _get_hashtable_algo(values: np.ndarray): """ Parameters ---------- values : np.ndarray Returns ------- htable : HashTable subclass values : ndarray """ values = _ensure_data(values) ndtype = _check_object_for_strings(values) hashtable = _hashtables[ndtype] return hashtable, values def _check_object_for_strings(values: np.ndarray) -> str: """ Check if we can use string hashtable instead of object hashtable. 
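
# Hedged, illustrative sketch (added for exposition; these are private
# helpers, not public API): how _ensure_data coerces a couple of common
# dtypes before they reach the hashtable layer. Outputs assume NumPy's
# default array repr.
#
# >>> _ensure_data(np.array([True, False]))  # bool -> uint8 view
# array([1, 0], dtype=uint8)
# >>> _ensure_data(np.array(["2016-01-01"], dtype="M8[ns]"))  # datetimelike -> i8
# array([1451606400000000000])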
def _check_object_for_strings(values: np.ndarray) -> str:
    """
    Check if we can use string hashtable instead of object hashtable.

    Parameters
    ----------
    values : ndarray

    Returns
    -------
    str
    """
    ndtype = values.dtype.name
    if ndtype == "object":
        # it's cheaper to use a String Hash Table than Object; we infer
        # including nulls because that is the only difference between
        # StringHashTable and ObjectHashtable
        if lib.is_string_array(values, skipna=False):
            ndtype = "string"
    return ndtype


# --------------- #
# top-level algos #
# --------------- #


def unique(values):
    """
    Return unique values based on a hash table.

    Uniques are returned in order of appearance. This does NOT sort.

    Significantly faster than numpy.unique for long enough sequences.
    Includes NA values.

    Parameters
    ----------
    values : 1d array-like

    Returns
    -------
    numpy.ndarray or ExtensionArray

        The return can be:

        * Index : when the input is an Index
        * Categorical : when the input is a Categorical dtype
        * ndarray : when the input is a Series/ndarray

        Return numpy.ndarray or ExtensionArray.

    See Also
    --------
    Index.unique : Return unique values from an Index.
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> pd.unique(pd.Series([2, 1, 3, 3]))
    array([2, 1, 3])

    >>> pd.unique(pd.Series([2] + [1] * 5))
    array([2, 1])

    >>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")]))
    array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')

    >>> pd.unique(
    ...     pd.Series(
    ...         [
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...         ]
    ...     )
    ... )
    <DatetimeArray>
    ['2016-01-01 00:00:00-05:00']
    Length: 1, dtype: datetime64[ns, US/Eastern]

    >>> pd.unique(
    ...     pd.Index(
    ...         [
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...         ]
    ...     )
    ... )
    DatetimeIndex(['2016-01-01 00:00:00-05:00'], dtype='datetime64[ns, US/Eastern]', freq=None)

    >>> pd.unique(np.array(list("baabc"), dtype="O"))
    array(['b', 'a', 'c'], dtype=object)

    An unordered Categorical will return categories in the
    order of appearance.

    >>> pd.unique(pd.Series(pd.Categorical(list("baabc"))))
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    >>> pd.unique(pd.Series(pd.Categorical(list("baabc"), categories=list("abc"))))
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    An ordered Categorical preserves the category ordering.

    >>> pd.unique(
    ...     pd.Series(
    ...         pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
    ...     )
    ... )
    ['b', 'a', 'c']
    Categories (3, object): ['a' < 'b' < 'c']

    An array of tuples

    >>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values)
    array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
    """
    return unique_with_mask(values)


def nunique_ints(values: ArrayLike) -> int:
    """
    Return the number of unique values for integer array-likes.

    Significantly faster than pandas.unique for long enough sequences.
    No checks are done to ensure input is integral.

    Parameters
    ----------
    values : 1d array-like

    Returns
    -------
    int : The number of unique values in ``values``
    """
    if len(values) == 0:
        return 0
    values = _ensure_data(values)
    # bincount requires intp
    result = (np.bincount(values.ravel().astype("intp")) != 0).sum()
    return result
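
# Illustrative usage of nunique_ints (hedged example, added for exposition):
# the bincount trick counts the distinct non-negative integers without a sort.
# The result of `.sum()` is shown with NumPy 1.x scalar repr.
#
# >>> nunique_ints(np.array([1, 3, 3, 7, 7, 7]))
# 3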
def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
    """See algorithms.unique for docs. Takes a mask for masked arrays."""
    values = _ensure_arraylike(values, func_name="unique")

    if isinstance(values.dtype, ExtensionDtype):
        # Dispatch to extension dtype's unique.
        return values.unique()

    original = values
    hashtable, values = _get_hashtable_algo(values)

    table = hashtable(len(values))
    if mask is None:
        uniques = table.unique(values)
        uniques = _reconstruct_data(uniques, original.dtype, original)
        return uniques

    else:
        uniques, mask = table.unique(values, mask=mask)
        uniques = _reconstruct_data(uniques, original.dtype, original)
        assert mask is not None  # for mypy
        return uniques, mask.astype("bool")


unique1d = unique


_MINIMUM_COMP_ARR_LEN = 1_000_000


def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
    """
    Compute the isin boolean array.

    Parameters
    ----------
    comps : list-like
    values : list-like

    Returns
    -------
    ndarray[bool]
        Same length as `comps`.
    """
    if not is_list_like(comps):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a `{type(comps).__name__}`"
        )
    if not is_list_like(values):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a `{type(values).__name__}`"
        )

    if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
        orig_values = list(values)
        values = _ensure_arraylike(orig_values, func_name="isin-targets")

        if (
            len(values) > 0
            and values.dtype.kind in "iufcb"
            and not is_signed_integer_dtype(comps)
        ):
            # GH#46485 Use object to avoid upcast to float64 later
            # TODO: Share with _find_common_type_compat
            values = construct_1d_object_array_from_listlike(orig_values)

    elif isinstance(values, ABCMultiIndex):
        # Avoid raising in extract_array
        values = np.array(values)
    else:
        values = extract_array(values, extract_numpy=True, extract_range=True)

    comps_array = _ensure_arraylike(comps, func_name="isin")
    comps_array = extract_array(comps_array, extract_numpy=True)
    if not isinstance(comps_array, np.ndarray):
        # i.e. Extension Array
        return comps_array.isin(values)

    elif needs_i8_conversion(comps_array.dtype):
        # Dispatch to DatetimeLikeArrayMixin.isin
        return pd_array(comps_array).isin(values)
    elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps_array.dtype):
        # e.g. comps_array are integers and values are datetime64s
        return np.zeros(comps_array.shape, dtype=bool)
        # TODO: not quite right ... Sparse/Categorical
    elif needs_i8_conversion(values.dtype):
        return isin(comps_array, values.astype(object))

    elif isinstance(values.dtype, ExtensionDtype):
        return isin(np.asarray(comps_array), np.asarray(values))

    # GH16012
    # Ensure np.isin doesn't get object types or it *may* throw an exception
    # Although the hashmap has O(1) look-up (vs. O(log n) in a sorted array),
    # np.isin is faster for small sizes
    if (
        len(comps_array) > _MINIMUM_COMP_ARR_LEN
        and len(values) <= 26
        and comps_array.dtype != object
    ):
        # If the values include nan we need to check for nan explicitly
        # since np.nan is not equal to np.nan
        if isna(values).any():

            def f(c, v):
                return np.logical_or(np.isin(c, v).ravel(), np.isnan(c))

        else:
            f = lambda a, b: np.isin(a, b).ravel()

    else:
        common = np_find_common_type(values.dtype, comps_array.dtype)
        values = values.astype(common, copy=False)
        comps_array = comps_array.astype(common, copy=False)
        f = htable.ismember

    return f(comps_array, values)
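
# Illustrative usage of isin (hedged example, added for exposition): inputs
# shorter than _MINIMUM_COMP_ARR_LEN take the hashtable (htable.ismember)
# path described above.
#
# >>> isin(np.array([1, 2, 3, 4]), [2, 4])
# array([False,  True, False,  True])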
def factorize_array(
    values: np.ndarray,
    use_na_sentinel: bool = True,
    size_hint: int | None = None,
    na_value: object = None,
    mask: npt.NDArray[np.bool_] | None = None,
) -> tuple[npt.NDArray[np.intp], np.ndarray]:
    """
    Factorize a numpy array to codes and uniques.

    This doesn't do any coercion of types or unboxing before factorization.

    Parameters
    ----------
    values : ndarray
    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NaN values. If False,
        NaN values will be encoded as non-negative integers and will not drop the
        NaN from the uniques of the values.
    size_hint : int, optional
        Passed through to the hashtable's 'get_labels' method
    na_value : object, optional
        A value in `values` to consider missing. Note: only use this
        parameter when you know that you don't have any values pandas would
        consider missing in the array (NaN for float data, iNaT for
        datetimes, etc.).
    mask : ndarray[bool], optional
        If not None, the mask is used as indicator for missing values
        (True = missing, False = valid) instead of `na_value` or
        condition "val != val".

    Returns
    -------
    codes : ndarray[np.intp]
    uniques : ndarray
    """
    original = values
    if values.dtype.kind in "mM":
        # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
        #  need to do the same to na_value. We are assuming here that the passed
        #  na_value is an appropriately-typed NaT.
        # e.g. test_where_datetimelike_categorical
        na_value = iNaT

    hash_klass, values = _get_hashtable_algo(values)

    table = hash_klass(size_hint or len(values))
    uniques, codes = table.factorize(
        values,
        na_sentinel=-1,
        na_value=na_value,
        mask=mask,
        ignore_na=use_na_sentinel,
    )

    # re-cast e.g. i8->dt64/td64, uint8->bool
    uniques = _reconstruct_data(uniques, original.dtype, original)

    codes = ensure_platform_int(codes)
    return codes, uniques


@doc(
    values=dedent(
        """\
    values : sequence
        A 1-D sequence. Sequences that aren't pandas objects are
        coerced to ndarrays before factorization.
    """
    ),
    sort=dedent(
        """\
    sort : bool, default False
        Sort `uniques` and shuffle `codes` to maintain the
        relationship.
    """
    ),
    size_hint=dedent(
        """\
    size_hint : int, optional
        Hint to the hashtable sizer.
    """
    ),
)
def factorize(
    values,
    sort: bool = False,
    use_na_sentinel: bool = True,
    size_hint: int | None = None,
) -> tuple[np.ndarray, np.ndarray | Index]:
    """
    Encode the object as an enumerated type or categorical variable.

    This method is useful for obtaining a numeric representation of an
    array when all that matters is identifying distinct values. `factorize`
    is available as both a top-level function :func:`pandas.factorize`,
    and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.

    Parameters
    ----------
    {values}{sort}
    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NaN values. If False,
        NaN values will be encoded as non-negative integers and will not drop the
        NaN from the uniques of the values.

        .. versionadded:: 1.5.0
    {size_hint}\

    Returns
    -------
    codes : ndarray
        An integer ndarray that's an indexer into `uniques`.
        ``uniques.take(codes)`` will have the same values as `values`.
    uniques : ndarray, Index, or Categorical
        The unique valid values. When `values` is Categorical, `uniques`
        is a Categorical. When `values` is some other pandas object, an
        `Index` is returned. Otherwise, a 1-D ndarray is returned.

        .. note::

           Even if there's a missing value in `values`, `uniques` will
           *not* contain an entry for it.

    See Also
    --------
    cut : Discretize continuous-valued array.
    unique : Find the unique values in an array.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.factorize>` for more examples.

    Examples
    --------
    These examples all show factorize as a top-level method like
    ``pd.factorize(values)``. The results are identical for methods like
    :meth:`Series.factorize`.

    >>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"))
    >>> codes
    array([0, 0, 1, 2, 0])
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    With ``sort=True``, the `uniques` will be sorted, and `codes` will be
    shuffled so that the relationship is maintained.

    >>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"),
    ...                               sort=True)
    >>> codes
    array([1, 1, 0, 2, 1])
    >>> uniques
    array(['a', 'b', 'c'], dtype=object)

    When ``use_na_sentinel=True`` (the default), missing values are indicated in
    the `codes` with the sentinel value ``-1`` and missing values are not
    included in `uniques`.

    >>> codes, uniques = pd.factorize(np.array(['b', None, 'a', 'c', 'b'], dtype="O"))
    >>> codes
    array([ 0, -1,  1,  2,  0])
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    Thus far, we've only factorized lists (which are internally coerced to
    NumPy arrays). When factorizing pandas objects, the type of `uniques`
    will differ. For Categoricals, a `Categorical` is returned.

    >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1])
    >>> uniques
    ['a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    Notice that ``'b'`` is in ``uniques.categories``, despite not being
    present in ``cat.values``.

    For all other pandas objects, an Index of the appropriate type is
    returned.

    >>> cat = pd.Series(['a', 'a', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1])
    >>> uniques
    Index(['a', 'c'], dtype='object')

    If NaN is in the values, and we want to include NaN in the uniques of the
    values, it can be achieved by setting ``use_na_sentinel=False``.

    >>> values = np.array([1, 2, 1, np.nan])
    >>> codes, uniques = pd.factorize(values)  # default: use_na_sentinel=True
    >>> codes
    array([ 0,  1,  0, -1])
    >>> uniques
    array([1., 2.])

    >>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
    >>> codes
    array([0, 1, 0, 2])
    >>> uniques
    array([ 1.,  2., nan])
    """
    # Implementation notes: This method is responsible for 3 things
    # 1.) coercing data to array-like (ndarray, Index, extension array)
    # 2.) factorizing codes and uniques
    # 3.) Maybe boxing the uniques in an Index
    #
    # Step 2 is dispatched to extension types (like Categorical). They are
    # responsible only for factorization. All data coercion, sorting and boxing
    # should happen here.
    if isinstance(values, (ABCIndex, ABCSeries)):
        return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel)

    values = _ensure_arraylike(values, func_name="factorize")
    original = values

    if (
        isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
        and values.freq is not None
    ):
        # The presence of 'freq' means we can fast-path sorting and know there
        #  aren't NAs
        codes, uniques = values.factorize(sort=sort)
        return codes, uniques

    elif not isinstance(values, np.ndarray):
        # i.e. ExtensionArray
        codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel)

    else:
        values = np.asarray(values)  # convert DTA/TDA/MultiIndex

        if not use_na_sentinel and values.dtype == object:
            # factorize can now handle differentiating various types of null values.
            # These can only occur when the array has object dtype.
            # However, for backwards compatibility we only use the null for the
            # provided dtype. This may be revisited in the future, see GH#48476.
            null_mask = isna(values)
            if null_mask.any():
                na_value = na_value_for_dtype(values.dtype, compat=False)
                # Don't modify (potentially user-provided) array
                values = np.where(null_mask, na_value, values)

        codes, uniques = factorize_array(
            values,
            use_na_sentinel=use_na_sentinel,
            size_hint=size_hint,
        )

    if sort and len(uniques) > 0:
        uniques, codes = safe_sort(
            uniques,
            codes,
            use_na_sentinel=use_na_sentinel,
            assume_unique=True,
            verify=False,
        )

    uniques = _reconstruct_data(uniques, original.dtype, original)

    return codes, uniques
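
# Illustrative usage of factorize_array (hedged example, added for
# exposition): codes index into uniques, which appear in order of first
# occurrence.
#
# >>> codes, uniques = factorize_array(np.array(["b", "b", "a"], dtype=object))
# >>> codes
# array([0, 0, 1])
# >>> uniques
# array(['b', 'a'], dtype=object)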
def value_counts(
    values,
    sort: bool = True,
    ascending: bool = False,
    normalize: bool = False,
    bins=None,
    dropna: bool = True,
) -> Series:
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : bool, default True
        Sort by values
    ascending : bool, default False
        Sort in ascending order
    normalize : bool, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : bool, default True
        Don't include counts of NaN

    Returns
    -------
    Series
    """
    warnings.warn(
        # GH#53493
        "pandas.value_counts is deprecated and will be removed in a "
        "future version. Use pd.Series(obj).value_counts() instead.",
        FutureWarning,
        stacklevel=find_stack_level(),
    )
    return value_counts_internal(
        values,
        sort=sort,
        ascending=ascending,
        normalize=normalize,
        bins=bins,
        dropna=dropna,
    )


def value_counts_internal(
    values,
    sort: bool = True,
    ascending: bool = False,
    normalize: bool = False,
    bins=None,
    dropna: bool = True,
) -> Series:
    from pandas import (
        Index,
        Series,
    )

    index_name = getattr(values, "name", None)
    name = "proportion" if normalize else "count"

    if bins is not None:
        from pandas.core.reshape.tile import cut

        if isinstance(values, Series):
            values = values._values

        try:
            ii = cut(values, bins, include_lowest=True)
        except TypeError as err:
            raise TypeError("bins argument only works with numeric data.") from err

        # count, remove nulls (from the index), and use the bins
        result = ii.value_counts(dropna=dropna)
        result.name = name
        result = result[result.index.notna()]
        result.index = result.index.astype("interval")
        result = result.sort_index()

        # if we are dropna and we have NO values
        if dropna and (result._values == 0).all():
            result = result.iloc[0:0]

        # normalizing is by len of all (regardless of dropna)
        counts = np.array([len(ii)])

    else:
        if is_extension_array_dtype(values):
            # handle Categorical and sparse,
            result = Series(values, copy=False)._values.value_counts(dropna=dropna)
            result.name = name
            result.index.name = index_name
            counts = result._values
            if not isinstance(counts, np.ndarray):
                # e.g. ArrowExtensionArray
                counts = np.asarray(counts)

        elif isinstance(values, ABCMultiIndex):
            # GH49558
            levels = list(range(values.nlevels))
            result = (
                Series(index=values, name=name)
                .groupby(level=levels, dropna=dropna)
                .size()
            )
            result.index.names = values.names
            counts = result._values

        else:
            values = _ensure_arraylike(values, func_name="value_counts")
            keys, counts, _ = value_counts_arraylike(values, dropna)
            if keys.dtype == np.float16:
                keys = keys.astype(np.float32)

            # For backwards compatibility, we let Index do its normal type
            # inference, _except_ for if it infers from object to bool.
            idx = Index(keys)
            if idx.dtype == bool and keys.dtype == object:
                idx = idx.astype(object)
            elif (
                idx.dtype != keys.dtype  # noqa: PLR1714  # # pylint: disable=R1714
                and idx.dtype != "string[pyarrow_numpy]"
            ):
                warnings.warn(
                    # GH#56161
                    "The behavior of value_counts with object-dtype is deprecated. "
                    "In a future version, this will *not* perform dtype inference "
                    "on the resulting index. To retain the old behavior, use "
                    "`result.index = result.index.infer_objects()`",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )
            idx.name = index_name

            result = Series(counts, index=idx, name=name, copy=False)

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / counts.sum()

    return result
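
# Hedged, illustrative sketch of the replacement suggested by the deprecation
# message above (added for exposition): call value_counts on a Series instead
# of via the deprecated top-level function.
#
# >>> pd.Series([2, 2, 1]).value_counts()
# 2    2
# 1    1
# Name: count, dtype: int64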
# Called once from SparseArray, otherwise could be private
def value_counts_arraylike(
    values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
) -> tuple[ArrayLike, npt.NDArray[np.int64], int]:
    """
    Parameters
    ----------
    values : np.ndarray
    dropna : bool
    mask : np.ndarray[bool] or None, default None

    Returns
    -------
    uniques : np.ndarray
    counts : np.ndarray[np.int64]
    na_counter : int
    """
    original = values
    values = _ensure_data(values)

    keys, counts, na_counter = htable.value_count(values, dropna, mask=mask)

    if needs_i8_conversion(original.dtype):
        # datetime, timedelta, or period

        if dropna:
            mask = keys != iNaT
            keys, counts = keys[mask], counts[mask]

    res_keys = _reconstruct_data(keys, original.dtype, original)
    return res_keys, counts, na_counter


def duplicated(
    values: ArrayLike,
    keep: Literal["first", "last", False] = "first",
    mask: npt.NDArray[np.bool_] | None = None,
) -> npt.NDArray[np.bool_]:
    """
    Return boolean ndarray denoting duplicate values.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.
    mask : ndarray[bool], optional
        array indicating which elements to exclude from checking

    Returns
    -------
    duplicated : ndarray[bool]
    """
    values = _ensure_data(values)
    return htable.duplicated(values, keep=keep, mask=mask)


def mode(
    values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None
) -> ArrayLike:
    """
    Returns the mode(s) of an array.

    Parameters
    ----------
    values : array-like
        Array over which to check for duplicate values.
    dropna : bool, default True
        Don't consider counts of NaN/NaT.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    values = _ensure_arraylike(values, func_name="mode")
    original = values

    if needs_i8_conversion(values.dtype):
        # Got here with ndarray; dispatch to DatetimeArray/TimedeltaArray.
        values = ensure_wrapped_if_datetimelike(values)
        values = cast("ExtensionArray", values)
        return values._mode(dropna=dropna)

    values = _ensure_data(values)

    npresult, res_mask = htable.mode(values, dropna=dropna, mask=mask)
    if res_mask is not None:
        return npresult, res_mask  # type: ignore[return-value]

    try:
        npresult = np.sort(npresult)
    except TypeError as err:
        warnings.warn(
            f"Unable to sort modes: {err}",
            stacklevel=find_stack_level(),
        )

    result = _reconstruct_data(npresult, original.dtype, original)
    return result
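
# Illustrative usage of duplicated and mode (hedged examples, added for
# exposition):
#
# >>> duplicated(np.array([1, 1, 2]), keep="first")
# array([False,  True, False])
# >>> mode(np.array([1, 1, 2]))
# array([1])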
def rank(
    values: ArrayLike,
    axis: AxisInt = 0,
    method: str = "average",
    na_option: str = "keep",
    ascending: bool = True,
    pct: bool = False,
) -> npt.NDArray[np.float64]:
    """
    Rank the values along a given axis.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
        Array whose values will be ranked. The number of dimensions in this
        array must not exceed 2.
    axis : int, default 0
        Axis over which to perform rankings.
    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        The method by which tiebreaks are broken during the ranking.
    na_option : {'keep', 'top'}, default 'keep'
        The method by which NaNs are placed in the ranking.
        - ``keep``: rank each NaN value with a NaN ranking
        - ``top``: replace each NaN with either +/- inf so that they
          are ranked at the top
    ascending : bool, default True
        Whether or not the elements should be ranked in ascending order.
    pct : bool, default False
        Whether or not to display the returned rankings in integer form
        (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
    """
    is_datetimelike = needs_i8_conversion(values.dtype)
    values = _ensure_data(values)

    if values.ndim == 1:
        ranks = algos.rank_1d(
            values,
            is_datetimelike=is_datetimelike,
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
            pct=pct,
        )
    elif values.ndim == 2:
        ranks = algos.rank_2d(
            values,
            axis=axis,
            is_datetimelike=is_datetimelike,
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
            pct=pct,
        )
    else:
        raise TypeError("Array with ndim > 2 are not supported.")

    return ranks
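
# Illustrative usage of rank (hedged example, added for exposition): ranks
# come back as float64, and `method` controls how ties are broken.
#
# >>> rank(np.array([3.0, 1.0, 2.0]))
# array([3., 1., 2.])
# >>> rank(np.array([1.0, 1.0, 2.0]), method="min")
# array([1., 1., 3.])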
# ---- #
# take #
# ---- #


def take(
    arr,
    indices: TakeIndexer,
    axis: AxisInt = 0,
    allow_fill: bool = False,
    fill_value=None,
):
    """
    Take elements from an array.

    Parameters
    ----------
    arr : array-like or scalar value
        Non array-likes (sequences/scalars without a dtype) are coerced
        to an ndarray.

        .. deprecated:: 2.1.0
            Passing an argument other than a numpy.ndarray, ExtensionArray,
            Index, or Series is deprecated.

    indices : sequence of int or one-dimensional np.ndarray of int
        Indices to be taken.
    axis : int, default 0
        The axis over which to select values.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any other
          negative values raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type (``self.dtype.na_value``) is used.

        For multi-dimensional `arr`, each *element* is filled with
        `fill_value`.

    Returns
    -------
    ndarray or ExtensionArray
        Same type as the input.

    Raises
    ------
    IndexError
        When `indices` is out of bounds for the array.
    ValueError
        When the indexer contains negative values other than ``-1``
        and `allow_fill` is True.

    Notes
    -----
    When `allow_fill` is False, `indices` may be whatever dimensionality
    is accepted by NumPy for `arr`.

    When `allow_fill` is True, `indices` should be 1-D.

    See Also
    --------
    numpy.take : Take elements from an array along an axis.

    Examples
    --------
    >>> import pandas as pd

    With the default ``allow_fill=False``, negative numbers indicate
    positional indices from the right.

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1])
    array([10, 10, 30])

    Setting ``allow_fill=True`` will place `fill_value` in those positions.

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
    array([10., 10., nan])

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
    ...                        fill_value=-10)
    array([ 10,  10, -10])
    """
    if not isinstance(arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries)):
        # GH#52981
        warnings.warn(
            "pd.api.extensions.take accepting non-standard inputs is deprecated "
            "and will raise in a future version. Pass either a numpy.ndarray, "
            "ExtensionArray, Index, or Series instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if not is_array_like(arr):
        arr = np.asarray(arr)

    indices = ensure_platform_int(indices)

    if allow_fill:
        # Pandas style, -1 means NA
        validate_indices(indices, arr.shape[axis])
        result = take_nd(
            arr, indices, axis=axis, allow_fill=True, fill_value=fill_value
        )
    else:
        # NumPy style
        result = arr.take(indices, axis=axis)
    return result


# ------------ #
# searchsorted #
# ------------ #


def searchsorted(
    arr: ArrayLike,
    value: NumpyValueArrayLike | ExtensionArray,
    side: Literal["left", "right"] = "left",
    sorter: NumpySorter | None = None,
) -> npt.NDArray[np.intp] | np.intp:
    """
    Find indices where elements should be inserted to maintain order.

    Find the indices into a sorted array `arr` (a) such that, if the
    corresponding elements in `value` were inserted before the indices,
    the order of `arr` would be preserved.

    Assuming that `arr` is sorted:

    ======  ================================
    `side`  returned index `i` satisfies
    ======  ================================
    left    ``arr[i-1] < value <= arr[i]``
    right   ``arr[i-1] <= value < arr[i]``
    ======  ================================

    Parameters
    ----------
    arr : np.ndarray, ExtensionArray, Series
        Input array. If `sorter` is None, then it must be sorted in
        ascending order, otherwise `sorter` must be an array of indices
        that sort it.
    value : array-like or scalar
        Values to insert into `arr`.
    side : {'left', 'right'}, optional
        If 'left', the index of the first suitable location found is given.
        If 'right', return the last such index.  If there is no suitable
        index, return either 0 or N (where N is the length of `arr`).
    sorter : 1-D array-like, optional
        Optional array of integer indices that sort array a into ascending
        order. They are typically the result of argsort.

    Returns
    -------
    array of ints or int
        If value is array-like, array of insertion points.
        If value is scalar, a single integer.

    See Also
    --------
    numpy.searchsorted : Similar method from NumPy.
    """
    if sorter is not None:
        sorter = ensure_platform_int(sorter)

    if (
        isinstance(arr, np.ndarray)
        and arr.dtype.kind in "iu"
        and (is_integer(value) or is_integer_dtype(value))
    ):
        # if `arr` and `value` have different dtypes, `arr` would be
        # recast by numpy, causing a slow search.
        # Before searching below, we therefore try to give `value` the
        # same dtype as `arr`, while guarding against integer overflows.
        iinfo = np.iinfo(arr.dtype.type)
        value_arr = np.array([value]) if is_integer(value) else np.array(value)
        if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():
            # value within bounds, so no overflow, so can convert value dtype
            # to dtype of arr
            dtype = arr.dtype
        else:
            dtype = value_arr.dtype

        if is_integer(value):
            # We know that value is int
            value = cast(int, dtype.type(value))
        else:
            value = pd_array(cast(ArrayLike, value), dtype=dtype)
    else:
        # E.g. if `arr` is an array with dtype='datetime64[ns]'
        # and `value` is a pd.Timestamp, we may need to convert value
        arr = ensure_wrapped_if_datetimelike(arr)

    # Argument 1 to "searchsorted" of "ndarray" has incompatible type
    # "Union[NumpyValueArrayLike, ExtensionArray]"; expected "NumpyValueArrayLike"
    return arr.searchsorted(value, side=side, sorter=sorter)  # type: ignore[arg-type]
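
# Illustrative usage of searchsorted (hedged example, added for exposition):
# a scalar lookup returns a single insertion point. The output is shown with
# NumPy 1.x scalar repr.
#
# >>> searchsorted(np.array([1, 3, 5]), 4)
# 2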
# ---- #
# diff #
# ---- #

_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"}


def diff(arr, n: int, axis: AxisInt = 0):
    """
    difference of n between self,
    analogous to s-s.shift(n)

    Parameters
    ----------
    arr : ndarray or ExtensionArray
    n : int
        number of periods
    axis : {0, 1}
        axis to shift on

    Returns
    -------
    shifted
    """
    n = int(n)
    na = np.nan
    dtype = arr.dtype

    is_bool = is_bool_dtype(dtype)
    if is_bool:
        op = operator.xor
    else:
        op = operator.sub

    if isinstance(dtype, NumpyEADtype):
        # NumpyExtensionArray cannot necessarily hold shifted versions of itself.
        arr = arr.to_numpy()
        dtype = arr.dtype

    if not isinstance(arr, np.ndarray):
        # i.e. ExtensionArray
        if hasattr(arr, f"__{op.__name__}__"):
            if axis != 0:
                raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
            return op(arr, arr.shift(n))
        else:
            raise TypeError(
                f"{type(arr).__name__} has no 'diff' method. "
                "Convert to a suitable dtype prior to calling 'diff'."
            )

    is_timedelta = False
    if arr.dtype.kind in "mM":
        dtype = np.int64
        arr = arr.view("i8")
        na = iNaT
        is_timedelta = True

    elif is_bool:
        # We have to cast in order to be able to hold np.nan
        dtype = np.object_

    elif dtype.kind in "iu":
        # We have to cast in order to be able to hold np.nan

        # int8, int16 are incompatible with float64,
        # see https://github.com/cython/cython/issues/2646
        if arr.dtype.name in ["int8", "int16"]:
            dtype = np.float32
        else:
            dtype = np.float64

    orig_ndim = arr.ndim
    if orig_ndim == 1:
        # reshape so we can always use algos.diff_2d
        arr = arr.reshape(-1, 1)
        # TODO: require axis == 0

    dtype = np.dtype(dtype)
    out_arr = np.empty(arr.shape, dtype=dtype)

    na_indexer = [slice(None)] * 2
    na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
    out_arr[tuple(na_indexer)] = na

    if arr.dtype.name in _diff_special:
        # TODO: can diff_2d dtype specialization troubles be fixed by defining
        #  out_arr inside diff_2d?
        algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta)
    else:
        # To keep mypy happy, _res_indexer is a list while res_indexer is
        #  a tuple, ditto for lag_indexer.
        _res_indexer = [slice(None)] * 2
        _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
        res_indexer = tuple(_res_indexer)

        _lag_indexer = [slice(None)] * 2
        _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
        lag_indexer = tuple(_lag_indexer)

        out_arr[res_indexer] = op(arr[res_indexer], arr[lag_indexer])

    if is_timedelta:
        out_arr = out_arr.view("timedelta64[ns]")

    if orig_ndim == 1:
        out_arr = out_arr[:, 0]
    return out_arr
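
# Illustrative usage of diff (hedged example, added for exposition): integer
# input is upcast to float64 so the leading period can hold np.nan.
#
# >>> diff(np.array([1, 2, 4], dtype="int64"), 1)
# array([nan,  1.,  2.])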
# --------------------------------------------------------------------
# Helper functions

# Note: safe_sort is in algorithms.py instead of sorting.py because it is
#  low-dependency, is used in this module, and uses private methods from
#  this module.
def safe_sort(
    values: Index | ArrayLike,
    codes: npt.NDArray[np.intp] | None = None,
    use_na_sentinel: bool = True,
    assume_unique: bool = False,
    verify: bool = True,
) -> AnyArrayLike | tuple[AnyArrayLike, np.ndarray]:
    """
    Sort ``values`` and reorder corresponding ``codes``.

    ``values`` should be unique if ``codes`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``codes`` is not None.
    codes : np.ndarray[intp] or None, default None
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``-1``.
    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NaN values. If False,
        NaN values will be encoded as non-negative integers and will not drop the
        NaN from the uniques of the values.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``codes`` is None.
    verify : bool, default True
        Check if codes are out of bound for the values and put out of bound
        codes equal to ``-1``. If ``verify=False``, it is assumed there
        are no out of bound codes. Ignored when ``codes`` is None.

    Returns
    -------
    ordered : AnyArrayLike
        Sorted ``values``
    new_codes : ndarray
        Reordered ``codes``; returned when ``codes`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``codes`` is neither None
          nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``codes`` is not None and ``values`` contain duplicates.
    """
    if not isinstance(values, (np.ndarray, ABCExtensionArray, ABCIndex)):
        raise TypeError(
            "Only np.ndarray, ExtensionArray, and Index objects are allowed to "
            "be passed to safe_sort as values"
        )

    sorter = None
    ordered: AnyArrayLike

    if (
        not isinstance(values.dtype, ExtensionDtype)
        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
    ):
        ordered = _sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except (TypeError, decimal.InvalidOperation):
            # Previous sorters failed or were not applicable, try `_sort_mixed`
            # which would work, but which fails for special case of 1d arrays
            # with tuples.
            if values.size and isinstance(values[0], tuple):
                # error: Argument 1 to "_sort_tuples" has incompatible type
                # "Union[Index, ExtensionArray, ndarray[Any, Any]]"; expected
                # "ndarray[Any, Any]"
                ordered = _sort_tuples(values)  # type: ignore[arg-type]
            else:
                ordered = _sort_mixed(values)

    # codes:

    if codes is None:
        return ordered

    if not is_list_like(codes):
        raise TypeError(
            "Only list-like objects or None are allowed to "
            "be passed to safe_sort as codes"
        )
    codes = ensure_platform_int(np.asarray(codes))

    if not assume_unique and not len(unique(values)) == len(values):
        raise ValueError("values should be unique if codes is not None")

    if sorter is None:
        # mixed types
        # error: Argument 1 to "_get_hashtable_algo" has incompatible type
        # "Union[Index, ExtensionArray, ndarray[Any, Any]]"; expected
        # "ndarray[Any, Any]"
        hash_klass, values = _get_hashtable_algo(values)  # type: ignore[arg-type]
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    if use_na_sentinel:
        # take_nd is faster, but only works for na_sentinels of -1
        order2 = sorter.argsort()
        if verify:
            mask = (codes < -len(values)) | (codes >= len(values))
            codes[mask] = 0
        else:
            mask = None
        new_codes = take_nd(order2, codes, fill_value=-1)
    else:
        reverse_indexer = np.empty(len(sorter), dtype=int)
        reverse_indexer.put(sorter, np.arange(len(sorter)))
        # Out of bound indices will be masked with `-1` next, so we
        # may deal with them here without performance loss using `mode='wrap'`
        new_codes = reverse_indexer.take(codes, mode="wrap")

        if use_na_sentinel:
            mask = codes == -1
            if verify:
                mask = mask | (codes < -len(values)) | (codes >= len(values))

    if use_na_sentinel and mask is not None:
        np.putmask(new_codes, mask, -1)

    return ordered, ensure_platform_int(new_codes)
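
# Illustrative usage of safe_sort (hedged example, added for exposition):
# values are sorted while codes are reordered so that ordered[new_codes]
# reproduces the original sequence values[codes].
#
# >>> safe_sort(np.array([3, 1, 2]), codes=np.array([0, 1, 2], dtype=np.intp))
# (array([1, 2, 3]), array([2, 0, 1]))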
def _sort_mixed(values) -> AnyArrayLike:
    """order ints before strings before nulls in 1d arrays"""
    str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
    null_pos = np.array([isna(x) for x in values], dtype=bool)
    num_pos = ~str_pos & ~null_pos
    str_argsort = np.argsort(values[str_pos])
    num_argsort = np.argsort(values[num_pos])
    # convert boolean arrays to positional indices, then order by underlying values
    str_locs = str_pos.nonzero()[0].take(str_argsort)
    num_locs = num_pos.nonzero()[0].take(num_argsort)
    null_locs = null_pos.nonzero()[0]
    locs = np.concatenate([num_locs, str_locs, null_locs])
    return values.take(locs)


def _sort_tuples(values: np.ndarray) -> np.ndarray:
    """
    Convert array of tuples (1d) to array of arrays (2d).
    We need to keep the columns separately as they contain different types and
    nans (can't use `np.sort` as it may fail when str and nan are mixed in a
    column as types cannot be compared).
    """
    from pandas.core.internals.construction import to_arrays
    from pandas.core.sorting import lexsort_indexer

    arrays, _ = to_arrays(values, None)
    indexer = lexsort_indexer(arrays, orders=True)
    return values[indexer]


def union_with_duplicates(
    lvals: ArrayLike | Index, rvals: ArrayLike | Index
) -> ArrayLike | Index:
    """
    Extracts the union from lvals and rvals with respect to duplicates and nans in
    both arrays.

    Parameters
    ----------
    lvals : np.ndarray or ExtensionArray
        left values which is ordered in front.
    rvals : np.ndarray or ExtensionArray
        right values ordered after lvals.

    Returns
    -------
    np.ndarray or ExtensionArray
        Containing the unsorted union of both arrays.

    Notes
    -----
    Caller is responsible for ensuring lvals.dtype == rvals.dtype.
    """
    from pandas import Series

    with warnings.catch_warnings():
        # filter warning from object dtype inference; we will end up discarding
        #  the index here, so the deprecation does not affect the end result here.
        warnings.filterwarnings(
            "ignore",
            "The behavior of value_counts with object-dtype is deprecated",
            category=FutureWarning,
        )
        l_count = value_counts_internal(lvals, dropna=False)
        r_count = value_counts_internal(rvals, dropna=False)
    l_count, r_count = l_count.align(r_count, fill_value=0)
    final_count = np.maximum(l_count.values, r_count.values)
    final_count = Series(final_count, index=l_count.index, dtype="int", copy=False)
    if isinstance(lvals, ABCMultiIndex) and isinstance(rvals, ABCMultiIndex):
        unique_vals = lvals.append(rvals).unique()
    else:
        if isinstance(lvals, ABCIndex):
            lvals = lvals._values
        if isinstance(rvals, ABCIndex):
            rvals = rvals._values
        # error: List item 0 has incompatible type "Union[ExtensionArray,
        # ndarray[Any, Any], Index]"; expected "Union[ExtensionArray,
        # ndarray[Any, Any]]"
        combined = concat_compat([lvals, rvals])  # type: ignore[list-item]
        unique_vals = unique(combined)
        unique_vals = ensure_wrapped_if_datetimelike(unique_vals)
    repeats = final_count.reindex(unique_vals).values
    return np.repeat(unique_vals, repeats)
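
# Illustrative usage of union_with_duplicates (hedged example, added for
# exposition): each value is repeated max(count in lvals, count in rvals)
# times.
#
# >>> union_with_duplicates(np.array([1, 1, 2]), np.array([2, 2, 3]))
# array([1, 1, 2, 2, 3])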
""" from pandas import Series with warnings.catch_warnings(): # filter warning from object dtype inference; we will end up discarding # the index here, so the deprecation does not affect the end result here. warnings.filterwarnings( "ignore", "The behavior of value_counts with object-dtype is deprecated", category=FutureWarning, ) l_count = value_counts_internal(lvals, dropna=False) r_count = value_counts_internal(rvals, dropna=False) l_count, r_count = l_count.align(r_count, fill_value=0) final_count = np.maximum(l_count.values, r_count.values) final_count = Series(final_count, index=l_count.index, dtype="int", copy=False) if isinstance(lvals, ABCMultiIndex) and isinstance(rvals, ABCMultiIndex): unique_vals = lvals.append(rvals).unique() else: if isinstance(lvals, ABCIndex): lvals = lvals._values if isinstance(rvals, ABCIndex): rvals = rvals._values # error: List item 0 has incompatible type "Union[ExtensionArray, # ndarray[Any, Any], Index]"; expected "Union[ExtensionArray, # ndarray[Any, Any]]" combined = concat_compat([lvals, rvals]) # type: ignore[list-item] unique_vals = unique(combined) unique_vals = ensure_wrapped_if_datetimelike(unique_vals) repeats = final_count.reindex(unique_vals).values return np.repeat(unique_vals, repeats) def map_array( arr: ArrayLike, mapper, na_action: Literal["ignore"] | None = None, convert: bool = True, ) -> np.ndarray | ExtensionArray | Index: """ Map values using an input mapping or function. Parameters ---------- mapper : function, dict, or Series Mapping correspondence. na_action : {None, 'ignore'}, default None If 'ignore', propagate NA values, without passing them to the mapping correspondence. convert : bool, default True Try to find better dtype for elementwise function results. If False, leave as dtype=object. Returns ------- Union[ndarray, Index, ExtensionArray] The output of the mapping function applied to the array. If the function returns a tuple with more than one element a MultiIndex will be returned. """ if na_action not in (None, "ignore"): msg = f"na_action must either be 'ignore' or None, {na_action} was passed" raise ValueError(msg) # we can fastpath dict/Series to an efficient map # as we know that we are not going to have to yield # python types if is_dict_like(mapper): if isinstance(mapper, dict) and hasattr(mapper, "__missing__"): # If a dictionary subclass defines a default value method, # convert mapper to a lookup function (GH #15999). dict_with_default = mapper mapper = lambda x: dict_with_default[ np.nan if isinstance(x, float) and np.isnan(x) else x ] else: # Dictionary does not have a default. Thus it's safe to # convert to an Series for efficiency. # we specify the keys here to handle the # possibility that they are tuples # The return value of mapping with an empty mapper is # expected to be pd.Series(np.nan, ...). 
As np.nan is # of dtype float64 the return value of this method should # be float64 as well from pandas import Series if len(mapper) == 0: mapper = Series(mapper, dtype=np.float64) else: mapper = Series(mapper) if isinstance(mapper, ABCSeries): if na_action == "ignore": mapper = mapper[mapper.index.notna()] # Since values were input this means we came from either # a dict or a series and mapper should be an index indexer = mapper.index.get_indexer(arr) new_values = take_nd(mapper._values, indexer) return new_values if not len(arr): return arr.copy() # we must convert to python types values = arr.astype(object, copy=False) if na_action is None: return lib.map_infer(values, mapper, convert=convert) else: return lib.map_infer_mask( values, mapper, mask=isna(values).view(np.uint8), convert=convert )
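
# Illustrative usage of map_array (hedged example, added for exposition):
# dict mappers take the Series/get_indexer fast path; keys missing from the
# mapper become NaN, upcasting the result to float64.
#
# >>> map_array(np.array(["a", "b", "c"], dtype=object), {"a": 1, "b": 2})
# array([ 1.,  2., nan])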