""" Utility functions and objects for implementing the interchange API. """ from __future__ import annotations import typing import numpy as np from pandas._libs import lib from pandas.core.dtypes.dtypes import ( ArrowDtype, CategoricalDtype, DatetimeTZDtype, ) import pandas as pd if typing.TYPE_CHECKING: from pandas._typing import DtypeObj # Maps str(pyarrow.DataType) = C type format string # Currently, no pyarrow API for this PYARROW_CTYPES = { "null": "n", "bool": "b", "uint8": "C", "uint16": "S", "uint32": "I", "uint64": "L", "int8": "c", "int16": "S", "int32": "i", "int64": "l", "halffloat": "e", # float16 "float": "f", # float32 "double": "g", # float64 "string": "u", "large_string": "U", "binary": "z", "time32[s]": "tts", "time32[ms]": "ttm", "time64[us]": "ttu", "time64[ns]": "ttn", "date32[day]": "tdD", "date64[ms]": "tdm", "timestamp[s]": "tss:", "timestamp[ms]": "tsm:", "timestamp[us]": "tsu:", "timestamp[ns]": "tsn:", "duration[s]": "tDs", "duration[ms]": "tDm", "duration[us]": "tDu", "duration[ns]": "tDn", } class ArrowCTypes: """ Enum for Apache Arrow C type format strings. The Arrow C data interface: https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings """ NULL = "n" BOOL = "b" INT8 = "c" UINT8 = "C" INT16 = "s" UINT16 = "S" INT32 = "i" UINT32 = "I" INT64 = "l" UINT64 = "L" FLOAT16 = "e" FLOAT32 = "f" FLOAT64 = "g" STRING = "u" # utf-8 LARGE_STRING = "U" # utf-8 DATE32 = "tdD" DATE64 = "tdm" # Resoulution: # - seconds -> 's' # - milliseconds -> 'm' # - microseconds -> 'u' # - nanoseconds -> 'n' TIMESTAMP = "ts{resolution}:{tz}" TIME = "tt{resolution}" class Endianness: """Enum indicating the byte-order of a data-type.""" LITTLE = "<" BIG = ">" NATIVE = "=" NA = "|" def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: """ Represent pandas `dtype` as a format string in Apache Arrow C notation. Parameters ---------- dtype : np.dtype Datatype of pandas DataFrame to represent. Returns ------- str Format string in Apache Arrow C notation of the given `dtype`. """ if isinstance(dtype, CategoricalDtype): return ArrowCTypes.INT64 elif dtype == np.dtype("O"): return ArrowCTypes.STRING elif isinstance(dtype, ArrowDtype): import pyarrow as pa pa_type = dtype.pyarrow_dtype if pa.types.is_decimal(pa_type): return f"d:{pa_type.precision},{pa_type.scale}" elif pa.types.is_timestamp(pa_type) and pa_type.tz is not None: return f"ts{pa_type.unit[0]}:{pa_type.tz}" format_str = PYARROW_CTYPES.get(str(pa_type), None) if format_str is not None: return format_str format_str = getattr(ArrowCTypes, dtype.name.upper(), None) if format_str is not None: return format_str if lib.is_np_dtype(dtype, "M"): # Selecting the first char of resolution string: # dtype.str -> ' 'n' resolution = np.datetime_data(dtype)[0][0] return ArrowCTypes.TIMESTAMP.format(resolution=resolution, tz="") elif isinstance(dtype, DatetimeTZDtype): return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz) elif isinstance(dtype, pd.BooleanDtype): return ArrowCTypes.BOOL raise NotImplementedError( f"Conversion of {dtype} to Arrow C format string is not implemented." ) def maybe_rechunk(series: pd.Series, *, allow_copy: bool) -> pd.Series | None: """ Rechunk a multi-chunk pyarrow array into a single-chunk array, if necessary. - Returns `None` if the input series is not backed by a multi-chunk pyarrow array (and so doesn't need rechunking) - Returns a single-chunk-backed-Series if the input is backed by a multi-chunk pyarrow array and `allow_copy` is `True`. - Raises a `RuntimeError` if `allow_copy` is `False` and input is a based by a multi-chunk pyarrow array. """ if not isinstance(series.dtype, pd.ArrowDtype): return None chunked_array = series.array._pa_array # type: ignore[attr-defined] if len(chunked_array.chunks) == 1: return None if not allow_copy: raise RuntimeError( "Found multi-chunk pyarrow array, but `allow_copy` is False. " "Please rechunk the array before calling this function, or set " "`allow_copy=True`." ) arr = chunked_array.combine_chunks() return pd.Series(arr, dtype=series.dtype, name=series.name, index=series.index)