"""Accessors for arrow-backed data.""" from __future__ import annotations from abc import ( ABCMeta, abstractmethod, ) from typing import ( TYPE_CHECKING, cast, ) from pandas.compat import ( pa_version_under10p1, pa_version_under11p0, ) from pandas.core.dtypes.common import is_list_like if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc from pandas.core.dtypes.dtypes import ArrowDtype if TYPE_CHECKING: from collections.abc import Iterator from pandas import ( DataFrame, Series, ) class ArrowAccessor(metaclass=ABCMeta): @abstractmethod def __init__(self, data, validation_msg: str) -> None: self._data = data self._validation_msg = validation_msg self._validate(data) @abstractmethod def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: pass def _validate(self, data): dtype = data.dtype if not isinstance(dtype, ArrowDtype): # Raise AttributeError so that inspect can handle non-struct Series. raise AttributeError(self._validation_msg.format(dtype=dtype)) if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype): # Raise AttributeError so that inspect can handle invalid Series. raise AttributeError(self._validation_msg.format(dtype=dtype)) @property def _pa_array(self): return self._data.array._pa_array class ListAccessor(ArrowAccessor): """ Accessor object for list data properties of the Series values. Parameters ---------- data : Series Series containing Arrow list data. """ def __init__(self, data=None) -> None: super().__init__( data, validation_msg="Can only use the '.list' accessor with " "'list[pyarrow]' dtype, not {dtype}.", ) def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: return ( pa.types.is_list(pyarrow_dtype) or pa.types.is_fixed_size_list(pyarrow_dtype) or pa.types.is_large_list(pyarrow_dtype) ) def len(self) -> Series: """ Return the length of each list in the Series. Returns ------- pandas.Series The length of each list. Examples -------- >>> import pyarrow as pa >>> s = pd.Series( ... [ ... [1, 2, 3], ... [3], ... ], ... dtype=pd.ArrowDtype(pa.list_( ... pa.int64() ... )) ... ) >>> s.list.len() 0 3 1 1 dtype: int32[pyarrow] """ from pandas import Series value_lengths = pc.list_value_length(self._pa_array) return Series(value_lengths, dtype=ArrowDtype(value_lengths.type)) def __getitem__(self, key: int | slice) -> Series: """ Index or slice lists in the Series. Parameters ---------- key : int | slice Index or slice of indices to access from each list. Returns ------- pandas.Series The list at requested index. Examples -------- >>> import pyarrow as pa >>> s = pd.Series( ... [ ... [1, 2, 3], ... [3], ... ], ... dtype=pd.ArrowDtype(pa.list_( ... pa.int64() ... )) ... ) >>> s.list[0] 0 1 1 3 dtype: int64[pyarrow] """ from pandas import Series if isinstance(key, int): # TODO: Support negative key but pyarrow does not allow # element index to be an array. # if key < 0: # key = pc.add(key, pc.list_value_length(self._pa_array)) element = pc.list_element(self._pa_array, key) return Series(element, dtype=ArrowDtype(element.type)) elif isinstance(key, slice): if pa_version_under11p0: raise NotImplementedError( f"List slice not supported by pyarrow {pa.__version__}." ) # TODO: Support negative start/stop/step, ideally this would be added # upstream in pyarrow. start, stop, step = key.start, key.stop, key.step if start is None: # TODO: When adding negative step support # this should be setto last element of array # when step is negative. start = 0 if step is None: step = 1 sliced = pc.list_slice(self._pa_array, start, stop, step) return Series(sliced, dtype=ArrowDtype(sliced.type)) else: raise ValueError(f"key must be an int or slice, got {type(key).__name__}") def __iter__(self) -> Iterator: raise TypeError(f"'{type(self).__name__}' object is not iterable") def flatten(self) -> Series: """ Flatten list values. Returns ------- pandas.Series The data from all lists in the series flattened. Examples -------- >>> import pyarrow as pa >>> s = pd.Series( ... [ ... [1, 2, 3], ... [3], ... ], ... dtype=pd.ArrowDtype(pa.list_( ... pa.int64() ... )) ... ) >>> s.list.flatten() 0 1 1 2 2 3 3 3 dtype: int64[pyarrow] """ from pandas import Series flattened = pc.list_flatten(self._pa_array) return Series(flattened, dtype=ArrowDtype(flattened.type)) class StructAccessor(ArrowAccessor): """ Accessor object for structured data properties of the Series values. Parameters ---------- data : Series Series containing Arrow struct data. """ def __init__(self, data=None) -> None: super().__init__( data, validation_msg=( "Can only use the '.struct' accessor with 'struct[pyarrow]' " "dtype, not {dtype}." ), ) def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: return pa.types.is_struct(pyarrow_dtype) @property def dtypes(self) -> Series: """ Return the dtype object of each child field of the struct. Returns ------- pandas.Series The data type of each child field. Examples -------- >>> import pyarrow as pa >>> s = pd.Series( ... [ ... {"version": 1, "project": "pandas"}, ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], ... dtype=pd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) >>> s.struct.dtypes version int64[pyarrow] project string[pyarrow] dtype: object """ from pandas import ( Index, Series, ) pa_type = self._data.dtype.pyarrow_dtype types = [ArrowDtype(struct.type) for struct in pa_type] names = [struct.name for struct in pa_type] return Series(types, index=Index(names)) def field( self, name_or_index: list[str] | list[bytes] | list[int] | pc.Expression | bytes | str | int, ) -> Series: """ Extract a child field of a struct as a Series. Parameters ---------- name_or_index : str | bytes | int | expression | list Name or index of the child field to extract. For list-like inputs, this will index into a nested struct. Returns ------- pandas.Series The data corresponding to the selected child field. See Also -------- Series.struct.explode : Return all child fields as a DataFrame. Notes ----- The name of the resulting Series will be set using the following rules: - For string, bytes, or integer `name_or_index` (or a list of these, for a nested selection), the Series name is set to the selected field's name. - For a :class:`pyarrow.compute.Expression`, this is set to the string form of the expression. - For list-like `name_or_index`, the name will be set to the name of the final field selected. Examples -------- >>> import pyarrow as pa >>> s = pd.Series( ... [ ... {"version": 1, "project": "pandas"}, ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], ... dtype=pd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) Extract by field name. >>> s.struct.field("project") 0 pandas 1 pandas 2 numpy Name: project, dtype: string[pyarrow] Extract by field index. >>> s.struct.field(0) 0 1 1 2 2 1 Name: version, dtype: int64[pyarrow] Or an expression >>> import pyarrow.compute as pc >>> s.struct.field(pc.field("project")) 0 pandas 1 pandas 2 numpy Name: project, dtype: string[pyarrow] For nested struct types, you can pass a list of values to index multiple levels: >>> version_type = pa.struct([ ... ("major", pa.int64()), ... ("minor", pa.int64()), ... ]) >>> s = pd.Series( ... [ ... {"version": {"major": 1, "minor": 5}, "project": "pandas"}, ... {"version": {"major": 2, "minor": 1}, "project": "pandas"}, ... {"version": {"major": 1, "minor": 26}, "project": "numpy"}, ... ], ... dtype=pd.ArrowDtype(pa.struct( ... [("version", version_type), ("project", pa.string())] ... )) ... ) >>> s.struct.field(["version", "minor"]) 0 5 1 1 2 26 Name: minor, dtype: int64[pyarrow] >>> s.struct.field([0, 0]) 0 1 1 2 2 1 Name: major, dtype: int64[pyarrow] """ from pandas import Series def get_name( level_name_or_index: list[str] | list[bytes] | list[int] | pc.Expression | bytes | str | int, data: pa.ChunkedArray, ): if isinstance(level_name_or_index, int): name = data.type.field(level_name_or_index).name elif isinstance(level_name_or_index, (str, bytes)): name = level_name_or_index elif isinstance(level_name_or_index, pc.Expression): name = str(level_name_or_index) elif is_list_like(level_name_or_index): # For nested input like [2, 1, 2] # iteratively get the struct and field name. The last # one is used for the name of the index. level_name_or_index = list(reversed(level_name_or_index)) selected = data while level_name_or_index: # we need the cast, otherwise mypy complains about # getting ints, bytes, or str here, which isn't possible. level_name_or_index = cast(list, level_name_or_index) name_or_index = level_name_or_index.pop() name = get_name(name_or_index, selected) selected = selected.type.field(selected.type.get_field_index(name)) name = selected.name else: raise ValueError( "name_or_index must be an int, str, bytes, " "pyarrow.compute.Expression, or list of those" ) return name pa_arr = self._data.array._pa_array name = get_name(name_or_index, pa_arr) field_arr = pc.struct_field(pa_arr, name_or_index) return Series( field_arr, dtype=ArrowDtype(field_arr.type), index=self._data.index, name=name, ) def explode(self) -> DataFrame: """ Extract all child fields of a struct as a DataFrame. Returns ------- pandas.DataFrame The data corresponding to all child fields. See Also -------- Series.struct.field : Return a single child field as a Series. Examples -------- >>> import pyarrow as pa >>> s = pd.Series( ... [ ... {"version": 1, "project": "pandas"}, ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], ... dtype=pd.ArrowDtype(pa.struct( ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) >>> s.struct.explode() version project 0 1 pandas 1 2 pandas 2 1 numpy """ from pandas import concat pa_type = self._pa_array.type return concat( [self.field(i) for i in range(pa_type.num_fields)], axis="columns" )