# Disable type checking for this module since numba's internals # are not typed, and we use numba's internals via its extension API # mypy: ignore-errors """ Utility classes/functions to let numba recognize pandas Index/Series/DataFrame Mostly vendored from https://github.com/numba/numba/blob/main/numba/tests/pdlike_usecase.py """ from __future__ import annotations from contextlib import contextmanager import operator import numba from numba import types from numba.core import cgutils from numba.core.datamodel import models from numba.core.extending import ( NativeValue, box, lower_builtin, make_attribute_wrapper, overload, overload_attribute, overload_method, register_model, type_callable, typeof_impl, unbox, ) from numba.core.imputils import impl_ret_borrowed import numpy as np from pandas._libs import lib from pandas.core.indexes.base import Index from pandas.core.indexing import _iLocIndexer from pandas.core.internals import SingleBlockManager from pandas.core.series import Series # Helper function to hack around fact that Index casts numpy string dtype to object # # Idea is to set an attribute on a Index called _numba_data # that is the original data, or the object data casted to numpy string dtype, # with a context manager that is unset afterwards @contextmanager def set_numba_data(index: Index): numba_data = index._data if numba_data.dtype == object: if not lib.is_string_array(numba_data): raise ValueError( "The numba engine only supports using string or numeric column names" ) numba_data = numba_data.astype("U") try: index._numba_data = numba_data yield index finally: del index._numba_data # TODO: Range index support # (this currently lowers OK, but does not round-trip) class IndexType(types.Type): """ The type class for Index objects. """ def __init__(self, dtype, layout, pyclass: any) -> None: self.pyclass = pyclass name = f"index({dtype}, {layout})" self.dtype = dtype self.layout = layout super().__init__(name) @property def key(self): return self.pyclass, self.dtype, self.layout @property def as_array(self): return types.Array(self.dtype, 1, self.layout) def copy(self, dtype=None, ndim: int = 1, layout=None): assert ndim == 1 if dtype is None: dtype = self.dtype layout = layout or self.layout return type(self)(dtype, layout, self.pyclass) class SeriesType(types.Type): """ The type class for Series objects. """ def __init__(self, dtype, index, namety) -> None: assert isinstance(index, IndexType) self.dtype = dtype self.index = index self.values = types.Array(self.dtype, 1, "C") self.namety = namety name = f"series({dtype}, {index}, {namety})" super().__init__(name) @property def key(self): return self.dtype, self.index, self.namety @property def as_array(self): return self.values def copy(self, dtype=None, ndim: int = 1, layout: str = "C"): assert ndim == 1 assert layout == "C" if dtype is None: dtype = self.dtype return type(self)(dtype, self.index, self.namety) @typeof_impl.register(Index) def typeof_index(val, c): """ This will assume that only strings are in object dtype index. (you should check this before this gets lowered down to numba) """ # arrty = typeof_impl(val._data, c) arrty = typeof_impl(val._numba_data, c) assert arrty.ndim == 1 return IndexType(arrty.dtype, arrty.layout, type(val)) @typeof_impl.register(Series) def typeof_series(val, c): index = typeof_impl(val.index, c) arrty = typeof_impl(val.values, c) namety = typeof_impl(val.name, c) assert arrty.ndim == 1 assert arrty.layout == "C" return SeriesType(arrty.dtype, index, namety) @type_callable(Series) def type_series_constructor(context): def typer(data, index, name=None): if isinstance(index, IndexType) and isinstance(data, types.Array): assert data.ndim == 1 if name is None: name = types.intp return SeriesType(data.dtype, index, name) return typer @type_callable(Index) def type_index_constructor(context): def typer(data, hashmap=None): if isinstance(data, types.Array): assert data.layout == "C" assert data.ndim == 1 assert hashmap is None or isinstance(hashmap, types.DictType) return IndexType(data.dtype, layout=data.layout, pyclass=Index) return typer # Backend extensions for Index and Series and Frame @register_model(IndexType) class IndexModel(models.StructModel): def __init__(self, dmm, fe_type) -> None: # We don't want the numpy string scalar type in our hashmap members = [ ("data", fe_type.as_array), # This is an attempt to emulate our hashtable code with a numba # typed dict # It maps from values in the index to their integer positions in the array ("hashmap", types.DictType(fe_type.dtype, types.intp)), # Pointer to the Index object this was created from, or that it # boxes to # https://numba.discourse.group/t/qst-how-to-cache-the-boxing-of-an-object/2128/2?u=lithomas1 ("parent", types.pyobject), ] models.StructModel.__init__(self, dmm, fe_type, members) @register_model(SeriesType) class SeriesModel(models.StructModel): def __init__(self, dmm, fe_type) -> None: members = [ ("index", fe_type.index), ("values", fe_type.as_array), ("name", fe_type.namety), ] models.StructModel.__init__(self, dmm, fe_type, members) make_attribute_wrapper(IndexType, "data", "_data") make_attribute_wrapper(IndexType, "hashmap", "hashmap") make_attribute_wrapper(SeriesType, "index", "index") make_attribute_wrapper(SeriesType, "values", "values") make_attribute_wrapper(SeriesType, "name", "name") @lower_builtin(Series, types.Array, IndexType) def pdseries_constructor(context, builder, sig, args): data, index = args series = cgutils.create_struct_proxy(sig.return_type)(context, builder) series.index = index series.values = data series.name = context.get_constant(types.intp, 0) return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue()) @lower_builtin(Series, types.Array, IndexType, types.intp) @lower_builtin(Series, types.Array, IndexType, types.float64) @lower_builtin(Series, types.Array, IndexType, types.unicode_type) def pdseries_constructor_with_name(context, builder, sig, args): data, index, name = args series = cgutils.create_struct_proxy(sig.return_type)(context, builder) series.index = index series.values = data series.name = name return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue()) @lower_builtin(Index, types.Array, types.DictType, types.pyobject) def index_constructor_2arg(context, builder, sig, args): (data, hashmap, parent) = args index = cgutils.create_struct_proxy(sig.return_type)(context, builder) index.data = data index.hashmap = hashmap index.parent = parent return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue()) @lower_builtin(Index, types.Array, types.DictType) def index_constructor_2arg_parent(context, builder, sig, args): # Basically same as index_constructor_1arg, but also lets you specify the # parent object (data, hashmap) = args index = cgutils.create_struct_proxy(sig.return_type)(context, builder) index.data = data index.hashmap = hashmap return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue()) @lower_builtin(Index, types.Array) def index_constructor_1arg(context, builder, sig, args): from numba.typed import Dict key_type = sig.return_type.dtype value_type = types.intp def index_impl(data): return Index(data, Dict.empty(key_type, value_type)) return context.compile_internal(builder, index_impl, sig, args) # Helper to convert the unicodecharseq (numpy string scalar) into a unicode_type # (regular string) def maybe_cast_str(x): # Dummy function that numba can overload pass @overload(maybe_cast_str) def maybe_cast_str_impl(x): """Converts numba UnicodeCharSeq (numpy string scalar) -> unicode type (string). Is a no-op for other types.""" if isinstance(x, types.UnicodeCharSeq): return lambda x: str(x) else: return lambda x: x @unbox(IndexType) def unbox_index(typ, obj, c): """ Convert a Index object to a native structure. Note: Object dtype is not allowed here """ data_obj = c.pyapi.object_getattr_string(obj, "_numba_data") index = cgutils.create_struct_proxy(typ)(c.context, c.builder) # If we see an object array, assume its been validated as only containing strings # We still need to do the conversion though index.data = c.unbox(typ.as_array, data_obj).value typed_dict_obj = c.pyapi.unserialize(c.pyapi.serialize_object(numba.typed.Dict)) # Create an empty typed dict in numba for the hashmap for indexing # equiv of numba.typed.Dict.empty(typ.dtype, types.intp) arr_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.dtype)) intp_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(types.intp)) hashmap_obj = c.pyapi.call_method( typed_dict_obj, "empty", (arr_type_obj, intp_type_obj) ) index.hashmap = c.unbox(types.DictType(typ.dtype, types.intp), hashmap_obj).value # Set the parent for speedy boxing. index.parent = obj # Decrefs c.pyapi.decref(data_obj) c.pyapi.decref(arr_type_obj) c.pyapi.decref(intp_type_obj) c.pyapi.decref(typed_dict_obj) return NativeValue(index._getvalue()) @unbox(SeriesType) def unbox_series(typ, obj, c): """ Convert a Series object to a native structure. """ index_obj = c.pyapi.object_getattr_string(obj, "index") values_obj = c.pyapi.object_getattr_string(obj, "values") name_obj = c.pyapi.object_getattr_string(obj, "name") series = cgutils.create_struct_proxy(typ)(c.context, c.builder) series.index = c.unbox(typ.index, index_obj).value series.values = c.unbox(typ.values, values_obj).value series.name = c.unbox(typ.namety, name_obj).value # Decrefs c.pyapi.decref(index_obj) c.pyapi.decref(values_obj) c.pyapi.decref(name_obj) return NativeValue(series._getvalue()) @box(IndexType) def box_index(typ, val, c): """ Convert a native index structure to a Index object. If our native index is of a numpy string dtype, we'll cast it to object. """ # First build a Numpy array object, then wrap it in a Index index = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) res = cgutils.alloca_once_value(c.builder, index.parent) # Does parent exist? # (it means already boxed once, or Index same as original df.index or df.columns) # xref https://github.com/numba/numba/blob/596e8a55334cc46854e3192766e643767bd7c934/numba/core/boxing.py#L593C17-L593C17 with c.builder.if_else(cgutils.is_not_null(c.builder, index.parent)) as ( has_parent, otherwise, ): with has_parent: c.pyapi.incref(index.parent) with otherwise: # TODO: preserve the original class for the index # Also need preserve the name of the Index # class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass)) class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index)) array_obj = c.box(typ.as_array, index.data) if isinstance(typ.dtype, types.UnicodeCharSeq): # We converted to numpy string dtype, convert back # to object since _simple_new won't do that for uss object_str_obj = c.pyapi.unserialize(c.pyapi.serialize_object("object")) array_obj = c.pyapi.call_method(array_obj, "astype", (object_str_obj,)) c.pyapi.decref(object_str_obj) # this is basically Index._simple_new(array_obj, name_obj) in python index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,)) index.parent = index_obj c.builder.store(index_obj, res) # Decrefs c.pyapi.decref(class_obj) c.pyapi.decref(array_obj) return c.builder.load(res) @box(SeriesType) def box_series(typ, val, c): """ Convert a native series structure to a Series object. """ series = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) series_const_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Series._from_mgr)) mgr_const_obj = c.pyapi.unserialize( c.pyapi.serialize_object(SingleBlockManager.from_array) ) index_obj = c.box(typ.index, series.index) array_obj = c.box(typ.as_array, series.values) name_obj = c.box(typ.namety, series.name) # This is basically equivalent of # pd.Series(data=array_obj, index=index_obj) # To improve perf, we will construct the Series from a manager # object to avoid checks. # We'll also set the name attribute manually to avoid validation mgr_obj = c.pyapi.call_function_objargs( mgr_const_obj, ( array_obj, index_obj, ), ) mgr_axes_obj = c.pyapi.object_getattr_string(mgr_obj, "axes") # Series._constructor_from_mgr(mgr, axes) series_obj = c.pyapi.call_function_objargs( series_const_obj, (mgr_obj, mgr_axes_obj) ) c.pyapi.object_setattr_string(series_obj, "_name", name_obj) # Decrefs c.pyapi.decref(series_const_obj) c.pyapi.decref(mgr_axes_obj) c.pyapi.decref(mgr_obj) c.pyapi.decref(mgr_const_obj) c.pyapi.decref(index_obj) c.pyapi.decref(array_obj) c.pyapi.decref(name_obj) return series_obj # Add common series reductions (e.g. mean, sum), # and also add common binops (e.g. add, sub, mul, div) def generate_series_reduction(ser_reduction, ser_method): @overload_method(SeriesType, ser_reduction) def series_reduction(series): def series_reduction_impl(series): return ser_method(series.values) return series_reduction_impl return series_reduction def generate_series_binop(binop): @overload(binop) def series_binop(series1, value): if isinstance(series1, SeriesType): if isinstance(value, SeriesType): def series_binop_impl(series1, series2): # TODO: Check index matching? return Series( binop(series1.values, series2.values), series1.index, series1.name, ) return series_binop_impl else: def series_binop_impl(series1, value): return Series( binop(series1.values, value), series1.index, series1.name ) return series_binop_impl return series_binop series_reductions = [ ("sum", np.sum), ("mean", np.mean), # Disabled due to discrepancies between numba std. dev # and pandas std. dev (no way to specify dof) # ("std", np.std), # ("var", np.var), ("min", np.min), ("max", np.max), ] for reduction, reduction_method in series_reductions: generate_series_reduction(reduction, reduction_method) series_binops = [operator.add, operator.sub, operator.mul, operator.truediv] for ser_binop in series_binops: generate_series_binop(ser_binop) # get_loc on Index @overload_method(IndexType, "get_loc") def index_get_loc(index, item): def index_get_loc_impl(index, item): # Initialize the hash table if not initialized if len(index.hashmap) == 0: for i, val in enumerate(index._data): index.hashmap[val] = i return index.hashmap[item] return index_get_loc_impl # Indexing for Series/Index @overload(operator.getitem) def series_indexing(series, item): if isinstance(series, SeriesType): def series_getitem(series, item): loc = series.index.get_loc(item) return series.iloc[loc] return series_getitem @overload(operator.getitem) def index_indexing(index, idx): if isinstance(index, IndexType): def index_getitem(index, idx): return index._data[idx] return index_getitem class IlocType(types.Type): def __init__(self, obj_type) -> None: self.obj_type = obj_type name = f"iLocIndexer({obj_type})" super().__init__(name=name) @property def key(self): return self.obj_type @typeof_impl.register(_iLocIndexer) def typeof_iloc(val, c): objtype = typeof_impl(val.obj, c) return IlocType(objtype) @type_callable(_iLocIndexer) def type_iloc_constructor(context): def typer(obj): if isinstance(obj, SeriesType): return IlocType(obj) return typer @lower_builtin(_iLocIndexer, SeriesType) def iloc_constructor(context, builder, sig, args): (obj,) = args iloc_indexer = cgutils.create_struct_proxy(sig.return_type)(context, builder) iloc_indexer.obj = obj return impl_ret_borrowed( context, builder, sig.return_type, iloc_indexer._getvalue() ) @register_model(IlocType) class ILocModel(models.StructModel): def __init__(self, dmm, fe_type) -> None: members = [("obj", fe_type.obj_type)] models.StructModel.__init__(self, dmm, fe_type, members) make_attribute_wrapper(IlocType, "obj", "obj") @overload_attribute(SeriesType, "iloc") def series_iloc(series): def get(series): return _iLocIndexer(series) return get @overload(operator.getitem) def iloc_getitem(iloc_indexer, i): if isinstance(iloc_indexer, IlocType): def getitem_impl(iloc_indexer, i): return iloc_indexer.obj.values[i] return getitem_impl