"""Tests for Table Schema integration.""" from collections import OrderedDict from io import StringIO import json import numpy as np import pytest from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, PeriodDtype, ) import pandas as pd from pandas import DataFrame import pandas._testing as tm from pandas.io.json._table_schema import ( as_json_table_type, build_table_schema, convert_json_field_to_pandas_type, convert_pandas_type_to_json_field, set_default_names, ) @pytest.fixture def df_schema(): return DataFrame( { "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], "C": pd.date_range("2016-01-01", freq="d", periods=4), "D": pd.timedelta_range("1h", periods=4, freq="min"), }, index=pd.Index(range(4), name="idx"), ) @pytest.fixture def df_table(): return DataFrame( { "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], "C": pd.date_range("2016-01-01", freq="d", periods=4), "D": pd.timedelta_range("1h", periods=4, freq="min"), "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])), "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)), "G": [1.0, 2.0, 3, 4.0], "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"), }, index=pd.Index(range(4), name="idx"), ) class TestBuildSchema: def test_build_table_schema(self, df_schema, using_infer_string): result = build_table_schema(df_schema, version=False) expected = { "fields": [ {"name": "idx", "type": "integer"}, {"name": "A", "type": "integer"}, {"name": "B", "type": "string"}, {"name": "C", "type": "datetime"}, {"name": "D", "type": "duration"}, ], "primaryKey": ["idx"], } if using_infer_string: expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "string"} assert result == expected result = build_table_schema(df_schema) assert "pandas_version" in result def test_series(self): s = pd.Series([1, 2, 3], name="foo") result = build_table_schema(s, version=False) expected = { "fields": [ {"name": "index", "type": "integer"}, {"name": "foo", "type": "integer"}, ], "primaryKey": ["index"], } assert result == expected result = build_table_schema(s) assert "pandas_version" in result def test_series_unnamed(self): result = build_table_schema(pd.Series([1, 2, 3]), version=False) expected = { "fields": [ {"name": "index", "type": "integer"}, {"name": "values", "type": "integer"}, ], "primaryKey": ["index"], } assert result == expected def test_multiindex(self, df_schema, using_infer_string): df = df_schema idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)]) df.index = idx result = build_table_schema(df, version=False) expected = { "fields": [ {"name": "level_0", "type": "string"}, {"name": "level_1", "type": "integer"}, {"name": "A", "type": "integer"}, {"name": "B", "type": "string"}, {"name": "C", "type": "datetime"}, {"name": "D", "type": "duration"}, ], "primaryKey": ["level_0", "level_1"], } if using_infer_string: expected["fields"][0] = { "name": "level_0", "type": "any", "extDtype": "string", } expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "string"} assert result == expected df.index.names = ["idx0", None] expected["fields"][0]["name"] = "idx0" expected["primaryKey"] = ["idx0", "level_1"] result = build_table_schema(df, version=False) assert result == expected class TestTableSchemaType: @pytest.mark.parametrize("int_type", [int, np.int16, np.int32, np.int64]) def test_as_json_table_type_int_data(self, int_type): int_data = [1, 2, 3] assert as_json_table_type(np.array(int_data, dtype=int_type).dtype) == "integer" @pytest.mark.parametrize("float_type", [float, np.float16, np.float32, np.float64]) def test_as_json_table_type_float_data(self, float_type): float_data = [1.0, 2.0, 3.0] assert ( as_json_table_type(np.array(float_data, dtype=float_type).dtype) == "number" ) @pytest.mark.parametrize("bool_type", [bool, np.bool_]) def test_as_json_table_type_bool_data(self, bool_type): bool_data = [True, False] assert ( as_json_table_type(np.array(bool_data, dtype=bool_type).dtype) == "boolean" ) @pytest.mark.parametrize( "date_data", [ pd.to_datetime(["2016"]), pd.to_datetime(["2016"], utc=True), pd.Series(pd.to_datetime(["2016"])), pd.Series(pd.to_datetime(["2016"], utc=True)), pd.period_range("2016", freq="Y", periods=3), ], ) def test_as_json_table_type_date_data(self, date_data): assert as_json_table_type(date_data.dtype) == "datetime" @pytest.mark.parametrize( "str_data", [pd.Series(["a", "b"], dtype=object), pd.Index(["a", "b"], dtype=object)], ) def test_as_json_table_type_string_data(self, str_data): assert as_json_table_type(str_data.dtype) == "string" @pytest.mark.parametrize( "cat_data", [ pd.Categorical(["a"]), pd.Categorical([1]), pd.Series(pd.Categorical([1])), pd.CategoricalIndex([1]), pd.Categorical([1]), ], ) def test_as_json_table_type_categorical_data(self, cat_data): assert as_json_table_type(cat_data.dtype) == "any" # ------ # dtypes # ------ @pytest.mark.parametrize("int_dtype", [int, np.int16, np.int32, np.int64]) def test_as_json_table_type_int_dtypes(self, int_dtype): assert as_json_table_type(int_dtype) == "integer" @pytest.mark.parametrize("float_dtype", [float, np.float16, np.float32, np.float64]) def test_as_json_table_type_float_dtypes(self, float_dtype): assert as_json_table_type(float_dtype) == "number" @pytest.mark.parametrize("bool_dtype", [bool, np.bool_]) def test_as_json_table_type_bool_dtypes(self, bool_dtype): assert as_json_table_type(bool_dtype) == "boolean" @pytest.mark.parametrize( "date_dtype", [ np.dtype("