import datetime from datetime import timedelta from decimal import Decimal from io import ( BytesIO, StringIO, ) import json import os import sys import time import numpy as np import pytest from pandas._config import using_pyarrow_string_dtype from pandas.compat import IS64 import pandas.util._test_decorators as td import pandas as pd from pandas import ( NA, DataFrame, DatetimeIndex, Index, RangeIndex, Series, Timestamp, date_range, read_json, ) import pandas._testing as tm from pandas.core.arrays import ( ArrowStringArray, StringArray, ) from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics from pandas.io.json import ujson_dumps def test_literal_json_deprecation(): # PR 53409 expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) jsonl = """{"a": 1, "b": 2} {"a": 3, "b": 4} {"a": 5, "b": 6} {"a": 7, "b": 8}""" msg = ( "Passing literal json to 'read_json' is deprecated and " "will be removed in a future version. To read from a " "literal string, wrap it in a 'StringIO' object." ) with tm.assert_produces_warning(FutureWarning, match=msg): try: read_json(jsonl, lines=False) except ValueError: pass with tm.assert_produces_warning(FutureWarning, match=msg): read_json(expected.to_json(), lines=False) with tm.assert_produces_warning(FutureWarning, match=msg): result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) tm.assert_frame_equal(result, expected) with tm.assert_produces_warning(FutureWarning, match=msg): try: result = read_json( '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n', lines=False, ) except ValueError: pass with tm.assert_produces_warning(FutureWarning, match=msg): try: result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=False) except ValueError: pass tm.assert_frame_equal(result, expected) def assert_json_roundtrip_equal(result, expected, orient): if orient in ("records", "values"): expected = expected.reset_index(drop=True) if orient == "values": expected.columns = range(len(expected.columns)) tm.assert_frame_equal(result, expected) class TestPandasContainer: @pytest.fixture def categorical_frame(self): data = { c: np.random.default_rng(i).standard_normal(30) for i, c in enumerate(list("ABCD")) } cat = ["bah"] * 5 + ["bar"] * 5 + ["baz"] * 5 + ["foo"] * 15 data["E"] = list(reversed(cat)) data["sort"] = np.arange(30, dtype="int64") return DataFrame(data, index=pd.CategoricalIndex(cat, name="E")) @pytest.fixture def datetime_series(self): # Same as usual datetime_series, but with index freq set to None, # since that doesn't round-trip, see GH#33711 ser = Series( 1.1 * np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), name="ts", ) ser.index = ser.index._with_freq(None) return ser @pytest.fixture def datetime_frame(self): # Same as usual datetime_frame, but with index freq set to None, # since that doesn't round-trip, see GH#33711 df = DataFrame( np.random.default_rng(2).standard_normal((30, 4)), columns=Index(list("ABCD"), dtype=object), index=date_range("2000-01-01", periods=30, freq="B"), ) df.index = df.index._with_freq(None) return df def test_frame_double_encoded_labels(self, orient): df = DataFrame( [["a", "b"], ["c", "d"]], index=['index " 1', "index / 2"], columns=["a \\ b", "y / z"], ) data = StringIO(df.to_json(orient=orient)) result = read_json(data, orient=orient) expected = df.copy() assert_json_roundtrip_equal(result, expected, orient) @pytest.mark.parametrize("orient", ["split", "records", "values"]) def test_frame_non_unique_index(self, orient): df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 1], columns=["x", "y"]) data = StringIO(df.to_json(orient=orient)) result = read_json(data, orient=orient) expected = df.copy() assert_json_roundtrip_equal(result, expected, orient) @pytest.mark.parametrize("orient", ["index", "columns"]) def test_frame_non_unique_index_raises(self, orient): df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 1], columns=["x", "y"]) msg = f"DataFrame index must be unique for orient='{orient}'" with pytest.raises(ValueError, match=msg): df.to_json(orient=orient) @pytest.mark.parametrize("orient", ["split", "values"]) @pytest.mark.parametrize( "data", [ [["a", "b"], ["c", "d"]], [[1.5, 2.5], [3.5, 4.5]], [[1, 2.5], [3, 4.5]], [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]], ], ) def test_frame_non_unique_columns(self, orient, data): df = DataFrame(data, index=[1, 2], columns=["x", "x"]) result = read_json( StringIO(df.to_json(orient=orient)), orient=orient, convert_dates=["x"] ) if orient == "values": expected = DataFrame(data) if expected.iloc[:, 0].dtype == "datetime64[ns]": # orient == "values" by default will write Timestamp objects out # in milliseconds; these are internally stored in nanosecond, # so divide to get where we need # TODO: a to_epoch method would also solve; see GH 14772 expected.isetitem(0, expected.iloc[:, 0].astype(np.int64) // 1000000) elif orient == "split": expected = df expected.columns = ["x", "x.1"] tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("orient", ["index", "columns", "records"]) def test_frame_non_unique_columns_raises(self, orient): df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 2], columns=["x", "x"]) msg = f"DataFrame columns must be unique for orient='{orient}'" with pytest.raises(ValueError, match=msg): df.to_json(orient=orient) def test_frame_default_orient(self, float_frame): assert float_frame.to_json() == float_frame.to_json(orient="columns") @pytest.mark.parametrize("dtype", [False, float]) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame): data = StringIO(float_frame.to_json(orient=orient)) result = read_json(data, orient=orient, convert_axes=convert_axes, dtype=dtype) expected = float_frame assert_json_roundtrip_equal(result, expected, orient) @pytest.mark.parametrize("dtype", [False, np.int64]) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame): data = StringIO(int_frame.to_json(orient=orient)) result = read_json(data, orient=orient, convert_axes=convert_axes, dtype=dtype) expected = int_frame assert_json_roundtrip_equal(result, expected, orient) @pytest.mark.parametrize("dtype", [None, np.float64, int, "U3"]) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_str_axes(self, orient, convert_axes, dtype): df = DataFrame( np.zeros((200, 4)), columns=[str(i) for i in range(4)], index=[str(i) for i in range(200)], dtype=dtype, ) data = StringIO(df.to_json(orient=orient)) result = read_json(data, orient=orient, convert_axes=convert_axes, dtype=dtype) expected = df.copy() if not dtype: expected = expected.astype(np.int64) # index columns, and records orients cannot fully preserve the string # dtype for axes as the index and column labels are used as keys in # JSON objects. JSON keys are by definition strings, so there's no way # to disambiguate whether those keys actually were strings or numeric # beforehand and numeric wins out. if convert_axes and (orient in ("index", "columns")): expected.columns = expected.columns.astype(np.int64) expected.index = expected.index.astype(np.int64) elif orient == "records" and convert_axes: expected.columns = expected.columns.astype(np.int64) elif convert_axes and orient == "split": expected.columns = expected.columns.astype(np.int64) assert_json_roundtrip_equal(result, expected, orient) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_categorical( self, request, orient, categorical_frame, convert_axes, using_infer_string ): # TODO: create a better frame to test with and improve coverage if orient in ("index", "columns"): request.applymarker( pytest.mark.xfail( reason=f"Can't have duplicate index values for orient '{orient}')" ) ) data = StringIO(categorical_frame.to_json(orient=orient)) result = read_json(data, orient=orient, convert_axes=convert_axes) expected = categorical_frame.copy() expected.index = expected.index.astype( str if not using_infer_string else "string[pyarrow_numpy]" ) # Categorical not preserved expected.index.name = None # index names aren't preserved in JSON assert_json_roundtrip_equal(result, expected, orient) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_empty(self, orient, convert_axes): empty_frame = DataFrame() data = StringIO(empty_frame.to_json(orient=orient)) result = read_json(data, orient=orient, convert_axes=convert_axes) if orient == "split": idx = Index([], dtype=(float if convert_axes else object)) expected = DataFrame(index=idx, columns=idx) elif orient in ["index", "columns"]: expected = DataFrame() else: expected = empty_frame.copy() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_timestamp(self, orient, convert_axes, datetime_frame): # TODO: improve coverage with date_format parameter data = StringIO(datetime_frame.to_json(orient=orient)) result = read_json(data, orient=orient, convert_axes=convert_axes) expected = datetime_frame.copy() if not convert_axes: # one off for ts handling # DTI gets converted to epoch values idx = expected.index.view(np.int64) // 1000000 if orient != "split": # TODO: handle consistently across orients idx = idx.astype(str) expected.index = idx assert_json_roundtrip_equal(result, expected, orient) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_mixed(self, orient, convert_axes): index = Index(["a", "b", "c", "d", "e"]) values = { "A": [0.0, 1.0, 2.0, 3.0, 4.0], "B": [0.0, 1.0, 0.0, 1.0, 0.0], "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], "D": [True, False, True, False, True], } df = DataFrame(data=values, index=index) data = StringIO(df.to_json(orient=orient)) result = read_json(data, orient=orient, convert_axes=convert_axes) expected = df.copy() expected = expected.assign(**expected.select_dtypes("number").astype(np.int64)) assert_json_roundtrip_equal(result, expected, orient) @pytest.mark.xfail( reason="#50456 Column multiindex is stored and loaded differently", raises=AssertionError, ) @pytest.mark.parametrize( "columns", [ [["2022", "2022"], ["JAN", "FEB"]], [["2022", "2023"], ["JAN", "JAN"]], [["2022", "2022"], ["JAN", "JAN"]], ], ) def test_roundtrip_multiindex(self, columns): df = DataFrame( [[1, 2], [3, 4]], columns=pd.MultiIndex.from_arrays(columns), ) data = StringIO(df.to_json(orient="split")) result = read_json(data, orient="split") tm.assert_frame_equal(result, df) @pytest.mark.parametrize( "data,msg,orient", [ ('{"key":b:a:d}', "Expected object or value", "columns"), # too few indices ( '{"columns":["A","B"],' '"index":["2","3"],' '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}', "|".join( [ r"Length of values \(3\) does not match length of index \(2\)", ] ), "split", ), # too many columns ( '{"columns":["A","B","C"],' '"index":["1","2","3"],' '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}', "3 columns passed, passed data had 2 columns", "split", ), # bad key ( '{"badkey":["A","B"],' '"index":["2","3"],' '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}', r"unexpected key\(s\): badkey", "split", ), ], ) def test_frame_from_json_bad_data_raises(self, data, msg, orient): with pytest.raises(ValueError, match=msg): read_json(StringIO(data), orient=orient) @pytest.mark.parametrize("dtype", [True, False]) @pytest.mark.parametrize("convert_axes", [True, False]) def test_frame_from_json_missing_data(self, orient, convert_axes, dtype): num_df = DataFrame([[1, 2], [4, 5, 6]]) result = read_json( StringIO(num_df.to_json(orient=orient)), orient=orient, convert_axes=convert_axes, dtype=dtype, ) assert np.isnan(result.iloc[0, 2]) obj_df = DataFrame([["1", "2"], ["4", "5", "6"]]) result = read_json( StringIO(obj_df.to_json(orient=orient)), orient=orient, convert_axes=convert_axes, dtype=dtype, ) assert np.isnan(result.iloc[0, 2]) @pytest.mark.parametrize("dtype", [True, False]) def test_frame_read_json_dtype_missing_value(self, dtype): # GH28501 Parse missing values using read_json with dtype=False # to NaN instead of None result = read_json(StringIO("[null]"), dtype=dtype) expected = DataFrame([np.nan]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("inf", [np.inf, -np.inf]) @pytest.mark.parametrize("dtype", [True, False]) def test_frame_infinity(self, inf, dtype): # infinities get mapped to nulls which get mapped to NaNs during # deserialisation df = DataFrame([[1, 2], [4, 5, 6]]) df.loc[0, 2] = inf data = StringIO(df.to_json()) result = read_json(data, dtype=dtype) assert np.isnan(result.iloc[0, 2]) @pytest.mark.skipif(not IS64, reason="not compliant on 32-bit, xref #15865") @pytest.mark.parametrize( "value,precision,expected_val", [ (0.95, 1, 1.0), (1.95, 1, 2.0), (-1.95, 1, -2.0), (0.995, 2, 1.0), (0.9995, 3, 1.0), (0.99999999999999944, 15, 1.0), ], ) def test_frame_to_json_float_precision(self, value, precision, expected_val): df = DataFrame([{"a_float": value}]) encoded = df.to_json(double_precision=precision) assert encoded == f'{{"a_float":{{"0":{expected_val}}}}}' def test_frame_to_json_except(self): df = DataFrame([1, 2, 3]) msg = "Invalid value 'garbage' for option 'orient'" with pytest.raises(ValueError, match=msg): df.to_json(orient="garbage") def test_frame_empty(self): df = DataFrame(columns=["jim", "joe"]) assert not df._is_mixed_type data = StringIO(df.to_json()) result = read_json(data, dtype=dict(df.dtypes)) tm.assert_frame_equal(result, df, check_index_type=False) def test_frame_empty_to_json(self): # GH 7445 df = DataFrame({"test": []}, index=[]) result = df.to_json(orient="columns") expected = '{"test":{}}' assert result == expected def test_frame_empty_mixedtype(self): # mixed type df = DataFrame(columns=["jim", "joe"]) df["joe"] = df["joe"].astype("i8") assert df._is_mixed_type data = df.to_json() tm.assert_frame_equal( read_json(StringIO(data), dtype=dict(df.dtypes)), df, check_index_type=False, ) def test_frame_mixedtype_orient(self): # GH10289 vals = [ [10, 1, "foo", 0.1, 0.01], [20, 2, "bar", 0.2, 0.02], [30, 3, "baz", 0.3, 0.03], [40, 4, "qux", 0.4, 0.04], ] df = DataFrame( vals, index=list("abcd"), columns=["1st", "2nd", "3rd", "4th", "5th"] ) assert df._is_mixed_type right = df.copy() for orient in ["split", "index", "columns"]: inp = StringIO(df.to_json(orient=orient)) left = read_json(inp, orient=orient, convert_axes=False) tm.assert_frame_equal(left, right) right.index = RangeIndex(len(df)) inp = StringIO(df.to_json(orient="records")) left = read_json(inp, orient="records", convert_axes=False) tm.assert_frame_equal(left, right) right.columns = RangeIndex(df.shape[1]) inp = StringIO(df.to_json(orient="values")) left = read_json(inp, orient="values", convert_axes=False) tm.assert_frame_equal(left, right) def test_v12_compat(self, datapath): dti = date_range("2000-01-03", "2000-01-07") # freq doesn't roundtrip dti = DatetimeIndex(np.asarray(dti), freq=None) df = DataFrame( [ [1.56808523, 0.65727391, 1.81021139, -0.17251653], [-0.2550111, -0.08072427, -0.03202878, -0.17581665], [1.51493992, 0.11805825, 1.629455, -1.31506612], [-0.02765498, 0.44679743, 0.33192641, -0.27885413], [0.05951614, -2.69652057, 1.28163262, 0.34703478], ], columns=["A", "B", "C", "D"], index=dti, ) df["date"] = Timestamp("19920106 18:21:32.12").as_unit("ns") df.iloc[3, df.columns.get_loc("date")] = Timestamp("20130101") df["modified"] = df["date"] df.iloc[1, df.columns.get_loc("modified")] = pd.NaT dirpath = datapath("io", "json", "data") v12_json = os.path.join(dirpath, "tsframe_v012.json") df_unser = read_json(v12_json) tm.assert_frame_equal(df, df_unser) df_iso = df.drop(["modified"], axis=1) v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json") df_unser_iso = read_json(v12_iso_json) tm.assert_frame_equal(df_iso, df_unser_iso, check_column_type=False) def test_blocks_compat_GH9037(self, using_infer_string): index = date_range("20000101", periods=10, freq="h") # freq doesn't round-trip index = DatetimeIndex(list(index), freq=None) df_mixed = DataFrame( { "float_1": [ -0.92077639, 0.77434435, 1.25234727, 0.61485564, -0.60316077, 0.24653374, 0.28668979, -2.51969012, 0.95748401, -1.02970536, ], "int_1": [ 19680418, 75337055, 99973684, 65103179, 79373900, 40314334, 21290235, 4991321, 41903419, 16008365, ], "str_1": [ "78c608f1", "64a99743", "13d2ff52", "ca7f4af2", "97236474", "bde7e214", "1a6bde47", "b1190be5", "7a669144", "8d64d068", ], "float_2": [ -0.0428278, -1.80872357, 3.36042349, -0.7573685, -0.48217572, 0.86229683, 1.08935819, 0.93898739, -0.03030452, 1.43366348, ], "str_2": [ "14f04af9", "d085da90", "4bcfac83", "81504caf", "2ffef4a9", "08e2f5c4", "07e1af03", "addbd4a7", "1f6a09ba", "4bfc4d87", ], "int_2": [ 86967717, 98098830, 51927505, 20372254, 12601730, 20884027, 34193846, 10561746, 24867120, 76131025, ], }, index=index, ) # JSON deserialisation always creates unicode strings df_mixed.columns = df_mixed.columns.astype( np.str_ if not using_infer_string else "string[pyarrow_numpy]" ) data = StringIO(df_mixed.to_json(orient="split")) df_roundtrip = read_json(data, orient="split") tm.assert_frame_equal( df_mixed, df_roundtrip, check_index_type=True, check_column_type=True, by_blocks=True, check_exact=True, ) def test_frame_nonprintable_bytes(self): # GH14256: failing column caused segfaults, if it is not the last one class BinaryThing: def __init__(self, hexed) -> None: self.hexed = hexed self.binary = bytes.fromhex(hexed) def __str__(self) -> str: return self.hexed hexed = "574b4454ba8c5eb4f98a8f45" binthing = BinaryThing(hexed) # verify the proper conversion of printable content df_printable = DataFrame({"A": [binthing.hexed]}) assert df_printable.to_json() == f'{{"A":{{"0":"{hexed}"}}}}' # check if non-printable content throws appropriate Exception df_nonprintable = DataFrame({"A": [binthing]}) msg = "Unsupported UTF-8 sequence length when encoding string" with pytest.raises(OverflowError, match=msg): df_nonprintable.to_json() # the same with multiple columns threw segfaults df_mixed = DataFrame({"A": [binthing], "B": [1]}, columns=["A", "B"]) with pytest.raises(OverflowError, match=msg): df_mixed.to_json() # default_handler should resolve exceptions for non-string types result = df_nonprintable.to_json(default_handler=str) expected = f'{{"A":{{"0":"{hexed}"}}}}' assert result == expected assert ( df_mixed.to_json(default_handler=str) == f'{{"A":{{"0":"{hexed}"}},"B":{{"0":1}}}}' ) def test_label_overflow(self): # GH14256: buffer length not checked when writing label result = DataFrame({"bar" * 100000: [1], "foo": [1337]}).to_json() expected = f'{{"{"bar" * 100000}":{{"0":1}},"foo":{{"0":1337}}}}' assert result == expected def test_series_non_unique_index(self): s = Series(["a", "b"], index=[1, 1]) msg = "Series index must be unique for orient='index'" with pytest.raises(ValueError, match=msg): s.to_json(orient="index") tm.assert_series_equal( s, read_json( StringIO(s.to_json(orient="split")), orient="split", typ="series" ), ) unserialized = read_json( StringIO(s.to_json(orient="records")), orient="records", typ="series" ) tm.assert_equal(s.values, unserialized.values) def test_series_default_orient(self, string_series): assert string_series.to_json() == string_series.to_json(orient="index") def test_series_roundtrip_simple(self, orient, string_series, using_infer_string): data = StringIO(string_series.to_json(orient=orient)) result = read_json(data, typ="series", orient=orient) expected = string_series if using_infer_string and orient in ("split", "index", "columns"): # These schemas don't contain dtypes, so we infer string expected.index = expected.index.astype("string[pyarrow_numpy]") if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": expected.name = None tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", [False, None]) def test_series_roundtrip_object(self, orient, dtype, object_series): data = StringIO(object_series.to_json(orient=orient)) result = read_json(data, typ="series", orient=orient, dtype=dtype) expected = object_series if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": expected.name = None tm.assert_series_equal(result, expected) def test_series_roundtrip_empty(self, orient): empty_series = Series([], index=[], dtype=np.float64) data = StringIO(empty_series.to_json(orient=orient)) result = read_json(data, typ="series", orient=orient) expected = empty_series.reset_index(drop=True) if orient in ("split"): expected.index = expected.index.astype(np.float64) tm.assert_series_equal(result, expected) def test_series_roundtrip_timeseries(self, orient, datetime_series): data = StringIO(datetime_series.to_json(orient=orient)) result = read_json(data, typ="series", orient=orient) expected = datetime_series if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": expected.name = None tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", [np.float64, int]) def test_series_roundtrip_numeric(self, orient, dtype): s = Series(range(6), index=["a", "b", "c", "d", "e", "f"]) data = StringIO(s.to_json(orient=orient)) result = read_json(data, typ="series", orient=orient) expected = s.copy() if orient in ("values", "records"): expected = expected.reset_index(drop=True) tm.assert_series_equal(result, expected) def test_series_to_json_except(self): s = Series([1, 2, 3]) msg = "Invalid value 'garbage' for option 'orient'" with pytest.raises(ValueError, match=msg): s.to_json(orient="garbage") def test_series_from_json_precise_float(self): s = Series([4.56, 4.56, 4.56]) result = read_json(StringIO(s.to_json()), typ="series", precise_float=True) tm.assert_series_equal(result, s, check_index_type=False) def test_series_with_dtype(self): # GH 21986 s = Series([4.56, 4.56, 4.56]) result = read_json(StringIO(s.to_json()), typ="series", dtype=np.int64) expected = Series([4] * 3) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "dtype,expected", [ (True, Series(["2000-01-01"], dtype="datetime64[ns]")), (False, Series([946684800000])), ], ) def test_series_with_dtype_datetime(self, dtype, expected): s = Series(["2000-01-01"], dtype="datetime64[ns]") data = StringIO(s.to_json()) result = read_json(data, typ="series", dtype=dtype) tm.assert_series_equal(result, expected) def test_frame_from_json_precise_float(self): df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]]) result = read_json(StringIO(df.to_json()), precise_float=True) tm.assert_frame_equal(result, df) def test_typ(self): s = Series(range(6), index=["a", "b", "c", "d", "e", "f"], dtype="int64") result = read_json(StringIO(s.to_json()), typ=None) tm.assert_series_equal(result, s) def test_reconstruction_index(self): df = DataFrame([[1, 2, 3], [4, 5, 6]]) result = read_json(StringIO(df.to_json())) tm.assert_frame_equal(result, df) df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["A", "B", "C"]) result = read_json(StringIO(df.to_json())) tm.assert_frame_equal(result, df) def test_path(self, float_frame, int_frame, datetime_frame): with tm.ensure_clean("test.json") as path: for df in [float_frame, int_frame, datetime_frame]: df.to_json(path) read_json(path) def test_axis_dates(self, datetime_series, datetime_frame): # frame json = StringIO(datetime_frame.to_json()) result = read_json(json) tm.assert_frame_equal(result, datetime_frame) # series json = StringIO(datetime_series.to_json()) result = read_json(json, typ="series") tm.assert_series_equal(result, datetime_series, check_names=False) assert result.name is None def test_convert_dates(self, datetime_series, datetime_frame): # frame df = datetime_frame df["date"] = Timestamp("20130101").as_unit("ns") json = StringIO(df.to_json()) result = read_json(json) tm.assert_frame_equal(result, df) df["foo"] = 1.0 json = StringIO(df.to_json(date_unit="ns")) result = read_json(json, convert_dates=False) expected = df.copy() expected["date"] = expected["date"].values.view("i8") expected["foo"] = expected["foo"].astype("int64") tm.assert_frame_equal(result, expected) # series ts = Series(Timestamp("20130101").as_unit("ns"), index=datetime_series.index) json = StringIO(ts.to_json()) result = read_json(json, typ="series") tm.assert_series_equal(result, ts) @pytest.mark.parametrize("date_format", ["epoch", "iso"]) @pytest.mark.parametrize("as_object", [True, False]) @pytest.mark.parametrize("date_typ", [datetime.date, datetime.datetime, Timestamp]) def test_date_index_and_values(self, date_format, as_object, date_typ): data = [date_typ(year=2020, month=1, day=1), pd.NaT] if as_object: data.append("a") ser = Series(data, index=data) result = ser.to_json(date_format=date_format) if date_format == "epoch": expected = '{"1577836800000":1577836800000,"null":null}' else: expected = ( '{"2020-01-01T00:00:00.000":"2020-01-01T00:00:00.000","null":null}' ) if as_object: expected = expected.replace("}", ',"a":"a"}') assert result == expected @pytest.mark.parametrize( "infer_word", [ "trade_time", "date", "datetime", "sold_at", "modified", "timestamp", "timestamps", ], ) def test_convert_dates_infer(self, infer_word): # GH10747 data = [{"id": 1, infer_word: 1036713600000}, {"id": 2}] expected = DataFrame( [[1, Timestamp("2002-11-08")], [2, pd.NaT]], columns=["id", infer_word] ) result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]] tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "date,date_unit", [ ("20130101 20:43:42.123", None), ("20130101 20:43:42", "s"), ("20130101 20:43:42.123", "ms"), ("20130101 20:43:42.123456", "us"), ("20130101 20:43:42.123456789", "ns"), ], ) def test_date_format_frame(self, date, date_unit, datetime_frame): df = datetime_frame df["date"] = Timestamp(date).as_unit("ns") df.iloc[1, df.columns.get_loc("date")] = pd.NaT df.iloc[5, df.columns.get_loc("date")] = pd.NaT if date_unit: json = df.to_json(date_format="iso", date_unit=date_unit) else: json = df.to_json(date_format="iso") result = read_json(StringIO(json)) expected = df.copy() tm.assert_frame_equal(result, expected) def test_date_format_frame_raises(self, datetime_frame): df = datetime_frame msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): df.to_json(date_format="iso", date_unit="foo") @pytest.mark.parametrize( "date,date_unit", [ ("20130101 20:43:42.123", None), ("20130101 20:43:42", "s"), ("20130101 20:43:42.123", "ms"), ("20130101 20:43:42.123456", "us"), ("20130101 20:43:42.123456789", "ns"), ], ) def test_date_format_series(self, date, date_unit, datetime_series): ts = Series(Timestamp(date).as_unit("ns"), index=datetime_series.index) ts.iloc[1] = pd.NaT ts.iloc[5] = pd.NaT if date_unit: json = ts.to_json(date_format="iso", date_unit=date_unit) else: json = ts.to_json(date_format="iso") result = read_json(StringIO(json), typ="series") expected = ts.copy() tm.assert_series_equal(result, expected) def test_date_format_series_raises(self, datetime_series): ts = Series(Timestamp("20130101 20:43:42.123"), index=datetime_series.index) msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_date_unit(self, unit, datetime_frame): df = datetime_frame df["date"] = Timestamp("20130101 20:43:42").as_unit("ns") dl = df.columns.get_loc("date") df.iloc[1, dl] = Timestamp("19710101 20:43:42") df.iloc[2, dl] = Timestamp("21460101 20:43:42") df.iloc[4, dl] = pd.NaT json = df.to_json(date_format="epoch", date_unit=unit) # force date unit result = read_json(StringIO(json), date_unit=unit) tm.assert_frame_equal(result, df) # detect date unit result = read_json(StringIO(json), date_unit=None) tm.assert_frame_equal(result, df) @pytest.mark.parametrize("unit", ["s", "ms", "us"]) def test_iso_non_nano_datetimes(self, unit): # Test that numpy datetimes # in an Index or a column with non-nano resolution can be serialized # correctly # GH53686 index = DatetimeIndex( [np.datetime64("2023-01-01T11:22:33.123456", unit)], dtype=f"datetime64[{unit}]", ) df = DataFrame( { "date": Series( [np.datetime64("2022-01-01T11:22:33.123456", unit)], dtype=f"datetime64[{unit}]", index=index, ), "date_obj": Series( [np.datetime64("2023-01-01T11:22:33.123456", unit)], dtype=object, index=index, ), }, ) buf = StringIO() df.to_json(buf, date_format="iso", date_unit=unit) buf.seek(0) # read_json always reads datetimes in nanosecond resolution # TODO: check_dtype/check_index_type should be removable # once read_json gets non-nano support tm.assert_frame_equal( read_json(buf, convert_dates=["date", "date_obj"]), df, check_index_type=False, check_dtype=False, ) def test_weird_nested_json(self): # this used to core dump the parser s = r"""{ "status": "success", "data": { "posts": [ { "id": 1, "title": "A blog post", "body": "Some useful content" }, { "id": 2, "title": "Another blog post", "body": "More content" } ] } }""" read_json(StringIO(s)) def test_doc_example(self): dfj2 = DataFrame( np.random.default_rng(2).standard_normal((5, 2)), columns=list("AB") ) dfj2["date"] = Timestamp("20130101") dfj2["ints"] = range(5) dfj2["bools"] = True dfj2.index = date_range("20130101", periods=5) json = StringIO(dfj2.to_json()) result = read_json(json, dtype={"ints": np.int64, "bools": np.bool_}) tm.assert_frame_equal(result, result) def test_round_trip_exception(self, datapath): # GH 3867 path = datapath("io", "json", "data", "teams.csv") df = pd.read_csv(path) s = df.to_json() result = read_json(StringIO(s)) res = result.reindex(index=df.index, columns=df.columns) msg = "The 'downcast' keyword in fillna is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): res = res.fillna(np.nan, downcast=False) tm.assert_frame_equal(res, df) @pytest.mark.network @pytest.mark.single_cpu @pytest.mark.parametrize( "field,dtype", [ ["created_at", pd.DatetimeTZDtype(tz="UTC")], ["closed_at", "datetime64[ns]"], ["updated_at", pd.DatetimeTZDtype(tz="UTC")], ], ) def test_url(self, field, dtype, httpserver): data = '{"created_at": ["2023-06-23T18:21:36Z"], "closed_at": ["2023-06-23T18:21:36"], "updated_at": ["2023-06-23T18:21:36Z"]}\n' # noqa: E501 httpserver.serve_content(content=data) result = read_json(httpserver.url, convert_dates=True) assert result[field].dtype == dtype def test_timedelta(self): converter = lambda x: pd.to_timedelta(x, unit="ms") ser = Series([timedelta(23), timedelta(seconds=5)]) assert ser.dtype == "timedelta64[ns]" result = read_json(StringIO(ser.to_json()), typ="series").apply(converter) tm.assert_series_equal(result, ser) ser = Series([timedelta(23), timedelta(seconds=5)], index=Index([0, 1])) assert ser.dtype == "timedelta64[ns]" result = read_json(StringIO(ser.to_json()), typ="series").apply(converter) tm.assert_series_equal(result, ser) frame = DataFrame([timedelta(23), timedelta(seconds=5)]) assert frame[0].dtype == "timedelta64[ns]" tm.assert_frame_equal( frame, read_json(StringIO(frame.to_json())).apply(converter) ) def test_timedelta2(self): frame = DataFrame( { "a": [timedelta(days=23), timedelta(seconds=5)], "b": [1, 2], "c": date_range(start="20130101", periods=2), } ) data = StringIO(frame.to_json(date_unit="ns")) result = read_json(data) result["a"] = pd.to_timedelta(result.a, unit="ns") result["c"] = pd.to_datetime(result.c) tm.assert_frame_equal(frame, result) def test_mixed_timedelta_datetime(self): td = timedelta(23) ts = Timestamp("20130101") frame = DataFrame({"a": [td, ts]}, dtype=object) expected = DataFrame( {"a": [pd.Timedelta(td).as_unit("ns")._value, ts.as_unit("ns")._value]} ) data = StringIO(frame.to_json(date_unit="ns")) result = read_json(data, dtype={"a": "int64"}) tm.assert_frame_equal(result, expected, check_index_type=False) @pytest.mark.parametrize("as_object", [True, False]) @pytest.mark.parametrize("date_format", ["iso", "epoch"]) @pytest.mark.parametrize("timedelta_typ", [pd.Timedelta, timedelta]) def test_timedelta_to_json(self, as_object, date_format, timedelta_typ): # GH28156: to_json not correctly formatting Timedelta data = [timedelta_typ(days=1), timedelta_typ(days=2), pd.NaT] if as_object: data.append("a") ser = Series(data, index=data) if date_format == "iso": expected = ( '{"P1DT0H0M0S":"P1DT0H0M0S","P2DT0H0M0S":"P2DT0H0M0S","null":null}' ) else: expected = '{"86400000":86400000,"172800000":172800000,"null":null}' if as_object: expected = expected.replace("}", ',"a":"a"}') result = ser.to_json(date_format=date_format) assert result == expected @pytest.mark.parametrize("as_object", [True, False]) @pytest.mark.parametrize("timedelta_typ", [pd.Timedelta, timedelta]) def test_timedelta_to_json_fractional_precision(self, as_object, timedelta_typ): data = [timedelta_typ(milliseconds=42)] ser = Series(data, index=data) if as_object: ser = ser.astype(object) result = ser.to_json() expected = '{"42":42}' assert result == expected def test_default_handler(self): value = object() frame = DataFrame({"a": [7, value]}) expected = DataFrame({"a": [7, str(value)]}) result = read_json(StringIO(frame.to_json(default_handler=str))) tm.assert_frame_equal(expected, result, check_index_type=False) def test_default_handler_indirect(self): def default(obj): if isinstance(obj, complex): return [("mathjs", "Complex"), ("re", obj.real), ("im", obj.imag)] return str(obj) df_list = [ 9, DataFrame( {"a": [1, "STR", complex(4, -5)], "b": [float("nan"), None, "N/A"]}, columns=["a", "b"], ), ] expected = ( '[9,[[1,null],["STR",null],[[["mathjs","Complex"],' '["re",4.0],["im",-5.0]],"N\\/A"]]]' ) assert ( ujson_dumps(df_list, default_handler=default, orient="values") == expected ) def test_default_handler_numpy_unsupported_dtype(self): # GH12554 to_json raises 'Unhandled numpy dtype 15' df = DataFrame( {"a": [1, 2.3, complex(4, -5)], "b": [float("nan"), None, complex(1.2, 0)]}, columns=["a", "b"], ) expected = ( '[["(1+0j)","(nan+0j)"],' '["(2.3+0j)","(nan+0j)"],' '["(4-5j)","(1.2+0j)"]]' ) assert df.to_json(default_handler=str, orient="values") == expected def test_default_handler_raises(self): msg = "raisin" def my_handler_raises(obj): raise TypeError(msg) with pytest.raises(TypeError, match=msg): DataFrame({"a": [1, 2, object()]}).to_json( default_handler=my_handler_raises ) with pytest.raises(TypeError, match=msg): DataFrame({"a": [1, 2, complex(4, -5)]}).to_json( default_handler=my_handler_raises ) def test_categorical(self): # GH4377 df.to_json segfaults with non-ndarray blocks df = DataFrame({"A": ["a", "b", "c", "a", "b", "b", "a"]}) df["B"] = df["A"] expected = df.to_json() df["B"] = df["A"].astype("category") assert expected == df.to_json() s = df["A"] sc = df["B"] assert s.to_json() == sc.to_json() def test_datetime_tz(self): # GH4377 df.to_json segfaults with non-ndarray blocks tz_range = date_range("20130101", periods=3, tz="US/Eastern") tz_naive = tz_range.tz_convert("utc").tz_localize(None) df = DataFrame({"A": tz_range, "B": date_range("20130101", periods=3)}) df_naive = df.copy() df_naive["A"] = tz_naive expected = df_naive.to_json() assert expected == df.to_json() stz = Series(tz_range) s_naive = Series(tz_naive) assert stz.to_json() == s_naive.to_json() def test_sparse(self): # GH4377 df.to_json segfaults with non-ndarray blocks df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) df.loc[:8] = np.nan sdf = df.astype("Sparse") expected = df.to_json() assert expected == sdf.to_json() s = Series(np.random.default_rng(2).standard_normal(10)) s.loc[:8] = np.nan ss = s.astype("Sparse") expected = s.to_json() assert expected == ss.to_json() @pytest.mark.parametrize( "ts", [ Timestamp("2013-01-10 05:00:00Z"), Timestamp("2013-01-10 00:00:00", tz="US/Eastern"), Timestamp("2013-01-10 00:00:00-0500"), ], ) def test_tz_is_utc(self, ts): exp = '"2013-01-10T05:00:00.000Z"' assert ujson_dumps(ts, iso_dates=True) == exp dt = ts.to_pydatetime() assert ujson_dumps(dt, iso_dates=True) == exp def test_tz_is_naive(self): ts = Timestamp("2013-01-10 05:00:00") exp = '"2013-01-10T05:00:00.000"' assert ujson_dumps(ts, iso_dates=True) == exp dt = ts.to_pydatetime() assert ujson_dumps(dt, iso_dates=True) == exp @pytest.mark.parametrize( "tz_range", [ date_range("2013-01-01 05:00:00Z", periods=2), date_range("2013-01-01 00:00:00", periods=2, tz="US/Eastern"), date_range("2013-01-01 00:00:00-0500", periods=2), ], ) def test_tz_range_is_utc(self, tz_range): exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]' dfexp = ( '{"DT":{' '"0":"2013-01-01T05:00:00.000Z",' '"1":"2013-01-02T05:00:00.000Z"}}' ) assert ujson_dumps(tz_range, iso_dates=True) == exp dti = DatetimeIndex(tz_range) # Ensure datetimes in object array are serialized correctly # in addition to the normal DTI case assert ujson_dumps(dti, iso_dates=True) == exp assert ujson_dumps(dti.astype(object), iso_dates=True) == exp df = DataFrame({"DT": dti}) result = ujson_dumps(df, iso_dates=True) assert result == dfexp assert ujson_dumps(df.astype({"DT": object}), iso_dates=True) def test_tz_range_is_naive(self): dti = date_range("2013-01-01 05:00:00", periods=2) exp = '["2013-01-01T05:00:00.000","2013-01-02T05:00:00.000"]' dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}}' # Ensure datetimes in object array are serialized correctly # in addition to the normal DTI case assert ujson_dumps(dti, iso_dates=True) == exp assert ujson_dumps(dti.astype(object), iso_dates=True) == exp df = DataFrame({"DT": dti}) result = ujson_dumps(df, iso_dates=True) assert result == dfexp assert ujson_dumps(df.astype({"DT": object}), iso_dates=True) def test_read_inline_jsonl(self): # GH9180 result = read_json(StringIO('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n'), lines=True) expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) @pytest.mark.single_cpu @td.skip_if_not_us_locale def test_read_s3_jsonl(self, s3_public_bucket_with_data, s3so): # GH17200 result = read_json( f"s3n://{s3_public_bucket_with_data.name}/items.jsonl", lines=True, storage_options=s3so, ) expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) def test_read_local_jsonl(self): # GH17200 with tm.ensure_clean("tmp_items.json") as path: with open(path, "w", encoding="utf-8") as infile: infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n') result = read_json(path, lines=True) expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) def test_read_jsonl_unicode_chars(self): # GH15132: non-ascii unicode characters # \u201d == RIGHT DOUBLE QUOTATION MARK # simulate file handle json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' json = StringIO(json) result = read_json(json, lines=True) expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) # simulate string json = StringIO('{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n') result = read_json(json, lines=True) expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) def test_to_json_large_numbers(self, bigNum): # GH34473 series = Series(bigNum, dtype=object, index=["articleId"]) json = series.to_json() expected = '{"articleId":' + str(bigNum) + "}" assert json == expected df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0]) json = df.to_json() expected = '{"0":{"articleId":' + str(bigNum) + "}}" assert json == expected @pytest.mark.parametrize("bigNum", [-(2**63) - 1, 2**64]) def test_read_json_large_numbers(self, bigNum): # GH20599, 26068 json = StringIO('{"articleId":' + str(bigNum) + "}") msg = r"Value is too small|Value is too big" with pytest.raises(ValueError, match=msg): read_json(json) json = StringIO('{"0":{"articleId":' + str(bigNum) + "}}") with pytest.raises(ValueError, match=msg): read_json(json) def test_read_json_large_numbers2(self): # GH18842 json = '{"articleId": "1404366058080022500245"}' json = StringIO(json) result = read_json(json, typ="series") expected = Series(1.404366e21, index=["articleId"]) tm.assert_series_equal(result, expected) json = '{"0": {"articleId": "1404366058080022500245"}}' json = StringIO(json) result = read_json(json) expected = DataFrame(1.404366e21, index=["articleId"], columns=[0]) tm.assert_frame_equal(result, expected) def test_to_jsonl(self): # GH9180 df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) result = df.to_json(orient="records", lines=True) expected = '{"a":1,"b":2}\n{"a":1,"b":2}\n' assert result == expected df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"]) result = df.to_json(orient="records", lines=True) expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n' assert result == expected tm.assert_frame_equal(read_json(StringIO(result), lines=True), df) # GH15096: escaped characters in columns and data df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"]) result = df.to_json(orient="records", lines=True) expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n' assert result == expected tm.assert_frame_equal(read_json(StringIO(result), lines=True), df) # TODO: there is a near-identical test for pytables; can we share? @pytest.mark.xfail(reason="GH#13774 encoding kwarg not supported", raises=TypeError) @pytest.mark.parametrize( "val", [ [b"E\xc9, 17", b"", b"a", b"b", b"c"], [b"E\xc9, 17", b"a", b"b", b"c"], [b"EE, 17", b"", b"a", b"b", b"c"], [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"], [b"", b"a", b"b", b"c"], [b"\xf8\xfc", b"a", b"b", b"c"], [b"A\xf8\xfc", b"", b"a", b"b", b"c"], [np.nan, b"", b"b", b"c"], [b"A\xf8\xfc", np.nan, b"", b"b", b"c"], ], ) @pytest.mark.parametrize("dtype", ["category", object]) def test_latin_encoding(self, dtype, val): # GH 13774 ser = Series( [x.decode("latin-1") if isinstance(x, bytes) else x for x in val], dtype=dtype, ) encoding = "latin-1" with tm.ensure_clean("test.json") as path: ser.to_json(path, encoding=encoding) retr = read_json(StringIO(path), encoding=encoding) tm.assert_series_equal(ser, retr, check_categorical=False) def test_data_frame_size_after_to_json(self): # GH15344 df = DataFrame({"a": [str(1)]}) size_before = df.memory_usage(index=True, deep=True).sum() df.to_json() size_after = df.memory_usage(index=True, deep=True).sum() assert size_before == size_after @pytest.mark.parametrize( "index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]] ) @pytest.mark.parametrize("columns", [["a", "b"], ["1", "2"], ["1.", "2."]]) def test_from_json_to_json_table_index_and_columns(self, index, columns): # GH25433 GH25435 expected = DataFrame([[1, 2], [3, 4]], index=index, columns=columns) dfjson = expected.to_json(orient="table") result = read_json(StringIO(dfjson), orient="table") tm.assert_frame_equal(result, expected) def test_from_json_to_json_table_dtypes(self): # GH21345 expected = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) dfjson = expected.to_json(orient="table") result = read_json(StringIO(dfjson), orient="table") tm.assert_frame_equal(result, expected) # TODO: We are casting to string which coerces None to NaN before casting back # to object, ending up with incorrect na values @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="incorrect na conversion") @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"]) def test_to_json_from_json_columns_dtypes(self, orient): # GH21892 GH33205 expected = DataFrame.from_dict( { "Integer": Series([1, 2, 3], dtype="int64"), "Float": Series([None, 2.0, 3.0], dtype="float64"), "Object": Series([None, "", "c"], dtype="object"), "Bool": Series([True, False, True], dtype="bool"), "Category": Series(["a", "b", None], dtype="category"), "Datetime": Series( ["2020-01-01", None, "2020-01-03"], dtype="datetime64[ns]" ), } ) dfjson = expected.to_json(orient=orient) result = read_json( StringIO(dfjson), orient=orient, dtype={ "Integer": "int64", "Float": "float64", "Object": "object", "Bool": "bool", "Category": "category", "Datetime": "datetime64[ns]", }, ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", [True, {"b": int, "c": int}]) def test_read_json_table_dtype_raises(self, dtype): # GH21345 df = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) dfjson = df.to_json(orient="table") msg = "cannot pass both dtype and orient='table'" with pytest.raises(ValueError, match=msg): read_json(dfjson, orient="table", dtype=dtype) @pytest.mark.parametrize("orient", ["index", "columns", "records", "values"]) def test_read_json_table_empty_axes_dtype(self, orient): # GH28558 expected = DataFrame() result = read_json(StringIO("{}"), orient=orient, convert_axes=True) tm.assert_index_equal(result.index, expected.index) tm.assert_index_equal(result.columns, expected.columns) def test_read_json_table_convert_axes_raises(self): # GH25433 GH25435 df = DataFrame([[1, 2], [3, 4]], index=[1.0, 2.0], columns=["1.", "2."]) dfjson = df.to_json(orient="table") msg = "cannot pass both convert_axes and orient='table'" with pytest.raises(ValueError, match=msg): read_json(dfjson, orient="table", convert_axes=True) @pytest.mark.parametrize( "data, expected", [ ( DataFrame([[1, 2], [4, 5]], columns=["a", "b"]), {"columns": ["a", "b"], "data": [[1, 2], [4, 5]]}, ), ( DataFrame([[1, 2], [4, 5]], columns=["a", "b"]).rename_axis("foo"), {"columns": ["a", "b"], "data": [[1, 2], [4, 5]]}, ), ( DataFrame( [[1, 2], [4, 5]], columns=["a", "b"], index=[["a", "b"], ["c", "d"]] ), {"columns": ["a", "b"], "data": [[1, 2], [4, 5]]}, ), (Series([1, 2, 3], name="A"), {"name": "A", "data": [1, 2, 3]}), ( Series([1, 2, 3], name="A").rename_axis("foo"), {"name": "A", "data": [1, 2, 3]}, ), ( Series([1, 2], name="A", index=[["a", "b"], ["c", "d"]]), {"name": "A", "data": [1, 2]}, ), ], ) def test_index_false_to_json_split(self, data, expected): # GH 17394 # Testing index=False in to_json with orient='split' result = data.to_json(orient="split", index=False) result = json.loads(result) assert result == expected @pytest.mark.parametrize( "data", [ (DataFrame([[1, 2], [4, 5]], columns=["a", "b"])), (DataFrame([[1, 2], [4, 5]], columns=["a", "b"]).rename_axis("foo")), ( DataFrame( [[1, 2], [4, 5]], columns=["a", "b"], index=[["a", "b"], ["c", "d"]] ) ), (Series([1, 2, 3], name="A")), (Series([1, 2, 3], name="A").rename_axis("foo")), (Series([1, 2], name="A", index=[["a", "b"], ["c", "d"]])), ], ) def test_index_false_to_json_table(self, data): # GH 17394 # Testing index=False in to_json with orient='table' result = data.to_json(orient="table", index=False) result = json.loads(result) expected = { "schema": pd.io.json.build_table_schema(data, index=False), "data": DataFrame(data).to_dict(orient="records"), } assert result == expected @pytest.mark.parametrize("orient", ["index", "columns"]) def test_index_false_error_to_json(self, orient): # GH 17394, 25513 # Testing error message from to_json with index=False df = DataFrame([[1, 2], [4, 5]], columns=["a", "b"]) msg = ( "'index=False' is only valid when 'orient' is 'split', " "'table', 'records', or 'values'" ) with pytest.raises(ValueError, match=msg): df.to_json(orient=orient, index=False) @pytest.mark.parametrize("orient", ["records", "values"]) def test_index_true_error_to_json(self, orient): # GH 25513 # Testing error message from to_json with index=True df = DataFrame([[1, 2], [4, 5]], columns=["a", "b"]) msg = ( "'index=True' is only valid when 'orient' is 'split', " "'table', 'index', or 'columns'" ) with pytest.raises(ValueError, match=msg): df.to_json(orient=orient, index=True) @pytest.mark.parametrize("orient", ["split", "table"]) @pytest.mark.parametrize("index", [True, False]) def test_index_false_from_json_to_json(self, orient, index): # GH25170 # Test index=False in from_json to_json expected = DataFrame({"a": [1, 2], "b": [3, 4]}) dfjson = expected.to_json(orient=orient, index=index) result = read_json(StringIO(dfjson), orient=orient) tm.assert_frame_equal(result, expected) def test_read_timezone_information(self): # GH 25546 result = read_json( StringIO('{"2019-01-01T11:00:00.000Z":88}'), typ="series", orient="index" ) exp_dti = DatetimeIndex(["2019-01-01 11:00:00"], dtype="M8[ns, UTC]") expected = Series([88], index=exp_dti) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "url", [ "s3://example-fsspec/", "gcs://another-fsspec/file.json", "https://example-site.com/data", "some-protocol://data.txt", ], ) def test_read_json_with_url_value(self, url): # GH 36271 result = read_json(StringIO(f'{{"url":{{"0":"{url}"}}}}')) expected = DataFrame({"url": [url]}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "compression", ["", ".gz", ".bz2", ".tar"], ) def test_read_json_with_very_long_file_path(self, compression): # GH 46718 long_json_path = f'{"a" * 1000}.json{compression}' with pytest.raises( FileNotFoundError, match=f"File {long_json_path} does not exist" ): # path too long for Windows is handled in file_exists() but raises in # _get_data_from_filepath() read_json(long_json_path) @pytest.mark.parametrize( "date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")] ) def test_timedelta_as_label(self, date_format, key): df = DataFrame([[1]], columns=[pd.Timedelta("1D")]) expected = f'{{"{key}":{{"0":1}}}}' result = df.to_json(date_format=date_format) assert result == expected @pytest.mark.parametrize( "orient,expected", [ ("index", "{\"('a', 'b')\":{\"('c', 'd')\":1}}"), ("columns", "{\"('c', 'd')\":{\"('a', 'b')\":1}}"), # TODO: the below have separate encoding procedures pytest.param( "split", "", marks=pytest.mark.xfail( reason="Produces JSON but not in a consistent manner" ), ), pytest.param( "table", "", marks=pytest.mark.xfail( reason="Produces JSON but not in a consistent manner" ), ), ], ) def test_tuple_labels(self, orient, expected): # GH 20500 df = DataFrame([[1]], index=[("a", "b")], columns=[("c", "d")]) result = df.to_json(orient=orient) assert result == expected @pytest.mark.parametrize("indent", [1, 2, 4]) def test_to_json_indent(self, indent): # GH 12004 df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"]) result = df.to_json(indent=indent) spaces = " " * indent expected = f"""{{ {spaces}"a":{{ {spaces}{spaces}"0":"foo", {spaces}{spaces}"1":"baz" {spaces}}}, {spaces}"b":{{ {spaces}{spaces}"0":"bar", {spaces}{spaces}"1":"qux" {spaces}}} }}""" assert result == expected @pytest.mark.skipif( using_pyarrow_string_dtype(), reason="Adjust expected when infer_string is default, no bug here, " "just a complicated parametrization", ) @pytest.mark.parametrize( "orient,expected", [ ( "split", """{ "columns":[ "a", "b" ], "index":[ 0, 1 ], "data":[ [ "foo", "bar" ], [ "baz", "qux" ] ] }""", ), ( "records", """[ { "a":"foo", "b":"bar" }, { "a":"baz", "b":"qux" } ]""", ), ( "index", """{ "0":{ "a":"foo", "b":"bar" }, "1":{ "a":"baz", "b":"qux" } }""", ), ( "columns", """{ "a":{ "0":"foo", "1":"baz" }, "b":{ "0":"bar", "1":"qux" } }""", ), ( "values", """[ [ "foo", "bar" ], [ "baz", "qux" ] ]""", ), ( "table", """{ "schema":{ "fields":[ { "name":"index", "type":"integer" }, { "name":"a", "type":"string" }, { "name":"b", "type":"string" } ], "primaryKey":[ "index" ], "pandas_version":"1.4.0" }, "data":[ { "index":0, "a":"foo", "b":"bar" }, { "index":1, "a":"baz", "b":"qux" } ] }""", ), ], ) def test_json_indent_all_orients(self, orient, expected): # GH 12004 df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"]) result = df.to_json(orient=orient, indent=4) assert result == expected def test_json_negative_indent_raises(self): with pytest.raises(ValueError, match="must be a nonnegative integer"): DataFrame().to_json(indent=-1) def test_emca_262_nan_inf_support(self): # GH 12213 data = StringIO( '["a", NaN, "NaN", Infinity, "Infinity", -Infinity, "-Infinity"]' ) result = read_json(data) expected = DataFrame( ["a", None, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"] ) tm.assert_frame_equal(result, expected) def test_frame_int_overflow(self): # GH 30320 encoded_json = json.dumps([{"col": "31900441201190696999"}, {"col": "Text"}]) expected = DataFrame({"col": ["31900441201190696999", "Text"]}) result = read_json(StringIO(encoded_json)) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "dataframe,expected", [ ( DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]}), '{"(0, \'x\')":1,"(0, \'y\')":"a","(1, \'x\')":2,' '"(1, \'y\')":"b","(2, \'x\')":3,"(2, \'y\')":"c"}', ) ], ) def test_json_multiindex(self, dataframe, expected): series = dataframe.stack(future_stack=True) result = series.to_json(orient="index") assert result == expected @pytest.mark.single_cpu def test_to_s3(self, s3_public_bucket, s3so): # GH 28375 mock_bucket_name, target_file = s3_public_bucket.name, "test.json" df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]}) df.to_json(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so) timeout = 5 while True: if target_file in (obj.key for obj in s3_public_bucket.objects.all()): break time.sleep(0.1) timeout -= 0.1 assert timeout > 0, "Timed out waiting for file to appear on moto" def test_json_pandas_nulls(self, nulls_fixture, request): # GH 31615 if isinstance(nulls_fixture, Decimal): mark = pytest.mark.xfail(reason="not implemented") request.applymarker(mark) result = DataFrame([[nulls_fixture]]).to_json() assert result == '{"0":{"0":null}}' def test_readjson_bool_series(self): # GH31464 result = read_json(StringIO("[true, true, false]"), typ="series") expected = Series([True, True, False]) tm.assert_series_equal(result, expected) def test_to_json_multiindex_escape(self): # GH 15273 df = DataFrame( True, index=date_range("2017-01-20", "2017-01-23"), columns=["foo", "bar"], ).stack(future_stack=True) result = df.to_json() expected = ( "{\"(Timestamp('2017-01-20 00:00:00'), 'foo')\":true," "\"(Timestamp('2017-01-20 00:00:00'), 'bar')\":true," "\"(Timestamp('2017-01-21 00:00:00'), 'foo')\":true," "\"(Timestamp('2017-01-21 00:00:00'), 'bar')\":true," "\"(Timestamp('2017-01-22 00:00:00'), 'foo')\":true," "\"(Timestamp('2017-01-22 00:00:00'), 'bar')\":true," "\"(Timestamp('2017-01-23 00:00:00'), 'foo')\":true," "\"(Timestamp('2017-01-23 00:00:00'), 'bar')\":true}" ) assert result == expected def test_to_json_series_of_objects(self): class _TestObject: def __init__(self, a, b, _c, d) -> None: self.a = a self.b = b self._c = _c self.d = d def e(self): return 5 # JSON keys should be all non-callable non-underscore attributes, see GH-42768 series = Series([_TestObject(a=1, b=2, _c=3, d=4)]) assert json.loads(series.to_json()) == {"0": {"a": 1, "b": 2, "d": 4}} @pytest.mark.parametrize( "data,expected", [ ( Series({0: -6 + 8j, 1: 0 + 1j, 2: 9 - 5j}), '{"0":{"imag":8.0,"real":-6.0},' '"1":{"imag":1.0,"real":0.0},' '"2":{"imag":-5.0,"real":9.0}}', ), ( Series({0: -9.39 + 0.66j, 1: 3.95 + 9.32j, 2: 4.03 - 0.17j}), '{"0":{"imag":0.66,"real":-9.39},' '"1":{"imag":9.32,"real":3.95},' '"2":{"imag":-0.17,"real":4.03}}', ), ( DataFrame([[-2 + 3j, -1 - 0j], [4 - 3j, -0 - 10j]]), '{"0":{"0":{"imag":3.0,"real":-2.0},' '"1":{"imag":-3.0,"real":4.0}},' '"1":{"0":{"imag":0.0,"real":-1.0},' '"1":{"imag":-10.0,"real":0.0}}}', ), ( DataFrame( [[-0.28 + 0.34j, -1.08 - 0.39j], [0.41 - 0.34j, -0.78 - 1.35j]] ), '{"0":{"0":{"imag":0.34,"real":-0.28},' '"1":{"imag":-0.34,"real":0.41}},' '"1":{"0":{"imag":-0.39,"real":-1.08},' '"1":{"imag":-1.35,"real":-0.78}}}', ), ], ) def test_complex_data_tojson(self, data, expected): # GH41174 result = data.to_json() assert result == expected def test_json_uint64(self): # GH21073 expected = ( '{"columns":["col1"],"index":[0,1],' '"data":[[13342205958987758245],[12388075603347835679]]}' ) df = DataFrame(data={"col1": [13342205958987758245, 12388075603347835679]}) result = df.to_json(orient="split") assert result == expected @pytest.mark.parametrize( "orient", ["split", "records", "values", "index", "columns"] ) def test_read_json_dtype_backend( self, string_storage, dtype_backend, orient, using_infer_string ): # GH#50750 pa = pytest.importorskip("pyarrow") df = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), "b": Series([1, 2, 3], dtype="Int64"), "c": Series([1.5, np.nan, 2.5], dtype="Float64"), "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": [True, False, None], "f": [True, False, True], "g": ["a", "b", "c"], "h": ["a", "b", None], } ) if using_infer_string: string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"])) string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None])) elif string_storage == "python": string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) elif dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") from pandas.arrays import ArrowExtensionArray string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) else: string_array = ArrowStringArray(pa.array(["a", "b", "c"])) string_array_na = ArrowStringArray(pa.array(["a", "b", None])) out = df.to_json(orient=orient) with pd.option_context("mode.string_storage", string_storage): result = read_json( StringIO(out), dtype_backend=dtype_backend, orient=orient ) expected = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), "b": Series([1, 2, 3], dtype="Int64"), "c": Series([1.5, np.nan, 2.5], dtype="Float64"), "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), "g": string_array, "h": string_array_na, } ) if dtype_backend == "pyarrow": from pandas.arrays import ArrowExtensionArray expected = DataFrame( { col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True)) for col in expected.columns } ) if orient == "values": expected.columns = list(range(8)) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("orient", ["split", "records", "index"]) def test_read_json_nullable_series(self, string_storage, dtype_backend, orient): # GH#50750 pa = pytest.importorskip("pyarrow") ser = Series([1, np.nan, 3], dtype="Int64") out = ser.to_json(orient=orient) with pd.option_context("mode.string_storage", string_storage): result = read_json( StringIO(out), dtype_backend=dtype_backend, orient=orient, typ="series" ) expected = Series([1, np.nan, 3], dtype="Int64") if dtype_backend == "pyarrow": from pandas.arrays import ArrowExtensionArray expected = Series(ArrowExtensionArray(pa.array(expected, from_pandas=True))) tm.assert_series_equal(result, expected) def test_invalid_dtype_backend(self): msg = ( "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." ) with pytest.raises(ValueError, match=msg): read_json("test", dtype_backend="numpy") def test_invalid_engine(): # GH 48893 ser = Series(range(1)) out = ser.to_json() with pytest.raises(ValueError, match="The engine type foo"): read_json(out, engine="foo") def test_pyarrow_engine_lines_false(): # GH 48893 ser = Series(range(1)) out = ser.to_json() with pytest.raises(ValueError, match="currently pyarrow engine only supports"): read_json(out, engine="pyarrow", lines=False) def test_json_roundtrip_string_inference(orient): pytest.importorskip("pyarrow") df = DataFrame( [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"] ) out = df.to_json() with pd.option_context("future.infer_string", True): result = read_json(StringIO(out)) expected = DataFrame( [["a", "b"], ["c", "d"]], dtype="string[pyarrow_numpy]", index=Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"), columns=Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) def test_json_pos_args_deprecation(): # GH-54229 df = DataFrame({"a": [1, 2, 3]}) msg = ( r"Starting with pandas version 3.0 all arguments of to_json except for the " r"argument 'path_or_buf' will be keyword-only." ) with tm.assert_produces_warning(FutureWarning, match=msg): buf = BytesIO() df.to_json(buf, "split") @td.skip_if_no("pyarrow") def test_to_json_ea_null(): # GH#57224 df = DataFrame( { "a": Series([1, NA], dtype="int64[pyarrow]"), "b": Series([2, NA], dtype="Int64"), } ) result = df.to_json(orient="records", lines=True) expected = """{"a":1,"b":2} {"a":null,"b":null} """ assert result == expected def test_read_json_lines_rangeindex(): # GH 57429 data = """ {"a": 1, "b": 2} {"a": 3, "b": 4} """ result = read_json(StringIO(data), lines=True).index expected = RangeIndex(2) tm.assert_index_equal(result, expected, exact=True)