""" Tests that work on both the Python and C engines but do not have a specific classification into the other test modules. """ from io import StringIO import numpy as np import pytest from pandas import ( DataFrame, Series, ) import pandas._testing as tm pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") def test_int_conversion(all_parsers): data = """A,B 1.0,1 2.0,2 3.0,3 """ parser = all_parsers result = parser.read_csv(StringIO(data)) expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "data,kwargs,expected", [ ( "A,B\nTrue,1\nFalse,2\nTrue,3", {}, DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), ), ( "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3", {"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]}, DataFrame( [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]], columns=["A", "B"], ), ), ( "A,B\nTRUE,1\nFALSE,2\nTRUE,3", {}, DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), ), ( "A,B\nfoo,bar\nbar,foo", {"true_values": ["foo"], "false_values": ["bar"]}, DataFrame([[True, False], [False, True]], columns=["A", "B"]), ), ], ) def test_parse_bool(all_parsers, data, kwargs, expected): parser = all_parsers result = parser.read_csv(StringIO(data), **kwargs) tm.assert_frame_equal(result, expected) def test_parse_integers_above_fp_precision(all_parsers): data = """Numbers 17007000002000191 17007000002000191 17007000002000191 17007000002000191 17007000002000192 17007000002000192 17007000002000192 17007000002000192 17007000002000192 17007000002000194""" parser = all_parsers result = parser.read_csv(StringIO(data)) expected = DataFrame( { "Numbers": [ 17007000002000191, 17007000002000191, 17007000002000191, 17007000002000191, 17007000002000192, 17007000002000192, 17007000002000192, 17007000002000192, 17007000002000192, 17007000002000194, ] } ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("sep", [" ", r"\s+"]) def test_integer_overflow_bug(all_parsers, sep): # see gh-2601 data = "65248E10 11\n55555E55 22\n" parser = all_parsers if parser.engine == "pyarrow" and sep != " ": msg = "the 'pyarrow' engine does not support regex separators" with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), header=None, sep=sep) return result = parser.read_csv(StringIO(data), header=None, sep=sep) expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]]) tm.assert_frame_equal(result, expected) def test_int64_min_issues(all_parsers): # see gh-2599 parser = all_parsers data = "A,B\n0,0\n0," result = parser.read_csv(StringIO(data)) expected = DataFrame({"A": [0, 0], "B": [0, np.nan]}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) def test_int64_overflow(all_parsers, conv, request): data = """ID 00013007854817840016671868 00013007854817840016749251 00013007854817840016754630 00013007854817840016781876 00013007854817840017028824 00013007854817840017963235 00013007854817840018860166""" parser = all_parsers if conv is None: # 13007854817840016671868 > UINT64_MAX, so this # will overflow and return object as the dtype. if parser.engine == "pyarrow": mark = pytest.mark.xfail(reason="parses to float64") request.applymarker(mark) result = parser.read_csv(StringIO(data)) expected = DataFrame( [ "00013007854817840016671868", "00013007854817840016749251", "00013007854817840016754630", "00013007854817840016781876", "00013007854817840017028824", "00013007854817840017963235", "00013007854817840018860166", ], columns=["ID"], ) tm.assert_frame_equal(result, expected) else: # 13007854817840016671868 > UINT64_MAX, so attempts # to cast to either int64 or uint64 will result in # an OverflowError being raised. msg = "|".join( [ "Python int too large to convert to C long", "long too big to convert", "int too big to convert", ] ) err = OverflowError if parser.engine == "pyarrow": err = ValueError msg = "The 'converters' option is not supported with the 'pyarrow' engine" with pytest.raises(err, match=msg): parser.read_csv(StringIO(data), converters={"ID": conv}) @skip_pyarrow # CSV parse error: Empty CSV file or block @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] ) def test_int64_uint64_range(all_parsers, val): # These numbers fall right inside the int64-uint64 # range, so they should be parsed as string. parser = all_parsers result = parser.read_csv(StringIO(str(val)), header=None) expected = DataFrame([val]) tm.assert_frame_equal(result, expected) @skip_pyarrow # CSV parse error: Empty CSV file or block @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) def test_outside_int64_uint64_range(all_parsers, val): # These numbers fall just outside the int64-uint64 # range, so they should be parsed as string. parser = all_parsers result = parser.read_csv(StringIO(str(val)), header=None) expected = DataFrame([str(val)]) tm.assert_frame_equal(result, expected) @xfail_pyarrow # gets float64 dtype instead of object @pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]]) def test_numeric_range_too_wide(all_parsers, exp_data): # No numerical dtype can hold both negative and uint64 # values, so they should be cast as string. parser = all_parsers data = "\n".join(exp_data) expected = DataFrame(exp_data) result = parser.read_csv(StringIO(data), header=None) tm.assert_frame_equal(result, expected) def test_integer_precision(all_parsers): # Gh 7072 s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765 5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389""" parser = all_parsers result = parser.read_csv(StringIO(s), header=None)[4] expected = Series([4321583677327450765, 4321113141090630389], name=4) tm.assert_series_equal(result, expected)