""" Tests date parsing functionality for all of the parsers defined in parsers.py """ from datetime import ( date, datetime, timedelta, timezone, ) from io import StringIO from dateutil.parser import parse as du_parse import numpy as np import pytest import pytz from pandas._libs.tslibs import parsing import pandas as pd from pandas import ( DataFrame, DatetimeIndex, Index, MultiIndex, Series, Timestamp, ) import pandas._testing as tm from pandas.core.indexes.datetimes import date_range from pandas.core.tools.datetimes import start_caching_at from pandas.io.parsers import read_csv pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @xfail_pyarrow def test_read_csv_with_custom_date_parser(all_parsers): # GH36111 def __custom_date_parser(time): time = time.astype(np.float64) time = time.astype(int) # convert float seconds to int type return pd.to_timedelta(time, unit="s") testdata = StringIO( """time e n h 41047.00 -98573.7297 871458.0640 389.0089 41048.00 -98573.7299 871458.0640 389.0089 41049.00 -98573.7300 871458.0642 389.0088 41050.00 -98573.7299 871458.0643 389.0088 41051.00 -98573.7302 871458.0640 389.0086 """ ) result = all_parsers.read_csv_check_warnings( FutureWarning, "Please use 'date_format' instead", testdata, delim_whitespace=True, parse_dates=True, date_parser=__custom_date_parser, index_col="time", ) time = [41047, 41048, 41049, 41050, 41051] time = pd.TimedeltaIndex([pd.to_timedelta(i, unit="s") for i in time], name="time") expected = DataFrame( { "e": [-98573.7297, -98573.7299, -98573.7300, -98573.7299, -98573.7302], "n": [871458.0640, 871458.0640, 871458.0642, 871458.0643, 871458.0640], "h": [389.0089, 389.0089, 389.0088, 389.0088, 389.0086], }, index=time, ) tm.assert_frame_equal(result, expected) @xfail_pyarrow def test_read_csv_with_custom_date_parser_parse_dates_false(all_parsers): # GH44366 def __custom_date_parser(time): time = time.astype(np.float64) time = time.astype(int) # convert float seconds to int type return pd.to_timedelta(time, unit="s") testdata = StringIO( """time e 41047.00 -93.77 41048.00 -95.79 41049.00 -98.73 41050.00 -93.99 41051.00 -97.72 """ ) result = all_parsers.read_csv_check_warnings( FutureWarning, "Please use 'date_format' instead", testdata, delim_whitespace=True, parse_dates=False, date_parser=__custom_date_parser, index_col="time", ) time = Series([41047.00, 41048.00, 41049.00, 41050.00, 41051.00], name="time") expected = DataFrame( {"e": [-93.77, -95.79, -98.73, -93.99, -97.72]}, index=time, ) tm.assert_frame_equal(result, expected) @xfail_pyarrow def test_separator_date_conflict(all_parsers): # Regression test for gh-4678 # # Make sure thousands separator and # date parsing do not conflict. parser = all_parsers data = "06-02-2013;13:00;1-000.215" expected = DataFrame( [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], columns=["Date", 2] ) depr_msg = ( "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) with tm.assert_produces_warning( FutureWarning, match=depr_msg, check_stacklevel=False ): df = parser.read_csv( StringIO(data), sep=";", thousands="-", parse_dates={"Date": [0, 1]}, header=None, ) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col_custom(all_parsers, keep_date_col, request): data = """\ KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ parser = all_parsers if keep_date_col and parser.engine == "pyarrow": # For this to pass, we need to disable auto-inference on the date columns # in parse_dates. We have no way of doing this though mark = pytest.mark.xfail( reason="pyarrow doesn't support disabling auto-inference on column numbers." ) request.applymarker(mark) def date_parser(*date_cols): """ Test date parser. Parameters ---------- date_cols : args The list of data columns to parse. Returns ------- parsed : Series """ return parsing.try_parse_dates( parsing.concat_date_cols(date_cols), parser=du_parse ) kwds = { "header": None, "date_parser": date_parser, "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}, "keep_date_col": keep_date_col, "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], } result = parser.read_csv_check_warnings( FutureWarning, "use 'date_format' instead", StringIO(data), **kwds, raise_on_extra_warnings=False, ) expected = DataFrame( [ [ datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", "19990127", " 19:00:00", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0, ], [ datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", "19990127", " 20:00:00", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0, ], [ datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), "KORD", "19990127", " 21:00:00", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0, ], [ datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), "KORD", "19990127", " 21:00:00", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0, ], [ datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), "KORD", "19990127", " 22:00:00", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0, ], [ datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), "KORD", "19990127", " 23:00:00", " 22:56:00", -0.59, 1.71, 4.6, 0.0, 280.0, ], ], columns=[ "actual", "nominal", "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", ], ) if not keep_date_col: expected = expected.drop(["X1", "X2", "X3"], axis=1) # Python can sometimes be flaky about how # the aggregated columns are entered, so # this standardizes the order. result = result[expected.columns] tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("container", [list, tuple, Index, Series]) @pytest.mark.parametrize("dim", [1, 2]) def test_concat_date_col_fail(container, dim): msg = "not all elements from date_cols are numpy arrays" value = "19990127" date_cols = tuple(container([value]) for _ in range(dim)) with pytest.raises(ValueError, match=msg): parsing.concat_date_cols(date_cols) @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col(all_parsers, keep_date_col, request): data = """\ KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ parser = all_parsers if keep_date_col and parser.engine == "pyarrow": # For this to pass, we need to disable auto-inference on the date columns # in parse_dates. We have no way of doing this though mark = pytest.mark.xfail( reason="pyarrow doesn't support disabling auto-inference on column numbers." ) request.applymarker(mark) depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated" kwds = { "header": None, "parse_dates": [[1, 2], [1, 3]], "keep_date_col": keep_date_col, "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], } with tm.assert_produces_warning( (DeprecationWarning, FutureWarning), match=depr_msg, check_stacklevel=False ): result = parser.read_csv(StringIO(data), **kwds) expected = DataFrame( [ [ datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", "19990127", " 19:00:00", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0, ], [ datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", "19990127", " 20:00:00", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0, ], [ datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), "KORD", "19990127", " 21:00:00", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0, ], [ datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), "KORD", "19990127", " 21:00:00", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0, ], [ datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), "KORD", "19990127", " 22:00:00", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0, ], [ datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), "KORD", "19990127", " 23:00:00", " 22:56:00", -0.59, 1.71, 4.6, 0.0, 280.0, ], ], columns=[ "X1_X2", "X1_X3", "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", ], ) if not keep_date_col: expected = expected.drop(["X1", "X2", "X3"], axis=1) tm.assert_frame_equal(result, expected) def test_date_col_as_index_col(all_parsers): data = """\ KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 """ parser = all_parsers kwds = { "header": None, "parse_dates": [1], "index_col": 1, "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7"], } result = parser.read_csv(StringIO(data), **kwds) index = Index( [ datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 22, 0), ], name="X1", ) expected = DataFrame( [ ["KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0], ["KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0], ["KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0], ["KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0], ["KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0], ], columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"], index=index, ) if parser.engine == "pyarrow": # https://github.com/pandas-dev/pandas/issues/44231 # pyarrow 6.0 starts to infer time type expected["X2"] = pd.to_datetime("1970-01-01" + expected["X2"]).dt.time tm.assert_frame_equal(result, expected) def test_multiple_date_cols_int_cast(all_parsers): data = ( "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" "KORD,19990127, 23:00:00, 22:56:00, -0.5900" ) parse_dates = {"actual": [1, 2], "nominal": [1, 3]} parser = all_parsers kwds = { "header": None, "parse_dates": parse_dates, "date_parser": pd.to_datetime, } result = parser.read_csv_check_warnings( FutureWarning, "use 'date_format' instead", StringIO(data), **kwds, raise_on_extra_warnings=False, ) expected = DataFrame( [ [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81], [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", 0.01], [ datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), "KORD", -0.59, ], [ datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), "KORD", -0.99, ], [ datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), "KORD", -0.59, ], [ datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), "KORD", -0.59, ], ], columns=["actual", "nominal", 0, 4], ) # Python can sometimes be flaky about how # the aggregated columns are entered, so # this standardizes the order. result = result[expected.columns] tm.assert_frame_equal(result, expected) def test_multiple_date_col_timestamp_parse(all_parsers): parser = all_parsers data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" result = parser.read_csv_check_warnings( FutureWarning, "use 'date_format' instead", StringIO(data), parse_dates=[[0, 1]], header=None, date_parser=Timestamp, raise_on_extra_warnings=False, ) expected = DataFrame( [ [ Timestamp("05/31/2012, 15:30:00.029"), 1306.25, 1, "E", 0, np.nan, 1306.25, ], [ Timestamp("05/31/2012, 15:30:00.029"), 1306.25, 8, "E", 0, np.nan, 1306.25, ], ], columns=["0_1", 2, 3, 4, 5, 6, 7], ) tm.assert_frame_equal(result, expected) @xfail_pyarrow def test_multiple_date_cols_with_header(all_parsers): parser = all_parsers data = """\ ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" depr_msg = ( "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) with tm.assert_produces_warning( FutureWarning, match=depr_msg, check_stacklevel=False ): result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) expected = DataFrame( [ [ datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0, ], [ datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0, ], [ datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0, ], [ datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0, ], [ datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0, ], [ datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00", -0.59, 1.71, 4.6, 0.0, 280.0, ], ], columns=[ "nominal", "ID", "ActualTime", "TDew", "TAir", "Windspeed", "Precip", "WindDir", ], ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "data,parse_dates,msg", [ ( """\ date_NominalTime,date,NominalTime KORD1,19990127, 19:00:00 KORD2,19990127, 20:00:00""", [[1, 2]], ("New date column already in dict date_NominalTime"), ), ( """\ ID,date,nominalTime KORD,19990127, 19:00:00 KORD,19990127, 20:00:00""", {"ID": [1, 2]}, "Date column ID already in dict", ), ], ) def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): parser = all_parsers depr_msg = ( "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning( (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False ): parser.read_csv(StringIO(data), parse_dates=parse_dates) def test_date_parser_int_bug(all_parsers): # see gh-3071 parser = all_parsers data = ( "posix_timestamp,elapsed,sys,user,queries,query_time,rows," "accountid,userid,contactid,level,silo,method\n" "1343103150,0.062353,0,4,6,0.01690,3," "12345,1,-1,3,invoice_InvoiceResource,search\n" ) result = parser.read_csv_check_warnings( FutureWarning, "use 'date_format' instead", StringIO(data), index_col=0, parse_dates=[0], # Note: we must pass tz and then drop the tz attribute # (if we don't CI will flake out depending on the runner's local time) date_parser=lambda x: datetime.fromtimestamp(int(x), tz=timezone.utc).replace( tzinfo=None ), raise_on_extra_warnings=False, ) expected = DataFrame( [ [ 0.062353, 0, 4, 6, 0.01690, 3, 12345, 1, -1, 3, "invoice_InvoiceResource", "search", ] ], columns=[ "elapsed", "sys", "user", "queries", "query_time", "rows", "accountid", "userid", "contactid", "level", "silo", "method", ], index=Index([Timestamp("2012-07-24 04:12:30")], name="posix_timestamp"), ) tm.assert_frame_equal(result, expected) @xfail_pyarrow def test_nat_parse(all_parsers): # see gh-3062 parser = all_parsers df = DataFrame( { "A": np.arange(10, dtype="float64"), "B": Timestamp("20010101").as_unit("ns"), } ) df.iloc[3:6, :] = np.nan with tm.ensure_clean("__nat_parse_.csv") as path: df.to_csv(path) result = parser.read_csv(path, index_col=0, parse_dates=["B"]) tm.assert_frame_equal(result, df) @skip_pyarrow def test_csv_custom_parser(all_parsers): data = """A,B,C 20090101,a,1,2 20090102,b,3,4 20090103,c,4,5 """ parser = all_parsers result = parser.read_csv_check_warnings( FutureWarning, "use 'date_format' instead", StringIO(data), date_parser=lambda x: datetime.strptime(x, "%Y%m%d"), ) expected = parser.read_csv(StringIO(data), parse_dates=True) tm.assert_frame_equal(result, expected) result = parser.read_csv(StringIO(data), date_format="%Y%m%d") tm.assert_frame_equal(result, expected) @skip_pyarrow def test_parse_dates_implicit_first_col(all_parsers): data = """A,B,C 20090101,a,1,2 20090102,b,3,4 20090103,c,4,5 """ parser = all_parsers result = parser.read_csv(StringIO(data), parse_dates=True) expected = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) tm.assert_frame_equal(result, expected) @xfail_pyarrow def test_parse_dates_string(all_parsers): data = """date,A,B,C 20090101,a,1,2 20090102,b,3,4 20090103,c,4,5 """ parser = all_parsers result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"]) # freq doesn't round-trip index = date_range("1/1/2009", periods=3, name="date")._with_freq(None) expected = DataFrame( {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index ) tm.assert_frame_equal(result, expected) # Bug in https://github.com/dateutil/dateutil/issues/217 # has been addressed, but we just don't pass in the `yearfirst` @pytest.mark.xfail(reason="yearfirst is not surfaced in read_*") @pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]]) def test_yy_format_with_year_first(all_parsers, parse_dates): data = """date,time,B,C 090131,0010,1,2 090228,1020,3,4 090331,0830,5,6 """ parser = all_parsers result = parser.read_csv_check_warnings( UserWarning, "Could not infer format", StringIO(data), index_col=0, parse_dates=parse_dates, ) index = DatetimeIndex( [ datetime(2009, 1, 31, 0, 10, 0), datetime(2009, 2, 28, 10, 20, 0), datetime(2009, 3, 31, 8, 30, 0), ], dtype=object, name="date_time", ) expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index) tm.assert_frame_equal(result, expected) @xfail_pyarrow @pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]]) def test_parse_dates_column_list(all_parsers, parse_dates): data = "a,b,c\n01/01/2010,1,15/02/2010" parser = all_parsers expected = DataFrame( {"a": [datetime(2010, 1, 1)], "b": [1], "c": [datetime(2010, 2, 15)]} ) expected = expected.set_index(["a", "b"]) result = parser.read_csv( StringIO(data), index_col=[0, 1], parse_dates=parse_dates, dayfirst=True ) tm.assert_frame_equal(result, expected) @xfail_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_parse_dates(all_parsers, index_col): data = """index1,index2,A,B,C 20090101,one,a,1,2 20090101,two,b,3,4 20090101,three,c,4,5 20090102,one,a,1,2 20090102,two,b,3,4 20090102,three,c,4,5 20090103,one,a,1,2 20090103,two,b,3,4 20090103,three,c,4,5 """ parser = all_parsers index = MultiIndex.from_product( [ (datetime(2009, 1, 1), datetime(2009, 1, 2), datetime(2009, 1, 3)), ("one", "two", "three"), ], names=["index1", "index2"], ) # Out of order. if index_col == [1, 0]: index = index.swaplevel(0, 1) expected = DataFrame( [ ["a", 1, 2], ["b", 3, 4], ["c", 4, 5], ["a", 1, 2], ["b", 3, 4], ["c", 4, 5], ["a", 1, 2], ["b", 3, 4], ["c", 4, 5], ], columns=["A", "B", "C"], index=index, ) result = parser.read_csv_check_warnings( UserWarning, "Could not infer format", StringIO(data), index_col=index_col, parse_dates=True, ) tm.assert_frame_equal(result, expected) @xfail_pyarrow @pytest.mark.parametrize("kwargs", [{"dayfirst": True}, {"day_first": True}]) def test_parse_dates_custom_euro_format(all_parsers, kwargs): parser = all_parsers data = """foo,bar,baz 31/01/2010,1,2 01/02/2010,1,NA 02/02/2010,1,2 """ if "dayfirst" in kwargs: df = parser.read_csv_check_warnings( FutureWarning, "use 'date_format' instead", StringIO(data), names=["time", "Q", "NTU"], date_parser=lambda d: du_parse(d, **kwargs), header=0, index_col=0, parse_dates=True, na_values=["NA"], ) exp_index = Index( [datetime(2010, 1, 31), datetime(2010, 2, 1), datetime(2010, 2, 2)], name="time", ) expected = DataFrame( {"Q": [1, 1, 1], "NTU": [2, np.nan, 2]}, index=exp_index, columns=["Q", "NTU"], ) tm.assert_frame_equal(df, expected) else: msg = "got an unexpected keyword argument 'day_first'" with pytest.raises(TypeError, match=msg): parser.read_csv_check_warnings( FutureWarning, "use 'date_format' instead", StringIO(data), names=["time", "Q", "NTU"], date_parser=lambda d: du_parse(d, **kwargs), skiprows=[0], index_col=0, parse_dates=True, na_values=["NA"], ) def test_parse_tz_aware(all_parsers): # See gh-1693 parser = all_parsers data = "Date,x\n2012-06-13T01:39:00Z,0.5" result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) # TODO: make unit check more specific if parser.engine == "pyarrow": result.index = result.index.as_unit("ns") expected = DataFrame( {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) if parser.engine == "pyarrow": expected_tz = pytz.utc else: expected_tz = timezone.utc tm.assert_frame_equal(result, expected) assert result.index.tz is expected_tz @xfail_pyarrow @pytest.mark.parametrize( "parse_dates,index_col", [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], ) def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): parser = all_parsers data = """ ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ expected = DataFrame( [ [ datetime(1999, 1, 27, 19, 0), "KORD1", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0, ], [ datetime(1999, 1, 27, 20, 0), "KORD2", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0, ], [ datetime(1999, 1, 27, 21, 0), "KORD3", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0, ], [ datetime(1999, 1, 27, 21, 0), "KORD4", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0, ], [ datetime(1999, 1, 27, 22, 0), "KORD5", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0, ], [ datetime(1999, 1, 27, 23, 0), "KORD6", " 22:56:00", -0.59, 1.71, 4.6, 0.0, 280.0, ], ], columns=[ "nominal", "ID", "ActualTime", "TDew", "TAir", "Windspeed", "Precip", "WindDir", ], ) expected = expected.set_index("nominal") if not isinstance(parse_dates, dict): expected.index.name = "date_NominalTime" depr_msg = ( "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) with tm.assert_produces_warning( FutureWarning, match=depr_msg, check_stacklevel=False ): result = parser.read_csv( StringIO(data), parse_dates=parse_dates, index_col=index_col ) tm.assert_frame_equal(result, expected) @xfail_pyarrow def test_multiple_date_cols_chunked(all_parsers): parser = all_parsers data = """\ ID,date,nominalTime,actualTime,A,B,C,D,E KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ expected = DataFrame( [ [ datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0, ], [ datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0, ], [ datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0, ], [ datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0, ], [ datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0, ], [ datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00", -0.59, 1.71, 4.6, 0.0, 280.0, ], ], columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"], ) expected = expected.set_index("nominal") depr_msg = ( "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) with tm.assert_produces_warning( FutureWarning, match=depr_msg, check_stacklevel=False ): with parser.read_csv( StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal", chunksize=2, ) as reader: chunks = list(reader) tm.assert_frame_equal(chunks[0], expected[:2]) tm.assert_frame_equal(chunks[1], expected[2:4]) tm.assert_frame_equal(chunks[2], expected[4:]) def test_multiple_date_col_named_index_compat(all_parsers): parser = all_parsers data = """\ ID,date,nominalTime,actualTime,A,B,C,D,E KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ depr_msg = ( "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) with tm.assert_produces_warning( (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False ): with_indices = parser.read_csv( StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal" ) with tm.assert_produces_warning( (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False ): with_names = parser.read_csv( StringIO(data), index_col="nominal", parse_dates={"nominal": ["date", "nominalTime"]}, ) tm.assert_frame_equal(with_indices, with_names) def test_multiple_date_col_multiple_index_compat(all_parsers): parser = all_parsers data = """\ ID,date,nominalTime,actualTime,A,B,C,D,E KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ depr_msg = ( "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) with tm.assert_produces_warning( (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False ): result = parser.read_csv( StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]} ) with tm.assert_produces_warning( (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False ): expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) expected = expected.set_index(["nominal", "ID"]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("kwargs", [{}, {"index_col": "C"}]) def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): # see gh-5636 parser = all_parsers msg = ( "Only booleans, lists, and dictionaries " "are accepted for the 'parse_dates' parameter" ) data = """A,B,C 1,2,2003-11-1""" with pytest.raises(TypeError, match=msg): parser.read_csv(StringIO(data), parse_dates="C", **kwargs) @pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3}]) def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): parser = all_parsers msg = ( "Only booleans, lists, and dictionaries " "are accepted for the 'parse_dates' parameter" ) data = """A,B,C 1,2,2003-11-1""" with pytest.raises(TypeError, match=msg): parser.read_csv(StringIO(data), parse_dates=(1,)) @pytest.mark.parametrize("cache_dates", [True, False]) @pytest.mark.parametrize("value", ["nan", ""]) def test_bad_date_parse(all_parsers, cache_dates, value): # if we have an invalid date make sure that we handle this with # and w/o the cache properly parser = all_parsers s = StringIO((f"{value},\n") * (start_caching_at + 1)) parser.read_csv( s, header=None, names=["foo", "bar"], parse_dates=["foo"], cache_dates=cache_dates, ) @pytest.mark.parametrize("cache_dates", [True, False]) @pytest.mark.parametrize("value", ["0"]) def test_bad_date_parse_with_warning(all_parsers, cache_dates, value): # if we have an invalid date make sure that we handle this with # and w/o the cache properly. parser = all_parsers s = StringIO((f"{value},\n") * 50000) if parser.engine == "pyarrow": # pyarrow reads "0" as 0 (of type int64), and so # pandas doesn't try to guess the datetime format # TODO: parse dates directly in pyarrow, see # https://github.com/pandas-dev/pandas/issues/48017 warn = None elif cache_dates: # Note: warning is not raised if 'cache_dates', because here there is only a # single unique date and hence no risk of inconsistent parsing. warn = None else: warn = UserWarning parser.read_csv_check_warnings( warn, "Could not infer format", s, header=None, names=["foo", "bar"], parse_dates=["foo"], cache_dates=cache_dates, raise_on_extra_warnings=False, ) @xfail_pyarrow def test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers data = "Date,test\n2012-01-01,1\n,2" result = parser.read_csv(StringIO(data), parse_dates=["Date"], na_filter=False) expected = DataFrame( [[datetime(2012, 1, 1), 1], [pd.NaT, 2]], columns=["Date", "test"] ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "reader", ["read_csv_check_warnings", "read_table_check_warnings"] ) def test_parse_dates_infer_datetime_format_warning(all_parsers, reader): # GH 49024, 51017 parser = all_parsers data = "Date,test\n2012-01-01,1\n,2" getattr(parser, reader)( FutureWarning, "The argument 'infer_datetime_format' is deprecated", StringIO(data), parse_dates=["Date"], infer_datetime_format=True, sep=",", raise_on_extra_warnings=False, ) @pytest.mark.parametrize( "reader", ["read_csv_check_warnings", "read_table_check_warnings"] ) def test_parse_dates_date_parser_and_date_format(all_parsers, reader): # GH 50601 parser = all_parsers data = "Date,test\n2012-01-01,1\n,2" msg = "Cannot use both 'date_parser' and 'date_format'" with pytest.raises(TypeError, match=msg): getattr(parser, reader)( FutureWarning, "use 'date_format' instead", StringIO(data), parse_dates=["Date"], date_parser=pd.to_datetime, date_format="ISO8601", sep=",", ) @xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ ( "a\n04.15.2016", {"parse_dates": ["a"]}, DataFrame([datetime(2016, 4, 15)], columns=["a"]), ), ( "a\n04.15.2016", {"parse_dates": True, "index_col": 0}, DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"), columns=[]), ), ( "a,b\n04.15.2016,09.16.2013", {"parse_dates": ["a", "b"]}, DataFrame( [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"] ), ), ( "a,b\n04.15.2016,09.16.2013", {"parse_dates": True, "index_col": [0, 1]}, DataFrame( index=MultiIndex.from_tuples( [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"] ), columns=[], ), ), ], ) def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): # see gh-14066 parser = all_parsers result = parser.read_csv(StringIO(data), thousands=".", **kwargs) tm.assert_frame_equal(result, expected) @xfail_pyarrow def test_parse_date_time_multi_level_column_name(all_parsers): data = """\ D,T,A,B date, time,a,b 2001-01-05, 09:00:00, 0.0, 10. 2001-01-06, 00:00:00, 1.0, 11. """ parser = all_parsers result = parser.read_csv_check_warnings( FutureWarning, "use 'date_format' instead", StringIO(data), header=[0, 1], parse_dates={"date_time": [0, 1]}, date_parser=pd.to_datetime, ) expected_data = [ [datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0], [datetime(2001, 1, 6, 0, 0, 0), 1.0, 11.0], ] expected = DataFrame(expected_data, columns=["date_time", ("A", "a"), ("B", "b")]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "data,kwargs,expected", [ ( """\ date,time,a,b 2001-01-05, 10:00:00, 0.0, 10. 2001-01-05, 00:00:00, 1., 11. """, {"header": 0, "parse_dates": {"date_time": [0, 1]}}, DataFrame( [ [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10], [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0], ], columns=["date_time", "a", "b"], ), ), ( ( "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" "KORD,19990127, 23:00:00, 22:56:00, -0.5900" ), {"header": None, "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}}, DataFrame( [ [ datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81, ], [ datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", 0.01, ], [ datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), "KORD", -0.59, ], [ datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), "KORD", -0.99, ], [ datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), "KORD", -0.59, ], [ datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), "KORD", -0.59, ], ], columns=["actual", "nominal", 0, 4], ), ), ], ) def test_parse_date_time(all_parsers, data, kwargs, expected): parser = all_parsers result = parser.read_csv_check_warnings( FutureWarning, "use 'date_format' instead", StringIO(data), date_parser=pd.to_datetime, **kwargs, raise_on_extra_warnings=False, ) # Python can sometimes be flaky about how # the aggregated columns are entered, so # this standardizes the order. result = result[expected.columns] tm.assert_frame_equal(result, expected) def test_parse_date_fields(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." result = parser.read_csv_check_warnings( FutureWarning, "use 'date_format' instead", StringIO(data), header=0, parse_dates={"ymd": [0, 1, 2]}, date_parser=lambda x: x, raise_on_extra_warnings=False, ) expected = DataFrame( [[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]], columns=["ymd", "a"], ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( ("key", "value", "warn"), [ ( "date_parser", lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S"), FutureWarning, ), ("date_format", "%Y %m %d %H %M %S", None), ], ) def test_parse_date_all_fields(all_parsers, key, value, warn): parser = all_parsers data = """\ year,month,day,hour,minute,second,a,b 2001,01,05,10,00,0,0.0,10. 2001,01,5,10,0,00,1.,11. """ result = parser.read_csv_check_warnings( warn, "use 'date_format' instead", StringIO(data), header=0, parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, **{key: value}, raise_on_extra_warnings=False, ) expected = DataFrame( [ [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0], [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0], ], columns=["ymdHMS", "a", "b"], ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( ("key", "value", "warn"), [ ( "date_parser", lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S.%f"), FutureWarning, ), ("date_format", "%Y %m %d %H %M %S.%f", None), ], ) def test_datetime_fractional_seconds(all_parsers, key, value, warn): parser = all_parsers data = """\ year,month,day,hour,minute,second,a,b 2001,01,05,10,00,0.123456,0.0,10. 2001,01,5,10,0,0.500000,1.,11. """ result = parser.read_csv_check_warnings( warn, "use 'date_format' instead", StringIO(data), header=0, parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, **{key: value}, raise_on_extra_warnings=False, ) expected = DataFrame( [ [datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0], [datetime(2001, 1, 5, 10, 0, 0, microsecond=500000), 1.0, 11.0], ], columns=["ymdHMS", "a", "b"], ) tm.assert_frame_equal(result, expected) def test_generic(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." def parse_function(yy, mm): return [date(year=int(y), month=int(m), day=1) for y, m in zip(yy, mm)] result = parser.read_csv_check_warnings( FutureWarning, "use 'date_format' instead", StringIO(data), header=0, parse_dates={"ym": [0, 1]}, date_parser=parse_function, raise_on_extra_warnings=False, ) expected = DataFrame( [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]], columns=["ym", "day", "a"], ) expected["ym"] = expected["ym"].astype("datetime64[ns]") tm.assert_frame_equal(result, expected) @xfail_pyarrow def test_date_parser_resolution_if_not_ns(all_parsers): # see gh-10245 parser = all_parsers data = """\ date,time,prn,rxstatus 2013-11-03,19:00:00,126,00E80000 2013-11-03,19:00:00,23,00E80000 2013-11-03,19:00:00,13,00E80000 """ def date_parser(dt, time): try: arr = dt + "T" + time except TypeError: # dt & time are date/time objects arr = [datetime.combine(d, t) for d, t in zip(dt, time)] return np.array(arr, dtype="datetime64[s]") result = parser.read_csv_check_warnings( FutureWarning, "use 'date_format' instead", StringIO(data), date_parser=date_parser, parse_dates={"datetime": ["date", "time"]}, index_col=["datetime", "prn"], ) datetimes = np.array(["2013-11-03T19:00:00"] * 3, dtype="datetime64[s]") expected = DataFrame( data={"rxstatus": ["00E80000"] * 3}, index=MultiIndex.from_arrays( [datetimes, [126, 23, 13]], names=["datetime", "prn"], ), ) tm.assert_frame_equal(result, expected) def test_parse_date_column_with_empty_string(all_parsers): # see gh-6428 parser = all_parsers data = "case,opdate\n7,10/18/2006\n7,10/18/2008\n621, " result = parser.read_csv(StringIO(data), parse_dates=["opdate"]) expected_data = [[7, "10/18/2006"], [7, "10/18/2008"], [621, " "]] expected = DataFrame(expected_data, columns=["case", "opdate"]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "data,expected", [ ( "a\n135217135789158401\n1352171357E+5", DataFrame({"a": [135217135789158401, 135217135700000]}, dtype="float64"), ), ( "a\n99999999999\n123456789012345\n1234E+0", DataFrame({"a": [99999999999, 123456789012345, 1234]}, dtype="float64"), ), ], ) @pytest.mark.parametrize("parse_dates", [True, False]) def test_parse_date_float(all_parsers, data, expected, parse_dates): # see gh-2697 # # Date parsing should fail, so we leave the data untouched # (i.e. float precision should remain unchanged). parser = all_parsers result = parser.read_csv(StringIO(data), parse_dates=parse_dates) tm.assert_frame_equal(result, expected) def test_parse_timezone(all_parsers): # see gh-22256 parser = all_parsers data = """dt,val 2018-01-04 09:01:00+09:00,23350 2018-01-04 09:02:00+09:00,23400 2018-01-04 09:03:00+09:00,23400 2018-01-04 09:04:00+09:00,23400 2018-01-04 09:05:00+09:00,23400""" result = parser.read_csv(StringIO(data), parse_dates=["dt"]) dti = date_range( start="2018-01-04 09:01:00", end="2018-01-04 09:05:00", freq="1min", tz=timezone(timedelta(minutes=540)), )._with_freq(None) expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]} expected = DataFrame(expected_data) tm.assert_frame_equal(result, expected) @skip_pyarrow # pandas.errors.ParserError: CSV parse error @pytest.mark.parametrize( "date_string", ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], ) def test_invalid_parse_delimited_date(all_parsers, date_string): parser = all_parsers expected = DataFrame({0: [date_string]}, dtype="object") result = parser.read_csv( StringIO(date_string), header=None, parse_dates=[0], ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "date_string,dayfirst,expected", [ # %d/%m/%Y; month > 12 thus replacement ("13/02/2019", True, datetime(2019, 2, 13)), # %m/%d/%Y; day > 12 thus there will be no replacement ("02/13/2019", False, datetime(2019, 2, 13)), # %d/%m/%Y; dayfirst==True thus replacement ("04/02/2019", True, datetime(2019, 2, 4)), ], ) def test_parse_delimited_date_swap_no_warning( all_parsers, date_string, dayfirst, expected, request ): parser = all_parsers expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") if parser.engine == "pyarrow": if not dayfirst: # "CSV parse error: Empty CSV file or block" pytest.skip(reason="https://github.com/apache/arrow/issues/38676") msg = "The 'dayfirst' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): parser.read_csv( StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0] ) return result = parser.read_csv( StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0] ) tm.assert_frame_equal(result, expected) # ArrowInvalid: CSV parse error: Empty CSV file or block: cannot infer number of columns @skip_pyarrow @pytest.mark.parametrize( "date_string,dayfirst,expected", [ # %d/%m/%Y; month > 12 ("13/02/2019", False, datetime(2019, 2, 13)), # %m/%d/%Y; day > 12 ("02/13/2019", True, datetime(2019, 2, 13)), ], ) def test_parse_delimited_date_swap_with_warning( all_parsers, date_string, dayfirst, expected ): parser = all_parsers expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " "Pass `dayfirst=.*` or specify a format to silence this warning." ) result = parser.read_csv_check_warnings( UserWarning, warning_msg, StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0], ) tm.assert_frame_equal(result, expected) def test_parse_multiple_delimited_dates_with_swap_warnings(): # GH46210 with pytest.raises( ValueError, match=( r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", ' r"at position 1. You might want to try:" ), ): pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) # ArrowKeyError: Column 'fdate1' in include_columns does not exist in CSV file @skip_pyarrow @pytest.mark.parametrize( "names, usecols, parse_dates, missing_cols", [ (None, ["val"], ["date", "time"], "date, time"), (None, ["val"], [0, "time"], "time"), (None, ["val"], [["date", "time"]], "date, time"), (None, ["val"], [[0, "time"]], "time"), (None, ["val"], {"date": [0, "time"]}, "time"), (None, ["val"], {"date": ["date", "time"]}, "date, time"), (None, ["val"], [["date", "time"], "date"], "date, time"), (["date1", "time1", "temperature"], None, ["date", "time"], "date, time"), ( ["date1", "time1", "temperature"], ["date1", "temperature"], ["date1", "time"], "time", ), ], ) def test_missing_parse_dates_column_raises( all_parsers, names, usecols, parse_dates, missing_cols ): # gh-31251 column names provided in parse_dates could be missing. parser = all_parsers content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n") msg = f"Missing column provided to 'parse_dates': '{missing_cols}'" depr_msg = ( "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) warn = FutureWarning if isinstance(parse_dates, list) and all( isinstance(x, (int, str)) for x in parse_dates ): warn = None with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): parser.read_csv( content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates ) @xfail_pyarrow # mismatched shape def test_date_parser_and_names(all_parsers): # GH#33699 parser = all_parsers data = StringIO("""x,y\n1,2""") warn = UserWarning if parser.engine == "pyarrow": # DeprecationWarning for passing a Manager object warn = (UserWarning, DeprecationWarning) result = parser.read_csv_check_warnings( warn, "Could not infer format", data, parse_dates=["B"], names=["B"], ) expected = DataFrame({"B": ["y", "2"]}, index=["x", "1"]) tm.assert_frame_equal(result, expected) @xfail_pyarrow # TypeError: an integer is required def test_date_parser_multiindex_columns(all_parsers): parser = all_parsers data = """a,b 1,2 2019-12-31,6""" result = parser.read_csv(StringIO(data), parse_dates=[("a", "1")], header=[0, 1]) expected = DataFrame( {("a", "1"): Timestamp("2019-12-31").as_unit("ns"), ("b", "2"): [6]} ) tm.assert_frame_equal(result, expected) @xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( "parse_spec, col_name", [ ([[("a", "1"), ("b", "2")]], ("a_b", "1_2")), ({("foo", "1"): [("a", "1"), ("b", "2")]}, ("foo", "1")), ], ) def test_date_parser_multiindex_columns_combine_cols(all_parsers, parse_spec, col_name): parser = all_parsers data = """a,b,c 1,2,3 2019-12,-31,6""" depr_msg = ( "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) with tm.assert_produces_warning( FutureWarning, match=depr_msg, check_stacklevel=False ): result = parser.read_csv( StringIO(data), parse_dates=parse_spec, header=[0, 1], ) expected = DataFrame( {col_name: Timestamp("2019-12-31").as_unit("ns"), ("c", "3"): [6]} ) tm.assert_frame_equal(result, expected) def test_date_parser_usecols_thousands(all_parsers): # GH#39365 data = """A,B,C 1,3,20-09-01-01 2,4,20-09-01-01 """ parser = all_parsers if parser.engine == "pyarrow": # DeprecationWarning for passing a Manager object msg = "The 'thousands' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): parser.read_csv( StringIO(data), parse_dates=[1], usecols=[1, 2], thousands="-", ) return result = parser.read_csv_check_warnings( UserWarning, "Could not infer format", StringIO(data), parse_dates=[1], usecols=[1, 2], thousands="-", ) expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2}) tm.assert_frame_equal(result, expected) @xfail_pyarrow # mismatched shape def test_parse_dates_and_keep_original_column(all_parsers): # GH#13378 parser = all_parsers data = """A 20150908 20150909 """ depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated" with tm.assert_produces_warning( FutureWarning, match=depr_msg, check_stacklevel=False ): result = parser.read_csv( StringIO(data), parse_dates={"date": ["A"]}, keep_date_col=True ) expected_data = [Timestamp("2015-09-08"), Timestamp("2015-09-09")] expected = DataFrame({"date": expected_data, "A": expected_data}) tm.assert_frame_equal(result, expected) def test_dayfirst_warnings(): # GH 12585 # CASE 1: valid input input = "date\n31/12/2014\n10/03/2011" expected = DatetimeIndex( ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None, name="date" ) warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " "Pass `dayfirst=.*` or specify a format to silence this warning." ) # A. dayfirst arg correct, no warning res1 = read_csv( StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" ).index tm.assert_index_equal(expected, res1) # B. dayfirst arg incorrect, warning with tm.assert_produces_warning(UserWarning, match=warning_msg): res2 = read_csv( StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" ).index tm.assert_index_equal(expected, res2) # CASE 2: invalid input # cannot consistently process with single format # return to user unaltered # first in DD/MM/YYYY, second in MM/DD/YYYY input = "date\n31/12/2014\n03/30/2011" expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date") # A. use dayfirst=True res5 = read_csv( StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" ).index tm.assert_index_equal(expected, res5) # B. use dayfirst=False with tm.assert_produces_warning(UserWarning, match=warning_msg): res6 = read_csv( StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" ).index tm.assert_index_equal(expected, res6) @pytest.mark.parametrize( "date_string, dayfirst", [ pytest.param( "31/1/2014", False, id="second date is single-digit", ), pytest.param( "1/31/2014", True, id="first date is single-digit", ), ], ) def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst): # GH47880 initial_value = f"date\n{date_string}" expected = DatetimeIndex( ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date" ) warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " "Pass `dayfirst=.*` or specify a format to silence this warning." ) with tm.assert_produces_warning(UserWarning, match=warning_msg): res = read_csv( StringIO(initial_value), parse_dates=["date"], index_col="date", dayfirst=dayfirst, ).index tm.assert_index_equal(expected, res) @skip_pyarrow # CSV parse error: Expected 3 columns, got 4 def test_infer_first_column_as_index(all_parsers): # GH#11019 parser = all_parsers data = "a,b,c\n1970-01-01,2,3,4" result = parser.read_csv( StringIO(data), parse_dates=["a"], ) expected = DataFrame({"a": "2", "b": 3, "c": 4}, index=["1970-01-01"]) tm.assert_frame_equal(result, expected) @xfail_pyarrow # pyarrow engine doesn't support passing a dict for na_values @pytest.mark.parametrize( ("key", "value", "warn"), [ ("date_parser", lambda x: pd.to_datetime(x, format="%Y-%m-%d"), FutureWarning), ("date_format", "%Y-%m-%d", None), ], ) def test_replace_nans_before_parsing_dates(all_parsers, key, value, warn): # GH#26203 parser = all_parsers data = """Test 2012-10-01 0 2015-05-15 # 2017-09-09 """ result = parser.read_csv_check_warnings( warn, "use 'date_format' instead", StringIO(data), na_values={"Test": ["#", "0"]}, parse_dates=["Test"], **{key: value}, ) expected = DataFrame( { "Test": [ Timestamp("2012-10-01"), pd.NaT, Timestamp("2015-05-15"), pd.NaT, Timestamp("2017-09-09"), ] } ) tm.assert_frame_equal(result, expected) @xfail_pyarrow # string[python] instead of dt64[ns] def test_parse_dates_and_string_dtype(all_parsers): # GH#34066 parser = all_parsers data = """a,b 1,2019-12-31 """ result = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"]) expected = DataFrame({"a": ["1"], "b": [Timestamp("2019-12-31")]}) expected["a"] = expected["a"].astype("string") tm.assert_frame_equal(result, expected) def test_parse_dot_separated_dates(all_parsers): # https://github.com/pandas-dev/pandas/issues/2586 parser = all_parsers data = """a,b 27.03.2003 14:55:00.000,1 03.08.2003 15:20:00.000,2""" if parser.engine == "pyarrow": expected_index = Index( ["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"], dtype="object", name="a", ) warn = None else: expected_index = DatetimeIndex( ["2003-03-27 14:55:00", "2003-08-03 15:20:00"], dtype="datetime64[ns]", name="a", ) warn = UserWarning msg = r"when dayfirst=False \(the default\) was specified" result = parser.read_csv_check_warnings( warn, msg, StringIO(data), parse_dates=True, index_col=0, raise_on_extra_warnings=False, ) expected = DataFrame({"b": [1, 2]}, index=expected_index) tm.assert_frame_equal(result, expected) def test_parse_dates_dict_format(all_parsers): # GH#51240 parser = all_parsers data = """a,b 2019-12-31,31-12-2019 2020-12-31,31-12-2020""" result = parser.read_csv( StringIO(data), date_format={"a": "%Y-%m-%d", "b": "%d-%m-%Y"}, parse_dates=["a", "b"], ) expected = DataFrame( { "a": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], "b": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], } ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "key, parse_dates", [("a_b", [[0, 1]]), ("foo", {"foo": [0, 1]})] ) def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates): # GH#51240 parser = all_parsers data = """a,b 31-,12-2019 31-,12-2020""" depr_msg = ( "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) with tm.assert_produces_warning( (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False ): result = parser.read_csv( StringIO(data), date_format={key: "%d- %m-%Y"}, parse_dates=parse_dates ) expected = DataFrame( { key: [Timestamp("2019-12-31"), Timestamp("2020-12-31")], } ) tm.assert_frame_equal(result, expected) @xfail_pyarrow # object dtype index def test_parse_dates_dict_format_index(all_parsers): # GH#51240 parser = all_parsers data = """a,b 2019-12-31,31-12-2019 2020-12-31,31-12-2020""" result = parser.read_csv( StringIO(data), date_format={"a": "%Y-%m-%d"}, parse_dates=True, index_col=0 ) expected = DataFrame( { "b": ["31-12-2019", "31-12-2020"], }, index=Index([Timestamp("2019-12-31"), Timestamp("2020-12-31")], name="a"), ) tm.assert_frame_equal(result, expected) def test_parse_dates_arrow_engine(all_parsers): # GH#53295 parser = all_parsers data = """a,b 2000-01-01 00:00:00,1 2000-01-01 00:00:01,1""" result = parser.read_csv(StringIO(data), parse_dates=["a"]) # TODO: make unit check more specific if parser.engine == "pyarrow": result["a"] = result["a"].dt.as_unit("ns") expected = DataFrame( { "a": [ Timestamp("2000-01-01 00:00:00"), Timestamp("2000-01-01 00:00:01"), ], "b": 1, } ) tm.assert_frame_equal(result, expected) @xfail_pyarrow # object dtype index def test_from_csv_with_mixed_offsets(all_parsers): parser = all_parsers data = "a\n2020-01-01T00:00:00+01:00\n2020-01-01T00:00:00+00:00" result = parser.read_csv(StringIO(data), parse_dates=["a"])["a"] expected = Series( [ Timestamp("2020-01-01 00:00:00+01:00"), Timestamp("2020-01-01 00:00:00+00:00"), ], name="a", index=[0, 1], ) tm.assert_series_equal(result, expected)