from __future__ import annotations

import os

import pytest

from pandas.compat._optional import VERSIONS

from pandas import (
    read_csv,
    read_table,
)
import pandas._testing as tm


class BaseParser:
    """
    Thin wrapper around ``read_csv``/``read_table`` that pins the engine.

    Subclasses set ``engine`` and ``low_memory``; both values are injected
    into the keyword arguments of every read call made through the wrapper.
    """

    engine: str | None = None
    low_memory = True
    # Overridden (never mutated) by each subclass.
    float_precision_choices: list[str | None] = []

    def update_kwargs(self, kwargs):
        """
        Return a copy of ``kwargs`` with ``engine`` and ``low_memory`` set.

        The caller's dictionary is left untouched.
        """
        kwargs = kwargs.copy()
        kwargs.update({"engine": self.engine, "low_memory": self.low_memory})
        return kwargs

    def read_csv(self, *args, **kwargs):
        """
        Call ``read_csv`` with this parser's engine settings applied.
        """
        kwargs = self.update_kwargs(kwargs)
        return read_csv(*args, **kwargs)

    def read_csv_check_warnings(
        self,
        warn_type: type[Warning],
        warn_msg: str,
        *args,
        raise_on_extra_warnings=True,
        check_stacklevel: bool = True,
        **kwargs,
    ):
        """
        Call ``read_csv`` and assert that it emits the expected warning.

        ``warn_type``, ``warn_msg``, ``raise_on_extra_warnings`` and
        ``check_stacklevel`` are forwarded to ``tm.assert_produces_warning``;
        everything else is forwarded to ``read_csv``.
        """
        # We need to check the stacklevel here instead of in the tests
        # since this is where read_csv is called and where the warning
        # should point to.
        kwargs = self.update_kwargs(kwargs)
        with tm.assert_produces_warning(
            warn_type,
            match=warn_msg,
            raise_on_extra_warnings=raise_on_extra_warnings,
            check_stacklevel=check_stacklevel,
        ):
            return read_csv(*args, **kwargs)

    def read_table(self, *args, **kwargs):
        """
        Call ``read_table`` with this parser's engine settings applied.
        """
        kwargs = self.update_kwargs(kwargs)
        return read_table(*args, **kwargs)

    def read_table_check_warnings(
        self,
        warn_type: type[Warning],
        warn_msg: str,
        *args,
        raise_on_extra_warnings=True,
        check_stacklevel: bool = True,
        **kwargs,
    ):
        """
        Call ``read_table`` and assert that it emits the expected warning.

        ``warn_type``, ``warn_msg``, ``raise_on_extra_warnings`` and
        ``check_stacklevel`` are forwarded to ``tm.assert_produces_warning``;
        everything else is forwarded to ``read_table``.
        """
        # We need to check the stacklevel here instead of in the tests
        # since this is where read_table is called and where the warning
        # should point to.
        kwargs = self.update_kwargs(kwargs)
        with tm.assert_produces_warning(
            warn_type,
            match=warn_msg,
            raise_on_extra_warnings=raise_on_extra_warnings,
            # Mirrors read_csv_check_warnings; True is also the default of
            # assert_produces_warning, so existing callers are unaffected.
            check_stacklevel=check_stacklevel,
        ):
            return read_table(*args, **kwargs)


class CParser(BaseParser):
    engine = "c"
    float_precision_choices = [None, "high", "round_trip"]


class CParserHighMemory(CParser):
    low_memory = False


class CParserLowMemory(CParser):
    low_memory = True


class PythonParser(BaseParser):
    engine = "python"
    float_precision_choices = [None]


class PyArrowParser(BaseParser):
    engine = "pyarrow"
    float_precision_choices = [None]


@pytest.fixture
def csv_dir_path(datapath):
    """
    The directory path to the data files needed for parser tests.
    """
    return datapath("io", "parser", "data")


@pytest.fixture
def csv1(datapath):
    """
    The path to the data file "test1.csv" needed for parser tests.
    """
    return os.path.join(datapath("io", "data", "csv"), "test1.csv")


_cParserHighMemory = CParserHighMemory
_cParserLowMemory = CParserLowMemory
_pythonParser = PythonParser
_pyarrowParser = PyArrowParser

_py_parsers_only = [_pythonParser]
_c_parsers_only = [_cParserHighMemory, _cParserLowMemory]
_pyarrow_parsers_only = [pytest.param(_pyarrowParser, marks=pytest.mark.single_cpu)]

_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only]

_py_parser_ids = ["python"]
_c_parser_ids = ["c_high", "c_low"]
_pyarrow_parsers_ids = ["pyarrow"]

_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parsers_ids]


@pytest.fixture(params=_all_parsers, ids=_all_parser_ids)
def all_parsers(request):
    """
    Fixture for all of the CSV parsers.
    """
    parser = request.param()
    if parser.engine == "pyarrow":
        # importorskip returns the imported module, so no second import
        # statement is needed.
        pyarrow = pytest.importorskip("pyarrow", VERSIONS["pyarrow"])
        # Try finding a way to disable threads altogether
        # for more stable CI runs
        pyarrow.set_cpu_count(1)
    return parser


@pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids)
def c_parser_only(request):
    """
    Fixture for all of the CSV parsers using the C engine.
    """
    return request.param()


@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids)
def python_parser_only(request):
    """
    Fixture for all of the CSV parsers using the Python engine.
    """
    return request.param()


@pytest.fixture(params=_pyarrow_parsers_only, ids=_pyarrow_parsers_ids)
def pyarrow_parser_only(request):
    """
    Fixture for all of the CSV parsers using the Pyarrow engine.
    """
    return request.param()


def _get_all_parser_float_precision_combinations():
    """
    Return all allowable parser and float precision combinations and
    corresponding ids.
    """
    params = []
    ids = []
    for parser, parser_id in zip(_all_parsers, _all_parser_ids):
        if hasattr(parser, "values"):
            # Wrapped in pytest.param, get the actual parser back
            parser = parser.values[0]
        for precision in parser.float_precision_choices:
            # Re-wrap in pytest.param for pyarrow
            mark = pytest.mark.single_cpu if parser.engine == "pyarrow" else ()
            param = pytest.param((parser(), precision), marks=mark)
            params.append(param)
            ids.append(f"{parser_id}-{precision}")

    return {"params": params, "ids": ids}


# Build the combinations once instead of once per decorator argument.
_float_precision_combinations = _get_all_parser_float_precision_combinations()


@pytest.fixture(
    params=_float_precision_combinations["params"],
    ids=_float_precision_combinations["ids"],
)
def all_parsers_all_precisions(request):
    """
    Fixture for all allowable combinations of parser
    and float precision
    """
    return request.param


_utf_values = [8, 16, 32]

_encoding_seps = ["", "-", "_"]
_encoding_prefixes = ["utf", "UTF"]

_encoding_fmts = [
    f"{prefix}{sep}{{0}}" for sep in _encoding_seps for prefix in _encoding_prefixes
]


@pytest.fixture(params=_utf_values)
def utf_value(request):
    """
    Fixture for all possible integer values for a UTF encoding.
    """
    return request.param


@pytest.fixture(params=_encoding_fmts)
def encoding_fmt(request):
    """
    Fixture for all possible string formats of a UTF encoding.
    """
    return request.param


@pytest.fixture(
    params=[
        ("-1,0", -1.0),
        ("-1,2e0", -1.2),
        ("-1e0", -1.0),
        ("+1e0", 1.0),
        ("+1e+0", 1.0),
        ("+1e-1", 0.1),
        ("+,1e1", 1.0),
        ("+1,e0", 1.0),
        ("-,1e1", -1.0),
        ("-1,e0", -1.0),
        ("0,1", 0.1),
        ("1,", 1.0),
        (",1", 0.1),
        ("-,1", -0.1),
        ("1_,", 1.0),
        ("1_234,56", 1234.56),
        ("1_234,56e0", 1234.56),
        # negative cases; must not parse as float
        ("_", "_"),
        ("-_", "-_"),
        ("-_1", "-_1"),
        ("-_1e0", "-_1e0"),
        ("_1", "_1"),
        ("_1,", "_1,"),
        ("_1,_", "_1,_"),
        ("_1e0", "_1e0"),
        ("1,2e_1", "1,2e_1"),
        ("1,2e1_0", "1,2e1_0"),
        ("1,_2", "1,_2"),
        (",1__2", ",1__2"),
        (",1e", ",1e"),
        ("-,1e", "-,1e"),
        ("1_000,000_000", "1_000,000_000"),
        ("1,e1_2", "1,e1_2"),
        ("e11,2", "e11,2"),
        ("1e11,2", "1e11,2"),
        ("1,2,2", "1,2,2"),
        ("1,2_1", "1,2_1"),
        ("1,2e-10e1", "1,2e-10e1"),
        ("--1,2", "--1,2"),
        ("1a_2,1", "1a_2,1"),
        ("1,2E-1", 0.12),
        ("1,2E1", 12.0),
    ]
)
def numeric_decimal(request):
    """
    Fixture for all numeric formats which should get recognized. The first entry
    represents the value to read while the second represents the expected result.
    """
    return request.param


def _get_parser_from_request(request):
    """
    Return the parser instance requested by the current test, or None.

    Looks at the ``all_parsers`` fixture first, then at
    ``all_parsers_all_precisions``; returns None when the test uses neither.
    """
    if "all_parsers" in request.fixturenames:
        return request.getfixturevalue("all_parsers")
    if "all_parsers_all_precisions" in request.fixturenames:
        # Return value is tuple of (engine, precision)
        return request.getfixturevalue("all_parsers_all_precisions")[0]
    return None


@pytest.fixture
def pyarrow_xfail(request):
    """
    Fixture that xfails a test if the engine is pyarrow.

    Use if failure is due to unsupported keywords or inconsistent results.
    """
    parser = _get_parser_from_request(request)
    if parser is not None and parser.engine == "pyarrow":
        mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
        request.applymarker(mark)


@pytest.fixture
def pyarrow_skip(request):
    """
    Fixture that skips a test if the engine is pyarrow.

    Use if failure is due to a parsing failure from pyarrow.csv.read_csv
    """
    parser = _get_parser_from_request(request)
    if parser is not None and parser.engine == "pyarrow":
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")