""" Tests that work on both the Python and C engines but do not have a specific classification into the other test modules. """ from io import ( BytesIO, StringIO, ) import os import platform from urllib.error import URLError import uuid import numpy as np import pytest from pandas.errors import ( EmptyDataError, ParserError, ) import pandas.util._test_decorators as td from pandas import ( DataFrame, Index, ) import pandas._testing as tm pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @pytest.mark.network @pytest.mark.single_cpu def test_url(all_parsers, csv_dir_path, httpserver): parser = all_parsers kwargs = {"sep": "\t"} local_path = os.path.join(csv_dir_path, "salaries.csv") with open(local_path, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) url_result = parser.read_csv(httpserver.url, **kwargs) local_result = parser.read_csv(local_path, **kwargs) tm.assert_frame_equal(url_result, local_result) @pytest.mark.slow def test_local_file(all_parsers, csv_dir_path): parser = all_parsers kwargs = {"sep": "\t"} local_path = os.path.join(csv_dir_path, "salaries.csv") local_result = parser.read_csv(local_path, **kwargs) url = "file://localhost/" + local_path try: url_result = parser.read_csv(url, **kwargs) tm.assert_frame_equal(url_result, local_result) except URLError: # Fails on some systems. pytest.skip("Failing on: " + " ".join(platform.uname())) @xfail_pyarrow # AssertionError: DataFrame.index are different def test_path_path_lib(all_parsers): parser = all_parsers df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), columns=Index(list("ABCD"), dtype=object), index=Index([f"i-{i}" for i in range(30)], dtype=object), ) result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) tm.assert_frame_equal(df, result) @xfail_pyarrow # AssertionError: DataFrame.index are different def test_path_local_path(all_parsers): parser = all_parsers df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), columns=Index(list("ABCD"), dtype=object), index=Index([f"i-{i}" for i in range(30)], dtype=object), ) result = tm.round_trip_localpath( df.to_csv, lambda p: parser.read_csv(p, index_col=0) ) tm.assert_frame_equal(df, result) def test_nonexistent_path(all_parsers): # gh-2428: pls no segfault # gh-14086: raise more helpful FileNotFoundError # GH#29233 "File foo" instead of "File b'foo'" parser = all_parsers path = f"{uuid.uuid4()}.csv" msg = r"\[Errno 2\]" with pytest.raises(FileNotFoundError, match=msg) as e: parser.read_csv(path) assert path == e.value.filename @td.skip_if_windows # os.chmod does not work in windows def test_no_permission(all_parsers): # GH 23784 parser = all_parsers msg = r"\[Errno 13\]" with tm.ensure_clean() as path: os.chmod(path, 0) # make file unreadable # verify that this process cannot open the file (not running as sudo) try: with open(path, encoding="utf-8"): pass pytest.skip("Running as sudo.") except PermissionError: pass with pytest.raises(PermissionError, match=msg) as e: parser.read_csv(path) assert path == e.value.filename @pytest.mark.parametrize( "data,kwargs,expected,msg", [ # gh-10728: WHITESPACE_LINE ( "a,b,c\n4,5,6\n ", {}, DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None, ), # gh-10548: EAT_LINE_COMMENT ( "a,b,c\n4,5,6\n#comment", {"comment": "#"}, DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None, ), # EAT_CRNL_NOP ( "a,b,c\n4,5,6\n\r", {}, DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None, ), # EAT_COMMENT ( "a,b,c\n4,5,6#comment", {"comment": "#"}, DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None, ), # SKIP_LINE ( "a,b,c\n4,5,6\nskipme", {"skiprows": [2]}, DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None, ), # EAT_LINE_COMMENT ( "a,b,c\n4,5,6\n#comment", {"comment": "#", "skip_blank_lines": False}, DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None, ), # IN_FIELD ( "a,b,c\n4,5,6\n ", {"skip_blank_lines": False}, DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]), None, ), # EAT_CRNL ( "a,b,c\n4,5,6\n\r", {"skip_blank_lines": False}, DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]), None, ), # ESCAPED_CHAR ( "a,b,c\n4,5,6\n\\", {"escapechar": "\\"}, None, "(EOF following escape character)|(unexpected end of data)", ), # ESCAPE_IN_QUOTED_FIELD ( 'a,b,c\n4,5,6\n"\\', {"escapechar": "\\"}, None, "(EOF inside string starting at row 2)|(unexpected end of data)", ), # IN_QUOTED_FIELD ( 'a,b,c\n4,5,6\n"', {"escapechar": "\\"}, None, "(EOF inside string starting at row 2)|(unexpected end of data)", ), ], ids=[ "whitespace-line", "eat-line-comment", "eat-crnl-nop", "eat-comment", "skip-line", "eat-line-comment", "in-field", "eat-crnl", "escaped-char", "escape-in-quoted-field", "in-quoted-field", ], ) def test_eof_states(all_parsers, data, kwargs, expected, msg, request): # see gh-10728, gh-10548 parser = all_parsers if parser.engine == "pyarrow" and "comment" in kwargs: msg = "The 'comment' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), **kwargs) return if parser.engine == "pyarrow" and "\r" not in data: # pandas.errors.ParserError: CSV parse error: Expected 3 columns, got 1: # ValueError: skiprows argument must be an integer when using engine='pyarrow' # AssertionError: Regex pattern did not match. pytest.skip(reason="https://github.com/apache/arrow/issues/38676") if expected is None: with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), **kwargs) else: result = parser.read_csv(StringIO(data), **kwargs) tm.assert_frame_equal(result, expected) def test_temporary_file(all_parsers): # see gh-13398 parser = all_parsers data = "0 0" with tm.ensure_clean(mode="w+", return_filelike=True) as new_file: new_file.write(data) new_file.flush() new_file.seek(0) if parser.engine == "pyarrow": msg = "the 'pyarrow' engine does not support regex separators" with pytest.raises(ValueError, match=msg): parser.read_csv(new_file, sep=r"\s+", header=None) return result = parser.read_csv(new_file, sep=r"\s+", header=None) expected = DataFrame([[0, 0]]) tm.assert_frame_equal(result, expected) def test_internal_eof_byte(all_parsers): # see gh-5500 parser = all_parsers data = "a,b\n1\x1a,2" expected = DataFrame([["1\x1a", 2]], columns=["a", "b"]) result = parser.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) def test_internal_eof_byte_to_file(all_parsers): # see gh-16559 parser = all_parsers data = b'c1,c2\r\n"test \x1a test", test\r\n' expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"]) path = f"__{uuid.uuid4()}__.csv" with tm.ensure_clean(path) as path: with open(path, "wb") as f: f.write(data) result = parser.read_csv(path) tm.assert_frame_equal(result, expected) def test_file_handle_string_io(all_parsers): # gh-14418 # # Don't close user provided file handles. parser = all_parsers data = "a,b\n1,2" fh = StringIO(data) parser.read_csv(fh) assert not fh.closed def test_file_handles_with_open(all_parsers, csv1): # gh-14418 # # Don't close user provided file handles. parser = all_parsers for mode in ["r", "rb"]: with open(csv1, mode, encoding="utf-8" if mode == "r" else None) as f: parser.read_csv(f) assert not f.closed def test_invalid_file_buffer_class(all_parsers): # see gh-15337 class InvalidBuffer: pass parser = all_parsers msg = "Invalid file path or buffer object type" with pytest.raises(ValueError, match=msg): parser.read_csv(InvalidBuffer()) def test_invalid_file_buffer_mock(all_parsers): # see gh-15337 parser = all_parsers msg = "Invalid file path or buffer object type" class Foo: pass with pytest.raises(ValueError, match=msg): parser.read_csv(Foo()) def test_valid_file_buffer_seems_invalid(all_parsers): # gh-16135: we want to ensure that "tell" and "seek" # aren't actually being used when we call `read_csv` # # Thus, while the object may look "invalid" (these # methods are attributes of the `StringIO` class), # it is still a valid file-object for our purposes. class NoSeekTellBuffer(StringIO): def tell(self): raise AttributeError("No tell method") def seek(self, pos, whence=0): raise AttributeError("No seek method") data = "a\n1" parser = all_parsers expected = DataFrame({"a": [1]}) result = parser.read_csv(NoSeekTellBuffer(data)) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("io_class", [StringIO, BytesIO]) @pytest.mark.parametrize("encoding", [None, "utf-8"]) def test_read_csv_file_handle(all_parsers, io_class, encoding): """ Test whether read_csv does not close user-provided file handles. GH 36980 """ parser = all_parsers expected = DataFrame({"a": [1], "b": [2]}) content = "a,b\n1,2" handle = io_class(content.encode("utf-8") if io_class == BytesIO else content) tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected) assert not handle.closed def test_memory_map_compression(all_parsers, compression): """ Support memory map for compressed files. GH 37621 """ parser = all_parsers expected = DataFrame({"a": [1], "b": [2]}) with tm.ensure_clean() as path: expected.to_csv(path, index=False, compression=compression) if parser.engine == "pyarrow": msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): parser.read_csv(path, memory_map=True, compression=compression) return result = parser.read_csv(path, memory_map=True, compression=compression) tm.assert_frame_equal( result, expected, ) def test_context_manager(all_parsers, datapath): # make sure that opened files are closed parser = all_parsers path = datapath("io", "data", "csv", "iris.csv") if parser.engine == "pyarrow": msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): parser.read_csv(path, chunksize=1) return reader = parser.read_csv(path, chunksize=1) assert not reader.handles.handle.closed try: with reader: next(reader) assert False except AssertionError: assert reader.handles.handle.closed def test_context_manageri_user_provided(all_parsers, datapath): # make sure that user-provided handles are not closed parser = all_parsers with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path: if parser.engine == "pyarrow": msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): parser.read_csv(path, chunksize=1) return reader = parser.read_csv(path, chunksize=1) assert not reader.handles.handle.closed try: with reader: next(reader) assert False except AssertionError: assert not reader.handles.handle.closed @skip_pyarrow # ParserError: Empty CSV file def test_file_descriptor_leak(all_parsers, using_copy_on_write): # GH 31488 parser = all_parsers with tm.ensure_clean() as path: with pytest.raises(EmptyDataError, match="No columns to parse from file"): parser.read_csv(path) def test_memory_map(all_parsers, csv_dir_path): mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") parser = all_parsers expected = DataFrame( {"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]} ) if parser.engine == "pyarrow": msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): parser.read_csv(mmap_file, memory_map=True) return result = parser.read_csv(mmap_file, memory_map=True) tm.assert_frame_equal(result, expected)