""" Tests the 'read_fwf' function in parsers.py. This test suite is independent of the others because the engine is set to 'python-fwf' internally. """ from datetime import datetime from io import ( BytesIO, StringIO, ) from pathlib import Path import numpy as np import pytest from pandas.errors import EmptyDataError import pandas as pd from pandas import ( DataFrame, DatetimeIndex, ) import pandas._testing as tm from pandas.core.arrays import ( ArrowStringArray, StringArray, ) from pandas.io.common import urlopen from pandas.io.parsers import ( read_csv, read_fwf, ) def test_basic(): data = """\ A B C D 201158 360.242940 149.910199 11950.7 201159 444.953632 166.985655 11788.4 201160 364.136849 183.628767 11806.2 201161 413.836124 184.375703 11916.8 201162 502.953953 173.237159 12468.3 """ result = read_fwf(StringIO(data)) expected = DataFrame( [ [201158, 360.242940, 149.910199, 11950.7], [201159, 444.953632, 166.985655, 11788.4], [201160, 364.136849, 183.628767, 11806.2], [201161, 413.836124, 184.375703, 11916.8], [201162, 502.953953, 173.237159, 12468.3], ], columns=["A", "B", "C", "D"], ) tm.assert_frame_equal(result, expected) def test_colspecs(): data = """\ A B C D E 201158 360.242940 149.910199 11950.7 201159 444.953632 166.985655 11788.4 201160 364.136849 183.628767 11806.2 201161 413.836124 184.375703 11916.8 201162 502.953953 173.237159 12468.3 """ colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] result = read_fwf(StringIO(data), colspecs=colspecs) expected = DataFrame( [ [2011, 58, 360.242940, 149.910199, 11950.7], [2011, 59, 444.953632, 166.985655, 11788.4], [2011, 60, 364.136849, 183.628767, 11806.2], [2011, 61, 413.836124, 184.375703, 11916.8], [2011, 62, 502.953953, 173.237159, 12468.3], ], columns=["A", "B", "C", "D", "E"], ) tm.assert_frame_equal(result, expected) def test_widths(): data = """\ A B C D E 2011 58 360.242940 149.910199 11950.7 2011 59 444.953632 166.985655 11788.4 2011 60 364.136849 183.628767 11806.2 2011 61 413.836124 
184.375703 11916.8 2011 62 502.953953 173.237159 12468.3 """ result = read_fwf(StringIO(data), widths=[5, 5, 13, 13, 7]) expected = DataFrame( [ [2011, 58, 360.242940, 149.910199, 11950.7], [2011, 59, 444.953632, 166.985655, 11788.4], [2011, 60, 364.136849, 183.628767, 11806.2], [2011, 61, 413.836124, 184.375703, 11916.8], [2011, 62, 502.953953, 173.237159, 12468.3], ], columns=["A", "B", "C", "D", "E"], ) tm.assert_frame_equal(result, expected) def test_non_space_filler(): # From Thomas Kluyver: # # Apparently, some non-space filler characters can be seen, this is # supported by specifying the 'delimiter' character: # # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html data = """\ A~~~~B~~~~C~~~~~~~~~~~~D~~~~~~~~~~~~E 201158~~~~360.242940~~~149.910199~~~11950.7 201159~~~~444.953632~~~166.985655~~~11788.4 201160~~~~364.136849~~~183.628767~~~11806.2 201161~~~~413.836124~~~184.375703~~~11916.8 201162~~~~502.953953~~~173.237159~~~12468.3 """ colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] result = read_fwf(StringIO(data), colspecs=colspecs, delimiter="~") expected = DataFrame( [ [2011, 58, 360.242940, 149.910199, 11950.7], [2011, 59, 444.953632, 166.985655, 11788.4], [2011, 60, 364.136849, 183.628767, 11806.2], [2011, 61, 413.836124, 184.375703, 11916.8], [2011, 62, 502.953953, 173.237159, 12468.3], ], columns=["A", "B", "C", "D", "E"], ) tm.assert_frame_equal(result, expected) def test_over_specified(): data = """\ A B C D E 201158 360.242940 149.910199 11950.7 201159 444.953632 166.985655 11788.4 201160 364.136849 183.628767 11806.2 201161 413.836124 184.375703 11916.8 201162 502.953953 173.237159 12468.3 """ colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] with pytest.raises(ValueError, match="must specify only one of"): read_fwf(StringIO(data), colspecs=colspecs, widths=[6, 10, 10, 7]) def test_under_specified(): data = """\ A B C D E 201158 360.242940 149.910199 
11950.7 201159 444.953632 166.985655 11788.4 201160 364.136849 183.628767 11806.2 201161 413.836124 184.375703 11916.8 201162 502.953953 173.237159 12468.3 """ with pytest.raises(ValueError, match="Must specify either"): read_fwf(StringIO(data), colspecs=None, widths=None) def test_read_csv_compat(): csv_data = """\ A,B,C,D,E 2011,58,360.242940,149.910199,11950.7 2011,59,444.953632,166.985655,11788.4 2011,60,364.136849,183.628767,11806.2 2011,61,413.836124,184.375703,11916.8 2011,62,502.953953,173.237159,12468.3 """ expected = read_csv(StringIO(csv_data), engine="python") fwf_data = """\ A B C D E 201158 360.242940 149.910199 11950.7 201159 444.953632 166.985655 11788.4 201160 364.136849 183.628767 11806.2 201161 413.836124 184.375703 11916.8 201162 502.953953 173.237159 12468.3 """ colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] result = read_fwf(StringIO(fwf_data), colspecs=colspecs) tm.assert_frame_equal(result, expected) def test_bytes_io_input(): data = BytesIO("שלום\nשלום".encode()) # noqa: RUF001 result = read_fwf(data, widths=[2, 2], encoding="utf8") expected = DataFrame([["של", "ום"]], columns=["של", "ום"]) tm.assert_frame_equal(result, expected) def test_fwf_colspecs_is_list_or_tuple(): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 baz,12,13,14,15 qux,12,13,14,15 foo2,12,13,14,15 bar2,12,13,14,15 """ msg = "column specifications must be a list or tuple.+" with pytest.raises(TypeError, match=msg): read_fwf(StringIO(data), colspecs={"a": 1}, delimiter=",") def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 baz,12,13,14,15 qux,12,13,14,15 foo2,12,13,14,15 bar2,12,13,14,15 """ msg = "Each column specification must be.+" with pytest.raises(TypeError, match=msg): read_fwf(StringIO(data), colspecs=[("a", 1)]) @pytest.mark.parametrize( "colspecs,exp_data", [ ([(0, 3), (3, None)], [[123, 456], [456, 789]]), ([(None, 3), (3, 6)], [[123, 456], [456, 789]]), ([(0, None), (3, None)], [[123456, 
456], [456789, 789]]), ([(None, None), (3, 6)], [[123456, 456], [456789, 789]]), ], ) def test_fwf_colspecs_none(colspecs, exp_data): # see gh-7079 data = """\ 123456 456789 """ expected = DataFrame(exp_data) result = read_fwf(StringIO(data), colspecs=colspecs, header=None) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "infer_nrows,exp_data", [ # infer_nrows --> colspec == [(2, 3), (5, 6)] (1, [[1, 2], [3, 8]]), # infer_nrows > number of rows (10, [[1, 2], [123, 98]]), ], ) def test_fwf_colspecs_infer_nrows(infer_nrows, exp_data): # see gh-15138 data = """\ 1 2 123 98 """ expected = DataFrame(exp_data) result = read_fwf(StringIO(data), infer_nrows=infer_nrows, header=None) tm.assert_frame_equal(result, expected) def test_fwf_regression(): # see gh-3594 # # Turns out "T060" is parsable as a datetime slice! tz_list = [1, 10, 20, 30, 60, 80, 100] widths = [16] + [8] * len(tz_list) names = ["SST"] + [f"T{z:03d}" for z in tz_list[1:]] data = """ 2009164202000 9.5403 9.4105 8.6571 7.8372 6.0612 5.8843 5.5192 2009164203000 9.5435 9.2010 8.6167 7.8176 6.0804 5.8728 5.4869 2009164204000 9.5873 9.1326 8.4694 7.5889 6.0422 5.8526 5.4657 2009164205000 9.5810 9.0896 8.4009 7.4652 6.0322 5.8189 5.4379 2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039 """ with tm.assert_produces_warning(FutureWarning, match="use 'date_format' instead"): result = read_fwf( StringIO(data), index_col=0, header=None, names=names, widths=widths, parse_dates=True, date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"), ) expected = DataFrame( [ [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192], [9.5435, 9.2010, 8.6167, 7.8176, 6.0804, 5.8728, 5.4869], [9.5873, 9.1326, 8.4694, 7.5889, 6.0422, 5.8526, 5.4657], [9.5810, 9.0896, 8.4009, 7.4652, 6.0322, 5.8189, 5.4379], [9.6034, 9.0897, 8.3822, 7.4905, 6.0908, 5.7904, 5.4039], ], index=DatetimeIndex( [ "2009-06-13 20:20:00", "2009-06-13 20:30:00", "2009-06-13 20:40:00", "2009-06-13 20:50:00", "2009-06-13 21:00:00", 
] ), columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"], ) tm.assert_frame_equal(result, expected) result = read_fwf( StringIO(data), index_col=0, header=None, names=names, widths=widths, parse_dates=True, date_format="%Y%j%H%M%S", ) tm.assert_frame_equal(result, expected) def test_fwf_for_uint8(): data = """1421302965.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127 1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" # noqa: E501 df = read_fwf( StringIO(data), colspecs=[(0, 17), (25, 26), (33, 37), (49, 51), (58, 62), (63, 1000)], names=["time", "pri", "pgn", "dst", "src", "data"], converters={ "pgn": lambda x: int(x, 16), "src": lambda x: int(x, 16), "dst": lambda x: int(x, 16), "data": lambda x: len(x.split(" ")), }, ) expected = DataFrame( [ [1421302965.213420, 3, 61184, 23, 40, 8], [1421302964.226776, 6, 61442, None, 71, 8], ], columns=["time", "pri", "pgn", "dst", "src", "data"], ) expected["dst"] = expected["dst"].astype(object) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("comment", ["#", "~", "!"]) def test_fwf_comment(comment): data = """\ 1 2. 
4 #hello world 5 NaN 10.0 """ data = data.replace("#", comment) colspecs = [(0, 3), (4, 9), (9, 25)] expected = DataFrame([[1, 2.0, 4], [5, np.nan, 10.0]]) result = read_fwf(StringIO(data), colspecs=colspecs, header=None, comment=comment) tm.assert_almost_equal(result, expected) def test_fwf_skip_blank_lines(): data = """ A B C D 201158 360.242940 149.910199 11950.7 201159 444.953632 166.985655 11788.4 201162 502.953953 173.237159 12468.3 """ result = read_fwf(StringIO(data), skip_blank_lines=True) expected = DataFrame( [ [201158, 360.242940, 149.910199, 11950.7], [201159, 444.953632, 166.985655, 11788.4], [201162, 502.953953, 173.237159, 12468.3], ], columns=["A", "B", "C", "D"], ) tm.assert_frame_equal(result, expected) data = """\ A B C D 201158 360.242940 149.910199 11950.7 201159 444.953632 166.985655 11788.4 201162 502.953953 173.237159 12468.3 """ result = read_fwf(StringIO(data), skip_blank_lines=False) expected = DataFrame( [ [201158, 360.242940, 149.910199, 11950.7], [201159, 444.953632, 166.985655, 11788.4], [np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan], [201162, 502.953953, 173.237159, 12468.3], ], columns=["A", "B", "C", "D"], ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("thousands", [",", "#", "~"]) def test_fwf_thousands(thousands): data = """\ 1 2,334.0 5 10 13 10. """ data = data.replace(",", thousands) colspecs = [(0, 3), (3, 11), (12, 16)] expected = DataFrame([[1, 2334.0, 5], [10, 13, 10.0]]) result = read_fwf( StringIO(data), header=None, colspecs=colspecs, thousands=thousands ) tm.assert_almost_equal(result, expected) @pytest.mark.parametrize("header", [True, False]) def test_bool_header_arg(header): # see gh-6114 data = """\ MyColumn a b a b""" msg = "Passing a bool to header is invalid" with pytest.raises(TypeError, match=msg): read_fwf(StringIO(data), header=header) def test_full_file(): # File with all values. 
test = """index A B C 2000-01-03T00:00:00 0.980268513777 3 foo 2000-01-04T00:00:00 1.04791624281 -4 bar 2000-01-05T00:00:00 0.498580885705 73 baz 2000-01-06T00:00:00 1.12020151869 1 foo 2000-01-07T00:00:00 0.487094399463 0 bar 2000-01-10T00:00:00 0.836648671666 2 baz 2000-01-11T00:00:00 0.157160753327 34 foo""" colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) expected = read_fwf(StringIO(test), colspecs=colspecs) result = read_fwf(StringIO(test)) tm.assert_frame_equal(result, expected) def test_full_file_with_missing(): # File with missing values. test = """index A B C 2000-01-03T00:00:00 0.980268513777 3 foo 2000-01-04T00:00:00 1.04791624281 -4 bar 0.498580885705 73 baz 2000-01-06T00:00:00 1.12020151869 1 foo 2000-01-07T00:00:00 0 bar 2000-01-10T00:00:00 0.836648671666 2 baz 34""" colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) expected = read_fwf(StringIO(test), colspecs=colspecs) result = read_fwf(StringIO(test)) tm.assert_frame_equal(result, expected) def test_full_file_with_spaces(): # File with spaces in columns. test = """ Account Name Balance CreditLimit AccountCreated 101 Keanu Reeves 9315.45 10000.00 1/17/1998 312 Gerard Butler 90.00 1000.00 8/6/2003 868 Jennifer Love Hewitt 0 17000.00 5/25/1985 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 5000.00 2/5/2007 """.strip( "\r\n" ) colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) expected = read_fwf(StringIO(test), colspecs=colspecs) result = read_fwf(StringIO(test)) tm.assert_frame_equal(result, expected) def test_full_file_with_spaces_and_missing(): # File with spaces and missing values in columns. 
test = """ Account Name Balance CreditLimit AccountCreated 101 10000.00 1/17/1998 312 Gerard Butler 90.00 1000.00 8/6/2003 868 5/25/1985 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 """.strip( "\r\n" ) colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) expected = read_fwf(StringIO(test), colspecs=colspecs) result = read_fwf(StringIO(test)) tm.assert_frame_equal(result, expected) def test_messed_up_data(): # Completely messed up file. test = """ Account Name Balance Credit Limit Account Created 101 10000.00 1/17/1998 312 Gerard Butler 90.00 1000.00 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 """.strip( "\r\n" ) colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79)) expected = read_fwf(StringIO(test), colspecs=colspecs) result = read_fwf(StringIO(test)) tm.assert_frame_equal(result, expected) def test_multiple_delimiters(): test = r""" col1~~~~~col2 col3++++++++++++++++++col4 ~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves 33+++122.33\\\bar.........Gerard Butler ++44~~~~12.01 baz~~Jennifer Love Hewitt ~~55 11+++foo++++Jada Pinkett-Smith ..66++++++.03~~~bar Bill Murray """.strip( "\r\n" ) delimiter = " +~.\\" colspecs = ((0, 4), (7, 13), (15, 19), (21, 41)) expected = read_fwf(StringIO(test), colspecs=colspecs, delimiter=delimiter) result = read_fwf(StringIO(test), delimiter=delimiter) tm.assert_frame_equal(result, expected) def test_variable_width_unicode(): data = """ שלום שלום ום שלל של ום """.strip( "\r\n" ) encoding = "utf8" kwargs = {"header": None, "encoding": encoding} expected = read_fwf( BytesIO(data.encode(encoding)), colspecs=[(0, 4), (5, 9)], **kwargs ) result = read_fwf(BytesIO(data.encode(encoding)), **kwargs) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", [{}, {"a": "float64", "b": str, "c": "int32"}]) def test_dtype(dtype): data = """ a b c 1 2 3.2 3 4 5.2 """ colspecs = [(0, 5), (5, 10), (10, None)] result = read_fwf(StringIO(data), colspecs=colspecs, 
dtype=dtype) expected = DataFrame( {"a": [1, 3], "b": [2, 4], "c": [3.2, 5.2]}, columns=["a", "b", "c"] ) for col, dt in dtype.items(): expected[col] = expected[col].astype(dt) tm.assert_frame_equal(result, expected) def test_skiprows_inference(): # see gh-11256 data = """ Text contained in the file header DataCol1 DataCol2 0.0 1.0 101.6 956.1 """.strip() skiprows = 2 depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" with tm.assert_produces_warning(FutureWarning, match=depr_msg): expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) def test_skiprows_by_index_inference(): data = """ To be skipped Not To Be Skipped Once more to be skipped 123 34 8 123 456 78 9 456 """.strip() skiprows = [0, 2] depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" with tm.assert_produces_warning(FutureWarning, match=depr_msg): expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) def test_skiprows_inference_empty(): data = """ AA BBB C 12 345 6 78 901 2 """.strip() msg = "No rows from which to infer column width" with pytest.raises(EmptyDataError, match=msg): read_fwf(StringIO(data), skiprows=3) def test_whitespace_preservation(): # see gh-16772 header = None csv_data = """ a ,bbb cc,dd """ fwf_data = """ a bbb ccdd """ result = read_fwf( StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0], delimiter="\n\t" ) expected = read_csv(StringIO(csv_data), header=header) tm.assert_frame_equal(result, expected) def test_default_delimiter(): header = None csv_data = """ a,bbb cc,dd""" fwf_data = """ a \tbbb cc\tdd """ result = read_fwf(StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0]) expected = read_csv(StringIO(csv_data), header=header) tm.assert_frame_equal(result, expected) 
@pytest.mark.parametrize("infer", [True, False])
def test_fwf_compression(compression_only, infer, compression_to_extension):
    """Compressed files are read with explicit and with inferred compression."""
    data = """1111111111
2222222222
3333333333""".strip()

    compression = compression_only
    extension = compression_to_extension[compression]

    kwargs = {"widths": [5, 5], "names": ["one", "two"]}
    expected = read_fwf(StringIO(data), **kwargs)

    data = bytes(data, encoding="utf-8")

    with tm.ensure_clean(filename="tmp." + extension) as path:
        tm.write_to_compressed(compression, path, data)

        # ``infer`` is always a bool here, so the compression kwarg is always set.
        kwargs["compression"] = "infer" if infer else compression

        result = read_fwf(path, **kwargs)
        tm.assert_frame_equal(result, expected)


def test_binary_mode():
    """
    read_fwf supports opening files in binary mode.

    GH 18035.
    """
    data = """aaa aaa aaa
bba bab b a"""
    df_reference = DataFrame(
        [["bba", "bab", "b a"]], columns=["aaa", "aaa.1", "aaa.2"], index=[0]
    )
    with tm.ensure_clean() as path:
        Path(path).write_text(data, encoding="utf-8")
        with open(path, "rb") as file:
            df = read_fwf(file)
            tm.assert_frame_equal(df, df_reference)


@pytest.mark.parametrize("memory_map", [True, False])
def test_encoding_mmap(memory_map):
    """
    encoding should be working, even when using a memory-mapped file.

    GH 23254.
    """
    encoding = "iso8859_1"
    with tm.ensure_clean() as path:
        Path(path).write_bytes(" 1 A Ä 2\n".encode(encoding))
        df = read_fwf(
            path,
            header=None,
            widths=[2, 2, 2, 2],
            encoding=encoding,
            memory_map=memory_map,
        )
    df_reference = DataFrame([[1, "A", "Ä", 2]])
    tm.assert_frame_equal(df, df_reference)


@pytest.mark.parametrize(
    "colspecs, names, widths, index_col",
    [
        (
            [(0, 6), (6, 12), (12, 18), (18, None)],
            list("abcde"),
            None,
            None,
        ),
        (
            None,
            list("abcde"),
            [6] * 4,
            None,
        ),
        (
            [(0, 6), (6, 12), (12, 18), (18, None)],
            list("abcde"),
            None,
            True,
        ),
        (
            None,
            list("abcde"),
            [6] * 4,
            False,
        ),
        (
            None,
            list("abcde"),
            [6] * 4,
            True,
        ),
        (
            [(0, 6), (6, 12), (12, 18), (18, None)],
            list("abcde"),
            None,
            False,
        ),
    ],
)
def test_len_colspecs_len_names(colspecs, names, widths, index_col):
    """More names than column specifications raises ValueError."""
    # GH#40830
    data = """col1  col2  col3  col4
    bab ba    2"""
    msg = "Length of colspecs must match length of names"
    with pytest.raises(ValueError, match=msg):
        read_fwf(
            StringIO(data),
            colspecs=colspecs,
            names=names,
            widths=widths,
            index_col=index_col,
        )


@pytest.mark.parametrize(
    "colspecs, names, widths, index_col, expected",
    [
        (
            [(0, 6), (6, 12), (12, 18), (18, None)],
            list("abc"),
            None,
            0,
            DataFrame(
                index=["col1", "ba"],
                columns=["a", "b", "c"],
                data=[["col2", "col3", "col4"], ["b ba", "2", np.nan]],
            ),
        ),
        (
            [(0, 6), (6, 12), (12, 18), (18, None)],
            list("ab"),
            None,
            [0, 1],
            DataFrame(
                index=[["col1", "ba"], ["col2", "b ba"]],
                columns=["a", "b"],
                data=[["col3", "col4"], ["2", np.nan]],
            ),
        ),
        (
            [(0, 6), (6, 12), (12, 18), (18, None)],
            list("a"),
            None,
            [0, 1, 2],
            DataFrame(
                index=[["col1", "ba"], ["col2", "b ba"], ["col3", "2"]],
                columns=["a"],
                data=[["col4"], [np.nan]],
            ),
        ),
        (
            None,
            list("abc"),
            [6] * 4,
            0,
            DataFrame(
                index=["col1", "ba"],
                columns=["a", "b", "c"],
                data=[["col2", "col3", "col4"], ["b ba", "2", np.nan]],
            ),
        ),
        (
            None,
            list("ab"),
            [6] * 4,
            [0, 1],
            DataFrame(
                index=[["col1", "ba"], ["col2", "b ba"]],
                columns=["a", "b"],
                data=[["col3", "col4"], ["2", np.nan]],
            ),
        ),
        (
            None,
            list("a"),
            [6] * 4,
            [0, 1, 2],
            DataFrame(
                index=[["col1", "ba"], ["col2", "b ba"], ["col3", "2"]],
                columns=["a"],
                data=[["col4"], [np.nan]],
            ),
        ),
    ],
)
def test_len_colspecs_len_names_with_index_col(
    colspecs, names, widths, index_col, expected
):
    """Fewer names than fields is accepted when ``index_col`` uses the rest."""
    # GH#40830
    # The second row is offset so that "bab" straddles the first two
    # six-character fields ("ba" | "b ba").
    data = """col1  col2  col3  col4
    bab ba    2"""
    result = read_fwf(
        StringIO(data),
        colspecs=colspecs,
        names=names,
        widths=widths,
        index_col=index_col,
    )
    tm.assert_frame_equal(result, expected)


def test_colspecs_with_comment():
    """A comment-only first line is skipped before colspecs are applied."""
    # GH 14135
    result = read_fwf(
        StringIO("#\nA1K\n"), colspecs=[(1, 2), (2, 3)], comment="#", header=None
    )
    expected = DataFrame([[1, "K"]], columns=[0, 1])
    tm.assert_frame_equal(result, expected)


def test_skip_rows_and_n_rows():
    """``nrows`` counts the rows that remain after ``skiprows`` is applied."""
    # GH#44021
    data = """a\tb
1\t a
2\t b
3\t c
4\t d
5\t e
6\t f
"""
    result = read_fwf(StringIO(data), nrows=4, skiprows=[2, 4])
    expected = DataFrame({"a": [1, 3, 5, 6], "b": ["a", "c", "e", "f"]})
    tm.assert_frame_equal(result, expected)


def test_skiprows_with_iterator():
    """``skiprows`` is honoured across chunk boundaries when iterating."""
    # GH#10261, GH#56323
    data = """0
1
2
3
4
5
6
7
8
9
"""
    expected_frames = [
        DataFrame({"a": [3, 4]}),
        DataFrame({"a": [5, 7]}, index=[2, 3]),
        DataFrame({"a": [8]}, index=[4]),
    ]
    # Use the reader as a context manager so it is closed even on failure.
    with read_fwf(
        StringIO(data),
        colspecs=[(0, 2)],
        names=["a"],
        iterator=True,
        chunksize=2,
        skiprows=[0, 1, 2, 6, 9],
    ) as df_iter:
        result_frames = list(df_iter)

    # Without this check a reader yielding too few chunks would pass silently.
    assert len(result_frames) == len(expected_frames)
    for result, expected in zip(result_frames, expected_frames):
        tm.assert_frame_equal(result, expected)


def test_names_and_infer_colspecs():
    """``names`` may be shorter than the inferred columns when ``usecols`` selects."""
    # GH#45337
    data = """X   Y   Z
      959.0    345   22.2
"""
    result = read_fwf(StringIO(data), skiprows=1, usecols=[0, 2], names=["a", "b"])
    expected = DataFrame({"a": [959.0], "b": 22.2})
    tm.assert_frame_equal(result, expected)


def test_widths_and_usecols():
    """``usecols`` picks a subset of the fields described by ``widths``."""
    # GH#46580
    data = """0  1    n -0.4100.1
0  2    p  0.2 90.1
0  3    n -0.3140.4"""
    result = read_fwf(
        StringIO(data),
        header=None,
        usecols=(0, 1, 3),
        widths=(3, 5, 1, 5, 5),
        index_col=False,
        names=("c0", "c1", "c3"),
    )
    expected = DataFrame(
        {
            "c0": 0,
            "c1": [1, 2, 3],
            "c3": [-0.4, 0.2, -0.3],
        }
    )
    tm.assert_frame_equal(result, expected)


def test_dtype_backend(string_storage, dtype_backend):
    """``dtype_backend`` selects nullable or pyarrow-backed result dtypes."""
    # GH#50289
    if string_storage == "python":
        arr = StringArray(np.array(["a", "b"], dtype=np.object_))
        arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_))
    elif dtype_backend == "pyarrow":
        pa = pytest.importorskip("pyarrow")
        from pandas.arrays import ArrowExtensionArray

        arr = ArrowExtensionArray(pa.array(["a", "b"]))
        arr_na = ArrowExtensionArray(pa.array([None, "a"]))
    else:
        pa = pytest.importorskip("pyarrow")
        arr = ArrowStringArray(pa.array(["a", "b"]))
        arr_na = ArrowStringArray(pa.array([None, "a"]))

    data = """a  b    c      d  e     f  g    h  i
1  2.5  True  a
3  4.5  False b  True  6  7.5  a"""
    with pd.option_context("mode.string_storage", string_storage):
        result = read_fwf(StringIO(data), dtype_backend=dtype_backend)

    expected = DataFrame(
        {
            "a": pd.Series([1, 3], dtype="Int64"),
            "b": pd.Series([2.5, 4.5], dtype="Float64"),
            "c": pd.Series([True, False], dtype="boolean"),
            "d": arr,
            "e": pd.Series([pd.NA, True], dtype="boolean"),
            "f": pd.Series([pd.NA, 6], dtype="Int64"),
            "g": pd.Series([pd.NA, 7.5], dtype="Float64"),
            "h": arr_na,
            "i": pd.Series([pd.NA, pd.NA], dtype="Int64"),
        }
    )
    if dtype_backend == "pyarrow":
        pa = pytest.importorskip("pyarrow")
        from pandas.arrays import ArrowExtensionArray

        # Rebuild every expected column as a pyarrow-backed array.
        expected = DataFrame(
            {
                col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
                for col in expected.columns
            }
        )
        expected["i"] = ArrowExtensionArray(pa.array([None, None]))

    tm.assert_frame_equal(result, expected)


def test_invalid_dtype_backend():
    """An unsupported ``dtype_backend`` value raises ValueError."""
    msg = (
        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
        "'pyarrow' are allowed."
    )
    with pytest.raises(ValueError, match=msg):
        read_fwf("test", dtype_backend="numpy")


@pytest.mark.network
@pytest.mark.single_cpu
def test_url_urlopen(httpserver):
    """read_fwf accepts the file-like object returned by ``urlopen``."""
    data = """\
A         B            C            D
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    httpserver.serve_content(content=data)
    expected = pd.Index(list("ABCD"))
    with urlopen(httpserver.url) as f:
        result = read_fwf(f).columns

    tm.assert_index_equal(result, expected)