from collections.abc import Iterator from functools import partial from io import ( BytesIO, StringIO, ) import os from pathlib import Path import re import threading from urllib.error import URLError import numpy as np import pytest from pandas.compat import is_platform_windows import pandas.util._test_decorators as td import pandas as pd from pandas import ( NA, DataFrame, MultiIndex, Series, Timestamp, date_range, read_csv, read_html, to_datetime, ) import pandas._testing as tm from pandas.core.arrays import ( ArrowStringArray, StringArray, ) from pandas.io.common import file_path_to_url @pytest.fixture( params=[ "chinese_utf-16.html", "chinese_utf-32.html", "chinese_utf-8.html", "letz_latin1.html", ] ) def html_encoding_file(request, datapath): """Parametrized fixture for HTML encoding test filenames.""" return datapath("io", "data", "html_encoding", request.param) def assert_framelist_equal(list1, list2, *args, **kwargs): assert len(list1) == len(list2), ( "lists are not of equal size " f"len(list1) == {len(list1)}, " f"len(list2) == {len(list2)}" ) msg = "not all list elements are DataFrames" both_frames = all( map( lambda x, y: isinstance(x, DataFrame) and isinstance(y, DataFrame), list1, list2, ) ) assert both_frames, msg for frame_i, frame_j in zip(list1, list2): tm.assert_frame_equal(frame_i, frame_j, *args, **kwargs) assert not frame_i.empty, "frames are both empty" def test_bs4_version_fails(monkeypatch, datapath): bs4 = pytest.importorskip("bs4") pytest.importorskip("html5lib") monkeypatch.setattr(bs4, "__version__", "4.2") with pytest.raises(ImportError, match="Pandas requires version"): read_html(datapath("io", "data", "html", "spam.html"), flavor="bs4") def test_invalid_flavor(): url = "google.com" flavor = "invalid flavor" msg = r"\{" + flavor + r"\} is not a valid set of flavors" with pytest.raises(ValueError, match=msg): read_html(StringIO(url), match="google", flavor=flavor) def test_same_ordering(datapath): pytest.importorskip("bs4") pytest.importorskip("lxml") pytest.importorskip("html5lib") filename = datapath("io", "data", "html", "valid_markup.html") dfs_lxml = read_html(filename, index_col=0, flavor=["lxml"]) dfs_bs4 = read_html(filename, index_col=0, flavor=["bs4"]) assert_framelist_equal(dfs_lxml, dfs_bs4) @pytest.fixture( params=[ pytest.param("bs4", marks=[td.skip_if_no("bs4"), td.skip_if_no("html5lib")]), pytest.param("lxml", marks=td.skip_if_no("lxml")), ], ) def flavor_read_html(request): return partial(read_html, flavor=request.param) class TestReadHtml: def test_literal_html_deprecation(self, flavor_read_html): # GH 53785 msg = ( "Passing literal html to 'read_html' is deprecated and " "will be removed in a future version. To read from a " "literal string, wrap it in a 'StringIO' object." ) with tm.assert_produces_warning(FutureWarning, match=msg): flavor_read_html( """
A B
1 2
3 4
""" ) @pytest.fixture def spam_data(self, datapath): return datapath("io", "data", "html", "spam.html") @pytest.fixture def banklist_data(self, datapath): return datapath("io", "data", "html", "banklist.html") def test_to_html_compat(self, flavor_read_html): df = ( DataFrame( np.random.default_rng(2).random((4, 3)), columns=pd.Index(list("abc"), dtype=object), ) # pylint: disable-next=consider-using-f-string .map("{:.3f}".format).astype(float) ) out = df.to_html() res = flavor_read_html( StringIO(out), attrs={"class": "dataframe"}, index_col=0 )[0] tm.assert_frame_equal(res, df) def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): # GH#50286 df = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), "b": Series([1, 2, 3], dtype="Int64"), "c": Series([1.5, np.nan, 2.5], dtype="Float64"), "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": [True, False, None], "f": [True, False, True], "g": ["a", "b", "c"], "h": ["a", "b", None], } ) if string_storage == "python": string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) elif dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") from pandas.arrays import ArrowExtensionArray string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) else: pa = pytest.importorskip("pyarrow") string_array = ArrowStringArray(pa.array(["a", "b", "c"])) string_array_na = ArrowStringArray(pa.array(["a", "b", None])) out = df.to_html(index=False) with pd.option_context("mode.string_storage", string_storage): result = flavor_read_html(StringIO(out), dtype_backend=dtype_backend)[0] expected = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), "b": Series([1, 2, 3], dtype="Int64"), "c": Series([1.5, np.nan, 2.5], dtype="Float64"), "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), "g": string_array, "h": string_array_na, } ) if dtype_backend == "pyarrow": import pyarrow as pa from pandas.arrays import ArrowExtensionArray expected = DataFrame( { col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True)) for col in expected.columns } ) tm.assert_frame_equal(result, expected) @pytest.mark.network @pytest.mark.single_cpu def test_banklist_url(self, httpserver, banklist_data, flavor_read_html): with open(banklist_data, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) df1 = flavor_read_html( # lxml cannot find attrs leave out for now httpserver.url, match="First Federal Bank of Florida", # attrs={"class": "dataTable"} ) # lxml cannot find attrs leave out for now df2 = flavor_read_html( httpserver.url, match="Metcalf Bank", ) # attrs={"class": "dataTable"}) assert_framelist_equal(df1, df2) @pytest.mark.network @pytest.mark.single_cpu def test_spam_url(self, httpserver, spam_data, flavor_read_html): with open(spam_data, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) df1 = flavor_read_html(httpserver.url, match=".*Water.*") df2 = flavor_read_html(httpserver.url, match="Unit") assert_framelist_equal(df1, df2) @pytest.mark.slow def test_banklist(self, banklist_data, flavor_read_html): df1 = flavor_read_html( banklist_data, match=".*Florida.*", attrs={"id": "table"} ) df2 = flavor_read_html( banklist_data, match="Metcalf Bank", attrs={"id": "table"} ) assert_framelist_equal(df1, df2) def test_spam(self, spam_data, flavor_read_html): df1 = flavor_read_html(spam_data, match=".*Water.*") df2 = flavor_read_html(spam_data, match="Unit") assert_framelist_equal(df1, df2) assert df1[0].iloc[0, 0] == "Proximates" assert df1[0].columns[0] == "Nutrient" def test_spam_no_match(self, spam_data, flavor_read_html): dfs = flavor_read_html(spam_data) for df in dfs: assert isinstance(df, DataFrame) def test_banklist_no_match(self, banklist_data, flavor_read_html): dfs = flavor_read_html(banklist_data, attrs={"id": "table"}) for df in dfs: assert isinstance(df, DataFrame) def test_spam_header(self, spam_data, flavor_read_html): df = flavor_read_html(spam_data, match=".*Water.*", header=2)[0] assert df.columns[0] == "Proximates" assert not df.empty def test_skiprows_int(self, spam_data, flavor_read_html): df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=1) df2 = flavor_read_html(spam_data, match="Unit", skiprows=1) assert_framelist_equal(df1, df2) def test_skiprows_range(self, spam_data, flavor_read_html): df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=range(2)) df2 = flavor_read_html(spam_data, match="Unit", skiprows=range(2)) assert_framelist_equal(df1, df2) def test_skiprows_list(self, spam_data, flavor_read_html): df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=[1, 2]) df2 = flavor_read_html(spam_data, match="Unit", skiprows=[2, 1]) assert_framelist_equal(df1, df2) def test_skiprows_set(self, spam_data, flavor_read_html): df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows={1, 2}) df2 = flavor_read_html(spam_data, match="Unit", skiprows={2, 1}) assert_framelist_equal(df1, df2) def test_skiprows_slice(self, spam_data, flavor_read_html): df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=1) df2 = flavor_read_html(spam_data, match="Unit", skiprows=1) assert_framelist_equal(df1, df2) def test_skiprows_slice_short(self, spam_data, flavor_read_html): df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=slice(2)) df2 = flavor_read_html(spam_data, match="Unit", skiprows=slice(2)) assert_framelist_equal(df1, df2) def test_skiprows_slice_long(self, spam_data, flavor_read_html): df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=slice(2, 5)) df2 = flavor_read_html(spam_data, match="Unit", skiprows=slice(4, 1, -1)) assert_framelist_equal(df1, df2) def test_skiprows_ndarray(self, spam_data, flavor_read_html): df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=np.arange(2)) df2 = flavor_read_html(spam_data, match="Unit", skiprows=np.arange(2)) assert_framelist_equal(df1, df2) def test_skiprows_invalid(self, spam_data, flavor_read_html): with pytest.raises(TypeError, match=("is not a valid type for skipping rows")): flavor_read_html(spam_data, match=".*Water.*", skiprows="asdf") def test_index(self, spam_data, flavor_read_html): df1 = flavor_read_html(spam_data, match=".*Water.*", index_col=0) df2 = flavor_read_html(spam_data, match="Unit", index_col=0) assert_framelist_equal(df1, df2) def test_header_and_index_no_types(self, spam_data, flavor_read_html): df1 = flavor_read_html(spam_data, match=".*Water.*", header=1, index_col=0) df2 = flavor_read_html(spam_data, match="Unit", header=1, index_col=0) assert_framelist_equal(df1, df2) def test_header_and_index_with_types(self, spam_data, flavor_read_html): df1 = flavor_read_html(spam_data, match=".*Water.*", header=1, index_col=0) df2 = flavor_read_html(spam_data, match="Unit", header=1, index_col=0) assert_framelist_equal(df1, df2) def test_infer_types(self, spam_data, flavor_read_html): # 10892 infer_types removed df1 = flavor_read_html(spam_data, match=".*Water.*", index_col=0) df2 = flavor_read_html(spam_data, match="Unit", index_col=0) assert_framelist_equal(df1, df2) def test_string_io(self, spam_data, flavor_read_html): with open(spam_data, encoding="UTF-8") as f: data1 = StringIO(f.read()) with open(spam_data, encoding="UTF-8") as f: data2 = StringIO(f.read()) df1 = flavor_read_html(data1, match=".*Water.*") df2 = flavor_read_html(data2, match="Unit") assert_framelist_equal(df1, df2) def test_string(self, spam_data, flavor_read_html): with open(spam_data, encoding="UTF-8") as f: data = f.read() df1 = flavor_read_html(StringIO(data), match=".*Water.*") df2 = flavor_read_html(StringIO(data), match="Unit") assert_framelist_equal(df1, df2) def test_file_like(self, spam_data, flavor_read_html): with open(spam_data, encoding="UTF-8") as f: df1 = flavor_read_html(f, match=".*Water.*") with open(spam_data, encoding="UTF-8") as f: df2 = flavor_read_html(f, match="Unit") assert_framelist_equal(df1, df2) @pytest.mark.network @pytest.mark.single_cpu def test_bad_url_protocol(self, httpserver, flavor_read_html): httpserver.serve_content("urlopen error unknown url type: git", code=404) with pytest.raises(URLError, match="urlopen error unknown url type: git"): flavor_read_html("git://github.com", match=".*Water.*") @pytest.mark.slow @pytest.mark.network @pytest.mark.single_cpu def test_invalid_url(self, httpserver, flavor_read_html): httpserver.serve_content("Name or service not known", code=404) with pytest.raises((URLError, ValueError), match="HTTP Error 404: NOT FOUND"): flavor_read_html(httpserver.url, match=".*Water.*") @pytest.mark.slow def test_file_url(self, banklist_data, flavor_read_html): url = banklist_data dfs = flavor_read_html( file_path_to_url(os.path.abspath(url)), match="First", attrs={"id": "table"} ) assert isinstance(dfs, list) for df in dfs: assert isinstance(df, DataFrame) @pytest.mark.slow def test_invalid_table_attrs(self, banklist_data, flavor_read_html): url = banklist_data with pytest.raises(ValueError, match="No tables found"): flavor_read_html( url, match="First Federal Bank of Florida", attrs={"id": "tasdfable"} ) @pytest.mark.slow def test_multiindex_header(self, banklist_data, flavor_read_html): df = flavor_read_html( banklist_data, match="Metcalf", attrs={"id": "table"}, header=[0, 1] )[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow def test_multiindex_index(self, banklist_data, flavor_read_html): df = flavor_read_html( banklist_data, match="Metcalf", attrs={"id": "table"}, index_col=[0, 1] )[0] assert isinstance(df.index, MultiIndex) @pytest.mark.slow def test_multiindex_header_index(self, banklist_data, flavor_read_html): df = flavor_read_html( banklist_data, match="Metcalf", attrs={"id": "table"}, header=[0, 1], index_col=[0, 1], )[0] assert isinstance(df.columns, MultiIndex) assert isinstance(df.index, MultiIndex) @pytest.mark.slow def test_multiindex_header_skiprows_tuples(self, banklist_data, flavor_read_html): df = flavor_read_html( banklist_data, match="Metcalf", attrs={"id": "table"}, header=[0, 1], skiprows=1, )[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow def test_multiindex_header_skiprows(self, banklist_data, flavor_read_html): df = flavor_read_html( banklist_data, match="Metcalf", attrs={"id": "table"}, header=[0, 1], skiprows=1, )[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow def test_multiindex_header_index_skiprows(self, banklist_data, flavor_read_html): df = flavor_read_html( banklist_data, match="Metcalf", attrs={"id": "table"}, header=[0, 1], index_col=[0, 1], skiprows=1, )[0] assert isinstance(df.index, MultiIndex) assert isinstance(df.columns, MultiIndex) @pytest.mark.slow def test_regex_idempotency(self, banklist_data, flavor_read_html): url = banklist_data dfs = flavor_read_html( file_path_to_url(os.path.abspath(url)), match=re.compile(re.compile("Florida")), attrs={"id": "table"}, ) assert isinstance(dfs, list) for df in dfs: assert isinstance(df, DataFrame) def test_negative_skiprows(self, spam_data, flavor_read_html): msg = r"\(you passed a negative value\)" with pytest.raises(ValueError, match=msg): flavor_read_html(spam_data, match="Water", skiprows=-1) @pytest.fixture def python_docs(self): return """

Indices and tables:

""" # noqa: E501 @pytest.mark.network @pytest.mark.single_cpu def test_multiple_matches(self, python_docs, httpserver, flavor_read_html): httpserver.serve_content(content=python_docs) dfs = flavor_read_html(httpserver.url, match="Python") assert len(dfs) > 1 @pytest.mark.network @pytest.mark.single_cpu def test_python_docs_table(self, python_docs, httpserver, flavor_read_html): httpserver.serve_content(content=python_docs) dfs = flavor_read_html(httpserver.url, match="Python") zz = [df.iloc[0, 0][0:4] for df in dfs] assert sorted(zz) == ["Pyth", "What"] def test_empty_tables(self, flavor_read_html): """ Make sure that read_html ignores empty tables. """ html = """
A B
1 2
""" result = flavor_read_html(StringIO(html)) assert len(result) == 1 def test_multiple_tbody(self, flavor_read_html): # GH-20690 # Read all tbody tags within a single table. result = flavor_read_html( StringIO( """
A B
1 2
3 4
""" ) )[0] expected = DataFrame(data=[[1, 2], [3, 4]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) def test_header_and_one_column(self, flavor_read_html): """ Don't fail with bs4 when there is a header and only one column as described in issue #9178 """ result = flavor_read_html( StringIO( """
Header
first
""" ) )[0] expected = DataFrame(data={"Header": "first"}, index=[0]) tm.assert_frame_equal(result, expected) def test_thead_without_tr(self, flavor_read_html): """ Ensure parser adds within on malformed HTML. """ result = flavor_read_html( StringIO( """
Country Municipality Year
Ukraine Odessa 1944
""" ) )[0] expected = DataFrame( data=[["Ukraine", "Odessa", 1944]], columns=["Country", "Municipality", "Year"], ) tm.assert_frame_equal(result, expected) def test_tfoot_read(self, flavor_read_html): """ Make sure that read_html reads tfoot, containing td or th. Ignores empty tfoot """ data_template = """ {footer}
A B
bodyA bodyB
""" expected1 = DataFrame(data=[["bodyA", "bodyB"]], columns=["A", "B"]) expected2 = DataFrame( data=[["bodyA", "bodyB"], ["footA", "footB"]], columns=["A", "B"] ) data1 = data_template.format(footer="") data2 = data_template.format(footer="footAfootB") result1 = flavor_read_html(StringIO(data1))[0] result2 = flavor_read_html(StringIO(data2))[0] tm.assert_frame_equal(result1, expected1) tm.assert_frame_equal(result2, expected2) def test_parse_header_of_non_string_column(self, flavor_read_html): # GH5048: if header is specified explicitly, an int column should be # parsed as int while its header is parsed as str result = flavor_read_html( StringIO( """
S I
text 1944
""" ), header=0, )[0] expected = DataFrame([["text", 1944]], columns=("S", "I")) tm.assert_frame_equal(result, expected) @pytest.mark.slow def test_banklist_header(self, banklist_data, datapath, flavor_read_html): from pandas.io.html import _remove_whitespace def try_remove_ws(x): try: return _remove_whitespace(x) except AttributeError: return x df = flavor_read_html(banklist_data, match="Metcalf", attrs={"id": "table"})[0] ground_truth = read_csv( datapath("io", "data", "csv", "banklist.csv"), converters={"Updated Date": Timestamp, "Closing Date": Timestamp}, ) assert df.shape == ground_truth.shape old = [ "First Vietnamese American Bank In Vietnamese", "Westernbank Puerto Rico En Espanol", "R-G Premier Bank of Puerto Rico En Espanol", "Eurobank En Espanol", "Sanderson State Bank En Espanol", "Washington Mutual Bank (Including its subsidiary Washington " "Mutual Bank FSB)", "Silver State Bank En Espanol", "AmTrade International Bank En Espanol", "Hamilton Bank, NA En Espanol", "The Citizens Savings Bank Pioneer Community Bank, Inc.", ] new = [ "First Vietnamese American Bank", "Westernbank Puerto Rico", "R-G Premier Bank of Puerto Rico", "Eurobank", "Sanderson State Bank", "Washington Mutual Bank", "Silver State Bank", "AmTrade International Bank", "Hamilton Bank, NA", "The Citizens Savings Bank", ] dfnew = df.map(try_remove_ws).replace(old, new) gtnew = ground_truth.map(try_remove_ws) converted = dfnew date_cols = ["Closing Date", "Updated Date"] converted[date_cols] = converted[date_cols].apply(to_datetime) tm.assert_frame_equal(converted, gtnew) @pytest.mark.slow def test_gold_canyon(self, banklist_data, flavor_read_html): gc = "Gold Canyon" with open(banklist_data, encoding="utf-8") as f: raw_text = f.read() assert gc in raw_text df = flavor_read_html( banklist_data, match="Gold Canyon", attrs={"id": "table"} )[0] assert gc in df.to_string() def test_different_number_of_cols(self, flavor_read_html): expected = flavor_read_html( StringIO( """
C_l0_g0 C_l0_g1 C_l0_g2 C_l0_g3 C_l0_g4
R_l0_g0 0.763 0.233 nan nan nan
R_l0_g1 0.244 0.285 0.392 0.137 0.222
""" ), index_col=0, )[0] result = flavor_read_html( StringIO( """
C_l0_g0 C_l0_g1 C_l0_g2 C_l0_g3 C_l0_g4
R_l0_g0 0.763 0.233
R_l0_g1 0.244 0.285 0.392 0.137 0.222
""" ), index_col=0, )[0] tm.assert_frame_equal(result, expected) def test_colspan_rowspan_1(self, flavor_read_html): # GH17054 result = flavor_read_html( StringIO( """
A B C
a b c
""" ) )[0] expected = DataFrame([["a", "b", "c"]], columns=["A", "B", "C"]) tm.assert_frame_equal(result, expected) def test_colspan_rowspan_copy_values(self, flavor_read_html): # GH17054 # In ASCII, with lowercase letters being copies: # # X x Y Z W # A B b z C result = flavor_read_html( StringIO( """
X Y Z W
A B C
""" ), header=0, )[0] expected = DataFrame( data=[["A", "B", "B", "Z", "C"]], columns=["X", "X.1", "Y", "Z", "W"] ) tm.assert_frame_equal(result, expected) def test_colspan_rowspan_both_not_1(self, flavor_read_html): # GH17054 # In ASCII, with lowercase letters being copies: # # A B b b C # a b b b D result = flavor_read_html( StringIO( """
A B C
D
""" ), header=0, )[0] expected = DataFrame( data=[["A", "B", "B", "B", "D"]], columns=["A", "B", "B.1", "B.2", "C"] ) tm.assert_frame_equal(result, expected) def test_rowspan_at_end_of_row(self, flavor_read_html): # GH17054 # In ASCII, with lowercase letters being copies: # # A B # C b result = flavor_read_html( StringIO( """
A B
C
""" ), header=0, )[0] expected = DataFrame(data=[["C", "B"]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) def test_rowspan_only_rows(self, flavor_read_html): # GH17054 result = flavor_read_html( StringIO( """
A B
""" ), header=0, )[0] expected = DataFrame(data=[["A", "B"], ["A", "B"]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) def test_header_inferred_from_rows_with_only_th(self, flavor_read_html): # GH17054 result = flavor_read_html( StringIO( """
A B
a b
1 2
""" ) )[0] columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]]) expected = DataFrame(data=[[1, 2]], columns=columns) tm.assert_frame_equal(result, expected) def test_parse_dates_list(self, flavor_read_html): df = DataFrame({"date": date_range("1/1/2001", periods=10)}) expected = df.to_html() res = flavor_read_html(StringIO(expected), parse_dates=[1], index_col=0) tm.assert_frame_equal(df, res[0]) res = flavor_read_html(StringIO(expected), parse_dates=["date"], index_col=0) tm.assert_frame_equal(df, res[0]) def test_parse_dates_combine(self, flavor_read_html): raw_dates = Series(date_range("1/1/2001", periods=10)) df = DataFrame( { "date": raw_dates.map(lambda x: str(x.date())), "time": raw_dates.map(lambda x: str(x.time())), } ) res = flavor_read_html( StringIO(df.to_html()), parse_dates={"datetime": [1, 2]}, index_col=1 ) newdf = DataFrame({"datetime": raw_dates}) tm.assert_frame_equal(newdf, res[0]) def test_wikipedia_states_table(self, datapath, flavor_read_html): data = datapath("io", "data", "html", "wikipedia_states.html") assert os.path.isfile(data), f"{repr(data)} is not a file" assert os.path.getsize(data), f"{repr(data)} is an empty file" result = flavor_read_html(data, match="Arizona", header=1)[0] assert result.shape == (60, 12) assert "Unnamed" in result.columns[-1] assert result["sq mi"].dtype == np.dtype("float64") assert np.allclose(result.loc[0, "sq mi"], 665384.04) def test_wikipedia_states_multiindex(self, datapath, flavor_read_html): data = datapath("io", "data", "html", "wikipedia_states.html") result = flavor_read_html(data, match="Arizona", index_col=0)[0] assert result.shape == (60, 11) assert "Unnamed" in result.columns[-1][1] assert result.columns.nlevels == 2 assert np.allclose(result.loc["Alaska", ("Total area[2]", "sq mi")], 665384.04) def test_parser_error_on_empty_header_row(self, flavor_read_html): result = flavor_read_html( StringIO( """
AB
ab
""" ), header=[0, 1], ) expected = DataFrame( [["a", "b"]], columns=MultiIndex.from_tuples( [("Unnamed: 0_level_0", "A"), ("Unnamed: 1_level_0", "B")] ), ) tm.assert_frame_equal(result[0], expected) def test_decimal_rows(self, flavor_read_html): # GH 12907 result = flavor_read_html( StringIO( """
Header
1100#101
""" ), decimal="#", )[0] expected = DataFrame(data={"Header": 1100.101}, index=[0]) assert result["Header"].dtype == np.dtype("float64") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("arg", [True, False]) def test_bool_header_arg(self, spam_data, arg, flavor_read_html): # GH 6114 msg = re.escape( "Passing a bool to header is invalid. Use header=None for no header or " "header=int or list-like of ints to specify the row(s) making up the " "column names" ) with pytest.raises(TypeError, match=msg): flavor_read_html(spam_data, header=arg) def test_converters(self, flavor_read_html): # GH 13461 result = flavor_read_html( StringIO( """
a
0.763
0.244
""" ), converters={"a": str}, )[0] expected = DataFrame({"a": ["0.763", "0.244"]}) tm.assert_frame_equal(result, expected) def test_na_values(self, flavor_read_html): # GH 13461 result = flavor_read_html( StringIO( """
a
0.763
0.244
""" ), na_values=[0.244], )[0] expected = DataFrame({"a": [0.763, np.nan]}) tm.assert_frame_equal(result, expected) def test_keep_default_na(self, flavor_read_html): html_data = """
a
N/A
NA
""" expected_df = DataFrame({"a": ["N/A", "NA"]}) html_df = flavor_read_html(StringIO(html_data), keep_default_na=False)[0] tm.assert_frame_equal(expected_df, html_df) expected_df = DataFrame({"a": [np.nan, np.nan]}) html_df = flavor_read_html(StringIO(html_data), keep_default_na=True)[0] tm.assert_frame_equal(expected_df, html_df) def test_preserve_empty_rows(self, flavor_read_html): result = flavor_read_html( StringIO( """
A B
a b
""" ) )[0] expected = DataFrame(data=[["a", "b"], [np.nan, np.nan]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) def test_ignore_empty_rows_when_inferring_header(self, flavor_read_html): result = flavor_read_html( StringIO( """
AB
ab
12
""" ) )[0] columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]]) expected = DataFrame(data=[[1, 2]], columns=columns) tm.assert_frame_equal(result, expected) def test_multiple_header_rows(self, flavor_read_html): # Issue #13434 expected_df = DataFrame( data=[("Hillary", 68, "D"), ("Bernie", 74, "D"), ("Donald", 69, "R")] ) expected_df.columns = [ ["Unnamed: 0_level_0", "Age", "Party"], ["Name", "Unnamed: 1_level_1", "Unnamed: 2_level_1"], ] html = expected_df.to_html(index=False) html_df = flavor_read_html(StringIO(html))[0] tm.assert_frame_equal(expected_df, html_df) def test_works_on_valid_markup(self, datapath, flavor_read_html): filename = datapath("io", "data", "html", "valid_markup.html") dfs = flavor_read_html(filename, index_col=0) assert isinstance(dfs, list) assert isinstance(dfs[0], DataFrame) @pytest.mark.slow def test_fallback_success(self, datapath, flavor_read_html): banklist_data = datapath("io", "data", "html", "banklist.html") flavor_read_html(banklist_data, match=".*Water.*", flavor=["lxml", "html5lib"]) def test_to_html_timestamp(self): rng = date_range("2000-01-01", periods=10) df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)), index=rng) result = df.to_html() assert "2000-01-01" in result def test_to_html_borderless(self): df = DataFrame([{"A": 1, "B": 2}]) out_border_default = df.to_html() out_border_true = df.to_html(border=True) out_border_explicit_default = df.to_html(border=1) out_border_nondefault = df.to_html(border=2) out_border_zero = df.to_html(border=0) out_border_false = df.to_html(border=False) assert ' border="1"' in out_border_default assert out_border_true == out_border_default assert out_border_default == out_border_explicit_default assert out_border_default != out_border_nondefault assert ' border="2"' in out_border_nondefault assert ' border="0"' not in out_border_zero assert " border" not in out_border_false assert out_border_zero == out_border_false @pytest.mark.parametrize( "displayed_only,exp0,exp1", [ (True, DataFrame(["foo"]), None), (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"])), ], ) def test_displayed_only(self, displayed_only, exp0, exp1, flavor_read_html): # GH 20027 data = """
foo bar baz qux
foo
""" dfs = flavor_read_html(StringIO(data), displayed_only=displayed_only) tm.assert_frame_equal(dfs[0], exp0) if exp1 is not None: tm.assert_frame_equal(dfs[1], exp1) else: assert len(dfs) == 1 # Should not parse hidden table @pytest.mark.parametrize("displayed_only", [True, False]) def test_displayed_only_with_many_elements(self, displayed_only, flavor_read_html): html_table = """
A B
1 2
4 5
""" result = flavor_read_html(StringIO(html_table), displayed_only=displayed_only)[ 0 ] expected = DataFrame({"A": [1, 4], "B": [2, 5]}) tm.assert_frame_equal(result, expected) @pytest.mark.filterwarnings( "ignore:You provided Unicode markup but also provided a value for " "from_encoding.*:UserWarning" ) def test_encode(self, html_encoding_file, flavor_read_html): base_path = os.path.basename(html_encoding_file) root = os.path.splitext(base_path)[0] _, encoding = root.split("_") try: with open(html_encoding_file, "rb") as fobj: from_string = flavor_read_html( fobj.read(), encoding=encoding, index_col=0 ).pop() with open(html_encoding_file, "rb") as fobj: from_file_like = flavor_read_html( BytesIO(fobj.read()), encoding=encoding, index_col=0 ).pop() from_filename = flavor_read_html( html_encoding_file, encoding=encoding, index_col=0 ).pop() tm.assert_frame_equal(from_string, from_file_like) tm.assert_frame_equal(from_string, from_filename) except Exception: # seems utf-16/32 fail on windows if is_platform_windows(): if "16" in encoding or "32" in encoding: pytest.skip() raise def test_parse_failure_unseekable(self, flavor_read_html): # Issue #17975 if flavor_read_html.keywords.get("flavor") == "lxml": pytest.skip("Not applicable for lxml") class UnseekableStringIO(StringIO): def seekable(self): return False bad = UnseekableStringIO( """
spameggs
""" ) assert flavor_read_html(bad) with pytest.raises(ValueError, match="passed a non-rewindable file object"): flavor_read_html(bad) def test_parse_failure_rewinds(self, flavor_read_html): # Issue #17975 class MockFile: def __init__(self, data) -> None: self.data = data self.at_end = False def read(self, size=None): data = "" if self.at_end else self.data self.at_end = True return data def seek(self, offset): self.at_end = False def seekable(self): return True # GH 49036 pylint checks for presence of __next__ for iterators def __next__(self): ... def __iter__(self) -> Iterator: # `is_file_like` depends on the presence of # the __iter__ attribute. return self good = MockFile("
spam
eggs
") bad = MockFile("
spameggs
") assert flavor_read_html(good) assert flavor_read_html(bad) @pytest.mark.slow @pytest.mark.single_cpu def test_importcheck_thread_safety(self, datapath, flavor_read_html): # see gh-16928 class ErrorThread(threading.Thread): def run(self): try: super().run() except Exception as err: self.err = err else: self.err = None filename = datapath("io", "data", "html", "valid_markup.html") helper_thread1 = ErrorThread(target=flavor_read_html, args=(filename,)) helper_thread2 = ErrorThread(target=flavor_read_html, args=(filename,)) helper_thread1.start() helper_thread2.start() while helper_thread1.is_alive() or helper_thread2.is_alive(): pass assert None is helper_thread1.err is helper_thread2.err def test_parse_path_object(self, datapath, flavor_read_html): # GH 37705 file_path_string = datapath("io", "data", "html", "spam.html") file_path = Path(file_path_string) df1 = flavor_read_html(file_path_string)[0] df2 = flavor_read_html(file_path)[0] tm.assert_frame_equal(df1, df2) def test_parse_br_as_space(self, flavor_read_html): # GH 29528: pd.read_html() convert
to space result = flavor_read_html( StringIO( """
A
word1
word2
""" ) )[0] expected = DataFrame(data=[["word1 word2"]], columns=["A"]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("arg", ["all", "body", "header", "footer"]) def test_extract_links(self, arg, flavor_read_html): gh_13141_data = """
HTTP FTP Linkless
Wikipedia SURROUNDING Debian TEXT Linkless
Footer Multiple links: Only first captured.
""" gh_13141_expected = { "head_ignore": ["HTTP", "FTP", "Linkless"], "head_extract": [ ("HTTP", None), ("FTP", None), ("Linkless", "https://en.wiktionary.org/wiki/linkless"), ], "body_ignore": ["Wikipedia", "SURROUNDING Debian TEXT", "Linkless"], "body_extract": [ ("Wikipedia", "https://en.wikipedia.org/"), ("SURROUNDING Debian TEXT", "ftp://ftp.us.debian.org/"), ("Linkless", None), ], "footer_ignore": [ "Footer", "Multiple links: Only first captured.", None, ], "footer_extract": [ ("Footer", "https://en.wikipedia.org/wiki/Page_footer"), ("Multiple links: Only first captured.", "1"), None, ], } data_exp = gh_13141_expected["body_ignore"] foot_exp = gh_13141_expected["footer_ignore"] head_exp = gh_13141_expected["head_ignore"] if arg == "all": data_exp = gh_13141_expected["body_extract"] foot_exp = gh_13141_expected["footer_extract"] head_exp = gh_13141_expected["head_extract"] elif arg == "body": data_exp = gh_13141_expected["body_extract"] elif arg == "footer": foot_exp = gh_13141_expected["footer_extract"] elif arg == "header": head_exp = gh_13141_expected["head_extract"] result = flavor_read_html(StringIO(gh_13141_data), extract_links=arg)[0] expected = DataFrame([data_exp, foot_exp], columns=head_exp) expected = expected.fillna(np.nan) tm.assert_frame_equal(result, expected) def test_extract_links_bad(self, spam_data): msg = ( "`extract_links` must be one of " '{None, "header", "footer", "body", "all"}, got "incorrect"' ) with pytest.raises(ValueError, match=msg): read_html(spam_data, extract_links="incorrect") def test_extract_links_all_no_header(self, flavor_read_html): # GH 48316 data = """
Google.com
""" result = flavor_read_html(StringIO(data), extract_links="all")[0] expected = DataFrame([[("Google.com", "https://google.com")]]) tm.assert_frame_equal(result, expected) def test_invalid_dtype_backend(self): msg = ( "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." ) with pytest.raises(ValueError, match=msg): read_html("test", dtype_backend="numpy") def test_style_tag(self, flavor_read_html): # GH 48316 data = """
A B
A1 B1
A2 B2
""" result = flavor_read_html(StringIO(data))[0] expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"]) tm.assert_frame_equal(result, expected)