""" :mod:`pandas.io.html` is a module containing functionality for dealing with HTML IO. """ from __future__ import annotations from collections import abc import numbers import re from re import Pattern from typing import ( TYPE_CHECKING, Literal, cast, ) import warnings from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( AbstractMethodError, EmptyDataError, ) from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import is_list_like from pandas import isna from pandas.core.indexes.base import Index from pandas.core.indexes.multi import MultiIndex from pandas.core.series import Series from pandas.core.shared_docs import _shared_docs from pandas.io.common import ( file_exists, get_handle, is_file_like, is_fsspec_url, is_url, stringify_path, validate_header_arg, ) from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import TextParser if TYPE_CHECKING: from collections.abc import ( Iterable, Sequence, ) from pandas._typing import ( BaseBuffer, DtypeBackend, FilePath, HTMLFlavors, ReadBuffer, StorageOptions, ) from pandas import DataFrame ############# # READ HTML # ############# _RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}") def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str: """ Replace extra whitespace inside of a string with a single space. Parameters ---------- s : str or unicode The string from which to remove extra whitespace. regex : re.Pattern The regular expression to use to remove extra whitespace. Returns ------- subd : str or unicode `s` with all extra whitespace replaced with a single space. """ return regex.sub(" ", s.strip()) def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]: """ Get an iterator given an integer, slice or container. Parameters ---------- skiprows : int, slice, container The iterator to use to skip rows; can also be a slice. Raises ------ TypeError * If `skiprows` is not a slice, integer, or Container Returns ------- it : iterable A proper iterator to use to skip rows of a DataFrame. """ if isinstance(skiprows, slice): start, step = skiprows.start or 0, skiprows.step or 1 return list(range(start, skiprows.stop, step)) elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows): return cast("int | Sequence[int]", skiprows) elif skiprows is None: return 0 raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows") def _read( obj: FilePath | BaseBuffer, encoding: str | None, storage_options: StorageOptions | None, ) -> str | bytes: """ Try to read from a url, file or string. Parameters ---------- obj : str, unicode, path object, or file-like object Returns ------- raw_text : str """ text: str | bytes if ( is_url(obj) or hasattr(obj, "read") or (isinstance(obj, str) and file_exists(obj)) ): with get_handle( obj, "r", encoding=encoding, storage_options=storage_options ) as handles: text = handles.handle.read() elif isinstance(obj, (str, bytes)): text = obj else: raise TypeError(f"Cannot read object of type '{type(obj).__name__}'") return text class _HtmlFrameParser: """ Base class for parsers that parse HTML into DataFrames. Parameters ---------- io : str or file-like This can be either a string of raw HTML, a valid URL using the HTTP, FTP, or FILE protocols or a file-like object. match : str or regex The text to match in the document. attrs : dict List of HTML