from __future__ import annotations
from io import (
BytesIO,
StringIO,
)
from lzma import LZMAError
import os
from tarfile import ReadError
from urllib.error import HTTPError
from xml.etree.ElementTree import ParseError
from zipfile import BadZipFile
import numpy as np
import pytest
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
EmptyDataError,
ParserError,
)
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
NA,
DataFrame,
Series,
)
import pandas._testing as tm
from pandas.core.arrays import (
ArrowStringArray,
StringArray,
)
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
from pandas.io.common import get_handle
from pandas.io.xml import read_xml
# CHECK LIST
# [x] - ValueError: "Values for parser can only be lxml or etree."
# etree
# [X] - ImportError: "lxml not found, please install or use the etree parser."
# [X] - TypeError: "expected str, bytes or os.PathLike object, not NoneType"
# [X] - ValueError: "Either element or attributes can be parsed not both."
# [X] - ValueError: "xpath does not return any nodes..."
# [X] - SyntaxError: "You have used an incorrect or unsupported XPath"
# [X] - ValueError: "names does not match length of child elements in xpath."
# [X] - TypeError: "...is not a valid type for names"
# [X] - ValueError: "To use stylesheet, you need lxml installed..."
# [] - URLError: (GENERAL ERROR WITH HTTPError AS SUBCLASS)
# [X] - HTTPError: "HTTP Error 404: Not Found"
# [] - OSError: (GENERAL ERROR WITH FileNotFoundError AS SUBCLASS)
# [X] - FileNotFoundError: "No such file or directory"
# [] - ParseError (FAILSAFE CATCH ALL FOR VERY COMPLEX XML)
# [X] - UnicodeDecodeError: "'utf-8' codec can't decode byte 0xe9..."
# [X] - UnicodeError: "UTF-16 stream does not start with BOM"
# [X] - BadZipFile: "File is not a zip file"
# [X] - OSError: "Invalid data stream"
# [X] - LZMAError: "Input format not supported by decoder"
# [X] - ValueError: "Unrecognized compression type"
# [X] - PermissionError: "Forbidden"
# lxml
# [X] - ValueError: "Either element or attributes can be parsed not both."
# [X] - AttributeError: "__enter__"
# [X] - XSLTApplyError: "Cannot resolve URI"
# [X] - XSLTParseError: "document is not a stylesheet"
# [X] - ValueError: "xpath does not return any nodes."
# [X] - XPathEvalError: "Invalid expression"
# [] - XPathSyntaxError: (OLD VERSION IN lxml FOR XPATH ERRORS)
# [X] - TypeError: "empty namespace prefix is not supported in XPath"
# [X] - ValueError: "names does not match length of child elements in xpath."
# [X] - TypeError: "...is not a valid type for names"
# [X] - LookupError: "unknown encoding"
# [] - URLError: (USUALLY DUE TO NETWORKING)
# [X] - HTTPError: "HTTP Error 404: Not Found"
# [X] - OSError: "failed to load external entity"
# [X] - XMLSyntaxError: "Start tag expected, '<' not found"
# [] - ParserError: (FAILSAFE CATCH ALL FOR VERY COMPLEX XML)
# [X] - ValueError: "Values for parser can only be lxml or etree."
# [X] - UnicodeDecodeError: "'utf-8' codec can't decode byte 0xe9..."
# [X] - UnicodeError: "UTF-16 stream does not start with BOM"
# [X] - BadZipFile: "File is not a zip file"
# [X] - OSError: "Invalid data stream"
# [X] - LZMAError: "Input format not supported by decoder"
# [X] - ValueError: "Unrecognized compression type"
# [X] - PermissionError: "Forbidden"
# Three-row geometry frame shared by the tests below; the circle row has no
# side count, so the "sides" column carries a NaN.
geom_df = DataFrame(
    data={
        "shape": ["square", "circle", "triangle"],
        "degrees": [360, 360, 180],
        "sides": [4, float("nan"), 3],
    },
)
# Geometry document whose elements all live in the default (un-prefixed)
# "http://example.com" namespace.  The circle row has an empty <sides/> so
# that parsers produce a missing value for it.
xml_default_nmsp = """\
<?xml version='1.0' encoding='utf-8'?>
<data xmlns="http://example.com">
  <row>
    <shape>square</shape>
    <degrees>360</degrees>
    <sides>4</sides>
  </row>
  <row>
    <shape>circle</shape>
    <degrees>360</degrees>
    <sides/>
  </row>
  <row>
    <shape>triangle</shape>
    <degrees>180</degrees>
    <sides>3</sides>
  </row>
</data>"""
# Same geometry document as the default-namespace sample, but every element
# carries an explicit "doc:" prefix bound to "http://example.com".
xml_prefix_nmsp = """\
<?xml version='1.0' encoding='utf-8'?>
<doc:data xmlns:doc="http://example.com">
  <doc:row>
    <doc:shape>square</doc:shape>
    <doc:degrees>360</doc:degrees>
    <doc:sides>4.0</doc:sides>
  </doc:row>
  <doc:row>
    <doc:shape>circle</doc:shape>
    <doc:degrees>360</doc:degrees>
    <doc:sides/>
  </doc:row>
  <doc:row>
    <doc:shape>triangle</doc:shape>
    <doc:degrees>180</doc:degrees>
    <doc:sides>3.0</doc:sides>
  </doc:row>
</doc:data>"""
# Expected frame for the KML rail-line tests: five placemark rows keyed 0-4,
# one column per field, with "coordinates" holding space-separated
# "lon,lat,alt" triples.  It is compared against read_xml output in the
# stylesheet tests further down.
df_kml = DataFrame(
    {
        "id": {
            0: "ID_00001",
            1: "ID_00002",
            2: "ID_00003",
            3: "ID_00004",
            4: "ID_00005",
        },
        "name": {
            0: "Blue Line (Forest Park)",
            1: "Red, Purple Line",
            2: "Red, Purple Line",
            3: "Red, Purple Line",
            4: "Red, Purple Line",
        },
        "styleUrl": {
            0: "#LineStyle01",
            1: "#LineStyle01",
            2: "#LineStyle01",
            3: "#LineStyle01",
            4: "#LineStyle01",
        },
        "extrude": {0: 0, 1: 0, 2: 0, 3: 0, 4: 0},
        "altitudeMode": {
            0: "clampedToGround",
            1: "clampedToGround",
            2: "clampedToGround",
            3: "clampedToGround",
            4: "clampedToGround",
        },
        "coordinates": {
            0: (
                "-87.77678526964958,41.8708863930319,0 "
                "-87.77826234150609,41.87097820122218,0 "
                "-87.78251583439344,41.87130129991005,0 "
                "-87.78418294588424,41.87145055520308,0 "
                "-87.7872369165933,41.8717239119163,0 "
                "-87.79160214925886,41.87210797280065,0"
            ),
            1: (
                "-87.65758750947528,41.96427269188822,0 "
                "-87.65802133507393,41.96581929055245,0 "
                "-87.65819033925305,41.96621846093642,0 "
                "-87.6583189819129,41.96650362897086,0 "
                "-87.65835858701473,41.96669002089185,0 "
                "-87.65838428411853,41.96688150295095,0 "
                "-87.65842208882658,41.96745896091846,0 "
                "-87.65846556843937,41.9683761425439,0 "
                "-87.65849296214573,41.96913893870342,0"
            ),
            2: (
                "-87.65492939166126,41.95377494531437,0 "
                "-87.65557043199591,41.95376544118533,0 "
                "-87.65606302030132,41.95376391658746,0 "
                "-87.65623502146268,41.95377379126367,0 "
                "-87.65634748981634,41.95380103566435,0 "
                "-87.65646537904269,41.95387703994676,0 "
                "-87.65656532461145,41.95396622645799,0 "
                "-87.65664760856414,41.95404201996044,0 "
                "-87.65671750555913,41.95416647054043,0 "
                "-87.65673983607117,41.95429949810849,0 "
                "-87.65673866475777,41.95441024240925,0 "
                "-87.6567690255541,41.95490657227902,0 "
                "-87.65683672482363,41.95692259283837,0 "
                "-87.6568900886376,41.95861070983142,0 "
                "-87.65699865558875,41.96181418669004,0 "
                "-87.65756347177603,41.96397045777844,0 "
                "-87.65758750947528,41.96427269188822,0"
            ),
            3: (
                "-87.65362593118043,41.94742799535678,0 "
                "-87.65363554415794,41.94819886386848,0 "
                "-87.6536456393239,41.95059994675451,0 "
                "-87.65365831235026,41.95108288489359,0 "
                "-87.6536604873874,41.9519954657554,0 "
                "-87.65362592053201,41.95245597302328,0 "
                "-87.65367158496069,41.95311153649393,0 "
                "-87.65368468595476,41.9533202828916,0 "
                "-87.65369271253692,41.95343095587119,0 "
                "-87.65373335834569,41.95351536301472,0 "
                "-87.65378605844126,41.95358212680591,0 "
                "-87.65385067928185,41.95364452823767,0 "
                "-87.6539390793817,41.95370263886964,0 "
                "-87.6540786298351,41.95373403675265,0 "
                "-87.65430648647626,41.9537535411832,0 "
                "-87.65492939166126,41.95377494531437,0"
            ),
            4: (
                "-87.65345391792157,41.94217681262115,0 "
                "-87.65342448305786,41.94237224420864,0 "
                "-87.65339745703922,41.94268217746244,0 "
                "-87.65337753982941,41.94288140770284,0 "
                "-87.65336256753105,41.94317369618263,0 "
                "-87.65338799707138,41.94357253961736,0 "
                "-87.65340240886648,41.94389158188269,0 "
                "-87.65341837392448,41.94406444407721,0 "
                "-87.65342275247338,41.94421065714904,0 "
                "-87.65347469646018,41.94434829382345,0 "
                "-87.65351486483024,41.94447699917548,0 "
                "-87.65353483605053,41.9453896864472,0 "
                "-87.65361975532807,41.94689193720703,0 "
                "-87.65362593118043,41.94742799535678,0"
            ),
        },
    }
)
def test_literal_xml_deprecation():
    """A bare XML string handed to ``read_xml`` must emit a FutureWarning."""
    # GH 53809
    pytest.importorskip("lxml")

    expected_message = (
        "Passing literal xml to 'read_xml' is deprecated and "
        "will be removed in a future version. To read from a "
        "literal string, wrap it in a 'StringIO' object."
    )
    warning_check = tm.assert_produces_warning(FutureWarning, match=expected_message)

    with warning_check:
        read_xml(xml_default_nmsp)
@pytest.fixture(params=["rb", "r"])
def mode(request):
    """Parametrized file-open mode: binary ("rb") and text ("r")."""
    open_mode = request.param
    return open_mode
@pytest.fixture(params=[pytest.param("lxml", marks=td.skip_if_no("lxml")), "etree"])
def parser(request):
    """Parametrized XML backend name; the "lxml" case carries a skip mark."""
    backend = request.param
    return backend
def read_xml_iterparse(data, **kwargs):
    """
    Write ``data`` to a path from ``tm.ensure_clean`` and parse that path.

    All keyword arguments are forwarded unchanged to ``read_xml``; its result
    is returned.
    """
    with tm.ensure_clean() as tmp_path:
        with open(tmp_path, "w", encoding="utf-8") as sink:
            sink.write(data)
        # The file is closed (flushed) before it is read back.
        parsed = read_xml(tmp_path, **kwargs)
        return parsed
def read_xml_iterparse_comp(comp_path, compression_only, **kwargs):
    """
    Copy the text read from a compressed file to a plain file, then parse it.

    ``comp_path`` is opened through ``get_handle`` with the given compression,
    its content is written to a path from ``tm.ensure_clean``, and that path
    is passed to ``read_xml`` together with ``kwargs``.
    """
    with get_handle(
        comp_path, "r", compression=compression_only
    ) as handles, tm.ensure_clean() as tmp_path:
        with open(tmp_path, "w", encoding="utf-8") as sink:
            sink.write(handles.handle.read())
        parsed = read_xml(tmp_path, **kwargs)
        return parsed
# FILE / URL
def test_parser_consistency_file(xml_books):
    """lxml and etree must agree on the books file, via xpath and iterparse."""
    pytest.importorskip("lxml")
    book_fields = ["category", "title", "year", "author", "price"]

    xpath_lxml = read_xml(xml_books, parser="lxml")
    xpath_etree = read_xml(xml_books, parser="etree")
    iter_lxml = read_xml(
        xml_books, parser="lxml", iterparse={"book": list(book_fields)}
    )
    iter_etree = read_xml(
        xml_books, parser="etree", iterparse={"book": list(book_fields)}
    )

    tm.assert_frame_equal(xpath_lxml, xpath_etree)
    tm.assert_frame_equal(xpath_lxml, iter_lxml)
    tm.assert_frame_equal(iter_lxml, iter_etree)
@pytest.mark.network
@pytest.mark.single_cpu
def test_parser_consistency_url(parser, httpserver):
    """xpath and iterparse parsing of the default-namespace sample agree."""
    # The document is registered with the HTTP server, but both reads below
    # go through in-memory buffers rather than ``httpserver.url``.
    httpserver.serve_content(content=xml_default_nmsp)

    text_buffer = StringIO(xml_default_nmsp)
    byte_buffer = BytesIO(xml_default_nmsp.encode())
    row_fields = {"row": ["shape", "degrees", "sides"]}

    via_xpath = read_xml(text_buffer, parser=parser)
    via_iterparse = read_xml(byte_buffer, parser=parser, iterparse=row_fields)

    tm.assert_frame_equal(via_xpath, via_iterparse)
def test_file_like(xml_books, parser, mode):
    """An open file object, text or binary, is accepted as the input."""
    text_encoding = "utf-8" if mode == "r" else None
    with open(xml_books, mode, encoding=text_encoding) as handle:
        result = read_xml(handle, parser=parser)

    expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )
    tm.assert_frame_equal(result, expected)
def test_file_io(xml_books, parser, mode):
    """File content wrapped in BytesIO/StringIO parses to the books frame."""
    text_encoding = "utf-8" if mode == "r" else None
    with open(xml_books, mode, encoding=text_encoding) as handle:
        payload = handle.read()

    # Pick the in-memory buffer type that matches what the file returned.
    if isinstance(payload, bytes):
        buffer = BytesIO(payload)
    else:
        buffer = StringIO(payload)
    result = read_xml(buffer, parser=parser)

    expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )
    tm.assert_frame_equal(result, expected)
def test_file_buffered_reader_string(xml_books, parser, mode):
    """File content read in either mode parses once wrapped in a StringIO."""
    text_encoding = "utf-8" if mode == "r" else None
    with open(xml_books, mode, encoding=text_encoding) as handle:
        source = handle.read()

    # Binary reads are decoded first so both modes end up as a text buffer.
    if mode == "rb":
        source = StringIO(source.decode())
    elif mode == "r":
        source = StringIO(source)

    result = read_xml(source, parser=parser)

    expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )
    tm.assert_frame_equal(result, expected)
def test_file_buffered_reader_no_xml_declaration(xml_books, parser, mode):
    """The books file still parses after its first line has been dropped."""
    text_encoding = "utf-8" if mode == "r" else None
    with open(xml_books, mode, encoding=text_encoding) as handle:
        # Consume and discard the first line before reading the remainder.
        next(handle)
        source = handle.read()

    if mode == "rb":
        source = StringIO(source.decode())
    elif mode == "r":
        source = StringIO(source)

    result = read_xml(source, parser=parser)

    expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )
    tm.assert_frame_equal(result, expected)
def test_string_charset(parser):
    """A document whose root element has a non-ASCII (CJK) name parses."""
    # Single-line literal: one <row> holding c1=1 and c2=2, which is exactly
    # what the expected frame below describes.
    txt = "<中文標籤><row><c1>1</c1><c2>2</c2></row></中文標籤>"

    df_str = read_xml(StringIO(txt), parser=parser)

    df_expected = DataFrame({"c1": 1, "c2": 2}, index=[0])

    tm.assert_frame_equal(df_str, df_expected)
def test_file_charset(xml_doc_ch_utf, parser):
    """A file with CJK element names and text yields the expected frame."""
    result = read_xml(xml_doc_ch_utf, parser=parser)

    questions = [
        "問 若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正",
        "問 既破有得申無得 亦應但破性執申假名以不",
        "問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶",
    ]
    answers = [
        "".join(
            [
                "答 邪既無量 正亦多途 大略為言不出二種 謂",
                "有得與無得 有得是邪須破 無得是正須申\n\t\t故",
            ]
        ),
        None,
        "答 不例 有無皆是性 所以須雙破 既分性假異 故有破不破",
    ]
    alternates = [
        None,
        "答 性執是有得 假名是無得 今破有得申無得 即是破性執申假名也",
        None,
    ]
    expected = DataFrame({"問": questions, "答": answers, "a": alternates})

    tm.assert_frame_equal(result, expected)
def test_file_handle_close(xml_books, parser):
    """Parsing a BytesIO copy of the file leaves the source handle open."""
    with open(xml_books, "rb") as handle:
        in_memory = BytesIO(handle.read())
        read_xml(in_memory, parser=parser)

        # Checked inside the ``with`` block, i.e. before it closes the file.
        assert not handle.closed
@pytest.mark.parametrize("val", ["", b""])
def test_empty_string_lxml(val):
    """Empty str/bytes input raises lxml's XMLSyntaxError."""
    lxml_etree = pytest.importorskip("lxml.etree")

    accepted_messages = [
        "Document is empty",
        # Seen on Mac with lxml 4.91
        r"None \(line 0\)",
    ]
    msg = "|".join(accepted_messages)

    # Wrap the value in the buffer type matching its Python type.
    buffer = StringIO(val) if isinstance(val, str) else BytesIO(val)
    with pytest.raises(lxml_etree.XMLSyntaxError, match=msg):
        read_xml(buffer, parser="lxml")
@pytest.mark.parametrize("val", ["", b""])
def test_empty_string_etree(val):
    """Empty str/bytes input raises ElementTree's ParseError."""
    buffer = StringIO(val) if isinstance(val, str) else BytesIO(val)
    msg = "no element found"
    with pytest.raises(ParseError, match=msg):
        read_xml(buffer, parser="etree")
def test_wrong_file_path(parser):
    """A relative path string is expected to surface the literal-xml warning."""
    bad_path = os.path.join("data", "html", "books.xml")
    expected_message = (
        "Passing literal xml to 'read_xml' is deprecated and "
        "will be removed in a future version. To read from a "
        "literal string, wrap it in a 'StringIO' object."
    )

    # The FutureWarning is caught as an exception here, so this relies on
    # warnings being escalated to errors — presumably by the test
    # configuration; confirm.
    with pytest.raises(FutureWarning, match=expected_message):
        read_xml(bad_path, parser=parser)
@pytest.mark.network
@pytest.mark.single_cpu
def test_url(httpserver, xml_file):
    """Content served over HTTP parses to the books frame."""
    pytest.importorskip("lxml")
    with open(xml_file, encoding="utf-8") as handle:
        httpserver.serve_content(content=handle.read())
        # Only <book> nodes with exactly four children are selected.
        result = read_xml(httpserver.url, xpath=".//book[count(*)=4]")

    expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.network
@pytest.mark.single_cpu
def test_wrong_url(parser, httpserver):
    """A 404 response from the server surfaces as HTTPError."""
    httpserver.serve_content("NOT FOUND", code=404)

    msg = "HTTP Error 404: NOT FOUND"
    with pytest.raises(HTTPError, match=msg):
        read_xml(httpserver.url, xpath=".//book[count(*)=4]", parser=parser)
# CONTENT
def test_whitespace(parser):
    """Leading/trailing whitespace in values is kept when dtype is "string"."""
    # NOTE(review): this literal holds no XML markup as it stands, yet the
    # expectations reference a ``sides`` attribute and ``shape``/``degrees``
    # elements — presumably the tags (and possibly some whitespace) were lost;
    # confirm against the upstream test before relying on it.
    xml = """
square
360
circle
360
triangle
180
"""
    row_fields = {"row": ["sides", "shape", "degrees"]}

    via_xpath = read_xml(StringIO(xml), parser=parser, dtype="string")
    via_iterparse = read_xml_iterparse(
        xml, parser=parser, iterparse=row_fields, dtype="string"
    )

    expected = DataFrame(
        {
            "sides": [" 4 ", " 0 ", " 3 "],
            "shape": [
                "\n square\n ",
                "\n circle\n ",
                "\n triangle\n ",
            ],
            "degrees": ["\t360\t", "\t360\t", "\t180\t"],
        },
        dtype="string",
    )

    tm.assert_frame_equal(via_xpath, expected)
    tm.assert_frame_equal(via_iterparse, expected)
# XPATH
def test_empty_xpath_lxml(xml_books):
    """An xpath that matches nothing raises ValueError under lxml."""
    pytest.importorskip("lxml")
    msg = "xpath does not return any nodes"
    with pytest.raises(ValueError, match=msg):
        read_xml(xml_books, xpath=".//python", parser="lxml")
def test_bad_xpath_etree(xml_books):
    """A malformed xpath raises SyntaxError under the etree parser."""
    msg = "You have used an incorrect or unsupported XPath"
    with pytest.raises(SyntaxError, match=msg):
        read_xml(xml_books, xpath=".//[book]", parser="etree")
def test_bad_xpath_lxml(xml_books):
    """A malformed xpath raises lxml's XPathEvalError."""
    lxml_etree = pytest.importorskip("lxml.etree")
    msg = "Invalid expression"
    with pytest.raises(lxml_etree.XPathEvalError, match=msg):
        read_xml(xml_books, xpath=".//[book]", parser="lxml")
# NAMESPACE
def test_default_namespace(parser):
    """Rows in a default namespace are reachable via a user-chosen prefix."""
    via_xpath = read_xml(
        StringIO(xml_default_nmsp),
        xpath=".//ns:row",
        namespaces={"ns": "http://example.com"},
        parser=parser,
    )
    row_fields = {"row": ["shape", "degrees", "sides"]}
    via_iterparse = read_xml_iterparse(
        xml_default_nmsp, parser=parser, iterparse=row_fields
    )

    expected = DataFrame(
        {
            "shape": ["square", "circle", "triangle"],
            "degrees": [360, 360, 180],
            "sides": [4.0, float("nan"), 3.0],
        }
    )

    tm.assert_frame_equal(via_xpath, expected)
    tm.assert_frame_equal(via_iterparse, expected)
def test_prefix_namespace(parser):
    """Rows in a prefixed namespace parse via xpath and via iterparse."""
    via_xpath = read_xml(
        StringIO(xml_prefix_nmsp),
        xpath=".//doc:row",
        namespaces={"doc": "http://example.com"},
        parser=parser,
    )
    row_fields = {"row": ["shape", "degrees", "sides"]}
    via_iterparse = read_xml_iterparse(
        xml_prefix_nmsp, parser=parser, iterparse=row_fields
    )

    expected = DataFrame(
        {
            "shape": ["square", "circle", "triangle"],
            "degrees": [360, 360, 180],
            "sides": [4.0, float("nan"), 3.0],
        }
    )

    tm.assert_frame_equal(via_xpath, expected)
    tm.assert_frame_equal(via_iterparse, expected)
def test_consistency_default_namespace():
    """lxml and etree agree on the default-namespace sample."""
    pytest.importorskip("lxml")
    uri = "http://example.com"

    # The two parsers are deliberately given different prefixes for one URI.
    from_lxml = read_xml(
        StringIO(xml_default_nmsp),
        xpath=".//ns:row",
        namespaces={"ns": uri},
        parser="lxml",
    )
    from_etree = read_xml(
        StringIO(xml_default_nmsp),
        xpath=".//doc:row",
        namespaces={"doc": uri},
        parser="etree",
    )

    tm.assert_frame_equal(from_lxml, from_etree)
def test_consistency_prefix_namespace():
    """lxml and etree agree on the prefixed-namespace sample."""
    pytest.importorskip("lxml")

    frames = []
    for backend in ("lxml", "etree"):
        frames.append(
            read_xml(
                StringIO(xml_prefix_nmsp),
                xpath=".//doc:row",
                namespaces={"doc": "http://example.com"},
                parser=backend,
            )
        )
    from_lxml, from_etree = frames

    tm.assert_frame_equal(from_lxml, from_etree)
# PREFIX
def test_missing_prefix_with_default_namespace(xml_books, parser):
    """An xpath that selects no nodes in the file raises ValueError."""
    msg = "xpath does not return any nodes"
    with pytest.raises(ValueError, match=msg):
        read_xml(xml_books, xpath=".//Placemark", parser=parser)
def test_missing_prefix_definition_etree(kml_cta_rail_lines):
    """Using a prefix without a ``namespaces`` mapping fails under etree."""
    msg = "you used an undeclared namespace prefix"
    with pytest.raises(SyntaxError, match=msg):
        read_xml(kml_cta_rail_lines, xpath=".//kml:Placemark", parser="etree")
def test_missing_prefix_definition_lxml(kml_cta_rail_lines):
    """Using a prefix without a ``namespaces`` mapping fails under lxml."""
    lxml_etree = pytest.importorskip("lxml.etree")
    msg = "Undefined namespace prefix"
    with pytest.raises(lxml_etree.XPathEvalError, match=msg):
        read_xml(kml_cta_rail_lines, xpath=".//kml:Placemark", parser="lxml")
@pytest.mark.parametrize("key", ["", None])
def test_none_namespace_prefix(key):
    """An empty or None namespace prefix is rejected with TypeError by lxml."""
    pytest.importorskip("lxml")

    msg = "empty namespace prefix is not supported in XPath"
    bad_namespaces = {key: "http://www.opengis.net/kml/2.2"}

    with pytest.raises(TypeError, match=msg):
        read_xml(
            StringIO(xml_default_nmsp),
            xpath=".//kml:Placemark",
            namespaces=bad_namespaces,
            parser="lxml",
        )
# ELEMS AND ATTRS
def test_file_elems_and_attrs(xml_books, parser):
    """By default both the attribute and the child elements become columns."""
    book_fields = {"book": ["category", "title", "author", "year", "price"]}

    via_xpath = read_xml(xml_books, parser=parser)
    via_iterparse = read_xml(xml_books, parser=parser, iterparse=book_fields)

    expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )

    tm.assert_frame_equal(via_xpath, expected)
    tm.assert_frame_equal(via_iterparse, expected)
def test_file_only_attrs(xml_books, parser):
    """``attrs_only=True`` yields just the ``category`` column."""
    via_xpath = read_xml(xml_books, attrs_only=True, parser=parser)
    via_iterparse = read_xml(
        xml_books, parser=parser, iterparse={"book": ["category"]}
    )

    expected = DataFrame({"category": ["cooking", "children", "web"]})

    tm.assert_frame_equal(via_xpath, expected)
    tm.assert_frame_equal(via_iterparse, expected)
def test_file_only_elems(xml_books, parser):
    """``elems_only=True`` drops the ``category`` column."""
    element_fields = {"book": ["title", "author", "year", "price"]}

    via_xpath = read_xml(xml_books, elems_only=True, parser=parser)
    via_iterparse = read_xml(xml_books, parser=parser, iterparse=element_fields)

    expected = DataFrame(
        {
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )

    tm.assert_frame_equal(via_xpath, expected)
    tm.assert_frame_equal(via_iterparse, expected)
def test_elem_and_attrs_only(kml_cta_rail_lines, parser):
    """Setting both ``elems_only`` and ``attrs_only`` raises ValueError."""
    msg = "Either element or attributes can be parsed not both"
    with pytest.raises(ValueError, match=msg):
        read_xml(kml_cta_rail_lines, elems_only=True, attrs_only=True, parser=parser)
def test_empty_attrs_only(parser):
    """``attrs_only=True`` is expected to raise ValueError for this document."""
    # NOTE(review): the literal below contains no XML markup as written;
    # presumably the element tags were lost — confirm against upstream.
    xml = """
square
360
circle
360
triangle
180
"""
    msg = "xpath does not return any nodes or attributes"
    with pytest.raises(ValueError, match=msg):
        read_xml(StringIO(xml), xpath="./row", attrs_only=True, parser=parser)
def test_empty_elems_only(parser):
    """``elems_only=True`` is expected to raise ValueError for this document."""
    # NOTE(review): the literal below is only a newline as written;
    # presumably the XML markup was lost — confirm against upstream.
    xml = """
"""
    msg = "xpath does not return any nodes or attributes"
    with pytest.raises(ValueError, match=msg):
        read_xml(StringIO(xml), xpath="./row", elems_only=True, parser=parser)
def test_attribute_centric_xml():
    """lxml and etree agree on ``station`` rows, via xpath and via iterparse."""
    pytest.importorskip("lxml")
    # NOTE(review): this literal is empty as written; presumably the XML
    # markup was lost — confirm against upstream.
    xml = """\
"""
    station_fields = ["Name", "coords"]

    xpath_lxml = read_xml(StringIO(xml), xpath=".//station")
    xpath_etree = read_xml(StringIO(xml), xpath=".//station", parser="etree")

    iter_lxml = read_xml_iterparse(xml, iterparse={"station": list(station_fields)})
    iter_etree = read_xml_iterparse(
        xml, parser="etree", iterparse={"station": list(station_fields)}
    )

    tm.assert_frame_equal(xpath_lxml, xpath_etree)
    tm.assert_frame_equal(iter_lxml, iter_etree)
# NAMES
def test_names_option_output(xml_books, parser):
    """``names`` replaces the column labels for xpath and iterparse parsing."""
    new_names = ["Col1", "Col2", "Col3", "Col4", "Col5"]
    book_fields = {"book": ["category", "title", "author", "year", "price"]}

    via_xpath = read_xml(xml_books, names=list(new_names), parser=parser)
    via_iterparse = read_xml(
        xml_books,
        parser=parser,
        names=list(new_names),
        iterparse=book_fields,
    )

    expected = DataFrame(
        {
            "Col1": ["cooking", "children", "web"],
            "Col2": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "Col3": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "Col4": [2005, 2005, 2003],
            "Col5": [30.00, 29.99, 39.95],
        }
    )

    tm.assert_frame_equal(via_xpath, expected)
    tm.assert_frame_equal(via_iterparse, expected)
def test_repeat_names(parser):
    """An attribute and a child element sharing a name are told apart by ``names``."""
    # Each <shape> has a ``type`` attribute (dimension) AND a <type> child
    # (edge kind); ``names`` relabels the three values positionally.
    xml = """\
<shapes>
  <shape type="2D">
    <name>circle</name>
    <type>curved</type>
  </shape>
  <shape type="3D">
    <name>sphere</name>
    <type>curved</type>
  </shape>
</shapes>"""
    df_xpath = read_xml(
        StringIO(xml),
        xpath=".//shape",
        parser=parser,
        names=["type_dim", "shape", "type_edge"],
    )

    df_iter = read_xml_iterparse(
        xml,
        parser=parser,
        iterparse={"shape": ["type", "name", "type"]},
        names=["type_dim", "shape", "type_edge"],
    )

    df_expected = DataFrame(
        {
            "type_dim": ["2D", "3D"],
            "shape": ["circle", "sphere"],
            "type_edge": ["curved", "curved"],
        }
    )

    tm.assert_frame_equal(df_xpath, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_repeat_values_new_names(parser):
    """Rows whose two children hold equal text still map onto ``names``."""
    # Two children per <shape>: <name> and <family>.  Several rows repeat the
    # same text in both, which must not confuse the positional renaming.
    xml = """\
<shapes>
  <shape>
    <name>rectangle</name>
    <family>rectangle</family>
  </shape>
  <shape>
    <name>square</name>
    <family>rectangle</family>
  </shape>
  <shape>
    <name>ellipse</name>
    <family>ellipse</family>
  </shape>
  <shape>
    <name>circle</name>
    <family>ellipse</family>
  </shape>
</shapes>"""
    df_xpath = read_xml(
        StringIO(xml), xpath=".//shape", parser=parser, names=["name", "group"]
    )

    df_iter = read_xml_iterparse(
        xml,
        parser=parser,
        iterparse={"shape": ["name", "family"]},
        names=["name", "group"],
    )

    df_expected = DataFrame(
        {
            "name": ["rectangle", "square", "ellipse", "circle"],
            "group": ["rectangle", "rectangle", "ellipse", "ellipse"],
        }
    )

    tm.assert_frame_equal(df_xpath, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_repeat_elements(parser):
    """Four same-named children per row are split into columns via ``names``."""
    # Every <shape> holds four <value> children in the order
    # name, family, degrees, sides.
    xml = """\
<shapes>
  <shape>
    <value>circle</value>
    <value>ellipse</value>
    <value>360</value>
    <value>0</value>
  </shape>
  <shape>
    <value>triangle</value>
    <value>polygon</value>
    <value>180</value>
    <value>3</value>
  </shape>
  <shape>
    <value>square</value>
    <value>polygon</value>
    <value>360</value>
    <value>4</value>
  </shape>
</shapes>"""
    df_xpath = read_xml(
        StringIO(xml),
        xpath=".//shape",
        parser=parser,
        names=["name", "family", "degrees", "sides"],
    )

    df_iter = read_xml_iterparse(
        xml,
        parser=parser,
        iterparse={"shape": ["value", "value", "value", "value"]},
        names=["name", "family", "degrees", "sides"],
    )

    df_expected = DataFrame(
        {
            "name": ["circle", "triangle", "square"],
            "family": ["ellipse", "polygon", "polygon"],
            "degrees": [360, 180, 360],
            "sides": [0, 3, 4],
        }
    )

    tm.assert_frame_equal(df_xpath, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_names_option_wrong_length(xml_books, parser):
    """Passing three names for the books file raises ValueError."""
    too_few = ["Col1", "Col2", "Col3"]
    msg = "names does not match length"
    with pytest.raises(ValueError, match=msg):
        read_xml(xml_books, names=too_few, parser=parser)
def test_names_option_wrong_type(xml_books, parser):
    """A plain string passed as ``names`` raises TypeError."""
    msg = "is not a valid type for names"
    with pytest.raises(TypeError, match=msg):
        read_xml(xml_books, names="Col1, Col2, Col3", parser=parser)
# ENCODING
def test_wrong_encoding(xml_baby_names, parser):
    """Reading the baby-names file with the default encoding fails to decode."""
    msg = "'utf-8' codec can't decode"
    with pytest.raises(UnicodeDecodeError, match=msg):
        read_xml(xml_baby_names, parser=parser)
def test_utf16_encoding(xml_baby_names, parser):
    """Forcing UTF-16 on the baby-names file raises UnicodeError."""
    # Either of two messages is accepted.
    msg = (
        "UTF-16 stream does not start with BOM|"
        "'utf-16(-le)?' codec can't decode byte"
    )
    with pytest.raises(UnicodeError, match=msg):
        read_xml(xml_baby_names, encoding="UTF-16", parser=parser)
def test_unknown_encoding(xml_baby_names, parser):
    """A misspelled encoding name ("UFT-8") raises LookupError."""
    msg = "unknown encoding: UFT-8"
    with pytest.raises(LookupError, match=msg):
        read_xml(xml_baby_names, encoding="UFT-8", parser=parser)
def test_ascii_encoding(xml_baby_names, parser):
    """Forcing ASCII on the baby-names file raises UnicodeDecodeError."""
    msg = "'ascii' codec can't decode byte"
    with pytest.raises(UnicodeDecodeError, match=msg):
        read_xml(xml_baby_names, encoding="ascii", parser=parser)
def test_parser_consistency_with_encoding(xml_baby_names):
    """With ISO-8859-1 given, both parsers and both parse paths agree."""
    pytest.importorskip("lxml")
    row_fields = ["rank", "malename", "femalename"]

    # The encoding name is passed in upper case everywhere except the etree
    # xpath call, which uses the lower-case spelling.
    xpath_lxml = read_xml(xml_baby_names, parser="lxml", encoding="ISO-8859-1")
    xpath_etree = read_xml(xml_baby_names, parser="etree", encoding="iso-8859-1")

    iter_lxml = read_xml(
        xml_baby_names,
        parser="lxml",
        encoding="ISO-8859-1",
        iterparse={"row": list(row_fields)},
    )
    iter_etree = read_xml(
        xml_baby_names,
        parser="etree",
        encoding="ISO-8859-1",
        iterparse={"row": list(row_fields)},
    )

    tm.assert_frame_equal(xpath_lxml, xpath_etree)
    tm.assert_frame_equal(xpath_etree, iter_etree)
    tm.assert_frame_equal(iter_lxml, iter_etree)
def test_wrong_encoding_for_lxml():
    """``encoding=None`` with a text buffer raises TypeError under lxml."""
    pytest.importorskip("lxml")
    # GH#45133
    # Minimal well-formed document: one row with a single <a> element.
    data = """<data>
  <row>
    <a>c</a>
  </row>
</data>
"""
    with pytest.raises(TypeError, match="encoding None"):
        read_xml(StringIO(data), parser="lxml", encoding=None)
def test_none_encoding_etree():
    """``encoding=None`` is accepted by the etree parser for a text buffer."""
    # GH#45133
    # Minimal well-formed document: one row with a single <a> element, which
    # is what the expected one-cell frame below describes.
    data = """<data>
  <row>
    <a>c</a>
  </row>
</data>
"""
    result = read_xml(StringIO(data), parser="etree", encoding=None)
    expected = DataFrame({"a": ["c"]})
    tm.assert_frame_equal(result, expected)
# PARSER
@td.skip_if_installed("lxml")
def test_default_parser_no_lxml(xml_books):
    """Without lxml installed, the default parser raises ImportError."""
    msg = "lxml not found, please install or use the etree parser."
    with pytest.raises(ImportError, match=msg):
        read_xml(xml_books)
def test_wrong_parser(xml_books):
    """An unsupported parser name ("bs4") raises ValueError."""
    msg = "Values for parser can only be lxml or etree."
    with pytest.raises(ValueError, match=msg):
        read_xml(xml_books, parser="bs4")
# STYLESHEET
def test_stylesheet_file(kml_cta_rail_lines, xsl_flatten_doc):
    """Stylesheet-transformed and iterparse reads both reproduce ``df_kml``."""
    pytest.importorskip("lxml")

    styled = read_xml(
        kml_cta_rail_lines,
        xpath=".//k:Placemark",
        namespaces={"k": "http://www.opengis.net/kml/2.2"},
        stylesheet=xsl_flatten_doc,
    )

    placemark_fields = [
        "id",
        "name",
        "styleUrl",
        "extrude",
        "altitudeMode",
        "coordinates",
    ]
    streamed = read_xml(kml_cta_rail_lines, iterparse={"Placemark": placemark_fields})

    tm.assert_frame_equal(df_kml, styled)
    tm.assert_frame_equal(df_kml, streamed)
def test_stylesheet_file_like(kml_cta_rail_lines, xsl_flatten_doc, mode):
    """An open file object (text or binary) is accepted as the stylesheet."""
    pytest.importorskip("lxml")
    text_encoding = "utf-8" if mode == "r" else None

    with open(xsl_flatten_doc, mode, encoding=text_encoding) as xsl_handle:
        styled = read_xml(
            kml_cta_rail_lines,
            xpath=".//k:Placemark",
            namespaces={"k": "http://www.opengis.net/kml/2.2"},
            stylesheet=xsl_handle,
        )

    tm.assert_frame_equal(df_kml, styled)
def test_stylesheet_io(kml_cta_rail_lines, xsl_flatten_doc, mode):
    """A BytesIO/StringIO copy of the stylesheet is accepted."""
    # note: By default the bodies of untyped functions are not checked,
    # consider using --check-untyped-defs
    pytest.importorskip("lxml")
    xsl_obj: BytesIO | StringIO  # type: ignore[annotation-unchecked]

    text_encoding = "utf-8" if mode == "r" else None
    with open(xsl_flatten_doc, mode, encoding=text_encoding) as xsl_handle:
        # Binary reads go into a BytesIO, text reads into a StringIO.
        xsl_obj = (
            BytesIO(xsl_handle.read()) if mode == "rb" else StringIO(xsl_handle.read())
        )

    styled = read_xml(
        kml_cta_rail_lines,
        xpath=".//k:Placemark",
        namespaces={"k": "http://www.opengis.net/kml/2.2"},
        stylesheet=xsl_obj,
    )

    tm.assert_frame_equal(df_kml, styled)
def test_stylesheet_buffered_reader(kml_cta_rail_lines, xsl_flatten_doc, mode):
    """The stylesheet's raw content (str or bytes) is accepted directly."""
    pytest.importorskip("lxml")
    text_encoding = "utf-8" if mode == "r" else None

    with open(xsl_flatten_doc, mode, encoding=text_encoding) as xsl_handle:
        xsl_content = xsl_handle.read()

    styled = read_xml(
        kml_cta_rail_lines,
        xpath=".//k:Placemark",
        namespaces={"k": "http://www.opengis.net/kml/2.2"},
        stylesheet=xsl_content,
    )

    tm.assert_frame_equal(df_kml, styled)
def test_style_charset():
    """A stylesheet that renames a CJK root element leaves the data unchanged."""
    pytest.importorskip("lxml")
    xml = "<中文標籤><row><c1>1</c1><c2>2</c2></row></中文標籤>"

    # Identity transform, except that the <中文標籤> root is re-emitted as <根>.
    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
 <xsl:output omit-xml-declaration="yes" indent="yes"/>
 <xsl:strip-space elements="*"/>

 <xsl:template match="node()|@*">
     <xsl:copy>
       <xsl:apply-templates select="node()|@*"/>
     </xsl:copy>
 </xsl:template>

 <xsl:template match="中文標籤">
     <根>
       <xsl:apply-templates />
     </根>
 </xsl:template>

</xsl:stylesheet>"""

    df_orig = read_xml(StringIO(xml))
    df_style = read_xml(StringIO(xml), stylesheet=xsl)

    tm.assert_frame_equal(df_orig, df_style)
def test_not_stylesheet(kml_cta_rail_lines, xml_books):
    """Passing the books XML file as the stylesheet raises XSLTParseError."""
    lxml_etree = pytest.importorskip("lxml.etree")
    msg = "document is not a stylesheet"
    with pytest.raises(lxml_etree.XSLTParseError, match=msg):
        read_xml(kml_cta_rail_lines, stylesheet=xml_books)
def test_incorrect_xsl_syntax(kml_cta_rail_lines):
    """A syntactically broken stylesheet is expected to raise XMLSyntaxError."""
    lxml_etree = pytest.importorskip("lxml.etree")
    # NOTE(review): this literal is empty as written; presumably the XSL
    # markup was lost — confirm against upstream.
    xsl = """\
"""
    msg = "Extra content at the end of the document"
    with pytest.raises(lxml_etree.XMLSyntaxError, match=msg):
        read_xml(kml_cta_rail_lines, stylesheet=xsl)
def test_incorrect_xsl_eval(kml_cta_rail_lines):
    """A stylesheet that fails to compile is expected to raise XSLTParseError."""
    lxml_etree = pytest.importorskip("lxml.etree")
    # NOTE(review): this literal is empty as written; presumably the XSL
    # markup was lost — confirm against upstream.
    xsl = """\
"""
    msg = "failed to compile"
    with pytest.raises(lxml_etree.XSLTParseError, match=msg):
        read_xml(kml_cta_rail_lines, stylesheet=xsl)
def test_incorrect_xsl_apply(kml_cta_rail_lines):
    """A stylesheet that fails when applied is expected to raise XSLTApplyError."""
    lxml_etree = pytest.importorskip("lxml.etree")
    # NOTE(review): this literal is empty as written; presumably the XSL
    # markup was lost — confirm against upstream.
    xsl = """\
"""
    msg = "Cannot resolve URI"
    with pytest.raises(lxml_etree.XSLTApplyError, match=msg):
        read_xml(kml_cta_rail_lines, stylesheet=xsl)
def test_wrong_stylesheet(kml_cta_rail_lines, xml_data_path):
    """Passing the path ``flatten.xsl`` as stylesheet raises XMLSyntaxError."""
    lxml_etree = pytest.importorskip("lxml.etree")

    stylesheet_path = xml_data_path / "flatten.xsl"
    msg = "Start tag expected, '<' not found"

    with pytest.raises(lxml_etree.XMLSyntaxError, match=msg):
        read_xml(kml_cta_rail_lines, stylesheet=stylesheet_path)
def test_stylesheet_file_close(kml_cta_rail_lines, xsl_flatten_doc, mode):
    """Using an in-memory copy of the stylesheet leaves the source file open."""
    # note: By default the bodies of untyped functions are not checked,
    # consider using --check-untyped-defs
    pytest.importorskip("lxml")
    xsl_obj: BytesIO | StringIO  # type: ignore[annotation-unchecked]

    text_encoding = "utf-8" if mode == "r" else None
    with open(xsl_flatten_doc, mode, encoding=text_encoding) as xsl_handle:
        buffer_type = BytesIO if mode == "rb" else StringIO
        xsl_obj = buffer_type(xsl_handle.read())

        read_xml(kml_cta_rail_lines, stylesheet=xsl_obj)

        # Checked inside the ``with`` block, i.e. before it closes the file.
        assert not xsl_handle.closed
def test_stylesheet_with_etree(kml_cta_rail_lines, xsl_flatten_doc):
    """Combining ``stylesheet`` with the etree parser raises ValueError."""
    pytest.importorskip("lxml")
    msg = "To use stylesheet, you need lxml installed"
    with pytest.raises(ValueError, match=msg):
        read_xml(kml_cta_rail_lines, parser="etree", stylesheet=xsl_flatten_doc)
@pytest.mark.parametrize("val", ["", b""])
def test_empty_stylesheet(val):
    """An empty stylesheet with a relative path input hits the literal-xml warning."""
    pytest.importorskip("lxml")

    kml_path = os.path.join("data", "xml", "cta_rail_lines.kml")
    expected_message = (
        "Passing literal xml to 'read_xml' is deprecated and "
        "will be removed in a future version. To read from a "
        "literal string, wrap it in a 'StringIO' object."
    )

    # The FutureWarning is caught as an exception here, so this relies on
    # warnings being escalated to errors — presumably by the test
    # configuration; confirm.
    with pytest.raises(FutureWarning, match=expected_message):
        read_xml(kml_path, stylesheet=val)
# ITERPARSE
def test_file_like_iterparse(xml_books, parser, mode):
    """iterparse accepts file objects, except text-mode ones under lxml."""
    text_encoding = "utf-8" if mode == "r" else None

    with open(xml_books, mode, encoding=text_encoding) as handle:
        if mode == "r" and parser == "lxml":
            # Text-mode handle + lxml: a TypeError is expected, and the test
            # ends here.
            msg = "reading file objects must return bytes objects"
            with pytest.raises(TypeError, match=msg):
                read_xml(
                    handle,
                    parser=parser,
                    iterparse={
                        "book": ["category", "title", "year", "author", "price"]
                    },
                )
            return None

        result = read_xml(
            handle,
            parser=parser,
            iterparse={"book": ["category", "title", "year", "author", "price"]},
        )

    expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )
    tm.assert_frame_equal(result, expected)
def test_file_io_iterparse(xml_books, parser, mode):
    """iterparse accepts in-memory buffers, except StringIO under lxml."""
    buffer_type = StringIO if mode == "r" else BytesIO
    text_encoding = "utf-8" if mode == "r" else None

    with open(xml_books, mode, encoding=text_encoding) as handle, buffer_type(
        handle.read()
    ) as buffer:
        if mode == "r" and parser == "lxml":
            # Text buffer + lxml: a TypeError is expected, and the test ends
            # here.
            msg = "reading file objects must return bytes objects"
            with pytest.raises(TypeError, match=msg):
                read_xml(
                    buffer,
                    parser=parser,
                    iterparse={
                        "book": ["category", "title", "year", "author", "price"]
                    },
                )
            return None

        result = read_xml(
            buffer,
            parser=parser,
            iterparse={"book": ["category", "title", "year", "author", "price"]},
        )

    expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.network
@pytest.mark.single_cpu
def test_url_path_error(parser, httpserver, xml_file):
    """Using iterparse with a URL input raises ParserError."""
    row_fields = {"row": ["shape", "degrees", "sides", "date"]}
    msg = "iterparse is designed for large XML files"

    with open(xml_file, encoding="utf-8") as handle:
        httpserver.serve_content(content=handle.read())
        with pytest.raises(ParserError, match=msg):
            read_xml(httpserver.url, parser=parser, iterparse=row_fields)
def test_compression_error(parser, compression_only):
    """Using iterparse on a compressed file raises ParserError."""
    row_fields = {"row": ["shape", "degrees", "sides", "date"]}
    msg = "iterparse is designed for large XML files"

    with tm.ensure_clean(filename="geom_xml.zip") as out_path:
        # Write the geometry frame compressed, then try to stream it back.
        geom_df.to_xml(out_path, parser=parser, compression=compression_only)

        with pytest.raises(ParserError, match=msg):
            read_xml(
                out_path,
                parser=parser,
                iterparse=row_fields,
                compression=compression_only,
            )
def test_wrong_dict_type(xml_books, parser):
    """Passing a list (not a dict) as ``iterparse`` must raise ``TypeError``."""
    not_a_dict = ["category", "title", "year", "author", "price"]
    expected_msg = "list is not a valid type for iterparse"

    with pytest.raises(TypeError, match=expected_msg):
        read_xml(xml_books, parser=parser, iterparse=not_a_dict)
def test_wrong_dict_value(xml_books, parser):
    """A non-list-like value inside the ``iterparse`` dict must raise ``TypeError``."""
    expected_msg = " is not a valid type for value in iterparse"
    scalar_value_spec = {"book": "category"}

    with pytest.raises(TypeError, match=expected_msg):
        read_xml(xml_books, parser=parser, iterparse=scalar_value_spec)
def test_bad_xml(parser):
    """
    ``iterparse`` on a malformed document must surface a ``SyntaxError``.

    The fixture deliberately has no single root element: several ``row``
    elements sit side by side at the top level.  The ``match`` pattern is an
    alternation so that either accepted error wording satisfies the test.
    """
    # NOTE(review): the markup of this fixture had been lost (only the element
    # text remained), so no ``row`` element existed at all.  It is rebuilt from
    # the ``iterparse`` column list and the expected error message - confirm
    # against upstream.
    bad_xml = """\
<?xml version='1.0' encoding='utf-8'?>
  <row>
    <shape>square</shape>
    <degrees>00360</degrees>
    <sides>4.0</sides>
    <date>2020-01-01</date>
   </row>
  <row>
    <shape>circle</shape>
    <degrees>00360</degrees>
    <sides/>
    <date>2021-01-01</date>
  </row>
  <row>
    <shape>triangle</shape>
    <degrees>00180</degrees>
    <sides>3.0</sides>
    <date>2022-01-01</date>
  </row>
"""
    with tm.ensure_clean(filename="bad.xml") as path:
        # iterparse needs a real file on disk, so persist the fixture first.
        with open(path, "w", encoding="utf-8") as f:
            f.write(bad_xml)

        with pytest.raises(
            SyntaxError,
            match=(
                "Extra content at the end of the document|"
                "junk after document element"
            ),
        ):
            read_xml(
                path,
                parser=parser,
                parse_dates=["date"],
                iterparse={"row": ["shape", "degrees", "sides", "date"]},
            )
def test_comment(parser):
    """
    XML comments must not leak into the parsed frame.

    Comments are placed before the root, inside the root, inside a child
    record and after the root.  Both the xpath reader and the iterparse
    reader must return only the ``name`` / ``type`` values.
    """
    # NOTE(review): the markup of this fixture had been lost (only the element
    # text remained); rebuilt from the xpath, the iterparse spec and the
    # expected frame - confirm against upstream.
    xml = """\
<!-- comment before root -->
<shapes>
  <!-- comment within root -->
  <shape>
    <name>circle</name>
    <type>2D</type>
  </shape>
  <shape>
    <name>sphere</name>
    <type>3D</type>
    <!-- comment within child -->
  </shape>
  <!-- comment within root -->
</shapes>
<!-- comment after root -->"""

    df_xpath = read_xml(StringIO(xml), xpath=".//shape", parser=parser)

    df_iter = read_xml_iterparse(
        xml, parser=parser, iterparse={"shape": ["name", "type"]}
    )

    df_expected = DataFrame(
        {
            "name": ["circle", "sphere"],
            "type": ["2D", "3D"],
        }
    )

    tm.assert_frame_equal(df_xpath, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_dtd(parser):
    """
    A document carrying an internal DTD subset must parse like a plain one.

    Both the xpath reader and the iterparse reader must return only the
    ``name`` / ``type`` values of each ``shape`` record.
    """
    # NOTE(review): only the closing ``]>`` of the DOCTYPE and the element text
    # had survived in this fixture; the declaration and markup are rebuilt
    # from the xpath, the iterparse spec and the expected frame - confirm
    # against upstream.
    xml = """\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE non-profits [
    <!ELEMENT shapes (shape*) >
    <!ELEMENT shape ( name, type )>
    <!ELEMENT name (#PCDATA)>
]>
<shapes>
  <shape>
    <name>circle</name>
    <type>2D</type>
  </shape>
  <shape>
    <name>sphere</name>
    <type>3D</type>
  </shape>
</shapes>"""

    df_xpath = read_xml(StringIO(xml), xpath=".//shape", parser=parser)

    df_iter = read_xml_iterparse(
        xml, parser=parser, iterparse={"shape": ["name", "type"]}
    )

    df_expected = DataFrame(
        {
            "name": ["circle", "sphere"],
            "type": ["2D", "3D"],
        }
    )

    tm.assert_frame_equal(df_xpath, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_processing_instruction(parser):
    """
    Processing instructions ahead of the root must be ignored by the readers.

    Both the xpath reader and the iterparse reader must return only the
    ``name`` / ``type`` values of each ``shape`` record.
    """
    # NOTE(review): only the fragment ``, , ?>`` and the element text had
    # survived in this fixture; the processing instructions and markup are
    # rebuilt from that fragment, the xpath and the expected frame - confirm
    # against upstream.
    xml = """\
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="style.xsl"?>
<?display table-view?>
<?sort alpha-ascending?>
<?textinfo whitespace is allowed ?>
<?elementnames <shape>, <name>, <type> ?>
<shapes>
  <shape>
    <name>circle</name>
    <type>2D</type>
  </shape>
  <shape>
    <name>sphere</name>
    <type>3D</type>
  </shape>
</shapes>"""

    df_xpath = read_xml(StringIO(xml), xpath=".//shape", parser=parser)

    df_iter = read_xml_iterparse(
        xml, parser=parser, iterparse={"shape": ["name", "type"]}
    )

    df_expected = DataFrame(
        {
            "name": ["circle", "sphere"],
            "type": ["2D", "3D"],
        }
    )

    tm.assert_frame_equal(df_xpath, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_no_result(xml_books, parser):
    """An ``iterparse`` row node absent from the file must raise ``ParserError``."""
    missing_node_spec = {"node": ["attr1", "elem1", "elem2", "elem3"]}
    expected_msg = "No result from selected items in iterparse."

    with pytest.raises(ParserError, match=expected_msg):
        read_xml(xml_books, parser=parser, iterparse=missing_node_spec)
def test_empty_data(xml_books, parser):
    """
    A matching row node whose requested fields are all absent must raise
    ``EmptyDataError``.
    """
    unknown_fields_spec = {"book": ["attr1", "elem1", "elem2", "elem3"]}
    expected_msg = "No columns to parse from file"

    with pytest.raises(EmptyDataError, match=expected_msg):
        read_xml(xml_books, parser=parser, iterparse=unknown_fields_spec)
def test_online_stylesheet():
    """
    Transform a CD catalog with an XSLT stylesheet and read the result.

    The stylesheet renders the catalog as an HTML table with one header row
    (``th`` cells) followed by one row per CD (``td`` cells).  The xpath
    keeps rows that contain ``td`` cells and sit within the first six table
    rows, i.e. the first five CDs.  Skipped when lxml is not installed.
    """
    pytest.importorskip("lxml")
    # NOTE(review): the markup of both literals below had been lost (only the
    # element text remained).  The catalog is rebuilt from that text in its
    # original order; the stylesheet is rebuilt so that it yields the
    # header-plus-rows table the xpath relies on - confirm against upstream.
    xml = """\
<?xml version="1.0" encoding="UTF-8"?>
<catalog>
  <cd>
    <title>Empire Burlesque</title>
    <artist>Bob Dylan</artist>
    <country>USA</country>
    <company>Columbia</company>
    <price>10.90</price>
    <year>1985</year>
  </cd>
  <cd>
    <title>Hide your heart</title>
    <artist>Bonnie Tyler</artist>
    <country>UK</country>
    <company>CBS Records</company>
    <price>9.90</price>
    <year>1988</year>
  </cd>
  <cd>
    <title>Greatest Hits</title>
    <artist>Dolly Parton</artist>
    <country>USA</country>
    <company>RCA</company>
    <price>9.90</price>
    <year>1982</year>
  </cd>
  <cd>
    <title>Still got the blues</title>
    <artist>Gary Moore</artist>
    <country>UK</country>
    <company>Virgin records</company>
    <price>10.20</price>
    <year>1990</year>
  </cd>
  <cd>
    <title>Eros</title>
    <artist>Eros Ramazzotti</artist>
    <country>EU</country>
    <company>BMG</company>
    <price>9.90</price>
    <year>1997</year>
  </cd>
  <cd>
    <title>One night only</title>
    <artist>Bee Gees</artist>
    <country>UK</country>
    <company>Polydor</company>
    <price>10.90</price>
    <year>1998</year>
  </cd>
  <cd>
    <title>Sylvias Mother</title>
    <artist>Dr.Hook</artist>
    <country>UK</country>
    <company>CBS</company>
    <price>8.10</price>
    <year>1973</year>
  </cd>
  <cd>
    <title>Maggie May</title>
    <artist>Rod Stewart</artist>
    <country>UK</country>
    <company>Pickwick</company>
    <price>8.50</price>
    <year>1990</year>
  </cd>
  <cd>
    <title>Romanza</title>
    <artist>Andrea Bocelli</artist>
    <country>EU</country>
    <company>Polydor</company>
    <price>10.80</price>
    <year>1996</year>
  </cd>
  <cd>
    <title>When a man loves a woman</title>
    <artist>Percy Sledge</artist>
    <country>USA</country>
    <company>Atlantic</company>
    <price>8.70</price>
    <year>1987</year>
  </cd>
  <cd>
    <title>Black angel</title>
    <artist>Savage Rose</artist>
    <country>EU</country>
    <company>Mega</company>
    <price>10.90</price>
    <year>1995</year>
  </cd>
  <cd>
    <title>1999 Grammy Nominees</title>
    <artist>Many</artist>
    <country>USA</country>
    <company>Grammy</company>
    <price>10.20</price>
    <year>1999</year>
  </cd>
  <cd>
    <title>For the good times</title>
    <artist>Kenny Rogers</artist>
    <country>UK</country>
    <company>Mucik Master</company>
    <price>8.70</price>
    <year>1995</year>
  </cd>
  <cd>
    <title>Big Willie style</title>
    <artist>Will Smith</artist>
    <country>USA</country>
    <company>Columbia</company>
    <price>9.90</price>
    <year>1997</year>
  </cd>
  <cd>
    <title>Tupelo Honey</title>
    <artist>Van Morrison</artist>
    <country>UK</country>
    <company>Polydor</company>
    <price>8.20</price>
    <year>1971</year>
  </cd>
  <cd>
    <title>Soulsville</title>
    <artist>Jorn Hoel</artist>
    <country>Norway</country>
    <company>WEA</company>
    <price>7.90</price>
    <year>1996</year>
  </cd>
  <cd>
    <title>The very best of</title>
    <artist>Cat Stevens</artist>
    <country>UK</country>
    <company>Island</company>
    <price>8.90</price>
    <year>1990</year>
  </cd>
  <cd>
    <title>Stop</title>
    <artist>Sam Brown</artist>
    <country>UK</country>
    <company>A and M</company>
    <price>8.90</price>
    <year>1988</year>
  </cd>
  <cd>
    <title>Bridge of Spies</title>
    <artist>T`Pau</artist>
    <country>UK</country>
    <company>Siren</company>
    <price>7.90</price>
    <year>1987</year>
  </cd>
  <cd>
    <title>Private Dancer</title>
    <artist>Tina Turner</artist>
    <country>UK</country>
    <company>Capitol</company>
    <price>8.90</price>
    <year>1983</year>
  </cd>
  <cd>
    <title>Midt om natten</title>
    <artist>Kim Larsen</artist>
    <country>EU</country>
    <company>Medley</company>
    <price>7.80</price>
    <year>1983</year>
  </cd>
  <cd>
    <title>Pavarotti Gala Concert</title>
    <artist>Luciano Pavarotti</artist>
    <country>UK</country>
    <company>DECCA</company>
    <price>9.90</price>
    <year>1991</year>
  </cd>
  <cd>
    <title>The dock of the bay</title>
    <artist>Otis Redding</artist>
    <country>USA</country>
    <company>Stax Records</company>
    <price>7.90</price>
    <year>1968</year>
  </cd>
  <cd>
    <title>Picture book</title>
    <artist>Simply Red</artist>
    <country>EU</country>
    <company>Elektra</company>
    <price>7.20</price>
    <year>1985</year>
  </cd>
  <cd>
    <title>Red</title>
    <artist>The Communards</artist>
    <country>UK</country>
    <company>London</company>
    <price>7.80</price>
    <year>1987</year>
  </cd>
  <cd>
    <title>Unchain my heart</title>
    <artist>Joe Cocker</artist>
    <country>USA</country>
    <company>EMI</company>
    <price>8.20</price>
    <year>1987</year>
  </cd>
</catalog>
"""
    xsl = """\
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:template match="/">
<html>
<body>
  <h2>My CD Collection</h2>
  <table border="1">
    <tr bgcolor="#9acd32">
      <th style="text-align:left"/>
      <th style="text-align:left"/>
    </tr>
    <xsl:for-each select="catalog/cd">
    <tr>
      <td><xsl:value-of select="title"/></td>
      <td><xsl:value-of select="artist"/></td>
    </tr>
    </xsl:for-each>
  </table>
</body>
</html>
</xsl:template>
</xsl:stylesheet>
"""

    df_xsl = read_xml(
        StringIO(xml),
        # Rows 2-6 of the table: the header row has no ``td`` and is dropped.
        xpath=".//tr[td and position() <= 6]",
        names=["title", "artist"],
        stylesheet=xsl,
    )

    df_expected = DataFrame(
        {
            "title": {
                0: "Empire Burlesque",
                1: "Hide your heart",
                2: "Greatest Hits",
                3: "Still got the blues",
                4: "Eros",
            },
            "artist": {
                0: "Bob Dylan",
                1: "Bonnie Tyler",
                2: "Dolly Parton",
                3: "Gary Moore",
                4: "Eros Ramazzotti",
            },
        }
    )

    tm.assert_frame_equal(df_expected, df_xsl)
# COMPRESSION
def test_compression_read(parser, compression_only):
    """
    Round-trip the shared geometry frame through a compressed XML file.

    The file is read back once via the default xpath reader and once via the
    iterparse helper; both results must equal ``geom_df``.
    """
    row_spec = {"row": ["shape", "degrees", "sides"]}

    with tm.ensure_clean() as comp_path:
        geom_df.to_xml(
            comp_path, index=False, parser=parser, compression=compression_only
        )

        via_xpath = read_xml(comp_path, parser=parser, compression=compression_only)

        via_iterparse = read_xml_iterparse_comp(
            comp_path,
            compression_only,
            parser=parser,
            iterparse=row_spec,
            compression=compression_only,
        )

    for frame in (via_xpath, via_iterparse):
        tm.assert_frame_equal(frame, geom_df)
def test_wrong_compression(parser, compression, compression_only):
    """
    Reading a file with a different compression than it was written with
    must raise the codec-specific error.

    The test is skipped when the two compression fixtures coincide.
    """
    written_with = compression
    read_with = compression_only

    if written_with == read_with:
        pytest.skip(f"{written_with} == {read_with}")

    # Expected (exception type, message) keyed by the compression used to read.
    expected_failures = {
        "bz2": (OSError, "Invalid data stream"),
        "gzip": (OSError, "Not a gzipped file"),
        "zip": (BadZipFile, "File is not a zip file"),
        "tar": (ReadError, "file could not be opened successfully"),
    }

    # Optional codecs are only registered when their module can be imported.
    zstd_module = import_optional_dependency("zstandard", errors="ignore")
    if zstd_module is not None:
        expected_failures["zstd"] = (
            zstd_module.ZstdError,
            "Unknown frame descriptor",
        )

    lzma_module = import_optional_dependency("lzma", errors="ignore")
    if lzma_module is not None:
        expected_failures["xz"] = (
            LZMAError,
            "Input format not supported by decoder",
        )

    exc_type, exc_msg = expected_failures[read_with]

    with tm.ensure_clean() as path:
        geom_df.to_xml(path, parser=parser, compression=written_with)

        with pytest.raises(exc_type, match=exc_msg):
            read_xml(path, parser=parser, compression=read_with)
def test_unsuported_compression(parser):
    """An unknown ``compression`` value must raise ``ValueError``."""
    expected_msg = "Unrecognized compression type"

    with pytest.raises(ValueError, match=expected_msg), tm.ensure_clean() as path:
        read_xml(path, parser=parser, compression="7z")
# STORAGE OPTIONS
@pytest.mark.network
@pytest.mark.single_cpu
def test_s3_parser_consistency(s3_public_bucket_with_data, s3so):
    """
    The lxml and etree parsers must return equal frames for the same S3 file.

    Skipped when either ``s3fs`` or ``lxml`` is unavailable.
    """
    for required in ("s3fs", "lxml"):
        pytest.importorskip(required)

    books_url = f"s3://{s3_public_bucket_with_data.name}/books.xml"

    frames = {
        engine: read_xml(books_url, parser=engine, storage_options=s3so)
        for engine in ("lxml", "etree")
    }

    tm.assert_frame_equal(frames["lxml"], frames["etree"])
def test_read_xml_nullable_dtypes(
    parser, string_storage, dtype_backend, using_infer_string
):
    """
    ``read_xml`` must honour ``dtype_backend`` for every column kind.

    The fixture holds two rows with string, integer, float and boolean
    columns, several of which contain empty elements that must surface as
    ``NA``.  The expected string arrays depend on ``using_infer_string``,
    ``string_storage`` and ``dtype_backend``; for the pyarrow backend the
    whole expected frame is converted to ``ArrowExtensionArray`` columns.
    """
    # GH#50500
    # NOTE(review): the markup of this fixture had been lost (only the element
    # text remained); rebuilt from the expected frame below, with empty
    # elements wherever ``NA`` is expected - confirm against upstream.
    data = """<?xml version='1.0' encoding='utf-8'?>
<data xmlns="http://example.com">
<row>
  <a>x</a>
  <b>1</b>
  <c>4.0</c>
  <d>x</d>
  <e>2</e>
  <f>4.0</f>
  <g></g>
  <h>True</h>
  <i>False</i>
</row>
<row>
  <a>y</a>
  <b>2</b>
  <c>5.0</c>
  <d></d>
  <e></e>
  <f></f>
  <g></g>
  <h>False</h>
  <i></i>
</row>
</data>"""

    # Pick the string array type the reader is expected to produce.
    if using_infer_string:
        pa = pytest.importorskip("pyarrow")
        string_array = ArrowStringArrayNumpySemantics(pa.array(["x", "y"]))
        string_array_na = ArrowStringArrayNumpySemantics(pa.array(["x", None]))

    elif string_storage == "python":
        string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
        string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))

    elif dtype_backend == "pyarrow":
        pa = pytest.importorskip("pyarrow")
        from pandas.arrays import ArrowExtensionArray

        string_array = ArrowExtensionArray(pa.array(["x", "y"]))
        string_array_na = ArrowExtensionArray(pa.array(["x", None]))

    else:
        pa = pytest.importorskip("pyarrow")
        string_array = ArrowStringArray(pa.array(["x", "y"]))
        string_array_na = ArrowStringArray(pa.array(["x", None]))

    with pd.option_context("mode.string_storage", string_storage):
        result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend)

    expected = DataFrame(
        {
            "a": string_array,
            "b": Series([1, 2], dtype="Int64"),
            "c": Series([4.0, 5.0], dtype="Float64"),
            "d": string_array_na,
            "e": Series([2, NA], dtype="Int64"),
            "f": Series([4.0, NA], dtype="Float64"),
            "g": Series([NA, NA], dtype="Int64"),
            "h": Series([True, False], dtype="boolean"),
            "i": Series([False, NA], dtype="boolean"),
        }
    )

    if dtype_backend == "pyarrow":
        pa = pytest.importorskip("pyarrow")
        from pandas.arrays import ArrowExtensionArray

        expected = DataFrame(
            {
                col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
                for col in expected.columns
            }
        )
        # The all-missing column is replaced with an untyped pyarrow array of
        # nulls instead of the converted Int64 column.
        expected["g"] = ArrowExtensionArray(pa.array([None, None]))

    tm.assert_frame_equal(result, expected)
def test_invalid_dtype_backend():
    """An unsupported ``dtype_backend`` value must raise ``ValueError``."""
    expected_msg = (
        "dtype_backend numpy is invalid, "
        "only 'numpy_nullable' and 'pyarrow' are allowed."
    )

    with pytest.raises(ValueError, match=expected_msg):
        read_xml("test", dtype_backend="numpy")