from __future__ import annotations
from io import (
BytesIO,
StringIO,
)
import os
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas import (
NA,
DataFrame,
Index,
)
import pandas._testing as tm
from pandas.io.common import get_handle
from pandas.io.xml import read_xml
# CHECKLIST
# [x] - ValueError: "Values for parser can only be lxml or etree."
# etree
# [x] - ImportError: "lxml not found, please install or use the etree parser."
# [X] - TypeError: "...is not a valid type for attr_cols"
# [X] - TypeError: "...is not a valid type for elem_cols"
# [X] - LookupError: "unknown encoding"
# [X] - KeyError: "...is not included in namespaces"
# [X] - KeyError: "no valid column"
# [X] - ValueError: "To use stylesheet, you need lxml installed..."
# [] - OSError: (NEED PERMISSION ISSUE, DISK FULL, ETC.)
# [X] - FileNotFoundError: "No such file or directory"
# [X] - PermissionError: "Forbidden"
# lxml
# [X] - TypeError: "...is not a valid type for attr_cols"
# [X] - TypeError: "...is not a valid type for elem_cols"
# [X] - LookupError: "unknown encoding"
# [] - OSError: (NEED PERMISSION ISSUE, DISK FULL, ETC.)
# [X] - FileNotFoundError: "No such file or directory"
# [X] - KeyError: "...is not included in namespaces"
# [X] - KeyError: "no valid column"
# [X] - ValueError: "stylesheet is not a url, file, or xml string."
# [] - LookupError: (NEED WRONG ENCODING FOR FILE OUTPUT)
# [] - URLError: (USUALLY DUE TO NETWORKING)
# [] - HTTPError: (NEED AN ONLINE STYLESHEET)
# [X] - OSError: "failed to load external entity"
# [X] - XMLSyntaxError: "Opening and ending tag mismatch"
# [X] - XSLTApplyError: "Cannot resolve URI"
# [X] - XSLTParseError: "failed to compile"
# [X] - PermissionError: "Forbidden"
@pytest.fixture
def geom_df():
    """Return a three-row frame of shapes; ``circle`` has a missing ``sides``."""
    columns = {
        "shape": ["square", "circle", "triangle"],
        "degrees": [360, 360, 180],
        "sides": [4, np.nan, 3],
    }
    return DataFrame(columns)
@pytest.fixture
def planet_df():
    """Return an eight-row frame of planets with type, location and mass."""
    names = [
        "Mercury",
        "Venus",
        "Earth",
        "Mars",
        "Jupiter",
        "Saturn",
        "Uranus",
        "Neptune",
    ]
    # Four terrestrial planets, then two gas giants, then two ice giants.
    kinds = ["terrestrial"] * 4 + ["gas giant"] * 2 + ["ice giant"] * 2
    # First four planets are "inner", last four are "outer".
    regions = ["inner"] * 4 + ["outer"] * 4
    masses = [
        0.330114,
        4.86747,
        5.97237,
        0.641712,
        1898.187,
        568.3174,
        86.8127,
        102.4126,
    ]
    return DataFrame(
        {
            "planet": names,
            "type": kinds,
            "location": regions,
            "mass": masses,
        }
    )
@pytest.fixture
def from_file_expected():
    """Return the reference text holding three indexed book records."""
    reference = """\
0
cooking
Everyday Italian
Giada De Laurentiis
2005
30.0
1
children
Harry Potter
J K. Rowling
2005
29.99
2
web
Learning XML
Erik T. Ray
2003
39.95
"""
    return reference
def equalize_decl(doc):
# etree and lxml differ on quotes and case in xml declaration
if doc is not None:
doc = doc.replace(
'
cooking
Everyday Italian
Giada De Laurentiis
2005
30.0
children
Harry Potter
J K. Rowling
2005
29.99
web
Learning XML
Erik T. Ray
2003
39.95
"""
df_file = read_xml(xml_books, parser=parser)
with tm.ensure_clean("test.xml") as path:
df_file.to_xml(path, index=False, parser=parser)
with open(path, "rb") as f:
output = f.read().decode("utf-8").strip()
output = equalize_decl(output)
assert output == expected
def test_index_false_rename_row_root(xml_books, parser):
    """Write the books file without index using custom root and row names."""
    want = """\
cooking
Everyday Italian
Giada De Laurentiis
2005
30.0
children
Harry Potter
J K. Rowling
2005
29.99
web
Learning XML
Erik T. Ray
2003
39.95
"""
    books = read_xml(xml_books, parser=parser)

    with tm.ensure_clean("test.xml") as path:
        books.to_xml(
            path,
            index=False,
            root_name="books",
            row_name="book",
            parser=parser,
        )
        # Read the written file back as bytes, then decode explicitly as UTF-8.
        with open(path, "rb") as fh:
            raw = fh.read()
        result = equalize_decl(raw.decode("utf-8").strip())

        assert result == want
@pytest.mark.parametrize("offset_index", [[10, 11, 12], ["10", "11", "12"]])
def test_index_false_with_offset_input_index(parser, offset_index, geom_df):
    """
    Check that ``index=False`` output carries no index field when the input
    DataFrame's index is offset (integer or string labels 10..12).

    Regression test for issue #42458.
    """
    want = """\
square
360
4.0
circle
360
triangle
180
3.0
"""
    # Work on a copy so the shared fixture frame keeps its default index.
    shifted = geom_df.copy()
    shifted.index = Index(offset_index)

    result = equalize_decl(shifted.to_xml(index=False, parser=parser))
    assert result == want
# NA_REP
# Expected text shared by test_na_elem_output (default na_rep) and
# test_na_empty_str_elem_option (na_rep=""); the "circle" row has no
# "sides" value.
na_expected = """\
0
square
360
4.0
1
circle
360
2
triangle
180
3.0
"""
def test_na_elem_output(parser, geom_df):
    """Default ``to_xml`` output must equal ``na_expected``."""
    result = equalize_decl(geom_df.to_xml(parser=parser))
    assert result == na_expected
def test_na_empty_str_elem_option(parser, geom_df):
    """``na_rep=""`` must produce the same text as ``na_expected``."""
    result = equalize_decl(geom_df.to_xml(na_rep="", parser=parser))
    assert result == na_expected
def test_na_empty_elem_option(parser, geom_df):
    """``na_rep="0.0"`` must fill the missing ``sides`` value with ``0.0``."""
    want = """\
0
square
360
4.0
1
circle
360
0.0
2
triangle
180
3.0
"""
    result = equalize_decl(geom_df.to_xml(na_rep="0.0", parser=parser))
    assert result == want
# ATTR_COLS
def test_attrs_cols_nan_output(parser, geom_df):
    """Write every data column through ``attr_cols`` and compare the output."""
    want = """\
"""
    as_attributes = ["shape", "degrees", "sides"]
    result = equalize_decl(geom_df.to_xml(attr_cols=as_attributes, parser=parser))
    assert result == want
def test_attrs_cols_prefix(parser, geom_df):
    """Write index and columns via ``attr_cols`` using the ``doc`` prefix."""
    want = """\
"""
    options = {
        "attr_cols": ["index", "shape", "degrees", "sides"],
        "namespaces": {"doc": "http://example.xom"},
        "prefix": "doc",
        "parser": parser,
    }
    result = equalize_decl(geom_df.to_xml(**options))
    assert result == want
def test_attrs_unknown_column(parser, geom_df):
    """An ``attr_cols`` entry that is not a column must raise ``KeyError``."""
    # "degree" is deliberately not a column of geom_df (the column is "degrees").
    bad_cols = ["shape", "degree", "sides"]
    with pytest.raises(KeyError, match="no valid column"):
        geom_df.to_xml(attr_cols=bad_cols, parser=parser)
def test_attrs_wrong_type(parser, geom_df):
    """Passing a plain string as ``attr_cols`` must raise ``TypeError``."""
    not_a_list = '"shape", "degree", "sides"'
    with pytest.raises(TypeError, match="is not a valid type for attr_cols"):
        geom_df.to_xml(attr_cols=not_a_list, parser=parser)
# ELEM_COLS
def test_elems_cols_nan_output(parser, geom_df):
    """``elem_cols`` must control the order in which columns are written."""
    want = """\
360
4.0
square
360
circle
180
3.0
triangle
"""
    ordered_cols = ["degrees", "sides", "shape"]
    raw = geom_df.to_xml(index=False, elem_cols=ordered_cols, parser=parser)
    result = equalize_decl(raw)
    assert result == want
def test_elems_unknown_column(parser, geom_df):
    """An ``elem_cols`` entry that is not a column must raise ``KeyError``."""
    # "degree" is deliberately not a column of geom_df (the column is "degrees").
    bad_cols = ["shape", "degree", "sides"]
    with pytest.raises(KeyError, match="no valid column"):
        geom_df.to_xml(elem_cols=bad_cols, parser=parser)
def test_elems_wrong_type(parser, geom_df):
    """Passing a plain string as ``elem_cols`` must raise ``TypeError``."""
    not_a_list = '"shape", "degree", "sides"'
    with pytest.raises(TypeError, match="is not a valid type for elem_cols"):
        geom_df.to_xml(elem_cols=not_a_list, parser=parser)
def test_elems_and_attrs_cols(parser, geom_df):
    """Combine ``elem_cols`` and ``attr_cols`` in a single ``to_xml`` call."""
    want = """\
360
4.0
360
180
3.0
"""
    raw = geom_df.to_xml(
        index=False,
        elem_cols=["degrees", "sides"],
        attr_cols=["shape"],
        parser=parser,
    )
    result = equalize_decl(raw)
    assert result == want
# HIERARCHICAL COLUMNS
def test_hierarchical_columns(parser, planet_df):
    """Serialize a pivot table (count/sum/mean of mass, with margins)."""
    want = """\
inner
terrestrial
4
11.81
2.95
outer
gas giant
2
2466.5
1233.25
outer
ice giant
2
189.23
94.61
All
8
2667.54
333.44
"""
    pivoted = planet_df.pivot_table(
        index=["location", "type"],
        values="mass",
        aggfunc=["count", "sum", "mean"],
        margins=True,
    )
    pvt = pivoted.round(2)

    result = equalize_decl(pvt.to_xml(parser=parser))
    assert result == want
def test_hierarchical_attrs_columns(parser, planet_df):
    """Serialize the pivot table with every column passed via ``attr_cols``."""
    want = """\
"""
    pivoted = planet_df.pivot_table(
        index=["location", "type"],
        values="mass",
        aggfunc=["count", "sum", "mean"],
        margins=True,
    )
    pvt = pivoted.round(2)

    # Column labels of the frame after the index levels are moved into columns.
    all_columns = list(pvt.reset_index().columns.values)
    result = equalize_decl(pvt.to_xml(attr_cols=all_columns, parser=parser))
    assert result == want
# MULTIINDEX
def test_multi_index(parser, planet_df):
    """Serialize a groupby aggregation indexed by ``location`` and ``type``."""
    want = """\
inner
terrestrial
4
11.81
2.95
outer
gas giant
2
2466.5
1233.25
outer
ice giant
2
189.23
94.61
"""
    grouped_mass = planet_df.groupby(["location", "type"])["mass"]
    agg = grouped_mass.agg(["count", "sum", "mean"]).round(2)

    result = equalize_decl(agg.to_xml(parser=parser))
    assert result == want
def test_multi_index_attrs_cols(parser, planet_df):
    """Serialize the groupby aggregation with every column in ``attr_cols``."""
    want = """\
"""
    grouped_mass = planet_df.groupby(["location", "type"])["mass"]
    agg = grouped_mass.agg(["count", "sum", "mean"]).round(2)

    # Column labels of the frame after the index levels are moved into columns.
    all_columns = list(agg.reset_index().columns.values)
    result = equalize_decl(agg.to_xml(attr_cols=all_columns, parser=parser))
    assert result == want
# NAMESPACE
def test_default_namespace(parser, geom_df):
    """Write with a default (empty-prefix) namespace mapping."""
    want = """\
0
square
360
4.0
1
circle
360
2
triangle
180
3.0
"""
    default_ns = {"": "http://example.com"}
    result = equalize_decl(geom_df.to_xml(namespaces=default_ns, parser=parser))
    assert result == want
def test_unused_namespaces(parser, geom_df):
    """Write with two prefixed namespaces while passing no ``prefix``."""
    want = """\
0
square
360
4.0
1
circle
360
2
triangle
180
3.0
"""
    declared = {"oth": "http://other.org", "ex": "http://example.com"}
    result = equalize_decl(geom_df.to_xml(namespaces=declared, parser=parser))
    assert result == want
# PREFIX
def test_namespace_prefix(parser, geom_df):
    """Write with the ``doc`` prefix bound in ``namespaces``."""
    want = """\
0
square
360
4.0
1
circle
360
2
triangle
180
3.0
"""
    raw = geom_df.to_xml(
        namespaces={"doc": "http://example.com"},
        prefix="doc",
        parser=parser,
    )
    result = equalize_decl(raw)
    assert result == want
def test_missing_prefix_in_nmsp(parser, geom_df):
    """A ``prefix`` absent from ``namespaces`` must raise ``KeyError``."""
    # Only the default namespace is declared, so "doc" cannot be resolved.
    only_default = {"": "http://example.com"}
    with pytest.raises(KeyError, match="doc is not included in namespaces"):
        geom_df.to_xml(namespaces=only_default, prefix="doc", parser=parser)
def test_namespace_prefix_and_default(parser, geom_df):
    """Write with both a default namespace and the ``doc`` prefix declared."""
    want = """\
0
square
360
4.0
1
circle
360
2
triangle
180
3.0
"""
    declared = {"": "http://example.com", "doc": "http://other.org"}
    raw = geom_df.to_xml(namespaces=declared, prefix="doc", parser=parser)
    result = equalize_decl(raw)
    assert result == want
# ENCODING
# Reference text for the ENCODING section: five numbered rows whose name
# values contain non-ASCII characters (e.g. "José", "Sofía").
# NOTE(review): presumably compared against to_xml output by the encoding
# tests that follow — confirm against the callers.
encoding_expected = """\
0
1
José
Sofía
1
2
Luis
Valentina
2
3
Carlos
Isabella
3
4
Juan
Camila
4
5
Jorge
Valeria
"""
def test_encoding_option_str(xml_baby_names, parser):
df_file = read_xml(xml_baby_names, parser=parser, encoding="ISO-8859-1").head(5)
output = df_file.to_xml(encoding="ISO-8859-1", parser=parser)
if output is not None:
# etree and lxml differ on quotes and case in xml declaration
output = output.replace(
'
0
square
360
4.0
1
circle
360
2
triangle
180
3.0
"""
output = geom_df.to_xml(xml_declaration=False)
assert output == expected
def test_no_pretty_print_with_decl(parser, geom_df):
expected = (
"\n"
"0square"
"3604.0
"
"1circle360"
"
2"
"triangle1803.0"
"
"
)
output = geom_df.to_xml(pretty_print=False, parser=parser)
output = equalize_decl(output)
# etree adds space for closed tags
if output is not None:
output = output.replace(" />", "/>")
assert output == expected
def test_no_pretty_print_no_decl(parser, geom_df):
expected = (
"0square"
"3604.0
"
"1circle360"
"
2"
"triangle1803.0"
"
"
)
output = geom_df.to_xml(xml_declaration=False, pretty_print=False, parser=parser)
# etree adds space for closed tags
if output is not None:
output = output.replace(" />", "/>")
assert output == expected
# PARSER
@td.skip_if_installed("lxml")
def test_default_parser_no_lxml(geom_df):
    """Without lxml installed, the default ``to_xml`` call raises ImportError."""
    expected_msg = "lxml not found, please install or use the etree parser."
    with pytest.raises(ImportError, match=expected_msg):
        geom_df.to_xml()
def test_unknown_parser(geom_df):
    """An unsupported ``parser`` value must raise ``ValueError``."""
    expected_msg = "Values for parser can only be lxml or etree."
    with pytest.raises(ValueError, match=expected_msg):
        geom_df.to_xml(parser="bs4")
# STYLESHEET
# Expected result of to_xml(stylesheet=...) shared by
# test_stylesheet_file_like, test_stylesheet_io and
# test_stylesheet_buffered_reader.
xsl_expected = """\
0
square
360
4.0
1
circle
360
2
triangle
180
3.0
"""
def test_stylesheet_file_like(xsl_row_field_output, mode, geom_df):
    """Pass an open file object (opened with ``mode``) as ``stylesheet``."""
    pytest.importorskip("lxml")

    # An explicit encoding is only supplied when the file is opened as "r".
    text_encoding = "utf-8" if mode == "r" else None
    with open(xsl_row_field_output, mode, encoding=text_encoding) as xsl_file:
        result = geom_df.to_xml(stylesheet=xsl_file)
        assert result == xsl_expected
def test_stylesheet_io(xsl_row_field_output, mode, geom_df):
    """Pass an in-memory ``BytesIO``/``StringIO`` copy of the stylesheet."""
    pytest.importorskip("lxml")

    # An explicit encoding is only supplied when the file is opened as "r".
    text_encoding = "utf-8" if mode == "r" else None
    # "rb" yields bytes and needs BytesIO; anything else is wrapped in StringIO.
    buffer_cls = BytesIO if mode == "rb" else StringIO

    with open(xsl_row_field_output, mode, encoding=text_encoding) as fh:
        xsl_obj = buffer_cls(fh.read())

    result = geom_df.to_xml(stylesheet=xsl_obj)
    assert result == xsl_expected
def test_stylesheet_buffered_reader(xsl_row_field_output, mode, geom_df):
    """Pass the stylesheet's full contents (as read with ``mode``)."""
    pytest.importorskip("lxml")

    # An explicit encoding is only supplied when the file is opened as "r".
    text_encoding = "utf-8" if mode == "r" else None
    with open(xsl_row_field_output, mode, encoding=text_encoding) as fh:
        xsl_content = fh.read()

    result = geom_df.to_xml(stylesheet=xsl_content)
    assert result == xsl_expected
def test_stylesheet_wrong_path(geom_df):
    """A relative stylesheet path string must raise ``XMLSyntaxError``."""
    lxml_etree = pytest.importorskip("lxml.etree")

    bad_path = os.path.join("data", "xml", "row_field_output.xslt")
    expected_msg = "Start tag expected, '<' not found"

    with pytest.raises(lxml_etree.XMLSyntaxError, match=expected_msg):
        geom_df.to_xml(stylesheet=bad_path)
@pytest.mark.parametrize("val", ["", b""])
def test_empty_string_stylesheet(val, geom_df):
    """An empty ``str`` or ``bytes`` stylesheet must raise ``XMLSyntaxError``."""
    lxml_etree = pytest.importorskip("lxml.etree")

    # Any one of these messages is accepted; they are OR-ed into one regex.
    accepted_messages = (
        "Document is empty",
        "Start tag expected, '<' not found",
        # Seen on Mac with lxml 4.9.1
        r"None \(line 0\)",
    )
    pattern = "|".join(accepted_messages)

    with pytest.raises(lxml_etree.XMLSyntaxError, match=pattern):
        geom_df.to_xml(stylesheet=val)
def test_incorrect_xsl_syntax(geom_df):
    """Expect ``XMLSyntaxError`` (tag mismatch) for the stylesheet below."""
    lxml_etree = pytest.importorskip("lxml.etree")

    xsl = """\
"""
    expected_msg = "Opening and ending tag mismatch"

    with pytest.raises(lxml_etree.XMLSyntaxError, match=expected_msg):
        geom_df.to_xml(stylesheet=xsl)
def test_incorrect_xsl_eval(geom_df):
    """Expect ``XSLTParseError`` ("failed to compile") for the stylesheet below."""
    lxml_etree = pytest.importorskip("lxml.etree")

    xsl = """\
"""
    expected_msg = "failed to compile"

    with pytest.raises(lxml_etree.XSLTParseError, match=expected_msg):
        geom_df.to_xml(stylesheet=xsl)
def test_incorrect_xsl_apply(geom_df):
    """Expect ``XSLTApplyError`` ("Cannot resolve URI") when writing to a file."""
    lxml_etree = pytest.importorskip("lxml.etree")

    xsl = """\
"""
    expected_msg = "Cannot resolve URI"

    with pytest.raises(lxml_etree.XSLTApplyError, match=expected_msg):
        with tm.ensure_clean("test.xml") as path:
            geom_df.to_xml(path, stylesheet=xsl)
def test_stylesheet_with_etree(geom_df):
    """Combining ``parser="etree"`` with a stylesheet must raise ``ValueError``."""
    xsl = """\
"""
    expected_msg = "To use stylesheet, you need lxml installed"

    with pytest.raises(ValueError, match=expected_msg):
        geom_df.to_xml(parser="etree", stylesheet=xsl)
def test_style_to_csv(geom_df):
    """Stylesheet-transformed ``to_xml`` output must equal stripped ``to_csv``."""
    pytest.importorskip("lxml")

    xsl = """\
,
,shape,degrees,sides
"""
    csv_text = geom_df.to_csv(lineterminator="\n")
    # to_csv may return None; only strip when a string came back.
    csv_text = csv_text.strip() if csv_text is not None else csv_text

    xml_text = geom_df.to_xml(stylesheet=xsl)

    assert csv_text == xml_text
def test_style_to_string(geom_df):
    """Stylesheet-transformed ``to_xml`` output must equal ``to_string``."""
    pytest.importorskip("lxml")

    xsl = """\
shape degrees sides
"""
    string_text = geom_df.to_string()
    xml_text = geom_df.to_xml(na_rep="NaN", stylesheet=xsl)

    assert xml_text == string_text
def test_style_to_json(geom_df):
    """Stylesheet-transformed ``to_xml`` output must equal ``to_json``."""
    pytest.importorskip("lxml")

    xsl = """\
"
{"shape":{
},"degrees":{
},"sides":{
}}
,
"""
    json_text = geom_df.to_json()
    xml_text = geom_df.to_xml(stylesheet=xsl)

    assert json_text == xml_text
# COMPRESSION
# Expected text shared by test_compression_output and
# test_filename_and_suffix_comp after reading the compressed file back.
geom_xml = """\
0
square
360
4.0
1
circle
360
2
triangle
180
3.0
"""
def test_compression_output(parser, compression_only, geom_df):
    """Write with each compression type and read the text back unchanged."""
    with tm.ensure_clean() as path:
        geom_df.to_xml(path, parser=parser, compression=compression_only)

        # Re-open through get_handle so the same compression is undone.
        with get_handle(path, "r", compression=compression_only) as handles:
            raw_text = handles.handle.read()

        result = equalize_decl(raw_text)
        assert geom_xml == result.strip()
def test_filename_and_suffix_comp(
    parser, compression_only, geom_df, compression_to_extension
):
    """Write to a file named with the compression's extension and read it back."""
    extension = compression_to_extension[compression_only]
    compfile = "xml." + extension

    with tm.ensure_clean(filename=compfile) as path:
        geom_df.to_xml(path, parser=parser, compression=compression_only)

        # Re-open through get_handle so the same compression is undone.
        with get_handle(path, "r", compression=compression_only) as handles:
            raw_text = handles.handle.read()

        result = equalize_decl(raw_text)
        assert geom_xml == result.strip()
def test_ea_dtypes(any_numeric_ea_dtype, parser):
    """Serialize a single ``NA`` value held in a numeric extension dtype."""
    # GH#43903
    want = """
0
"""
    frame = DataFrame({"a": [NA]})
    frame = frame.astype(any_numeric_ea_dtype)

    xml_text = frame.to_xml(parser=parser)
    assert equalize_decl(xml_text).strip() == want
def test_unsuported_compression(parser, geom_df):
    """An unknown ``compression`` value must raise ``ValueError``."""
    expected_msg = "Unrecognized compression type"
    with pytest.raises(ValueError, match=expected_msg):
        with tm.ensure_clean() as path:
            geom_df.to_xml(path, parser=parser, compression="7z")
# STORAGE OPTIONS
@pytest.mark.single_cpu
def test_s3_permission_output(parser, s3_public_bucket, geom_df):
    """Writing to the bucket through an anonymous S3 filesystem must fail."""
    s3fs = pytest.importorskip("s3fs")
    pytest.importorskip("lxml")

    expected_errors = (PermissionError, FileNotFoundError)
    with tm.external_error_raised(expected_errors):
        anonymous_fs = s3fs.S3FileSystem(anon=True)
        anonymous_fs.ls(s3_public_bucket.name)

        target = f"s3://{s3_public_bucket.name}/geom.xml"
        geom_df.to_xml(target, compression="zip", parser=parser)