from __future__ import annotations from io import ( BytesIO, StringIO, ) import os import numpy as np import pytest import pandas.util._test_decorators as td from pandas import ( NA, DataFrame, Index, ) import pandas._testing as tm from pandas.io.common import get_handle from pandas.io.xml import read_xml # CHECKLIST # [x] - ValueError: "Values for parser can only be lxml or etree." # etree # [x] - ImportError: "lxml not found, please install or use the etree parser." # [X] - TypeError: "...is not a valid type for attr_cols" # [X] - TypeError: "...is not a valid type for elem_cols" # [X] - LookupError: "unknown encoding" # [X] - KeyError: "...is not included in namespaces" # [X] - KeyError: "no valid column" # [X] - ValueError: "To use stylesheet, you need lxml installed..." # [] - OSError: (NEED PERMISSION ISSUE, DISK FULL, ETC.) # [X] - FileNotFoundError: "No such file or directory" # [X] - PermissionError: "Forbidden" # lxml # [X] - TypeError: "...is not a valid type for attr_cols" # [X] - TypeError: "...is not a valid type for elem_cols" # [X] - LookupError: "unknown encoding" # [] - OSError: (NEED PERMISSION ISSUE, DISK FULL, ETC.) # [X] - FileNotFoundError: "No such file or directory" # [X] - KeyError: "...is not included in namespaces" # [X] - KeyError: "no valid column" # [X] - ValueError: "stylesheet is not a url, file, or xml string."
# [] - LookupError: (NEED WRONG ENCODING FOR FILE OUTPUT) # [] - URLError: (USUALLY DUE TO NETWORKING) # [] - HTTPError: (NEED AN ONLINE STYLESHEET) # [X] - OSError: "failed to load external entity" # [X] - XMLSyntaxError: "Opening and ending tag mismatch" # [X] - XSLTApplyError: "Cannot resolve URI" # [X] - XSLTParseError: "failed to compile" # [X] - PermissionError: "Forbidden" @pytest.fixture def geom_df(): return DataFrame( { "shape": ["square", "circle", "triangle"], "degrees": [360, 360, 180], "sides": [4, np.nan, 3], } ) @pytest.fixture def planet_df(): return DataFrame( { "planet": [ "Mercury", "Venus", "Earth", "Mars", "Jupiter", "Saturn", "Uranus", "Neptune", ], "type": [ "terrestrial", "terrestrial", "terrestrial", "terrestrial", "gas giant", "gas giant", "ice giant", "ice giant", ], "location": [ "inner", "inner", "inner", "inner", "outer", "outer", "outer", "outer", ], "mass": [ 0.330114, 4.86747, 5.97237, 0.641712, 1898.187, 568.3174, 86.8127, 102.4126, ], } ) @pytest.fixture def from_file_expected(): return """\ 0 cooking Everyday Italian Giada De Laurentiis 2005 30.0 1 children Harry Potter J K. Rowling 2005 29.99 2 web Learning XML Erik T. Ray 2003 39.95 """ def equalize_decl(doc): # etree and lxml differ on quotes and case in xml declaration if doc is not None: doc = doc.replace( ' cooking Everyday Italian Giada De Laurentiis 2005 30.0 children Harry Potter J K. Rowling 2005 29.99 web Learning XML Erik T. Ray 2003 39.95 """ df_file = read_xml(xml_books, parser=parser) with tm.ensure_clean("test.xml") as path: df_file.to_xml(path, index=False, parser=parser) with open(path, "rb") as f: output = f.read().decode("utf-8").strip() output = equalize_decl(output) assert output == expected def test_index_false_rename_row_root(xml_books, parser): expected = """\ cooking Everyday Italian Giada De Laurentiis 2005 30.0 children Harry Potter J K. Rowling 2005 29.99 web Learning XML Erik T. 
Ray 2003 39.95 """ df_file = read_xml(xml_books, parser=parser) with tm.ensure_clean("test.xml") as path: df_file.to_xml( path, index=False, root_name="books", row_name="book", parser=parser ) with open(path, "rb") as f: output = f.read().decode("utf-8").strip() output = equalize_decl(output) assert output == expected @pytest.mark.parametrize( "offset_index", [list(range(10, 13)), [str(i) for i in range(10, 13)]] ) def test_index_false_with_offset_input_index(parser, offset_index, geom_df): """ Tests that the output does not contain the `` field when the index of the input Dataframe has an offset. This is a regression test for issue #42458. """ expected = """\ square 360 4.0 circle 360 triangle 180 3.0 """ offset_geom_df = geom_df.copy() offset_geom_df.index = Index(offset_index) output = offset_geom_df.to_xml(index=False, parser=parser) output = equalize_decl(output) assert output == expected # NA_REP na_expected = """\ 0 square 360 4.0 1 circle 360 2 triangle 180 3.0 """ def test_na_elem_output(parser, geom_df): output = geom_df.to_xml(parser=parser) output = equalize_decl(output) assert output == na_expected def test_na_empty_str_elem_option(parser, geom_df): output = geom_df.to_xml(na_rep="", parser=parser) output = equalize_decl(output) assert output == na_expected def test_na_empty_elem_option(parser, geom_df): expected = """\ 0 square 360 4.0 1 circle 360 0.0 2 triangle 180 3.0 """ output = geom_df.to_xml(na_rep="0.0", parser=parser) output = equalize_decl(output) assert output == expected # ATTR_COLS def test_attrs_cols_nan_output(parser, geom_df): expected = """\ """ output = geom_df.to_xml(attr_cols=["shape", "degrees", "sides"], parser=parser) output = equalize_decl(output) assert output == expected def test_attrs_cols_prefix(parser, geom_df): expected = """\ """ output = geom_df.to_xml( attr_cols=["index", "shape", "degrees", "sides"], namespaces={"doc": "http://example.xom"}, prefix="doc", parser=parser, ) output = equalize_decl(output) assert output 
== expected def test_attrs_unknown_column(parser, geom_df): with pytest.raises(KeyError, match=("no valid column")): geom_df.to_xml(attr_cols=["shape", "degree", "sides"], parser=parser) def test_attrs_wrong_type(parser, geom_df): with pytest.raises(TypeError, match=("is not a valid type for attr_cols")): geom_df.to_xml(attr_cols='"shape", "degree", "sides"', parser=parser) # ELEM_COLS def test_elems_cols_nan_output(parser, geom_df): elems_cols_expected = """\ 360 4.0 square 360 circle 180 3.0 triangle """ output = geom_df.to_xml( index=False, elem_cols=["degrees", "sides", "shape"], parser=parser ) output = equalize_decl(output) assert output == elems_cols_expected def test_elems_unknown_column(parser, geom_df): with pytest.raises(KeyError, match=("no valid column")): geom_df.to_xml(elem_cols=["shape", "degree", "sides"], parser=parser) def test_elems_wrong_type(parser, geom_df): with pytest.raises(TypeError, match=("is not a valid type for elem_cols")): geom_df.to_xml(elem_cols='"shape", "degree", "sides"', parser=parser) def test_elems_and_attrs_cols(parser, geom_df): elems_cols_expected = """\ 360 4.0 360 180 3.0 """ output = geom_df.to_xml( index=False, elem_cols=["degrees", "sides"], attr_cols=["shape"], parser=parser, ) output = equalize_decl(output) assert output == elems_cols_expected # HIERARCHICAL COLUMNS def test_hierarchical_columns(parser, planet_df): expected = """\ inner terrestrial 4 11.81 2.95 outer gas giant 2 2466.5 1233.25 outer ice giant 2 189.23 94.61 All 8 2667.54 333.44 """ pvt = planet_df.pivot_table( index=["location", "type"], values="mass", aggfunc=["count", "sum", "mean"], margins=True, ).round(2) output = pvt.to_xml(parser=parser) output = equalize_decl(output) assert output == expected def test_hierarchical_attrs_columns(parser, planet_df): expected = """\ """ pvt = planet_df.pivot_table( index=["location", "type"], values="mass", aggfunc=["count", "sum", "mean"], margins=True, ).round(2) output = 
pvt.to_xml(attr_cols=list(pvt.reset_index().columns.values), parser=parser) output = equalize_decl(output) assert output == expected # MULTIINDEX def test_multi_index(parser, planet_df): expected = """\ inner terrestrial 4 11.81 2.95 outer gas giant 2 2466.5 1233.25 outer ice giant 2 189.23 94.61 """ agg = ( planet_df.groupby(["location", "type"])["mass"] .agg(["count", "sum", "mean"]) .round(2) ) output = agg.to_xml(parser=parser) output = equalize_decl(output) assert output == expected def test_multi_index_attrs_cols(parser, planet_df): expected = """\ """ agg = ( planet_df.groupby(["location", "type"])["mass"] .agg(["count", "sum", "mean"]) .round(2) ) output = agg.to_xml(attr_cols=list(agg.reset_index().columns.values), parser=parser) output = equalize_decl(output) assert output == expected # NAMESPACE def test_default_namespace(parser, geom_df): expected = """\ 0 square 360 4.0 1 circle 360 2 triangle 180 3.0 """ output = geom_df.to_xml(namespaces={"": "http://example.com"}, parser=parser) output = equalize_decl(output) assert output == expected def test_unused_namespaces(parser, geom_df): expected = """\ 0 square 360 4.0 1 circle 360 2 triangle 180 3.0 """ output = geom_df.to_xml( namespaces={"oth": "http://other.org", "ex": "http://example.com"}, parser=parser, ) output = equalize_decl(output) assert output == expected # PREFIX def test_namespace_prefix(parser, geom_df): expected = """\ 0 square 360 4.0 1 circle 360 2 triangle 180 3.0 """ output = geom_df.to_xml( namespaces={"doc": "http://example.com"}, prefix="doc", parser=parser ) output = equalize_decl(output) assert output == expected def test_missing_prefix_in_nmsp(parser, geom_df): with pytest.raises(KeyError, match=("doc is not included in namespaces")): geom_df.to_xml( namespaces={"": "http://example.com"}, prefix="doc", parser=parser ) def test_namespace_prefix_and_default(parser, geom_df): expected = """\ 0 square 360 4.0 1 circle 360 2 triangle 180 3.0 """ output = geom_df.to_xml( namespaces={"": 
"http://example.com", "doc": "http://other.org"}, prefix="doc", parser=parser, ) output = equalize_decl(output) assert output == expected # ENCODING encoding_expected = """\ 0 1 José Sofía 1 2 Luis Valentina 2 3 Carlos Isabella 3 4 Juan Camila 4 5 Jorge Valeria """ def test_encoding_option_str(xml_baby_names, parser): df_file = read_xml(xml_baby_names, parser=parser, encoding="ISO-8859-1").head(5) output = df_file.to_xml(encoding="ISO-8859-1", parser=parser) if output is not None: # etree and lxml differ on quotes and case in xml declaration output = output.replace( ' 0 square 360 4.0 1 circle 360 2 triangle 180 3.0 """ output = geom_df.to_xml(xml_declaration=False) assert output == expected def test_no_pretty_print_with_decl(parser, geom_df): expected = ( "\n" "0square" "3604.0" "1circle360" "2" "triangle1803.0" "" ) output = geom_df.to_xml(pretty_print=False, parser=parser) output = equalize_decl(output) # etree adds space for closed tags if output is not None: output = output.replace(" />", "/>") assert output == expected def test_no_pretty_print_no_decl(parser, geom_df): expected = ( "0square" "3604.0" "1circle360" "2" "triangle1803.0" "" ) output = geom_df.to_xml(xml_declaration=False, pretty_print=False, parser=parser) # etree adds space for closed tags if output is not None: output = output.replace(" />", "/>") assert output == expected # PARSER @td.skip_if_installed("lxml") def test_default_parser_no_lxml(geom_df): with pytest.raises( ImportError, match=("lxml not found, please install or use the etree parser.") ): geom_df.to_xml() def test_unknown_parser(geom_df): with pytest.raises( ValueError, match=("Values for parser can only be lxml or etree.") ): geom_df.to_xml(parser="bs4") # STYLESHEET xsl_expected = """\ 0 square 360 4.0 1 circle 360 2 triangle 180 3.0 """ def test_stylesheet_file_like(xsl_row_field_output, mode, geom_df): pytest.importorskip("lxml") with open( xsl_row_field_output, mode, encoding="utf-8" if mode == "r" else None ) as f: assert 
geom_df.to_xml(stylesheet=f) == xsl_expected def test_stylesheet_io(xsl_row_field_output, mode, geom_df): # note: By default the bodies of untyped functions are not checked, # consider using --check-untyped-defs pytest.importorskip("lxml") xsl_obj: BytesIO | StringIO # type: ignore[annotation-unchecked] with open( xsl_row_field_output, mode, encoding="utf-8" if mode == "r" else None ) as f: if mode == "rb": xsl_obj = BytesIO(f.read()) else: xsl_obj = StringIO(f.read()) output = geom_df.to_xml(stylesheet=xsl_obj) assert output == xsl_expected def test_stylesheet_buffered_reader(xsl_row_field_output, mode, geom_df): pytest.importorskip("lxml") with open( xsl_row_field_output, mode, encoding="utf-8" if mode == "r" else None ) as f: xsl_obj = f.read() output = geom_df.to_xml(stylesheet=xsl_obj) assert output == xsl_expected def test_stylesheet_wrong_path(geom_df): lxml_etree = pytest.importorskip("lxml.etree") xsl = os.path.join("data", "xml", "row_field_output.xslt") with pytest.raises( lxml_etree.XMLSyntaxError, match=("Start tag expected, '<' not found"), ): geom_df.to_xml(stylesheet=xsl) @pytest.mark.parametrize("val", ["", b""]) def test_empty_string_stylesheet(val, geom_df): lxml_etree = pytest.importorskip("lxml.etree") msg = "|".join( [ "Document is empty", "Start tag expected, '<' not found", # Seen on Mac with lxml 4.9.1 r"None \(line 0\)", ] ) with pytest.raises(lxml_etree.XMLSyntaxError, match=msg): geom_df.to_xml(stylesheet=val) def test_incorrect_xsl_syntax(geom_df): lxml_etree = pytest.importorskip("lxml.etree") xsl = """\ """ with pytest.raises( lxml_etree.XMLSyntaxError, match=("Opening and ending tag mismatch") ): geom_df.to_xml(stylesheet=xsl) def test_incorrect_xsl_eval(geom_df): lxml_etree = pytest.importorskip("lxml.etree") xsl = """\ """ with pytest.raises(lxml_etree.XSLTParseError, match=("failed to compile")): geom_df.to_xml(stylesheet=xsl) def test_incorrect_xsl_apply(geom_df): lxml_etree = pytest.importorskip("lxml.etree") xsl = """\ """ with 
pytest.raises(lxml_etree.XSLTApplyError, match=("Cannot resolve URI")): with tm.ensure_clean("test.xml") as path: geom_df.to_xml(path, stylesheet=xsl) def test_stylesheet_with_etree(geom_df): xsl = """\ """ with pytest.raises( ValueError, match=("To use stylesheet, you need lxml installed") ): geom_df.to_xml(parser="etree", stylesheet=xsl) def test_style_to_csv(geom_df): pytest.importorskip("lxml") xsl = """\ , ,shape,degrees,sides """ out_csv = geom_df.to_csv(lineterminator="\n") if out_csv is not None: out_csv = out_csv.strip() out_xml = geom_df.to_xml(stylesheet=xsl) assert out_csv == out_xml def test_style_to_string(geom_df): pytest.importorskip("lxml") xsl = """\ shape degrees sides """ out_str = geom_df.to_string() out_xml = geom_df.to_xml(na_rep="NaN", stylesheet=xsl) assert out_xml == out_str def test_style_to_json(geom_df): pytest.importorskip("lxml") xsl = """\ " {"shape":{ },"degrees":{ },"sides":{ }} , """ out_json = geom_df.to_json() out_xml = geom_df.to_xml(stylesheet=xsl) assert out_json == out_xml # COMPRESSION geom_xml = """\ 0 square 360 4.0 1 circle 360 2 triangle 180 3.0 """ def test_compression_output(parser, compression_only, geom_df): with tm.ensure_clean() as path: geom_df.to_xml(path, parser=parser, compression=compression_only) with get_handle( path, "r", compression=compression_only, ) as handle_obj: output = handle_obj.handle.read() output = equalize_decl(output) assert geom_xml == output.strip() def test_filename_and_suffix_comp( parser, compression_only, geom_df, compression_to_extension ): compfile = "xml." 
+ compression_to_extension[compression_only] with tm.ensure_clean(filename=compfile) as path: geom_df.to_xml(path, parser=parser, compression=compression_only) with get_handle( path, "r", compression=compression_only, ) as handle_obj: output = handle_obj.handle.read() output = equalize_decl(output) assert geom_xml == output.strip() def test_ea_dtypes(any_numeric_ea_dtype, parser): # GH#43903 expected = """ 0 """ df = DataFrame({"a": [NA]}).astype(any_numeric_ea_dtype) result = df.to_xml(parser=parser) assert equalize_decl(result).strip() == expected def test_unsuported_compression(parser, geom_df): with pytest.raises(ValueError, match="Unrecognized compression type"): with tm.ensure_clean() as path: geom_df.to_xml(path, parser=parser, compression="7z") # STORAGE OPTIONS @pytest.mark.single_cpu def test_s3_permission_output(parser, s3_public_bucket, geom_df): s3fs = pytest.importorskip("s3fs") pytest.importorskip("lxml") with tm.external_error_raised((PermissionError, FileNotFoundError)): fs = s3fs.S3FileSystem(anon=True) fs.ls(s3_public_bucket.name) geom_df.to_xml( f"s3://{s3_public_bucket.name}/geom.xml", compression="zip", parser=parser )