from datetime import datetime import re import numpy as np import pytest from pandas.errors import PerformanceWarning import pandas.util._test_decorators as td import pandas as pd from pandas import ( Series, _testing as tm, ) from pandas.tests.strings import ( _convert_na_value, object_pyarrow_numpy, ) # -------------------------------------------------------------------------------------- # str.contains # -------------------------------------------------------------------------------------- def using_pyarrow(dtype): return dtype in ("string[pyarrow]", "string[pyarrow_numpy]") def test_contains(any_string_dtype): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ ) values = Series(values, dtype=any_string_dtype) pat = "mmm[_]+" result = values.str.contains(pat) expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series( np.array([False, np.nan, True, True, False], dtype=np.object_), dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = values.str.contains(pat, regex=False) expected = Series( np.array([False, np.nan, False, False, True], dtype=np.object_), dtype=expected_dtype, ) tm.assert_series_equal(result, expected) values = Series( np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object), dtype=any_string_dtype, ) result = values.str.contains(pat) expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) # case insensitive using regex values = Series( np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object), dtype=any_string_dtype, ) result = values.str.contains("FOO|mmm", case=False) expected = Series(np.array([True, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) # case insensitive without regex result = values.str.contains("foo", regex=False, case=False) expected = Series(np.array([True, False, True, False]), dtype=expected_dtype) tm.assert_series_equal(result, expected) # unicode values = Series( np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_), dtype=any_string_dtype, ) pat = "mmm[_]+" result = values.str.contains(pat) expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series( np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype ) tm.assert_series_equal(result, expected) result = values.str.contains(pat, na=False) expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) values = Series( np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_), dtype=any_string_dtype, ) result = values.str.contains(pat) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_contains_object_mixed(): mixed = Series( np.array( ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=object, ) ) result = mixed.str.contains("o") expected = Series( np.array( [False, np.nan, False, np.nan, np.nan, True, None, np.nan, np.nan], dtype=np.object_, ) ) tm.assert_series_equal(result, expected) def test_contains_na_kwarg_for_object_category(): # gh 22158 # na for category values = Series(["a", "b", "c", "a", np.nan], dtype="category") result = values.str.contains("a", na=True) expected = Series([True, False, False, True, True]) tm.assert_series_equal(result, expected) result = values.str.contains("a", na=False) expected = Series([True, False, False, True, False]) tm.assert_series_equal(result, expected) # na for objects values = Series(["a", "b", "c", "a", np.nan]) result = values.str.contains("a", na=True) expected = Series([True, False, False, True, True]) tm.assert_series_equal(result, expected) result = values.str.contains("a", na=False) expected = Series([True, False, False, True, False]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "na, expected", [ (None, pd.NA), (True, True), (False, False), (0, False), (3, True), (np.nan, pd.NA), ], ) @pytest.mark.parametrize("regex", [True, False]) def test_contains_na_kwarg_for_nullable_string_dtype( nullable_string_dtype, na, expected, regex ): # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416 values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype) result = values.str.contains("a", na=na, regex=regex) expected = Series([True, False, False, True, expected], dtype="boolean") tm.assert_series_equal(result, expected) def test_contains_moar(any_string_dtype): # PR #1179 s = Series( ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], dtype=any_string_dtype, ) result = s.str.contains("a") expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series( [False, False, False, True, True, False, np.nan, False, False, True], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("a", case=False) expected = Series( [True, False, False, True, True, False, np.nan, True, False, True], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("Aa") expected = Series( [False, False, False, True, False, False, np.nan, False, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("ba") expected = Series( [False, False, False, True, False, False, np.nan, False, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("ba", case=False) expected = Series( [False, False, False, True, True, False, np.nan, True, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) def test_contains_nan(any_string_dtype): # PR #14171 s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) result = s.str.contains("foo", na=False) expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = s.str.contains("foo", na=True) expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) elif any_string_dtype == "string[pyarrow_numpy]": expected = Series([True, True, True], dtype=np.bool_) else: expected = Series([True, True, True], dtype="boolean") tm.assert_series_equal(result, expected) result = s.str.contains("foo") expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) # -------------------------------------------------------------------------------------- # str.startswith # -------------------------------------------------------------------------------------- @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) @pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) def test_startswith(pat, dtype, null_value, na): # add category dtype parametrizations for GH-36241 values = Series( ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], dtype=dtype, ) result = values.str.startswith(pat) exp = Series([False, np.nan, True, False, False, np.nan, True]) if dtype == "object" and null_value is pd.NA: # GH#18463 exp = exp.fillna(null_value) elif dtype == "object" and null_value is None: exp[exp.isna()] = None tm.assert_series_equal(result, exp) result = values.str.startswith(pat, na=na) exp = Series([False, na, True, False, False, na, True]) tm.assert_series_equal(result, exp) # mixed mixed = np.array( ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=np.object_, ) rs = Series(mixed).str.startswith("f") xp = Series([False, np.nan, False, np.nan, np.nan, True, None, np.nan, np.nan]) tm.assert_series_equal(rs, xp) @pytest.mark.parametrize("na", [None, True, False]) def test_startswith_nullable_string_dtype(nullable_string_dtype, na): values = Series( ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], dtype=nullable_string_dtype, ) result = values.str.startswith("foo", na=na) exp = Series( [False, na, True, False, False, na, True, False, False], dtype="boolean" ) tm.assert_series_equal(result, exp) result = values.str.startswith("rege.", na=na) exp = Series( [False, na, False, False, False, na, False, False, True], dtype="boolean" ) tm.assert_series_equal(result, exp) # -------------------------------------------------------------------------------------- # str.endswith # -------------------------------------------------------------------------------------- @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) @pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) def test_endswith(pat, dtype, null_value, na): # add category dtype parametrizations for GH-36241 values = Series( ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], dtype=dtype, ) result = values.str.endswith(pat) exp = Series([False, np.nan, False, False, True, np.nan, True]) if dtype == "object" and null_value is pd.NA: # GH#18463 exp = exp.fillna(null_value) elif dtype == "object" and null_value is None: exp[exp.isna()] = None tm.assert_series_equal(result, exp) result = values.str.endswith(pat, na=na) exp = Series([False, na, False, False, True, na, True]) tm.assert_series_equal(result, exp) # mixed mixed = np.array( ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=object, ) rs = Series(mixed).str.endswith("f") xp = Series([False, np.nan, False, np.nan, np.nan, False, None, np.nan, np.nan]) tm.assert_series_equal(rs, xp) @pytest.mark.parametrize("na", [None, True, False]) def test_endswith_nullable_string_dtype(nullable_string_dtype, na): values = Series( ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], dtype=nullable_string_dtype, ) result = values.str.endswith("foo", na=na) exp = Series( [False, na, False, False, True, na, True, False, False], dtype="boolean" ) tm.assert_series_equal(result, exp) result = values.str.endswith("rege.", na=na) exp = Series( [False, na, False, False, False, na, False, False, True], dtype="boolean" ) tm.assert_series_equal(result, exp) # -------------------------------------------------------------------------------------- # str.replace # -------------------------------------------------------------------------------------- def test_replace(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) result = ser.str.replace("BAD[_]*", "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) def test_replace_max_replacements(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) result = ser.str.replace("BAD[_]*", "", n=1, regex=True) tm.assert_series_equal(result, expected) expected = Series(["foo__barBAD", np.nan], dtype=any_string_dtype) result = ser.str.replace("BAD", "", n=1, regex=False) tm.assert_series_equal(result, expected) def test_replace_mixed_object(): ser = Series( ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) result = Series(ser).str.replace("BAD[_]*", "", regex=True) expected = Series( ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object ) tm.assert_series_equal(result, expected) def test_replace_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("repl", [None, 3, {"a": "b"}]) @pytest.mark.parametrize("data", [["a", "b", None], ["a", "b", "c", "ad"]]) def test_replace_wrong_repl_type_raises(any_string_dtype, index_or_series, repl, data): # https://github.com/pandas-dev/pandas/issues/13438 msg = "repl must be a string or callable" obj = index_or_series(data, dtype=any_string_dtype) with pytest.raises(TypeError, match=msg): obj.str.replace("a", repl) def test_replace_callable(any_string_dtype): # GH 15055 ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with callable repl = lambda m: m.group(0).swapcase() with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "repl", [lambda: None, lambda m, x: None, lambda m, x, y=None: None] ) def test_replace_callable_raises(any_string_dtype, repl): # GH 15055 values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with wrong number of arguments, raising an error msg = ( r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " r"(?(3)required )positional arguments?" ) with pytest.raises(TypeError, match=msg): with tm.maybe_produces_warning( PerformanceWarning, using_pyarrow(any_string_dtype) ): values.str.replace("a", repl, regex=True) def test_replace_callable_named_groups(any_string_dtype): # test regex named groups ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group("middle").swapcase() with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) def test_replace_compiled_regex(any_string_dtype): # GH 15446 ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with compiled regex pat = re.compile(r"BAD_*") with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) def test_replace_compiled_regex_mixed_object(): pat = re.compile(r"BAD_*") ser = Series( ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) result = Series(ser).str.replace(pat, "", regex=True) expected = Series( ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object ) tm.assert_series_equal(result, expected) def test_replace_compiled_regex_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, ", ", regex=True) tm.assert_series_equal(result, expected) def test_replace_compiled_regex_raises(any_string_dtype): # case and flags provided to str.replace will have no effect # and will produce warnings ser = Series(["fooBAD__barBAD__bad", np.nan], dtype=any_string_dtype) pat = re.compile(r"BAD_*") msg = "case and flags cannot be set when pat is a compiled regex" with pytest.raises(ValueError, match=msg): ser.str.replace(pat, "", flags=re.IGNORECASE, regex=True) with pytest.raises(ValueError, match=msg): ser.str.replace(pat, "", case=False, regex=True) with pytest.raises(ValueError, match=msg): ser.str.replace(pat, "", case=True, regex=True) def test_replace_compiled_regex_callable(any_string_dtype): # test with callable ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "regex,expected", [(True, ["bao", "bao", np.nan]), (False, ["bao", "foo", np.nan])] ) def test_replace_literal(regex, expected, any_string_dtype): # GH16808 literal replace (regex=False vs regex=True) ser = Series(["f.o", "foo", np.nan], dtype=any_string_dtype) expected = Series(expected, dtype=any_string_dtype) result = ser.str.replace("f.", "ba", regex=regex) tm.assert_series_equal(result, expected) def test_replace_literal_callable_raises(any_string_dtype): ser = Series([], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() msg = "Cannot use a callable replacement when regex=False" with pytest.raises(ValueError, match=msg): ser.str.replace("abc", repl, regex=False) def test_replace_literal_compiled_raises(any_string_dtype): ser = Series([], dtype=any_string_dtype) pat = re.compile("[a-z][A-Z]{2}") msg = "Cannot use a compiled regex as replacement pattern with regex=False" with pytest.raises(ValueError, match=msg): ser.str.replace(pat, "", regex=False) def test_replace_moar(any_string_dtype): # PR #1179 ser = Series( ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], dtype=any_string_dtype, ) result = ser.str.replace("A", "YYY") expected = Series( ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"], dtype=any_string_dtype, ) tm.assert_series_equal(result, expected) with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("A", "YYY", case=False) expected = Series( [ "YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", "", np.nan, "CYYYBYYY", "dog", "cYYYt", ], dtype=any_string_dtype, ) tm.assert_series_equal(result, expected) with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( [ "A", "B", "C", "XX-XX ba", "XX-XX ca", "", np.nan, "XX-XX BA", "XX-XX ", "XX-XX t", ], dtype=any_string_dtype, ) tm.assert_series_equal(result, expected) def test_replace_not_case_sensitive_not_regex(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/41602 ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("a", "c", case=False, regex=False) expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("a.", "c.", case=False, regex=False) expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) def test_replace_regex(any_string_dtype): # https://github.com/pandas-dev/pandas/pull/24809 s = Series(["a", "b", "ac", np.nan, ""], dtype=any_string_dtype) result = s.str.replace("^.$", "a", regex=True) expected = Series(["a", "a", "ac", np.nan, ""], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("regex", [True, False]) def test_replace_regex_single_character(regex, any_string_dtype): # https://github.com/pandas-dev/pandas/pull/24809, enforced in 2.0 # GH 24804 s = Series(["a.b", ".", "b", np.nan, ""], dtype=any_string_dtype) result = s.str.replace(".", "a", regex=regex) if regex: expected = Series(["aaa", "a", "a", np.nan, ""], dtype=any_string_dtype) else: expected = Series(["aab", "a", "b", np.nan, ""], dtype=any_string_dtype) tm.assert_series_equal(result, expected) # -------------------------------------------------------------------------------------- # str.match # -------------------------------------------------------------------------------------- def test_match(any_string_dtype): # New match behavior introduced in 0.13 expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) result = values.str.match(".*(BAD[_]+).*(BAD)") expected = Series([True, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) values = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = values.str.match(".*BAD[_]+.*BAD") expected = Series([True, True, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = values.str.match("BAD[_]+.*BAD") expected = Series([False, True, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) values = Series( ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = values.str.match("^BAD[_]+.*BAD") expected = Series([False, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = values.str.match("\\^BAD[_]+.*BAD") expected = Series([False, True, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_match_mixed_object(): mixed = Series( [ "aBAD_BAD", np.nan, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0, ] ) result = Series(mixed).str.match(".*(BAD[_]+).*(BAD)") expected = Series([True, np.nan, True, np.nan, np.nan, False, None, np.nan, np.nan]) assert isinstance(result, Series) tm.assert_series_equal(result, expected) def test_match_na_kwarg(any_string_dtype): # GH #6609 s = Series(["a", "b", np.nan], dtype=any_string_dtype) result = s.str.match("a", na=False) expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = s.str.match("a") expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_match_case_kwarg(any_string_dtype): values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) result = values.str.match("ab", case=False) expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) # -------------------------------------------------------------------------------------- # str.fullmatch # -------------------------------------------------------------------------------------- def test_fullmatch(any_string_dtype): # GH 32806 ser = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD") expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_fullmatch_dollar_literal(any_string_dtype): # GH 56652 ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype) result = ser.str.fullmatch("foo\\$") expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([False, False, np.nan, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_fullmatch_na_kwarg(any_string_dtype): ser = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False) expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_fullmatch_case_kwarg(any_string_dtype): ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, False, False], dtype=expected_dtype) result = ser.str.fullmatch("ab", case=True) tm.assert_series_equal(result, expected) expected = Series([True, True, False, False], dtype=expected_dtype) result = ser.str.fullmatch("ab", case=False) tm.assert_series_equal(result, expected) with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) # -------------------------------------------------------------------------------------- # str.findall # -------------------------------------------------------------------------------------- def test_findall(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan, "foo", "BAD"], dtype=any_string_dtype) result = ser.str.findall("BAD[_]*") expected = Series([["BAD__", "BAD"], np.nan, [], ["BAD"]]) expected = _convert_na_value(ser, expected) tm.assert_series_equal(result, expected) def test_findall_mixed_object(): ser = Series( [ "fooBAD__barBAD", np.nan, "foo", True, datetime.today(), "BAD", None, 1, 2.0, ] ) result = ser.str.findall("BAD[_]*") expected = Series( [ ["BAD__", "BAD"], np.nan, [], np.nan, np.nan, ["BAD"], None, np.nan, np.nan, ] ) tm.assert_series_equal(result, expected) # -------------------------------------------------------------------------------------- # str.find # -------------------------------------------------------------------------------------- def test_find(any_string_dtype): ser = Series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype ) expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" result = ser.str.find("EF") expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype) tm.assert_series_equal(result, expected) expected = np.array([v.find("EF") for v in np.array(ser)], dtype=np.int64) tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected) result = ser.str.rfind("EF") expected = Series([4, 5, 7, 4, -1], dtype=expected_dtype) tm.assert_series_equal(result, expected) expected = np.array([v.rfind("EF") for v in np.array(ser)], dtype=np.int64) tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected) result = ser.str.find("EF", 3) expected = Series([4, 3, 7, 4, -1], dtype=expected_dtype) tm.assert_series_equal(result, expected) expected = np.array([v.find("EF", 3) for v in np.array(ser)], dtype=np.int64) tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected) result = ser.str.rfind("EF", 3) expected = Series([4, 5, 7, 4, -1], dtype=expected_dtype) tm.assert_series_equal(result, expected) expected = np.array([v.rfind("EF", 3) for v in np.array(ser)], dtype=np.int64) tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected) result = ser.str.find("EF", 3, 6) expected = Series([4, 3, -1, 4, -1], dtype=expected_dtype) tm.assert_series_equal(result, expected) expected = np.array([v.find("EF", 3, 6) for v in np.array(ser)], dtype=np.int64) tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected) result = ser.str.rfind("EF", 3, 6) expected = Series([4, 3, -1, 4, -1], dtype=expected_dtype) tm.assert_series_equal(result, expected) expected = np.array([v.rfind("EF", 3, 6) for v in np.array(ser)], dtype=np.int64) tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected) def test_find_bad_arg_raises(any_string_dtype): ser = Series([], dtype=any_string_dtype) with pytest.raises(TypeError, match="expected a string object, not int"): ser.str.find(0) with pytest.raises(TypeError, match="expected a string object, not int"): ser.str.rfind(0) def test_find_nan(any_string_dtype): ser = Series( ["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"], dtype=any_string_dtype ) expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" result = ser.str.find("EF") expected = Series([4, np.nan, 1, np.nan, -1], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = ser.str.rfind("EF") expected = Series([4, np.nan, 7, np.nan, -1], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = ser.str.find("EF", 3) expected = Series([4, np.nan, 7, np.nan, -1], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = ser.str.rfind("EF", 3) expected = Series([4, np.nan, 7, np.nan, -1], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = ser.str.find("EF", 3, 6) expected = Series([4, np.nan, -1, np.nan, -1], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = ser.str.rfind("EF", 3, 6) expected = Series([4, np.nan, -1, np.nan, -1], dtype=expected_dtype) tm.assert_series_equal(result, expected) # -------------------------------------------------------------------------------------- # str.translate # -------------------------------------------------------------------------------------- @pytest.mark.parametrize( "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] ) def test_translate(index_or_series, any_string_dtype, infer_string): obj = index_or_series( ["abcdefg", "abcc", "cdddfg", "cdefggg"], dtype=any_string_dtype ) table = str.maketrans("abc", "cde") result = obj.str.translate(table) expected = index_or_series( ["cdedefg", "cdee", "edddfg", "edefggg"], dtype=any_string_dtype ) tm.assert_equal(result, expected) def test_translate_mixed_object(): # Series with non-string values s = Series(["a", "b", "c", 1.2]) table = str.maketrans("abc", "cde") expected = Series(["c", "d", "e", np.nan], dtype=object) result = s.str.translate(table) tm.assert_series_equal(result, expected) # -------------------------------------------------------------------------------------- def test_flags_kwarg(any_string_dtype): data = { "Dave": "dave@google.com", "Steve": "steve@gmail.com", "Rob": "rob@gmail.com", "Wes": np.nan, } data = Series(data, dtype=any_string_dtype) pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" use_pyarrow = using_pyarrow(any_string_dtype) result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow): result = data.str.match(pat, flags=re.IGNORECASE) assert result.iloc[0] with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow): result = data.str.fullmatch(pat, flags=re.IGNORECASE) assert result.iloc[0] result = data.str.findall(pat, flags=re.IGNORECASE) assert result.iloc[0][0] == ("dave", "google", "com") result = data.str.count(pat, flags=re.IGNORECASE) assert result.iloc[0] == 1 msg = "has match groups" with tm.assert_produces_warning( UserWarning, match=msg, raise_on_extra_warnings=not use_pyarrow ): result = data.str.contains(pat, flags=re.IGNORECASE) assert result.iloc[0]