"""Tests for ``DataFrame.join``."""

from datetime import datetime

import numpy as np
import pytest

from pandas.errors import MergeError

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    date_range,
    period_range,
)
import pandas._testing as tm
from pandas.core.reshape.concat import concat


@pytest.fixture
def frame_with_period_index():
    """4x5 integer frame with columns ``a``-``e`` on a yearly PeriodIndex."""
    return DataFrame(
        data=np.arange(20).reshape(4, 5),
        columns=list("abcde"),
        index=period_range(start="2000", freq="Y", periods=4),
    )


@pytest.fixture
def left():
    """Single-column frame ``a`` whose index [2, 1, 0] is deliberately unsorted."""
    return DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0])


@pytest.fixture
def right():
    """Single-column frame ``b`` whose index [3, 1, 2] partly overlaps ``left``."""
    return DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2])


@pytest.fixture
def left_no_dup():
    """Left-hand frame whose key column ``a`` holds four distinct values."""
    return DataFrame(
        {"a": ["a", "b", "c", "d"], "b": ["cat", "dog", "weasel", "horse"]},
        index=range(4),
    )


@pytest.fixture
def right_no_dup():
    """Right-hand frame indexed by ``a`` with five distinct key values."""
    return DataFrame(
        {
            "a": ["a", "b", "c", "d", "e"],
            "c": ["meow", "bark", "um... weasel noise?", "nay", "chirp"],
        },
        index=range(5),
    ).set_index("a")


@pytest.fixture
def left_w_dups(left_no_dup):
    """``left_no_dup`` plus one extra row that repeats the key ``"a"``."""
    return concat(
        [left_no_dup, DataFrame({"a": ["a"], "b": ["cow"]}, index=[3])], sort=True
    )


@pytest.fixture
def right_w_dups(right_no_dup):
    """
    ``right_no_dup`` plus one extra row, re-indexed on column ``a``.

    Used by the validation tests below as the right-hand side that is
    expected to fail uniqueness checks.
    """
    return concat(
        [right_no_dup, DataFrame({"a": ["e"], "c": ["moo"]}, index=[3])]
    ).set_index("a")


@pytest.mark.parametrize(
    "how, sort, expected",
    [
        ("inner", False, DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1])),
        ("inner", True, DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2])),
        (
            "left",
            False,
            DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, index=[2, 1, 0]),
        ),
        (
            "left",
            True,
            DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]),
        ),
        (
            "right",
            False,
            DataFrame({"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2]),
        ),
        (
            "right",
            True,
            DataFrame({"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3]),
        ),
        (
            "outer",
            False,
            DataFrame(
                {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
                index=[0, 1, 2, 3],
            ),
        ),
        (
            "outer",
            True,
            DataFrame(
                {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
                index=[0, 1, 2, 3],
            ),
        ),
    ],
)
def test_join(left, right, how, sort, expected):
    """Index-on-index join for every ``how``/``sort`` pair under ``validate="1:1"``."""
    result = left.join(right, how=how, sort=sort, validate="1:1")
    tm.assert_frame_equal(result, expected)


def test_suffix_on_list_join():
    """Suffix arguments are rejected when ``other`` is a list of frames."""
    first = DataFrame({"key": [1, 2, 3, 4, 5]})
    second = DataFrame({"key": [1, 8, 3, 2, 5], "v1": [1, 2, 3, 4, 5]})
    third = DataFrame({"keys": [5, 2, 3, 4, 1], "v2": [1, 2, 3, 4, 5]})

    # check proper errors are raised
    msg = "Suffixes not supported when joining multiple DataFrames"
    with pytest.raises(ValueError, match=msg):
        first.join([second], lsuffix="y")
    with pytest.raises(ValueError, match=msg):
        first.join([second, third], rsuffix="x")
    with pytest.raises(ValueError, match=msg):
        first.join([second, third], lsuffix="y", rsuffix="x")
    with pytest.raises(ValueError, match="Indexes have overlapping values"):
        first.join([second, third])

    # no errors should be raised
    arr_joined = first.join([third])
    norm_joined = first.join(third)
    tm.assert_frame_equal(arr_joined, norm_joined)


def test_join_invalid_validate(left_no_dup, right_no_dup):
    """An unknown ``validate`` value raises a ValueError listing the valid ones."""
    # GH 46622
    # Check invalid arguments
    msg = (
        '"invalid" is not a valid argument. '
        "Valid arguments are:\n"
        '- "1:1"\n'
        '- "1:m"\n'
        '- "m:1"\n'
        '- "m:m"\n'
        '- "one_to_one"\n'
        '- "one_to_many"\n'
        '- "many_to_one"\n'
        '- "many_to_many"'
    )
    with pytest.raises(ValueError, match=msg):
        left_no_dup.merge(right_no_dup, on="a", validate="invalid")


@pytest.mark.parametrize("dtype", ["object", "string[pyarrow]"])
def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups, dtype):
    """Right-side duplicates pass ``one_to_many`` but fail ``one_to_one``."""
    # GH 46622
    # Dups on right allowed by one_to_many constraint
    if dtype == "string[pyarrow]":
        pytest.importorskip("pyarrow")
    left_no_dup = left_no_dup.astype(dtype)
    right_w_dups.index = right_w_dups.index.astype(dtype)
    left_no_dup.join(
        right_w_dups,
        on="a",
        validate="one_to_many",
    )

    # Dups on right not allowed by one_to_one constraint
    msg = "Merge keys are not unique in right dataset; not a one-to-one merge"
    with pytest.raises(MergeError, match=msg):
        left_no_dup.join(
            right_w_dups,
            on="a",
            validate="one_to_one",
        )


def test_join_on_single_col_dup_on_left(left_w_dups, right_no_dup):
    """Left-side duplicates pass ``many_to_one`` but fail ``one_to_one``."""
    # GH 46622
    # Dups on left allowed by many_to_one constraint
    left_w_dups.join(
        right_no_dup,
        on="a",
        validate="many_to_one",
    )

    # Dups on left not allowed by one_to_one constraint
    msg = "Merge keys are not unique in left dataset; not a one-to-one merge"
    with pytest.raises(MergeError, match=msg):
        left_w_dups.join(
            right_no_dup,
            on="a",
            validate="one_to_one",
        )


def test_join_on_single_col_dup_on_both(left_w_dups, right_w_dups):
    """Duplicates on both sides pass only the ``many_to_many`` constraint."""
    # GH 46622
    # Dups on both allowed by many_to_many constraint
    left_w_dups.join(right_w_dups, on="a", validate="many_to_many")

    # Dups on both not allowed by many_to_one constraint
    msg = "Merge keys are not unique in right dataset; not a many-to-one merge"
    with pytest.raises(MergeError, match=msg):
        left_w_dups.join(
            right_w_dups,
            on="a",
            validate="many_to_one",
        )

    # Dups on both not allowed by one_to_many constraint
    msg = "Merge keys are not unique in left dataset; not a one-to-many merge"
    with pytest.raises(MergeError, match=msg):
        left_w_dups.join(
            right_w_dups,
            on="a",
            validate="one_to_many",
        )


def test_join_on_multi_col_check_dup():
    """A two-level key that is unique only jointly satisfies ``validate="1:1"``."""
    # GH 46622
    # Two column join, dups in both, but jointly no dups
    left = DataFrame(
        {
            "a": ["a", "a", "b", "b"],
            "b": [0, 1, 0, 1],
            "c": ["cat", "dog", "weasel", "horse"],
        },
        index=range(4),
    ).set_index(["a", "b"])

    right = DataFrame(
        {
            "a": ["a", "a", "b"],
            "b": [0, 1, 0],
            "d": ["meow", "bark", "um... weasel noise?"],
        },
        index=range(3),
    ).set_index(["a", "b"])

    expected_multi = DataFrame(
        {
            "a": ["a", "a", "b"],
            "b": [0, 1, 0],
            "c": ["cat", "dog", "weasel"],
            "d": ["meow", "bark", "um... weasel noise?"],
        },
        index=range(3),
    ).set_index(["a", "b"])

    # Jointly no dups allowed by one_to_one constraint
    result = left.join(right, how="inner", validate="1:1")
    tm.assert_frame_equal(result, expected_multi)


def test_join_index(float_frame):
    """Check the resulting index and columns for each ``how`` on partial overlaps."""
    # left / right

    f = float_frame.loc[float_frame.index[:10], ["A", "B"]]
    f2 = float_frame.loc[float_frame.index[5:], ["C", "D"]].iloc[::-1]

    joined = f.join(f2)
    tm.assert_index_equal(f.index, joined.index)
    expected_columns = Index(["A", "B", "C", "D"])
    tm.assert_index_equal(joined.columns, expected_columns)

    joined = f.join(f2, how="left")
    tm.assert_index_equal(joined.index, f.index)
    tm.assert_index_equal(joined.columns, expected_columns)

    joined = f.join(f2, how="right")
    tm.assert_index_equal(joined.index, f2.index)
    tm.assert_index_equal(joined.columns, expected_columns)

    # inner
    joined = f.join(f2, how="inner")
    tm.assert_index_equal(joined.index, f.index[5:10])
    tm.assert_index_equal(joined.columns, expected_columns)

    # outer
    joined = f.join(f2, how="outer")
    tm.assert_index_equal(joined.index, float_frame.index.sort_values())
    tm.assert_index_equal(joined.columns, expected_columns)

    with pytest.raises(ValueError, match="join method"):
        f.join(f2, how="foo")

    # corner case - overlapping columns
    msg = "columns overlap but no suffix"
    for how in ("outer", "left", "inner"):
        with pytest.raises(ValueError, match=msg):
            float_frame.join(float_frame, how=how)


def test_join_index_more(float_frame):
    """Join a full-index frame with one that holds every second row."""
    af = float_frame.loc[:, ["A", "B"]]
    bf = float_frame.loc[::2, ["C", "D"]]

    expected = af.copy()
    expected["C"] = float_frame["C"][::2]
    expected["D"] = float_frame["D"][::2]

    result = af.join(bf)
    tm.assert_frame_equal(result, expected)

    result = af.join(bf, how="right")
    tm.assert_frame_equal(result, expected[::2])

    result = bf.join(af, how="right")
    tm.assert_frame_equal(result, expected.loc[:, result.columns])


def test_join_index_series(float_frame):
    """Joining a named Series restores the popped column; unnamed Series raises."""
    df = float_frame.copy()
    ser = df.pop(float_frame.columns[-1])
    joined = df.join(ser)

    tm.assert_frame_equal(joined, float_frame)

    ser.name = None
    with pytest.raises(ValueError, match="must have a name"):
        df.join(ser)


def test_join_overlap(float_frame):
    """Overlapping columns get ``lsuffix``/``rsuffix`` appended; others are untouched."""
    df1 = float_frame.loc[:, ["A", "B", "C"]]
    df2 = float_frame.loc[:, ["B", "C", "D"]]

    joined = df1.join(df2, lsuffix="_df1", rsuffix="_df2")
    df1_suf = df1.loc[:, ["B", "C"]].add_suffix("_df1")
    df2_suf = df2.loc[:, ["B", "C"]].add_suffix("_df2")

    no_overlap = float_frame.loc[:, ["A", "D"]]
    expected = df1_suf.join(df2_suf).join(no_overlap)

    # column order not necessarily sorted
    tm.assert_frame_equal(joined, expected.loc[:, joined.columns])


def test_join_period_index(frame_with_period_index):
    """Join two frames that share the same PeriodIndex but have disjoint columns."""
    other = frame_with_period_index.rename(columns=lambda key: f"{key}{key}")

    joined_values = np.concatenate([frame_with_period_index.values] * 2, axis=1)

    joined_cols = frame_with_period_index.columns.append(other.columns)

    joined = frame_with_period_index.join(other)
    expected = DataFrame(
        data=joined_values, columns=joined_cols, index=frame_with_period_index.index
    )

    tm.assert_frame_equal(joined, expected)


def test_join_left_sequence_non_unique_index():
    """Left join with a list of frames where one of them has a repeated index label."""
    # https://github.com/pandas-dev/pandas/issues/19607
    df1 = DataFrame({"a": [0, 10, 20]}, index=[1, 2, 3])
    df2 = DataFrame({"b": [100, 200, 300]}, index=[4, 3, 2])
    df3 = DataFrame({"c": [400, 500, 600]}, index=[2, 2, 4])

    joined = df1.join([df2, df3], how="left")

    expected = DataFrame(
        {
            "a": [0, 10, 10, 20],
            "b": [np.nan, 300, 300, 200],
            "c": [np.nan, 400, 500, np.nan],
        },
        index=[1, 2, 2, 3],
    )

    tm.assert_frame_equal(joined, expected)


def test_join_list_series(float_frame):
    """A list mixing a Series and a DataFrame can be joined in one call."""
    # GH#46850
    # Join a DataFrame with a list containing both a Series and a DataFrame
    left = float_frame.A.to_frame()
    right = [float_frame.B, float_frame[["C", "D"]]]
    result = left.join(right)
    tm.assert_frame_equal(result, float_frame)


@pytest.mark.parametrize("sort_kw", [True, False])
def test_suppress_future_warning_with_sort_kw(sort_kw):
    """Outer-joining a list of frames with an explicit ``sort`` emits no warning."""
    a = DataFrame({"col1": [1, 2]}, index=["c", "a"])

    b = DataFrame({"col2": [4, 5]}, index=["b", "a"])

    c = DataFrame({"col3": [7, 8]}, index=["a", "b"])

    expected = DataFrame(
        {
            "col1": {"a": 2.0, "b": float("nan"), "c": 1.0},
            "col2": {"a": 5.0, "b": 4.0, "c": float("nan")},
            "col3": {"a": 7.0, "b": 8.0, "c": float("nan")},
        }
    )
    if sort_kw is False:
        expected = expected.reindex(index=["c", "a", "b"])

    with tm.assert_produces_warning(None):
        result = a.join([b, c], how="outer", sort=sort_kw)
    tm.assert_frame_equal(result, expected)


class TestDataFrameJoin:
    """Join tests covering MultiIndex, mixed key types and tz-aware indexes."""

    def test_join(self, multiindex_dataframe_random_data):
        """Outer join of two row/column slices of one MultiIndex frame."""
        frame = multiindex_dataframe_random_data

        a = frame.loc[frame.index[:5], ["A"]]
        b = frame.loc[frame.index[2:], ["B", "C"]]

        joined = a.join(b, how="outer").reindex(frame.index)
        # Build the expectation from the source values, blanking exactly the
        # cells that the join left missing.
        expected = frame.copy().values.copy()
        expected[np.isnan(joined.values)] = np.nan
        expected = DataFrame(expected, index=frame.index, columns=frame.columns)

        assert not np.isnan(joined.values).all()

        tm.assert_frame_equal(joined, expected)

    def test_join_segfault(self):
        """Smoke test: joining on disjoint two-level indexes must not crash."""
        # GH#1532
        df1 = DataFrame({"a": [1, 1], "b": [1, 2], "x": [1, 2]})
        df2 = DataFrame({"a": [2, 2], "b": [1, 2], "y": [1, 2]})
        df1 = df1.set_index(["a", "b"])
        df2 = df2.set_index(["a", "b"])
        # it works!
        for how in ["left", "right", "outer"]:
            df1.join(df2, how=how)

    def test_join_str_datetime(self):
        """Join on a string column against a frame with datetime column labels."""
        str_dates = ["20120209", "20120222"]
        dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]

        A = DataFrame(str_dates, index=range(2), columns=["aa"])
        C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)

        tst = A.join(C, on="aa")

        assert len(tst.columns) == 3

    def test_join_multiindex_leftright(self):
        """Left/right joins between a two-level and a single-level index mirror each other."""
        # GH 10741
        df1 = DataFrame(
            [
                ["a", "x", 0.471780],
                ["a", "y", 0.774908],
                ["a", "z", 0.563634],
                ["b", "x", -0.353756],
                ["b", "y", 0.368062],
                ["b", "z", -1.721840],
                ["c", "x", 1],
                ["c", "y", 2],
                ["c", "z", 3],
            ],
            columns=["first", "second", "value1"],
        ).set_index(["first", "second"])

        df2 = DataFrame([["a", 10], ["b", 20]], columns=["first", "value2"]).set_index(
            ["first"]
        )

        exp = DataFrame(
            [
                [0.471780, 10],
                [0.774908, 10],
                [0.563634, 10],
                [-0.353756, 20],
                [0.368062, 20],
                [-1.721840, 20],
                [1.000000, np.nan],
                [2.000000, np.nan],
                [3.000000, np.nan],
            ],
            index=df1.index,
            columns=["value1", "value2"],
        )

        # these must be the same results (but columns are flipped)
        tm.assert_frame_equal(df1.join(df2, how="left"), exp)
        tm.assert_frame_equal(df2.join(df1, how="right"), exp[["value2", "value1"]])

        exp_idx = MultiIndex.from_product(
            [["a", "b"], ["x", "y", "z"]], names=["first", "second"]
        )
        exp = DataFrame(
            [
                [0.471780, 10],
                [0.774908, 10],
                [0.563634, 10],
                [-0.353756, 20],
                [0.368062, 20],
                [-1.721840, 20],
            ],
            index=exp_idx,
            columns=["value1", "value2"],
        )

        tm.assert_frame_equal(df1.join(df2, how="right"), exp)
        tm.assert_frame_equal(df2.join(df1, how="left"), exp[["value2", "value1"]])

    def test_join_multiindex_dates(self):
        """Join a list of frames whose MultiIndex contains ``datetime.date`` values."""
        # GH 33692
        date = pd.Timestamp(2000, 1, 1).date()

        df1_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"])
        df1 = DataFrame({"col1": [0]}, index=df1_index)
        df2_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"])
        df2 = DataFrame({"col2": [0]}, index=df2_index)
        df3_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"])
        df3 = DataFrame({"col3": [0]}, index=df3_index)

        result = df1.join([df2, df3])

        expected_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"])
        expected = DataFrame(
            {"col1": [0], "col2": [0], "col3": [0]}, index=expected_index
        )

        tm.assert_equal(result, expected)

    def test_merge_join_different_levels_raises(self):
        """Merging or joining flat columns against MultiIndex columns raises MergeError."""
        # GH#9455
        # GH 40993: For raising, enforced in 2.0

        # first dataframe
        df1 = DataFrame(columns=["a", "b"], data=[[1, 11], [0, 22]])

        # second dataframe
        columns = MultiIndex.from_tuples([("a", ""), ("c", "c1")])
        df2 = DataFrame(columns=columns, data=[[1, 33], [0, 44]])

        # merge
        with pytest.raises(
            MergeError, match="Not allowed to merge between different levels"
        ):
            pd.merge(df1, df2, on="a")

        # join, see discussion in GH#12219
        with pytest.raises(
            MergeError, match="Not allowed to merge between different levels"
        ):
            df1.join(df2, on="a")

    def test_frame_join_tzaware(self):
        """Outer join of two tz-aware frames keeps the union index and its timezone."""
        test1 = DataFrame(
            np.zeros((6, 3)),
            index=date_range(
                "2012-11-15 00:00:00", periods=6, freq="100ms", tz="US/Central"
            ),
        )
        test2 = DataFrame(
            np.zeros((3, 3)),
            index=date_range(
                "2012-11-15 00:00:00", periods=3, freq="250ms", tz="US/Central"
            ),
            columns=range(3, 6),
        )

        result = test1.join(test2, how="outer")
        expected = test1.index.union(test2.index)

        tm.assert_index_equal(result.index, expected)
        # Compare via str() rather than the pytz-only ``.zone`` attribute so the
        # check also holds when the tz string resolves to a zoneinfo.ZoneInfo.
        assert str(result.index.tz) == "US/Central"