""" self-contained to write legacy storage pickle files To use this script. Create an environment where you want generate pickles, say its for 0.20.3, with your pandas clone in ~/pandas . activate pandas_0.20.3 cd ~/pandas/pandas $ python -m tests.io.generate_legacy_storage_files \ tests/io/data/legacy_pickle/0.20.3/ pickle This script generates a storage file for the current arch, system, and python version pandas version: 0.20.3 output dir : pandas/pandas/tests/io/data/legacy_pickle/0.20.3/ storage format: pickle created pickle file: 0.20.3_x86_64_darwin_3.5.2.pickle The idea here is you are using the *current* version of the generate_legacy_storage_files with an *older* version of pandas to generate a pickle file. We will then check this file into a current branch, and test using test_pickle.py. This will load the *older* pickles and test versus the current data that is generated (with main). These are then compared. If we have cases where we changed the signature (e.g. we renamed offset -> freq in Timestamp). Then we have to conditionally execute in the generate_legacy_storage_files.py to make it run under the older AND the newer version. """ from datetime import timedelta import os import pickle import platform as pl import sys # Remove script directory from path, otherwise Python will try to # import the JSON test directory as the json module sys.path.pop(0) import numpy as np import pandas from pandas import ( Categorical, DataFrame, Index, MultiIndex, NaT, Period, RangeIndex, Series, Timestamp, bdate_range, date_range, interval_range, period_range, timedelta_range, ) from pandas.arrays import SparseArray from pandas.tseries.offsets import ( FY5253, BusinessDay, BusinessHour, CustomBusinessDay, DateOffset, Day, Easter, Hour, LastWeekOfMonth, Minute, MonthBegin, MonthEnd, QuarterBegin, QuarterEnd, SemiMonthBegin, SemiMonthEnd, Week, WeekOfMonth, YearBegin, YearEnd, ) def _create_sp_series(): nan = np.nan # nan-based arr = np.arange(15, dtype=np.float64) arr[7:12] = nan arr[-1:] = nan bseries = Series(SparseArray(arr, kind="block")) bseries.name = "bseries" return bseries def _create_sp_tsseries(): nan = np.nan # nan-based arr = np.arange(15, dtype=np.float64) arr[7:12] = nan arr[-1:] = nan date_index = bdate_range("1/1/2011", periods=len(arr)) bseries = Series(SparseArray(arr, kind="block"), index=date_index) bseries.name = "btsseries" return bseries def _create_sp_frame(): nan = np.nan data = { "A": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], "B": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], "C": np.arange(10).astype(np.int64), "D": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan], } dates = bdate_range("1/1/2011", periods=10) return DataFrame(data, index=dates).apply(SparseArray) def create_pickle_data(): """create the pickle data""" data = { "A": [0.0, 1.0, 2.0, 3.0, np.nan], "B": [0, 1, 0, 1, 0], "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], "D": date_range("1/1/2009", periods=5), "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0], } scalars = {"timestamp": Timestamp("20130101"), "period": Period("2012", "M")} index = { "int": Index(np.arange(10)), "date": date_range("20130101", periods=10), "period": period_range("2013-01-01", freq="M", periods=10), "float": Index(np.arange(10, dtype=np.float64)), "uint": Index(np.arange(10, dtype=np.uint64)), "timedelta": timedelta_range("00:00:00", freq="30min", periods=10), } index["range"] = RangeIndex(10) index["interval"] = interval_range(0, periods=10) mi = { "reg2": MultiIndex.from_tuples( tuple( zip( *[ ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], ["one", "two", "one", "two", "one", "two", "one", "two"], ] ) ), names=["first", "second"], ) } series = { "float": Series(data["A"]), "int": Series(data["B"]), "mixed": Series(data["E"]), "ts": Series( np.arange(10).astype(np.int64), index=date_range("20130101", periods=10) ), "mi": Series( np.arange(5).astype(np.float64), index=MultiIndex.from_tuples( tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=["one", "two"] ), ), "dup": Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]), "cat": Series(Categorical(["foo", "bar", "baz"])), "dt": Series(date_range("20130101", periods=5)), "dt_tz": Series(date_range("20130101", periods=5, tz="US/Eastern")), "period": Series([Period("2000Q1")] * 5), } mixed_dup_df = DataFrame(data) mixed_dup_df.columns = list("ABCDA") frame = { "float": DataFrame({"A": series["float"], "B": series["float"] + 1}), "int": DataFrame({"A": series["int"], "B": series["int"] + 1}), "mixed": DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}), "mi": DataFrame( {"A": np.arange(5).astype(np.float64), "B": np.arange(5).astype(np.int64)}, index=MultiIndex.from_tuples( tuple( zip( *[ ["bar", "bar", "baz", "baz", "baz"], ["one", "two", "one", "two", "three"], ] ) ), names=["first", "second"], ), ), "dup": DataFrame( np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"] ), "cat_onecol": DataFrame({"A": Categorical(["foo", "bar"])}), "cat_and_float": DataFrame( { "A": Categorical(["foo", "bar", "baz"]), "B": np.arange(3).astype(np.int64), } ), "mixed_dup": mixed_dup_df, "dt_mixed_tzs": DataFrame( { "A": Timestamp("20130102", tz="US/Eastern"), "B": Timestamp("20130603", tz="CET"), }, index=range(5), ), "dt_mixed2_tzs": DataFrame( { "A": Timestamp("20130102", tz="US/Eastern"), "B": Timestamp("20130603", tz="CET"), "C": Timestamp("20130603", tz="UTC"), }, index=range(5), ), } cat = { "int8": Categorical(list("abcdefg")), "int16": Categorical(np.arange(1000)), "int32": Categorical(np.arange(10000)), } timestamp = { "normal": Timestamp("2011-01-01"), "nat": NaT, "tz": Timestamp("2011-01-01", tz="US/Eastern"), } off = { "DateOffset": DateOffset(years=1), "DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824), "BusinessDay": BusinessDay(offset=timedelta(seconds=9)), "BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"), "CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"), "SemiMonthBegin": SemiMonthBegin(day_of_month=9), "SemiMonthEnd": SemiMonthEnd(day_of_month=24), "MonthBegin": MonthBegin(1), "MonthEnd": MonthEnd(1), "QuarterBegin": QuarterBegin(1), "QuarterEnd": QuarterEnd(1), "Day": Day(1), "YearBegin": YearBegin(1), "YearEnd": YearEnd(1), "Week": Week(1), "Week_Tues": Week(2, normalize=False, weekday=1), "WeekOfMonth": WeekOfMonth(week=3, weekday=4), "LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3), "FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"), "Easter": Easter(), "Hour": Hour(1), "Minute": Minute(1), } return { "series": series, "frame": frame, "index": index, "scalars": scalars, "mi": mi, "sp_series": {"float": _create_sp_series(), "ts": _create_sp_tsseries()}, "sp_frame": {"float": _create_sp_frame()}, "cat": cat, "timestamp": timestamp, "offsets": off, } def platform_name(): return "_".join( [ str(pandas.__version__), str(pl.machine()), str(pl.system().lower()), str(pl.python_version()), ] ) def write_legacy_pickles(output_dir): version = pandas.__version__ print( "This script generates a storage file for the current arch, system, " "and python version" ) print(f" pandas version: {version}") print(f" output dir : {output_dir}") print(" storage format: pickle") pth = f"{platform_name()}.pickle" with open(os.path.join(output_dir, pth), "wb") as fh: pickle.dump(create_pickle_data(), fh, pickle.DEFAULT_PROTOCOL) print(f"created pickle file: {pth}") def write_legacy_file(): # force our cwd to be the first searched sys.path.insert(0, "") if not 3 <= len(sys.argv) <= 4: sys.exit( "Specify output directory and storage type: generate_legacy_" "storage_files.py " ) output_dir = str(sys.argv[1]) storage_type = str(sys.argv[2]) if not os.path.exists(output_dir): os.mkdir(output_dir) if storage_type == "pickle": write_legacy_pickles(output_dir=output_dir) else: sys.exit("storage_type must be one of {'pickle'}") if __name__ == "__main__": write_legacy_file()