From 562fe091bf92f34bd18a598e1a36a769da013fc3 Mon Sep 17 00:00:00 2001 From: genisis0x Date: Thu, 14 May 2026 13:52:02 +0530 Subject: [PATCH] Fix PortAnaRecord crash on duplicate "datetime" level When the prediction DataFrame's MultiIndex carries two levels both named "datetime" (which can happen with some handler/processor chains), pandas raises "datetime occurs multiple times" on the name-based lookup. Resolve the level positionally instead via index.names.index("datetime"). Same fix applied to MultiPassPortAnaRecord.random_init. Fixes #1909 --- qlib/workflow/record_temp.py | 12 ++++- .../test_record_temp_duplicate_datetime.py | 45 +++++++++++++++++++ 2 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 tests/misc/test_record_temp_duplicate_datetime.py diff --git a/qlib/workflow/record_temp.py b/qlib/workflow/record_temp.py index ecd58ec2098..03871612bee 100644 --- a/qlib/workflow/record_temp.py +++ b/qlib/workflow/record_temp.py @@ -471,7 +471,12 @@ def _generate(self, **kwargs): setattr(self, k, fill_placeholder(getattr(self, k), placeholder_value)) # if the backtesting time range is not set, it will automatically extract time range from the prediction file - dt_values = pred.index.get_level_values("datetime") + # Resolve the "datetime" level positionally. With some handler chains + # (e.g. Alpha360 plus certain processors) the predicted DataFrame can + # carry two levels named "datetime", and the name-based lookup raises + # "datetime occurs multiple times" (#1909). 
+ dt_level = pred.index.names.index("datetime") + dt_values = pred.index.get_level_values(dt_level) if self.backtest_config["start_time"] is None: self.backtest_config["start_time"] = dt_values.min() if self.backtest_config["end_time"] is None: @@ -617,7 +622,10 @@ def __init__(self, recorder, pass_num=10, shuffle_init_score=True, **kwargs): def random_init(self): pred_df = self.load("pred.pkl") - all_pred_dates = pred_df.index.get_level_values("datetime") + # Same positional resolution as PortAnaRecord._generate to survive + # MultiIndexes that carry duplicate "datetime" level names (#1909). + dt_level = pred_df.index.names.index("datetime") + all_pred_dates = pred_df.index.get_level_values(dt_level) bt_start_date = pd.to_datetime(self.backtest_config.get("start_time")) if bt_start_date is None: first_bt_pred_date = all_pred_dates.min() diff --git a/tests/misc/test_record_temp_duplicate_datetime.py b/tests/misc/test_record_temp_duplicate_datetime.py new file mode 100644 index 00000000000..2b97e2109ee --- /dev/null +++ b/tests/misc/test_record_temp_duplicate_datetime.py @@ -0,0 +1,45 @@ +"""Regression test for https://github.com/microsoft/qlib/issues/1909. + +The reporter saw `record_temp.PortAnaRecord` crash with +`ValueError: 'datetime' occurs multiple times` when the predicted DataFrame +ended up with a MultiIndex carrying two levels both named ``datetime``. The +old code relied on pandas' name-based level lookup, which is ambiguous in +that case. The fix resolves the level positionally. + +This test exercises the positional resolution directly against pandas; it does +not import qlib, so it guards the idiom rather than PortAnaRecord itself.
+""" + +import unittest + +import numpy as np +import pandas as pd + + +class TestDuplicateDatetimeLevelResolution(unittest.TestCase): + def _build_index_with_duplicate_datetime(self) -> pd.MultiIndex: + dates = pd.date_range("2024-01-01", periods=3, freq="D") + return pd.MultiIndex.from_arrays( + [dates, ["a", "b", "c"], dates], + names=["datetime", "instrument", "datetime"], + ) + + def test_name_based_lookup_fails(self) -> None: + """Sanity-check the precondition: name lookup is ambiguous.""" + idx = self._build_index_with_duplicate_datetime() + with self.assertRaises(ValueError): + idx.get_level_values("datetime") + + def test_positional_lookup_resolves(self) -> None: + """The fix path: index.names.index('datetime') + positional lookup.""" + idx = self._build_index_with_duplicate_datetime() + dt_level = idx.names.index("datetime") + values = idx.get_level_values(dt_level) + np.testing.assert_array_equal( + values.values, + pd.date_range("2024-01-01", periods=3, freq="D").values, + ) + + +if __name__ == "__main__": + unittest.main()