Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions qlib/workflow/record_temp.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,12 @@ def _generate(self, **kwargs):
setattr(self, k, fill_placeholder(getattr(self, k), placeholder_value))

# if the backtesting time range is not set, it will automatically extract time range from the prediction file
dt_values = pred.index.get_level_values("datetime")
# Resolve the "datetime" level positionally. With some handler chains
# (e.g. Alpha360 plus certain processors) the predicted DataFrame can
# carry two levels named "datetime", and the name-based lookup raises
# "datetime occurs multiple times" (#1909).
dt_level = pred.index.names.index("datetime")
dt_values = pred.index.get_level_values(dt_level)
if self.backtest_config["start_time"] is None:
self.backtest_config["start_time"] = dt_values.min()
if self.backtest_config["end_time"] is None:
Expand Down Expand Up @@ -617,7 +622,10 @@ def __init__(self, recorder, pass_num=10, shuffle_init_score=True, **kwargs):
def random_init(self):
pred_df = self.load("pred.pkl")

all_pred_dates = pred_df.index.get_level_values("datetime")
# Same positional resolution as PortAnaRecord._generate to survive
# MultiIndexes that carry duplicate "datetime" level names (#1909).
dt_level = pred_df.index.names.index("datetime")
all_pred_dates = pred_df.index.get_level_values(dt_level)
bt_start_date = pd.to_datetime(self.backtest_config.get("start_time"))
if bt_start_date is None:
first_bt_pred_date = all_pred_dates.min()
Expand Down
45 changes: 45 additions & 0 deletions tests/misc/test_record_temp_duplicate_datetime.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""Regression test for https://github.com/microsoft/qlib/issues/1909.

The reporter saw `record_temp.PortAnaRecord` crash with
`ValueError: 'datetime' occurs multiple times` when the predicted DataFrame
ended up with a MultiIndex carrying two levels both named ``datetime``. The
old code relied on pandas' name-based level lookup, which is ambiguous in
that case. The fix resolves the level positionally.

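This test exercises the lookup pattern directly on a hand-built MultiIndex
(it does not import qlib), so it documents both halves of the issue: the
name-based lookup is ambiguous when a level name is duplicated, and
resolving the level number via ``index.names.index("datetime")`` avoids
that ambiguity.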
"""

import unittest

import numpy as np
import pandas as pd


class TestDuplicateDatetimeLevelResolution(unittest.TestCase):
    """Pin how a MultiIndex with two levels named ``datetime`` is looked up.

    Name-based lookup is ambiguous on such an index, while resolving the
    level number through ``index.names.index("datetime")`` and then looking
    the level up positionally returns the first ``datetime`` level.
    """

    def _build_index_with_duplicate_datetime(self) -> pd.MultiIndex:
        """Return a 3-row MultiIndex whose levels 0 and 2 are both ``datetime``.

        The two ``datetime`` levels deliberately hold different dates so that
        a test can tell which of them a lookup actually returned.
        """
        first_dates = pd.date_range("2024-01-01", periods=3, freq="D")
        second_dates = pd.date_range("2025-01-01", periods=3, freq="D")
        return pd.MultiIndex.from_arrays(
            [first_dates, ["a", "b", "c"], second_dates],
            names=["datetime", "instrument", "datetime"],
        )

    def test_name_based_lookup_fails(self) -> None:
        """Sanity-check the precondition: name lookup is ambiguous."""
        idx = self._build_index_with_duplicate_datetime()
        with self.assertRaises(ValueError):
            idx.get_level_values("datetime")

    def test_positional_lookup_resolves(self) -> None:
        """The fix path: index.names.index('datetime') + positional lookup."""
        idx = self._build_index_with_duplicate_datetime()
        dt_level = idx.names.index("datetime")
        # ``.index`` returns the first match, so level 0 must be selected.
        self.assertEqual(dt_level, 0)
        values = idx.get_level_values(dt_level)
        # Compare against the level-0 dates only; the level-2 dates differ,
        # so picking the wrong level would fail here.
        np.testing.assert_array_equal(
            values.values,
            pd.date_range("2024-01-01", periods=3, freq="D").values,
        )


if __name__ == "__main__":
    unittest.main()