|
13 | 13 | FeatureSetConfig, |
14 | 14 | LagConfig, |
15 | 15 | LifecycleConfig, |
| 16 | + PromotionConfig, |
16 | 17 | RollingConfig, |
17 | 18 | ) |
18 | 19 | from app.features.featuresets.service import FeatureEngineeringService |
@@ -418,3 +419,155 @@ def test_lifecycle_group_isolation_no_cross_product_leakage( |
418 | 419 | f"days_since_launch_lag1={actual}, expected={base_lag}. " |
419 | 420 | "Lifecycle lag is mixing across products." |
420 | 421 | ) |
| 422 | + |
| 423 | + |
class TestPromotionLeakage:
    """Tests verifying promotion features never use future data.

    PRP-3.1D — these leakage cases are LOAD-BEARING. They assert that a
    promotion active on day D MUST NOT appear in day D's
    ``promo_<kind>_active_lag1`` column; it appears at day D+1 only. The
    date-range semantics (start_date <= D <= end_date, both inclusive)
    plus ``groupby(...).shift(lag_days)`` are the mathematical leakage gate.

    Every case tracks the ``markdown`` kind with ``lag_days=1`` and keys the
    feature set on ``("store_id", "product_id")``; the shared wiring lives in
    the private helpers below so each test body shows only its scenario.
    """

    # Column asserted on by every case (kind="markdown", lag_days=1).
    _LAG1_COLUMN = "promo_markdown_active_lag1"

    @staticmethod
    def _service_with_promos(
        promotion_config: "PromotionConfig",
        promo_rows: pd.DataFrame,
    ) -> "FeatureEngineeringService":
        """Build a service for ``promotion_config`` with ``promo_rows`` injected.

        Args:
            promotion_config: Promotion settings under test.
            promo_rows: Promotion rows assigned to the service's private
                ``_promotion_rows_df`` attribute before features are computed.

        Returns:
            A service configured with entity columns ``store_id`` / ``product_id``.
        """
        config = FeatureSetConfig(
            name="test",
            entity_columns=("store_id", "product_id"),
            promotion_config=promotion_config,
        )
        service = FeatureEngineeringService(config)
        # The tests inject promotion rows directly through the private attribute.
        service._promotion_rows_df = promo_rows  # type: ignore[attr-defined]
        return service

    @staticmethod
    def _lag1_on(df: pd.DataFrame, day: date) -> object:
        """Return the first ``promo_markdown_active_lag1`` value dated ``day``.

        Args:
            df: Feature frame with a ``date`` column and the lag1 column.
            day: Calendar day to look up.

        Returns:
            The lag1 value of the first row whose date equals ``day``.

        Raises:
            AssertionError: If ``df`` has no row for ``day``. This replaces the
                opaque ``IndexError`` a bare ``.iloc[0]`` would raise.
        """
        column = TestPromotionLeakage._LAG1_COLUMN
        # A plain boolean array selects positionally, so the frame's index
        # labels (whatever they are) cannot affect the lookup.
        on_day = (pd.to_datetime(df["date"]).dt.date == day).to_numpy()
        assert on_day.any(), f"No row dated {day} in the feature frame; cannot check {column}"
        return df.loc[on_day, column].iloc[0]

    def test_promotion_active_no_leakage_at_same_day(
        self,
        sample_time_series: pd.DataFrame,
        phase2_promotion_rows_df: pd.DataFrame,
    ) -> None:
        """CRITICAL: A promotion active on day D MUST NOT appear in lag1 at D."""
        service = self._service_with_promos(
            PromotionConfig(
                kinds_to_track=("markdown",),
                include_active=True,
                include_intensity=False,
                lag_days=1,
            ),
            phase2_promotion_rows_df,
        )
        df = service.compute_features(sample_time_series).df

        # The fixture's markdown is active 2024-01-07 .. 2024-01-14 (8 days).
        # promo_markdown_active_lag1 should be 1 on 2024-01-08 .. 2024-01-15.

        # Day BEFORE start (D=Jan 6): lag1 reads Jan 5 — inactive. EXPECT 0.
        assert self._lag1_on(df, date(2024, 1, 6)) == 0

        # Day OF start (D=Jan 7): lag1 reads Jan 6 — inactive. EXPECT 0.
        # This is the load-bearing leakage check: same-day MUST NOT leak.
        assert self._lag1_on(df, date(2024, 1, 7)) == 0, (
            "LEAKAGE DETECTED: promo active on day D appeared in active_lag1 at day D"
        )

        # Day AFTER start (D=Jan 8): lag1 reads Jan 7 — active. EXPECT 1.
        assert self._lag1_on(df, date(2024, 1, 8)) == 1

        # Day AFTER end (D=Jan 15): lag1 reads Jan 14 — last active day. EXPECT 1.
        assert self._lag1_on(df, date(2024, 1, 15)) == 1

        # Two days AFTER end (D=Jan 16): lag1 reads Jan 15 — inactive. EXPECT 0.
        assert self._lag1_on(df, date(2024, 1, 16)) == 0

    def test_promotion_boundary_end_date_at_cutoff(
        self,
        sample_time_series: pd.DataFrame,
    ) -> None:
        """A promo ending exactly on cutoff_date - 1 yields active_lag1=1 at cutoff."""
        cutoff = date(2024, 1, 15)
        promo_rows = pd.DataFrame(
            {
                "product_id": [1],
                "store_id": [1],
                "kind": ["markdown"],
                "discount_pct": [0.20],
                "start_date": [date(2024, 1, 10)],
                "end_date": [date(2024, 1, 14)],  # cutoff - 1
            }
        )
        service = self._service_with_promos(
            PromotionConfig(kinds_to_track=("markdown",), lag_days=1),
            promo_rows,
        )
        df = service.compute_features(sample_time_series, cutoff_date=cutoff).df

        # At cutoff (Jan 15), lag1 reads Jan 14 — end_date, INCLUSIVE → active.
        assert self._lag1_on(df, cutoff) == 1, (
            "Boundary leakage: end_date INCLUSIVE on the previous day failed"
        )

    def test_promotion_starts_on_cutoff_not_in_lag1(
        self,
        sample_time_series: pd.DataFrame,
    ) -> None:
        """A promo starting exactly on cutoff is NOT in active_lag1 at cutoff."""
        cutoff = date(2024, 1, 15)
        promo_rows = pd.DataFrame(
            {
                "product_id": [1],
                "store_id": [1],
                "kind": ["markdown"],
                "discount_pct": [0.20],
                "start_date": [cutoff],  # starts today
                "end_date": [date(2024, 1, 25)],
            }
        )
        service = self._service_with_promos(
            PromotionConfig(kinds_to_track=("markdown",), lag_days=1),
            promo_rows,
        )
        df = service.compute_features(sample_time_series, cutoff_date=cutoff).df

        # lag1 reads cutoff - 1 = Jan 14, BEFORE start_date.
        assert self._lag1_on(df, cutoff) == 0, (
            "Same-day leakage: promo starting on D appeared in active_lag1 at D"
        )

    def test_chain_wide_promo_does_not_bleed_across_products(
        self,
        multi_series_time_series: pd.DataFrame,
    ) -> None:
        """A chain-wide promo on product=1 must NOT activate features for product=2."""
        promo_rows = pd.DataFrame(
            {
                "product_id": [1],
                "store_id": [None],  # chain-wide
                "kind": ["markdown"],
                "discount_pct": [0.30],
                "start_date": [date(2024, 1, 3)],
                "end_date": [date(2024, 1, 7)],
            }
        )
        service = self._service_with_promos(
            PromotionConfig(kinds_to_track=("markdown",), lag_days=1),
            promo_rows,
        )
        df = service.compute_features(multi_series_time_series).df

        # Product 1 should see activity 2024-01-04 .. 2024-01-08 (lag1) -- 5 days x 2 stores.
        prod1_active = df.loc[df["product_id"] == 1, self._LAG1_COLUMN]
        assert int(prod1_active.sum()) == 5 * 2
        # Product 2 should see ZERO activity (chain-wide is product-scoped).
        prod2_active = df.loc[df["product_id"] == 2, self._LAG1_COLUMN]
        assert int(prod2_active.sum()) == 0
0 commit comments