diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index c76119af..0b5a04fd 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -17,7 +17,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.10", "3.11", "3.12", "3.13"]
+        # Reduced Python version matrix for faster CI
+        python-version: ["3.11", "3.12"]
     env:
       UV_PYTHON: ${{ matrix.python-version }}
 
@@ -30,8 +31,21 @@ jobs:
         with:
           enable-cache: true
 
+      # Cache model weights to speed up any foundation model tests
+      - name: Cache model weights
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/huggingface
+            ~/.cache/torch
+          key: model-weights-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}
+          restore-keys: |
+            model-weights-${{ runner.os }}-${{ matrix.python-version }}-
+            model-weights-${{ runner.os }}-
+
       - name: Run tests
         run: uv run pytest
+        timeout-minutes: 10
 
       - name: Test import class
         run: uv run -- python -c "from timecopilot import TimeCopilot, TimeCopilotForecaster"
@@ -53,6 +67,7 @@ jobs:
 
       - name: Run tests
         run: uv run pytest -m live
+        timeout-minutes: 15
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
 
diff --git a/.github/workflows/comprehensive.yaml b/.github/workflows/comprehensive.yaml
new file mode 100644
index 00000000..700b41e3
--- /dev/null
+++ b/.github/workflows/comprehensive.yaml
@@ -0,0 +1,58 @@
+name: Comprehensive Tests
+
+on:
+  schedule:
+    # Run comprehensive tests nightly at 2 AM UTC
+    - cron: '0 2 * * *'
+  workflow_dispatch:
+    # Allow manual triggering
+  push:
+    branches: [main]
+    paths:
+      # Only run when model-related files change
+      - 'timecopilot/models/**'
+      - 'tests/models/test_models_comprehensive.py'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test-comprehensive:
+    name: comprehensive test on ${{ matrix.python-version }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        # Test on fewer Python versions for comprehensive tests
+        python-version: ["3.11", "3.12"]
+    env:
+      UV_PYTHON: ${{ matrix.python-version }}
+
+    steps:
+      - name: Clone repo
+        uses: actions/checkout@v4
+
+      - name: Set up uv
+        uses: astral-sh/setup-uv@v6
+        with:
+          enable-cache: true
+
+      # Cache model weights to speed up foundation model tests
+      - name: Cache model weights
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/huggingface
+            ~/.cache/torch
+          key: model-weights-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}
+          restore-keys: |
+            model-weights-${{ runner.os }}-${{ matrix.python-version }}-
+            model-weights-${{ runner.os }}-
+
+      - name: Run comprehensive tests
+        run: uv run pytest -m slow -v
+        timeout-minutes: 45
+
+      - name: Test import class
+        run: uv run -- python -c "from timecopilot import TimeCopilot, TimeCopilotForecaster"
\ No newline at end of file
diff --git a/TESTING.md b/TESTING.md
new file mode 100644
index 00000000..97c81e18
--- /dev/null
+++ b/TESTING.md
@@ -0,0 +1,107 @@
+# Testing Guide
+
+This project uses a two-tier testing strategy to balance speed and comprehensive coverage.
+
+## Test Categories
+
+### Fast Tests (Regular CI)
+- **Purpose**: Quick validation for every PR/push
+- **Runtime**: ~2-3 minutes
+- **Models**: Lightweight benchmark models (SeasonalNaive, ZeroModel, AutoARIMA, Prophet, etc.)
+- **Trigger**: Runs on every push/PR
+- **Command**: `uv run pytest` (excludes slow tests by default)
+
+### Slow Tests (Comprehensive)
+- **Purpose**: Full model validation including heavy foundation models
+- **Runtime**: ~30-45 minutes
+- **Models**: All models including Chronos, Moirai, TimesFM, TiRex, etc.
+- **Trigger**: Nightly schedule + manual dispatch
+- **Command**: `uv run pytest -m slow`
+
+### Live Tests
+- **Purpose**: Test integration with external APIs (OpenAI, etc.)
+- **Models**: Uses real LLM endpoints
+- **Trigger**: Regular CI (with API keys)
+- **Command**: `uv run pytest -m live`
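+
+All three categories are driven by the pytest markers registered in `pyproject.toml`
+(full diff further below); `addopts` excludes both `slow` and `live` tests by default:
+
+```toml
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+addopts = "-m 'not live and not slow' -n auto"
+markers = [
+    "live: marks tests that require calls to llm providers",
+    "slow: marks tests with heavy foundation models that take longer to run"
+]
+```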
+
+## Running Tests Locally
+
+```bash
+# Fast tests only (recommended for development)
+uv run pytest
+
+# Include slow foundation model tests
+uv run pytest -m "not live"
+
+# All tests including live API calls
+uv run pytest -m ""
+
+# Specific test categories
+uv run pytest -m slow              # Foundation models only
+uv run pytest -m live              # API integration only
+uv run pytest tests/test_smoke.py # Basic smoke tests
+```
+
+## Test Optimization Details
+
+### Model Selection Strategy
+- **Fast models** (`tests/conftest.py::fast_models`): Used in regular CI
+  - Benchmark models: SeasonalNaive, ZeroModel, AutoARIMA, Prophet, ADIDA
+  - Lightweight foundation: TabPFN (MOCK mode)
+
+- **Comprehensive models** (`tests/conftest.py::all_models`): Used in nightly tests
+  - All fast models plus heavy foundation models
+  - Foundation models: Chronos, Moirai, TimesFM, TiRex, Toto
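+
+The relationship between the lists is easiest to see in `tests/conftest.py` itself
+(abridged from the full diff below):
+
+```python
+# Benchmark models only; heavy foundation models are excluded
+fast_models = [MODELS[str_model] for str_model in benchmark_models]
+
+# Comprehensive list: everything in fast_models plus heavy foundation models
+all_models = fast_models.copy()
+all_models.append(MODELS["TimesFM"])
+
+# Default to fast models for regular CI
+models = fast_models
+```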
+
+### Parameterization Reductions
+To speed up CI, several test parameters were reduced (example below):
+- **Frequencies**: 4 → 2 (H, D, W-MON, MS → D, W-MON)
+- **Series counts**: 5 → 3 per test
+- **Quantiles**: 9 → 3 (0.1-0.9 → 0.1, 0.5, 0.9)
+- **Cross-validation horizon**: 12 → 6
+- **Python versions**: 4 → 2 (3.11 and 3.12 for regular CI)
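+
+For example, the frequency grid in `tests/models/test_models.py` went from
+
+```python
+@pytest.mark.parametrize("freq", ["H", "D", "W-MON", "MS"])
+```
+
+to
+
+```python
+@pytest.mark.parametrize("freq", ["D", "W-MON"])  # Reduced freq combinations for speed
+```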
+
+### Caching Strategy
+- **Model weights**: HuggingFace and PyTorch caches preserved between runs
+- **Dependencies**: UV package cache enabled
+- **Cache keys**: Include Python version and dependency hash
+
+## Adding New Tests
+
+### For Fast Models
+Add tests to existing files in `tests/`; they will automatically use fast models and run in regular CI.
+
+### For Foundation Models
+- Add tests with the `@pytest.mark.slow` decorator
+- Consider using reduced parameterization for speed
+- Tests will run in the nightly comprehensive workflow
+
+### Example
+```python
+@pytest.mark.slow
+@pytest.mark.parametrize("model", all_models)  # Use all_models for comprehensive
+def test_new_foundation_feature(model):
+    # Test with foundation models
+    pass
+
+
+@pytest.mark.parametrize("model", models)  # Use models for fast tests
+def test_new_basic_feature(model):
+    # Test with fast models only
+    pass
+```
+
+## CI Workflows
+
+### `.github/workflows/ci.yaml`
+- **Triggers**: Push to main, PRs
+- **Python versions**: 3.11, 3.12
+- **Timeout**: 10 minutes for regular tests, 15 minutes for live tests
+- **Excludes**: Slow tests (foundation models)
+
+### `.github/workflows/comprehensive.yaml`
+- **Triggers**: Nightly at 2 AM UTC, manual dispatch, model file changes
+- **Python versions**: 3.11, 3.12
+- **Timeout**: 45 minutes
+- **Includes**: Only slow tests (foundation models)
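+
+The comprehensive workflow's `workflow_dispatch` trigger also allows starting it by
+hand; assuming the GitHub CLI is installed and authenticated, that looks like:
+
+```bash
+gh workflow run comprehensive.yaml
+```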
+
+This structure ensures rapid feedback for developers while maintaining comprehensive test coverage through automated nightly runs.
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 8f31e695..b5521f20 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -88,9 +88,10 @@ disable_error_code = ["no-redef"] # for fasthtml
 
 [tool.pytest.ini_options]
 testpaths = ["tests"]
-addopts = "-m 'not live' -n auto"
+addopts = "-m 'not live and not slow' -n auto"
 markers = [
-    "live: marks tests that require calls to llm providers"
+    "live: marks tests that require calls to llm providers",
+    "slow: marks tests with heavy foundation models that take longer to run"
 ]
 
 [tool.ruff]
diff --git a/tests/conftest.py b/tests/conftest.py
index 6b2434b9..778e1c29 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,28 +5,37 @@
 from timecopilot.models.foundational.moirai import Moirai
 from timecopilot.models.foundational.toto import Toto
 
+# Fast benchmark models for regular CI
 benchmark_models = [
     "AutoARIMA",
     "SeasonalNaive",
     "ZeroModel",
     "ADIDA",
-    "TimesFM",
     "Prophet",
 ]
 
-models = [MODELS[str_model] for str_model in benchmark_models]
-if sys.version_info >= (3, 11):
-    from timecopilot.models.foundational.tirex import TiRex
-
-    models.append(TiRex())
+# Fast models for regular CI (exclude TimesFM as it's heavy)
+fast_models = [MODELS[str_model] for str_model in benchmark_models]
 
+# Add lightweight foundation models for fast tests
 if sys.version_info < (3, 13):
     from tabpfn_time_series import TabPFNMode
 
     from timecopilot.models.foundational.tabpfn import TabPFN
 
-    models.append(TabPFN(mode=TabPFNMode.MOCK))
+    fast_models.append(TabPFN(mode=TabPFNMode.MOCK))
+
+# Comprehensive model list including heavy foundation models (for slow tests)
+all_models = fast_models.copy()
+
+# Add TimesFM to comprehensive list
+all_models.append(MODELS["TimesFM"])
 
-models.extend(
+# Add TiRex for Python >= 3.11
+if sys.version_info >= (3, 11):
+    from timecopilot.models.foundational.tirex import TiRex
+
+    all_models.append(TiRex())
+
+# Add heavy foundation models to comprehensive list
+all_models.extend(
     [
         Chronos(repo_id="amazon/chronos-t5-tiny", alias="Chronos-T5"),
         Chronos(repo_id="amazon/chronos-bolt-tiny", alias="Chronos-Bolt"),
@@ -44,3 +53,6 @@
         ),
     ]
 )
+
+# Default to fast models for regular CI
+models = fast_models
diff --git a/tests/models/test_models.py b/tests/models/test_models.py
index 57b04eb2..3fdd0701 100644
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -40,7 +40,7 @@ def test_tabpfn_import_fails():
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("freq", ["H", "D", "W-MON", "MS"])
+@pytest.mark.parametrize("freq", ["D", "W-MON"])  # Reduced freq combinations for speed
 def test_freq_inferred_correctly(model, freq):
     n_series = 2
     df = generate_series(
@@ -66,10 +66,10 @@ def test_freq_inferred_correctly(model, freq):
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("freq", ["H", "D", "W-MON", "MS"])
+@pytest.mark.parametrize("freq", ["D", "W-MON"])  # Reduced freq combinations for speed
 @pytest.mark.parametrize("h", [1, 12])
 def test_correct_forecast_dates(model, freq, h):
-    n_series = 5
+    n_series = 3  # Reduced from 5 for speed
     df = generate_series(
         n_series,
         freq=freq,
@@ -91,11 +91,11 @@ def test_correct_forecast_dates(model, freq, h):
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("freq", ["H", "D", "W-MON", "MS"])
+@pytest.mark.parametrize("freq", ["D", "W-MON"])  # Reduced freq combinations for speed
 @pytest.mark.parametrize("n_windows", [1, 4])
 def test_cross_validation(model, freq, n_windows):
-    h = 12
-    n_series = 5
+    h = 6  # Reduced from 12 for speed
+    n_series = 3  # Reduced from 5 for speed
     df = generate_series(n_series, freq=freq, equal_ends=True)
     cv_df = model.cross_validation(
         df,
@@ -152,8 +152,8 @@ def test_passing_both_level_and_quantiles(model):
 
 @pytest.mark.parametrize("model", models)
 def test_using_quantiles(model):
-    qs = [round(i * 0.1, 1) for i in range(1, 10)]
-    df = generate_series(n_series=3, freq="D")
+    qs = [0.1, 0.5, 0.9]  # Reduced quantiles for speed
+    df = generate_series(n_series=2, freq="D")  # Reduced series for speed
     fcst_df = model.forecast(
         df=df,
         h=2,
@@ -186,7 +186,7 @@ def test_using_quantiles(model):
 
 @pytest.mark.parametrize("model", models)
 def test_using_level(model):
-    level = [0, 20, 40, 60, 80]  # corresponds to qs [0.1, 0.2, ..., 0.9]
+    level = [20, 80]  # Reduced levels for speed
     df = generate_series(n_series=2, freq="D")
     fcst_df = model.forecast(
         df=df,
diff --git a/tests/models/test_models_comprehensive.py b/tests/models/test_models_comprehensive.py
new file mode 100644
index 00000000..b55e0c30
--- /dev/null
+++ b/tests/models/test_models_comprehensive.py
@@ -0,0 +1,132 @@
+"""
+Comprehensive tests for all models including heavy foundation models.
+These tests are marked as slow and excluded from regular CI runs.
+"""
+import pandas as pd
+import pytest
+from utilsforecast.data import generate_series as _generate_series
+
+from ..conftest import all_models
+
+
+def generate_series(n_series, freq, **kwargs):
+    df = _generate_series(n_series, freq, **kwargs)
+    df["unique_id"] = df["unique_id"].astype(str)
+    return df
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("model", all_models)
+@pytest.mark.parametrize("freq", ["D", "W-MON"])  # Reduced freq combinations for speed
+def test_freq_inferred_correctly_comprehensive(model, freq):
+    n_series = 2
+    df = generate_series(
+        n_series,
+        freq=freq,
+    )
+    fcsts_no_freq = model.forecast(df, h=3)
+    fcsts_with_freq = model.forecast(df, h=3, freq=freq)
+    cv_no_freq = model.cross_validation(df, h=3)
+    cv_with_freq = model.cross_validation(df, h=3, freq=freq)
+    # some foundation models produce different results
+    # each time they are called
+    cols_to_check = ["unique_id", "ds"]
+    cols_to_check_cv = ["unique_id", "ds", "y", "cutoff"]
+    pd.testing.assert_frame_equal(
+        fcsts_no_freq[cols_to_check],
+        fcsts_with_freq[cols_to_check],
+    )
+    pd.testing.assert_frame_equal(
+        cv_no_freq[cols_to_check_cv],
+        cv_with_freq[cols_to_check_cv],
+    )
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("model", all_models)
+@pytest.mark.parametrize("freq", ["D", "W-MON"])  # Reduced freq combinations
+@pytest.mark.parametrize("h", [1, 12])
+def test_correct_forecast_dates_comprehensive(model, freq, h):
+    n_series = 3  # Reduced from 5 for speed
+    df = generate_series(
+        n_series,
+        freq=freq,
+    )
+    df_test = df.groupby("unique_id").tail(h)
+    df_train = df.drop(df_test.index)
+    fcst_df = model.forecast(
+        df_train,
+        h=h,
+        freq=freq,
+    )
+    exp_n_cols = 3
+    assert fcst_df.shape == (n_series * h, exp_n_cols)
+    exp_cols = ["unique_id", "ds"]
+    pd.testing.assert_frame_equal(
+        fcst_df[exp_cols].sort_values(["unique_id", "ds"]).reset_index(drop=True),
+        df_test[exp_cols].sort_values(["unique_id", "ds"]).reset_index(drop=True),
+    )
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("model", all_models)
+@pytest.mark.parametrize("freq", ["D"])  # Only test daily for foundation models
+@pytest.mark.parametrize("n_windows", [1])  # Reduced windows for speed
+def test_cross_validation_comprehensive(model, freq, n_windows):
+    h = 6  # Reduced from 12 for speed
+    n_series = 3  # Reduced from 5 for speed
+    df = generate_series(n_series, freq=freq, equal_ends=True)
+    cv_df = model.cross_validation(
+        df,
+        h=h,
+        freq=freq,
+        n_windows=n_windows,
+    )
+    exp_n_cols = 5  # unique_id, cutoff, ds, y, model
+    assert cv_df.shape == (n_series * h * n_windows, exp_n_cols)
+    cutoffs = cv_df["cutoff"].unique()
+    assert len(cutoffs) == n_windows
+    df_test = df.groupby("unique_id").tail(h * n_windows)
+    exp_cols = ["unique_id", "ds", "y"]
+    pd.testing.assert_frame_equal(
+        cv_df.sort_values(["unique_id", "ds"]).reset_index(drop=True)[exp_cols],
+        df_test.sort_values(["unique_id", "ds"]).reset_index(drop=True)[exp_cols],
+    )
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("model", all_models)
+def test_using_quantiles_comprehensive(model):
+    qs = [0.1, 0.5, 0.9]  # Reduced quantiles for speed
+    df = generate_series(n_series=2, freq="D")  # Reduced series
+    fcst_df = model.forecast(
+        df=df,
+        h=2,
+        freq="D",
+        quantiles=qs,
+    )
+    exp_qs_cols = [f"{model.alias}-q-{int(100 * q)}" for q in qs]
+    assert all(col in fcst_df.columns for col in exp_qs_cols)
+    assert not any(("-lo-" in col or "-hi-" in col) for col in fcst_df.columns)
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("model", all_models)
+def test_using_level_comprehensive(model):
+    level = [20, 80]  # Reduced levels for speed
+    df = generate_series(n_series=2, freq="D")
+    fcst_df = model.forecast(
+        df=df,
+        h=2,
+        freq="D",
+        level=level,
+    )
+    exp_lv_cols = []
+    for lv in level:
+        if lv == 0:
+            continue
+        exp_lv_cols.extend([f"{model.alias}-lo-{lv}", f"{model.alias}-hi-{lv}"])
+    assert all(col in fcst_df.columns for col in exp_lv_cols)
+    assert not any(("-q-" in col) for col in fcst_df.columns)
\ No newline at end of file
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
new file mode 100644
index 00000000..a4e93e2b
--- /dev/null
+++ b/tests/test_smoke.py
@@ -0,0 +1,44 @@
+"""
+Fast smoke tests for basic functionality.
+These tests run very quickly to catch import and basic setup issues.
+"""
+
+
+def test_timecopilot_imports():
+    """Test that the main classes can be imported without errors."""
+    from timecopilot import TimeCopilot, TimeCopilotForecaster
+
+    assert TimeCopilot is not None
+    assert TimeCopilotForecaster is not None
+
+
+def test_models_import():
+    """Test that model classes can be imported."""
+    from timecopilot.models import SeasonalNaive, ZeroModel
+
+    assert SeasonalNaive is not None
+    assert ZeroModel is not None
+
+
+def test_basic_model_creation():
+    """Test that basic models can be instantiated."""
+    from timecopilot.models import SeasonalNaive, ZeroModel
+
+    model1 = SeasonalNaive()
+    model2 = ZeroModel()
+
+    assert model1 is not None
+    assert model2 is not None
+    assert hasattr(model1, "forecast")
+    assert hasattr(model2, "forecast")
+
+
+def test_forecaster_creation():
+    """Test that TimeCopilotForecaster can be created with basic models."""
+    from timecopilot.forecaster import TimeCopilotForecaster
+    from timecopilot.models import SeasonalNaive, ZeroModel
+
+    models = [SeasonalNaive(), ZeroModel()]
+    forecaster = TimeCopilotForecaster(models=models)
+
+    assert forecaster is not None
+    assert len(forecaster.models) == 2
\ No newline at end of file