17 changes: 16 additions & 1 deletion .github/workflows/ci.yaml
@@ -17,7 +17,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.10", "3.11", "3.12", "3.13"]
+        # Reduced Python version matrix for faster CI
+        python-version: ["3.11", "3.12"]
     env:
       UV_PYTHON: ${{ matrix.python-version }}

@@ -30,8 +31,21 @@ jobs:
       with:
         enable-cache: true
 
+      # Cache model weights to speed up any foundation model tests
+      - name: Cache model weights
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/huggingface
+            ~/.cache/torch
+          key: model-weights-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}
+          restore-keys: |
+            model-weights-${{ runner.os }}-${{ matrix.python-version }}-
+            model-weights-${{ runner.os }}-
+
       - name: Run tests
         run: uv run pytest
+        timeout-minutes: 10
 
       - name: Test import class
         run: uv run -- python -c "from timecopilot import TimeCopilot, TimeCopilotForecaster"
@@ -53,6 +67,7 @@ jobs:
 
       - name: Run tests
         run: uv run pytest -m live
+        timeout-minutes: 15
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

58 changes: 58 additions & 0 deletions .github/workflows/comprehensive.yaml
@@ -0,0 +1,58 @@
name: Comprehensive Tests

on:
  schedule:
    # Run comprehensive tests nightly at 2 AM UTC
    - cron: '0 2 * * *'
  workflow_dispatch:
    # Allow manual triggering
  push:
    branches: [main]
    paths:
      # Only run when model-related files change
      - 'timecopilot/models/**'
      - 'tests/models/test_models_comprehensive.py'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  test-comprehensive:
    name: comprehensive test on ${{ matrix.python-version }}
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        # Test on fewer Python versions for comprehensive tests
        python-version: ["3.11", "3.12"]
    env:
      UV_PYTHON: ${{ matrix.python-version }}

    steps:
      - name: Clone repo
        uses: actions/checkout@v4

      - name: Set up uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true

      # Cache model weights to speed up foundation model tests
      - name: Cache model weights
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/huggingface
            ~/.cache/torch
          key: model-weights-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}
          restore-keys: |
            model-weights-${{ runner.os }}-${{ matrix.python-version }}-
            model-weights-${{ runner.os }}-

      - name: Run comprehensive tests
        run: uv run pytest -m slow -v
        timeout-minutes: 45

      - name: Test import class
        run: uv run -- python -c "from timecopilot import TimeCopilot, TimeCopilotForecaster"
107 changes: 107 additions & 0 deletions TESTING.md
@@ -0,0 +1,107 @@
# Testing Guide

This project uses a two-tier testing strategy to balance speed and comprehensive coverage.

## Test Categories

### Fast Tests (Regular CI)
- **Purpose**: Quick validation for every PR/push
- **Runtime**: ~2-3 minutes
- **Models**: Lightweight benchmark models (SeasonalNaive, ZeroModel, AutoARIMA, Prophet, etc.)
- **Trigger**: Runs on every push/PR
- **Command**: `uv run pytest` (excludes slow tests by default)

### Slow Tests (Comprehensive)
- **Purpose**: Full model validation including heavy foundation models
- **Runtime**: ~30-45 minutes
- **Models**: All models including Chronos, Moirai, TimesFM, TiRex, etc.
- **Trigger**: Nightly schedule + manual dispatch
- **Command**: `uv run pytest -m slow`

### Live Tests
- **Purpose**: Test integration with external APIs (OpenAI, etc.)
- **Models**: Uses real LLM endpoints
- **Trigger**: Regular CI (with API keys)
- **Command**: `uv run pytest -m live`
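
In CI the API key comes from the repository secrets; to run the live tests locally, export the key yourself first (the value below is a placeholder, not a real key):

```bash
export OPENAI_API_KEY="sk-..."  # placeholder; use your own key
uv run pytest -m live
```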

## Running Tests Locally

```bash
# Fast tests only (recommended for development)
uv run pytest

# Include slow foundation model tests
uv run pytest -m "not live"

# All tests including live API calls
uv run pytest -m ""

# Specific test categories
uv run pytest -m slow # Foundation models only
uv run pytest -m live # API integration only
uv run pytest tests/test_smoke.py # Basic smoke tests
```

## Test Optimization Details

### Model Selection Strategy
- **Fast models** (`tests/conftest.py::fast_models`): Used in regular CI
  - Benchmark models: SeasonalNaive, ZeroModel, AutoARIMA, Prophet, ADIDA
  - Lightweight foundation: TabPFN (MOCK mode)

- **Comprehensive models** (`tests/conftest.py::all_models`): Used in nightly tests
  - All fast models plus heavy foundation models
  - Foundation models: Chronos, Moirai, TimesFM, TiRex, Toto

### Parameterization Reductions
To speed up CI, several test parameters were reduced (a sketch of the pattern follows this list):
- **Frequencies**: 4 → 2 (H,D,W-MON,MS → D,W-MON)
- **Series counts**: 5 → 3 per test
- **Quantiles**: 9 → 3 (0.1-0.9 → 0.1,0.5,0.9)
- **Cross-validation horizon**: 12 → 6
- **Python versions**: 4 → 2 (3.11, 3.12 for regular CI)
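
The pattern looks roughly like this; a minimal sketch, assuming `models` can be imported from the suite's `conftest.py` (exact import path assumed) and that `generate_series` comes from `utilsforecast`, as in the existing tests:

```python
import pytest
from utilsforecast.data import generate_series

from tests.conftest import models  # fast model list; exact import path assumed


@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("freq", ["D", "W-MON"])  # reduced from ["H", "D", "W-MON", "MS"]
def test_forecast_shape_sketch(model, freq):
    h = 6  # horizon reduced from 12
    n_series = 3  # reduced from 5
    df = generate_series(n_series, freq=freq)
    fcst_df = model.forecast(df=df, h=h)
    # one forecast row per series per horizon step
    assert len(fcst_df) == n_series * h
```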

### Caching Strategy
- **Model weights**: HuggingFace and PyTorch caches preserved between runs
- **Dependencies**: UV package cache enabled
- **Cache keys**: Include Python version and dependency hash
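
To warm the same caches locally before running the slow suite, a sketch along these lines works; `snapshot_download` is part of `huggingface_hub`, and the tiny Chronos repo ids are the ones used in `tests/conftest.py`:

```python
from huggingface_hub import snapshot_download

# Downloads weights into ~/.cache/huggingface, the same path the CI cache step preserves
for repo_id in ["amazon/chronos-t5-tiny", "amazon/chronos-bolt-tiny"]:
    snapshot_download(repo_id)
```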

## Adding New Tests

### For Fast Models
Add tests to existing files in `tests/`; they will automatically use the fast model list and run in regular CI.

### For Foundation Models
- Add tests with `@pytest.mark.slow` decorator
- Consider using reduced parameterization for speed
- Test will run in nightly comprehensive workflow

### Example
```python
@pytest.mark.slow
@pytest.mark.parametrize("model", all_models) # Use all_models for comprehensive
def test_new_foundation_feature(model):
# Test with foundation models
pass

@pytest.mark.parametrize("model", models) # Use models for fast tests
def test_new_basic_feature(model):
# Test with fast models only
pass
```

## CI Workflows

### `.github/workflows/ci.yaml`
- **Triggers**: Push to main, PRs
- **Python versions**: 3.11, 3.12
- **Timeout**: 10 minutes regular tests, 15 minutes live tests
- **Excludes**: Slow tests (foundation models)

### `.github/workflows/comprehensive.yaml`
- **Triggers**: Nightly at 2 AM UTC, manual dispatch (see the command after this list), model file changes
- **Python versions**: 3.11, 3.12
- **Timeout**: 45 minutes
- **Includes**: Only slow tests (foundation models)
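
The manual dispatch can be triggered from the Actions tab or, assuming the GitHub CLI (`gh`) is installed and authenticated, from the command line:

```bash
gh workflow run comprehensive.yaml
```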

This structure ensures rapid feedback for developers while maintaining comprehensive test coverage through automated nightly runs.
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -88,9 +88,10 @@ disable_error_code = ["no-redef"] # for fasthtml
 
 [tool.pytest.ini_options]
 testpaths = ["tests"]
-addopts = "-m 'not live' -n auto"
+addopts = "-m 'not live and not slow' -n auto"
 markers = [
-    "live: marks tests that require calls to llm providers"
+    "live: marks tests that require calls to llm providers",
+    "slow: marks tests with heavy foundation models that take longer to run"
 ]
 
 [tool.ruff]
28 changes: 20 additions & 8 deletions tests/conftest.py
@@ -5,28 +5,37 @@
 from timecopilot.models.foundational.moirai import Moirai
 from timecopilot.models.foundational.toto import Toto
 
+# Fast benchmark models for regular CI
 benchmark_models = [
     "AutoARIMA",
     "SeasonalNaive",
     "ZeroModel",
     "ADIDA",
-    "TimesFM",
     "Prophet",
 ]
-models = [MODELS[str_model] for str_model in benchmark_models]
-if sys.version_info >= (3, 11):
-    from timecopilot.models.foundational.tirex import TiRex
-
-    models.append(TiRex())
+# Fast models for regular CI (exclude TimesFM as it's heavy)
+fast_models = [MODELS[str_model] for str_model in benchmark_models]
 
+# Add lightweight foundation models for fast tests
 if sys.version_info < (3, 13):
     from tabpfn_time_series import TabPFNMode
 
     from timecopilot.models.foundational.tabpfn import TabPFN
 
-    models.append(TabPFN(mode=TabPFNMode.MOCK))
+    fast_models.append(TabPFN(mode=TabPFNMode.MOCK))
 
-models.extend(
+# Comprehensive model list including heavy foundation models (for slow tests)
+all_models = fast_models.copy()
+
+# Add TimesFM to comprehensive list
+all_models.append(MODELS["TimesFM"])
+
+# Add TiRex for Python >= 3.11
+if sys.version_info >= (3, 11):
+    from timecopilot.models.foundational.tirex import TiRex
+    all_models.append(TiRex())
+
+# Add heavy foundation models to comprehensive list
+all_models.extend(
     [
         Chronos(repo_id="amazon/chronos-t5-tiny", alias="Chronos-T5"),
         Chronos(repo_id="amazon/chronos-bolt-tiny", alias="Chronos-Bolt"),

@@ -44,3 +53,6 @@
         ),
     ]
 )
+
+# Default to fast models for regular CI
+models = fast_models
18 changes: 9 additions & 9 deletions tests/models/test_models.py
@@ -40,7 +40,7 @@ def test_tabpfn_import_fails():
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("freq", ["H", "D", "W-MON", "MS"])
+@pytest.mark.parametrize("freq", ["D", "W-MON"])  # Reduced freq combinations for speed
 def test_freq_inferred_correctly(model, freq):
     n_series = 2
     df = generate_series(
@@ -66,10 +66,10 @@ def test_freq_inferred_correctly(model, freq):
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("freq", ["H", "D", "W-MON", "MS"])
+@pytest.mark.parametrize("freq", ["D", "W-MON"])  # Reduced freq combinations for speed
 @pytest.mark.parametrize("h", [1, 12])
 def test_correct_forecast_dates(model, freq, h):
-    n_series = 5
+    n_series = 3  # Reduced from 5 for speed
     df = generate_series(
         n_series,
         freq=freq,
@@ -91,11 +91,11 @@ def test_correct_forecast_dates(model, freq, h):
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("freq", ["H", "D", "W-MON", "MS"])
+@pytest.mark.parametrize("freq", ["D", "W-MON"])  # Reduced freq combinations for speed
 @pytest.mark.parametrize("n_windows", [1, 4])
 def test_cross_validation(model, freq, n_windows):
-    h = 12
-    n_series = 5
+    h = 6  # Reduced from 12 for speed
+    n_series = 3  # Reduced from 5 for speed
     df = generate_series(n_series, freq=freq, equal_ends=True)
     cv_df = model.cross_validation(
         df,
@@ -152,8 +152,8 @@ def test_passing_both_level_and_quantiles(model):
 
 @pytest.mark.parametrize("model", models)
 def test_using_quantiles(model):
-    qs = [round(i * 0.1, 1) for i in range(1, 10)]
-    df = generate_series(n_series=3, freq="D")
+    qs = [0.1, 0.5, 0.9]  # Reduced quantiles for speed
+    df = generate_series(n_series=2, freq="D")  # Reduced series for speed
     fcst_df = model.forecast(
         df=df,
         h=2,
@@ -186,7 +186,7 @@ def test_using_quantiles(model):
 
 @pytest.mark.parametrize("model", models)
 def test_using_level(model):
-    level = [0, 20, 40, 60, 80]  # corresponds to qs [0.1, 0.2, ..., 0.9]
+    level = [20, 80]  # Reduced levels for speed
     df = generate_series(n_series=2, freq="D")
     fcst_df = model.forecast(
         df=df,