Skip to content

Commit b00c4ca

Browse files
authored
feat(models): add backtesting helpers (#2)
1 parent 61a5f2b commit b00c4ca

6 files changed

Lines changed: 621 additions & 12 deletions

File tree

.github/workflows/tests.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ on:
88
jobs:
99
test:
1010
runs-on: ubuntu-latest
11+
strategy:
12+
matrix:
13+
extras: ["dev", "dev,backtest"]
1114

1215
steps:
1316
- name: Checkout repository
@@ -19,8 +22,8 @@ jobs:
1922
activate-environment: true
2023
python-version: "3.13"
2124

22-
- name: Install dependencies
23-
run: uv pip install -e ".[dev]"
25+
- name: Install dependencies with extras
26+
run: uv pip install -e ".[${{ matrix.extras }}]"
2427

2528
- name: Run tests with pytest
2629
run: uv run pytest

README.md

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@ You can install it directly from the GitHub repository.
1212
pip install git+https://github.com/ZeroGuacamole/mode-python-sdk.git
1313
```
1414

15+
Install with backtesting helpers (pandas/numpy) via extras:
16+
17+
```bash
18+
pip install "mode-sdk[backtest] @ git+https://github.com/ZeroGuacamole/mode-python-sdk.git"
19+
```
20+
1521
## Quickstart
1622

1723
Here's a quick example of how to use the client to fetch historical data.
@@ -60,6 +66,45 @@ except ModeAPIError as e:
6066

6167
```
6268

69+
### Helpers
70+
71+
The models include utilities commonly used in research/backtesting pipelines.
72+
73+
1. Convert historical data to a pandas DataFrame (UTC index):
74+
75+
```python
76+
from mode_sdk.client import ModeAPIClient
77+
78+
client = ModeAPIClient()
79+
hist = client.market_data.get_historical_data("AAPL", "2024-01-01", "2024-01-31", "daily")
80+
81+
# Requires: pip install pandas
82+
df = hist.to_dataframe()
83+
print(df.head())
84+
```
85+
86+
2. Convert historical data to NumPy arrays for vectorized processing:
87+
88+
```python
89+
# Requires: pip install numpy
90+
ts, open_, high, low, close, volume = hist.to_numpy()
91+
```
92+
93+
3. Quote convenience properties:
94+
95+
```python
96+
quotes = client.market_data.get_quotes(["AAPL"]).quotes
97+
q = quotes["AAPL"]
98+
print(q.mid_price, q.spread)
99+
```
100+
101+
### Data validation and normalization
102+
103+
- Symbols are normalized to uppercase in `Asset` and `HistoricalDataResponse`.
104+
- Timestamps are normalized to UTC in all models that include time fields.
105+
- OHLCV values are validated (non-negative; high/low consistency) for `HistoricalDataPoint`.
106+
- `Quote` validation ensures non-negative prices and `ask >= bid` when both are present.
107+
63108
## Development
64109

65110
1. Clone the repository.

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,15 @@ classifiers = [
1414
"Operating System :: OS Independent",
1515
]
1616
dependencies = [
17-
"requests==2.32.4",
17+
"requests==2.32.5",
1818
"pydantic==2.11.7",
1919
"python-dotenv",
2020
"types-requests>=2.32.4",
2121
]
2222

2323
[project.optional-dependencies]
2424
dev = ["pytest", "pytest-mock", "requests-mock", "black", "ruff", "mypy"]
25+
backtest = ["pandas>=2.3.2", "numpy>=2.3.2"]
2526

2627
[tool.ruff]
2728
line-length = 120

src/mode_sdk/models.py

Lines changed: 195 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
from datetime import datetime
1+
from datetime import datetime, timezone
2+
import importlib
23
from enum import Enum
34
from typing import Any, Dict, List, Optional
45

5-
from pydantic import BaseModel, Field, field_validator, ValidationInfo
6+
from pydantic import BaseModel, Field, field_validator, ValidationInfo, model_validator
67

78

89
class AssetType(str, Enum):
@@ -53,6 +54,20 @@ def validate_details(cls, v: Any, info: ValidationInfo) -> Any:
5354

5455
return v
5556

57+
@field_validator("symbol", mode="after")
@classmethod
def normalize_symbol(cls, v: str) -> str:
    """Uppercase the symbol so every model shares one canonical spelling."""
    normalized = v.upper()
    return normalized
62+
63+
@field_validator("last_updated", mode="after")
@classmethod
def normalize_last_updated(cls, v: datetime) -> datetime:
    """Return the timestamp as an aware UTC datetime.

    Naive datetimes are assumed to already be in UTC; aware ones are
    converted to UTC.
    """
    if v.tzinfo is not None:
        return v.astimezone(timezone.utc)
    return v.replace(tzinfo=timezone.utc)
70+
5671

5772
class Quote(BaseModel):
5873
"""Represents a real-time quote for a symbol."""
@@ -70,13 +85,59 @@ class Quote(BaseModel):
7085
previous_close: Optional[float] = Field(default=None, alias="previousClose")
7186
open: Optional[float] = None
7287

88+
@field_validator("timestamp", mode="after")
@classmethod
def normalize_timestamp(cls, v: datetime) -> datetime:
    """Coerce the quote timestamp to an aware UTC datetime (naive => UTC)."""
    aware = v if v.tzinfo is not None else v.replace(tzinfo=timezone.utc)
    return aware.astimezone(timezone.utc)
95+
96+
@model_validator(mode="after")
def validate_prices(self) -> "Quote":
    """Basic sanity checks for price fields used in backtesting."""
    # Every price-like field must be non-negative when present.
    for field_name in ("price", "bid", "ask"):
        value = getattr(self, field_name)
        if value is not None and value < 0:
            raise ValueError(f"{field_name} must be non-negative")
    # A crossed book (ask below bid) is rejected outright.
    if self.bid is not None and self.ask is not None and self.ask < self.bid:
        raise ValueError("ask must be greater than or equal to bid")
    return self
108+
109+
@property
def mid_price(self) -> Optional[float]:
    """Return the mid price ((bid + ask) / 2) when both sides are quoted.

    Falls back to the last traded price otherwise. Note that ``price`` is
    itself optional (the sibling ``validate_prices`` checks it against
    ``None``), so this property can return ``None`` — the previous
    ``-> float`` annotation was incorrect.
    """
    if self.bid is not None and self.ask is not None:
        return (self.bid + self.ask) / 2.0
    return self.price
115+
116+
@property
def spread(self) -> Optional[float]:
    """Bid/ask spread, or None when either side of the book is missing."""
    if self.bid is None or self.ask is None:
        return None
    return self.ask - self.bid
122+
73123

74124
class QuoteResponse(BaseModel):
    """Payload of the quotes API: per-symbol quotes plus per-symbol errors."""

    quotes: Dict[str, Quote]
    errors: Dict[str, str]

    @model_validator(mode="after")
    def validate_quote_keys(self) -> "QuoteResponse":
        """Reject responses whose mapping keys disagree with the nested symbols."""
        for key, quote in self.quotes.items():
            if not quote.symbol:
                # API may return partial quotes without a symbol; skip those.
                continue
            if key.upper() != quote.symbol.upper():
                raise ValueError(
                    f"quotes key '{key}' does not match nested symbol '{quote.symbol}'"
                )
        return self
140+
80141

81142
class HistoricalDataPoint(BaseModel):
82143
"""Represents a single OHLCV data point."""
@@ -88,9 +149,141 @@ class HistoricalDataPoint(BaseModel):
88149
close: Optional[float] = None
89150
volume: Optional[int] = None
90151

152+
@field_validator("timestamp", mode="after")
@classmethod
def normalize_timestamp(cls, v: datetime) -> datetime:
    """Make bar timestamps timezone-aware, expressed in UTC (naive => UTC)."""
    if v.tzinfo is not None:
        return v.astimezone(timezone.utc)
    return v.replace(tzinfo=timezone.utc)
159+
160+
@model_validator(mode="after")
def validate_ohlcv(self) -> "HistoricalDataPoint":
    """Sanity checks for OHLCV used in backtesting pipelines."""
    # All price fields and volume must be non-negative when present.
    for name in ("open", "high", "low", "close"):
        value = getattr(self, name)
        if value is not None and value < 0:
            raise ValueError(f"{name} must be non-negative")
    if self.volume is not None and self.volume < 0:
        raise ValueError("volume must be non-negative")

    # High must dominate every other provided price field.
    others_high = [p for p in (self.open, self.close, self.low) if p is not None]
    if self.high is not None and others_high and self.high < max(others_high):
        raise ValueError("high must be >= max(open, close, low) when provided")

    # Low must be dominated by every other provided price field.
    others_low = [p for p in (self.open, self.close, self.high) if p is not None]
    if self.low is not None and others_low and self.low > min(others_low):
        raise ValueError("low must be <= min(open, close, high) when provided")

    return self
192+
91193

92194
class HistoricalDataResponse(BaseModel):
    """Represents the structure of the historical data API response."""

    # Symbol the bars belong to; normalized to uppercase below.
    symbol: str
    data_points: List[HistoricalDataPoint] = Field(..., alias="dataPoints")

    @field_validator("symbol", mode="after")
    @classmethod
    def normalize_symbol(cls, v: str) -> str:
        """Normalize the response symbol to uppercase, matching `Asset`."""
        return v.upper()

    def to_records(self) -> List[Dict[str, Any]]:
        """Return the historical data as a list of dictionaries.

        Each record carries the keys: timestamp, open, high, low, close, volume.
        """
        return [
            {
                "timestamp": point.timestamp,
                "open": point.open,
                "high": point.high,
                "low": point.low,
                "close": point.close,
                "volume": point.volume,
            }
            for point in self.data_points
        ]

    def to_dataframe(self):
        """Convert the historical data to a pandas DataFrame (if pandas is installed).

        Returns a DataFrame indexed by UTC timestamps with columns: open, high, low,
        close, volume. The frame is sorted by index and duplicate timestamps are
        collapsed keeping the last occurrence.

        Raises:
            ImportError: If pandas is not installed (install the 'backtest' extra).
        """
        # Import lazily so the core SDK works without the backtest extra.
        try:
            pd = importlib.import_module("pandas")
        except ImportError as exc:  # narrow: only a missing pandas install
            raise ImportError(
                "pandas is required for to_dataframe(); install with 'pip install pandas'"
            ) from exc

        records = self.to_records()
        if not records:
            # Empty frame with the documented columns and float dtypes so
            # callers can rely on a stable schema.
            return pd.DataFrame(
                columns=["open", "high", "low", "close", "volume"]
            ).astype(
                {
                    "open": "float64",
                    "high": "float64",
                    "low": "float64",
                    "close": "float64",
                    "volume": "float64",
                }
            )

        frame = pd.DataFrame.from_records(records)
        frame["timestamp"] = pd.to_datetime(frame["timestamp"], utc=True)
        frame.set_index("timestamp", inplace=True)
        frame.sort_index(inplace=True)
        # Keep the most recent observation for any duplicated timestamp.
        frame = frame[~frame.index.duplicated(keep="last")]
        return frame

    def to_numpy(self):
        """Return numpy arrays (timestamps, open, high, low, close, volume).

        Arrays are suitable for fast vectorized backtests. Missing OHLCV values
        are encoded as NaN; timestamps are tz-naive UTC datetime64[ns].

        Raises:
            ImportError: If numpy is not installed (install the 'backtest' extra).
        """
        # Import lazily so the core SDK works without the backtest extra.
        try:
            np = importlib.import_module("numpy")
        except ImportError as exc:  # narrow: only a missing numpy install
            raise ImportError(
                "numpy is required for to_numpy(); install with 'pip install numpy'"
            ) from exc

        points = self.data_points
        n = len(points)
        ts = np.empty(n, dtype="datetime64[ns]")
        open_ = np.full(n, np.nan)
        high = np.full(n, np.nan)
        low = np.full(n, np.nan)
        close = np.full(n, np.nan)
        vol = np.full(n, np.nan)

        for i, p in enumerate(points):
            # Strip tzinfo after converting to UTC: datetime64 is tz-naive.
            dt_utc = p.timestamp.astimezone(timezone.utc).replace(tzinfo=None)
            ts[i] = np.datetime64(dt_utc, "ns")
            if p.open is not None:
                open_[i] = p.open
            if p.high is not None:
                high[i] = p.high
            if p.low is not None:
                low[i] = p.low
            if p.close is not None:
                close[i] = p.close
            if p.volume is not None:
                vol[i] = p.volume

        return ts, open_, high, low, close, vol

0 commit comments

Comments
 (0)