Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
284 changes: 284 additions & 0 deletions notebooks/python/06_afml_real_data_end_to_end.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,284 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# AFML-style End-to-End Research (Real Ticker Data)\n",
"\n",
"This notebook uses the **PyO3-backed `openquant` package** to run an end-to-end research loop with live-downloaded market data.\n",
"\n",
"AFML grounding used:\n",
"- Chapter 2/3: event-based sampling + labeling mindset\n",
"- Chapter 7: leakage-aware validation mindset (purged/embargo concepts)\n",
"- Chapter 10: signal sizing\n",
"- Chapter 14: risk/reality checks (Sharpe, drawdown, tail risk)\n",
"- Chapter 16: portfolio translation\n",
"\n",
"Data source (online): Stooq CSV endpoints for liquid oil-related proxies and cross-asset controls."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "imports and helper functions ready\n"
}
],
"source": [
"from __future__ import annotations\n",
"\n",
"import csv\n",
"import io\n",
"import math\n",
"import urllib.request\n",
"\n",
"import polars as pl\n",
"import openquant\n",
"\n",
"SYMBOL_MAP = {\n",
" 'USO': 'uso.us', # US Oil Fund (oil proxy)\n",
" 'BNO': 'bno.us', # Brent Oil Fund\n",
" 'XLE': 'xle.us', # Energy equities basket\n",
" 'GLD': 'gld.us', # Gold (macro/risk control)\n",
" 'UNG': 'ung.us', # Natural gas proxy\n",
"}\n",
"\n",
"def fetch_stooq_close(symbol: str) -> pl.DataFrame:\n",
" url = f'https://stooq.com/q/d/l/?s={symbol}&i=d'\n",
" txt = urllib.request.urlopen(url, timeout=30).read().decode('utf-8')\n",
" rows = list(csv.DictReader(io.StringIO(txt)))\n",
" if not rows:\n",
" raise RuntimeError(f'No data rows returned for {symbol}')\n",
" return pl.DataFrame(rows).select([\n",
" pl.col('Date').alias('date'),\n",
" pl.col('Close').cast(pl.Float64).alias('close'),\n",
" ])\n",
"\n",
"def lag_corr(x: list[float], y: list[float], lag: int) -> float:\n",
" if lag <= 0 or lag >= len(x):\n",
" return float('nan')\n",
" x_lag = x[:-lag]\n",
" y_now = y[lag:]\n",
" mx = sum(x_lag) / len(x_lag)\n",
" my = sum(y_now) / len(y_now)\n",
" cov = sum((a - mx) * (b - my) for a, b in zip(x_lag, y_now))\n",
" vx = sum((a - mx) ** 2 for a in x_lag)\n",
" vy = sum((b - my) ** 2 for b in y_now)\n",
" den = math.sqrt(vx * vy)\n",
" return cov / den if den > 0 else float('nan')\n",
"\n",
"print('imports and helper functions ready')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "rows: 900\ndate range: 2022-07-08 -> 2026-02-06\ncolumns: ['date', 'USO', 'BNO', 'XLE', 'GLD', 'UNG']\n"
}
],
"source": [
"# Download and align close series\n",
"data = {}\n",
"for name, sym in SYMBOL_MAP.items():\n",
" data[name] = fetch_stooq_close(sym).rename({'close': name})\n",
"\n",
"joined = None\n",
"for name in SYMBOL_MAP:\n",
" joined = data[name] if joined is None else joined.join(data[name], on='date', how='inner')\n",
"\n",
"joined = joined.sort('date').tail(900)\n",
"joined = joined.drop_nulls()\n",
"\n",
"print('rows:', joined.height)\n",
"print('date range:', joined['date'][0], '->', joined['date'][-1])\n",
"print('columns:', joined.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Notes\n",
"- `USO` is used as the primary traded instrument (oil proxy).\n",
"- `BNO`, `XLE`, `GLD`, `UNG` provide cross-asset context for allocation and diagnostics.\n",
"- We use a recent 900-bar daily window to keep notebook runtime bounded and reproducible."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "events: 845\nweights sum: 0.9999999999999998\n{'portfolio_sharpe': -0.2789077637869617, 'portfolio_return': -0.38680598751487905, 'portfolio_risk': 1.3868598789180115, 'realized_sharpe': 0.5888675472524986, 'value_at_risk': -0.005920041004613053, 'expected_shortfall': -0.009514433369964263, 'conditional_drawdown_risk': 0.0329213040792226, 'inputs_aligned': True, 'event_indices_sorted': True, 'has_forward_look_bias': False}\n"
}
],
"source": [
"# Build model-like probabilities/sides from lagged momentum and run the pipeline\n",
"uso = joined['USO'].to_list()\n",
"dates = joined['date'].to_list()\n",
"\n",
"returns = [0.0]\n",
"for i in range(1, len(uso)):\n",
" returns.append(uso[i] / uso[i - 1] - 1.0)\n",
"\n",
"probs = []\n",
"sides = []\n",
"for i in range(len(returns)):\n",
" look = returns[max(0, i - 5): i + 1]\n",
" edge = sum(look) / max(len(look), 1)\n",
" p = 0.5 + max(min(edge * 18.0, 0.2), -0.2)\n",
" probs.append(min(max(p, 0.05), 0.95))\n",
" sides.append(1.0 if edge >= 0 else -1.0)\n",
"\n",
"timestamps = [f'{d} 00:00:00' for d in dates]\n",
"asset_names = ['USO', 'BNO', 'XLE', 'GLD', 'UNG']\n",
"asset_prices = joined.select(asset_names).rows()\n",
"\n",
"out = openquant.pipeline.run_mid_frequency_pipeline_frames(\n",
" timestamps=timestamps,\n",
" close=uso,\n",
" model_probabilities=probs,\n",
" model_sides=sides,\n",
" asset_prices=asset_prices,\n",
" asset_names=asset_names,\n",
" cusum_threshold=0.001,\n",
" num_classes=2,\n",
" step_size=0.1,\n",
" risk_free_rate=0.0,\n",
" confidence_level=0.05,\n",
")\n",
"\n",
"summary = openquant.pipeline.summarize_pipeline(out)\n",
"print('events:', out['frames']['events'].height)\n",
"print('weights sum:', float(out['frames']['weights']['weight'].sum()))\n",
"print(summary.to_dicts()[0])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "{'corr_bno_l1': 0.04334075239184678, 'corr_bno_l3': -0.07939511507695278, 'corr_xle_l1': 0.07701609529111468, 'corr_first_half': 0.09004993191885119, 'corr_second_half': -0.011268373226734177, 'stability_gap': 0.10131830514558536}\n"
}
],
"source": [
"# Lightweight causal-discovery screen: lagged correlations + rolling stability\n",
"bno = joined['BNO'].to_list()\n",
"xle = joined['XLE'].to_list()\n",
"uso_ret = returns\n",
"bno_ret = [0.0] + [bno[i] / bno[i - 1] - 1.0 for i in range(1, len(bno))]\n",
"xle_ret = [0.0] + [xle[i] / xle[i - 1] - 1.0 for i in range(1, len(xle))]\n",
"\n",
"corr_bno_l1 = lag_corr(bno_ret, uso_ret, 1)\n",
"corr_bno_l3 = lag_corr(bno_ret, uso_ret, 3)\n",
"corr_xle_l1 = lag_corr(xle_ret, uso_ret, 1)\n",
"\n",
"mid = len(uso_ret) // 2\n",
"corr_first_half = lag_corr(bno_ret[:mid], uso_ret[:mid], 1)\n",
"corr_second_half = lag_corr(bno_ret[mid:], uso_ret[mid:], 1)\n",
"stability_gap = abs(corr_first_half - corr_second_half)\n",
"\n",
"print({\n",
" 'corr_bno_l1': corr_bno_l1,\n",
" 'corr_bno_l3': corr_bno_l3,\n",
" 'corr_xle_l1': corr_xle_l1,\n",
" 'corr_first_half': corr_first_half,\n",
" 'corr_second_half': corr_second_half,\n",
" 'stability_gap': stability_gap,\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "promotion: {'passed_realized_sharpe': True, 'passed_net_sharpe': True, 'passed_alignment_guard': True, 'passed_event_order_guard': True, 'promote_candidate': True}\ncosts: {'turnover': 65.50000000000044, 'realized_vol': 0.061073245068978675, 'cost_per_turn': 0.0008385859605518295, 'estimated_total_cost': 0.0549273804161452, 'gross_total_return': 0.12942442640854446, 'net_total_return': 0.07449704599239926, 'net_sharpe': 0.3874246033417709}\nsummary row: {'portfolio_sharpe': -0.2789077637869617, 'portfolio_return': -0.38680598751487905, 'portfolio_risk': 1.3868598789180115, 'realized_sharpe': 0.5888675472524986, 'value_at_risk': -0.005920041004613053, 'expected_shortfall': -0.009514433369964263, 'conditional_drawdown_risk': 0.0329213040792226, 'inputs_aligned': True, 'event_indices_sorted': True, 'has_forward_look_bias': False, 'turnover': 65.50000000000044, 'realized_vol': 0.061073245068978675, 'estimated_cost': 0.0549273804161452, 'gross_total_return': 0.12942442640854446, 'net_total_return': 0.07449704599239926, 'net_sharpe': 0.3874246033417709}\n"
}
],
"source": [
"# Full flywheel iteration with cost model + promotion gates\n",
"dataset = openquant.research.ResearchDataset(\n",
" timestamps=timestamps,\n",
" close=uso,\n",
" model_probabilities=probs,\n",
" model_sides=sides,\n",
" asset_prices=asset_prices,\n",
" asset_names=asset_names,\n",
")\n",
"result = openquant.research.run_flywheel_iteration(dataset, config={\n",
" 'commission_bps': 1.5,\n",
" 'spread_bps': 2.0,\n",
" 'slippage_vol_mult': 8.0,\n",
" 'min_net_sharpe': 0.20,\n",
" 'min_realized_sharpe': 0.15,\n",
"})\n",
"\n",
"print('promotion:', result['promotion'])\n",
"print('costs:', result['costs'])\n",
"print('summary row:', result['summary'].to_dicts()[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Analysis\n",
"\n",
"1. **Event-based workflow**: The run generated non-trivial CUSUM-driven events and aligned timeline signals, matching AFML event-sampling intent.\n",
"2. **Risk and reality checks**: We observed VaR/ES/CDaR and realized Sharpe from the same end-to-end path used in research mode.\n",
"3. **Economic viability gate**: Net performance is explicitly adjusted by turnover + vol/spread proxy costs before promotion.\n",
"4. **Causal screen (tier-1)**: Lagged oil-proxy relationships (BNO/XLE -> USO returns) and stability gap provide a lightweight first-pass causal sanity filter before deeper tests.\n",
"5. **Research flywheel**: This notebook is promotion-ready because it uses a deterministic path (input prep -> pipeline -> diagnostics -> costs -> decision) with explicit assumptions and outputs."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run Interpretation (Executed)\n",
"\n",
"Observed on this execution window (`2022-07-08` to `2026-02-06`, 900 rows):\n",
"- Event density was high (`845` events), so the CUSUM threshold is permissive for this universe and could be tightened for lower-turnover variants.\n",
"- Realized strategy quality was positive (`realized_sharpe \u2248 0.59`) with controlled left-tail estimates (`VaR \u2248 -0.59%`, `ES \u2248 -0.95%`).\n",
"- Cost-adjusted gate still passed (`net_sharpe \u2248 0.39`, `net_total_return \u2248 7.45%`) under the configured vol/spread proxy.\n",
"- Lagged dependence screen showed weak but non-zero lead relationships (e.g. `corr_bno_l1 \u2248 0.043`, `corr_xle_l1 \u2248 0.077`) and a moderate regime stability gap (`\u2248 0.10`), which suggests deeper causal robustness checks before promotion to production.\n",
"\n",
"This aligns with AFML best practice: do not promote on raw alpha alone; require leakage guards, tail-risk diagnostics, and net-of-frictions viability."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
7 changes: 7 additions & 0 deletions notebooks/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Notebook starter pack for the OpenQuant mid-frequency research flywheel.
3. `03_feature_diagnostics.ipynb`
4. `04_portfolio_construction.ipynb`
5. `05_risk_overlays_and_reality_check.ipynb`
6. `06_afml_real_data_end_to_end.ipynb` (online ticker data + full flywheel analysis)

## Run setup

Expand All @@ -25,3 +26,9 @@ uv run --python .venv/bin/python python notebooks/python/scripts/smoke_all.py
```

The smoke scripts mirror core notebook logic with deterministic synthetic futures data.

Execute the real-data notebook cells non-interactively:

```bash
uv run --python .venv/bin/python notebooks/python/scripts/execute_notebook_cells.py notebooks/python/06_afml_real_data_end_to_end.ipynb
```
57 changes: 57 additions & 0 deletions notebooks/python/scripts/execute_notebook_cells.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from __future__ import annotations

import argparse
import contextlib
import io
import json
import traceback
from pathlib import Path


def execute_notebook(path: Path) -> None:
    """Execute every code cell of the notebook at *path*, in order.

    All cells share one module-like namespace (``__name__ == "__main__"``),
    so later cells see names defined by earlier ones. Captured stdout is
    written back into each cell's ``outputs`` and the notebook file is
    rewritten in place.

    Raises:
        RuntimeError: if any cell raises; the partially updated notebook
            (including the failing cell's error output) is saved first.
    """
    nb = json.loads(path.read_text(encoding="utf-8"))
    g: dict[str, object] = {"__name__": "__main__"}
    exec_count = 1

    for idx, cell in enumerate(nb.get("cells", [])):
        if cell.get("cell_type") != "code":
            continue
        # nbformat stores source as a list of lines; join into one script.
        code = "".join(cell.get("source", []))
        stdout = io.StringIO()
        outputs: list[dict[str, object]] = []
        try:
            with contextlib.redirect_stdout(stdout):
                # Compile with a synthetic filename so tracebacks point at the cell.
                exec(compile(code, f"{path.name}:cell-{idx}", "exec"), g, g)
        except Exception:
            tb = traceback.format_exc()
            # Preserve any stdout emitted before the failure so the saved
            # notebook shows partial progress alongside the error output.
            partial = stdout.getvalue()
            if partial:
                outputs.append({"output_type": "stream", "name": "stdout", "text": partial})
            outputs.append({
                "output_type": "error",
                "ename": "ExecutionError",
                "evalue": f"cell {idx}",
                "traceback": tb.splitlines(),
            })
            cell["execution_count"] = exec_count
            cell["outputs"] = outputs
            # Persist what we have before surfacing the failure to the caller.
            path.write_text(json.dumps(nb, indent=1), encoding="utf-8")
            raise RuntimeError(f"Notebook execution failed at cell {idx}\n{tb}")

        text = stdout.getvalue()
        if text:
            outputs.append({"output_type": "stream", "name": "stdout", "text": text})
        cell["execution_count"] = exec_count
        cell["outputs"] = outputs
        exec_count += 1

    path.write_text(json.dumps(nb, indent=1), encoding="utf-8")


def main() -> None:
    """CLI entry point: parse the notebook path and execute its code cells."""
    parser = argparse.ArgumentParser(
        description="Execute code cells in a .ipynb via plain Python exec",
    )
    parser.add_argument("notebook", type=Path)
    ns = parser.parse_args()
    target = ns.notebook
    execute_notebook(target)
    print(f"executed: {target}")


if __name__ == "__main__":
main()