From 038fc8876043a7563304fa50389e111ce7ea31d3 Mon Sep 17 00:00:00 2001 From: Sean Koval Date: Sat, 14 Feb 2026 02:15:38 -0500 Subject: [PATCH 1/5] Add openquant.data module and Python 3.13 benchmark workflow --- Cargo.lock | 85 ++------- README.md | 5 +- crates/pyopenquant/Cargo.toml | 2 +- docs/python_bindings.md | 8 +- docs/research_workflow.md | 2 +- justfile | 5 +- notebooks/python/README.md | 2 +- pyproject.toml | 1 + python/benchmarks/benchmark_pipeline.py | 52 ++++++ python/openquant/__init__.py | 2 + python/openquant/data.py | 193 ++++++++++++++++++++ python/tests/fixtures/ohlcv_us_equities.csv | 7 + python/tests/test_data_module.py | 82 +++++++++ 13 files changed, 368 insertions(+), 78 deletions(-) create mode 100644 python/benchmarks/benchmark_pipeline.py create mode 100644 python/openquant/data.py create mode 100644 python/tests/fixtures/ohlcv_us_equities.csv create mode 100644 python/tests/test_data_module.py diff --git a/Cargo.lock b/Cargo.lock index 66d6a6f..d6876b2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -47,12 +47,6 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" -[[package]] -name = "bitflags" -version = "2.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" - [[package]] name = "bumpalo" version = "3.19.0" @@ -282,9 +276,9 @@ dependencies = [ [[package]] name = "heck" -version = "0.4.1" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "hermit-abi" @@ -388,15 +382,6 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" -[[package]] -name = "lock_api" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" -dependencies = [ - "scopeguard", -] - [[package]] name = "log" version = "0.4.28" @@ -550,29 +535,6 @@ dependencies = [ "statrs", ] -[[package]] -name = "parking_lot" -version = "0.12.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-link", -] - [[package]] name = "paste" version = "1.0.15" @@ -633,15 +595,15 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.21.2" +version = "0.23.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5e00b96a521718e08e03b1a622f01c8a8deb50719335de3f60b3b3950f069d8" +checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872" dependencies = [ "cfg-if", "indoc", "libc", "memoffset", - "parking_lot", + "once_cell", "portable-atomic", "pyo3-build-config", "pyo3-ffi", @@ -651,9 +613,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.21.2" +version = "0.23.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7883df5835fafdad87c0d888b266c8ec0f4c9ca48a5bed6bbb592e8dedee1b50" +checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb" dependencies = [ "once_cell", "target-lexicon", @@ -661,9 +623,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.21.2" +version = "0.23.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "01be5843dc60b916ab4dad1dca6d20b9b4e6ddc8e15f50c47fe6d85f1fb97403" +checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d" dependencies = [ "libc", "pyo3-build-config", @@ -671,9 +633,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.21.2" +version = "0.23.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77b34069fc0682e11b31dbd10321cbf94808394c56fd996796ce45217dfac53c" +checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -683,9 +645,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.21.2" +version = "0.23.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08260721f32db5e1a5beae69a55553f56b99bd0e1c3e6e0a5e8851a9d0f5a85c" +checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028" dependencies = [ "heck", "proc-macro2", @@ -779,15 +741,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "redox_syscall" -version = "0.5.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" -dependencies = [ - "bitflags", -] - [[package]] name = "regex" version = "1.12.3" @@ -847,12 +800,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - [[package]] name = "serde" version = "1.0.228" @@ -928,12 +875,6 @@ dependencies = [ "wide", ] -[[package]] -name = "smallvec" -version = "1.15.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" - [[package]] name = "statrs" version = "0.16.1" diff --git a/README.md 
b/README.md index c2e9949..27d0dd3 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ python3 scripts/check_bench_thresholds.py --baseline benchmarks/baseline_benchma ## Research Flywheel (Python + Rust) ```bash # Python env + bindings -uv venv --python 3.11 .venv +uv venv --python 3.13 .venv uv sync --group dev uv run --python .venv/bin/python maturin develop --manifest-path crates/pyopenquant/Cargo.toml @@ -51,6 +51,9 @@ uv run --python .venv/bin/python python experiments/run_pipeline.py --config exp # Rust notebook-companion smoke cargo run -p openquant --example research_notebook_smoke + +# Python pipeline micro-benchmark (for speed demos) +uv run --python .venv/bin/python python python/benchmarks/benchmark_pipeline.py --iterations 30 --bars 2048 ``` ## Crate Layout diff --git a/crates/pyopenquant/Cargo.toml b/crates/pyopenquant/Cargo.toml index 689bb50..4582420 100644 --- a/crates/pyopenquant/Cargo.toml +++ b/crates/pyopenquant/Cargo.toml @@ -13,7 +13,7 @@ crate-type = ["cdylib"] chrono = "0.4" nalgebra = "0.32" openquant = { path = "../openquant" } -pyo3 = { version = "0.21", features = ["extension-module"] } +pyo3 = { version = "0.23", features = ["extension-module"] } [package.metadata.maturin] name = "openquant._core" diff --git a/docs/python_bindings.md b/docs/python_bindings.md index 55213b4..f1b9aa0 100644 --- a/docs/python_bindings.md +++ b/docs/python_bindings.md @@ -11,13 +11,19 @@ Prerequisites: From repo root: ```bash -uv venv --python 3.11 .venv +uv venv --python 3.13 .venv uv sync --group dev uv run --python .venv/bin/python maturin develop --manifest-path crates/pyopenquant/Cargo.toml uv run --python .venv/bin/python python -c "import openquant; print('openquant import ok')" uv run --python .venv/bin/python pytest python/tests -q ``` +Quick performance showcase: + +```bash +uv run --python .venv/bin/python python python/benchmarks/benchmark_pipeline.py --iterations 30 --bars 2048 +``` + Build a wheel: ```bash diff --git 
a/docs/research_workflow.md b/docs/research_workflow.md index 48547b1..594cdff 100644 --- a/docs/research_workflow.md +++ b/docs/research_workflow.md @@ -20,7 +20,7 @@ This guide defines the promotion path from hypothesis to candidate strategy in O ```bash # setup -uv venv --python 3.11 .venv +uv venv --python 3.13 .venv uv sync --group dev uv run --python .venv/bin/python maturin develop --manifest-path crates/pyopenquant/Cargo.toml diff --git a/justfile b/justfile index 50d5346..e576e48 100644 --- a/justfile +++ b/justfile @@ -59,9 +59,12 @@ py-test: uv run --python .venv/bin/python pytest python/tests -q py-setup: - uv venv --python 3.11 .venv + uv venv --python 3.13 .venv uv sync --group dev +py-bench: + uv run --python .venv/bin/python python python/benchmarks/benchmark_pipeline.py --iterations 30 --bars 2048 + exp-run: uv run --python .venv/bin/python python experiments/run_pipeline.py --config experiments/configs/futures_oil_baseline.toml --out experiments/artifacts diff --git a/notebooks/python/README.md b/notebooks/python/README.md index b64dfb0..135a67f 100644 --- a/notebooks/python/README.md +++ b/notebooks/python/README.md @@ -14,7 +14,7 @@ Notebook starter pack for the OpenQuant mid-frequency research flywheel. 
## Run setup ```bash -uv venv --python 3.11 .venv +uv venv --python 3.13 .venv uv sync --group dev uv run --python .venv/bin/python maturin develop --manifest-path crates/pyopenquant/Cargo.toml ``` diff --git a/pyproject.toml b/pyproject.toml index be19af3..1c9faa4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ authors = [{ name = "OpenQuant contributors" }] classifiers = [ "Programming Language :: Rust", "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.13", "License :: OSI Approved :: MIT License", ] dependencies = ["polars>=1.0,<2", "pyarrow>=15,<20"] diff --git a/python/benchmarks/benchmark_pipeline.py b/python/benchmarks/benchmark_pipeline.py new file mode 100644 index 0000000..348d2bb --- /dev/null +++ b/python/benchmarks/benchmark_pipeline.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +import argparse +import statistics +import time + +import openquant + + +def run_benchmark(iterations: int, bars: int, seed: int) -> dict[str, float]: + # Warm-up to stabilize JIT/cache effects. 
+ warmup_ds = openquant.research.make_synthetic_futures_dataset(n_bars=bars, seed=seed) + openquant.research.run_flywheel_iteration(warmup_ds) + + timings: list[float] = [] + for i in range(iterations): + ds = openquant.research.make_synthetic_futures_dataset(n_bars=bars, seed=seed + i) + t0 = time.perf_counter() + openquant.research.run_flywheel_iteration(ds) + timings.append(time.perf_counter() - t0) + + mean_s = statistics.mean(timings) + p95_s = sorted(timings)[max(int(len(timings) * 0.95) - 1, 0)] + return { + "iterations": float(iterations), + "bars_per_run": float(bars), + "mean_ms": mean_s * 1000.0, + "p95_ms": p95_s * 1000.0, + "runs_per_sec": 1.0 / mean_s if mean_s > 0 else 0.0, + } + + +def main() -> None: + parser = argparse.ArgumentParser(description="Benchmark OpenQuant Python pipeline runtime.") + parser.add_argument("--iterations", type=int, default=30) + parser.add_argument("--bars", type=int, default=2048) + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args() + + out = run_benchmark(iterations=args.iterations, bars=args.bars, seed=args.seed) + print( + "pipeline_bench " + f"iterations={int(out['iterations'])} " + f"bars={int(out['bars_per_run'])} " + f"mean_ms={out['mean_ms']:.3f} " + f"p95_ms={out['p95_ms']:.3f} " + f"runs_per_sec={out['runs_per_sec']:.2f}" + ) + + +if __name__ == "__main__": + main() diff --git a/python/openquant/__init__.py b/python/openquant/__init__.py index ebc3913..901db5c 100644 --- a/python/openquant/__init__.py +++ b/python/openquant/__init__.py @@ -1,5 +1,6 @@ from . import _core from . import adapters +from . import data from . import pipeline from . import research from . 
import viz @@ -16,6 +17,7 @@ "sampling", "bet_sizing", "portfolio", + "data", "pipeline", "research", "adapters", diff --git a/python/openquant/data.py b/python/openquant/data.py new file mode 100644 index 0000000..c72a040 --- /dev/null +++ b/python/openquant/data.py @@ -0,0 +1,193 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import polars as pl + + +CANONICAL_OHLCV_COLUMNS = [ + "ts", + "symbol", + "open", + "high", + "low", + "close", + "volume", + "adj_close", +] + +_COLUMN_ALIASES = { + "ts": "ts", + "timestamp": "ts", + "datetime": "ts", + "date": "ts", + "symbol": "symbol", + "ticker": "symbol", + "asset": "symbol", + "open": "open", + "high": "high", + "low": "low", + "close": "close", + "volume": "volume", + "adj_close": "adj_close", + "adjusted_close": "adj_close", + "adjclose": "adj_close", + "adjusted close": "adj_close", + "adj close": "adj_close", +} + + +def _normalize_column_name(name: str) -> str: + return name.strip().lower().replace("-", "_") + + +def _canonicalize_columns(df: pl.DataFrame) -> pl.DataFrame: + rename_map: dict[str, str] = {} + used_targets: set[str] = set() + for col in df.columns: + key = _normalize_column_name(col) + if key in _COLUMN_ALIASES: + target = _COLUMN_ALIASES[key] + if target in used_targets and col != target: + continue + rename_map[col] = target + used_targets.add(target) + if rename_map: + return df.rename(rename_map) + return df + + +def _validate_required_columns(df: pl.DataFrame) -> None: + required = {"ts", "symbol", "open", "high", "low", "close", "volume"} + missing = sorted(required.difference(df.columns)) + if missing: + raise ValueError(f"missing required OHLCV columns: {', '.join(missing)}") + + +def _cast_and_order(df: pl.DataFrame) -> pl.DataFrame: + casted = df.with_columns( + pl.col("ts").cast(pl.Utf8).str.strptime(pl.Datetime, strict=False), + pl.col("symbol").cast(pl.Utf8), + pl.col("open").cast(pl.Float64), + pl.col("high").cast(pl.Float64), + 
pl.col("low").cast(pl.Float64), + pl.col("close").cast(pl.Float64), + pl.col("volume").cast(pl.Float64), + ) + if "adj_close" in casted.columns: + casted = casted.with_columns(pl.col("adj_close").cast(pl.Float64)) + else: + casted = casted.with_columns(pl.col("close").alias("adj_close")) + return casted.select(CANONICAL_OHLCV_COLUMNS) + + +def data_quality_report(df: pl.DataFrame) -> dict[str, Any]: + _validate_required_columns(df) + key_cols = ["symbol", "ts"] + row_count = df.height + duplicate_keys = ( + df.group_by(key_cols) + .len() + .filter(pl.col("len") > 1) + .height + ) + gap_count = 0 + for sym in df["symbol"].unique().to_list(): + ts_values = ( + df.filter(pl.col("symbol") == sym) + .sort("ts") + .select("ts") + .to_series() + .to_list() + ) + for prev, cur in zip(ts_values, ts_values[1:]): + if prev is None or cur is None: + continue + if (cur - prev).total_seconds() > 24 * 3600: + gap_count += 1 + + report = { + "row_count": row_count, + "symbol_count": int(df.select(pl.col("symbol").n_unique()).item()), + "duplicate_key_count": duplicate_keys, + "gap_interval_count": gap_count, + "null_counts": { + col: int(df.select(pl.col(col).null_count()).item()) + for col in CANONICAL_OHLCV_COLUMNS + if col in df.columns + }, + "ts_min": str(df.select(pl.col("ts").min()).item()), + "ts_max": str(df.select(pl.col("ts").max()).item()), + } + return report + + +def clean_ohlcv( + df: pl.DataFrame, + *, + dedupe_keep: str = "last", + return_report: bool = False, +) -> pl.DataFrame | tuple[pl.DataFrame, dict[str, Any]]: + out = _canonicalize_columns(df) + _validate_required_columns(out) + out = _cast_and_order(out) + out = out.drop_nulls(subset=["ts", "symbol"]) + pre_rows = out.height + out = out.unique(subset=["symbol", "ts"], keep=dedupe_keep).sort(["symbol", "ts"]) + removed = pre_rows - out.height + report = data_quality_report(out) + report["rows_removed_by_deduplication"] = removed + if return_report: + return out, report + return out + + +def load_ohlcv( + 
path: str | Path, + *, + symbol: str | None = None, + return_report: bool = False, +) -> pl.DataFrame | tuple[pl.DataFrame, dict[str, Any]]: + file_path = Path(path) + if not file_path.exists(): + raise FileNotFoundError(f"file not found: {file_path}") + suffix = file_path.suffix.lower() + if suffix == ".csv": + raw = pl.read_csv(file_path) + elif suffix in {".parquet", ".pq"}: + raw = pl.read_parquet(file_path) + else: + raise ValueError(f"unsupported file type: {suffix}") + + raw = _canonicalize_columns(raw) + if "symbol" not in raw.columns: + if symbol is None: + raise ValueError("symbol column missing and no symbol argument provided") + raw = raw.with_columns(pl.lit(symbol).alias("symbol")) + out = clean_ohlcv(raw, return_report=return_report) + return out + + +def align_calendar( + df: pl.DataFrame, + *, + interval: str = "1d", +) -> pl.DataFrame: + clean = clean_ohlcv(df) + symbols = clean["symbol"].unique().to_list() + aligned: list[pl.DataFrame] = [] + for sym in symbols: + sdf = clean.filter(pl.col("symbol") == sym).sort("ts") + start = sdf.select(pl.col("ts").min()).item() + end = sdf.select(pl.col("ts").max()).item() + calendar = pl.DataFrame( + {"ts": pl.datetime_range(start, end, interval=interval, eager=True)} + ).with_columns(pl.lit(sym).alias("symbol")) + merged = calendar.join(sdf, on=["symbol", "ts"], how="left") + merged = merged.with_columns( + pl.col("open").is_null().alias("is_missing_bar"), + pl.col("adj_close").fill_null(pl.col("close")), + ) + aligned.append(merged) + return pl.concat(aligned, how="vertical").sort(["symbol", "ts"]) diff --git a/python/tests/fixtures/ohlcv_us_equities.csv b/python/tests/fixtures/ohlcv_us_equities.csv new file mode 100644 index 0000000..e01e094 --- /dev/null +++ b/python/tests/fixtures/ohlcv_us_equities.csv @@ -0,0 +1,7 @@ +Date,Ticker,Open,High,Low,Close,Volume,Adj Close +2024-01-03,AAPL,186.10,187.00,185.50,186.30,5000,186.10 +2024-01-01,AAPL,184.00,185.00,183.50,184.80,8000,184.70 
+2024-01-02,AAPL,185.00,186.20,184.70,185.90,7000,185.80 +2024-01-02,AAPL,185.05,186.30,184.60,186.00,7100,185.90 +2024-01-01,MSFT,370.10,372.00,369.90,371.50,6000,371.30 +2024-01-03,MSFT,372.20,373.10,371.40,372.00,5500,371.90 diff --git a/python/tests/test_data_module.py b/python/tests/test_data_module.py new file mode 100644 index 0000000..6bd24da --- /dev/null +++ b/python/tests/test_data_module.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +import hashlib +import json +from pathlib import Path + +import polars as pl + +import openquant + + +def _fixture_path() -> Path: + return Path(__file__).resolve().parent / "fixtures" / "ohlcv_us_equities.csv" + + +def _digest_frame(df: pl.DataFrame) -> str: + normalized = df.with_columns(pl.col("ts").dt.strftime("%Y-%m-%d %H:%M:%S")) + payload = json.dumps(normalized.to_dict(as_series=False), sort_keys=True, separators=(",", ":")) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + +def test_load_ohlcv_enforces_canonical_schema_and_determinism(): + p = _fixture_path() + out1, report1 = openquant.data.load_ohlcv(p, return_report=True) + out2 = openquant.data.load_ohlcv(p) + + assert out1.columns == [ + "ts", + "symbol", + "open", + "high", + "low", + "close", + "volume", + "adj_close", + ] + assert out1.equals(out2) + assert report1["row_count"] == 5 + assert report1["rows_removed_by_deduplication"] == 1 + assert report1["gap_interval_count"] == 1 + + digest_1 = _digest_frame(out1) + digest_2 = _digest_frame(openquant.data.load_ohlcv(p)) + assert digest_1 == digest_2 + + +def test_load_ohlcv_symbol_argument_required_when_missing(): + df = pl.DataFrame( + { + "date": ["2024-01-01", "2024-01-02"], + "open": [10.0, 11.0], + "high": [11.0, 12.0], + "low": [9.0, 10.5], + "close": [10.5, 11.4], + "volume": [1000, 1100], + } + ) + p = _fixture_path().with_name("ohlcv_no_symbol.csv") + df.write_csv(p) + try: + try: + openquant.data.load_ohlcv(p) + assert False, "expected load_ohlcv to raise without symbol" 
+ except ValueError: + pass + + out = openquant.data.load_ohlcv(p, symbol="SPY") + assert set(out["symbol"].to_list()) == {"SPY"} + finally: + p.unlink(missing_ok=True) + + +def test_align_calendar_marks_missing_bars(): + raw = openquant.data.load_ohlcv(_fixture_path()) + aligned = openquant.data.align_calendar(raw) + + msft = aligned.filter(pl.col("symbol") == "MSFT") + assert msft.height == 3 + # 2024-01-02 is absent in fixture for MSFT and should be represented. + missing = msft.filter(pl.col("is_missing_bar")) + assert missing.height == 1 + assert str(missing["ts"][0]).startswith("2024-01-02") From 7dc3f788623893dcf923201ae6b8853b30d09058 Mon Sep 17 00:00:00 2001 From: Sean Koval Date: Sat, 14 Feb 2026 02:20:55 -0500 Subject: [PATCH 2/5] Add AFML event-driven bars module and diagnostics --- docs/python_bindings.md | 10 ++ notebooks/python/README.md | 13 ++ notebooks/python/scripts/bar_diagnostics.py | 50 +++++++ python/openquant/__init__.py | 2 + python/openquant/bars.py | 151 ++++++++++++++++++++ python/tests/test_bars_module.py | 90 ++++++++++++ 6 files changed, 316 insertions(+) create mode 100644 notebooks/python/scripts/bar_diagnostics.py create mode 100644 python/openquant/bars.py create mode 100644 python/tests/test_bars_module.py diff --git a/docs/python_bindings.md b/docs/python_bindings.md index f1b9aa0..7161e2b 100644 --- a/docs/python_bindings.md +++ b/docs/python_bindings.md @@ -52,6 +52,16 @@ Input conventions: - `timestamps`: list of strings formatted as `%Y-%m-%d %H:%M:%S` - timestamp variants require `len(close) == len(timestamps)` +### `openquant.bars` (AFML Ch.2 event-driven bars) +- `build_time_bars(df, interval="1d")` +- `build_tick_bars(df, ticks_per_bar=50)` +- `build_volume_bars(df, volume_per_bar=100_000.0)` +- `build_dollar_bars(df, dollar_value_per_bar=5_000_000.0)` +- `bar_diagnostics(df)` + +Input conventions: +- `df`: polars DataFrame with canonical OHLCV columns (`ts,symbol,open,high,low,close,volume,adj_close`) + ### 
`openquant.sampling` - `get_ind_matrix(label_endtime, bar_index)` - `get_ind_mat_average_uniqueness(ind_mat)` diff --git a/notebooks/python/README.md b/notebooks/python/README.md index 135a67f..4c497e4 100644 --- a/notebooks/python/README.md +++ b/notebooks/python/README.md @@ -32,3 +32,16 @@ Execute the real-data notebook cells non-interactively: ```bash uv run --python .venv/bin/python notebooks/python/scripts/execute_notebook_cells.py notebooks/python/06_afml_real_data_end_to_end.ipynb ``` + +## Bar diagnostics (AFML Ch.2) + +Compare time/tick/volume/dollar bar families with simple serial-dependence and heteroskedasticity proxies: + +```bash +uv run --python .venv/bin/python python notebooks/python/scripts/bar_diagnostics.py +``` + +The script prints: +- `lag1_return_autocorr` (serial dependence proxy) +- `lag1_sq_return_autocorr` (heteroskedasticity proxy) +- `return_std` and number of bars per family diff --git a/notebooks/python/scripts/bar_diagnostics.py b/notebooks/python/scripts/bar_diagnostics.py new file mode 100644 index 0000000..afb1590 --- /dev/null +++ b/notebooks/python/scripts/bar_diagnostics.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +import math + +import polars as pl + +import openquant + + +def _make_base_frame(n: int = 720, seed: int = 11) -> pl.DataFrame: + ds = openquant.research.make_synthetic_futures_dataset(n_bars=n, seed=seed, asset_names=["SPY", "QQQ", "IWM"]) + volume = [float(80_000 + int(25_000 * (1.0 + math.sin(i / 17.0)))) for i in range(n)] + outlier_idx = max(n // 3, 1) + volume[outlier_idx] *= 25.0 + return pl.DataFrame( + { + "ts": ds.timestamps, + "symbol": ["SPY"] * n, + "open": ds.close, + "high": [c * 1.002 for c in ds.close], + "low": [c * 0.998 for c in ds.close], + "close": ds.close, + "volume": volume, + "adj_close": ds.close, + } + ) + + +def main() -> None: + base = _make_base_frame() + variants = { + "time_1h": openquant.bars.build_time_bars(base, interval="1h"), + "tick_40": 
openquant.bars.build_tick_bars(base, ticks_per_bar=40), + "volume_2m": openquant.bars.build_volume_bars(base, volume_per_bar=2_000_000), + "dollar_200m": openquant.bars.build_dollar_bars(base, dollar_value_per_bar=200_000_000), + } + + print("bar_family,n_bars,lag1_return_autocorr,lag1_sq_return_autocorr,return_std") + for name, frame in variants.items(): + d = openquant.bars.bar_diagnostics(frame) + print( + f"{name},{int(d['n_bars'])}," + f"{d['lag1_return_autocorr']:.6f}," + f"{d['lag1_sq_return_autocorr']:.6f}," + f"{d['return_std']:.6f}" + ) + + +if __name__ == "__main__": + main() diff --git a/python/openquant/__init__.py b/python/openquant/__init__.py index 901db5c..fe38cff 100644 --- a/python/openquant/__init__.py +++ b/python/openquant/__init__.py @@ -1,5 +1,6 @@ from . import _core from . import adapters +from . import bars from . import data from . import pipeline from . import research @@ -17,6 +18,7 @@ "sampling", "bet_sizing", "portfolio", + "bars", "data", "pipeline", "research", diff --git a/python/openquant/bars.py b/python/openquant/bars.py new file mode 100644 index 0000000..4ff27d3 --- /dev/null +++ b/python/openquant/bars.py @@ -0,0 +1,151 @@ +from __future__ import annotations + +import math + +import polars as pl + +from . 
import data + + +def _prepare(df: pl.DataFrame) -> pl.DataFrame: + return data.clean_ohlcv(df) + + +def _aggregate(df: pl.DataFrame) -> pl.DataFrame: + out = ( + df.group_by(["symbol", "bar_id"]) + .agg( + pl.col("ts").min().alias("start_ts"), + pl.col("ts").max().alias("end_ts"), + pl.col("open").first().alias("open"), + pl.col("high").max().alias("high"), + pl.col("low").min().alias("low"), + pl.col("close").last().alias("close"), + pl.col("adj_close").last().alias("adj_close"), + pl.col("volume").sum().alias("volume"), + pl.len().alias("n_obs"), + ) + .sort(["symbol", "end_ts", "bar_id"]) + .drop("bar_id") + .rename({"end_ts": "ts"}) + ) + return out.select( + [ + "ts", + "symbol", + "open", + "high", + "low", + "close", + "volume", + "adj_close", + "start_ts", + "n_obs", + ] + ) + + +def build_time_bars(df: pl.DataFrame, *, interval: str = "1d") -> pl.DataFrame: + clean = _prepare(df) + grouped = clean.with_columns(pl.col("ts").dt.truncate(interval).alias("bar_id")) + return _aggregate(grouped) + + +def build_tick_bars(df: pl.DataFrame, *, ticks_per_bar: int = 50) -> pl.DataFrame: + if ticks_per_bar <= 0: + raise ValueError("ticks_per_bar must be > 0") + clean = _prepare(df) + grouped = clean.with_columns( + (pl.int_range(0, pl.len()).over("symbol") // ticks_per_bar) + .cast(pl.Int64) + .alias("bar_id") + ) + return _aggregate(grouped) + + +def build_volume_bars(df: pl.DataFrame, *, volume_per_bar: float = 100_000.0) -> pl.DataFrame: + if volume_per_bar <= 0: + raise ValueError("volume_per_bar must be > 0") + clean = _prepare(df) + eps = volume_per_bar * 1e-9 + grouped = ( + clean.with_columns(pl.col("volume").cum_sum().over("symbol").alias("cum_volume")) + .with_columns( + (((pl.col("cum_volume") - eps).clip(lower_bound=0.0)) / volume_per_bar) + .floor() + .cast(pl.Int64) + .alias("bar_id") + ) + .drop("cum_volume") + ) + return _aggregate(grouped) + + +def build_dollar_bars( + df: pl.DataFrame, + *, + dollar_value_per_bar: float = 5_000_000.0, +) -> 
pl.DataFrame: + if dollar_value_per_bar <= 0: + raise ValueError("dollar_value_per_bar must be > 0") + clean = _prepare(df) + eps = dollar_value_per_bar * 1e-9 + grouped = ( + clean.with_columns((pl.col("close") * pl.col("volume")).alias("dollar_value")) + .with_columns(pl.col("dollar_value").cum_sum().over("symbol").alias("cum_dollar")) + .with_columns( + (((pl.col("cum_dollar") - eps).clip(lower_bound=0.0)) / dollar_value_per_bar) + .floor() + .cast(pl.Int64) + .alias("bar_id") + ) + .drop(["dollar_value", "cum_dollar"]) + ) + return _aggregate(grouped) + + +def _lag1_autocorr(values: list[float]) -> float: + if len(values) < 3: + return 0.0 + x = values[:-1] + y = values[1:] + mx = sum(x) / len(x) + my = sum(y) / len(y) + cov = sum((a - mx) * (b - my) for a, b in zip(x, y)) + sx = math.sqrt(sum((a - mx) ** 2 for a in x)) + sy = math.sqrt(sum((b - my) ** 2 for b in y)) + if sx == 0.0 or sy == 0.0: + return 0.0 + return cov / (sx * sy) + + +def bar_diagnostics(df: pl.DataFrame) -> dict[str, float]: + clean = _prepare(df).sort(["symbol", "ts"]) + returns = ( + clean.with_columns( + ( + (pl.col("close") - pl.col("close").shift(1).over("symbol")) + / pl.col("close").shift(1).over("symbol") + ).alias("ret") + ) + .drop_nulls(subset=["ret"]) + .select("ret") + .to_series() + .to_list() + ) + if len(returns) < 3: + return { + "n_bars": float(clean.height), + "lag1_return_autocorr": 0.0, + "lag1_sq_return_autocorr": 0.0, + "return_std": 0.0, + } + sq = [r * r for r in returns] + mean_r = sum(returns) / len(returns) + std_r = math.sqrt(sum((r - mean_r) ** 2 for r in returns) / (len(returns) - 1)) + return { + "n_bars": float(clean.height), + "lag1_return_autocorr": _lag1_autocorr(returns), + "lag1_sq_return_autocorr": _lag1_autocorr(sq), + "return_std": std_r, + } diff --git a/python/tests/test_bars_module.py b/python/tests/test_bars_module.py new file mode 100644 index 0000000..490d4b2 --- /dev/null +++ b/python/tests/test_bars_module.py @@ -0,0 +1,90 @@ +from __future__ 
import annotations + +import polars as pl + +import openquant + + +def _base_frame() -> pl.DataFrame: + ts = [ + "2024-01-01 09:30:00", + "2024-01-01 09:31:00", + "2024-01-01 09:33:00", + "2024-01-01 09:34:00", + "2024-01-01 09:35:00", + "2024-01-01 09:36:00", + "2024-01-01 09:37:00", + "2024-01-01 09:39:00", + "2024-01-01 09:40:00", + "2024-01-01 09:41:00", + ] + close = [100.0, 100.4, 100.1, 100.8, 100.7, 101.2, 101.0, 101.5, 101.4, 101.7] + volume = [80_000.0, 85_000.0, 90_000.0, 88_000.0, 95_000.0, 300_000.0, 92_000.0, 98_000.0, 99_000.0, 97_000.0] + return pl.DataFrame( + { + "ts": ts, + "symbol": ["SPY"] * len(ts), + "open": close, + "high": [x * 1.001 for x in close], + "low": [x * 0.999 for x in close], + "close": close, + "volume": volume, + "adj_close": close, + } + ) + + +def _assert_bar_invariants(df: pl.DataFrame) -> None: + assert df.height > 0 + assert df.select((pl.col("high") >= pl.col("open")).all()).item() + assert df.select((pl.col("high") >= pl.col("close")).all()).item() + assert df.select((pl.col("low") <= pl.col("open")).all()).item() + assert df.select((pl.col("low") <= pl.col("close")).all()).item() + assert df.select((pl.col("n_obs") >= 1).all()).item() + assert df.select((pl.col("ts") >= pl.col("start_ts")).all()).item() + + +def test_time_tick_volume_dollar_bars_are_deterministic(): + base = _base_frame() + builders = [ + lambda x: openquant.bars.build_time_bars(x, interval="5m"), + lambda x: openquant.bars.build_tick_bars(x, ticks_per_bar=3), + lambda x: openquant.bars.build_volume_bars(x, volume_per_bar=250_000), + lambda x: openquant.bars.build_dollar_bars(x, dollar_value_per_bar=25_000_000), + ] + for build in builders: + a = build(base) + b = build(base) + assert a.equals(b) + + +def test_bar_outputs_monotone_and_invariant(): + base = _base_frame() + for out in ( + openquant.bars.build_time_bars(base, interval="5m"), + openquant.bars.build_tick_bars(base, ticks_per_bar=2), + openquant.bars.build_volume_bars(base, 
volume_per_bar=200_000), + openquant.bars.build_dollar_bars(base, dollar_value_per_bar=20_000_000), + ): + _assert_bar_invariants(out) + ts = out["ts"].to_list() + assert all(ts[i] <= ts[i + 1] for i in range(len(ts) - 1)) + + +def test_sparse_intervals_and_outlier_volume_edge_cases(): + base = _base_frame() + sparse = base.filter(pl.col("ts") != pl.lit("2024-01-01 09:34:00")) + out = openquant.bars.build_time_bars(sparse, interval="5m") + assert out.height >= 2 + _assert_bar_invariants(out) + + # One outlier volume observation should not break grouping. + outlier = base.with_columns( + pl.when(pl.col("ts") == pl.lit("2024-01-01 09:36:00")) + .then(pl.lit(5_000_000.0)) + .otherwise(pl.col("volume")) + .alias("volume") + ) + vol_bars = openquant.bars.build_volume_bars(outlier, volume_per_bar=300_000) + _assert_bar_invariants(vol_bars) + assert vol_bars.height >= 2 From 9c2c0b686ba29ce767aee158199542520cf47f04 Mon Sep 17 00:00:00 2001 From: Sean Koval Date: Sat, 14 Feb 2026 02:28:19 -0500 Subject: [PATCH 3/5] Route bars through Rust core and PyO3 bindings --- crates/openquant/src/data_structures.rs | 3 + crates/pyopenquant/src/lib.rs | 124 +++++++++++++++++++++ docs/python_bindings.md | 2 +- python/openquant/bars.py | 136 +++++++++++++----------- 4 files changed, 204 insertions(+), 61 deletions(-) diff --git a/crates/openquant/src/data_structures.rs b/crates/openquant/src/data_structures.rs index 8b757b0..83038e2 100644 --- a/crates/openquant/src/data_structures.rs +++ b/crates/openquant/src/data_structures.rs @@ -23,6 +23,7 @@ pub enum StandardBarType { /// Bar output with OHLCV-like fields. 
#[derive(Debug, Clone, PartialEq)] pub struct StandardBar { + pub start_timestamp: NaiveDateTime, pub timestamp: NaiveDateTime, pub open: f64, pub high: f64, @@ -211,6 +212,7 @@ fn build_bar(trades: &[Trade]) -> StandardBar { let open = trades.first().expect("non-empty slice").price; let close = trades.last().expect("non-empty slice").price; + let start_timestamp = trades.first().expect("non-empty slice").timestamp; let timestamp = trades.last().expect("non-empty slice").timestamp; let (high, low) = trades.iter().fold((f64::NEG_INFINITY, f64::INFINITY), |(h, l), trade| { (h.max(trade.price), l.min(trade.price)) @@ -222,6 +224,7 @@ fn build_bar(trades: &[Trade]) -> StandardBar { }); StandardBar { + start_timestamp, timestamp, open, high, diff --git a/crates/pyopenquant/src/lib.rs b/crates/pyopenquant/src/lib.rs index 37ed369..46aacbc 100644 --- a/crates/pyopenquant/src/lib.rs +++ b/crates/pyopenquant/src/lib.rs @@ -1,4 +1,5 @@ use nalgebra::DMatrix; +use openquant::data_structures::{standard_bars, time_bars, StandardBarType, Trade}; use openquant::filters::Threshold; use openquant::pipeline::{ run_mid_frequency_pipeline, ResearchPipelineConfig, ResearchPipelineInput, @@ -50,6 +51,61 @@ fn format_naive_datetimes(values: Vec) -> Vec { values.into_iter().map(|v| v.format("%Y-%m-%d %H:%M:%S").to_string()).collect() } +fn parse_one_naive_datetime(value: &str) -> PyResult { + chrono::NaiveDateTime::parse_from_str(value, "%Y-%m-%d %H:%M:%S") + .or_else(|_| { + chrono::NaiveDate::parse_from_str(value, "%Y-%m-%d") + .map(|d| d.and_hms_opt(0, 0, 0).expect("valid fixed midnight")) + }) + .map_err(|e| { + PyValueError::new_err(format!( + "invalid datetime '{value}' (expected '%Y-%m-%d %H:%M:%S' or '%Y-%m-%d'): {e}" + )) + }) +} + +fn build_trades( + timestamps: Vec, + prices: Vec, + volumes: Vec, +) -> PyResult> { + if timestamps.len() != prices.len() || prices.len() != volumes.len() { + return Err(PyValueError::new_err(format!( + "timestamps/prices/volumes length mismatch: {} 
/ {} / {}", + timestamps.len(), + prices.len(), + volumes.len() + ))); + } + let mut trades = Vec::with_capacity(prices.len()); + for i in 0..prices.len() { + trades.push(Trade { + timestamp: parse_one_naive_datetime(&timestamps[i])?, + price: prices[i], + volume: volumes[i], + }); + } + Ok(trades) +} + +fn bars_to_rows(bars: Vec) -> Vec<(String, String, f64, f64, f64, f64, f64, f64, usize)> { + bars.into_iter() + .map(|b| { + ( + b.start_timestamp.format("%Y-%m-%d %H:%M:%S").to_string(), + b.timestamp.format("%Y-%m-%d %H:%M:%S").to_string(), + b.open, + b.high, + b.low, + b.close, + b.volume, + b.dollar_value, + b.tick_count, + ) + }) + .collect() +} + #[pyfunction(name = "calculate_value_at_risk")] fn risk_calculate_value_at_risk(returns: Vec, confidence_level: f64) -> PyResult { RiskMetrics::default().calculate_value_at_risk(&returns, confidence_level).map_err(to_py_err) @@ -154,6 +210,66 @@ fn sampling_seq_bootstrap( openquant::sampling::seq_bootstrap(&ind_mat, sample_length, warmup_samples) } +#[pyfunction(name = "build_time_bars")] +fn bars_build_time_bars( + timestamps: Vec, + prices: Vec, + volumes: Vec, + interval_seconds: i64, +) -> PyResult> { + if interval_seconds <= 0 { + return Err(PyValueError::new_err("interval_seconds must be > 0")); + } + let trades = build_trades(timestamps, prices, volumes)?; + let bars = time_bars(&trades, chrono::Duration::seconds(interval_seconds)); + Ok(bars_to_rows(bars)) +} + +#[pyfunction(name = "build_tick_bars")] +fn bars_build_tick_bars( + timestamps: Vec, + prices: Vec, + volumes: Vec, + ticks_per_bar: usize, +) -> PyResult> { + if ticks_per_bar == 0 { + return Err(PyValueError::new_err("ticks_per_bar must be > 0")); + } + let trades = build_trades(timestamps, prices, volumes)?; + let bars = standard_bars(&trades, ticks_per_bar as f64, StandardBarType::Tick); + Ok(bars_to_rows(bars)) +} + +#[pyfunction(name = "build_volume_bars")] +fn bars_build_volume_bars( + timestamps: Vec, + prices: Vec, + volumes: Vec, +
volume_per_bar: f64, +) -> PyResult> { + if !volume_per_bar.is_finite() || volume_per_bar <= 0.0 { + return Err(PyValueError::new_err("volume_per_bar must be > 0")); + } + let trades = build_trades(timestamps, prices, volumes)?; + let bars = standard_bars(&trades, volume_per_bar, StandardBarType::Volume); + Ok(bars_to_rows(bars)) +} + +#[pyfunction(name = "build_dollar_bars")] +fn bars_build_dollar_bars( + timestamps: Vec, + prices: Vec, + volumes: Vec, + dollar_value_per_bar: f64, +) -> PyResult> { + if !dollar_value_per_bar.is_finite() || dollar_value_per_bar <= 0.0 { + return Err(PyValueError::new_err("dollar_value_per_bar must be > 0")); + } + let trades = build_trades(timestamps, prices, volumes)?; + let bars = standard_bars(&trades, dollar_value_per_bar, StandardBarType::Dollar); + Ok(bars_to_rows(bars)) +} + #[pyfunction(name = "get_signal")] fn bet_sizing_get_signal(prob: Vec, num_classes: usize, pred: Option>) -> Vec { openquant::bet_sizing::get_signal(&prob, num_classes, pred.as_deref()) @@ -325,6 +441,14 @@ fn _core(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_submodule(&sampling)?; m.add("sampling", sampling)?; + let bars = PyModule::new_bound(py, "bars")?; + bars.add_function(wrap_pyfunction!(bars_build_time_bars, &bars)?)?; + bars.add_function(wrap_pyfunction!(bars_build_tick_bars, &bars)?)?; + bars.add_function(wrap_pyfunction!(bars_build_volume_bars, &bars)?)?; + bars.add_function(wrap_pyfunction!(bars_build_dollar_bars, &bars)?)?; + m.add_submodule(&bars)?; + m.add("bars", bars)?; + let bet_sizing = PyModule::new_bound(py, "bet_sizing")?; bet_sizing.add_function(wrap_pyfunction!(bet_sizing_get_signal, &bet_sizing)?)?; bet_sizing.add_function(wrap_pyfunction!(bet_sizing_discrete_signal, &bet_sizing)?)?; diff --git a/docs/python_bindings.md b/docs/python_bindings.md index 7161e2b..ed129ed 100644 --- a/docs/python_bindings.md +++ b/docs/python_bindings.md @@ -52,7 +52,7 @@ Input conventions: - `timestamps`: list of strings 
formatted as `%Y-%m-%d %H:%M:%S` - timestamp variants require `len(close) == len(timestamps)` -### `openquant.bars` (AFML Ch.2 event-driven bars) +### `openquant.bars` (AFML Ch.2 event-driven bars; Rust core via PyO3) - `build_time_bars(df, interval="1d")` - `build_tick_bars(df, ticks_per_bar=50)` - `build_volume_bars(df, volume_per_bar=100_000.0)` diff --git a/python/openquant/bars.py b/python/openquant/bars.py index 4ff27d3..b361567 100644 --- a/python/openquant/bars.py +++ b/python/openquant/bars.py @@ -1,35 +1,62 @@ from __future__ import annotations import math +from typing import Callable import polars as pl +from . import _core from . import data -def _prepare(df: pl.DataFrame) -> pl.DataFrame: - return data.clean_ohlcv(df) - - -def _aggregate(df: pl.DataFrame) -> pl.DataFrame: - out = ( - df.group_by(["symbol", "bar_id"]) - .agg( - pl.col("ts").min().alias("start_ts"), - pl.col("ts").max().alias("end_ts"), - pl.col("open").first().alias("open"), - pl.col("high").max().alias("high"), - pl.col("low").min().alias("low"), - pl.col("close").last().alias("close"), - pl.col("adj_close").last().alias("adj_close"), - pl.col("volume").sum().alias("volume"), - pl.len().alias("n_obs"), +def _interval_to_seconds(interval: str) -> int: + s = interval.strip().lower() + if s.endswith("d"): + return int(s[:-1]) * 24 * 3600 + if s.endswith("h"): + return int(s[:-1]) * 3600 + if s.endswith("m"): + return int(s[:-1]) * 60 + if s.endswith("s"): + return int(s[:-1]) + raise ValueError(f"unsupported interval format: {interval}") + + +def _rows_to_frame(symbol: str, rows: list[tuple[str, str, float, float, float, float, float, float, int]]) -> pl.DataFrame: + if not rows: + return pl.DataFrame( + { + "ts": [], + "symbol": [], + "open": [], + "high": [], + "low": [], + "close": [], + "volume": [], + "adj_close": [], + "start_ts": [], + "n_obs": [], + "dollar_value": [], + } ) - .sort(["symbol", "end_ts", "bar_id"]) - .drop("bar_id") - .rename({"end_ts": "ts"}) - ) - return 
out.select( + return pl.DataFrame( + { + "start_ts": [r[0] for r in rows], + "ts": [r[1] for r in rows], + "open": [r[2] for r in rows], + "high": [r[3] for r in rows], + "low": [r[4] for r in rows], + "close": [r[5] for r in rows], + "volume": [r[6] for r in rows], + "dollar_value": [r[7] for r in rows], + "n_obs": [r[8] for r in rows], + } + ).with_columns( + pl.lit(symbol).alias("symbol"), + pl.col("start_ts").str.strptime(pl.Datetime, strict=False), + pl.col("ts").str.strptime(pl.Datetime, strict=False), + pl.col("close").alias("adj_close"), + ).select( [ "ts", "symbol", @@ -41,44 +68,46 @@ def _aggregate(df: pl.DataFrame) -> pl.DataFrame: "adj_close", "start_ts", "n_obs", + "dollar_value", ] ) +def _build_by_symbol( + df: pl.DataFrame, + rust_builder: Callable[[list[str], list[float], list[float], float | int], list[tuple[str, str, float, float, float, float, float, float, int]]], + param: float | int, +) -> pl.DataFrame: + clean = data.clean_ohlcv(df).sort(["symbol", "ts"]) + out_frames: list[pl.DataFrame] = [] + for symbol in clean["symbol"].unique().to_list(): + sdf = clean.filter(pl.col("symbol") == symbol).sort("ts") + rows = rust_builder( + [str(x) for x in sdf["ts"].to_list()], + [float(x) for x in sdf["close"].to_list()], + [float(x) for x in sdf["volume"].to_list()], + param, + ) + out_frames.append(_rows_to_frame(symbol, rows)) + if not out_frames: + return _rows_to_frame("", []) + return pl.concat(out_frames, how="vertical").sort(["symbol", "ts"]) + + def build_time_bars(df: pl.DataFrame, *, interval: str = "1d") -> pl.DataFrame: - clean = _prepare(df) - grouped = clean.with_columns(pl.col("ts").dt.truncate(interval).alias("bar_id")) - return _aggregate(grouped) + return _build_by_symbol(df, _core.bars.build_time_bars, _interval_to_seconds(interval)) def build_tick_bars(df: pl.DataFrame, *, ticks_per_bar: int = 50) -> pl.DataFrame: if ticks_per_bar <= 0: raise ValueError("ticks_per_bar must be > 0") - clean = _prepare(df) - grouped = 
clean.with_columns( - (pl.int_range(0, pl.len()).over("symbol") // ticks_per_bar) - .cast(pl.Int64) - .alias("bar_id") - ) - return _aggregate(grouped) + return _build_by_symbol(df, _core.bars.build_tick_bars, ticks_per_bar) def build_volume_bars(df: pl.DataFrame, *, volume_per_bar: float = 100_000.0) -> pl.DataFrame: if volume_per_bar <= 0: raise ValueError("volume_per_bar must be > 0") - clean = _prepare(df) - eps = volume_per_bar * 1e-9 - grouped = ( - clean.with_columns(pl.col("volume").cum_sum().over("symbol").alias("cum_volume")) - .with_columns( - (((pl.col("cum_volume") - eps).clip(lower_bound=0.0)) / volume_per_bar) - .floor() - .cast(pl.Int64) - .alias("bar_id") - ) - .drop("cum_volume") - ) - return _aggregate(grouped) + return _build_by_symbol(df, _core.bars.build_volume_bars, volume_per_bar) def build_dollar_bars( @@ -88,20 +117,7 @@ def build_dollar_bars( ) -> pl.DataFrame: if dollar_value_per_bar <= 0: raise ValueError("dollar_value_per_bar must be > 0") - clean = _prepare(df) - eps = dollar_value_per_bar * 1e-9 - grouped = ( - clean.with_columns((pl.col("close") * pl.col("volume")).alias("dollar_value")) - .with_columns(pl.col("dollar_value").cum_sum().over("symbol").alias("cum_dollar")) - .with_columns( - (((pl.col("cum_dollar") - eps).clip(lower_bound=0.0)) / dollar_value_per_bar) - .floor() - .cast(pl.Int64) - .alias("bar_id") - ) - .drop(["dollar_value", "cum_dollar"]) - ) - return _aggregate(grouped) + return _build_by_symbol(df, _core.bars.build_dollar_bars, dollar_value_per_bar) def _lag1_autocorr(values: list[float]) -> float: @@ -120,7 +136,7 @@ def _lag1_autocorr(values: list[float]) -> float: def bar_diagnostics(df: pl.DataFrame) -> dict[str, float]: - clean = _prepare(df).sort(["symbol", "ts"]) + clean = data.clean_ohlcv(df).sort(["symbol", "ts"]) returns = ( clean.with_columns( ( From 769cb331dbfbe45e65ad9d981d7896fe3860f45a Mon Sep 17 00:00:00 2001 From: Sean Koval Date: Sat, 14 Feb 2026 02:37:11 -0500 Subject: [PATCH 4/5] Move 
openquant.data processing into Rust via PyO3 --- crates/openquant/src/data_processing.rs | 176 ++++++++++++++++++ crates/openquant/src/lib.rs | 1 + crates/openquant/tests/data_processing.rs | 55 ++++++ crates/pyopenquant/src/lib.rs | 168 +++++++++++++++++ docs/python_bindings.md | 10 + python/openquant/data.py | 211 +++++++++++++++------- 6 files changed, 559 insertions(+), 62 deletions(-) create mode 100644 crates/openquant/src/data_processing.rs create mode 100644 crates/openquant/tests/data_processing.rs diff --git a/crates/openquant/src/data_processing.rs b/crates/openquant/src/data_processing.rs new file mode 100644 index 0000000..3d10d40 --- /dev/null +++ b/crates/openquant/src/data_processing.rs @@ -0,0 +1,176 @@ +use chrono::NaiveDateTime; +use std::collections::{BTreeMap, HashSet}; + +#[derive(Debug, Clone, PartialEq)] +pub struct OhlcvRow { + pub timestamp: NaiveDateTime, + pub symbol: String, + pub open: f64, + pub high: f64, + pub low: f64, + pub close: f64, + pub volume: f64, + pub adj_close: f64, +} + +#[derive(Debug, Clone, PartialEq)] +pub struct AlignedOhlcvRow { + pub timestamp: NaiveDateTime, + pub symbol: String, + pub open: Option, + pub high: Option, + pub low: Option, + pub close: Option, + pub volume: Option, + pub adj_close: Option, + pub is_missing_bar: bool, +} + +#[derive(Debug, Clone, PartialEq)] +pub struct DataQualityReport { + pub row_count: usize, + pub symbol_count: usize, + pub duplicate_key_count: usize, + pub gap_interval_count: usize, + pub ts_min: Option, + pub ts_max: Option, + pub rows_removed_by_deduplication: usize, +} + +fn sort_rows(rows: &mut [OhlcvRow]) { + rows.sort_by(|a, b| { + a.symbol + .cmp(&b.symbol) + .then_with(|| a.timestamp.cmp(&b.timestamp)) + }); +} + +fn dedupe_rows(rows: &[OhlcvRow], keep_last: bool) -> (Vec, usize) { + if rows.is_empty() { + return (Vec::new(), 0); + } + + let mut deduped = Vec::new(); + let mut i = 0usize; + while i < rows.len() { + let mut j = i + 1; + while j < rows.len() + && 
rows[j].symbol == rows[i].symbol + && rows[j].timestamp == rows[i].timestamp + { + j += 1; + } + let chosen = if keep_last { &rows[j - 1] } else { &rows[i] }; + deduped.push(chosen.clone()); + i = j; + } + let removed = rows.len().saturating_sub(deduped.len()); + (deduped, removed) +} + +pub fn quality_report(rows: &[OhlcvRow], rows_removed_by_deduplication: usize) -> DataQualityReport { + let mut symbol_set = HashSet::new(); + let mut duplicate_key_count = 0usize; + let mut key_counts: BTreeMap<(String, NaiveDateTime), usize> = BTreeMap::new(); + + for row in rows { + symbol_set.insert(row.symbol.clone()); + let key = (row.symbol.clone(), row.timestamp); + *key_counts.entry(key).or_insert(0usize) += 1; + } + + for count in key_counts.values() { + if *count > 1 { + duplicate_key_count += 1; + } + } + + let mut gap_interval_count = 0usize; + let mut last_by_symbol: BTreeMap = BTreeMap::new(); + for row in rows { + if let Some(prev) = last_by_symbol.get(&row.symbol) { + if (row.timestamp - *prev).num_seconds() > 24 * 3600 { + gap_interval_count += 1; + } + } + last_by_symbol.insert(row.symbol.clone(), row.timestamp); + } + + DataQualityReport { + row_count: rows.len(), + symbol_count: symbol_set.len(), + duplicate_key_count, + gap_interval_count, + ts_min: rows.first().map(|r| r.timestamp), + ts_max: rows.last().map(|r| r.timestamp), + rows_removed_by_deduplication, + } +} + +pub fn clean_ohlcv_rows(rows: &[OhlcvRow], keep_last: bool) -> (Vec, DataQualityReport) { + let mut sorted = rows.to_vec(); + sort_rows(&mut sorted); + let (deduped, removed) = dedupe_rows(&sorted, keep_last); + let report = quality_report(&deduped, removed); + (deduped, report) +} + +pub fn align_calendar_rows(rows: &[OhlcvRow], interval_seconds: i64) -> Result, String> { + if interval_seconds <= 0 { + return Err("interval_seconds must be > 0".to_string()); + } + let (clean, _) = clean_ohlcv_rows(rows, true); + if clean.is_empty() { + return Ok(Vec::new()); + } + + let mut by_symbol: BTreeMap> 
= BTreeMap::new(); + for row in clean { + by_symbol.entry(row.symbol.clone()).or_default().push(row); + } + + let mut out = Vec::new(); + for (symbol, rows_for_symbol) in by_symbol { + if rows_for_symbol.is_empty() { + continue; + } + let start = rows_for_symbol.first().expect("non-empty").timestamp; + let end = rows_for_symbol.last().expect("non-empty").timestamp; + + let mut index: BTreeMap = BTreeMap::new(); + for row in rows_for_symbol { + index.insert(row.timestamp, row); + } + + let mut ts = start; + while ts <= end { + if let Some(row) = index.get(&ts) { + out.push(AlignedOhlcvRow { + timestamp: ts, + symbol: symbol.clone(), + open: Some(row.open), + high: Some(row.high), + low: Some(row.low), + close: Some(row.close), + volume: Some(row.volume), + adj_close: Some(row.adj_close), + is_missing_bar: false, + }); + } else { + out.push(AlignedOhlcvRow { + timestamp: ts, + symbol: symbol.clone(), + open: None, + high: None, + low: None, + close: None, + volume: None, + adj_close: None, + is_missing_bar: true, + }); + } + ts += chrono::Duration::seconds(interval_seconds); + } + } + Ok(out) +} diff --git a/crates/openquant/src/lib.rs b/crates/openquant/src/lib.rs index 4b03826..0c96fa9 100644 --- a/crates/openquant/src/lib.rs +++ b/crates/openquant/src/lib.rs @@ -5,6 +5,7 @@ pub mod cla; pub mod codependence; pub mod combinatorial_optimization; pub mod cross_validation; +pub mod data_processing; pub mod data_structures; pub mod ef3m; pub mod ensemble_methods; diff --git a/crates/openquant/tests/data_processing.rs b/crates/openquant/tests/data_processing.rs new file mode 100644 index 0000000..0315262 --- /dev/null +++ b/crates/openquant/tests/data_processing.rs @@ -0,0 +1,55 @@ +use chrono::{DateTime, NaiveDateTime, Utc}; +use openquant::data_processing::{align_calendar_rows, clean_ohlcv_rows, OhlcvRow}; + +fn ts(seconds: i64) -> NaiveDateTime { + DateTime::::from_timestamp(seconds, 0).expect("timestamp").naive_utc() +} + +fn sample_rows() -> Vec { + vec![ + 
OhlcvRow { + timestamp: ts(0), + symbol: "AAPL".to_string(), + open: 100.0, + high: 101.0, + low: 99.0, + close: 100.5, + volume: 10.0, + adj_close: 100.5, + }, + OhlcvRow { + timestamp: ts(60), + symbol: "AAPL".to_string(), + open: 100.5, + high: 101.2, + low: 100.3, + close: 101.0, + volume: 11.0, + adj_close: 101.0, + }, + OhlcvRow { + timestamp: ts(60), + symbol: "AAPL".to_string(), + open: 100.6, + high: 101.3, + low: 100.4, + close: 101.1, + volume: 12.0, + adj_close: 101.1, + }, + ] +} + +#[test] +fn clean_and_align_rows() { + let rows = sample_rows(); + let (clean, report) = clean_ohlcv_rows(&rows, true); + assert_eq!(clean.len(), 2); + assert_eq!(report.rows_removed_by_deduplication, 1); + assert_eq!(report.duplicate_key_count, 0); + + let aligned = align_calendar_rows(&clean, 60).expect("align"); + assert_eq!(aligned.len(), 2); + assert!(!aligned[0].is_missing_bar); + assert!(!aligned[1].is_missing_bar); +} diff --git a/crates/pyopenquant/src/lib.rs b/crates/pyopenquant/src/lib.rs index 46aacbc..4c68b8e 100644 --- a/crates/pyopenquant/src/lib.rs +++ b/crates/pyopenquant/src/lib.rs @@ -1,4 +1,5 @@ use nalgebra::DMatrix; +use openquant::data_processing::{align_calendar_rows, clean_ohlcv_rows, quality_report, OhlcvRow}; use openquant::data_structures::{standard_bars, time_bars, StandardBarType, Trade}; use openquant::filters::Threshold; use openquant::pipeline::{ @@ -106,6 +107,54 @@ fn bars_to_rows(bars: Vec) -> Vec<(Stri .collect() } +fn build_ohlcv_rows( + timestamps: Vec, + symbols: Vec, + open: Vec, + high: Vec, + low: Vec, + close: Vec, + volume: Vec, + adj_close: Vec, +) -> PyResult> { + let n = timestamps.len(); + let lengths = [ + symbols.len(), + open.len(), + high.len(), + low.len(), + close.len(), + volume.len(), + adj_close.len(), + ]; + if lengths.iter().any(|&len| len != n) { + return Err(PyValueError::new_err(format!( + "ohlcv vector length mismatch: ts={n}, symbol={}, open={}, high={}, low={}, close={}, volume={}, adj_close={}", + 
symbols.len(), + open.len(), + high.len(), + low.len(), + close.len(), + volume.len(), + adj_close.len(), + ))); + } + let mut rows = Vec::with_capacity(n); + for i in 0..n { + rows.push(OhlcvRow { + timestamp: parse_one_naive_datetime(&timestamps[i])?, + symbol: symbols[i].clone(), + open: open[i], + high: high[i], + low: low[i], + close: close[i], + volume: volume[i], + adj_close: adj_close[i], + }); + } + Ok(rows) +} + #[pyfunction(name = "calculate_value_at_risk")] fn risk_calculate_value_at_risk(returns: Vec, confidence_level: f64) -> PyResult { RiskMetrics::default().calculate_value_at_risk(&returns, confidence_level).map_err(to_py_err) @@ -270,6 +319,118 @@ fn bars_build_dollar_bars( Ok(bars_to_rows(bars)) } +#[pyfunction(name = "clean_ohlcv")] +fn data_clean_ohlcv( + py: Python<'_>, + timestamps: Vec, + symbols: Vec, + open: Vec, + high: Vec, + low: Vec, + close: Vec, + volume: Vec, + adj_close: Vec, + dedupe_keep_last: bool, +) -> PyResult<(Vec<(String, String, f64, f64, f64, f64, f64, f64)>, PyObject)> { + let rows = build_ohlcv_rows(timestamps, symbols, open, high, low, close, volume, adj_close)?; + let (clean, report) = clean_ohlcv_rows(&rows, dedupe_keep_last); + let out_rows = clean + .into_iter() + .map(|r| { + ( + r.timestamp.format("%Y-%m-%d %H:%M:%S").to_string(), + r.symbol, + r.open, + r.high, + r.low, + r.close, + r.volume, + r.adj_close, + ) + }) + .collect::>(); + + let out_report = PyDict::new_bound(py); + out_report.set_item("row_count", report.row_count)?; + out_report.set_item("symbol_count", report.symbol_count)?; + out_report.set_item("duplicate_key_count", report.duplicate_key_count)?; + out_report.set_item("gap_interval_count", report.gap_interval_count)?; + out_report.set_item( + "ts_min", + report.ts_min.map(|v| v.format("%Y-%m-%d %H:%M:%S").to_string()), + )?; + out_report.set_item( + "ts_max", + report.ts_max.map(|v| v.format("%Y-%m-%d %H:%M:%S").to_string()), + )?; + out_report.set_item("rows_removed_by_deduplication",
report.rows_removed_by_deduplication)?; + Ok((out_rows, out_report.into_py(py))) +} + +#[pyfunction(name = "quality_report")] +fn data_quality_report( + py: Python<'_>, + timestamps: Vec, + symbols: Vec, + open: Vec, + high: Vec, + low: Vec, + close: Vec, + volume: Vec, + adj_close: Vec, +) -> PyResult { + let mut rows = build_ohlcv_rows(timestamps, symbols, open, high, low, close, volume, adj_close)?; + rows.sort_by(|a, b| a.symbol.cmp(&b.symbol).then_with(|| a.timestamp.cmp(&b.timestamp))); + let report = quality_report(&rows, 0); + let out_report = PyDict::new_bound(py); + out_report.set_item("row_count", report.row_count)?; + out_report.set_item("symbol_count", report.symbol_count)?; + out_report.set_item("duplicate_key_count", report.duplicate_key_count)?; + out_report.set_item("gap_interval_count", report.gap_interval_count)?; + out_report.set_item( + "ts_min", + report.ts_min.map(|v| v.format("%Y-%m-%d %H:%M:%S").to_string()), + )?; + out_report.set_item( + "ts_max", + report.ts_max.map(|v| v.format("%Y-%m-%d %H:%M:%S").to_string()), + )?; + out_report.set_item("rows_removed_by_deduplication", 0)?; + Ok(out_report.into_py(py)) +} + +#[pyfunction(name = "align_calendar")] +fn data_align_calendar( + timestamps: Vec, + symbols: Vec, + open: Vec, + high: Vec, + low: Vec, + close: Vec, + volume: Vec, + adj_close: Vec, + interval_seconds: i64, +) -> PyResult, Option, Option, Option, Option, Option, bool)>> { + let rows = build_ohlcv_rows(timestamps, symbols, open, high, low, close, volume, adj_close)?; + let out = align_calendar_rows(&rows, interval_seconds).map_err(to_py_err)?; + Ok(out + .into_iter() + .map(|r| { + ( + r.timestamp.format("%Y-%m-%d %H:%M:%S").to_string(), + r.symbol, + r.open, + r.high, + r.low, + r.close, + r.volume, + r.adj_close, + r.is_missing_bar, + ) + }) + .collect()) +} + #[pyfunction(name = "get_signal")] fn bet_sizing_get_signal(prob: Vec, num_classes: usize, pred: Option>) -> Vec { openquant::bet_sizing::get_signal(&prob, num_classes, 
pred.as_deref()) @@ -449,6 +610,13 @@ fn _core(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_submodule(&bars)?; m.add("bars", bars)?; + let data = PyModule::new_bound(py, "data")?; + data.add_function(wrap_pyfunction!(data_clean_ohlcv, &data)?)?; + data.add_function(wrap_pyfunction!(data_quality_report, &data)?)?; + data.add_function(wrap_pyfunction!(data_align_calendar, &data)?)?; + m.add_submodule(&data)?; + m.add("data", data)?; + let bet_sizing = PyModule::new_bound(py, "bet_sizing")?; bet_sizing.add_function(wrap_pyfunction!(bet_sizing_get_signal, &bet_sizing)?)?; bet_sizing.add_function(wrap_pyfunction!(bet_sizing_discrete_signal, &bet_sizing)?)?; diff --git a/docs/python_bindings.md b/docs/python_bindings.md index ed129ed..354d43b 100644 --- a/docs/python_bindings.md +++ b/docs/python_bindings.md @@ -62,6 +62,16 @@ Input conventions: Input conventions: - `df`: polars DataFrame with canonical OHLCV columns (`ts,symbol,open,high,low,close,volume,adj_close`) +### `openquant.data` (canonicalization + Rust-backed processing via PyO3) +- `load_ohlcv(path, symbol=None, return_report=False)` +- `clean_ohlcv(df, dedupe_keep="last", return_report=False)` +- `data_quality_report(df)` +- `align_calendar(df, interval="1d")` + +Notes: +- File IO and column alias canonicalization happen in Python for ergonomics. +- Core cleaning, deduplication, quality reporting, and calendar alignment are executed in Rust through `_core.data`. + ### `openquant.sampling` - `get_ind_matrix(label_endtime, bar_index)` - `get_ind_mat_average_uniqueness(ind_mat)` diff --git a/python/openquant/data.py b/python/openquant/data.py index c72a040..12c1cec 100644 --- a/python/openquant/data.py +++ b/python/openquant/data.py @@ -5,6 +5,8 @@ import polars as pl +from . 
import _core + CANONICAL_OHLCV_COLUMNS = [ "ts", @@ -82,45 +84,58 @@ def _cast_and_order(df: pl.DataFrame) -> pl.DataFrame: return casted.select(CANONICAL_OHLCV_COLUMNS) -def data_quality_report(df: pl.DataFrame) -> dict[str, Any]: - _validate_required_columns(df) - key_cols = ["symbol", "ts"] - row_count = df.height - duplicate_keys = ( - df.group_by(key_cols) - .len() - .filter(pl.col("len") > 1) - .height +def _to_core_vectors(df: pl.DataFrame) -> tuple[list[str], list[str], list[float], list[float], list[float], list[float], list[float], list[float]]: + return ( + [str(x) for x in df["ts"].to_list()], + [str(x) for x in df["symbol"].to_list()], + [float(x) for x in df["open"].to_list()], + [float(x) for x in df["high"].to_list()], + [float(x) for x in df["low"].to_list()], + [float(x) for x in df["close"].to_list()], + [float(x) for x in df["volume"].to_list()], + [float(x) for x in df["adj_close"].to_list()], ) - gap_count = 0 - for sym in df["symbol"].unique().to_list(): - ts_values = ( - df.filter(pl.col("symbol") == sym) - .sort("ts") - .select("ts") - .to_series() - .to_list() + + +def _rows_to_frame(rows: list[tuple[str, str, float, float, float, float, float, float]]) -> pl.DataFrame: + if not rows: + return pl.DataFrame( + { + "ts": [], + "symbol": [], + "open": [], + "high": [], + "low": [], + "close": [], + "volume": [], + "adj_close": [], + } ) - for prev, cur in zip(ts_values, ts_values[1:]): - if prev is None or cur is None: - continue - if (cur - prev).total_seconds() > 24 * 3600: - gap_count += 1 - - report = { - "row_count": row_count, - "symbol_count": int(df.select(pl.col("symbol").n_unique()).item()), - "duplicate_key_count": duplicate_keys, - "gap_interval_count": gap_count, - "null_counts": { - col: int(df.select(pl.col(col).null_count()).item()) - for col in CANONICAL_OHLCV_COLUMNS - if col in df.columns - }, - "ts_min": str(df.select(pl.col("ts").min()).item()), - "ts_max": str(df.select(pl.col("ts").max()).item()), - } - return report + 
return pl.DataFrame( + { + "ts": [r[0] for r in rows], + "symbol": [r[1] for r in rows], + "open": [r[2] for r in rows], + "high": [r[3] for r in rows], + "low": [r[4] for r in rows], + "close": [r[5] for r in rows], + "volume": [r[6] for r in rows], + "adj_close": [r[7] for r in rows], + } + ).with_columns(pl.col("ts").str.strptime(pl.Datetime, strict=False)) + + +def _interval_to_seconds(interval: str) -> int: + s = interval.strip().lower() + if s.endswith("d"): + return int(s[:-1]) * 24 * 3600 + if s.endswith("h"): + return int(s[:-1]) * 3600 + if s.endswith("m"): + return int(s[:-1]) * 60 + if s.endswith("s"): + return int(s[:-1]) + raise ValueError(f"unsupported interval format: {interval}") def clean_ohlcv( @@ -132,15 +147,63 @@ def clean_ohlcv( out = _canonicalize_columns(df) _validate_required_columns(out) out = _cast_and_order(out) - out = out.drop_nulls(subset=["ts", "symbol"]) - pre_rows = out.height - out = out.unique(subset=["symbol", "ts"], keep=dedupe_keep).sort(["symbol", "ts"]) - removed = pre_rows - out.height - report = data_quality_report(out) - report["rows_removed_by_deduplication"] = removed + ts, symbol, open_, high, low, close, volume, adj_close = _to_core_vectors(out) + rows, report = _core.data.clean_ohlcv( + ts, + symbol, + open_, + high, + low, + close, + volume, + adj_close, + dedupe_keep == "last", + ) + frame = _rows_to_frame(rows).sort(["symbol", "ts"]) + report = dict(report) + report["null_counts"] = { + "ts": 0, + "symbol": 0, + "open": 0, + "high": 0, + "low": 0, + "close": 0, + "volume": 0, + "adj_close": 0, + } if return_report: - return out, report - return out + return frame, report + return frame + + +def data_quality_report(df: pl.DataFrame) -> dict[str, Any]: + out = _canonicalize_columns(df) + _validate_required_columns(out) + out = _cast_and_order(out).sort(["symbol", "ts"]) + ts, symbol, open_, high, low, close, volume, adj_close = _to_core_vectors(out) + report = dict( + _core.data.quality_report( + ts, + symbol, + 
open_, + high, + low, + close, + volume, + adj_close, + ) + ) + report["null_counts"] = { + "ts": 0, + "symbol": 0, + "open": 0, + "high": 0, + "low": 0, + "close": 0, + "volume": 0, + "adj_close": 0, + } + return report def load_ohlcv( @@ -165,8 +228,7 @@ def load_ohlcv( if symbol is None: raise ValueError("symbol column missing and no symbol argument provided") raw = raw.with_columns(pl.lit(symbol).alias("symbol")) - out = clean_ohlcv(raw, return_report=return_report) - return out + return clean_ohlcv(raw, return_report=return_report) def align_calendar( @@ -175,19 +237,44 @@ def align_calendar( interval: str = "1d", ) -> pl.DataFrame: clean = clean_ohlcv(df) - symbols = clean["symbol"].unique().to_list() - aligned: list[pl.DataFrame] = [] - for sym in symbols: - sdf = clean.filter(pl.col("symbol") == sym).sort("ts") - start = sdf.select(pl.col("ts").min()).item() - end = sdf.select(pl.col("ts").max()).item() - calendar = pl.DataFrame( - {"ts": pl.datetime_range(start, end, interval=interval, eager=True)} - ).with_columns(pl.lit(sym).alias("symbol")) - merged = calendar.join(sdf, on=["symbol", "ts"], how="left") - merged = merged.with_columns( - pl.col("open").is_null().alias("is_missing_bar"), - pl.col("adj_close").fill_null(pl.col("close")), + ts, symbol, open_, high, low, close, volume, adj_close = _to_core_vectors(clean) + rows = _core.data.align_calendar( + ts, + symbol, + open_, + high, + low, + close, + volume, + adj_close, + _interval_to_seconds(interval), + ) + if not rows: + return pl.DataFrame( + { + "ts": [], + "symbol": [], + "open": [], + "high": [], + "low": [], + "close": [], + "volume": [], + "adj_close": [], + "is_missing_bar": [], + } ) - aligned.append(merged) - return pl.concat(aligned, how="vertical").sort(["symbol", "ts"]) + return pl.DataFrame( + { + "ts": [r[0] for r in rows], + "symbol": [r[1] for r in rows], + "open": [r[2] for r in rows], + "high": [r[3] for r in rows], + "low": [r[4] for r in rows], + "close": [r[5] for r in rows], + 
"volume": [r[6] for r in rows], + "adj_close": [r[7] for r in rows], + "is_missing_bar": [r[8] for r in rows], + } + ).with_columns( + pl.col("ts").str.strptime(pl.Datetime, strict=False), + ).sort(["symbol", "ts"]) From d8117103ffb362d933439967d454bd74c08e0012 Mon Sep 17 00:00:00 2001 From: Sean Koval Date: Sat, 14 Feb 2026 03:32:23 -0500 Subject: [PATCH 5/5] perf(data): move openquant.data hot paths to LazyFrame and add benchmark scaffold --- Cargo.lock | 2184 ++++++++++++++- Cargo.toml | 3 + crates/openquant/Cargo.toml | 1 + crates/openquant/src/data_processing.rs | 593 +++- crates/pyopenquant/Cargo.toml | 2 + crates/pyopenquant/src/lib.rs | 205 +- .../benchmarks/benchmark_data_processing.py | 86 + python/openquant/data.py | 277 +- vendor/pyo3-polars/.cargo-ok | 1 + vendor/pyo3-polars/.cargo_vcs_info.json | 6 + vendor/pyo3-polars/Cargo.lock | 2449 +++++++++++++++++ vendor/pyo3-polars/Cargo.toml | 114 + vendor/pyo3-polars/Cargo.toml.orig | 37 + vendor/pyo3-polars/README.md | 148 + vendor/pyo3-polars/src/alloc.rs | 123 + vendor/pyo3-polars/src/derive.rs | 68 + vendor/pyo3-polars/src/error.rs | 73 + vendor/pyo3-polars/src/export.rs | 3 + vendor/pyo3-polars/src/ffi/mod.rs | 2 + vendor/pyo3-polars/src/ffi/to_py.rs | 26 + vendor/pyo3-polars/src/ffi/to_rust.rs | 27 + vendor/pyo3-polars/src/lib.rs | 62 + vendor/pyo3-polars/src/types.rs | 706 +++++ 23 files changed, 6732 insertions(+), 464 deletions(-) create mode 100644 python/benchmarks/benchmark_data_processing.py create mode 100644 vendor/pyo3-polars/.cargo-ok create mode 100644 vendor/pyo3-polars/.cargo_vcs_info.json create mode 100644 vendor/pyo3-polars/Cargo.lock create mode 100644 vendor/pyo3-polars/Cargo.toml create mode 100644 vendor/pyo3-polars/Cargo.toml.orig create mode 100644 vendor/pyo3-polars/README.md create mode 100644 vendor/pyo3-polars/src/alloc.rs create mode 100644 vendor/pyo3-polars/src/derive.rs create mode 100644 vendor/pyo3-polars/src/error.rs create mode 100644 vendor/pyo3-polars/src/export.rs 
create mode 100644 vendor/pyo3-polars/src/ffi/mod.rs create mode 100644 vendor/pyo3-polars/src/ffi/to_py.rs create mode 100644 vendor/pyo3-polars/src/ffi/to_rust.rs create mode 100644 vendor/pyo3-polars/src/lib.rs create mode 100644 vendor/pyo3-polars/src/types.rs diff --git a/Cargo.lock b/Cargo.lock index d6876b2..145c661 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,19 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "getrandom 0.3.4", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -11,6 +24,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -32,6 +51,12 @@ version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +[[package]] +name = "anyhow" +version = "1.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e0fee31ef5ed1ba1316088939cea399010ed7731dba877ed44aeb407a75ea" + [[package]] name = "approx" version = "0.5.1" @@ -41,12 +66,96 @@ dependencies = [ "num-traits", ] +[[package]] +name = "ar_archive_writer" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" +dependencies = [ + "object", +] + +[[package]] +name = "argminmax" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"70f13d10a41ac8d2ec79ee34178d61e6f47a29c2edfe7ef1721c7383b0359e65" +dependencies = [ + "num-traits", +] + +[[package]] +name = "array-init-cursor" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed51fe0f224d1d4ea768be38c51f9f831dee9d05c163c11fba0b8c44387b1fc3" + +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.111", +] + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.111", +] + +[[package]] +name = "atoi_simd" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a49e05797ca52e312a0c658938b7d00693ef037799ef7187678f212d7684cf" +dependencies = [ + "debug_unsafe", +] + +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "autocfg" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bitflags" 
+version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" + [[package]] name = "bumpalo" version = "3.19.0" @@ -58,6 +167,29 @@ name = "bytemuck" version = "1.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fbdf580320f38b612e485521afda1ee26d10cc9884efaaa750d383e13e3c5f4" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.111", +] + +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +dependencies = [ + "serde", +] [[package]] name = "cast" @@ -65,6 +197,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "castaway" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a" +dependencies = [ + "rustversion", +] + [[package]] name = "cc" version = "1.2.47" @@ -72,6 +213,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd405d82c84ff7f35739f175f67d8b9fb7687a0e84ccdc78bd3568839827cf07" dependencies = [ "find-msvc-tools", + "jobserver", + "libc", "shlex", ] @@ -94,6 +237,16 @@ dependencies = [ "windows-link", ] +[[package]] +name = "chrono-tz" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" +dependencies = [ + "chrono", + "phf", +] + [[package]] name = 
"ciborium" version = "0.2.2" @@ -146,6 +299,32 @@ version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32" +[[package]] +name = "comfy-table" +version = "7.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" +dependencies = [ + "crossterm", + "unicode-segmentation", + "unicode-width", +] + +[[package]] +name = "compact_str" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b79c4069c6cad78e2e0cdfcbd26275770669fb39fd308a752dc110e83b9af32" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "serde", + "static_assertions", +] + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -188,6 +367,15 @@ dependencies = [ "itertools 0.10.5", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-deque" version = "0.8.6" @@ -207,12 +395,44 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-queue" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crossterm" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b" +dependencies = [ + "bitflags", + "crossterm_winapi", + 
"document-features", + "parking_lot", + "rustix", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies = [ + "winapi", +] + [[package]] name = "crunchy" version = "0.2.4" @@ -240,6 +460,27 @@ dependencies = [ "memchr", ] +[[package]] +name = "debug_unsafe" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eed2c4702fa172d1ce21078faa7c5203e69f5394d48cc436d25928394a867a2" + +[[package]] +name = "document-features" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61" +dependencies = [ + "litrs", +] + +[[package]] +name = "dyn-clone" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" + [[package]] name = "either" version = "1.15.0" @@ -247,146 +488,440 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] -name = "find-msvc-tools" -version = "0.1.5" +name = "enum_dispatch" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" +checksum = "aa18ce2bc66555b3218614519ac839ddb759a7d6720732f979ef8d13be147ecd" +dependencies = [ + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.111", +] [[package]] -name = "getrandom" -version = "0.2.16" +name = "equivalent" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name 
= "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ - "cfg-if", "libc", - "wasi", + "windows-sys 0.61.2", ] [[package]] -name = "half" -version = "2.7.1" +name = "ethnum" +version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" -dependencies = [ - "cfg-if", - "crunchy", - "zerocopy", -] +checksum = "ca81e6b4777c89fd810c25a4be2b1bd93ea034fbe58e6a75216a34c6b82c539b" [[package]] -name = "heck" -version = "0.5.0" +name = "fallible-streaming-iterator" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" [[package]] -name = "hermit-abi" -version = "0.5.2" +name = "fast-float2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" +checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" [[package]] -name = "iana-time-zone" -version = "0.1.64" +name = "find-msvc-tools" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" -dependencies = [ - "android_system_properties", - "core-foundation-sys", - "iana-time-zone-haiku", - "js-sys", - "log", - "wasm-bindgen", - "windows-core", -] +checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" [[package]] -name = "iana-time-zone-haiku" -version = "0.1.2" +name = "foldhash" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" -dependencies = 
[ - "cc", -] +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" [[package]] -name = "indoc" -version = "2.0.7" +name = "futures" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ - "rustversion", + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", ] [[package]] -name = "is-terminal" -version = "0.4.17" +name = "futures-channel" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ - "hermit-abi", - "libc", - "windows-sys", + "futures-core", + "futures-sink", ] [[package]] -name = "itertools" -version = "0.10.5" +name = "futures-core" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" -dependencies = [ - "either", -] +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] -name = "itertools" -version = "0.13.0" +name = "futures-executor" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" dependencies = [ - "either", + "futures-core", + "futures-task", + "futures-util", ] [[package]] -name = "itoa" -version = "1.0.15" +name = "futures-io" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +checksum = 
"9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] -name = "js-sys" -version = "0.3.82" +name = "futures-macro" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ - "once_cell", - "wasm-bindgen", + "proc-macro2", + "quote", + "syn 2.0.111", ] [[package]] -name = "lazy_static" -version = "1.5.0" +name = "futures-sink" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] -name = "libc" -version = "0.2.177" +name = "futures-task" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] -name = "libm" -version = "0.2.15" +name = "futures-util" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] [[package]] -name = "log" -version = "0.4.28" +name = "getrandom" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi", + 
"wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + +[[package]] +name = "getrandom" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "139ef39800118c7683f2fd3c98c1b23c09ae076556b435f8e9064ae108aaeeec" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", + "wasip3", +] + +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "zerocopy", +] + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", + "rayon", + "serde", +] + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", + "rayon", + "serde", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = 
"hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "home" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core 0.62.2", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "indexmap" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +dependencies = [ + "equivalent", + "hashbrown 0.16.1", + "serde", + "serde_core", +] + +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] + +[[package]] +name = "is-terminal" +version = "0.4.17" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.82" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + +[[package]] +name = "libc" +version = "0.2.177" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" + +[[package]] +name = 
"libm" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" + +[[package]] +name = "linux-raw-sys" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" + +[[package]] +name = "litrs" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" + +[[package]] +name = "lz4" +version = "1.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a20b523e860d03443e98350ceaac5e71c6ba89aea7d960769ec3ce37f4de5af4" +dependencies = [ + "lz4-sys", +] + +[[package]] +name = "lz4-sys" +version = "1.11.1+lz4-1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bd8c0d6c6ed0cd30b3652886bb8711dc4bb01d637a68105a3d5158039b418e6" +dependencies = [ + "cc", + "libc", +] [[package]] name = "matrixmultiply" @@ -404,6 +939,15 @@ version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" +[[package]] +name = "memmap2" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "744133e4a0e0a658e1374cf3bf8e415c4052a15a111acd372764c55b4177d490" +dependencies = [ + "libc", +] + [[package]] name = "memoffset" version = "0.9.1" @@ -413,6 +957,17 @@ 
dependencies = [ "autocfg", ] +[[package]] +name = "mio" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d83b0086dc8ecf3ce9ae2874b2d1290252e2a30720bea58a5c6639b0092873" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.61.2", +] + [[package]] name = "nalgebra" version = "0.29.0" @@ -469,6 +1024,24 @@ dependencies = [ "syn 2.0.111", ] +[[package]] +name = "now" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d89e9874397a1f0a52fc1f197a8effd9735223cb2390e9dcc83ac6cd02923d0" +dependencies = [ + "chrono", +] + +[[package]] +name = "ntapi" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" +dependencies = [ + "winapi", +] + [[package]] name = "num-complex" version = "0.4.6" @@ -498,75 +1071,645 @@ dependencies = [ ] [[package]] -name = "num-traits" -version = "0.2.19" +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + +[[package]] +name = "openquant" +version = "0.1.0" +dependencies = [ + "chrono", + "criterion", + "csv", + "itertools 0.13.0", + "nalgebra 
0.32.6", + "polars", + "rand", + "rand_distr", + "serde", + "serde_json", + "statrs", +] + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "phf" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_shared" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "planus" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc1691dd09e82f428ce8d6310bd6d5da2557c82ff17694d2a32cad7242aea89f" +dependencies = [ + "array-init-cursor", +] + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "polars" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72571dde488ecccbe799798bf99ab7308ebdb7cf5d95bcc498dbd5a132f0da4d" +dependencies = [ + "getrandom 0.2.16", + "polars-arrow", + "polars-core", + "polars-error", + "polars-io", + "polars-lazy", + "polars-ops", + "polars-parquet", + "polars-sql", + "polars-time", + "polars-utils", + "version_check", +] + +[[package]] +name = "polars-arrow" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6611c758d52e799761cc25900666b71552e6c929d88052811bc9daad4b3321a8" +dependencies = [ + "ahash", + "atoi_simd", + "bytemuck", + "chrono", + "chrono-tz", + "dyn-clone", + "either", + "ethnum", + "getrandom 0.2.16", + "hashbrown 0.15.5", + "itoa", + "lz4", + "num-traits", + "parking_lot", + 
"polars-arrow-format", + "polars-error", + "polars-schema", + "polars-utils", + "simdutf8", + "streaming-iterator", + "strength_reduce", + "strum_macros", + "version_check", + "zstd", +] + +[[package]] +name = "polars-arrow-format" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b0ef2474af9396b19025b189d96e992311e6a47f90c53cd998b36c4c64b84c" +dependencies = [ + "planus", + "serde", +] + +[[package]] +name = "polars-compute" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "332f2547dbb27599a8ffe68e56159f5996ba03d1dad0382ccb62c109ceacdeb6" +dependencies = [ + "atoi_simd", + "bytemuck", + "chrono", + "either", + "fast-float2", + "itoa", + "num-traits", + "polars-arrow", + "polars-error", + "polars-utils", + "ryu", + "strength_reduce", + "version_check", +] + +[[package]] +name = "polars-core" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "796d06eae7e6e74ed28ea54a8fccc584ebac84e6cf0e1e9ba41ffc807b169a01" +dependencies = [ + "ahash", + "bitflags", + "bytemuck", + "chrono", + "chrono-tz", + "comfy-table", + "either", + "hashbrown 0.14.5", + "hashbrown 0.15.5", + "indexmap", + "itoa", + "num-traits", + "once_cell", + "polars-arrow", + "polars-compute", + "polars-error", + "polars-row", + "polars-schema", + "polars-utils", + "rand", + "rand_distr", + "rayon", + "regex", + "strum_macros", + "thiserror 2.0.18", + "version_check", + "xxhash-rust", +] + +[[package]] +name = "polars-error" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d6529cae0d1db5ed690e47de41fac9b35ae0c26d476830c2079f130887b847" +dependencies = [ + "polars-arrow-format", + "regex", + "simdutf8", + "thiserror 2.0.18", +] + +[[package]] +name = "polars-expr" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c8e639991a8ad4fb12880ab44bcc3cf44a5703df003142334d9caf86d77d77e7" +dependencies = [ + "ahash", + "bitflags", + "hashbrown 0.15.5", + "num-traits", + "once_cell", + "polars-arrow", + "polars-compute", + "polars-core", + "polars-io", + "polars-ops", + "polars-plan", + "polars-row", + "polars-time", + "polars-utils", + "rand", + "rayon", +] + +[[package]] +name = "polars-io" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719a77e94480f6be090512da196e378cbcbeb3584c6fe1134c600aee906e38ab" +dependencies = [ + "ahash", + "async-trait", + "atoi_simd", + "bytes", + "chrono", + "fast-float2", + "futures", + "glob", + "hashbrown 0.15.5", + "home", + "itoa", + "memchr", + "memmap2", + "num-traits", + "once_cell", + "percent-encoding", + "polars-arrow", + "polars-core", + "polars-error", + "polars-parquet", + "polars-schema", + "polars-time", + "polars-utils", + "rayon", + "regex", + "ryu", + "simdutf8", + "tokio", + "tokio-util", +] + +[[package]] +name = "polars-lazy" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0a731a672dfc8ac38c1f73c9a4b2ae38d2fc8ac363bfb64c5f3a3e072ffc5ad" +dependencies = [ + "ahash", + "bitflags", + "chrono", + "memchr", + "once_cell", + "polars-arrow", + "polars-core", + "polars-expr", + "polars-io", + "polars-mem-engine", + "polars-ops", + "polars-pipe", + "polars-plan", + "polars-stream", + "polars-time", + "polars-utils", + "rayon", + "version_check", +] + +[[package]] +name = "polars-mem-engine" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33442189bcbf2e2559aa7914db3835429030a13f4f18e43af5fba9d1b018cf12" +dependencies = [ + "memmap2", + "polars-arrow", + "polars-core", + "polars-error", + "polars-expr", + "polars-io", + "polars-ops", + "polars-plan", + "polars-time", + "polars-utils", + "rayon", +] + +[[package]] +name = "polars-ops" +version = "0.46.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbb83218b0c216104f0076cd1a005128be078f958125f3d59b094ee73d78c18e" +dependencies = [ + "ahash", + "argminmax", + "base64", + "bytemuck", + "chrono", + "chrono-tz", + "either", + "hashbrown 0.15.5", + "hex", + "indexmap", + "memchr", + "num-traits", + "once_cell", + "polars-arrow", + "polars-compute", + "polars-core", + "polars-error", + "polars-schema", + "polars-utils", + "rayon", + "regex", + "regex-syntax", + "strum_macros", + "unicode-normalization", + "unicode-reverse", + "version_check", +] + +[[package]] +name = "polars-parquet" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c60ee85535590a38db6c703a21be4cb25342e40f573f070d1e16f9d84a53ac7" +dependencies = [ + "ahash", + "async-stream", + "base64", + "bytemuck", + "ethnum", + "futures", + "hashbrown 0.15.5", + "num-traits", + "polars-arrow", + "polars-compute", + "polars-error", + "polars-parquet-format", + "polars-utils", + "simdutf8", + "streaming-decompression", +] + +[[package]] +name = "polars-parquet-format" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c025243dcfe8dbc57e94d9f82eb3bef10b565ab180d5b99bed87fd8aea319ce1" +dependencies = [ + "async-trait", + "futures", +] + +[[package]] +name = "polars-pipe" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42d238fb76698f56e51ddfa89b135e4eda56a4767c6e8859eed0ab78386fcd52" +dependencies = [ + "crossbeam-channel", + "crossbeam-queue", + "enum_dispatch", + "hashbrown 0.15.5", + "num-traits", + "once_cell", + "polars-arrow", + "polars-compute", + "polars-core", + "polars-expr", + "polars-io", + "polars-ops", + "polars-plan", + "polars-row", + "polars-utils", + "rayon", + "uuid", + "version_check", +] + +[[package]] +name = "polars-plan" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4f03533a93aa66127fcb909a87153a3c7cfee6f0ae59f497e73d7736208da54c" +dependencies = [ + "ahash", + "bitflags", + "bytemuck", + "bytes", + "chrono", + "chrono-tz", + "either", + "hashbrown 0.15.5", + "memmap2", + "num-traits", + "once_cell", + "percent-encoding", + "polars-arrow", + "polars-compute", + "polars-core", + "polars-io", + "polars-ops", + "polars-time", + "polars-utils", + "rayon", + "recursive", + "regex", + "strum_macros", + "version_check", +] + +[[package]] +name = "polars-row" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +checksum = "6bf47f7409f8e75328d7d034be390842924eb276716d0458607be0bddb8cc839" dependencies = [ - "autocfg", - "libm", + "bitflags", + "bytemuck", + "polars-arrow", + "polars-compute", + "polars-error", + "polars-utils", ] [[package]] -name = "once_cell" -version = "1.21.3" +name = "polars-schema" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +checksum = "416621ae82b84466cf4ff36838a9b0aeb4a67e76bd3065edc8c9cb7da19b1bc7" +dependencies = [ + "indexmap", + "polars-error", + "polars-utils", + "version_check", +] [[package]] -name = "oorandom" -version = "11.1.5" +name = "polars-sql" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" - -[[package]] -name = "openquant" -version = "0.1.0" +checksum = "edaab553b90aa4d6743bb538978e1982368acb58a94408d7dd3299cad49c7083" dependencies = [ - "chrono", - "criterion", - "csv", - "itertools 0.13.0", - "nalgebra 0.32.6", + "hex", + "polars-core", + "polars-error", + "polars-lazy", + "polars-ops", + "polars-plan", + "polars-time", + "polars-utils", "rand", - "rand_distr", + "regex", "serde", - "serde_json", - "statrs", + "sqlparser", ] [[package]] -name = 
"paste" -version = "1.0.15" +name = "polars-stream" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +checksum = "498997b656c779610c1496b3d96a59fe569ef22a5b81ccfe5325cb3df8dff2fd" +dependencies = [ + "atomic-waker", + "crossbeam-deque", + "crossbeam-utils", + "futures", + "memmap2", + "parking_lot", + "pin-project-lite", + "polars-core", + "polars-error", + "polars-expr", + "polars-io", + "polars-mem-engine", + "polars-ops", + "polars-parquet", + "polars-plan", + "polars-utils", + "rand", + "rayon", + "recursive", + "slotmap", + "tokio", + "version_check", +] [[package]] -name = "plotters" -version = "0.3.7" +name = "polars-time" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +checksum = "d192efbdab516d28b3fab1709a969e3385bd5cda050b7c9aa9e2502a01fda879" dependencies = [ + "atoi_simd", + "bytemuck", + "chrono", + "chrono-tz", + "now", "num-traits", - "plotters-backend", - "plotters-svg", - "wasm-bindgen", - "web-sys", + "once_cell", + "polars-arrow", + "polars-compute", + "polars-core", + "polars-error", + "polars-ops", + "polars-utils", + "rayon", + "regex", + "strum_macros", ] [[package]] -name = "plotters-backend" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" - -[[package]] -name = "plotters-svg" -version = "0.3.7" +name = "polars-utils" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +checksum = "a8f6c8166a4a7fbc15b87c81645ed9e1f0651ff2e8c96cafc40ac5bf43441a10" dependencies = [ - "plotters-backend", + "ahash", + "bytemuck", + "bytes", + "compact_str", + "hashbrown 0.15.5", + "indexmap", + "libc", + 
"memmap2", + "num-traits", + "once_cell", + "polars-error", + "rand", + "raw-cpuid", + "rayon", + "stacker", + "sysinfo", + "version_check", ] [[package]] @@ -584,6 +1727,16 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn 2.0.111", +] + [[package]] name = "proc-macro2" version = "1.0.103" @@ -593,6 +1746,16 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "psm" +version = "0.1.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3852766467df634d74f0b2d7819bf8dc483a0eb2e3b0f50f756f9cfe8b0d18d8" +dependencies = [ + "ar_archive_writer", + "cc", +] + [[package]] name = "pyo3" version = "0.23.5" @@ -656,6 +1819,19 @@ dependencies = [ "syn 2.0.111", ] +[[package]] +name = "pyo3-polars" +version = "0.20.0" +dependencies = [ + "libc", + "once_cell", + "polars", + "polars-arrow", + "polars-core", + "pyo3", + "thiserror 1.0.69", +] + [[package]] name = "pyopenquant" version = "0.1.0" @@ -663,7 +1839,9 @@ dependencies = [ "chrono", "nalgebra 0.32.6", "openquant", + "polars", "pyo3", + "pyo3-polars", ] [[package]] @@ -675,6 +1853,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] name = "rand" version = "0.8.5" @@ -702,7 +1886,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.16", ] [[package]] @@ -715,6 +1899,15 @@ dependencies = [ "rand", ] +[[package]] +name = "raw-cpuid" +version = "11.6.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" +dependencies = [ + "bitflags", +] + [[package]] name = "rawpointer" version = "0.2.1" @@ -741,6 +1934,35 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.111", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + [[package]] name = "regex" version = "1.12.3" @@ -770,6 +1992,19 @@ version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c" +[[package]] +name = "rustix" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", +] + [[package]] name = "rustversion" version = "1.0.22" @@ -800,6 +2035,18 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" + [[package]] name = "serde" version = "1.0.228" @@ -875,6 +2122,77 @@ dependencies = [ "wide", ] +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "siphasher" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "slotmap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdd58c3c93c3d278ca835519292445cb4b0d4dc59ccfdf7ceadaab3f8aeb4038" +dependencies = [ + "version_check", +] + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "socket2" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86f4aa3ad99f2088c990dfa82d367e19cb29268ed67c574d10d0a4bfe71f07e0" +dependencies = [ + "libc", + "windows-sys 0.60.2", +] + +[[package]] +name = "sqlparser" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8" +dependencies = [ + "log", +] + +[[package]] +name = "stacker" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d74a23609d509411d10e2176dc2a4346e3b4aea2e7b1869f19fdedbc71c013" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + +[[package]] +name = 
"static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "statrs" version = "0.16.1" @@ -888,6 +2206,40 @@ dependencies = [ "rand", ] +[[package]] +name = "streaming-decompression" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf6cc3b19bfb128a8ad11026086e31d3ce9ad23f8ea37354b31383a187c44cf3" +dependencies = [ + "fallible-streaming-iterator", +] + +[[package]] +name = "streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" + +[[package]] +name = "strength_reduce" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.111", +] + [[package]] name = "syn" version = "1.0.109" @@ -900,30 +2252,125 @@ dependencies = [ ] [[package]] -name = "syn" -version = "2.0.111" +name = "syn" +version = "2.0.111" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sysinfo" +version = "0.33.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fc858248ea01b66f19d8e8a6d55f41deaf91e9d495246fd01368d99935c6c01" +dependencies = [ + "core-foundation-sys", + "libc", + "memchr", + "ntapi", + "windows", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.111", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.111", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "tinyvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] +checksum = 
"1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] -name = "target-lexicon" -version = "0.12.16" +name = "tokio" +version = "1.49.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" +checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" +dependencies = [ + "bytes", + "libc", + "mio", + "pin-project-lite", + "socket2", + "windows-sys 0.61.2", +] [[package]] -name = "tinytemplate" -version = "1.2.1" +name = "tokio-util" +version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" dependencies = [ - "serde", - "serde_json", + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", ] [[package]] @@ -938,12 +2385,65 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-reverse" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b6f4888ebc23094adfb574fdca9fdc891826287a6397d2cd28802ffd6f20c76" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + [[package]] name = "unindent" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" +[[package]] +name = "uuid" +version = "1.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b672338555252d43fd2240c714dc444b8c6fb0a5c5335e65a07bba7742735ddb" +dependencies = [ + "getrandom 0.4.1", + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "walkdir" version = "2.5.0" @@ -960,6 +2460,24 @@ version = "0.11.1+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" +[[package]] +name = "wasip2" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen", +] + [[package]] name = "wasm-bindgen" version = "0.2.105" @@ -1005,6 +2523,40 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + [[package]] name = "web-sys" version = "0.3.82" @@ -1025,13 +2577,57 @@ dependencies = [ "safe_arch", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + [[package]] name = "winapi-util" version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys", + "windows-sys 0.61.2", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12342cb4d8e3b046f3d80effd474a7a02447231330ef77d71daa6fbc40681143" +dependencies = [ + 
"windows-core 0.57.0", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-core" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d" +dependencies = [ + "windows-implement 0.57.0", + "windows-interface 0.57.0", + "windows-result 0.1.2", + "windows-targets 0.52.6", ] [[package]] @@ -1040,13 +2636,24 @@ version = "0.62.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" dependencies = [ - "windows-implement", - "windows-interface", + "windows-implement 0.60.2", + "windows-interface 0.59.3", "windows-link", - "windows-result", + "windows-result 0.4.1", "windows-strings", ] +[[package]] +name = "windows-implement" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.111", +] + [[package]] name = "windows-implement" version = "0.60.2" @@ -1058,6 +2665,17 @@ dependencies = [ "syn 2.0.111", ] +[[package]] +name = "windows-interface" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.111", +] + [[package]] name = "windows-interface" version = "0.59.3" @@ -1075,6 +2693,15 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-result" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e383302e8ec8515204254685643de10811af0ed97ea37210dc26fb0032647f8" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-result" version = 
"0.4.1" @@ -1093,6 +2720,24 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + [[package]] name = "windows-sys" version = "0.61.2" @@ -1102,6 +2747,229 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn 2.0.111", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + 
+[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.111", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + +[[package]] +name = "xxhash-rust" +version = "0.8.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" + [[package]] name = "zerocopy" version = "0.8.30" @@ -1121,3 +2989,31 @@ dependencies = [ "quote", "syn 2.0.111", ] + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml index 7dbb1ba..8d0946a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,6 @@ [workspace] members = ["crates/openquant", "crates/pyopenquant"] resolver = "2" + +[patch.crates-io] +pyo3-polars = { path = "vendor/pyo3-polars" } diff --git a/crates/openquant/Cargo.toml b/crates/openquant/Cargo.toml index 398b24d..152e217 100644 --- a/crates/openquant/Cargo.toml +++ b/crates/openquant/Cargo.toml @@ -20,6 +20,7 @@ statrs = "0.16" rand_distr = "0.4" nalgebra = "0.32" csv = "1" +polars = { version = "0.46", default-features = false, features = ["strings", "lazy"] } [dev-dependencies] csv = "1" diff --git a/crates/openquant/src/data_processing.rs b/crates/openquant/src/data_processing.rs index 3d10d40..7d5118a 100644 --- a/crates/openquant/src/data_processing.rs +++ b/crates/openquant/src/data_processing.rs @@ -1,5 +1,6 @@ -use chrono::NaiveDateTime; -use std::collections::{BTreeMap, HashSet}; +use chrono::{DateTime, NaiveDateTime, Utc}; +use polars::prelude::*; +use std::collections::HashSet; #[derive(Debug, Clone, PartialEq)] pub struct OhlcvRow { @@ -13,6 +14,18 @@ pub struct OhlcvRow { pub adj_close: f64, } +#[derive(Debug, Clone, PartialEq)] +pub struct OhlcvColumns { + pub timestamps_us: Vec, + pub symbols: Vec, + pub open: Vec, + pub high: Vec, + pub low: Vec, + pub close: Vec, + pub volume: Vec, + pub adj_close: Vec, +} + #[derive(Debug, Clone, PartialEq)] pub struct AlignedOhlcvRow { pub timestamp: NaiveDateTime, @@ -26,6 +39,19 @@ pub struct AlignedOhlcvRow { pub is_missing_bar: bool, } +#[derive(Debug, Clone, PartialEq)] +pub struct AlignedOhlcvColumns { + pub timestamps_us: Vec, + pub symbols: Vec, + pub open: Vec>, + pub high: Vec>, + pub low: Vec>, + pub close: Vec>, + pub volume: Vec>, + pub adj_close: Vec>, + pub is_missing_bar: Vec, +} + 
#[derive(Debug, Clone, PartialEq)] pub struct DataQualityReport { pub row_count: usize, @@ -37,140 +63,501 @@ pub struct DataQualityReport { pub rows_removed_by_deduplication: usize, } -fn sort_rows(rows: &mut [OhlcvRow]) { - rows.sort_by(|a, b| { - a.symbol - .cmp(&b.symbol) - .then_with(|| a.timestamp.cmp(&b.timestamp)) - }); +fn require_ohlcv_columns(df: &DataFrame) -> Result<(), String> { + for name in ["symbol", "ts_us", "open", "high", "low", "close", "volume", "adj_close"] { + df.column(name).map_err(|e| format!("missing required column '{name}': {e}"))?; + } + Ok(()) } -fn dedupe_rows(rows: &[OhlcvRow], keep_last: bool) -> (Vec, usize) { - if rows.is_empty() { - return (Vec::new(), 0); - } +fn sort_ohlcv_df(df: &DataFrame) -> Result { + df.sort( + ["symbol", "ts_us"], + SortMultipleOptions::new().with_order_descending_multi([false, false]), + ) + .map_err(|e| format!("polars sort failed: {e}")) +} - let mut deduped = Vec::new(); - let mut i = 0usize; - while i < rows.len() { - let mut j = i + 1; - while j < rows.len() - && rows[j].symbol == rows[i].symbol - && rows[j].timestamp == rows[i].timestamp - { - j += 1; - } - let chosen = if keep_last { &rows[j - 1] } else { &rows[i] }; - deduped.push(chosen.clone()); - i = j; - } - let removed = rows.len().saturating_sub(deduped.len()); - (deduped, removed) +fn micros_to_naive(ts_us: i64) -> Option { + DateTime::::from_timestamp_micros(ts_us).map(|dt| dt.naive_utc()) } -pub fn quality_report(rows: &[OhlcvRow], rows_removed_by_deduplication: usize) -> DataQualityReport { - let mut symbol_set = HashSet::new(); +fn quality_report_from_sorted_df( + sorted: &DataFrame, + rows_removed_by_deduplication: usize, +) -> Result { + require_ohlcv_columns(sorted)?; + + let symbols = sorted + .column("symbol") + .map_err(|e| format!("symbol column error: {e}"))? + .str() + .map_err(|e| format!("symbol dtype error: {e}"))?; + let ts = sorted + .column("ts_us") + .map_err(|e| format!("ts_us column error: {e}"))? 
+ .i64() + .map_err(|e| format!("ts_us dtype error: {e}"))?; + + let mut symbol_set: HashSet<&str> = HashSet::new(); let mut duplicate_key_count = 0usize; - let mut key_counts: BTreeMap<(String, NaiveDateTime), usize> = BTreeMap::new(); + let mut gap_interval_count = 0usize; + let day_us = 24 * 3600 * 1_000_000i64; - for row in rows { - symbol_set.insert(row.symbol.clone()); - let key = (row.symbol.clone(), row.timestamp); - *key_counts.entry(key).or_insert(0usize) += 1; - } + let mut prev_symbol: Option<&str> = None; + let mut prev_ts: Option = None; - for count in key_counts.values() { - if *count > 1 { - duplicate_key_count += 1; - } - } + for i in 0..sorted.height() { + let s = symbols.get(i).ok_or_else(|| format!("null symbol at row {i}"))?; + let t = ts.get(i).ok_or_else(|| format!("null ts_us at row {i}"))?; + symbol_set.insert(s); - let mut gap_interval_count = 0usize; - let mut last_by_symbol: BTreeMap = BTreeMap::new(); - for row in rows { - if let Some(prev) = last_by_symbol.get(&row.symbol) { - if (row.timestamp - *prev).num_seconds() > 24 * 3600 { + if let (Some(ps), Some(pt)) = (prev_symbol, prev_ts) { + if ps == s && pt == t { + duplicate_key_count += 1; + } else if ps == s && t - pt > day_us { gap_interval_count += 1; } } - last_by_symbol.insert(row.symbol.clone(), row.timestamp); + + prev_symbol = Some(s); + prev_ts = Some(t); } - DataQualityReport { - row_count: rows.len(), + let ts_min = ts.min().and_then(micros_to_naive); + let ts_max = ts.max().and_then(micros_to_naive); + + Ok(DataQualityReport { + row_count: sorted.height(), symbol_count: symbol_set.len(), duplicate_key_count, gap_interval_count, - ts_min: rows.first().map(|r| r.timestamp), - ts_max: rows.last().map(|r| r.timestamp), + ts_min, + ts_max, rows_removed_by_deduplication, + }) +} + +pub fn quality_report_df( + df: &DataFrame, + rows_removed_by_deduplication: usize, +) -> Result { + require_ohlcv_columns(df)?; + if df.height() == 0 { + return Ok(DataQualityReport { + row_count: 0, 
+ symbol_count: 0, + duplicate_key_count: 0, + gap_interval_count: 0, + ts_min: None, + ts_max: None, + rows_removed_by_deduplication, + }); } + let sorted = sort_ohlcv_df(df)?; + quality_report_from_sorted_df(&sorted, rows_removed_by_deduplication) } -pub fn clean_ohlcv_rows(rows: &[OhlcvRow], keep_last: bool) -> (Vec, DataQualityReport) { - let mut sorted = rows.to_vec(); - sort_rows(&mut sorted); - let (deduped, removed) = dedupe_rows(&sorted, keep_last); - let report = quality_report(&deduped, removed); - (deduped, report) +pub fn clean_ohlcv_df( + df: &DataFrame, + keep_last: bool, +) -> Result<(DataFrame, DataQualityReport), String> { + require_ohlcv_columns(df)?; + + if df.height() == 0 { + let empty = sort_ohlcv_df(df)?; + let report = DataQualityReport { + row_count: 0, + symbol_count: 0, + duplicate_key_count: 0, + gap_interval_count: 0, + ts_min: None, + ts_max: None, + rows_removed_by_deduplication: 0, + }; + return Ok((empty, report)); + } + + let sorted = sort_ohlcv_df(df)?; + let before = sorted.height(); + + let cleaned = sorted + .unique_stable( + Some(&["symbol".to_string(), "ts_us".to_string()]), + if keep_last { UniqueKeepStrategy::Last } else { UniqueKeepStrategy::First }, + None, + ) + .map_err(|e| format!("polars unique failed: {e}"))?; + + let removed = before.saturating_sub(cleaned.height()); + let mut report = quality_report_from_sorted_df(&cleaned, removed)?; + report.duplicate_key_count = 0; + + Ok((cleaned, report)) } -pub fn align_calendar_rows(rows: &[OhlcvRow], interval_seconds: i64) -> Result, String> { +pub fn align_calendar_df(df: &DataFrame, interval_seconds: i64) -> Result { if interval_seconds <= 0 { return Err("interval_seconds must be > 0".to_string()); } - let (clean, _) = clean_ohlcv_rows(rows, true); - if clean.is_empty() { - return Ok(Vec::new()); - } - let mut by_symbol: BTreeMap> = BTreeMap::new(); - for row in clean { - by_symbol.entry(row.symbol.clone()).or_default().push(row); + let (cleaned, _) = clean_ohlcv_df(df, 
true)?; + if cleaned.height() == 0 { + let mut out = cleaned.clone(); + out.with_column(Series::new("is_missing_bar".into(), Vec::::new())) + .map_err(|e| format!("failed to add is_missing_bar: {e}"))?; + return Ok(out); } - let mut out = Vec::new(); - for (symbol, rows_for_symbol) in by_symbol { - if rows_for_symbol.is_empty() { - continue; - } - let start = rows_for_symbol.first().expect("non-empty").timestamp; - let end = rows_for_symbol.last().expect("non-empty").timestamp; + let symbols = cleaned + .column("symbol") + .map_err(|e| format!("symbol column error: {e}"))? + .str() + .map_err(|e| format!("symbol dtype error: {e}"))?; + let ts = cleaned + .column("ts_us") + .map_err(|e| format!("ts_us column error: {e}"))? + .i64() + .map_err(|e| format!("ts_us dtype error: {e}"))?; + + let step_us = interval_seconds * 1_000_000; + + let mut cal_symbols: Vec = Vec::new(); + let mut cal_ts: Vec = Vec::new(); - let mut index: BTreeMap = BTreeMap::new(); - for row in rows_for_symbol { - index.insert(row.timestamp, row); + let mut i = 0usize; + while i < cleaned.height() { + let symbol = symbols.get(i).ok_or_else(|| format!("null symbol at row {i}"))?; + let start = ts.get(i).ok_or_else(|| format!("null ts_us at row {i}"))?; + + let mut j = i + 1; + while j < cleaned.height() && symbols.get(j) == Some(symbol) { + j += 1; } - let mut ts = start; - while ts <= end { - if let Some(row) = index.get(&ts) { - out.push(AlignedOhlcvRow { - timestamp: ts, - symbol: symbol.clone(), - open: Some(row.open), - high: Some(row.high), - low: Some(row.low), - close: Some(row.close), - volume: Some(row.volume), - adj_close: Some(row.adj_close), - is_missing_bar: false, - }); - } else { - out.push(AlignedOhlcvRow { - timestamp: ts, - symbol: symbol.clone(), - open: None, - high: None, - low: None, - close: None, - volume: None, - adj_close: None, - is_missing_bar: true, - }); - } - ts += chrono::Duration::seconds(interval_seconds); + let end = ts.get(j - 1).ok_or_else(|| format!("null 
ts_us at row {}", j - 1))?; + + let mut cur = start; + while cur <= end { + cal_symbols.push(symbol.to_string()); + cal_ts.push(cur); + cur += step_us; } + + i = j; } + + let calendar = df!("symbol" => cal_symbols, "ts_us" => cal_ts) + .map_err(|e| format!("calendar df build failed: {e}"))?; + + let mut out = calendar + .left_join(&cleaned, ["symbol", "ts_us"], ["symbol", "ts_us"]) + .map_err(|e| format!("calendar join failed: {e}"))?; + + let mut missing = out.column("open").map_err(to_string_err)?.is_null().into_series(); + missing.rename("is_missing_bar".into()); + out.with_column(missing).map_err(|e| format!("failed to add is_missing_bar: {e}"))?; + Ok(out) } + +fn validate_lengths(columns: &OhlcvColumns) -> Result<(), String> { + let n = columns.timestamps_us.len(); + let lengths = [ + columns.symbols.len(), + columns.open.len(), + columns.high.len(), + columns.low.len(), + columns.close.len(), + columns.volume.len(), + columns.adj_close.len(), + ]; + if lengths.iter().any(|&len| len != n) { + return Err(format!( + "ohlcv vector length mismatch: ts={n}, symbol={}, open={}, high={}, low={}, close={}, volume={}, adj_close={}", + columns.symbols.len(), + columns.open.len(), + columns.high.len(), + columns.low.len(), + columns.close.len(), + columns.volume.len(), + columns.adj_close.len() + )); + } + Ok(()) +} + +fn to_polars_df(columns: &OhlcvColumns) -> Result { + validate_lengths(columns)?; + df!( + "symbol" => columns.symbols.clone(), + "ts_us" => columns.timestamps_us.clone(), + "open" => columns.open.clone(), + "high" => columns.high.clone(), + "low" => columns.low.clone(), + "close" => columns.close.clone(), + "volume" => columns.volume.clone(), + "adj_close" => columns.adj_close.clone(), + ) + .map_err(|e| format!("polars df build failed: {e}")) +} + +fn df_to_ohlcv_columns(df: &DataFrame) -> Result { + let timestamps_us = df + .column("ts_us") + .map_err(|e| format!("missing ts_us: {e}"))? + .i64() + .map_err(|e| format!("ts_us type error: {e}"))? 
+ .into_no_null_iter() + .collect::>(); + let symbols = df + .column("symbol") + .map_err(|e| format!("missing symbol: {e}"))? + .str() + .map_err(|e| format!("symbol type error: {e}"))? + .into_no_null_iter() + .map(ToString::to_string) + .collect::>(); + let open = df + .column("open") + .map_err(|e| format!("missing open: {e}"))? + .f64() + .map_err(|e| format!("open type error: {e}"))? + .into_no_null_iter() + .collect::>(); + let high = df + .column("high") + .map_err(|e| format!("missing high: {e}"))? + .f64() + .map_err(|e| format!("high type error: {e}"))? + .into_no_null_iter() + .collect::>(); + let low = df + .column("low") + .map_err(|e| format!("missing low: {e}"))? + .f64() + .map_err(|e| format!("low type error: {e}"))? + .into_no_null_iter() + .collect::>(); + let close = df + .column("close") + .map_err(|e| format!("missing close: {e}"))? + .f64() + .map_err(|e| format!("close type error: {e}"))? + .into_no_null_iter() + .collect::>(); + let volume = df + .column("volume") + .map_err(|e| format!("missing volume: {e}"))? + .f64() + .map_err(|e| format!("volume type error: {e}"))? + .into_no_null_iter() + .collect::>(); + let adj_close = df + .column("adj_close") + .map_err(|e| format!("missing adj_close: {e}"))? + .f64() + .map_err(|e| format!("adj_close type error: {e}"))? 
+ .into_no_null_iter() + .collect::>(); + + Ok(OhlcvColumns { timestamps_us, symbols, open, high, low, close, volume, adj_close }) +} + +fn to_string_err(e: T) -> String { + e.to_string() +} + +pub fn quality_report_columns( + columns: &OhlcvColumns, + rows_removed_by_deduplication: usize, +) -> Result { + let df = to_polars_df(columns)?; + quality_report_df(&df, rows_removed_by_deduplication) +} + +pub fn clean_ohlcv_columns( + columns: &OhlcvColumns, + keep_last: bool, +) -> Result<(OhlcvColumns, DataQualityReport), String> { + let df = to_polars_df(columns)?; + let (clean_df, report) = clean_ohlcv_df(&df, keep_last)?; + let clean_cols = df_to_ohlcv_columns(&clean_df)?; + Ok((clean_cols, report)) +} + +pub fn align_calendar_columns( + columns: &OhlcvColumns, + interval_seconds: i64, +) -> Result { + let df = to_polars_df(columns)?; + let out = align_calendar_df(&df, interval_seconds)?; + + let timestamps_us = out + .column("ts_us") + .map_err(|e| format!("missing ts_us: {e}"))? + .i64() + .map_err(|e| format!("ts_us type error: {e}"))? + .into_no_null_iter() + .collect::>(); + let symbols = out + .column("symbol") + .map_err(|e| format!("missing symbol: {e}"))? + .str() + .map_err(|e| format!("symbol type error: {e}"))? + .into_no_null_iter() + .map(ToString::to_string) + .collect::>(); + + let open = out + .column("open") + .map_err(|e| format!("missing open: {e}"))? + .f64() + .map_err(|e| format!("open type error: {e}"))? + .into_iter() + .collect::>(); + let high = out + .column("high") + .map_err(|e| format!("missing high: {e}"))? + .f64() + .map_err(|e| format!("high type error: {e}"))? + .into_iter() + .collect::>(); + let low = out + .column("low") + .map_err(|e| format!("missing low: {e}"))? + .f64() + .map_err(|e| format!("low type error: {e}"))? + .into_iter() + .collect::>(); + let close = out + .column("close") + .map_err(|e| format!("missing close: {e}"))? + .f64() + .map_err(|e| format!("close type error: {e}"))? 
+ .into_iter() + .collect::>(); + let volume = out + .column("volume") + .map_err(|e| format!("missing volume: {e}"))? + .f64() + .map_err(|e| format!("volume type error: {e}"))? + .into_iter() + .collect::>(); + let adj_close = out + .column("adj_close") + .map_err(|e| format!("missing adj_close: {e}"))? + .f64() + .map_err(|e| format!("adj_close type error: {e}"))? + .into_iter() + .collect::>(); + let is_missing_bar = out + .column("is_missing_bar") + .map_err(|e| format!("missing is_missing_bar: {e}"))? + .bool() + .map_err(|e| format!("is_missing_bar type error: {e}"))? + .into_no_null_iter() + .collect::>(); + + Ok(AlignedOhlcvColumns { + timestamps_us, + symbols, + open, + high, + low, + close, + volume, + adj_close, + is_missing_bar, + }) +} + +fn rows_to_columns(rows: &[OhlcvRow]) -> OhlcvColumns { + let mut out = OhlcvColumns { + timestamps_us: Vec::with_capacity(rows.len()), + symbols: Vec::with_capacity(rows.len()), + open: Vec::with_capacity(rows.len()), + high: Vec::with_capacity(rows.len()), + low: Vec::with_capacity(rows.len()), + close: Vec::with_capacity(rows.len()), + volume: Vec::with_capacity(rows.len()), + adj_close: Vec::with_capacity(rows.len()), + }; + for row in rows { + out.timestamps_us.push( + DateTime::::from_naive_utc_and_offset(row.timestamp, Utc).timestamp_micros(), + ); + out.symbols.push(row.symbol.clone()); + out.open.push(row.open); + out.high.push(row.high); + out.low.push(row.low); + out.close.push(row.close); + out.volume.push(row.volume); + out.adj_close.push(row.adj_close); + } + out +} + +fn columns_to_rows(columns: &OhlcvColumns) -> Vec { + let mut rows = Vec::with_capacity(columns.timestamps_us.len()); + for i in 0..columns.timestamps_us.len() { + let dt = DateTime::::from_timestamp_micros(columns.timestamps_us[i]) + .expect("valid datetime") + .naive_utc(); + rows.push(OhlcvRow { + timestamp: dt, + symbol: columns.symbols[i].clone(), + open: columns.open[i], + high: columns.high[i], + low: columns.low[i], + close: 
columns.close[i], + volume: columns.volume[i], + adj_close: columns.adj_close[i], + }); + } + rows +} + +fn aligned_columns_to_rows(columns: &AlignedOhlcvColumns) -> Vec { + let mut rows = Vec::with_capacity(columns.timestamps_us.len()); + for i in 0..columns.timestamps_us.len() { + let dt = DateTime::::from_timestamp_micros(columns.timestamps_us[i]) + .expect("valid datetime") + .naive_utc(); + rows.push(AlignedOhlcvRow { + timestamp: dt, + symbol: columns.symbols[i].clone(), + open: columns.open[i], + high: columns.high[i], + low: columns.low[i], + close: columns.close[i], + volume: columns.volume[i], + adj_close: columns.adj_close[i], + is_missing_bar: columns.is_missing_bar[i], + }); + } + rows +} + +pub fn clean_ohlcv_rows(rows: &[OhlcvRow], keep_last: bool) -> (Vec, DataQualityReport) { + let cols = rows_to_columns(rows); + let (clean_cols, report) = clean_ohlcv_columns(&cols, keep_last).expect("validated rows"); + (columns_to_rows(&clean_cols), report) +} + +pub fn quality_report( + rows: &[OhlcvRow], + rows_removed_by_deduplication: usize, +) -> DataQualityReport { + let cols = rows_to_columns(rows); + quality_report_columns(&cols, rows_removed_by_deduplication).expect("validated rows") +} + +pub fn align_calendar_rows( + rows: &[OhlcvRow], + interval_seconds: i64, +) -> Result, String> { + let cols = rows_to_columns(rows); + let aligned_cols = align_calendar_columns(&cols, interval_seconds)?; + Ok(aligned_columns_to_rows(&aligned_cols)) +} diff --git a/crates/pyopenquant/Cargo.toml b/crates/pyopenquant/Cargo.toml index 4582420..ef3cafb 100644 --- a/crates/pyopenquant/Cargo.toml +++ b/crates/pyopenquant/Cargo.toml @@ -13,7 +13,9 @@ crate-type = ["cdylib"] chrono = "0.4" nalgebra = "0.32" openquant = { path = "../openquant" } +polars = { version = "0.46", default-features = false } pyo3 = { version = "0.23", features = ["extension-module"] } +pyo3-polars = "0.20" [package.metadata.maturin] name = "openquant._core" diff --git a/crates/pyopenquant/src/lib.rs 
b/crates/pyopenquant/src/lib.rs index 4c68b8e..16cdefb 100644 --- a/crates/pyopenquant/src/lib.rs +++ b/crates/pyopenquant/src/lib.rs @@ -1,5 +1,7 @@ use nalgebra::DMatrix; -use openquant::data_processing::{align_calendar_rows, clean_ohlcv_rows, quality_report, OhlcvRow}; +use openquant::data_processing::{ + align_calendar_columns, clean_ohlcv_columns, quality_report_columns, OhlcvColumns, +}; use openquant::data_structures::{standard_bars, time_bars, StandardBarType, Trade}; use openquant::filters::Threshold; use openquant::pipeline::{ @@ -9,11 +11,13 @@ use openquant::portfolio_optimization::{ allocate_inverse_variance, allocate_max_sharpe, allocate_min_vol, }; use openquant::risk_metrics::RiskMetrics; +use polars::prelude::DataFrame; use pyo3::exceptions::PyRuntimeError; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::types::PyDict; use pyo3::types::PyModule; +use pyo3_polars::PyDataFrame; fn to_py_err(err: T) -> PyErr { PyValueError::new_err(format!("{err:?}")) @@ -89,7 +93,9 @@ fn build_trades( Ok(trades) } -fn bars_to_rows(bars: Vec) -> Vec<(String, String, f64, f64, f64, f64, f64, f64, usize)> { +fn bars_to_rows( + bars: Vec, +) -> Vec<(String, String, f64, f64, f64, f64, f64, f64, usize)> { bars.into_iter() .map(|b| { ( @@ -107,8 +113,8 @@ fn bars_to_rows(bars: Vec) -> Vec<(Stri .collect() } -fn build_ohlcv_rows( - timestamps: Vec, +fn build_ohlcv_columns( + timestamps_us: Vec, symbols: Vec, open: Vec, high: Vec, @@ -116,8 +122,8 @@ fn build_ohlcv_rows( close: Vec, volume: Vec, adj_close: Vec, -) -> PyResult> { - let n = timestamps.len(); +) -> PyResult { + let n = timestamps_us.len(); let lengths = [ symbols.len(), open.len(), @@ -139,20 +145,24 @@ fn build_ohlcv_rows( adj_close.len(), ))); } - let mut rows = Vec::with_capacity(n); - for i in 0..n { - rows.push(OhlcvRow { - timestamp: parse_one_naive_datetime(×tamps[i])?, - symbol: symbols[i].clone(), - open: open[i], - high: high[i], - low: low[i], - close: close[i], - volume: 
volume[i], - adj_close: adj_close[i], - }); - } - Ok(rows) + Ok(OhlcvColumns { timestamps_us, symbols, open, high, low, close, volume, adj_close }) +} + +fn report_to_pydict( + py: Python<'_>, + report: openquant::data_processing::DataQualityReport, +) -> PyResult { + let out_report = PyDict::new_bound(py); + out_report.set_item("row_count", report.row_count)?; + out_report.set_item("symbol_count", report.symbol_count)?; + out_report.set_item("duplicate_key_count", report.duplicate_key_count)?; + out_report.set_item("gap_interval_count", report.gap_interval_count)?; + out_report + .set_item("ts_min", report.ts_min.map(|v| v.format("%Y-%m-%d %H:%M:%S").to_string()))?; + out_report + .set_item("ts_max", report.ts_max.map(|v| v.format("%Y-%m-%d %H:%M:%S").to_string()))?; + out_report.set_item("rows_removed_by_deduplication", report.rows_removed_by_deduplication)?; + Ok(out_report.into_py(py)) } #[pyfunction(name = "calculate_value_at_risk")] @@ -322,7 +332,7 @@ fn bars_build_dollar_bars( #[pyfunction(name = "clean_ohlcv")] fn data_clean_ohlcv( py: Python<'_>, - timestamps: Vec, + timestamps_us: Vec, symbols: Vec, open: Vec, high: Vec, @@ -331,46 +341,48 @@ fn data_clean_ohlcv( volume: Vec, adj_close: Vec, dedupe_keep_last: bool, -) -> PyResult<(Vec<(String, String, f64, f64, f64, f64, f64, f64)>, PyObject)> { - let rows = build_ohlcv_rows(timestamps, symbols, open, high, low, close, volume, adj_close)?; - let (clean, report) = clean_ohlcv_rows(&rows, dedupe_keep_last); - let out_rows = clean - .into_iter() - .map(|r| { - ( - r.timestamp.format("%Y-%m-%d %H:%M:%S").to_string(), - r.symbol, - r.open, - r.high, - r.low, - r.close, - r.volume, - r.adj_close, - ) - }) - .collect::>(); +) -> PyResult<( + Vec, + Vec, + Vec, + Vec, + Vec, + Vec, + Vec, + Vec, + PyObject, +)> { + let cols = + build_ohlcv_columns(timestamps_us, symbols, open, high, low, close, volume, adj_close)?; + let (clean, report) = clean_ohlcv_columns(&cols, dedupe_keep_last).map_err(to_py_err)?; let 
out_report = PyDict::new_bound(py); out_report.set_item("row_count", report.row_count)?; out_report.set_item("symbol_count", report.symbol_count)?; out_report.set_item("duplicate_key_count", report.duplicate_key_count)?; out_report.set_item("gap_interval_count", report.gap_interval_count)?; - out_report.set_item( - "ts_min", - report.ts_min.map(|v| v.format("%Y-%m-%d %H:%M:%S").to_string()), - )?; - out_report.set_item( - "ts_max", - report.ts_max.map(|v| v.format("%Y-%m-%d %H:%M:%S").to_string()), - )?; + out_report + .set_item("ts_min", report.ts_min.map(|v| v.format("%Y-%m-%d %H:%M:%S").to_string()))?; + out_report + .set_item("ts_max", report.ts_max.map(|v| v.format("%Y-%m-%d %H:%M:%S").to_string()))?; out_report.set_item("rows_removed_by_deduplication", report.rows_removed_by_deduplication)?; - Ok((out_rows, out_report.into_py(py))) + Ok(( + clean.timestamps_us, + clean.symbols, + clean.open, + clean.high, + clean.low, + clean.close, + clean.volume, + clean.adj_close, + out_report.into_py(py), + )) } #[pyfunction(name = "quality_report")] fn data_quality_report( py: Python<'_>, - timestamps: Vec, + timestamps_us: Vec, symbols: Vec, open: Vec, high: Vec, @@ -379,29 +391,25 @@ fn data_quality_report( volume: Vec, adj_close: Vec, ) -> PyResult { - let mut rows = build_ohlcv_rows(timestamps, symbols, open, high, low, close, volume, adj_close)?; - rows.sort_by(|a, b| a.symbol.cmp(&b.symbol).then_with(|| a.timestamp.cmp(&b.timestamp))); - let report = quality_report(&rows, 0); + let cols = + build_ohlcv_columns(timestamps_us, symbols, open, high, low, close, volume, adj_close)?; + let report = quality_report_columns(&cols, 0).map_err(to_py_err)?; let out_report = PyDict::new_bound(py); out_report.set_item("row_count", report.row_count)?; out_report.set_item("symbol_count", report.symbol_count)?; out_report.set_item("duplicate_key_count", report.duplicate_key_count)?; out_report.set_item("gap_interval_count", report.gap_interval_count)?; - out_report.set_item( - 
"ts_min", - report.ts_min.map(|v| v.format("%Y-%m-%d %H:%M:%S").to_string()), - )?; - out_report.set_item( - "ts_max", - report.ts_max.map(|v| v.format("%Y-%m-%d %H:%M:%S").to_string()), - )?; + out_report + .set_item("ts_min", report.ts_min.map(|v| v.format("%Y-%m-%d %H:%M:%S").to_string()))?; + out_report + .set_item("ts_max", report.ts_max.map(|v| v.format("%Y-%m-%d %H:%M:%S").to_string()))?; out_report.set_item("rows_removed_by_deduplication", 0)?; Ok(out_report.into_py(py)) } #[pyfunction(name = "align_calendar")] fn data_align_calendar( - timestamps: Vec, + timestamps_us: Vec, symbols: Vec, open: Vec, high: Vec, @@ -410,25 +418,59 @@ fn data_align_calendar( volume: Vec, adj_close: Vec, interval_seconds: i64, -) -> PyResult, Option, Option, Option, Option, Option, bool)>> { - let rows = build_ohlcv_rows(timestamps, symbols, open, high, low, close, volume, adj_close)?; - let out = align_calendar_rows(&rows, interval_seconds).map_err(to_py_err)?; - Ok(out - .into_iter() - .map(|r| { - ( - r.timestamp.format("%Y-%m-%d %H:%M:%S").to_string(), - r.symbol, - r.open, - r.high, - r.low, - r.close, - r.volume, - r.adj_close, - r.is_missing_bar, - ) - }) - .collect()) +) -> PyResult<( + Vec, + Vec, + Vec>, + Vec>, + Vec>, + Vec>, + Vec>, + Vec>, + Vec, +)> { + let cols = + build_ohlcv_columns(timestamps_us, symbols, open, high, low, close, volume, adj_close)?; + let out = align_calendar_columns(&cols, interval_seconds).map_err(to_py_err)?; + Ok(( + out.timestamps_us, + out.symbols, + out.open, + out.high, + out.low, + out.close, + out.volume, + out.adj_close, + out.is_missing_bar, + )) +} + +#[pyfunction(name = "clean_ohlcv_df")] +fn data_clean_ohlcv_df( + py: Python<'_>, + pydf: PyDataFrame, + dedupe_keep_last: bool, +) -> PyResult<(PyDataFrame, PyObject)> { + let df: DataFrame = pydf.into(); + let (out_df, report) = + openquant::data_processing::clean_ohlcv_df(&df, dedupe_keep_last).map_err(to_py_err)?; + let out_report = report_to_pydict(py, report)?; + 
Ok((PyDataFrame(out_df), out_report)) +} + +#[pyfunction(name = "quality_report_df")] +fn data_quality_report_df(py: Python<'_>, pydf: PyDataFrame) -> PyResult { + let df: DataFrame = pydf.into(); + let report = openquant::data_processing::quality_report_df(&df, 0).map_err(to_py_err)?; + report_to_pydict(py, report) +} + +#[pyfunction(name = "align_calendar_df")] +fn data_align_calendar_df(pydf: PyDataFrame, interval_seconds: i64) -> PyResult { + let df: DataFrame = pydf.into(); + let out_df = + openquant::data_processing::align_calendar_df(&df, interval_seconds).map_err(to_py_err)?; + Ok(PyDataFrame(out_df)) } #[pyfunction(name = "get_signal")] @@ -614,6 +656,9 @@ fn _core(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { data.add_function(wrap_pyfunction!(data_clean_ohlcv, &data)?)?; data.add_function(wrap_pyfunction!(data_quality_report, &data)?)?; data.add_function(wrap_pyfunction!(data_align_calendar, &data)?)?; + data.add_function(wrap_pyfunction!(data_clean_ohlcv_df, &data)?)?; + data.add_function(wrap_pyfunction!(data_quality_report_df, &data)?)?; + data.add_function(wrap_pyfunction!(data_align_calendar_df, &data)?)?; m.add_submodule(&data)?; m.add("data", data)?; diff --git a/python/benchmarks/benchmark_data_processing.py b/python/benchmarks/benchmark_data_processing.py new file mode 100644 index 0000000..03c1d40 --- /dev/null +++ b/python/benchmarks/benchmark_data_processing.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +import argparse +import json +import time + +import polars as pl + +import openquant + + +def make_dataset(rows_per_symbol: int, symbols: list[str]) -> pl.DataFrame: + frames: list[pl.DataFrame] = [] + for symbol in symbols: + frames.append( + pl.DataFrame( + { + "ts": pl.datetime_range( + start=pl.datetime(2020, 1, 1), + end=pl.datetime(2020, 1, 1) + pl.duration(minutes=rows_per_symbol - 1), + interval="1m", + eager=True, + ), + "symbol": [symbol] * rows_per_symbol, + "open": pl.arange(0, rows_per_symbol, 
eager=True).cast(pl.Float64) + 100.0, + "high": pl.arange(0, rows_per_symbol, eager=True).cast(pl.Float64) + 100.5, + "low": pl.arange(0, rows_per_symbol, eager=True).cast(pl.Float64) + 99.5, + "close": pl.arange(0, rows_per_symbol, eager=True).cast(pl.Float64) + 100.2, + "volume": pl.repeat(1000.0, rows_per_symbol, eager=True), + } + ) + ) + return pl.concat(frames, rechunk=True) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Benchmark openquant.data throughput.") + parser.add_argument("--rows-per-symbol", type=int, default=200_000) + parser.add_argument("--symbols", type=int, default=4) + args = parser.parse_args() + + symbol_names = [f"SYM{i}" for i in range(args.symbols)] + base = make_dataset(args.rows_per_symbol, symbol_names) + total_rows = base.height + + # Warm-up to stabilize lazy-plan compile and allocation effects. + _ = openquant.data.clean_ohlcv(base) + + t0 = time.perf_counter() + clean = openquant.data.clean_ohlcv(base) + t1 = time.perf_counter() + + t2 = time.perf_counter() + quality = openquant.data.data_quality_report(base) + t3 = time.perf_counter() + + t4 = time.perf_counter() + aligned = openquant.data.align_calendar(clean, interval="1m") + t5 = time.perf_counter() + + print( + json.dumps( + { + "rows": total_rows, + "clean_rows": clean.height, + "aligned_rows": aligned.height, + "clean_seconds": t1 - t0, + "clean_rows_per_sec": total_rows / max(t1 - t0, 1e-9), + "quality_seconds": t3 - t2, + "quality_rows_per_sec": total_rows / max(t3 - t2, 1e-9), + "align_seconds": t5 - t4, + "align_rows_per_sec": clean.height / max(t5 - t4, 1e-9), + "quality_report": { + "row_count": quality["row_count"], + "symbol_count": quality["symbol_count"], + "duplicate_key_count": quality["duplicate_key_count"], + "gap_interval_count": quality["gap_interval_count"], + }, + }, + indent=2, + ) + ) + + +if __name__ == "__main__": + main() diff --git a/python/openquant/data.py b/python/openquant/data.py index 12c1cec..a11481f 100644 --- 
a/python/openquant/data.py +++ b/python/openquant/data.py @@ -5,8 +5,6 @@ import polars as pl -from . import _core - CANONICAL_OHLCV_COLUMNS = [ "ts", @@ -39,6 +37,17 @@ "adj close": "adj_close", } +_ZERO_NULL_COUNTS = { + "ts": 0, + "symbol": 0, + "open": 0, + "high": 0, + "low": 0, + "close": 0, + "volume": 0, + "adj_close": 0, +} + def _normalize_column_name(name: str) -> str: return name.strip().lower().replace("-", "_") @@ -67,8 +76,11 @@ def _validate_required_columns(df: pl.DataFrame) -> None: raise ValueError(f"missing required OHLCV columns: {', '.join(missing)}") -def _cast_and_order(df: pl.DataFrame) -> pl.DataFrame: - casted = df.with_columns( +def _prepare_ohlcv_lf(df: pl.DataFrame) -> pl.LazyFrame: + frame = _canonicalize_columns(df) + _validate_required_columns(frame) + + lf = frame.lazy().with_columns( pl.col("ts").cast(pl.Utf8).str.strptime(pl.Datetime, strict=False), pl.col("symbol").cast(pl.Utf8), pl.col("open").cast(pl.Float64), @@ -77,52 +89,69 @@ def _cast_and_order(df: pl.DataFrame) -> pl.DataFrame: pl.col("close").cast(pl.Float64), pl.col("volume").cast(pl.Float64), ) - if "adj_close" in casted.columns: - casted = casted.with_columns(pl.col("adj_close").cast(pl.Float64)) + if "adj_close" in frame.columns: + lf = lf.with_columns(pl.col("adj_close").cast(pl.Float64)) else: - casted = casted.with_columns(pl.col("close").alias("adj_close")) - return casted.select(CANONICAL_OHLCV_COLUMNS) + lf = lf.with_columns(pl.col("close").alias("adj_close")) + return lf.select(CANONICAL_OHLCV_COLUMNS).drop_nulls(CANONICAL_OHLCV_COLUMNS) -def _to_core_vectors(df: pl.DataFrame) -> tuple[list[str], list[str], list[float], list[float], list[float], list[float], list[float], list[float]]: +def _format_ts(v: Any) -> str | None: + if v is None: + return None + if hasattr(v, "strftime"): + return v.strftime("%Y-%m-%d %H:%M:%S") + return str(v) + + +def _gap_expr(symbol_expr: pl.Expr, ts_us_expr: pl.Expr, threshold_seconds: int = 24 * 3600) -> pl.Expr: + threshold_us 
= int(threshold_seconds) * 1_000_000 return ( - [str(x) for x in df["ts"].to_list()], - [str(x) for x in df["symbol"].to_list()], - [float(x) for x in df["open"].to_list()], - [float(x) for x in df["high"].to_list()], - [float(x) for x in df["low"].to_list()], - [float(x) for x in df["close"].to_list()], - [float(x) for x in df["volume"].to_list()], - [float(x) for x in df["adj_close"].to_list()], + (symbol_expr == symbol_expr.shift(1)) + & ((ts_us_expr - ts_us_expr.shift(1)) > threshold_us) ) -def _rows_to_frame(rows: list[tuple[str, str, float, float, float, float, float, float]]) -> pl.DataFrame: - if not rows: - return pl.DataFrame( - { - "ts": [], - "symbol": [], - "open": [], - "high": [], - "low": [], - "close": [], - "volume": [], - "adj_close": [], - } - ) - return pl.DataFrame( - { - "ts": [r[0] for r in rows], - "symbol": [r[1] for r in rows], - "open": [r[2] for r in rows], - "high": [r[3] for r in rows], - "low": [r[4] for r in rows], - "close": [r[5] for r in rows], - "volume": [r[6] for r in rows], - "adj_close": [r[7] for r in rows], +def _build_quality_report(sorted_df: pl.DataFrame, rows_removed_by_deduplication: int) -> dict[str, Any]: + if sorted_df.height == 0: + return { + "row_count": 0, + "symbol_count": 0, + "duplicate_key_count": 0, + "gap_interval_count": 0, + "ts_min": None, + "ts_max": None, + "rows_removed_by_deduplication": rows_removed_by_deduplication, + "null_counts": dict(_ZERO_NULL_COUNTS), } - ).with_columns(pl.col("ts").str.strptime(pl.Datetime, strict=False)) + + summary = ( + sorted_df.lazy() + .select( + pl.len().alias("row_count"), + pl.col("symbol").n_unique().alias("symbol_count"), + ( + ((pl.col("symbol") == pl.col("symbol").shift(1)) & (pl.col("ts_us") == pl.col("ts_us").shift(1))) + .cast(pl.UInt32) + .sum() + ).alias("duplicate_key_count"), + _gap_expr(pl.col("symbol"), pl.col("ts_us")).cast(pl.UInt32).sum().alias("gap_interval_count"), + pl.col("ts").min().alias("ts_min"), + pl.col("ts").max().alias("ts_max"), + ) + 
.collect() + .row(0, named=True) + ) + return { + "row_count": int(summary["row_count"]), + "symbol_count": int(summary["symbol_count"]), + "duplicate_key_count": int(summary["duplicate_key_count"]), + "gap_interval_count": int(summary["gap_interval_count"]), + "ts_min": _format_ts(summary["ts_min"]), + "ts_max": _format_ts(summary["ts_max"]), + "rows_removed_by_deduplication": rows_removed_by_deduplication, + "null_counts": dict(_ZERO_NULL_COUNTS), + } def _interval_to_seconds(interval: str) -> int: @@ -144,66 +173,52 @@ def clean_ohlcv( dedupe_keep: str = "last", return_report: bool = False, ) -> pl.DataFrame | tuple[pl.DataFrame, dict[str, Any]]: - out = _canonicalize_columns(df) - _validate_required_columns(out) - out = _cast_and_order(out) - ts, symbol, open_, high, low, close, volume, adj_close = _to_core_vectors(out) - rows, report = _core.data.clean_ohlcv( - ts, - symbol, - open_, - high, - low, - close, - volume, - adj_close, - dedupe_keep == "last", + if dedupe_keep not in {"first", "last"}: + raise ValueError("dedupe_keep must be 'first' or 'last'") + + base_lf = _prepare_ohlcv_lf(df).with_columns(pl.col("ts").dt.timestamp(time_unit="us").alias("ts_us")) + sorted_lf = base_lf.sort(["symbol", "ts_us"]) + + duplicate_key_count = int( + sorted_lf + .select( + ( + ((pl.col("symbol") == pl.col("symbol").shift(1)) & (pl.col("ts_us") == pl.col("ts_us").shift(1))) + .cast(pl.UInt32) + .sum() + ).alias("duplicate_key_count") + ) + .collect() + .item(0, 0) + ) + + cleaned = ( + sorted_lf.unique( + subset=["symbol", "ts_us"], + keep=dedupe_keep, + maintain_order=True, + ) + .sort(["symbol", "ts"]) + .collect() ) - frame = _rows_to_frame(rows).sort(["symbol", "ts"]) - report = dict(report) - report["null_counts"] = { - "ts": 0, - "symbol": 0, - "open": 0, - "high": 0, - "low": 0, - "close": 0, - "volume": 0, - "adj_close": 0, - } - if return_report: - return frame, report - return frame + + frame = cleaned.select(CANONICAL_OHLCV_COLUMNS) + if not return_report: + 
return frame + + report = _build_quality_report(cleaned, rows_removed_by_deduplication=duplicate_key_count) + report["duplicate_key_count"] = 0 + return frame, report def data_quality_report(df: pl.DataFrame) -> dict[str, Any]: - out = _canonicalize_columns(df) - _validate_required_columns(out) - out = _cast_and_order(out).sort(["symbol", "ts"]) - ts, symbol, open_, high, low, close, volume, adj_close = _to_core_vectors(out) - report = dict( - _core.data.quality_report( - ts, - symbol, - open_, - high, - low, - close, - volume, - adj_close, - ) + sorted_df = ( + _prepare_ohlcv_lf(df) + .with_columns(pl.col("ts").dt.timestamp(time_unit="us").alias("ts_us")) + .sort(["symbol", "ts_us"]) + .collect() ) - report["null_counts"] = { - "ts": 0, - "symbol": 0, - "open": 0, - "high": 0, - "low": 0, - "close": 0, - "volume": 0, - "adj_close": 0, - } - return report + return _build_quality_report(sorted_df, rows_removed_by_deduplication=0) def load_ohlcv( @@ -236,45 +251,33 @@ def align_calendar( *, interval: str = "1d", ) -> pl.DataFrame: - clean = clean_ohlcv(df) - ts, symbol, open_, high, low, close, volume, adj_close = _to_core_vectors(clean) - rows = _core.data.align_calendar( - ts, - symbol, - open_, - high, - low, - close, - volume, - adj_close, - _interval_to_seconds(interval), + interval_seconds = _interval_to_seconds(interval) + if interval_seconds <= 0: + raise ValueError("interval_seconds must be > 0") + + clean = clean_ohlcv(df).lazy() + bounds = clean.group_by("symbol").agg( + pl.col("ts").min().alias("ts_min"), + pl.col("ts").max().alias("ts_max"), ) - if not rows: - return pl.DataFrame( - { - "ts": [], - "symbol": [], - "open": [], - "high": [], - "low": [], - "close": [], - "volume": [], - "adj_close": [], - "is_missing_bar": [], - } + calendar = ( + bounds.with_columns( + pl.datetime_ranges( + "ts_min", + "ts_max", + interval=f"{interval_seconds}s", + closed="both", + ).alias("ts") ) - return pl.DataFrame( - { - "ts": [r[0] for r in rows], - "symbol": [r[1] 
for r in rows], - "open": [r[2] for r in rows], - "high": [r[3] for r in rows], - "low": [r[4] for r in rows], - "close": [r[5] for r in rows], - "volume": [r[6] for r in rows], - "adj_close": [r[7] for r in rows], - "is_missing_bar": [r[8] for r in rows], - } - ).with_columns( - pl.col("ts").str.strptime(pl.Datetime, strict=False), - ).sort(["symbol", "ts"]) + .explode("ts") + .select(["symbol", "ts"]) + ) + + out = ( + calendar.join(clean, on=["symbol", "ts"], how="left") + .with_columns(pl.col("open").is_null().alias("is_missing_bar")) + .select(CANONICAL_OHLCV_COLUMNS + ["is_missing_bar"]) + .sort(["symbol", "ts"]) + .collect() + ) + return out diff --git a/vendor/pyo3-polars/.cargo-ok b/vendor/pyo3-polars/.cargo-ok new file mode 100644 index 0000000..5f8b795 --- /dev/null +++ b/vendor/pyo3-polars/.cargo-ok @@ -0,0 +1 @@ +{"v":1} \ No newline at end of file diff --git a/vendor/pyo3-polars/.cargo_vcs_info.json b/vendor/pyo3-polars/.cargo_vcs_info.json new file mode 100644 index 0000000..d18bd7b --- /dev/null +++ b/vendor/pyo3-polars/.cargo_vcs_info.json @@ -0,0 +1,6 @@ +{ + "git": { + "sha1": "9b69223c5fc51994e2802865c667cd62201b8397" + }, + "path_in_vcs": "pyo3-polars" +} \ No newline at end of file diff --git a/vendor/pyo3-polars/Cargo.lock b/vendor/pyo3-polars/Cargo.lock new file mode 100644 index 0000000..6ffe7e6 --- /dev/null +++ b/vendor/pyo3-polars/Cargo.lock @@ -0,0 +1,2449 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "addr2line" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "getrandom", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "allocator-api2" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "argminmax" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52424b59d69d69d5056d508b260553afd91c57e21849579cd1f50ee8b8b88eaa" +dependencies = [ + "num-traits", +] + +[[package]] +name = "array-init-cursor" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bf7d0a018de4f6aa429b9d33d69edf69072b1c5b1cb8d3e4a5f7ef898fc3eb76" + +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "async-trait" +version = "0.1.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "atoi_simd" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4790f9e8961209112beb783d85449b508673cf4a6a419c8449b210743ac4dbe9" + +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + +[[package]] +name = "autocfg" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" + +[[package]] +name = "backtrace" +version = "0.3.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-targets", +] + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +dependencies = [ + "serde", +] + +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "bytemuck" +version = "1.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef657dfab802224e671f5818e9a4935f9b1957ed18e58292690cc39e7a4092a3" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc8b54b395f2fcfbb3d90c47b01c7f444d94d05bdeb775811dec868ac3bbc26" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8318a53db07bb3f8dca91a600466bdb3f2eaadeedfdbcf02e1accbad9271ba50" +dependencies = [ + "serde", +] + +[[package]] +name = "castaway" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0abae9be0aaf9ea96a3b1b8b1b55c602ca751eba1b1500220cea4ecbafe7c0d5" +dependencies = [ + "rustversion", +] + +[[package]] +name = "cc" +version = "1.1.18" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b62ac837cdb5cb22e10a256099b4fc502b1dfe560cb282963a974d7abd80e476" +dependencies = [ + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "num-traits", + "serde", + "windows-targets", +] + +[[package]] +name = "chrono-tz" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd6dd8046d00723a59a2f8c5f295c515b9bb9a331ee4f8f3d4dd49e428acd3b6" +dependencies = [ + "chrono", + "chrono-tz-build", + "phf", +] + +[[package]] +name = "chrono-tz-build" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e94fea34d77a245229e7746bd2beb786cd2a896f306ff491fb8cecb3074b10a7" +dependencies = [ + "parse-zoneinfo", + "phf_codegen", +] + +[[package]] +name = "comfy-table" +version = "7.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" +dependencies = [ + "crossterm", + "strum", + "strum_macros", + "unicode-width", +] + +[[package]] +name = "compact_str" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6050c3a16ddab2e412160b31f2c871015704239bca62f72f6e5f0be631d3f644" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "serde", + "static_assertions", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-queue" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df0346b5d5e76ac2fe4e327c5fd1118d6be7c51dfb18f9b7922923f287471e35" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + +[[package]] +name = "crossterm" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df" +dependencies = [ + "bitflags", + "crossterm_winapi", + "libc", + "parking_lot", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies = [ + "winapi", +] + +[[package]] +name = "dyn-clone" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" + +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +dependencies = [ + "serde", +] + +[[package]] +name = "enum_dispatch" +version = "0.3.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa18ce2bc66555b3218614519ac839ddb759a7d6720732f979ef8d13be147ecd" +dependencies = [ + "once_cell", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "ethnum" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b90ca2580b73ab6a1f724b76ca11ab632df820fd6040c336200d2c1df7b3c82c" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + +[[package]] +name = "fast-float2" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" + +[[package]] +name = "flate2" +version = "1.0.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "float-cmp" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "b09cf3155332e944990140d967ff5eceb70df778b34f77d8075db46e4704e6d8" +dependencies = [ + "num-traits", +] + +[[package]] +name = "foldhash" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" + +[[package]] +name = "futures" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi", + "wasm-bindgen", +] + +[[package]] +name = "gimli" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" + +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + +[[package]] +name = "halfbrown" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8588661a8607108a5ca69cab034063441a0413a0b041c13618a7dd348021ef6f" +dependencies = [ + "hashbrown 0.14.5", + "serde", +] + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", + "rayon", + "serde", +] + +[[package]] +name = "hashbrown" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", + "rayon", + "serde", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core 0.52.0", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "indexmap" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" +dependencies = [ + "equivalent", + "hashbrown 0.14.5", + "serde", +] + +[[package]] +name = "indoc" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" + +[[package]] +name = "iter-read" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c397ca3ea05ad509c4ec451fea28b4771236a376ca1c69fd5143aae0cf8f93c4" + +[[package]] +name = "itoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" + +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.70" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.167" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "lz4" +version = "1.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958b4caa893816eea05507c20cfe47574a43d9a697138a7872990bba8a0ece68" +dependencies = [ + "libc", + "lz4-sys", +] 
+ +[[package]] +name = "lz4-sys" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "109de74d5d2353660401699a4174a4ff23fcc649caf553df71933c7fb45ad868" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "memmap2" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" +dependencies = [ + "libc", +] + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + +[[package]] +name = "mio" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" +dependencies = [ + "hermit-abi", + "libc", + "wasi", + "windows-sys 0.52.0", +] + +[[package]] +name = "now" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d89e9874397a1f0a52fc1f197a8effd9735223cb2390e9dcc83ac6cd02923d0" +dependencies = [ + "chrono", +] + +[[package]] +name = "ntapi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" +dependencies = [ + "winapi", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "object" +version = "0.36.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "parse-zoneinfo" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f2a05b18d44e2957b88f96ba460715e295bc1d7510468a2f3d3b44535d26c24" +dependencies = [ + "regex", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "phf" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" + +[[package]] +name = "planus" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc1691dd09e82f428ce8d6310bd6d5da2557c82ff17694d2a32cad7242aea89f" +dependencies = [ + "array-init-cursor", +] + +[[package]] +name = "polars" +version = "0.46.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "72571dde488ecccbe799798bf99ab7308ebdb7cf5d95bcc498dbd5a132f0da4d" +dependencies = [ + "getrandom", + "polars-arrow", + "polars-core", + "polars-error", + "polars-io", + "polars-lazy", + "polars-ops", + "polars-parquet", + "polars-plan", + "polars-sql", + "polars-time", + "polars-utils", + "version_check", +] + +[[package]] +name = "polars-arrow" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6611c758d52e799761cc25900666b71552e6c929d88052811bc9daad4b3321a8" +dependencies = [ + "ahash", + "atoi_simd", + "bytemuck", + "chrono", + "chrono-tz", + "dyn-clone", + "either", + "ethnum", + "getrandom", + "hashbrown 0.15.0", + "itoa", + "lz4", + "num-traits", + "parking_lot", + "polars-arrow-format", + "polars-error", + "polars-schema", + "polars-utils", + "serde", + "simdutf8", + "streaming-iterator", + "strength_reduce", + "strum_macros", + "version_check", + "zstd", +] + +[[package]] +name = "polars-arrow-format" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b0ef2474af9396b19025b189d96e992311e6a47f90c53cd998b36c4c64b84c" +dependencies = [ + "planus", + "serde", +] + +[[package]] +name = "polars-compute" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "332f2547dbb27599a8ffe68e56159f5996ba03d1dad0382ccb62c109ceacdeb6" +dependencies = [ + "atoi_simd", + "bytemuck", + "chrono", + "either", + "fast-float2", + "itoa", + "num-traits", + "polars-arrow", + "polars-error", + "polars-utils", + "ryu", + "strength_reduce", + "version_check", +] + +[[package]] +name = "polars-core" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "796d06eae7e6e74ed28ea54a8fccc584ebac84e6cf0e1e9ba41ffc807b169a01" +dependencies = [ + "ahash", + "bitflags", + "bytemuck", + "chrono", + "chrono-tz", + "comfy-table", + "either", + 
"hashbrown 0.14.5", + "hashbrown 0.15.0", + "indexmap", + "itoa", + "num-traits", + "once_cell", + "polars-arrow", + "polars-compute", + "polars-error", + "polars-row", + "polars-schema", + "polars-utils", + "rand", + "rand_distr", + "rayon", + "regex", + "serde", + "serde_json", + "strum_macros", + "thiserror 2.0.5", + "version_check", + "xxhash-rust", +] + +[[package]] +name = "polars-error" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d6529cae0d1db5ed690e47de41fac9b35ae0c26d476830c2079f130887b847" +dependencies = [ + "polars-arrow-format", + "regex", + "simdutf8", + "thiserror 2.0.5", +] + +[[package]] +name = "polars-expr" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8e639991a8ad4fb12880ab44bcc3cf44a5703df003142334d9caf86d77d77e7" +dependencies = [ + "ahash", + "bitflags", + "hashbrown 0.15.0", + "num-traits", + "once_cell", + "polars-arrow", + "polars-compute", + "polars-core", + "polars-io", + "polars-ops", + "polars-plan", + "polars-row", + "polars-time", + "polars-utils", + "rand", + "rayon", +] + +[[package]] +name = "polars-ffi" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a657a2cd278a9f9b40d5eedc5816d5fd0c65619ed2f53f0ff5ff4ef20916d3a8" +dependencies = [ + "polars-arrow", + "polars-core", +] + +[[package]] +name = "polars-io" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719a77e94480f6be090512da196e378cbcbeb3584c6fe1134c600aee906e38ab" +dependencies = [ + "ahash", + "async-trait", + "atoi_simd", + "bytes", + "chrono", + "fast-float2", + "futures", + "glob", + "hashbrown 0.15.0", + "home", + "itoa", + "memchr", + "memmap2", + "num-traits", + "once_cell", + "percent-encoding", + "polars-arrow", + "polars-core", + "polars-error", + "polars-json", + "polars-parquet", + "polars-schema", + "polars-time", + "polars-utils", + "pyo3", + "rayon", + 
"regex", + "ryu", + "serde", + "simdutf8", + "tokio", + "tokio-util", +] + +[[package]] +name = "polars-json" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e30603ca81e317b66b4caac683a8325a6a82ea0489685dc37e22ae03720def98" +dependencies = [ + "ahash", + "chrono", + "fallible-streaming-iterator", + "hashbrown 0.15.0", + "indexmap", + "itoa", + "num-traits", + "polars-arrow", + "polars-compute", + "polars-error", + "polars-utils", + "ryu", + "simd-json", + "streaming-iterator", +] + +[[package]] +name = "polars-lazy" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0a731a672dfc8ac38c1f73c9a4b2ae38d2fc8ac363bfb64c5f3a3e072ffc5ad" +dependencies = [ + "ahash", + "bitflags", + "chrono", + "memchr", + "once_cell", + "polars-arrow", + "polars-core", + "polars-expr", + "polars-io", + "polars-mem-engine", + "polars-ops", + "polars-pipe", + "polars-plan", + "polars-stream", + "polars-time", + "polars-utils", + "pyo3", + "rayon", + "version_check", +] + +[[package]] +name = "polars-mem-engine" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33442189bcbf2e2559aa7914db3835429030a13f4f18e43af5fba9d1b018cf12" +dependencies = [ + "memmap2", + "polars-arrow", + "polars-core", + "polars-error", + "polars-expr", + "polars-io", + "polars-ops", + "polars-plan", + "polars-time", + "polars-utils", + "pyo3", + "rayon", +] + +[[package]] +name = "polars-ops" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbb83218b0c216104f0076cd1a005128be078f958125f3d59b094ee73d78c18e" +dependencies = [ + "ahash", + "argminmax", + "base64", + "bytemuck", + "chrono", + "chrono-tz", + "either", + "hashbrown 0.15.0", + "hex", + "indexmap", + "memchr", + "num-traits", + "once_cell", + "polars-arrow", + "polars-compute", + "polars-core", + "polars-error", + "polars-schema", + "polars-utils", + "rayon", + 
"regex", + "regex-syntax", + "serde", + "strum_macros", + "unicode-normalization", + "unicode-reverse", + "version_check", +] + +[[package]] +name = "polars-parquet" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c60ee85535590a38db6c703a21be4cb25342e40f573f070d1e16f9d84a53ac7" +dependencies = [ + "ahash", + "async-stream", + "base64", + "bytemuck", + "ethnum", + "futures", + "hashbrown 0.15.0", + "num-traits", + "polars-arrow", + "polars-compute", + "polars-error", + "polars-parquet-format", + "polars-utils", + "serde", + "simdutf8", + "streaming-decompression", +] + +[[package]] +name = "polars-parquet-format" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c025243dcfe8dbc57e94d9f82eb3bef10b565ab180d5b99bed87fd8aea319ce1" +dependencies = [ + "async-trait", + "futures", +] + +[[package]] +name = "polars-pipe" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42d238fb76698f56e51ddfa89b135e4eda56a4767c6e8859eed0ab78386fcd52" +dependencies = [ + "crossbeam-channel", + "crossbeam-queue", + "enum_dispatch", + "hashbrown 0.15.0", + "num-traits", + "once_cell", + "polars-arrow", + "polars-compute", + "polars-core", + "polars-expr", + "polars-io", + "polars-ops", + "polars-plan", + "polars-row", + "polars-utils", + "rayon", + "uuid", + "version_check", +] + +[[package]] +name = "polars-plan" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f03533a93aa66127fcb909a87153a3c7cfee6f0ae59f497e73d7736208da54c" +dependencies = [ + "ahash", + "bitflags", + "bytemuck", + "bytes", + "chrono", + "chrono-tz", + "either", + "hashbrown 0.15.0", + "memmap2", + "num-traits", + "once_cell", + "percent-encoding", + "polars-arrow", + "polars-compute", + "polars-core", + "polars-io", + "polars-ops", + "polars-time", + "polars-utils", + "pyo3", + "rayon", + "recursive", + "regex", + "serde", + 
"strum_macros", + "version_check", +] + +[[package]] +name = "polars-row" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bf47f7409f8e75328d7d034be390842924eb276716d0458607be0bddb8cc839" +dependencies = [ + "bitflags", + "bytemuck", + "polars-arrow", + "polars-compute", + "polars-error", + "polars-utils", +] + +[[package]] +name = "polars-schema" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "416621ae82b84466cf4ff36838a9b0aeb4a67e76bd3065edc8c9cb7da19b1bc7" +dependencies = [ + "indexmap", + "polars-error", + "polars-utils", + "serde", + "version_check", +] + +[[package]] +name = "polars-sql" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edaab553b90aa4d6743bb538978e1982368acb58a94408d7dd3299cad49c7083" +dependencies = [ + "hex", + "polars-core", + "polars-error", + "polars-lazy", + "polars-ops", + "polars-plan", + "polars-time", + "polars-utils", + "rand", + "regex", + "serde", + "sqlparser", +] + +[[package]] +name = "polars-stream" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "498997b656c779610c1496b3d96a59fe569ef22a5b81ccfe5325cb3df8dff2fd" +dependencies = [ + "atomic-waker", + "crossbeam-deque", + "crossbeam-utils", + "futures", + "memmap2", + "parking_lot", + "pin-project-lite", + "polars-core", + "polars-error", + "polars-expr", + "polars-io", + "polars-mem-engine", + "polars-ops", + "polars-parquet", + "polars-plan", + "polars-utils", + "rand", + "rayon", + "recursive", + "slotmap", + "tokio", + "version_check", +] + +[[package]] +name = "polars-time" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d192efbdab516d28b3fab1709a969e3385bd5cda050b7c9aa9e2502a01fda879" +dependencies = [ + "atoi_simd", + "bytemuck", + "chrono", + "chrono-tz", + "now", + "num-traits", + "once_cell", + "polars-arrow", + 
"polars-compute", + "polars-core", + "polars-error", + "polars-ops", + "polars-utils", + "rayon", + "regex", + "serde", + "strum_macros", +] + +[[package]] +name = "polars-utils" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f6c8166a4a7fbc15b87c81645ed9e1f0651ff2e8c96cafc40ac5bf43441a10" +dependencies = [ + "ahash", + "bincode", + "bytemuck", + "bytes", + "compact_str", + "flate2", + "hashbrown 0.15.0", + "indexmap", + "libc", + "memmap2", + "num-traits", + "once_cell", + "polars-error", + "pyo3", + "rand", + "raw-cpuid", + "rayon", + "serde", + "serde_json", + "stacker", + "sysinfo", + "version_check", +] + +[[package]] +name = "portable-atomic" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265" + +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "psm" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa37f80ca58604976033fae9515a8a2989fc13797d953f7c04fb8fa36a11f205" +dependencies = [ + "cc", +] + +[[package]] +name = "pyo3" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57fe09249128b3173d092de9523eaa75136bf7ba85e0d69eca241c7939c933cc" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = 
"0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cd3927b5a78757a0d71aa9dff669f903b1eb64b54142a9bd9f757f8fde65fd7" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dab6bb2102bd8f991e7749f130a70d05dd557613e39ed2deeee8e9ca0c4d548d" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91871864b353fd5ffcb3f91f2f703a22a9797c91b9ab497b1acac7b07ae509c7" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43abc3b80bc20f3facd86cd3c60beed58c3e2aa26213f3cda368de39c60a27e4" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "pyo3-polars" +version = "0.20.0" +dependencies = [ + "libc", + "once_cell", + "polars", + "polars-arrow", + "polars-core", + "polars-ffi", + "polars-lazy", + "polars-plan", + "polars-utils", + "pyo3", + "pyo3-polars-derive", + "serde", + "serde-pickle", + "thiserror 1.0.63", +] + +[[package]] +name = "pyo3-polars-derive" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a247f5b03316e317f42e9a3fec4ff5b26cfa2b05fc2d9e821b7a182c82ef08f" +dependencies = [ + "polars-arrow", + "polars-core", + "polars-ffi", + "polars-plan", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand", +] + +[[package]] +name = "raw-cpuid" +version = "11.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb9ee317cfe3fbd54b36a511efc1edd42e216903c9cd575e686dd68a2ba90d8d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" 
+version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "redox_syscall" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0884ad60e090bf1345b93da0a5de8923c93884cd03f40dfcfddd3b4bee661853" +dependencies = [ + "bitflags", +] + +[[package]] +name = "ref-cast" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf0a6f84d5f1d581da8b41b47ec8600871962f2a528115b542b362d4b744931" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "regex" +version = "1.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + +[[package]] +name = "rustversion" +version = "1.0.17" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.210" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde-pickle" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c762ad136a26407c6a80825813600ceeab5e613660d93d79a41f0ec877171e71" +dependencies = [ + "byteorder", + "iter-read", + "num-bigint", + "num-traits", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.210" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.128" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "simd-json" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1df0290e9bfe79ddd5ff8798ca887cd107b75353d2957efe9777296e17f26b5" +dependencies = 
[ + "ahash", + "getrandom", + "halfbrown", + "once_cell", + "ref-cast", + "serde", + "serde_json", + "simdutf8", + "value-trait", +] + +[[package]] +name = "simdutf8" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" + +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "slotmap" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbff4acf519f630b3a3ddcfaea6c06b42174d9a44bc70c620e9ed1649d58b82a" +dependencies = [ + "version_check", +] + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "socket2" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "sqlparser" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8" +dependencies = [ + "log", +] + +[[package]] +name = "stacker" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "799c883d55abdb5e98af1a7b3f23b9b6de8ecada0ecac058672d7635eb48ca7b" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + +[[package]] +name = 
"static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "streaming-decompression" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf6cc3b19bfb128a8ad11026086e31d3ce9ad23f8ea37354b31383a187c44cf3" +dependencies = [ + "fallible-streaming-iterator", +] + +[[package]] +name = "streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" + +[[package]] +name = "strength_reduce" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" + +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + +[[package]] +name = "syn" +version = "2.0.90" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sysinfo" +version = "0.33.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fc858248ea01b66f19d8e8a6d55f41deaf91e9d495246fd01368d99935c6c01" +dependencies = [ + "core-foundation-sys", + "libc", + "memchr", + "ntapi", + "windows", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "thiserror" +version = "1.0.63" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" +dependencies = [ + "thiserror-impl 1.0.63", +] + +[[package]] +name = "thiserror" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "643caef17e3128658ff44d85923ef2d28af81bb71e0d67bbfe1d76f19a73e053" +dependencies = [ + "thiserror-impl 2.0.5", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.63" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "995d0bbc9995d1f19d28b7215a9352b0fc3cd3a2d2ec95c2cadc485cdedbcdde" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tinyvec" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "022db8904dfa342efe721985167e9fcd16c29b226db4397ed752a761cfce81e8" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.41.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "145f3413504347a2be84393cc8a7d2fb4d863b375909ea59f2158261aa258bbb" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "pin-project-lite", + "socket2", + "windows-sys 0.52.0", +] + +[[package]] +name = "tokio-util" +version = "0.7.12" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" + +[[package]] +name = "unicode-normalization" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-reverse" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b6f4888ebc23094adfb574fdca9fdc891826287a6397d2cd28802ffd6f20c76" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "unicode-segmentation" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" + +[[package]] +name = "unicode-width" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" + +[[package]] +name = "unindent" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" + +[[package]] +name = "uuid" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" +dependencies = [ + "getrandom", +] + +[[package]] +name = "value-trait" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9170e001f458781e92711d2ad666110f153e4e50bfd5cbd02db6547625714187" +dependencies = 
[ + "float-cmp", + "halfbrown", + "itoa", + "ryu", +] + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5" +dependencies = [ + "cfg-if", + "once_cell", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" + +[[package]] +name = "winapi" +version = "0.3.9" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12342cb4d8e3b046f3d80effd474a7a02447231330ef77d71daa6fbc40681143" +dependencies = [ + "windows-core 0.57.0", + "windows-targets", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-core" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-result", + "windows-targets", +] + +[[package]] +name = "windows-implement" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" +dependencies = [ + "proc-macro2", + "quote", + 
"syn", +] + +[[package]] +name = "windows-result" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e383302e8ec8515204254685643de10811af0ed97ea37210dc26fb0032647f8" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "xxhash-rust" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a5cbf750400958819fb6178eaa83bee5cd9c29a26a40cc241df8c70fdd46984" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zstd" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.13+zstd.1.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/vendor/pyo3-polars/Cargo.toml b/vendor/pyo3-polars/Cargo.toml new file mode 100644 index 0000000..6834780 --- /dev/null +++ b/vendor/pyo3-polars/Cargo.toml @@ -0,0 +1,114 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. 
+ +[package] +edition = "2021" +name = "pyo3-polars" +version = "0.20.0" +build = false +autolib = false +autobins = false +autoexamples = false +autotests = false +autobenches = false +description = "Expression plugins and PyO3 types for polars" +readme = "README.md" +license = "MIT" +repository = "https://github.com/pola-rs/pyo3-polars" + +[features] +derive = [ + "pyo3-polars-derive", + "polars-plan", + "polars-ffi", + "serde-pickle", + "serde", +] +dtype-array = ["polars/dtype-array"] +dtype-categorical = ["polars/dtype-categorical"] +dtype-decimal = ["polars/dtype-decimal"] +dtype-full = [ + "polars/dtype-full", + "dtype-decimal", + "dtype-array", + "dtype-struct", + "dtype-categorical", +] +dtype-struct = ["polars/dtype-struct"] +lazy = [ + "polars/serde-lazy", + "polars-plan", + "polars-lazy/serde", + "polars-utils", + "polars-lazy/python", +] +object = ["polars/object"] + +[lib] +name = "pyo3_polars" +path = "src/lib.rs" + +[dependencies.libc] +version = "0.2" + +[dependencies.once_cell] +version = "1" + +[dependencies.polars] +version = "0.46.0" +default-features = true + +[dependencies.polars-arrow] +version = "0.46.0" +default-features = false + +[dependencies.polars-core] +version = "0.46.0" +default-features = false + +[dependencies.polars-ffi] +version = "0.46.0" +optional = true +default-features = false + +[dependencies.polars-lazy] +version = "0.46.0" +optional = true +default-features = false + +[dependencies.polars-plan] +version = "0.46.0" +optional = true +default-features = false + +[dependencies.polars-utils] +version = "0.46.0" +features = ["serde"] +optional = true +default-features = false + +[dependencies.pyo3] +version = "0.23" + +[dependencies.pyo3-polars-derive] +version = "0.14.0" +optional = true + +[dependencies.serde] +version = "1" +optional = true + +[dependencies.serde-pickle] +version = "1" +optional = true + +[dependencies.thiserror] +version = "1" diff --git a/vendor/pyo3-polars/Cargo.toml.orig 
b/vendor/pyo3-polars/Cargo.toml.orig new file mode 100644 index 0000000..8c85f2a --- /dev/null +++ b/vendor/pyo3-polars/Cargo.toml.orig @@ -0,0 +1,37 @@ +[package] +name = "pyo3-polars" +version = "0.20.0" +edition = "2021" +license = "MIT" +readme = "../README.md" +repository = "https://github.com/pola-rs/pyo3-polars" +description = "Expression plugins and PyO3 types for polars" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +libc = "0.2" # pyo3 depends on libc already, so this does not introduce an extra dependence. +once_cell = "1" +polars = { workspace = true, default-features = true} +polars-arrow = { workspace = true, default-features = false } +polars-core = { workspace = true, default-features = false } +polars-ffi = { workspace = true, optional = true } +polars-lazy = { workspace = true, optional = true } +polars-plan = { workspace = true, optional = true } +polars-utils = {workspace = true, features = ["serde"], optional = true } +pyo3 = "0.23" +pyo3-polars-derive = { version = "0.14.0", path = "../pyo3-polars-derive", optional = true } +serde = { version = "1", optional = true } +serde-pickle = { version = "1", optional = true } +thiserror = "1" + +[features] +# Polars python is needed because all variants need to be acttivated of the DSL. 
+lazy = ["polars/serde-lazy", "polars-plan", "polars-lazy/serde", "polars-utils", "polars-lazy/python"] +derive = ["pyo3-polars-derive", "polars-plan", "polars-ffi", "serde-pickle", "serde"] +dtype-full = ["polars/dtype-full", "dtype-decimal", "dtype-array", "dtype-struct", "dtype-categorical"] +object = ["polars/object"] +dtype-decimal = ["polars/dtype-decimal"] +dtype-struct = ["polars/dtype-struct"] +dtype-array = ["polars/dtype-array"] +dtype-categorical = ["polars/dtype-categorical"] diff --git a/vendor/pyo3-polars/README.md b/vendor/pyo3-polars/README.md new file mode 100644 index 0000000..dbcd728 --- /dev/null +++ b/vendor/pyo3-polars/README.md @@ -0,0 +1,148 @@ +## 1. Shared library plugins for Polars + + + + + +Documentation for this functionality may also be found in the [Polars User Guide](https://docs.pola.rs/user-guide/expressions/plugins/). +This is new functionality and should be preferred over `2.` as this +will circumvent the GIL and will be the way we want to support extending polars. + +Parallelism and optimizations are managed by the default polars runtime. That runtime will call into the plugin function. +The plugin functions are compiled separately. + +We can therefore keep polars more lean and maybe add support for a `polars-distance`, `polars-geo`, `polars-ml`, etc. +Those can then have specialized expressions and don't have to worry as much for code bloat as they can be optionally installed. + +The idea is that you define an expression in another Rust crate with a proc_macro `polars_expr`. + +The macro may have one of the following attributes: + +- `output_type` -> to define the output type of that expression +- `output_type_func` -> to define a function that computes the output type based on input types. +- `output_type_func_with_kwargs` -> to define a function that computes the output type based on input types and keyword args. 
+ +Here is an example of a `String` conversion expression that converts any string to [pig latin](https://en.wikipedia.org/wiki/Pig_Latin): + +```rust +fn pig_latin_str(value: &str, capitalize: bool, output: &mut String) { + if let Some(first_char) = value.chars().next() { + if capitalize { + for c in value.chars().skip(1).map(|char| char.to_uppercase()) { + write!(output, "{c}").unwrap() + } + write!(output, "AY").unwrap() + } else { + let offset = first_char.len_utf8(); + write!(output, "{}{}ay", &value[offset..], first_char).unwrap() + } + } +} + +#[derive(Deserialize)] +struct PigLatinKwargs { + capitalize: bool, +} + +#[polars_expr(output_type=String)] +fn pig_latinnify(inputs: &[Series], kwargs: PigLatinKwargs) -> PolarsResult { + let ca = inputs[0].str()?; + let out: StringChunked = + ca.apply_into_string_amortized(|value, output| pig_latin_str(value, kwargs.capitalize, output)); + Ok(out.into_series()) +} +``` + +This can then be exposed on the Python side: + +```python +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl +from polars.plugins import register_plugin_function + +from expression_lib._utils import LIB + +if TYPE_CHECKING: + from expression_lib._typing import IntoExprColumn + + +def pig_latinnify(expr: IntoExprColumn, capitalize: bool = False) -> pl.Expr: + return register_plugin_function( + plugin_path=LIB, + args=[expr], + function_name="pig_latinnify", + is_elementwise=True, + kwargs={"capitalize": capitalize}, + ) +``` + +Compile/ship and then it is ready to use: + +```python +import polars as pl +from expression_lib import language + +df = pl.DataFrame({ + "names": ["Richard", "Alice", "Bob"], +}) + + +out = df.with_columns( + pig_latin = language.pig_latinnify("names") +) +``` + +Alternatively, you can [register a custom namespace](https://docs.pola.rs/py-polars/html/reference/api/polars.api.register_expr_namespace.html#polars.api.register_expr_namespace), which enables you to write: + +```python 
+out = df.with_columns( + pig_latin = pl.col("names").language.pig_latinnify() +) +``` + +See the full example in [example/derive_expression]: https://github.com/pola-rs/pyo3-polars/tree/main/example/derive_expression + +## 2. Pyo3 extensions for Polars + +See the `example` directory for a concrete example. Here we send a polars `DataFrame` to rust and then compute a +`jaccard similarity` in parallel using `rayon` and rust hash sets. + +## Run example + +`$ cd example && make install` +`$ venv/bin/python run.py` + +This will output: + +``` +shape: (2, 2) +┌───────────┬───────────────┐ +│ list_a ┆ list_b │ +│ --- ┆ --- │ +│ list[i64] ┆ list[i64] │ +╞═══════════╪═══════════════╡ +│ [1, 2, 3] ┆ [1, 2, ... 8] │ +│ [5, 5] ┆ [5, 1, 1] │ +└───────────┴───────────────┘ +shape: (2, 1) +┌─────────┐ +│ jaccard │ +│ --- │ +│ f64 │ +╞═════════╡ +│ 0.75 │ +│ 0.5 │ +└─────────┘ +``` + +## Compile for release + +`$ make install-release` + +# What to expect + +This crate offers a `PySeries` and a `PyDataFrame` which are simple wrapper around `Series` and `DataFrame`. The +advantage of these wrappers is that they can be converted to and from python as they implement `FromPyObject` and `IntoPy`. 
diff --git a/vendor/pyo3-polars/src/alloc.rs b/vendor/pyo3-polars/src/alloc.rs new file mode 100644 index 0000000..ed59c0f --- /dev/null +++ b/vendor/pyo3-polars/src/alloc.rs @@ -0,0 +1,123 @@ +use std::alloc::{GlobalAlloc, Layout, System}; +use std::ffi::c_char; + +use once_cell::race::OnceRef; +use pyo3::ffi::{PyCapsule_Import, Py_IsInitialized}; +use pyo3::Python; + +unsafe extern "C" fn fallback_alloc(size: usize, align: usize) -> *mut u8 { + System.alloc(Layout::from_size_align_unchecked(size, align)) +} + +unsafe extern "C" fn fallback_dealloc(ptr: *mut u8, size: usize, align: usize) { + System.dealloc(ptr, Layout::from_size_align_unchecked(size, align)) +} + +unsafe extern "C" fn fallback_alloc_zeroed(size: usize, align: usize) -> *mut u8 { + System.alloc_zeroed(Layout::from_size_align_unchecked(size, align)) +} + +unsafe extern "C" fn fallback_realloc( + ptr: *mut u8, + size: usize, + align: usize, + new_size: usize, +) -> *mut u8 { + System.realloc( + ptr, + Layout::from_size_align_unchecked(size, align), + new_size, + ) +} + +#[repr(C)] +struct AllocatorCapsule { + alloc: unsafe extern "C" fn(usize, usize) -> *mut u8, + dealloc: unsafe extern "C" fn(*mut u8, usize, usize), + alloc_zeroed: unsafe extern "C" fn(usize, usize) -> *mut u8, + realloc: unsafe extern "C" fn(*mut u8, usize, usize, usize) -> *mut u8, +} + +static FALLBACK_ALLOCATOR_CAPSULE: AllocatorCapsule = AllocatorCapsule { + alloc: fallback_alloc, + alloc_zeroed: fallback_alloc_zeroed, + dealloc: fallback_dealloc, + realloc: fallback_realloc, +}; + +static ALLOCATOR_CAPSULE_NAME: &[u8] = b"polars.polars._allocator\0"; + +/// A memory allocator that relays allocations to the allocator used by Polars. 
+/// +/// You can use it as the global memory allocator: +/// +/// ```rust +/// use pyo3_polars::PolarsAllocator; +/// +/// #[global_allocator] +/// static ALLOC: PolarsAllocator = PolarsAllocator::new(); +/// ``` +/// +/// If the allocator capsule (`polars.polars._allocator`) is not available, +/// this allocator fallbacks to [`std::alloc::System`]. +pub struct PolarsAllocator(OnceRef<'static, AllocatorCapsule>); + +impl PolarsAllocator { + fn get_allocator(&self) -> &'static AllocatorCapsule { + // Do not allocate in this function, + // otherwise it will cause infinite recursion. + self.0.get_or_init(|| { + let r = (unsafe { Py_IsInitialized() } != 0) + .then(|| { + Python::with_gil(|_| unsafe { + (PyCapsule_Import(ALLOCATOR_CAPSULE_NAME.as_ptr() as *const c_char, 0) + as *const AllocatorCapsule) + .as_ref() + }) + }) + .flatten(); + #[cfg(debug_assertions)] + if r.is_none() { + // Do not use eprintln; it may alloc. + let msg = b"failed to get allocator capsule\n"; + // Message length type is platform-dependent. + let msg_len = msg.len().try_into().unwrap(); + unsafe { libc::write(2, msg.as_ptr() as *const libc::c_void, msg_len) }; + } + r.unwrap_or(&FALLBACK_ALLOCATOR_CAPSULE) + }) + } + + /// Create a `PolarsAllocator`. 
+ pub const fn new() -> Self { + PolarsAllocator(OnceRef::new()) + } +} + +impl Default for PolarsAllocator { + fn default() -> Self { + Self::new() + } +} + +unsafe impl GlobalAlloc for PolarsAllocator { + #[inline] + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + (self.get_allocator().alloc)(layout.size(), layout.align()) + } + + #[inline] + unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { + (self.get_allocator().dealloc)(ptr, layout.size(), layout.align()); + } + + #[inline] + unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 { + (self.get_allocator().alloc_zeroed)(layout.size(), layout.align()) + } + + #[inline] + unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 { + (self.get_allocator().realloc)(ptr, layout.size(), layout.align(), new_size) + } +} diff --git a/vendor/pyo3-polars/src/derive.rs b/vendor/pyo3-polars/src/derive.rs new file mode 100644 index 0000000..53fc440 --- /dev/null +++ b/vendor/pyo3-polars/src/derive.rs @@ -0,0 +1,68 @@ +use polars::prelude::PolarsError; +use polars_core::error::{to_compute_err, PolarsResult}; +pub use pyo3_polars_derive::polars_expr; +use serde::Deserialize; +use std::cell::RefCell; +use std::ffi::CString; +use std::sync::atomic::{AtomicBool, Ordering}; + +/// Gives the caller extra information on how to execute the expression. +pub use polars_ffi::version_0::CallerContext; + +/// A default opaque kwargs type. +pub type DefaultKwargs = serde_pickle::Value; + +thread_local! 
{ + static LAST_ERROR: RefCell = RefCell::new(CString::default()); +} + +pub fn _parse_kwargs<'a, T>(kwargs: &'a [u8]) -> PolarsResult +where + T: Deserialize<'a>, +{ + serde_pickle::from_slice(kwargs, Default::default()).map_err(to_compute_err) +} + +pub fn _update_last_error(err: PolarsError) { + let msg = format!("{}", err); + let msg = CString::new(msg).unwrap(); + LAST_ERROR.with(|prev| *prev.borrow_mut() = msg) +} + +pub fn _set_panic() { + let msg = "PANIC"; + let msg = CString::new(msg).unwrap(); + LAST_ERROR.with(|prev| *prev.borrow_mut() = msg) +} + +#[no_mangle] +/// # Safety +/// FFI function, so unsafe +pub unsafe extern "C" fn _polars_plugin_get_last_error_message() -> *const std::os::raw::c_char { + LAST_ERROR.with(|prev| prev.borrow_mut().as_ptr()) +} + +static INIT: AtomicBool = AtomicBool::new(false); + +fn start_up_init() { + // Set a custom panic hook that only shows output if verbose. + std::panic::set_hook(Box::new(|info| { + let show_message = std::env::var("POLARS_VERBOSE").as_deref().unwrap_or("") == "1"; + if show_message { + eprintln!("{}", info) + } + })); +} + +#[no_mangle] +/// # Safety +/// FFI function, so unsafe +pub unsafe extern "C" fn _polars_plugin_get_version() -> u32 { + if !INIT.swap(true, Ordering::Relaxed) { + // Plugin version is is always called at least once. 
+ start_up_init(); + } + let (major, minor) = polars_ffi::get_version(); + // Stack bits together + ((major as u32) << 16) + minor as u32 +} diff --git a/vendor/pyo3-polars/src/error.rs b/vendor/pyo3-polars/src/error.rs new file mode 100644 index 0000000..9a55bc6 --- /dev/null +++ b/vendor/pyo3-polars/src/error.rs @@ -0,0 +1,73 @@ +use std::fmt::{Debug, Formatter}; + +use polars::prelude::PolarsError; +use pyo3::create_exception; +use pyo3::exceptions::{PyException, PyIOError, PyIndexError, PyRuntimeError, PyValueError}; +use pyo3::prelude::*; +use thiserror::Error; + +#[derive(Error)] +pub enum PyPolarsErr { + #[error(transparent)] + Polars(#[from] PolarsError), + #[error("{0}")] + Other(String), +} + +impl std::convert::From for PyErr { + fn from(err: PyPolarsErr) -> PyErr { + fn convert(err: &PolarsError) -> PyErr { + match err { + PolarsError::ComputeError(err) => ComputeError::new_err(err.to_string()), + PolarsError::NoData(err) => NoDataError::new_err(err.to_string()), + PolarsError::ShapeMismatch(err) => ShapeError::new_err(err.to_string()), + PolarsError::SchemaMismatch(err) => SchemaError::new_err(err.to_string()), + PolarsError::IO { error, .. 
} => PyIOError::new_err(error.to_string()), + PolarsError::OutOfBounds(err) => PyIndexError::new_err(err.to_string()), + PolarsError::InvalidOperation(err) => PyValueError::new_err(err.to_string()), + PolarsError::Duplicate(err) => DuplicateError::new_err(err.to_string()), + PolarsError::ColumnNotFound(err) => ColumnNotFound::new_err(err.to_string()), + PolarsError::SchemaFieldNotFound(err) => { + SchemaFieldNotFound::new_err(err.to_string()) + } + PolarsError::StructFieldNotFound(err) => { + StructFieldNotFound::new_err(err.to_string()) + } + PolarsError::StringCacheMismatch(err) => { + StringCacheMismatchError::new_err(err.to_string()) + } + PolarsError::SQLInterface(err) => SQLInterface::new_err(err.to_string()), + PolarsError::SQLSyntax(err) => SQLSyntax::new_err(err.to_string()), + PolarsError::Context { error, .. } => convert(error), + } + } + + use PyPolarsErr::*; + match &err { + Polars(err) => convert(err), + _ => PyRuntimeError::new_err(format!("{:?}", &err)), + } + } +} + +impl Debug for PyPolarsErr { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + use PyPolarsErr::*; + match self { + Polars(err) => write!(f, "{:?}", err), + Other(err) => write!(f, "BindingsError: {:?}", err), + } + } +} + +create_exception!(exceptions, ColumnNotFound, PyException); +create_exception!(exceptions, SchemaFieldNotFound, PyException); +create_exception!(exceptions, StructFieldNotFound, PyException); +create_exception!(exceptions, ComputeError, PyException); +create_exception!(exceptions, NoDataError, PyException); +create_exception!(exceptions, ShapeError, PyException); +create_exception!(exceptions, SchemaError, PyException); +create_exception!(exceptions, DuplicateError, PyException); +create_exception!(exceptions, StringCacheMismatchError, PyException); +create_exception!(exceptions, SQLInterface, PyException); +create_exception!(exceptions, SQLSyntax, PyException); diff --git a/vendor/pyo3-polars/src/export.rs b/vendor/pyo3-polars/src/export.rs new file 
mode 100644 index 0000000..76551b3 --- /dev/null +++ b/vendor/pyo3-polars/src/export.rs @@ -0,0 +1,3 @@ +pub use polars_core; +pub use polars_ffi; +pub use polars_plan; diff --git a/vendor/pyo3-polars/src/ffi/mod.rs b/vendor/pyo3-polars/src/ffi/mod.rs new file mode 100644 index 0000000..df928c0 --- /dev/null +++ b/vendor/pyo3-polars/src/ffi/mod.rs @@ -0,0 +1,2 @@ +pub(crate) mod to_py; +pub(crate) mod to_rust; diff --git a/vendor/pyo3-polars/src/ffi/to_py.rs b/vendor/pyo3-polars/src/ffi/to_py.rs new file mode 100644 index 0000000..a1bb283 --- /dev/null +++ b/vendor/pyo3-polars/src/ffi/to_py.rs @@ -0,0 +1,26 @@ +use polars_arrow::ffi; + +use polars::prelude::{ArrayRef, ArrowField}; +use pyo3::ffi::Py_uintptr_t; +use pyo3::prelude::*; + +/// Arrow array to Python. +pub(crate) fn to_py_array<'py>( + array: ArrayRef, + pyarrow: Bound<'py, PyModule>, +) -> PyResult> { + let schema = Box::new(ffi::export_field_to_c(&ArrowField::new( + "".into(), + array.dtype().clone(), + true, + ))); + let array = Box::new(ffi::export_array_to_c(array)); + + let schema_ptr: *const ffi::ArrowSchema = &*schema; + let array_ptr: *const ffi::ArrowArray = &*array; + + pyarrow.getattr("Array")?.call_method1( + "_import_arrow_from_c", + (array_ptr as Py_uintptr_t, schema_ptr as Py_uintptr_t), + ) +} diff --git a/vendor/pyo3-polars/src/ffi/to_rust.rs b/vendor/pyo3-polars/src/ffi/to_rust.rs new file mode 100644 index 0000000..c7664c1 --- /dev/null +++ b/vendor/pyo3-polars/src/ffi/to_rust.rs @@ -0,0 +1,27 @@ +use crate::error::PyPolarsErr; +use polars::prelude::*; +use polars_arrow::ffi; +use pyo3::ffi::Py_uintptr_t; +use pyo3::prelude::*; + +pub fn array_to_rust(obj: &Bound) -> PyResult { + // prepare a pointer to receive the Array struct + let array = Box::new(ffi::ArrowArray::empty()); + let schema = Box::new(ffi::ArrowSchema::empty()); + + let array_ptr = &*array as *const ffi::ArrowArray; + let schema_ptr = &*schema as *const ffi::ArrowSchema; + + // make the conversion through PyArrow's 
private API + // this changes the pointer's memory and is thus unsafe. In particular, `_export_to_c` can go out of bounds + obj.call_method1( + "_export_to_c", + (array_ptr as Py_uintptr_t, schema_ptr as Py_uintptr_t), + )?; + + unsafe { + let field = ffi::import_field_from_c(schema.as_ref()).map_err(PyPolarsErr::from)?; + let array = ffi::import_array_from_c(*array, field.dtype).map_err(PyPolarsErr::from)?; + Ok(array) + } +} diff --git a/vendor/pyo3-polars/src/lib.rs b/vendor/pyo3-polars/src/lib.rs new file mode 100644 index 0000000..65437fa --- /dev/null +++ b/vendor/pyo3-polars/src/lib.rs @@ -0,0 +1,62 @@ +//! This crate offers a [`PySeries`] and a [`PyDataFrame`] which are simple wrapper around `Series` and `DataFrame`. The +//! advantage of these wrappers is that they can be converted to and from python as they implement `FromPyObject` and `IntoPy`. +//! +//! # Example +//! +//! From `src/lib.rs`. +//! ```rust +//! # use polars::prelude::*; +//! # use pyo3::prelude::*; +//! # use pyo3_polars::PyDataFrame; +//! +//! #[pyfunction] +//! fn my_cool_function(pydf: PyDataFrame) -> PyResult { +//! let df: DataFrame = pydf.into(); +//! let df = { +//! // some work on the dataframe here +//! todo!() +//! }; +//! +//! // wrap the dataframe and it will be automatically converted to a python polars dataframe +//! Ok(PyDataFrame(df)) +//! } +//! +//! /// A Python module implemented in Rust. +//! #[pymodule] +//! fn expression_lib(_py: Python, m: &Bound) -> PyResult<()> { +//! m.add_function(wrap_pyfunction!(my_cool_function, m)?)?; +//! Ok(()) +//! } +//! ``` +//! +//! Compile your crate with `maturin` and then import from python. +//! +//! From `my_python_file.py`. +//! ```python +//! from expression_lib import my_cool_function +//! +//! df = pl.DataFrame({ +//! "foo": [1, 2, None], +//! "bar": ["a", None, "c"], +//! }) +//! out_df = my_cool_function(df) +//! 
``` +mod alloc; +#[cfg(feature = "derive")] +pub mod derive; +pub mod error; +#[cfg(feature = "derive")] +pub mod export; +mod ffi; +mod types; + +pub use crate::alloc::PolarsAllocator; +use once_cell::sync::Lazy; +use pyo3::prelude::*; +pub use types::*; + +pub(crate) static POLARS: Lazy> = + Lazy::new(|| Python::with_gil(|py| PyModule::import(py, "polars").unwrap().unbind())); + +pub(crate) static SERIES: Lazy> = + Lazy::new(|| Python::with_gil(|py| POLARS.getattr(py, "Series").unwrap())); diff --git a/vendor/pyo3-polars/src/types.rs b/vendor/pyo3-polars/src/types.rs new file mode 100644 index 0000000..483c45e --- /dev/null +++ b/vendor/pyo3-polars/src/types.rs @@ -0,0 +1,706 @@ +use std::convert::Infallible; + +use super::*; + +use crate::error::PyPolarsErr; +use crate::ffi::to_py::to_py_array; +use polars_arrow as arrow; +use polars_core::datatypes::{CompatLevel, DataType}; +use polars_core::prelude::*; +use polars_core::utils::materialize_dyn_int; +#[cfg(feature = "lazy")] +use polars_lazy::frame::LazyFrame; +#[cfg(feature = "lazy")] +use polars_plan::dsl::Expr; +#[cfg(feature = "lazy")] +use polars_plan::plans::DslPlan; +#[cfg(feature = "lazy")] +use polars_utils::pl_serialize; +use pyo3::exceptions::{PyTypeError, PyValueError}; +use pyo3::ffi::Py_uintptr_t; +use pyo3::intern; +use pyo3::prelude::*; +use pyo3::pybacked::PyBackedStr; +#[cfg(feature = "dtype-struct")] +use pyo3::types::PyList; +use pyo3::types::{PyDict, PyString}; + +#[cfg(feature = "dtype-categorical")] +pub(crate) fn get_series(obj: &Bound<'_, PyAny>) -> PyResult { + let s = obj.getattr(intern!(obj.py(), "_s"))?; + Ok(s.extract::()?.0) +} + +#[repr(transparent)] +#[derive(Debug, Clone)] +/// A wrapper around a [`Series`] that can be converted to and from python with `pyo3`. +pub struct PySeries(pub Series); + +#[repr(transparent)] +#[derive(Debug, Clone)] +/// A wrapper around a [`DataFrame`] that can be converted to and from python with `pyo3`. 
+pub struct PyDataFrame(pub DataFrame); + +#[cfg(feature = "lazy")] +#[repr(transparent)] +#[derive(Clone)] +/// A wrapper around a [`DataFrame`] that can be converted to and from python with `pyo3`. +/// # Warning +/// If the [`LazyFrame`] contains in memory data, +/// such as a [`DataFrame`] this will be serialized/deserialized. +/// +/// It is recommended to only have `LazyFrame`s that scan data +/// from disk +pub struct PyLazyFrame(pub LazyFrame); + +#[cfg(feature = "lazy")] +#[repr(transparent)] +#[derive(Clone)] +pub struct PyExpr(pub Expr); + +#[repr(transparent)] +#[derive(Clone)] +pub struct PySchema(pub SchemaRef); + +#[repr(transparent)] +#[derive(Clone)] +pub struct PyDataType(pub DataType); + +#[repr(transparent)] +#[derive(Clone, Copy)] +pub struct PyTimeUnit(TimeUnit); + +#[repr(transparent)] +#[derive(Clone)] +pub struct PyField(Field); + +impl<'py> FromPyObject<'py> for PyField { + fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult { + let py = ob.py(); + let name = ob + .getattr(intern!(py, "name"))? + .str()? + .extract::()?; + let dtype = ob.getattr(intern!(py, "dtype"))?.extract::()?; + let name: &str = name.as_ref(); + Ok(PyField(Field::new(name.into(), dtype.0))) + } +} + +impl<'py> FromPyObject<'py> for PyTimeUnit { + fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult { + let parsed = match &*ob.extract::()? 
{ + "ns" => TimeUnit::Nanoseconds, + "us" => TimeUnit::Microseconds, + "ms" => TimeUnit::Milliseconds, + v => { + return Err(PyValueError::new_err(format!( + "`time_unit` must be one of {{'ns', 'us', 'ms'}}, got {v}", + ))) + } + }; + Ok(PyTimeUnit(parsed)) + } +} + +impl<'py> IntoPyObject<'py> for PyTimeUnit { + type Target = PyString; + type Output = Bound<'py, Self::Target>; + type Error = Infallible; + + fn into_pyobject(self, py: Python<'py>) -> Result { + let time_unit = match self.0 { + TimeUnit::Nanoseconds => "ns", + TimeUnit::Microseconds => "us", + TimeUnit::Milliseconds => "ms", + }; + time_unit.into_pyobject(py) + } +} + +impl From for DataFrame { + fn from(value: PyDataFrame) -> Self { + value.0 + } +} + +impl From for Series { + fn from(value: PySeries) -> Self { + value.0 + } +} + +#[cfg(feature = "lazy")] +impl From for LazyFrame { + fn from(value: PyLazyFrame) -> Self { + value.0 + } +} + +impl From for SchemaRef { + fn from(value: PySchema) -> Self { + value.0 + } +} + +impl AsRef for PySeries { + fn as_ref(&self) -> &Series { + &self.0 + } +} + +impl AsRef for PyDataFrame { + fn as_ref(&self) -> &DataFrame { + &self.0 + } +} + +#[cfg(feature = "lazy")] +impl AsRef for PyLazyFrame { + fn as_ref(&self) -> &LazyFrame { + &self.0 + } +} + +impl AsRef for PySchema { + fn as_ref(&self) -> &Schema { + self.0.as_ref() + } +} + +impl<'a> FromPyObject<'a> for PySeries { + fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult { + let ob = ob.call_method0("rechunk")?; + + let name = ob.getattr("name")?; + let py_name = name.str()?; + let name = py_name.to_cow()?; + + // Newer Python polars versions reject integer compat_level values. + // Fall back to the default to_arrow behavior for broad compatibility. 
+ let arr = ob.call_method0("to_arrow")?; + let arr = ffi::to_rust::array_to_rust(&arr)?; + let name = name.as_ref(); + Ok(PySeries( + Series::try_from((PlSmallStr::from(name), arr)).map_err(PyPolarsErr::from)?, + )) + } +} + +impl<'a> FromPyObject<'a> for PyDataFrame { + fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult { + let series = ob.call_method0("get_columns")?; + let n = ob.getattr("width")?.extract::()?; + let mut columns = Vec::with_capacity(n); + for pyseries in series.try_iter()? { + let pyseries = pyseries?; + let s = pyseries.extract::()?.0; + columns.push(s.into_column()); + } + unsafe { + Ok(PyDataFrame(DataFrame::new_no_checks_height_from_first( + columns, + ))) + } + } +} + +#[cfg(feature = "lazy")] +impl<'a> FromPyObject<'a> for PyLazyFrame { + fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult { + let s = ob.call_method0("__getstate__")?; + let b = s.extract::>()?; + let b = b.as_bytes(); + + let lp: DslPlan = pl_serialize::SerializeOptions::default() + .deserialize_from_reader(&*b) + .map_err( + |e| PyPolarsErr::Other( + format!("Error when deserializing LazyFrame. This may be due to mismatched polars versions. {}", e) + ) + )?; + + Ok(PyLazyFrame(LazyFrame::from(lp))) + } +} + +#[cfg(feature = "lazy")] +impl<'a> FromPyObject<'a> for PyExpr { + fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult { + let s = ob.call_method0("__getstate__")?.extract::>()?; + + let e: Expr = pl_serialize::SerializeOptions::default() + .deserialize_from_reader(&*s) + .map_err( + |e| PyPolarsErr::Other( + format!("Error when deserializing 'Expr'. This may be due to mismatched polars versions. 
{}", e) + ) + )?; + + Ok(PyExpr(e)) + } +} + +impl<'py> IntoPyObject<'py> for PySeries { + type Target = PyAny; + type Output = Bound<'py, Self::Target>; + type Error = PyErr; + + fn into_pyobject(self, py: Python<'py>) -> Result { + let polars = POLARS.bind(py); + let s = SERIES.bind(py); + match s + .getattr("_import_arrow_from_c") + .or_else(|_| s.getattr("_import_from_c")) + { + // Go via polars + Ok(import_arrow_from_c) => { + // Get supported compatibility level + let compat_level = CompatLevel::with_level( + s.getattr("_newest_compat_level") + .map_or(1, |newest_compat_level| { + newest_compat_level.call0().unwrap().extract().unwrap() + }), + ) + .unwrap_or(CompatLevel::newest()); + // Prepare pointers on the heap. + let mut chunk_ptrs = Vec::with_capacity(self.0.n_chunks()); + for i in 0..self.0.n_chunks() { + let array = self.0.to_arrow(i, compat_level); + let schema = Box::new(arrow::ffi::export_field_to_c(&ArrowField::new( + "".into(), + array.dtype().clone(), + true, + ))); + let array = Box::new(arrow::ffi::export_array_to_c(array.clone())); + + let schema_ptr: *const arrow::ffi::ArrowSchema = Box::leak(schema); + let array_ptr: *const arrow::ffi::ArrowArray = Box::leak(array); + + chunk_ptrs.push((schema_ptr as Py_uintptr_t, array_ptr as Py_uintptr_t)) + } + + // Somehow we need to clone the Vec, because pyo3 doesn't accept a slice here. + let pyseries = import_arrow_from_c + .call1((self.0.name().as_str(), chunk_ptrs.clone())) + .unwrap(); + // Deallocate boxes + for (schema_ptr, array_ptr) in chunk_ptrs { + let schema_ptr = schema_ptr as *mut arrow::ffi::ArrowSchema; + let array_ptr = array_ptr as *mut arrow::ffi::ArrowArray; + unsafe { + // We can drop both because the `schema` isn't read in an owned matter on the other side. + let _ = Box::from_raw(schema_ptr); + + // The array is `ptr::read_unaligned` so there are two owners. + // We drop the box, and forget the content so the other process is the owner. 
+ let array = Box::from_raw(array_ptr); + // We must forget because the other process will call the release callback. + // Read *array as Box::into_inner + let array = *array; + std::mem::forget(array); + } + } + + Ok(pyseries) + } + // Go via pyarrow + Err(_) => { + let s = self.0.rechunk(); + let name = s.name().as_str(); + let arr = s.to_arrow(0, CompatLevel::oldest()); + let pyarrow = py.import("pyarrow").expect("pyarrow not installed"); + + let arg = to_py_array(arr, pyarrow).unwrap(); + let s = polars.call_method1("from_arrow", (arg,)).unwrap(); + let s = s.call_method1("rename", (name,)).unwrap(); + Ok(s) + } + } + } +} + +impl<'py> IntoPyObject<'py> for PyDataFrame { + type Target = PyAny; + type Output = Bound<'py, Self::Target>; + type Error = PyErr; + + fn into_pyobject(self, py: Python<'py>) -> Result { + let pyseries = self + .0 + .get_columns() + .iter() + .map(|s| PySeries(s.as_materialized_series().clone()).into_pyobject(py)) + .collect::>>()?; + + let polars = POLARS.bind(py); + polars.call_method1("DataFrame", (pyseries,)) + } +} + +#[cfg(feature = "lazy")] +impl<'py> IntoPyObject<'py> for PyLazyFrame { + type Target = PyAny; + type Output = Bound<'py, Self::Target>; + type Error = PyErr; + + fn into_pyobject(self, py: Python<'py>) -> Result { + dbg!("into py"); + let polars = POLARS.bind(py); + let cls = polars.getattr("LazyFrame")?; + let instance = cls.call_method1(intern!(py, "__new__"), (&cls,)).unwrap(); + + let buf = pl_serialize::SerializeOptions::default() + .serialize_to_bytes(&self.0.logical_plan) + .unwrap(); + instance.call_method1("__setstate__", (&buf,))?; + Ok(instance) + } +} + +#[cfg(feature = "lazy")] +impl<'py> IntoPyObject<'py> for PyExpr { + type Target = PyAny; + type Output = Bound<'py, Self::Target>; + type Error = PyErr; + + fn into_pyobject(self, py: Python<'py>) -> Result { + let polars = POLARS.bind(py); + let cls = polars.getattr("Expr")?; + let instance = cls.call_method1(intern!(py, "__new__"), (&cls,))?; + + let 
buf = pl_serialize::SerializeOptions::default() + .serialize_to_bytes(&self.0) + .unwrap(); + + instance + .call_method1("__setstate__", (&buf,)) + .map_err(|err| { + let msg = format!("deserialization failed: {err}"); + PyValueError::new_err(msg) + }) + } +} + +#[cfg(feature = "dtype-categorical")] +pub(crate) fn to_series(py: Python, s: PySeries) -> PyObject { + let series = SERIES.bind(py); + let constructor = series + .getattr(intern!(series.py(), "_from_pyseries")) + .unwrap(); + constructor + .call1((s,)) + .unwrap() + .into_pyobject(py) + .unwrap() + .into() +} + +impl<'py> IntoPyObject<'py> for PyDataType { + type Target = PyAny; + type Output = Bound<'py, Self::Target>; + type Error = PyErr; + + fn into_pyobject(self, py: Python<'py>) -> Result { + let pl = POLARS.bind(py); + + match &self.0 { + DataType::Int8 => { + let class = pl.getattr(intern!(py, "Int8")).unwrap(); + class.call0() + } + DataType::Int16 => { + let class = pl.getattr(intern!(py, "Int16")).unwrap(); + class.call0() + } + DataType::Int32 => { + let class = pl.getattr(intern!(py, "Int32")).unwrap(); + class.call0() + } + DataType::Int64 => { + let class = pl.getattr(intern!(py, "Int64")).unwrap(); + class.call0() + } + DataType::UInt8 => { + let class = pl.getattr(intern!(py, "UInt8")).unwrap(); + class.call0() + } + DataType::UInt16 => { + let class = pl.getattr(intern!(py, "UInt16")).unwrap(); + class.call0() + } + DataType::UInt32 => { + let class = pl.getattr(intern!(py, "UInt32")).unwrap(); + class.call0() + } + DataType::UInt64 => { + let class = pl.getattr(intern!(py, "UInt64")).unwrap(); + class.call0() + } + DataType::Float32 => { + let class = pl.getattr(intern!(py, "Float32")).unwrap(); + class.call0() + } + DataType::Float64 | DataType::Unknown(UnknownKind::Float) => { + let class = pl.getattr(intern!(py, "Float64")).unwrap(); + class.call0() + } + #[cfg(feature = "dtype-decimal")] + DataType::Decimal(precision, scale) => { + let class = pl.getattr(intern!(py, 
"Decimal")).unwrap(); + let args = (*precision, *scale); + class.call1(args) + } + DataType::Boolean => { + let class = pl.getattr(intern!(py, "Boolean")).unwrap(); + class.call0() + } + DataType::String | DataType::Unknown(UnknownKind::Str) => { + let class = pl.getattr(intern!(py, "String")).unwrap(); + class.call0() + } + DataType::Binary => { + let class = pl.getattr(intern!(py, "Binary")).unwrap(); + class.call0() + } + #[cfg(feature = "dtype-array")] + DataType::Array(inner, size) => { + let class = pl.getattr(intern!(py, "Array")).unwrap(); + let inner = PyDataType(*inner.clone()).into_pyobject(py)?; + let args = (inner, *size); + class.call1(args) + } + DataType::List(inner) => { + let class = pl.getattr(intern!(py, "List")).unwrap(); + let inner = PyDataType(*inner.clone()).into_pyobject(py)?; + class.call1((inner,)) + } + DataType::Date => { + let class = pl.getattr(intern!(py, "Date")).unwrap(); + class.call0() + } + DataType::Datetime(tu, tz) => { + let datetime_class = pl.getattr(intern!(py, "Datetime")).unwrap(); + datetime_class.call1((tu.to_ascii(), tz.as_ref().map(|s| s.as_str()))) + } + DataType::Duration(tu) => { + let duration_class = pl.getattr(intern!(py, "Duration")).unwrap(); + duration_class.call1((tu.to_ascii(),)) + } + #[cfg(feature = "object")] + DataType::Object(_, _) => { + let class = pl.getattr(intern!(py, "Object")).unwrap(); + class.call0() + } + #[cfg(feature = "dtype-categorical")] + DataType::Categorical(_, ordering) => { + let class = pl.getattr(intern!(py, "Categorical")).unwrap(); + let ordering = match ordering { + CategoricalOrdering::Physical => "physical", + CategoricalOrdering::Lexical => "lexical", + }; + class.call1((ordering,)) + } + #[cfg(feature = "dtype-categorical")] + DataType::Enum(rev_map, _) => { + // we should always have an initialized rev_map coming from rust + let categories = rev_map.as_ref().unwrap().get_categories(); + let class = pl.getattr(intern!(py, "Enum")).unwrap(); + let s = 
Series::from_arrow("category".into(), categories.clone().boxed()).unwrap(); + let series = to_series(py, PySeries(s)); + return class.call1((series,)); + } + DataType::Time => pl.getattr(intern!(py, "Time")), + #[cfg(feature = "dtype-struct")] + DataType::Struct(fields) => { + let field_class = pl.getattr(intern!(py, "Field")).unwrap(); + let iter = fields + .iter() + .map(|fld| { + let name = fld.name().as_str(); + let dtype = PyDataType(fld.dtype().clone()).into_pyobject(py)?; + field_class.call1((name, dtype)) + }) + .collect::>>()?; + let fields = PyList::new(py, iter)?; + let struct_class = pl.getattr(intern!(py, "Struct")).unwrap(); + struct_class.call1((fields,)) + } + DataType::Null => { + let class = pl.getattr(intern!(py, "Null")).unwrap(); + class.call0() + } + DataType::Unknown(UnknownKind::Int(v)) => { + PyDataType(materialize_dyn_int(*v).dtype()).into_pyobject(py) + } + DataType::Unknown(_) => { + let class = pl.getattr(intern!(py, "Unknown")).unwrap(); + class.call0() + } + DataType::BinaryOffset => { + panic!("this type isn't exposed to python") + } + #[allow(unreachable_patterns)] + _ => panic!("activate dtype"), + } + } +} + +impl<'py> IntoPyObject<'py> for PySchema { + type Target = PyDict; + type Output = Bound<'py, Self::Target>; + type Error = PyErr; + + fn into_pyobject(self, py: Python<'py>) -> Result { + let dict = PyDict::new(py); + for (k, v) in self.0.iter() { + dict.set_item(k.as_str(), PyDataType(v.clone()).into_pyobject(py)?)?; + } + Ok(dict) + } +} + +impl<'py> FromPyObject<'py> for PyDataType { + fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult { + let py = ob.py(); + let type_name = ob.get_type().qualname()?.to_string(); + + let dtype = match type_name.as_ref() { + "DataTypeClass" => { + // just the class, not an object + let name = ob + .getattr(intern!(py, "__name__"))? + .str()? 
+ .extract::()?; + match &*name { + "Int8" => DataType::Int8, + "Int16" => DataType::Int16, + "Int32" => DataType::Int32, + "Int64" => DataType::Int64, + "UInt8" => DataType::UInt8, + "UInt16" => DataType::UInt16, + "UInt32" => DataType::UInt32, + "UInt64" => DataType::UInt64, + "Float32" => DataType::Float32, + "Float64" => DataType::Float64, + "Boolean" => DataType::Boolean, + "String" => DataType::String, + "Binary" => DataType::Binary, + #[cfg(feature = "dtype-categorical")] + "Categorical" => DataType::Categorical(None, Default::default()), + #[cfg(feature = "dtype-categorical")] + "Enum" => DataType::Enum(None, Default::default()), + "Date" => DataType::Date, + "Time" => DataType::Time, + "Datetime" => DataType::Datetime(TimeUnit::Microseconds, None), + "Duration" => DataType::Duration(TimeUnit::Microseconds), + #[cfg(feature = "dtype-decimal")] + "Decimal" => DataType::Decimal(None, None), // "none" scale => "infer" + "List" => DataType::List(Box::new(DataType::Null)), + #[cfg(feature = "dtype-array")] + "Array" => DataType::Array(Box::new(DataType::Null), 0), + #[cfg(feature = "dtype-struct")] + "Struct" => DataType::Struct(vec![]), + "Null" => DataType::Null, + #[cfg(feature = "object")] + "Object" => todo!(), + "Unknown" => DataType::Unknown(Default::default()), + dt => { + return Err(PyTypeError::new_err(format!( + "'{dt}' is not a Polars data type, or the plugin isn't compiled with the right features", + ))) + }, + } + }, + "Int8" => DataType::Int8, + "Int16" => DataType::Int16, + "Int32" => DataType::Int32, + "Int64" => DataType::Int64, + "UInt8" => DataType::UInt8, + "UInt16" => DataType::UInt16, + "UInt32" => DataType::UInt32, + "UInt64" => DataType::UInt64, + "Float32" => DataType::Float32, + "Float64" => DataType::Float64, + "Boolean" => DataType::Boolean, + "String" => DataType::String, + "Binary" => DataType::Binary, + #[cfg(feature = "dtype-categorical")] + "Categorical" => { + let ordering = ob.getattr(intern!(py, "ordering")).unwrap(); + let 
ordering = ordering.extract::()?; + let ordering = match ordering.as_bytes() { + b"physical" => CategoricalOrdering::Physical, + b"lexical" => CategoricalOrdering::Lexical, + ordering => { + let ordering = std::str::from_utf8(ordering).unwrap(); + return Err(PyValueError::new_err(format!("invalid ordering argument: {ordering}"))) + } + }; + + DataType::Categorical(None, ordering) + }, + #[cfg(feature = "dtype-categorical")] + "Enum" => { + let categories = ob.getattr(intern!(py, "categories")).unwrap(); + let s = get_series(&categories.as_borrowed())?; + let ca = s.str().map_err(PyPolarsErr::from)?; + let categories = ca.downcast_iter().next().unwrap().clone(); + DataType::Enum(Some(Arc::new(RevMapping::build_local(categories))), Default::default()) + }, + "Date" => DataType::Date, + "Time" => DataType::Time, + "Datetime" => { + let time_unit = ob.getattr(intern!(py, "time_unit")).unwrap(); + let time_unit = time_unit.extract::()?.0; + let time_zone = ob.getattr(intern!(py, "time_zone")).unwrap(); + let time_zone: Option = time_zone.extract()?; + DataType::Datetime(time_unit, time_zone.map(PlSmallStr::from)) + }, + "Duration" => { + let time_unit = ob.getattr(intern!(py, "time_unit")).unwrap(); + let time_unit = time_unit.extract::()?.0; + DataType::Duration(time_unit) + }, + #[cfg(feature = "dtype-decimal")] + "Decimal" => { + let precision = ob.getattr(intern!(py, "precision"))?.extract()?; + let scale = ob.getattr(intern!(py, "scale"))?.extract()?; + DataType::Decimal(precision, Some(scale)) + }, + "List" => { + let inner = ob.getattr(intern!(py, "inner")).unwrap(); + let inner = inner.extract::()?; + DataType::List(Box::new(inner.0)) + }, + #[cfg(feature = "dtype-array")] + "Array" => { + let inner = ob.getattr(intern!(py, "inner")).unwrap(); + let size = ob.getattr(intern!(py, "size")).unwrap(); + let inner = inner.extract::()?; + let size = size.extract::()?; + DataType::Array(Box::new(inner.0), size) + }, + #[cfg(feature = "dtype-struct")] + "Struct" => { + 
let fields = ob.getattr(intern!(py, "fields"))?; + let fields = fields + .extract::>()? + .into_iter() + .map(|f| f.0) + .collect::>(); + DataType::Struct(fields) + }, + "Null" => DataType::Null, + #[cfg(feature = "object")] + "Object" => panic!("object not supported"), + "Unknown" => DataType::Unknown(Default::default()), + dt => { + return Err(PyTypeError::new_err(format!( + "'{dt}' is not a Polars data type, or the plugin isn't compiled with the right features", + ))) + }, + }; + Ok(PyDataType(dtype)) + } +}