diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8bb8be9..ba277fa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,8 +1,8 @@ # InQL CI — Incan library package # -# The compiler is built from the Incan repository on each run so this package tracks the default branch of the toolchain -# (no pre-published `incan` binary required). -# FIXME: once incan has the ability, switch to a published `incan` binary. +# Uses the reusable Incan composite action from the incan repository. +# This eliminates copy-paste drift and provides a supported integration path. +# The composite action caches built binaries for faster subsequent runs. name: CI @@ -21,9 +21,6 @@ env: CARGO_TERM_COLOR: always RUST_BACKTRACE: 1 INCAN_NO_BANNER: 1 - # Pin the compiler checkout; bump when you intentionally require a newer Incan. - INCAN_REPO: dannys-code-corner/incan - INCAN_REF: main jobs: inql: @@ -38,26 +35,12 @@ jobs: - name: Check out InQL uses: actions/checkout@v4 - - name: Check out Incan (toolchain) - uses: actions/checkout@v4 - with: - repository: ${{ env.INCAN_REPO }} - ref: ${{ env.INCAN_REF }} - path: incan - - - uses: dtolnay/rust-toolchain@stable - - - name: Cache Incan build - uses: Swatinem/rust-cache@v2 + - name: Install Incan (cached) + uses: dannys-code-corner/incan/.github/actions/install-incan@main with: - workspaces: incan - - - name: Build Incan - working-directory: incan - run: cargo build --release - - - name: Add Incan to PATH - run: echo "${{ github.workspace }}/incan/target/release" >> "$GITHUB_PATH" + incan-ref: main + incan-repo: dannys-code-corner/incan + runner-os: ${{ matrix.os }} - name: Show toolchain run: | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 68fdf56..e291ebe 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -20,7 +20,7 @@ Thank you for your interest in InQL — the typed relational layer for [Incan][i ## Getting started 1. 
**Install a matching Incan toolchain** - Build or install `incan` so it is on your `PATH` (start from the [Incan repository][incan-repo] and its contributor docs). + Build or install `incan` so it is on your `PATH` (start from the [Incan repository][incan-repo] and its contributor docs). The CI uses a **reusable composite action** that caches built binaries for faster subsequent runs. 2. **Clone this repository** @@ -110,8 +110,8 @@ Open an issue on this repository for InQL-specific design or package questions; -[incan-repo]: https://github.com/dannys-code-corner/incan-programming-language -[incan-contributing]: https://github.com/dannys-code-corner/incan-programming-language/blob/main/CONTRIBUTING.md +[incan-repo]: https://github.com/dannys-code-corner/incan +[incan-contributing]: https://github.com/dannys-code-corner/incan/blob/main/CONTRIBUTING.md [readme]: README.md [agents]: AGENTS.md [architecture]: docs/architecture.md @@ -122,5 +122,5 @@ Open an issue on this repository for InQL-specific design or package questions; [issue-auto-label]: .github/workflows/issue_auto_label.yml [pr-template]: .github/pull_request_template.md [issue-templates]: .github/ISSUE_TEMPLATE/ -[incan-docsite-loop]: https://github.com/dannys-code-corner/incan-programming-language/blob/main/workspaces/docs-site/docs/contributing/tutorials/book/08_docsite_contributor_loop.md -[incan-agents-docs-workflow]: https://github.com/dannys-code-corner/incan-programming-language/blob/main/AGENTS.md#docs-site-workflow-mkdocs-material +[incan-docsite-loop]: https://github.com/dannys-code-corner/incan/blob/main/workspaces/docs-site/docs/contributing/tutorials/book/08_docsite_contributor_loop.md +[incan-agents-docs-workflow]: https://github.com/dannys-code-corner/incan/blob/main/AGENTS.md#docs-site-workflow-mkdocs-material diff --git a/README.md b/README.md index 8d43ce9..5cef0c1 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ Normative proposals live under 
**[docs/rfcs/](docs/rfcs/README.md)**. InQL’s R - `src/lib.incn` — public exports - `src/` — library modules - `tests/` — tests -- `.github/workflows/` — CI (builds Incan, then `make ci`) +- `.github/workflows/` — CI (uses reusable Incan composite action for caching) Build and test from this repo root (with `incan` on your `PATH`): diff --git a/docs/README.md b/docs/README.md index 8c0280b..5f2c421 100644 --- a/docs/README.md +++ b/docs/README.md @@ -4,7 +4,6 @@ Design records for the InQL project live in this directory. - **RFCs (design proposals):** [docs/rfcs/](rfcs/README.md) — see the index for current numbered RFCs - **How to write an RFC:** [contributing/writing_rfcs.md](contributing/writing_rfcs.md) -- **Research (non-normative):** [__research__](../__research__/README.md) When a standalone docs site (e.g. **MkDocs Material**) is added, use `docs/` as the content root. **Conventions** (Divio-style structure, prose without hard wrapping, `mkdocs build --strict`, snippets) align with the Incan project: see [Incan docs-site contributor loop][incan-docsite-loop] and [Incan AGENTS — Docs-site workflow][incan-agents-docs-workflow]. @@ -12,5 +11,5 @@ When a standalone docs site (e.g. 
**MkDocs Material**) is added, use `docs/` as -[incan-docsite-loop]: https://github.com/dannys-code-corner/incan-programming-language/blob/main/workspaces/docs-site/docs/contributing/tutorials/book/08_docsite_contributor_loop.md -[incan-agents-docs-workflow]: https://github.com/dannys-code-corner/incan-programming-language/blob/main/AGENTS.md#docs-site-workflow-mkdocs-material +[incan-docsite-loop]: https://github.com/dannys-code-corner/incan/blob/main/workspaces/docs-site/docs/contributing/tutorials/book/08_docsite_contributor_loop.md +[incan-agents-docs-workflow]: https://github.com/dannys-code-corner/incan/blob/main/AGENTS.md#docs-site-workflow-mkdocs-material diff --git a/docs/architecture.md b/docs/architecture.md index 3f0d00b..a78c7a1 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -1,112 +1,220 @@ # InQL architecture -This document describes how the **InQL** project is structured and how it relates to the **Incan** compiler. It is modeled on Incan’s architecture overview but scoped to this repository and the relational **data logic** layer (not orchestration or engine-specific runtime in the authoring model). +This document describes the architectural model of **InQL**. It is scoped to the InQL repository and its relationship to the Incan compiler, not to product orchestration or engine-specific operational concerns. ## What InQL is InQL is two things that evolve together: -1. **A specification** — Normative design under [docs/rfcs/][inql-rfcs]: language surface and naming, dataset types (including bounded vs unbounded carriers), portable logical plans (Substrait), `query { }` authoring, execution context (session and I/O), and (later) optional pipe-forward. **InQL v0.1** is scoped through execution context; pipe-forward is specified for alignment but not part of that release. -2. **An Incan library package** — `.incn` sources built with `incan build --lib`, published as a dependency for Incan programs. +1. 
**A specification** — Normative design under [docs/rfcs/][inql-rfcs]: naming and core semantics, dataset carriers, Substrait emission, query authoring, the execution boundary, and the internal planning substrate. +2. **An Incan library package** — `.incn` modules built with `incan build --lib`, consumed by Incan programs as a typed relational package. -The **compiler** that parses, typechecks, and lowers InQL syntax into plans or Rust lives in the [Incan repository][incan-repo]. This repo holds the **author-facing package** and the **normative design docs** that implementation work should follow. +The Incan compiler remains responsible for parsing, typechecking, lowering, and Rust/code generation. The InQL repo holds the author-facing package and the RFCs that define what that package is supposed to mean. -## High-level placement +## Architectural model - +InQL is organized around three layers: -```text -┌─────────────────────────────────────────────────────────────────────────────┐ -│ InQL repo (this project) │ -├─────────────────────────────────────────────────────────────────────────────┤ -│ docs/rfcs/ ──► Normative design (numbered proposals + index) │ -│ src/*.incn ──► Library surface (exports, helpers, version) │ -│ tests/ ──► Package tests │ -└─────────────────────────────────────────────────────────────────────────────┘ - │ - │ informs & consumed by - ▼ -┌─────────────────────────────────────────────────────────────────────────────┐ -│ Incan compiler (separate repo) │ -├─────────────────────────────────────────────────────────────────────────────┤ -│ Lexer / Parser / AST ──► Typechecker ──► Lowering ──► Rust / plans │ -│ (relational syntax, models, and `DataSet` types are checked here) │ -└─────────────────────────────────────────────────────────────────────────────┘ -``` +- **Prism internally** — the immutable planning and optimization engine +- **Substrait at the boundary** — the normative emitted logical interchange contract +- **Session for execution** 
— the execution/binding layer that consumes plans but does not define them + +That gives each major concept one job: + +- **Prism** thinks about the plan +- **Substrait** communicates the plan +- **Session** executes the plan -## Intended language pipeline (relational surface) +This separation is important because it keeps internal planning concerns, portable interchange semantics, and runtime execution concerns from collapsing into one another. -The compiler implements this story over time. Conceptually: +## Conceptual pipeline + +InQL is intended to follow this shape: ```text -Incan models (row types) +Incan models / model-derived schema │ ▼ - DataSet[T] carriers ◄── programmatic API (bounded vs unbounded traits) - │ - ├──► query { } blocks - │ │ - │ └──► Substrait-shaped logical plan - │ │ - │ └──► Session: bind, execute, read/write + DataSet[T] carriers │ - └──► optional pipe-forward (later release; same naming core as blocks/chains) + ├──► method chains + ├──► query { } blocks + └──► future pipe-forward / other authoring surfaces + │ + ▼ + Prism logical planning substrate + │ + ├──► authored plan state + ├──► lineage-preserving optimization + └──► optimized logical view + │ + ▼ + Substrait Plan / Rel emission + │ + ▼ + Session / backend execution ``` -Across blocks, method chains, and (when present) pipe-forward, authors share one notion of **naming** and **query schema evolution**—including forms like `.column`, `alias.column`, bare names in the query schema, and outer bindings—plus clear **layer boundaries** between data logic and execution. Precise rules live in the specification documents under [docs/rfcs/][inql-rfcs]. 
+The core rule is: + +- authoring surfaces build or manipulate Prism-managed logical work +- Prism prepares that work for boundary emission +- RFC 002 owns the Substrait contract +- RFC 004 owns execution and binding + +## Layer responsibilities + +### Carriers + +The author-facing carrier family is rooted in `DataSet[T]` and currently includes `LazyFrame[T]`, `DataFrame[T]`, and `DataStream[T]`. + +Carriers are expected to be: + +- typed by model-derived schema information +- immutable from the author’s point of view +- cheap to branch +- execution-neutral on their own + +They should be understood as **experiences over shared Prism-managed planning state**, not as independent semantic systems. + +### Prism + +Per [RFC 007][rfc-007], Prism is InQL’s internal logical planning and optimization engine. + +Prism is responsible for: + +- persistent logical plan storage +- cheap branching through structural sharing +- lineage preservation +- logical rewrites and optimization before boundary emission or execution + +Prism is **not** the normative interchange format and **not** the execution engine. + +### Substrait + +Per [RFC 002][rfc-002], Apache Substrait is the normative logical interchange boundary for InQL. + +That means: + +- portable relational work must be expressible as Substrait `Plan` / `Rel` +- logical reads remain logical at the boundary +- extension and gap handling are documented at the Substrait boundary +- internal planning freedom is allowed, but emitted plans must follow RFC 002 + +Today, the package’s RFC 002-facing code lives primarily in: + +- [plan.incn](../src/substrait/plan.incn) +- [conformance.incn](../src/substrait/conformance.incn) +- [schema.incn](../src/substrait/schema.incn) + +### Session + +Per [RFC 004][rfc-004], `Session` / `SessionContext` own binding and execution. 
+ +Session is responsible for: + +- resolving logical reads to physical resources +- applying backend-specific execution behavior +- collecting or materializing results +- writing to sinks where appropriate + +Session is intentionally outside RFC 002’s normative emitted contract. It consumes plans; it does not define plan semantics. + +## Current implementation + +The current state of the repository is: + +- author-facing carrier types exist in [mod.incn](../src/dataset/mod.incn) +- canonical relational operator helpers exist in [ops.incn](../src/dataset/ops.incn) +- the package’s RFC 002 code path emits **real proto-backed Substrait plans** +- conformance scenarios are represented as typed package code in [conformance.incn](../src/substrait/conformance.incn) +- Prism is specified as the internal planning substrate, but its specification is ahead of the current package code: parts of the full implementation are not yet in place + +This means the package has a concrete Substrait boundary and conformance layer, while some internal planning mechanics remain transitional. 
## Repository layout -| Path | Role | -| ---------------------- | --------------------------------------------------- | -| `incan.toml` | Package metadata (`name`, `version`) | -| `src/lib.incn` | Public module: re-exports and package docs | -| `src/*.incn` | Library implementation modules | -| `tests/` | `incan test` targets | -| `docs/rfcs/` | Specification index and individual proposals | -| `docs/architecture.md` | This document: repo placement vs the Incan compiler | -| `docs/README.md` | Pointer into documentation | +| Path | Role | +| --------------------------------- | ------------------------------------------------- | +| `incan.toml` | Package metadata and Rust dependency declarations | +| `src/lib.incn` | Public package exports | +| `src/dataset/mod.incn` | Carrier types and trait surface | +| `src/dataset/ops.incn` | Canonical relational operator helpers | +| `src/substrait/plan.incn` | RFC 002 proto-backed Substrait emission helpers | +| `src/substrait/conformance.incn` | Typed conformance corpus and validation helpers | +| `src/substrait/schema.incn` | Model/schema to Substrait type bridging | +| `tests/` | Package tests run through `incan test` | +| `docs/rfcs/` | Normative RFC series | +| `docs/architecture.md` | This overview | -Normative behavior is defined in the **RFC series**, not only in code. If code and a spec disagree, treat it as a bug unless the document is explicitly superseded. +Normative behavior lives in the RFC series first. If code and RFCs disagree, treat that as a bug or transition state to resolve explicitly. -## Build and test (this package) +## Repository vs compiler -From the repo root, with `incan` on your `PATH`: +The InQL repository and the Incan compiler have different responsibilities. 
```text -make ci - │ - └──► incan fmt --check (package dirs) → incan build --lib → incan test tests +┌─────────────────────────────────────────────────────────────────────────────┐ +│ InQL repo │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ RFCs, package modules, tests, architecture, conformance corpus │ +│ Defines the relational package surface and its normative contracts │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + │ implemented through + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ Incan compiler │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ Parsing, typechecking, lowering, Rust emission, LSP, test runner, builds │ +│ Makes InQL package code executable and eventually supports new surfaces │ +└─────────────────────────────────────────────────────────────────────────────┘ ``` -Equivalent raw commands: +That distinction matters because some InQL architecture is specified before the compiler fully supports every intended implementation path. Prism is a good example: the planning boundary is specified even where current compiler and tooling constraints still force temporary implementation compromises. + +## Build and test + +From the repo root, with `incan` on `PATH`: ```text incan build --lib - │ - └──► Incan frontend (parse, check, …) + backend emit a Rust crate for the library - (same staged pipeline as application builds; see [Incan architecture docs][incan-architecture]) - -incan test tests - │ - └──► Discover and run tests under `tests/` only (not the whole repo; CI may have `./incan/` checked out) +incan test ``` -**GitHub Actions** does not assume a preinstalled `incan` binary: the workflow checks out the [Incan compiler repository][incan-repo], runs `cargo build --release`, adds `target/release` to `PATH`, then runs `make ci` in this tree. 
+In practice: -For **stage-by-stage** debugging (`--parse`, `--check`, `--emit-rust`, etc.), use the Incan CLI options documented in the Incan project. +- `incan build --lib` parses, typechecks, lowers, and emits a Rust crate for the InQL library +- `incan test` discovers and runs tests under `tests/` -## Where to read more +CI builds `incan` first, then runs the InQL package checks against that compiler. + +## Reading order + +If you want the clearest architecture story, read in this order: -| Topic | Location | -| --------------------------------- | --------------------------------------------------------------------- | -| Full compiler module map, IR, LSP | [Incan compiler architecture][incan-architecture] (in the Incan repo) | -| InQL specification | [docs/rfcs/][inql-rfcs] | -| Contributing | [CONTRIBUTING.md][inql-contributing] | +1. [RFC 001][rfc-001] — carrier semantics +2. [RFC 002][rfc-002] — Substrait boundary +3. [RFC 004][rfc-004] — execution boundary +4. [RFC 007][rfc-007] — Prism internal planning substrate + +That sequence mirrors the intended separation between authoring surface, interchange, execution, and internal planning. 
+ +## Where to read more - +| Topic | Location | +| --------------------------- | --------------------------------------------- | +| InQL RFC index | [docs/rfcs/README.md][inql-rfcs] | +| Prism planning engine | [RFC 007][rfc-007] | +| Substrait integration | [RFC 002][rfc-002] | +| Execution context | [RFC 004][rfc-004] | +| Incan compiler architecture | [Incan architecture docs][incan-architecture] | +| Contributing | [CONTRIBUTING.md][inql-contributing] | -[incan-repo]: https://github.com/dannys-code-corner/incan-programming-language -[incan-architecture]: https://github.com/dannys-code-corner/incan-programming-language/blob/main/workspaces/docs-site/docs/contributing/explanation/architecture.md +[incan-architecture]: https://github.com/dannys-code-corner/incan/blob/main/workspaces/docs-site/docs/contributing/explanation/architecture.md [inql-rfcs]: rfcs/README.md -[inql-contributing]: ../CONTRIBUTING.md \ No newline at end of file +[inql-contributing]: ../CONTRIBUTING.md +[rfc-001]: rfcs/001_inql_dataset.md +[rfc-002]: rfcs/002_apache_substrait_integration.md +[rfc-004]: rfcs/004_inql_execution_context.md +[rfc-007]: rfcs/007_prism_planning_engine.md diff --git a/docs/contributing/writing_rfcs.md b/docs/contributing/writing_rfcs.md index bcd96f1..7db4c3e 100644 --- a/docs/contributing/writing_rfcs.md +++ b/docs/contributing/writing_rfcs.md @@ -2,7 +2,7 @@ This guide is for contributors writing an RFC (design record) in the **InQL** repository. -RFC means “Request for Comments”: a normative design document under [`docs/rfcs/`](../rfcs/README.md), numbered separately from [Incan language RFCs](https://github.com/dannys-code-corner/incan-programming-language/tree/main/workspaces/docs-site/docs/RFCs). +RFC means “Request for Comments”: a normative design document under [`docs/rfcs/`](../rfcs/README.md), numbered separately from [Incan language RFCs](https://github.com/dannys-code-corner/incan/tree/main/workspaces/docs-site/docs/RFCs). !!! 
warning "Before you start" @@ -74,7 +74,7 @@ When superseding or rejecting, update the status line (for example `Superseded b - Write **reference-level** sections so an implementer could build to them. - Call out **non-goals** explicitly. - If the design is too large, split into a sequence of smaller RFCs with clear **Related** links. -- **Normative rules** must live in the RFC text (or this repo’s public docs), not in `__research__/` or private trees. +- **Normative rules** must live in the RFC text (or this repo’s public docs), not in private notes or internal-only trees. ## Compiler and tooling work @@ -87,5 +87,5 @@ Narrative docs and RFCs in this repo should stay compatible with how the Incan p - [Incan docs-site contributor loop][incan-docsite-loop] — Divio quadrants, snippets, PR checklist for the Incan docs site. - [Incan AGENTS.md — Docs-site workflow][incan-agents-docs-workflow] — prose without hard wrapping, `mkdocs build --strict`, and related expectations. -[incan-docsite-loop]: https://github.com/dannys-code-corner/incan-programming-language/blob/main/workspaces/docs-site/docs/contributing/tutorials/book/08_docsite_contributor_loop.md -[incan-agents-docs-workflow]: https://github.com/dannys-code-corner/incan-programming-language/blob/main/AGENTS.md#docs-site-workflow-mkdocs-material +[incan-docsite-loop]: https://github.com/dannys-code-corner/incan/blob/main/workspaces/docs-site/docs/contributing/tutorials/book/08_docsite_contributor_loop.md +[incan-agents-docs-workflow]: https://github.com/dannys-code-corner/incan/blob/main/AGENTS.md#docs-site-workflow-mkdocs-material diff --git a/docs/language/reference/substrait/conformance.md b/docs/language/reference/substrait/conformance.md new file mode 100644 index 0000000..35adfec --- /dev/null +++ b/docs/language/reference/substrait/conformance.md @@ -0,0 +1,78 @@ +# Substrait conformance corpus (Reference) + +This page documents where InQL's Substrait conformance scenarios live and how they are represented. 
The normative Substrait contract still lives in [InQL RFC 002][rfc-002], with operator-level mappings in the [Substrait operator catalog][ref-operator-catalog]. + +The corpus is the machine-readable validation layer for the RFC 002 v1 implementation profile ("v1 implementation profile (InQL code path)"). + +## Source of truth + +The canonical conformance corpus is implemented in InQL package code: + +- `src/substrait/conformance.incn` + +The corpus uses typed models/enums (`SubstraitConformanceScenario`, `ConformanceStatus`, `ConformanceRel`, and related enums) for machine-readable contracts, and uses module/API docstrings for the human-readable contract. + +Canonical operation semantics flow through `src/dataset/ops.incn`, while proto-backed Substrait emission and plan inspection live in `src/substrait/plan.incn`. + +For the current package-level RFC 002 profile, conformance checks are intentionally split between: + +- real boundary facts that the package can prove now (relation kind, read kind, join variant, set operation, reference ordinal, extension URI presence) +- richer planning semantics that remain deferred to future `query {}` lowering and Prism work + +## Representation contract + +Each scenario is selected by `CoreScenarioKey` and materialized via `core_scenario(key) -> SubstraitConformanceScenario`. + +- Machine-readable fields include strongly typed enums for status/profile/relation/portability fields. +- Tag and reference collections are modeled as list-backed newtypes (`ConformanceCapabilityTags` and `ConformanceReferences`) rather than pipe-delimited strings. +- Human-readable content remains in docs plus descriptive scenario text fields (`intent`, `required_rel_shape`, and `expected_constraints`). + +## Scenario ID convention + +`scenario_id` values must be stable and use this convention: + +```text +inql.substrait.<status>.<scenario_name>.<NNN> +``` + +The numeric suffix is immutable after publication. 
If requirements change incompatibly, add a new scenario ID instead of mutating semantics under an existing ID. + +## Current core coverage + +Core scenarios currently implemented in `src/substrait/conformance.incn`: + +| Scenario ID | Selector | Primary core `Rel` coverage | +| ------------------------------------------------------ | ---------------------------------------------------------- | --------------------------- | +| `inql.substrait.core.read_named_table.001` | `core_scenario(CoreScenarioKey.ReadNamedTable)` | `ReadRel` (`NamedTable`) | +| `inql.substrait.core.read_local_files.001` | `core_scenario(CoreScenarioKey.ReadLocalFiles)` | `ReadRel` (`LocalFiles`) | +| `inql.substrait.core.read_virtual_table.001` | `core_scenario(CoreScenarioKey.ReadVirtualTable)` | `ReadRel` (`VirtualTable`) | +| `inql.substrait.core.filter_rows.001` | `core_scenario(CoreScenarioKey.FilterRows)` | `FilterRel` | +| `inql.substrait.core.project_computed_columns.001` | `core_scenario(CoreScenarioKey.ProjectComputedColumns)` | `ProjectRel` | +| `inql.substrait.core.join_rel_variants.001` | `core_scenario(CoreScenarioKey.JoinRelVariants)` | `JoinRel` | +| `inql.substrait.core.cross_rel_cartesian.001` | `core_scenario(CoreScenarioKey.CrossRelCartesian)` | `CrossRel` | +| `inql.substrait.core.aggregate_grouping_sets.001` | `core_scenario(CoreScenarioKey.AggregateGroupingSets)` | `AggregateRel` | +| `inql.substrait.core.sort_rel_ordering.001` | `core_scenario(CoreScenarioKey.SortRelOrdering)` | `SortRel` | +| `inql.substrait.core.fetch_rel_limit_offset.001` | `core_scenario(CoreScenarioKey.FetchRelLimitOffset)` | `FetchRel` | +| `inql.substrait.core.set_rel_operations.001` | `core_scenario(CoreScenarioKey.SetRelOperations)` | `SetRel` | +| `inql.substrait.core.reference_rel_shared_subplan.001` | `core_scenario(CoreScenarioKey.ReferenceRelSharedSubplan)` | `ReferenceRel` | + +## Taxonomy values + +The same taxonomy remains in force for scenario declarations: + +- `status`: 
`ConformanceStatus.Core`, `ConformanceStatus.Extension`, `ConformanceStatus.Gap`, `ConformanceStatus.OptionalMutation` +- `profile_tags`: `ConformanceProfileTag.ReadQueryCore`, `ConformanceProfileTag.OptionalMutation`, `ConformanceProfileTag.GapPolicy`, `ConformanceProfileTag.ReadBindingBoundary` +- `portability`: `ConformancePortability.Portable`, `ConformancePortability.ConsumerConditional`, `ConformancePortability.NonPortable` + +## Tooling expectation + +Downstream tooling should consume scenario functions and model fields from `src/substrait/conformance.incn` as the machine contract, rather than JSON sidecar files. + +Conformance validation for the v1 profile is expected to run against canonical operation functions in `src/dataset/ops.incn`, emitted proto-backed plans from `src/substrait/plan.incn`, and typed model/schema helpers where needed. + +The current `ProjectRel` and `AggregateRel` scenarios are boundary-shape scaffolds, not proof that full computed-column, window, grouping-set, or distinct semantics are already implemented in package code. + + + +[rfc-002]: ../../../rfcs/002_apache_substrait_integration.md +[ref-operator-catalog]: ./operator_catalog.md diff --git a/docs/language/reference/substrait/operator_catalog.md b/docs/language/reference/substrait/operator_catalog.md new file mode 100644 index 0000000..662615d --- /dev/null +++ b/docs/language/reference/substrait/operator_catalog.md @@ -0,0 +1,112 @@ +# Substrait operator catalog (Reference) + +This page is the **operational mapping reference** for InQL's Apache Substrait integration. The normative contract — including the Logical `Rel` alphabet, pinning policy, read-root boundary, and extension URI rules — lives in [InQL RFC 002][rfc-002]. This page provides the full capability → `Rel` catalog, profile tags, gap encoding requirements, and optional mutation profile detail that are too long-lived and versioned to remain inside the RFC text itself. 
+ +## Profile tags + +Each entry in the catalog carries one of the following profile tags: + +| Tag | Meaning | +| --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **core** | Maps to a standard logical `Rel` in the pinned Substrait revision; portable across conforming consumers without additional extension registration | +| **extension** | Requires a registered extension URI; portability depends on consumer support for that URI | +| **gap** | No stable logical `Rel` exists in current core Substrait; **must** use a documented non-core encoding (see [Gap profiles](#gap-profiles)); ad hoc or undocumented encodings are non-conforming | +| **optional-mutation** | Part of the optional mutation profile; not required for read/query analytical core; may be omitted by distributions that target read-only analytical use | + +The same status taxonomy is used in the [Substrait conformance corpus][ref-conformance-corpus]. Scenario contracts in that corpus are represented as typed InQL models with stable scenario IDs so CI and downstream implementations can consume a stable machine contract. + +## Read/query analytical core profile + +The following table maps InQL plan capabilities to Substrait logical relations and expression patterns for the read/query analytical core — the minimum required for InQL v0.1. 
+ +| InQL capability (conceptual) | Substrait | Profile | +| ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Logical table / registered name | `ReadRel` + `NamedTable` | core | +| File or object scan as plan input | `ReadRel` + `LocalFiles` (format options in the pinned spec) | core | +| Literal or embedded rows | `ReadRel` + `VirtualTable` | core | +| Predicate pushdown into scan | `ReadRel` filter fields and/or separate `FilterRel` — producer policy; **must** be documented per implementation | core | +| Row filter | `FilterRel` | core | +| Add or replace computed columns | `ProjectRel`; current package code provides a boundary scaffold, while richer computed/window semantics remain deferred | core | +| Inner join | `JoinRel` (inner variant) | core | +| Left join | `JoinRel` (left outer variant) | core | +| Semi, anti, single, mark join variants | `JoinRel` (respective variant; optional `post_join_filter`) | core | +| Cross join | `CrossRel` | core | +| Group by / aggregates | `AggregateRel`; current package code provides a boundary scaffold, while richer grouping/measure semantics remain deferred | core | +| Rollup / cube / grouping sets | `AggregateRel` with multiple groupings | core | +| Distinct rows | `AggregateRel` with grouping keys and no measures | core | +| Window / analytic functions | `ProjectRel` with window expressions | core | +| Sort | `SortRel` | core | +| Limit / offset | `FetchRel` | core | +| Union, intersect, except | `SetRel` with the appropriate set operation enum | core | +| Reuse of an identical subplan | `Plan` + `ReferenceRel` | core | +| Unnest / explode | Extension rel or documented expansion — **must** be pinned per implementation (see [Unnest / explode](#unnest--explode)) | gap | +| Pivot / unpivot | `ExtensionSingleRel` or documented rewrite to join + aggregate + 
project (see [Pivot / unpivot](#pivot--unpivot)) | gap | +| Asof / interval join | Gap or non-equi join expression only where consumer contract explicitly allows (see [Asof / interval joins](#asof--interval-joins)) | gap | +| Streaming time semantics (watermarks, session windows, state) | Outside core Substrait unless via named extensions or a separate execution IR (see [Streaming time semantics](#streaming-time-semantics)) | gap | + +## Optional mutation profile + +The following capabilities are part of the optional mutation profile. They are **not** required for InQL read/query analytical core (v0.1). An implementation that exposes any mutation-profile capability **must** document which relations are supported for its target backend and what portability guarantees (if any) apply. + +| InQL capability | Substrait | Profile | +| --------------------------------------------- | ----------- | ----------------- | +| Write to a table / CTAS | `WriteRel` | optional-mutation | +| Table update without a full child `Rel` input | `UpdateRel` | optional-mutation | +| DDL (create, drop, alter) | `DdlRel` | optional-mutation | + +Absence of these in a given distribution does not make InQL incomplete for read-only analytical use. + +## Extension escape hatches + +When no standard logical `Rel` covers a required operation and no gap encoding policy applies, implementations **may** use the following extension escape hatches. Any use **must** be declared in the public operator catalog for the toolchain version and assigned a stable, registered extension URI. 
+ +| Extension `Rel` | When appropriate | +| -------------------- | ------------------------------------------------------------- | +| `ExtensionLeafRel` | Source/scan with no applicable standard `ReadRel` variant | +| `ExtensionSingleRel` | Single-input transformation with no applicable standard `Rel` | +| `ExtensionMultiRel` | Multiple-input relation with no applicable standard `Rel` | + +Using an extension escape hatch without a registered URI is non-conforming. + +## Gap profiles + +### Unnest / explode + +Core Substrait does not define a portable unnest or explode `Rel` at the logical level. Until a stable logical `Rel` for unnest is adopted in the pinned Substrait revision and recognized by InQL: + +- `EXPLODE`-style behavior **must** lower through a registered extension relation (`ExtensionSingleRel` or `ExtensionLeafRel`) with a declared extension URI in the toolchain's public catalog. +- Alternatively, a documented rewrite (for example, expanding a virtual table) **may** be used if the encoding is unambiguously specified in the public operator catalog for the toolchain version. +- Implementations **must not** present ad hoc or undocumented unnest encodings as portable core behavior. + +Current package-level RFC 002 boundary registration: + +- `https://inql.io/extensions/v0.1/unnest.yaml#explode` + +### Pivot / unpivot + +No core Substrait `Rel` covers pivot/unpivot directly: + +- The canonical encoding is a documented rewrite to `JoinRel` + `AggregateRel` + `ProjectRel`. +- Alternatively, `ExtensionSingleRel` **may** be used with a registered URI. +- The chosen encoding **must** be documented in the public operator catalog for the toolchain version. + +### Asof / interval joins + +Asof and interval joins fall outside `JoinRel`'s standard join type enum: + +- They **may** be expressed as a `JoinRel` with a complex non-equi join expression only where the consumer's documentation explicitly states it handles such expressions correctly. 
+- If expressed as an extension, a registered URI is required. +- The expected consumer behavior **must** be stated in the operator catalog entry for the toolchain version. + +### Streaming time semantics + +Watermarks, session windows, and stateful streaming operations are outside the scope of core Substrait's logical plan language: + +- They **must** be expressed either through registered named extensions with explicitly documented semantics, or through a separate execution-level IR that is not the normative Substrait interchange. +- Any use of streaming extensions in the normative Substrait output **must** be documented as non-portable to non-streaming consumers. +- The operator catalog entry **must** state which consumers are known to handle the extension correctly. + + + +[rfc-002]: ../../../rfcs/002_apache_substrait_integration.md +[ref-conformance-corpus]: ./conformance.md diff --git a/docs/language/reference/substrait/read_root_binding_contract.md b/docs/language/reference/substrait/read_root_binding_contract.md new file mode 100644 index 0000000..a47bef0 --- /dev/null +++ b/docs/language/reference/substrait/read_root_binding_contract.md @@ -0,0 +1,74 @@ +# Substrait read-root and binding contract (Reference) + +This page is the **operational reference** for InQL's normative boundary between logical read roots in Substrait plans and execution-context binding. The normative rule — that logical reads carry names and virtual values rather than secrets, and that the execution context resolves them — lives in [InQL RFC 002][rfc-002]. This page expands on the `ReadRel` variant requirements, what a read must and must not contain, the execution context's obligations, and the adapter layer boundary. + +## Normative boundary + +InQL relational plans **must** express all new data entering the plan as logical reads. 
A logical read carries a **logical identity** — a name, a virtual row set, or an opaque extension type — without normative dependence on: + +- Secret material: credentials, tokens, API keys, or passwords. +- Host-specific connection strings, DSNs, or URIs that encode execution-context policy. +- Engine-specific physical scan parameters that would need to change when the plan is executed on a different conforming consumer. + +The execution context **must** resolve logical reads to physical resources through its adapter and execution layer. That resolution **must not** redefine the relational semantics of the plan. The plan's meaning — which rows, which columns, which schema — is fixed at authoring time by the logical read; the execution context only supplies the physical source. + +## `ReadRel` variant reference + +| Variant | Substrait field | Typical InQL use | Portability | +| --- | --- | --- | --- | +| `NamedTable` | `named_table` (list of name parts) | Registered logical table name; resolved by session registry | Portable across conforming consumers that have registered the same logical name | +| `LocalFiles` | `local_files` (file list + format options) | Parquet, CSV, Arrow IPC scan from a URI | Portable if consumers can resolve the URI; URI format is not standardized by Substrait | +| `VirtualTable` | `virtual_table` (inline rows) | Literal or embedded row data; `session.from_values(...)` | Fully portable; rows are embedded directly in the plan | +| Extension leaf | `ExtensionLeafRel` | Custom source type with no applicable standard `ReadRel` variant | Extension-URI-dependent; portability requires consumers to support the registered URI | + +### What a `ReadRel` must carry + +- The **logical identity** of the source: name parts for named tables, format type and URI pattern for file scans, or inline data for virtual tables. 
+- The **base schema** (a Substrait `NamedStruct`): field names and types must match the Incan `model` `T` that parameterizes the resulting `DataSet[T]`. +- Any **filter or column hints** permitted by the pinned Substrait spec (for example, scan-level filter fields on `ReadRel`); these are optimization hints, not semantic changes, and consumers that do not support them must still produce semantically correct results. + +### What a `ReadRel` must not carry (in the normative interchange) + +- Raw connection strings, credentials, DSN passwords, or any secret material. +- Engine-specific scan parameters that would fail or behave differently on a different conforming consumer unless declared as a registered extension. +- Schema definitions that contradict or override the Incan `model` type checked at compile time. + +## Execution context responsibilities + +The execution context ([InQL RFC 004][rfc-004] `Session`) **must**: + +1. Maintain a **table registry** that maps logical names to physical data source definitions (connection parameters, catalog references, or file paths). +2. **Resolve** `ReadRel` logical names through this registry at execution time — not at plan authoring time — so the serialized plan remains independent of execution-context state. +3. **Supply credentials and connection details** from its own configuration layer, never by reading them from the serialized Substrait plan. +4. Apply any **governance policy** (access control, row filtering, schema masking) that is sensitive enough to keep out of the portable plan, by injecting it at resolution time. +5. **Validate** that the resolved physical source schema is compatible with the `NamedStruct` declared in the `ReadRel` before execution begins. + +## Adapter boundary + +Product SDKs and higher operational layers **may** provide convenience read APIs (for example, `session.read_parquet(uri)` wrapping a `ReadRel` + `LocalFiles`). 
These convenience surfaces: + +- **Must** produce a `ReadRel` in the normative Substrait interchange with the logical identity encoded appropriately for the variant. +- **Must not** embed execution-context state — resolved credentials, session tokens, resolved endpoint URLs — in the `ReadRel` payload of the normative plan. +- **May** pass execution-context configuration through separate, non-normative channels (for example, `AdvancedExtension` hints, out-of-band session configuration) when needed for optimization, provided the plan remains semantically valid without them. + +Adapter-specific "open connection" or "bind source" APIs **should not** be specified as core InQL. They are thin wrappers at most, with the binding contract owned by the execution context per InQL RFC 004. + +## Interaction with InQL RFC 001 types + +The following table summarizes how each `Session` read method maps to a `ReadRel` variant and the resulting InQL carrier type. + +| `Session` method | Returns | `ReadRel` variant | +| --- | --- | --- | +| `session.table(name)` | `LazyFrame[T]` | `NamedTable` | +| `session.read_parquet(uri)` | `LazyFrame[T]` | `LocalFiles` (Parquet format) | +| `session.read_csv(uri)` | `LazyFrame[T]` | `LocalFiles` (delimited text format) | +| `session.read_arrow(uri)` | `LazyFrame[T]` | `LocalFiles` (Arrow IPC format) | +| `session.from_values(rows)` | `LazyFrame[T]` | `VirtualTable` | + +In all cases the `LazyFrame[T]` holds a deferred plan — no data is fetched until `session.collect(...)` is called. The `ReadRel` in the deferred plan carries only the logical identity; resolution to a physical source happens at execution time per the execution context obligations described above. 
+ + +[rfc-002]: ../../../rfcs/002_apache_substrait_integration.md +[rfc-001]: ../../../rfcs/001_inql_dataset.md +[rfc-004]: ../../../rfcs/004_inql_execution_context.md diff --git a/docs/language/reference/substrait/revision_and_extension_policy.md b/docs/language/reference/substrait/revision_and_extension_policy.md new file mode 100644 index 0000000..f7ccf7d --- /dev/null +++ b/docs/language/reference/substrait/revision_and_extension_policy.md @@ -0,0 +1,101 @@ +# Substrait revision and extension policy (Reference) + +This page is the **operational policy reference** for InQL's Substrait revision pinning and extension function management. The normative rules — that pinning is required and that functions outside the core bundle must use registered extension URIs — live in [InQL RFC 002][rfc-002]. This page provides the operational detail: what must be declared in a release, how extension URIs are registered, what constitutes a breaking vs additive change, and the checklist contributors follow when bumping the pinned revision. + +## Revision pinning + +### Requirements + +Each conforming InQL toolchain release **must** declare: + +- The exact Substrait **revision** it targets (commit hash or tagged release, depending on the Substrait project's versioning model at time of the InQL release). +- Any **bundled extension function sets** (YAML or equivalent) shipped alongside the toolchain. Each set must identify its URI prefix, the Substrait revision it was authored against, and the set of functions it covers. + +This information **must** appear in: + +1. The toolchain's **public release artifacts** — for example, the compiler binary's `--version` output or an accompanying manifest file (`substrait-pin.json` or equivalent). +2. The toolchain's **compiler documentation** — the release notes for that version and, where applicable, the published [operator catalog][ref-operator-catalog] for that toolchain version. 
+ +### Compatibility matrix + +| Change type | Required action | +| ------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| Patch-level Substrait revision bump (clarifications or fixes only; no schema changes) | Document in release notes; no catalog or RFC amendment required | +| Minor Substrait revision bump (additive schema changes) | Document in release notes; update operator catalog if new `Rel` nodes are adopted | +| Breaking Substrait schema change | Major toolchain version bump; RFC 002 amendment required; migration guide required | +| New extension URI registered | Document in operator catalog for the toolchain version; no RFC amendment required unless the new URI changes normative lowering behavior | +| Extension URI retired or renamed | Document in release notes with a deprecation warning; implementations must emit a deprecation diagnostic for plans referencing the retired URI | + +### Pin bump: release-note checklist + +When bumping the pinned Substrait revision, the release notes entry **must** include: + +- [ ] Previous revision reference (commit hash or tag). +- [ ] New revision reference (commit hash or tag). +- [ ] Summary of schema changes that affect emitted plans. +- [ ] Any capability reclassifications — for example, a gap capability promoted to core, an extension reclassified, or (for regressions) a core capability demoted. +- [ ] Required consumer update guidance if consumer APIs or protobuf schemas changed in a consumer-breaking way. + +## Extension URI registration + +### Policy + +Functions not in the pinned core Substrait bundle **must** use extension URIs that are: + +1. **Registered** in the toolchain's public catalog before the function is exposed in a stable release. Pre-release builds may use provisional URIs clearly labeled as unstable. +2. 
**Stable across patch releases**: once a URI is published in a stable release, it **must not** change meaning or be silently dropped without a deprecation cycle. +3. **Documented**: the operator catalog entry for the URI **must** specify the function name, argument types, return type, and semantic contract (including any edge-case behavior that differs from SQL or Substrait conventions). + +### URI structure + +InQL toolchain extension URIs **should** follow the pattern: + +```text +https://inql.io/extensions/<version>/<category>.yaml +``` + +Where: + +- `<version>` is the toolchain version that introduced the extension (e.g. `v0.1`). +- `<category>` groups related functions (e.g. `aggregate`, `string`, `temporal`, `unnest`). + +The exact URI scheme is part of the toolchain release process and **must** be documented alongside the release. The current InQL package code uses the `inql.io` base for registered extension examples and treats pre-1.0 entries as provisional until the wider release process is finalized. + +### `AdvancedExtension` fields + +`AdvancedExtension` fields **may** carry optimization hints or metadata alongside a plan: + +- Normative plan semantics **must** be expressible without relying on `AdvancedExtension` fields. A plan that requires `AdvancedExtension` to execute correctly is non-conforming. +- Consumers **must** be able to execute the plan (possibly without the optimization benefit) in the absence of `AdvancedExtension` support. +- If an `AdvancedExtension` field changes a plan's observable output (rather than its performance), it is no longer a hint — it must be modeled as a registered extension. + +## Compatibility policy + +### Additive changes (default) + +Mapping catalog additions — new InQL capabilities mapped to Substrait, new extension URIs registered, new optional capabilities documented — are **additive changes**. They: + +- Do not require an RFC amendment. +- **Must** appear in release notes with the capability name and profile tag. 
+- **Must not** change the semantics of any existing mapping. + +### Breaking changes + +A change is **breaking** when it: + +- Removes or renames an extension URI that was published in a stable release. +- Changes the emitted `Rel` shape for an existing capability in a way that alters consumer behavior or plan serialization. +- Reclassifies a capability from core to gap or non-portable (a regression). + +Breaking emitter changes **must**: + +1. Ship with release notes that explicitly identify the breaking change and the affected capability. +2. Include an RFC 002 amendment documenting the updated mapping when the change is user-visible (for example, plans that compiled or executed before no longer do so correctly). +3. Where possible, include a migration guide or automated migration tool. + +Reclassification from gap → core or extension → core is **not** breaking (it broadens portability). It **must** still be documented in release notes and the operator catalog. + + + +[rfc-002]: ../../../rfcs/002_apache_substrait_integration.md +[ref-operator-catalog]: ./operator_catalog.md diff --git a/docs/release_notes/v0_1.md b/docs/release_notes/v0_1.md index a6dcb90..397d966 100644 --- a/docs/release_notes/v0_1.md +++ b/docs/release_notes/v0_1.md @@ -1,6 +1,6 @@ # InQL v0.1 release notes -**Status:** Unreleased. InQL v0.1 is the milestone where RFCs **000–004** are implemented and the library is usable end-to-end (language surface, dataset carriers, Substrait-shaped plans, `query {}`, session read/transform/write). See the [RFC index](../rfcs/README.md) for normative design. +**Status:** Unreleased. InQL v0.1 is the milestone where the RFC **000–004** series becomes meaningfully usable end-to-end. The current package already exposes dataset carriers and a real Substrait boundary, while some planned pieces — especially `query {}` lowering and fuller execution semantics — still remain ahead of the current code. See the [RFC index](../rfcs/README.md) for normative design. 
## Features and enhancements @@ -9,14 +9,14 @@ Entries will be filled in as work lands (link RFCs and PRs when applicable). - **Language:** Foundational InQL syntax and semantics (naming, query schema, layer boundaries). - **Carriers:** `DataSet[T]` hierarchy including bounded vs unbounded traits and concrete frame/stream types. - **Plans:** Apache Substrait as the logical interchange contract. -- **Authoring:** `query {}` blocks typed and lowered per the above. +- **Authoring:** method-chain lowering into a real Substrait boundary today, with `query {}` work still ahead. - **Execution:** Session-oriented read, execute, and write (reference backend per RFC 004). Pipe-forward (`|>`) is specified in RFC 005 but **out of scope** for v0.1. ## Bugfixes -- TBD. +- **Substrait boundary:** RFC 002 now exposes explicit join/set operation helpers, preserves `ReferenceRel` ordinals, and uses a registered `inql.io` extension URI for the current EXPLODE gap encoding. ## Documentation diff --git a/docs/rfcs/002_apache_substrait_integration.md b/docs/rfcs/002_apache_substrait_integration.md index aefa8b4..e10e422 100644 --- a/docs/rfcs/002_apache_substrait_integration.md +++ b/docs/rfcs/002_apache_substrait_integration.md @@ -1,6 +1,6 @@ # InQL RFC 002: Apache Substrait integration -- **Status:** Planned +- **Status:** In Progress - **Created:** 2026-03-23 - **Author(s):** Danny Meijer - **Related:** @@ -20,7 +20,7 @@ This RFC defines **Apache Substrait** as the **normative logical interchange** f 1. A **checked** InQL relational tree **must** be expressible as a Substrait **`Plan`** whose executable root is a **`Rel`** tree, optionally a **DAG** via **`ReferenceRel`** when subplans are shared. 2. **Logical reads** are **`ReadRel`** (or extension leaf relations) carrying **names, virtual rows, or extension payloads** instead of host-specific connection strings or secrets in the normative interchange. 3. 
**Scalar and aggregate** computation uses Substrait **expressions** and **aggregate functions**; functions outside the pinned core set **must** use **registered extension URIs** documented with the compiler. -4. **North-star operator catalog**: InQL capabilities map to logical `Rel` kinds as tabulated below; **MVP** subsets are implementation choices but **must not** contradict this RFC for operators they expose. +4. **North-star operator catalog**: InQL capabilities map to logical `Rel` kinds as specified in the [Substrait operator catalog reference][ref-operator-catalog]; **MVP** subsets are implementation choices but **must not** contradict this RFC for operators they expose. ## Motivation @@ -41,6 +41,34 @@ Without a dedicated specification, Substrait lowering risks drifting between fro - Physical Substrait relations as a normative InQL output — consumers **may** use them; InQL **may** emit them when documented as a non-portable or target-specific mode. - ANSI SQL completeness — mapping is capability-based, not a SQL compliance checklist. +## v1 implementation profile (InQL code path) + +The v1 implementation profile for this RFC is explicitly scoped to InQL package code (`.incn`) and is the contract for current delivery tracking. + +- Core read/query `Rel` coverage is implemented through a thin proto-backed Substrait boundary in InQL package code. +- Optional mutation relations remain modeled but are not required to be executable in v1. +- Gap and extension semantics are represented as typed contracts in package code and conformance scenarios, rather than ad hoc string payloads. +- Richer planning semantics remain outside this profile when they logically belong to future `query {}` lowering or Prism. 
+ +This profile is reflected by: + +- `src/substrait/schema.incn` +- `src/substrait/plan.incn` +- `src/substrait/conformance.incn` +- `docs/language/reference/substrait/conformance.md` + +### Current implementation status + +| Area | Current status | +| --- | --- | +| Read roots, filter, cross, sort, fetch | Implemented at the proto-backed Substrait boundary | +| Join and set operation selection | Implemented at the boundary through explicit package-level enums/helpers | +| Reference rel | Implemented at the boundary for ordinal preservation only | +| Project and aggregate | Present as boundary-shape scaffolds; richer expression/grouping semantics remain deferred | +| Extension URI policy and explode gap encoding | Implemented through a registered package-level URI and documented gap policy | +| `query {}` lowering parity | Deferred to RFC 003 and Prism-backed lowering | +| Optional mutation profile | Deferred; not required for the v0.1 read/query analytical core | + ## Guide-level explanation Authors build `DataSet[T]` values (InQL RFC 001) using `query {}` or relational method chains. After typechecking, the relational work becomes a **Substrait plan**: mostly `FilterRel`, `ProjectRel`, `JoinRel`, `AggregateRel`, and so on, rooted in a `ReadRel` when new data enters the plan. @@ -55,6 +83,10 @@ When a plan says "read this named relation" or "read this logical asset id," the - Lowering semantics **must** be identical whether the surface is `query {}`, trait methods, or desugared pipe-forward, for the same checked tree. - Implementations **may** additionally lower to InQL RFC 001 operations for execution; if both paths exist, they **must** match the Substrait semantics for those operators. +For the full capability → `Rel` mapping, profile classifications, and gap encoding requirements, see the [Substrait operator catalog reference][ref-operator-catalog]. 
+ +Conformance scenarios **should** use stable scenario IDs and typed InQL model contracts as defined by the [Substrait conformance corpus reference][ref-conformance-corpus], so implementation and CI reporting can track portability status consistently across toolchains. + ### Logical `Rel` alphabet The following are the primary logical relations InQL targets. Exact protobuf message paths follow the pinned Substrait version selected by the toolchain for a given release. @@ -76,30 +108,11 @@ The following are the primary logical relations InQL targets. Exact protobuf mes | `DdlRel` | DDL (optional profile) | | `ExtensionSingleRel` / `ExtensionMultiRel` / `ExtensionLeafRel` | Extension escape hatches | -### North-star catalog: InQL capabilities → Substrait +### North-star catalog -| InQL capability (conceptual) | Substrait | -| --------------------------------------------------------------- | --------------------------------------------------------------------------------------------- | -| Logical table / registered name | `ReadRel` + named table definition | -| File or object scan as plan input | `ReadRel` + local files (and format options in the pinned spec) | -| Literal or embedded rows | `ReadRel` + virtual table | -| Predicate pushdown into scan | `ReadRel` filter fields and/or separate `FilterRel` — producer policy, **must** be documented | -| Row filter | `FilterRel` | -| Add or replace computed columns | `ProjectRel`; drop / reorder via `RelCommon` emit (preferred) or equivalent | -| Joins | `JoinRel` | -| Cross join | `CrossRel` | -| Group by / aggregates | `AggregateRel` | -| Rollup / cube / grouping sets | `AggregateRel` with multiple groupings | -| Distinct rows | `AggregateRel` with grouping keys and no measures | -| Window / analytic functions | `ProjectRel` with window expressions | -| Sort | `SortRel` | -| Limit / offset | `FetchRel` | -| Union / intersect / except | `SetRel` with the appropriate set operation enum | -| Reuse of an identical subplan | 
`Plan` + `ReferenceRel` | -| Unnest / explode | **Gap**: extension rel, or documented expansion — **must** be pinned per implementation | -| Pivot / unpivot | **Gap**: `ExtensionSingleRel` or documented rewrite to join + aggregate + project | -| Asof / interval join | **Gap** or non-equi join expression only where consumer contract explicitly allows | -| Streaming time semantics (watermarks, session windows, state) | **Outside core Substrait** unless expressed via named extensions or a separate execution IR | +InQL defines a north-star operator catalog that maps every InQL plan capability to a Substrait `Rel` or expression pattern, and classifies each capability as one of: **core** (maps to a standard logical `Rel` in the pinned revision), **extension** (requires a registered extension URI), **gap** (no stable logical `Rel` in core Substrait; encoding must be documented), or **optional-mutation** (not required for read/query analytical core). + +The full, versioned catalog — including all profile tags, gap encoding requirements, and mutation-profile operators — lives in the [Substrait operator catalog reference][ref-operator-catalog]. Conforming implementations **must** follow the mappings listed there for any capability they claim portable. ### Read roots vs binding @@ -107,15 +120,32 @@ The following are the primary logical relations InQL targets. Exact protobuf mes - The execution context **must** resolve logical reads to physical resources through its adapter and execution layer; that layer **must not** redefine relational semantics of the plan. - Product SDKs **may** present a unified import surface; adapter-specific "open connection" APIs **should not** be specified as core InQL — they remain thin wrappers at most. +For the full `ReadRel` variant reference, the detailed execution context obligations, and the adapter boundary contract, see the [read-root and binding contract reference][ref-read-root]. 
+ ### Extensions and function URIs - Functions not in the pinned core Substrait bundle **must** use extension URIs registered in the compiler's public catalog for that toolchain version. - `AdvancedExtension` fields **may** carry hints; normative semantics **must** be expressible without relying on hints. -### Optional mutation profile +For revision pin requirements, URI registration policy, bundle naming, compatibility conventions, and the release-note checklist for pin bumps, see the [revision and extension policy reference][ref-revision-policy]. + +### Optional mutation profile progress - InQL **may** expose `WriteRel`, `DdlRel`, or `UpdateRel` for warehouse-style mutation. Absence of these in a given distribution **does not** make InQL incomplete for read-only analytical use. +The optional mutation profile operators, per-operator portability notes, and support expectations are listed in the [Substrait operator catalog reference][ref-operator-catalog]. + +### Reference documents + +The following reference documents expand on the operational detail that is too long-lived and versioned to remain inside this RFC text: + +| Document | What it covers | +| --- | --- | +| [Substrait operator catalog][ref-operator-catalog] | Full InQL capability → `Rel` mapping; profile tags; gap encoding rules; mutation profile operators | +| [Substrait revision and extension policy][ref-revision-policy] | Revision pin requirements; extension URI registration policy; compatibility conventions; release-note checklist | +| [Substrait read-root and binding contract][ref-read-root] | `ReadRel` variant reference; execution context obligations; adapter boundary | +| [Substrait conformance corpus][ref-conformance-corpus] | Canonical corpus structure, scenario metadata schema, profile taxonomy, and stable scenario ID conventions | + ## Design details ### Interaction with Incan @@ -140,7 +170,7 @@ The following are the primary logical relations InQL targets. 
Exact protobuf mes ## Implementation architecture -Non-normative: toolchains **should** maintain golden Substrait plans or equivalent fixture tests for representative `query {}` and API-lowered trees, and **should** document tested consumers without implying exclusive support. +Non-normative: toolchains **should** maintain golden Substrait plans or equivalent fixture tests for representative API-lowered trees, and later add `query {}` fixtures once that surface lowers through the same boundary. ## Layers affected @@ -148,9 +178,94 @@ Non-normative: toolchains **should** maintain golden Substrait plans or equivale - **Conformance / testing** artifacts for serialized plans. - **Published operator catalog** and release notes for Substrait pin bumps. +## Implementation Plan + +### Phase 1: Spec and operator catalog + +- Lock down the versioned Substrait revision pinning policy in compiler documentation and release artifacts. +- Publish the normative operator catalog mapping InQL capabilities to Substrait `Rel` kinds, including gap annotations for unnest, pivot, and streaming semantics. +- Document extension URI registration conventions in the public toolchain catalog. + +### Phase 2: IR lowering — core boundary + +- Lower current package-authored `DataSet[T]` method-chain relational trees to Substrait `Plan` / `Rel` nodes covering: `ReadRel`, `FilterRel`, `ProjectRel`, `JoinRel`, `CrossRel`, `AggregateRel`, `SortRel`, `FetchRel`, `SetRel`, and `ReferenceRel`. +- Keep the current package layer thin: relation-shape ownership stays here, while richer planning semantics remain candidates for Prism. +- Align field references and types with `model`-backed schemas (RFC 001) where the current package code materially owns that boundary. + +### Phase 3: Extensions and read binding + +- Implement extension URI registration for functions outside the pinned core Substrait bundle. 
+- Implement logical `ReadRel` emission for named tables, virtual rows, and extension-defined sources without normative secret material in the plan text. +- Implement documented extension encoding for unnest / explode (gap handling). + +### Phase 4: Optional mutation profile + +- Implement `WriteRel`, `DdlRel`, and `UpdateRel` emission under the optional mutation profile. + +### Phase 5: Conformance and testing + +- Add golden Substrait plan fixtures for representative API-lowered trees. +- Add `query {}` fixtures later when that surface lowers through the same boundary. +- Verify fixture round-trips when Substrait revision is bumped. +- Update docs-site pages and operator catalog for public release. + +## Progress Checklist + +### Spec / design + +- [ ] Substrait revision pinning policy documented in release artifacts and compiler docs. +- [ ] Normative operator catalog published (including gap annotations). +- [ ] Extension URI registration conventions documented. + +### IR / lowering — core relations + +- [ ] `ReadRel` emission: named table, virtual rows, extension sources. +- [ ] `FilterRel` emission. +- [ ] `ProjectRel` boundary scaffold emission. +- [ ] `JoinRel` emission (semi, anti, single, mark variants). +- [ ] `CrossRel` emission. +- [ ] `AggregateRel` boundary scaffold emission. +- [ ] `SortRel` emission. +- [ ] `FetchRel` emission (limit / offset). +- [ ] `SetRel` emission (union / intersect / except). +- [ ] `ReferenceRel` ordinal emission at the Substrait boundary. +- [ ] Lowering is identical across `query {}`, method chains, and desugared pipe-forward for the same checked tree once RFC 003 and Prism-backed lowering land. +- [ ] Field references align with RFC 001 `model`-backed schemas and `NamedStruct` indices. + +### Extensions and read binding + +- [ ] Extension URI registration for non-core functions wired in toolchain catalog. +- [ ] Logical `ReadRel` carries no normative secret material (binding left to execution context). 
+- [ ] Documented extension encoding for unnest / explode gap. + +### Optional mutation profile + +- [ ] `WriteRel` emission (optional profile). +- [ ] `DdlRel` emission (optional profile). +- [ ] `UpdateRel` emission (optional profile). + +### Tests + +- [ ] Golden Substrait plan fixtures for representative API-lowered (`DataSet[T]`) trees. +- [ ] Golden Substrait plan fixtures for representative `query {}` trees once that surface lowers through the same boundary. +- [ ] Fixture round-trip tests on Substrait revision bump. +- [ ] Tests confirm no secret material in emitted `ReadRel` plans. + +### Docs + +- [ ] Operator catalog page updated in docs-site. +- [ ] Release notes entry added. + ## Design Decisions - **Substrait revision pinning:** this RFC defines the pinning policy, not one timeless revision number. Each conforming InQL toolchain release **must** publish the exact Substrait revision it targets and any bundled extension sets in public release artifacts and compiler documentation. - **Canonical unnest / explode encoding:** until core Substrait standardizes a portable unnest relation that InQL adopts, `EXPLODE`-style behavior **must** lower through a documented extension relation or another documented non-core encoding listed in the toolchain's public operator catalog. Implementations **must not** present ad hoc or undocumented encodings as portable core behavior. - **Mutation relations:** `WriteRel`, `DdlRel`, and `UpdateRel` remain an optional mutation profile. They are not part of the minimum read/query analytical core required for InQL v0.1, and implementations **may** expose them only when the execution context and backend support them. - **Correlated subqueries:** InQL v0.1 does not standardize a single correlated-subquery desugaring because correlated subquery surface syntax is not part of the minimum relational grammar. 
If a future RFC adds correlated subqueries, that RFC **must** define the lowering contract explicitly rather than relying on implicit emitter policy. + + + +[ref-operator-catalog]: ../language/reference/substrait/operator_catalog.md +[ref-revision-policy]: ../language/reference/substrait/revision_and_extension_policy.md +[ref-read-root]: ../language/reference/substrait/read_root_binding_contract.md +[ref-conformance-corpus]: ../language/reference/substrait/conformance.md diff --git a/docs/rfcs/006_unnest_core_substrait.md b/docs/rfcs/006_unnest_core_substrait.md new file mode 100644 index 0000000..b4177a0 --- /dev/null +++ b/docs/rfcs/006_unnest_core_substrait.md @@ -0,0 +1,102 @@ +# InQL RFC 006: Promote unnest/explode to core Substrait lowering + +- **Status:** Blocked +- **Created:** 2026-03-27 +- **Author(s):** Danny Meijer +- **Related:** + - InQL RFC 002 (Apache Substrait — normative gap classification for unnest; **prerequisite**) + - InQL RFC 003 (`query {}` — `EXPLODE` clause; no surface change required) + - InQL RFC 001 (dataset types — `explode` method on `DataSet[T]`; no surface change required) +- **Issue:** [InQL #14](https://github.com/dannys-code-corner/InQL/issues/14) +- **RFC PR:** - +- **Written against:** Incan v0.2 +- **Shipped in:** - + +> **Blocked** on upstream Apache Substrait standardizing a portable logical unnest/explode `Rel` in a revision InQL can pin to. See [Substrait operator catalog — Gap profiles: Unnest / explode][ref-operator-catalog]. + +## Summary + +InQL RFC 002 classifies `EXPLODE`/unnest as a **gap** capability: no stable logical `Rel` exists in core Substrait, so implementations must lower through a registered extension relation with a declared URI. This RFC records the intent to promote that capability from `gap` to `core` — updating the operator catalog, retiring the extension encoding requirement, and updating Incan compiler lowering — once upstream Substrait ships a portable unnest `Rel` that InQL can adopt. 
+ +## Motivation + +The current extension encoding for unnest adds extension URI maintenance burden to every conforming InQL toolchain release and limits plan portability to consumers that happen to support the same registered extension. The gap classification is not a permanent design choice; it reflects a gap in Substrait at the time of RFC 002. Once upstream closes that gap with a stable logical `Rel`, there is no reason for InQL to keep the extension path as the normative encoding. Reclassifying promptly gives authors core-portable `EXPLODE` semantics without requiring consumers to register or recognize InQL-specific URIs. + +## Goals + +- Reclassify the unnest/explode capability from `gap` to `core` in the [Substrait operator catalog reference][ref-operator-catalog]. +- Retire the extension encoding requirement for unnest, with appropriate release-note entries per the [revision and extension policy reference][ref-revision-policy]. +- Update Incan compiler lowering to emit the core `Rel` instead of the extension relation. + +## Non-Goals + +- Changing the InQL surface syntax for unnest — `EXPLODE` in `query {}` (InQL RFC 003) and `explode` on `DataSet[T]` (InQL RFC 001) remain unchanged. +- Defining the semantics of the new core `Rel` — that is an upstream Substrait concern; InQL aligns to whatever the pinned revision specifies. +- Keeping the extension encoding as an alternate path — once the core `Rel` is adopted, the extension path is retired. + +## Guide-level explanation + +From an author's perspective, nothing changes. `EXPLODE` in `query {}` and `.explode()` on a `DataSet[T]` work exactly as before. The only observable difference is in the serialized Substrait plan: before promotion, the emitted plan contains an `ExtensionSingleRel` or `ExtensionLeafRel` with an InQL-registered URI; after promotion, it contains the standard logical unnest `Rel` from the pinned Substrait revision. 
Consumers that previously required the InQL extension URI to execute unnest plans no longer do. + +## Reference-level explanation + +### Operator catalog update + +When the unnest capability is promoted, `docs/language/reference/substrait/operator_catalog.md` **must** be updated: + +- The entry for unnest/explode in the read/query analytical core table **must** change from `gap` to `core`. +- The Substrait column **must** reference the standard logical `Rel` name from the pinned revision. +- The unnest/explode section under Gap profiles **must** be removed or replaced with a note that the capability was promoted (pointing to the release notes for the relevant toolchain version). + +### Extension encoding retirement + +Per the [revision and extension policy reference][ref-revision-policy]: + +- The extension URI registered for unnest/explode **must** be deprecated in the toolchain release that adopts the core `Rel`, with a deprecation diagnostic for plans referencing the old URI. +- The release notes entry **must** include the previous extension URI, the new core `Rel` name, and guidance for consumers to update plan consumption. + +### Lowering update (Incan compiler) + +- Incan compiler lowering for `EXPLODE` and `.explode()` **must** emit the standard logical unnest `Rel` instead of the extension relation. +- The emitted plan **must not** include the deprecated extension URI for unnest after the toolchain version that adopts the core `Rel`. + +### Plan compatibility + +Serialized Substrait plans containing the extension encoding for unnest will need to be re-emitted after a toolchain upgrade. This is a plan-level breaking change and **must** be documented in release notes per the revision and extension policy. + +## Design details + +### Interaction with other InQL surfaces + +No surface changes. The `EXPLODE` clause in `query {}` and the `explode` method on `DataSet[T]` retain their existing semantics; only the Substrait emission changes.
+ +### Compatibility / migration + +- Breaking for serialized plans: existing plans with the extension encoding for unnest must be re-emitted. No author source code changes are required. +- Non-breaking for InQL source: `EXPLODE` and `.explode()` continue to compile and type-check identically. + +## Alternatives considered + +- **Keep the extension encoding permanently** — rejected; once a portable `Rel` exists, maintaining a proprietary extension degrades portability and adds unnecessary URI maintenance with no benefit. +- **Support both encodings simultaneously** — rejected; two emitted encodings for the same operation create consumer ambiguity. The extension path should be cleanly retired in the release that adopts the core `Rel`. + +## Drawbacks + +- Bumping the Substrait pin for this change is a plan-level breaking change, requiring coordinated consumer updates if any downstream tooling relies on the extension encoding. +- Timing depends entirely on upstream Substrait; InQL cannot control when a portable unnest `Rel` ships. + +## Layers affected + +- **InQL specification** — operator catalog reference updated; revision and extension policy retirement entry required. +- **Incan compiler** — lowering for `EXPLODE` / `explode` updated to emit the core `Rel` (work in the Incan repository). +- **Documentation** — release notes entry; operator catalog update. + +## Unresolved questions + +- Which exact Substrait revision introduces the portable unnest `Rel`? (Blocked on upstream; track `substrait-io/substrait`.) +- Are there semantic edge cases between the InQL extension encoding and the upstream core `Rel` that require a compatibility shim or a lowering-time rewrite? 
+ + + +[ref-operator-catalog]: ../language/reference/substrait/operator_catalog.md +[ref-revision-policy]: ../language/reference/substrait/revision_and_extension_policy.md diff --git a/docs/rfcs/007_prism_planning_engine.md b/docs/rfcs/007_prism_planning_engine.md new file mode 100644 index 0000000..3e5973b --- /dev/null +++ b/docs/rfcs/007_prism_planning_engine.md @@ -0,0 +1,214 @@ +# InQL RFC 007: Prism logical planning and optimization engine + +- **Status:** Draft +- **Created:** 2026-04-02 +- **Author(s):** Danny Meijer +- **Related:** + - InQL RFC 001 (dataset types and carriers — Prism-backed carriers must remain consistent with `DataSet[T]` semantics) + - InQL RFC 002 (Apache Substrait integration — Substrait remains the normative emitted contract at the boundary) + - InQL RFC 003 (`query {}` — lowers through Prism-managed logical work before Substrait emission) + - InQL RFC 004 (execution context — session executes Prism-backed plans but does not define Prism) + - InQL RFC 005 (optional pipe-forward — must stay Prism-consistent with equivalent surfaces) +- **Issue:** — +- **RFC PR:** — +- **Written against:** Incan v0.2 +- **Shipped in:** — + +## Summary + +This RFC defines **Prism** as InQL's immutable internal logical planning and optimization engine. Prism owns persistent plan storage, cheap branching through structural sharing, lineage-preserving rewrites, and logical optimization prior to Substrait emission or session execution. Prism is an **internal planning substrate**, not the normative interchange contract: **Apache Substrait** remains the boundary format per InQL RFC 002. `LazyFrame`, `DataFrame`, and `DataStream` are carrier experiences over Prism-managed plan state; `Session` and `SessionContext` bind and execute those plans per InQL RFC 004. 
+ +## Motivation + +InQL already has a strong external story around typed carriers, Substrait emission, and the execution boundary, but it lacks a dedicated specification for the internal planning layer that sits between authored logic and emitted plans. Without that layer being named and scoped, plan construction, optimization, lineage, interactive behavior, and future explain/debug tooling risk becoming an accidental mix of implementation details spread across InQL RFC 001, InQL RFC 002, and InQL RFC 004. + +Prism gives that layer a home. It lets InQL say clearly that: + +- authored transformations build immutable logical plans +- carriers stay cheap by sharing planning state instead of cloning whole plans +- optimization is a first-class responsibility, not an incidental backend side effect +- lineage must survive rewrites so optimized plans remain explainable + +This matters for more than simple query lowering. Complex multi-hop pipelines, future interactive environments, and prospective reuse of the planning substrate beyond InQL all benefit from a stable definition of what the internal plan engine is allowed and required to do. + +## Goals + +- Define **Prism** as the immutable logical planning engine for InQL. +- Specify Prism's core responsibilities: persistent plan storage, logical optimization, lineage preservation, and preparation for Substrait emission. +- Clarify the relationship between Prism and InQL carriers (`LazyFrame`, `DataFrame`, `DataStream`, `DataSet`). +- Clarify the relationship between Prism and sibling boundaries: Substrait at interchange boundaries and `Session` / `SessionContext` at execution boundaries. +- Require that Prism-backed plan construction remain cheap through structural sharing rather than deep-cloning carrier state. +- Define the conceptual distinction between authored plan state and optimized plan state without over-constraining the final implementation. 
+ +## Non-Goals + +- Replacing Apache Substrait as InQL's normative emitted logical contract — that remains InQL RFC 002. +- Defining physical execution behavior, backend binding, or secret management — that remains outside Prism and is scoped by InQL RFC 004 and surrounding operational layers. +- Defining new author-facing query syntax — Prism is an internal planning engine, not a new language surface. +- Forcing one exact in-memory data structure implementation for authored and optimized plan state. +- Promising Prism as a general-purpose platform beyond InQL today. This RFC scopes Prism normatively to InQL, while requiring a clean enough module boundary that future extraction remains possible. + +## Guide-level explanation + +From an author's point of view, Prism is not something they use directly. Authors work with InQL carriers such as `LazyFrame[T]`, `DataFrame[T]`, and (later) `DataStream[T]`. Those carriers build or operate over logical work that Prism stores and optimizes internally. + +```incan +orders = session.table("orders") +cutoff = ... # some appropriate value + +high_value = orders.filter(.amount > 1000) +recent = orders.filter(.created_at >= cutoff) + +summary = high_value.join(recent, on=.order_id) +``` + +The important user-visible behavior is: + +- each transformation returns a new carrier +- earlier carriers still exist unchanged +- branching from a shared base plan is cheap +- execution still belongs to the session boundary + +Prism is the reason this can work efficiently. It stores the shared logical planning state, allows both `high_value` and `recent` to branch from the same base plan, and may optimize the resulting logical graph before the plan is emitted to Substrait or executed by a session. + +Prism should be thought of as the internal engine that **thinks** about the plan. Substrait is how the plan is **communicated** at the boundary. Session is how the plan is **executed**. 
+ +## Reference-level explanation + +### Prism role + +Prism is the internal logical planning and optimization substrate for InQL. + +Prism **must**: + +- store logical relational author intent in persistent plan state +- support cheap plan branching through structural sharing +- preserve lineage across plan construction and optimization +- provide an optimized logical view for lowering and execution + +Prism **must not**: + +- become the normative interchange format +- require destructive mutation of prior authored history +- own physical execution or backend-specific binding + +### Relationship to carriers + +`LazyFrame[T]`, `DataFrame[T]`, and `DataStream[T]` **may** present different user-facing execution behavior, but they **should** be able to share Prism-managed planning state. + +Carrier operations that extend logical work **must** produce new logical tips rather than mutating prior history. Implementations **should** make returned carriers cheap immutable handles over shared Prism-managed state. + +### Relationship to Substrait + +Prism is internal (for now). Apache Substrait remains the normative boundary contract. + +The relationship is: + +- Prism = internal logical planning, lineage, and optimization +- Substrait = emitted logical interchange contract + +An implementation **may** use Prism-native node kinds or overlays internally, but emitted plans that claim conformance **must** still follow InQL RFC 002. + +### Relationship to session execution + +Prism does not execute plans. `Session` / `SessionContext` own execution. + +Execution-oriented flows **must** treat Prism as an input to lowering and execution, not as the executor itself. Session-backed operations may request optimized views from Prism before emission or execution, but the existence of Prism **must not** collapse the execution boundary defined in InQL RFC 004. 
+ +### Authored state vs optimized state + +Prism **should** conceptually distinguish between: + +- **authored plan state**: persistent construction history closest to user intent +- **optimized plan state**: semantically equivalent rewritten state used for lowering or execution +- **lineage metadata**: mappings from optimized state back to authored history + +This distinction is normative at the conceptual level, but implementations retain freedom in how they realize it. A single persistent graph with overlays, separate graphs with references, or another equivalent structure are all acceptable if the invariants below hold. + +### Required invariants + +The following invariants **must** hold: + +1. Adding a new carrier transformation never mutates prior authored history. +2. Any optimized representation remains semantically equivalent to the authored representation. +3. Schema facts remain derivable and trustworthy across rewrites. +4. Branching from a common carrier remains cheap enough to be a normal authoring pattern. +5. Optimization may change plan shape, but it must not destroy lineage traceability. + +### Optimization responsibilities + +Optimization is a core Prism responsibility, not merely a downstream backend concern. + +Prism **may** perform: + +- projection pruning +- predicate pushdown +- redundant-node elimination +- normalization of equivalent logical shapes +- shared subplan detection and sharing +- other semantically valid rewrites consistent with schema and lineage invariants + +More advanced rewrites such as join reordering or sink-aware splitting **may** be added later. + +Implementations **may** apply some rewrites incrementally during plan construction and defer others until lowering or explicit analysis, provided authored history remains intact. + +## Design details + +### Syntax + +This RFC introduces no new author-facing syntax. + +### Semantics + +Prism is the internal engine that owns logical planning and optimization for InQL carriers. 
+ +At minimum, a Prism-backed carrier should be representable as: + +- a reference to Prism-managed persistent plan state +- a current logical tip +- schema facts associated with that tip + +The exact representation is intentionally not fixed by this RFC, but the semantics of immutability, structural sharing, and lineage preservation are. + +### Interaction with other InQL surfaces + +- **`DataSet[T]` APIs:** method-chain surfaces defined by InQL RFC 001 **must** build or manipulate Prism-backed logical state without violating carrier immutability. +- **`query {}`:** checked query blocks defined by InQL RFC 003 **should** lower into Prism-managed logical work before final Substrait emission. +- **Pipe-forward (`|>`):** if supported per InQL RFC 005, desugared pipe-forward **must** remain Prism-consistent with the equivalent method-chain or query-block form. +- **Incan `model` types:** Prism optimization legality **must** remain consistent with model-derived schema semantics and **must not** fall back to runtime-authored schema truth. +- **Substrait / execution:** Prism prepares plans for InQL RFC 002 emission and InQL RFC 004 execution, but it does not replace either sibling boundary. + +### Compatibility / migration + +This RFC is additive and architectural. It clarifies and stabilizes internal InQL planning semantics; it does not by itself introduce a source-level breaking change for authors or a serialized-plan breaking change for Substrait consumers. + +It may, however, motivate refactoring of implementation architecture so that planning, optimization, and emission concerns are separated more clearly than they were before this RFC existed. + +## Alternatives considered + +- **Keep Prism as a research note only** — rejected for now; the planning and optimization substrate is foundational enough that leaving it undocumented as an implementation note would keep key architectural boundaries implicit.
+- **Fold Prism fully into InQL RFC 002** — rejected; Substrait emission and internal planning are related but distinct concerns. Keeping them in one RFC makes the internal engine look like a boundary-format detail. +- **Define Prism as a cross-cutting platform beyond InQL immediately** — rejected for now; Prism may eventually be reused elsewhere, but this RFC keeps the normative scope concrete by defining Prism first as an InQL component with a clean standalone module boundary. + +## Drawbacks + +- Adds another foundational RFC to the series, which increases up-front design surface before implementation. +- Introduces a conceptual split between authored and optimized plan state that implementations must model carefully. +- Risks over-specifying internal architecture if future Incan constraints make some Prism design choices awkward. + +## Layers affected + +- **InQL specification** — sibling RFCs that reference logical planning, carrier behavior, Substrait lowering, or session execution **should** remain consistent with Prism as the internal planning substrate. +- **InQL library package** — public carriers and internal planning modules **should** preserve immutable carrier semantics over shared Prism-managed state. +- **Incan compiler** — if InQL surfaces lower through compiler-managed intermediate representations, those integrations **should** respect Prism's lineage and optimization invariants. +- **Execution / interchange** — Session-backed lowering and execution flows **must** treat Prism as internal preparation and Substrait as the boundary contract. +- **Documentation** — RFC indexes, architecture notes, and implementation planning notes **should** distinguish Prism from Substrait and from session execution. + +## Unresolved questions + +- Should Prism maintain one persistent graph with optimized overlays, or separate authored and optimized graphs with explicit references? 
+- Which optimization passes are part of the Prism north star immediately, and which should be deferred until after the first implementation? +- What is the most useful lineage metadata shape for explain/debug tooling without making normal plan construction expensive? +- Are there Incan language or tooling limitations around model-derived schema facts that Prism depends on and that may require an upstream Incan RFC? + + diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md index cffa79b..b968f0a 100644 --- a/docs/rfcs/README.md +++ b/docs/rfcs/README.md @@ -8,14 +8,16 @@ InQL uses its **own** RFC series (starting at 000), independent of the [Incan la | -------------- | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | | [000][rfc-000] | Planned | Language specification — core model, naming, schema shapes, layer boundaries | | | [001][rfc-001] | Implemented | Dataset types and carriers (`DataSet[T]`, `BoundedDataSet[T]`, `UnboundedDataSet[T]`) — library package is **contract-complete** (types, `Self` method surface, `functions` imports); execution in RFC 004 | | -| [002][rfc-002] | Planned | Apache Substrait — `Rel`-level contract, mapping catalog, binding boundaries | | +| [002][rfc-002] | In Progress | Apache Substrait — `Rel`-level contract, mapping catalog, binding boundaries | | | [003][rfc-003] | Planned | `query {}` blocks — grammar, typing, Substrait lowering | | | [004][rfc-004] | Planned | Execution context — session, DataFusion, read/transform/write | | -| [005][rfc-005] | Blocked | Pipe-forward relational syntax (`\ | >`) — optional surface | | +| [005][rfc-005] | Blocked | Pipe-forward relational syntax (`|>`) — optional surface | | +| [006][rfc-006] | Blocked | Promote unnest/explode to core Substrait lowering — blocked on upstream Substrait standardization | | +| [007][rfc-007] 
| Draft | Prism logical planning and optimization engine | | -**Order:** [RFC 000][rfc-000] is the foundational language specification. [RFC 001][rfc-001] defines the dataset type hierarchy. [RFC 002][rfc-002] defines the Substrait interchange contract. [RFC 003][rfc-003] defines the `query {}` surface that lowers to Substrait per RFC 002 over carriers from RFC 001. [RFC 004][rfc-004] completes the end-to-end story: session, read, execute, write. [RFC 005][rfc-005] specifies optional pipe-forward syntax outside the RFC 000–004 milestone and currently blocked on Incan RFC 040. +**Order:** [RFC 000][rfc-000] is the foundational language specification. [RFC 001][rfc-001] defines the dataset type hierarchy. [RFC 002][rfc-002] defines the Substrait interchange contract. [RFC 003][rfc-003] defines the `query {}` surface that lowers to Substrait per RFC 002 over carriers from RFC 001. [RFC 004][rfc-004] completes the end-to-end story: session, read, execute, write. [RFC 005][rfc-005] specifies optional pipe-forward syntax outside the RFC 000–004 milestone and currently blocked on Incan RFC 040. [RFC 006][rfc-006] tracks promotion of unnest/explode from gap to core Substrait lowering, blocked on upstream Substrait standardization. [RFC 007][rfc-007] defines Prism as InQL's internal logical planning and optimization engine, sitting beneath carriers and upstream of Substrait emission and session execution. **v0.1 scope:** RFCs 000–004. When all five are resolved (Draft → Planned → Implemented), InQL v0.1 is complete: authors can read data, write typed queries, lower to Substrait, execute through DataFusion, and write results. 
@@ -31,4 +33,6 @@ New RFCs should follow [TEMPLATE.md] (aligned with Incan’s RFC structure, adap [rfc-003]: 003_inql_query_blocks.md [rfc-004]: 004_inql_execution_context.md [rfc-005]: 005_inql_pipe_forward.md +[rfc-006]: 006_unnest_core_substrait.md +[rfc-007]: 007_prism_planning_engine.md [incan-rfcs]: https://github.com/dannys-code-corner/incan/tree/main/workspaces/docs-site/docs/RFCs diff --git a/examples/hello.incn b/examples/hello.incn new file mode 100644 index 0000000..d3ca911 --- /dev/null +++ b/examples/hello.incn @@ -0,0 +1,4 @@ +@main +fn main() { + print("Hello, World!") +} diff --git a/incan.lock b/incan.lock index f501b25..4c3bfdd 100644 --- a/incan.lock +++ b/incan.lock @@ -4,8 +4,8 @@ [incan] format = 1 incan-version = "0.2.0-dev.5" -generated = "2026-03-27T13:23:49.961814Z" -deps-fingerprint = "sha256:17f122844d2fa1c9756f9a1976d222f15255557e74d975b8d8ff46536ea82b87" +generated = "2026-03-30T11:39:12.858306Z" +deps-fingerprint = "sha256:87bf2b26e77e2fb75979ce6cbe1128c88b63c617f3b1653eef19a2a3bbdfb4c6" cargo-features = [] cargo-no-default-features = false cargo-all-features = false @@ -16,6 +16,167 @@ lock = """ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "bitflags" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" + +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "cc" +version = "1.2.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1e928d4b69e3077709075a938a05ffbedfa53a84c8f766efbf8220bb1ff60e1" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "cmake" +version = "0.1.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678" +dependencies = [ + "cc", +] + +[[package]] +name = "dyn-clone" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" + +[[package]] +name = "either" +version = "1.15.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", + "wasip3", +] + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + [[package]] name = "incan_core" version = "0.2.0-dev.5" @@ -37,12 +198,105 @@ dependencies = [ "incan_derive", ] +[[package]] +name = "indexmap" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +dependencies = [ + "equivalent", + "hashbrown 0.16.1", + "serde", + "serde_core", +] + [[package]] name = "inql" version = "0.2.0-dev.5" dependencies = [ "incan_derive", "incan_stdlib", + "prost", + "prost-types", + "substrait", +] + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + +[[package]] +name = "libc" +version = "0.2.183" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "multimap" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "petgraph" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" +dependencies = [ + "fixedbitset", + "hashbrown 0.15.5", + "indexmap", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", ] [[package]] @@ -54,6 +308,64 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prost" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = 
"prost-build" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" +dependencies = [ + "heck", + "itertools", + "log", + "multimap", + "petgraph", + "prost", + "prost-types", + "regex", + "tempfile", +] + +[[package]] +name = "prost-derive" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" +dependencies = [ + "prost", +] + +[[package]] +name = "protobuf-src" +version = "2.1.1+27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6217c3504da19b85a3a4b2e9a5183d635822d83507ba0986624b5c05b83bfc40" +dependencies = [ + "cmake", +] + [[package]] name = "quote" version = "1.0.45" @@ -63,6 +375,223 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "regress" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2057b2325e68a893284d1538021ab90279adac1139957ca2a74426c6f118fb48" +dependencies = [ + "hashbrown 0.16.1", + "memchr", +] + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schemars" +version = "0.8.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615" +dependencies = [ + "dyn-clone", + "schemars_derive", + "serde", + "serde_json", +] + +[[package]] +name = "schemars_derive" +version = "0.8.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e265784ad618884abaea0600a9adf15393368d840e0222d101a072f3f7534d" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn", +] + +[[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +dependencies = [ + "serde", + "serde_core", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_derive_internals" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "indexmap", + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_tokenstream" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c49585c52c01f13c5c2ebb333f14f6885d76daa768d8a037d28017ec538c69" +dependencies = [ + "proc-macro2", + "quote", + "serde", + "syn", +] + +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "substrait" +version = "0.63.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e620ff4d5c02fd6f7752931aa74b16a26af66a63022cc1ad412c77edbe0bab47" +dependencies = [ + "heck", + "indexmap", + "prettyplease", + "prost", + "prost-build", + "prost-types", + "protobuf-src", + "regress", + "schemars", + "semver", + "serde", + "serde_json", + "serde_yaml", + "syn", + "typify", + "walkdir", +] + [[package]] name = "syn" version = "2.0.117" @@ -74,9 +603,281 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "typify" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5bcc6f62eb1fa8aa4098f39b29f93dcb914e17158b76c50360911257aa629" +dependencies = [ + "typify-impl", + "typify-macro", +] + +[[package]] +name = "typify-impl" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1eb359f7ffa4f9ebe947fa11a1b2da054564502968db5f317b7e37693cb2240" +dependencies = [ + "heck", + "log", + "proc-macro2", + "quote", + "regress", + "schemars", + "semver", + "serde", + "serde_json", + 
"syn", + "thiserror", + "unicode-ident", +] + +[[package]] +name = "typify-macro" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "911c32f3c8514b048c1b228361bebb5e6d73aeec01696e8cc0e82e2ffef8ab7a" +dependencies = [ + "proc-macro2", + "quote", + "schemars", + "semver", + "serde", + "serde_json", + "serde_tokenstream", + "syn", + "typify-impl", +] + [[package]] name = "unicode-ident" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasip2" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + 
"wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" 
+dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" """ diff --git a/incan.toml b/incan.toml index d171a11..40635e2 100644 --- a/incan.toml +++ b/incan.toml @@ -1,3 +1,10 @@ [project] name = "inql" version = "0.1.0" + +[rust-dependencies] +# Protobuf serialization and deserialization are provided via prost in the proto module. +prost = "0.14" +prost-types = "0.14" +# Build needs `protoc` on PATH (`brew install protobuf`), or `features = ["protoc"]` + CMake. 
+substrait = { version = "0.63", features = ["protoc"] } diff --git a/src/dataset.incn b/src/dataset.incn deleted file mode 100644 index 4e8310b..0000000 --- a/src/dataset.incn +++ /dev/null @@ -1,116 +0,0 @@ -""" -Dataset carriers for InQL (RFC 001). - -This module defines the *author-facing* type hierarchy used to carry schema-parameterized tabular data through -relational pipelines: - -```text -DataSet[T] - ├─ BoundedDataSet[T] (finite / batch) - │ ├─ DataFrame[T] (eager / materialized) - │ └─ LazyFrame[T] (deferred / planned) - └─ UnboundedDataSet[T] (streaming / unbounded) - └─ DataStream[T] (streaming) -``` - -The implementation here is intentionally backend-neutral: these APIs are the public surface. - -## Type hierarchy - -The dataset type hierarchy is rooted in the `DataSet[T]` trait, split into `BoundedDataSet[T]` (finite extent) and -`UnboundedDataSet[T]` (streaming/unbounded), with three concrete types: `DataFrame[T]` (materialized/eager), -`LazyFrame[T]` (deferred plan), and `DataStream[T]` (streaming). - -## Operation API - -The following methods are defined on `DataSet[T]`. 
- -- `filter(self, predicate: bool) -> Self` -- `join(self, other: Self, on: bool) -> Self` -- `select(self) -> Self` -- `group_by(self) -> Self` -- `agg(self) -> Self` -- `order_by(self) -> Self` -- `limit(self, n: int) -> Self` -- `explode(self) -> Self` - -## Documentation - -- **Explanation**: [docs/language/explanation/dataset_types.md](../docs/language/explanation/dataset_types.md) -- **Reference**: [docs/language/reference/dataset_types.md](../docs/language/reference/dataset_types.md) -- **Examples**: [examples/](../examples/) - -## Example patterns - -```incan -from pub::inql import LazyFrame, DataFrame, DataStream -from models import Order, Event - -# Filter and chain -orders.filter(.amount > 100).filter(.status == "active") - -# Join -orders.join(customers, .order.customer_id == .customer.id) - -# Group and aggregate -orders.group_by(.customer_id).agg(total(.amount)) - -# Order and limit -orders.order_by(.amount.desc()).limit(10) - -# Bounded vs unbounded signatures -def batch_only(data: BoundedDataSet[Order]) -> None: ... -def streaming_only(data: UnboundedDataSet[Event]) -> None: ... -def generic_any(data: DataSet[T]) -> None: ... 
-``` -""" - -from std.testing import fail_t as NotImplementedError - -# ---- DataSet trait ---- -pub trait DataSet[T]: - def filter(self, predicate: bool) -> Self: - return NotImplementedError("InQL DataSet.filter is not implemented yet") - - def join(self, other: Self, on: bool) -> Self: - return NotImplementedError("InQL DataSet.join is not implemented yet") - - def select(self) -> Self: - return NotImplementedError("InQL DataSet.select is not implemented yet") - - def group_by(self) -> Self: - return NotImplementedError("InQL DataSet.group_by is not implemented yet") - - def agg(self) -> Self: - return NotImplementedError("InQL DataSet.agg is not implemented yet") - - def order_by(self) -> Self: - return NotImplementedError("InQL DataSet.order_by is not implemented yet") - - def limit(self, n: int) -> Self: - return NotImplementedError("InQL DataSet.limit is not implemented yet") - - def explode(self) -> Self: - return NotImplementedError("InQL DataSet.explode is not implemented yet") - - -# ---- BoundedDataSet trait and concrete types ---- -pub trait BoundedDataSet[T] with DataSet[T]: - pass - - -pub class DataFrame[T] with BoundedDataSet: - pub _row_schema_marker: T - - -pub class LazyFrame[T] with BoundedDataSet: - pub _row_schema_marker: T - - -# ---- UnboundedDataSet trait and concrete types ---- -pub trait UnboundedDataSet[T] with DataSet[T]: - pass - - -pub class DataStream[T] with UnboundedDataSet: - pub _row_schema_marker: T diff --git a/src/dataset/mod.incn b/src/dataset/mod.incn new file mode 100644 index 0000000..8cf84c8 --- /dev/null +++ b/src/dataset/mod.incn @@ -0,0 +1,199 @@ +""" +Dataset carriers for InQL (RFC 001). 
+ +This module defines the *author-facing* type hierarchy used to carry schema-parameterized tabular data through +relational pipelines: + +```text +DataSet[T] + ├─ BoundedDataSet[T] (finite / batch) + │ ├─ DataFrame[T] (eager / materialized) + │ └─ LazyFrame[T] (deferred / planned) + └─ UnboundedDataSet[T] (streaming / unbounded) + └─ DataStream[T] (streaming) +``` + +The implementation here is intentionally backend-neutral: these APIs are the public surface. + +## Type hierarchy + +The dataset type hierarchy is rooted in the `DataSet[T]` trait, split into `BoundedDataSet[T]` (finite extent) and +`UnboundedDataSet[T]` (streaming/unbounded), with three concrete types: `DataFrame[T]` (materialized/eager), +`LazyFrame[T]` (deferred plan), and `DataStream[T]` (streaming). + +## Operation API + +The following methods are defined on `DataSet[T]`. + +- `filter(self, predicate: bool) -> Self` +- `join(self, other: Self, on: bool) -> Self` +- `select(self) -> Self` +- `group_by(self) -> Self` +- `agg(self) -> Self` +- `order_by(self) -> Self` +- `limit(self, n: int) -> Self` +- `explode(self) -> Self` + +## Documentation + +- **Explanation**: [docs/language/explanation/dataset_types.md](../docs/language/explanation/dataset_types.md) +- **Reference**: [docs/language/reference/dataset_types.md](../docs/language/reference/dataset_types.md) +- **Examples**: [examples/](../examples/) + +## Example patterns + +```incan +from pub::inql import LazyFrame, DataFrame, DataStream +from models import Order, Event + +# Filter and chain +orders.filter(.amount > 100).filter(.status == "active") + +# Join +orders.join(customers, .order.customer_id == .customer.id) + +# Group and aggregate +orders.group_by(.customer_id).agg(total(.amount)) + +# Order and limit +orders.order_by(.amount.desc()).limit(10) + +# Bounded vs unbounded signatures +def batch_only(data: BoundedDataSet[Order]) -> None: ... +def streaming_only(data: UnboundedDataSet[Event]) -> None: ... 
+def generic_any(data: DataSet[T]) -> None: ... +``` +""" + +from rust::substrait::proto import Plan, Rel +from substrait.plan import plan_from_root_relation +from dataset.ops import filter_ds, join_ds, select_ds, group_by_ds, agg_ds, order_by_ds, limit_ds, explode_ds + +# ---- DataSet trait ---- +pub trait DataSet[T with Clone]: + def to_substrait_plan(self) -> Plan: ... + + def filter(self, predicate: bool) -> Self: ... + + def join(self, other: Self, on: bool) -> Self: ... + + def select(self) -> Self: ... + + def group_by(self) -> Self: ... + + def agg(self) -> Self: ... + + def order_by(self) -> Self: ... + + def limit(self, n: int) -> Self: ... + + def explode(self) -> Self: ... + + +# ---- BoundedDataSet trait and concrete types ---- +pub trait BoundedDataSet[T with Clone] with DataSet[T]: + pass + + +pub class DataFrame[T with Clone] with BoundedDataSet: + pub _row_schema_marker: T + pub _substrait_rel: Rel + + def to_substrait_plan(self) -> Plan: + # TODO(#229): switch back to an empty root-name list once Incan preserves element type context for `[]`. 
+ return plan_from_root_relation(self._substrait_rel, [str("id")]) + + def filter(self, predicate: bool) -> Self: + return DataFrame(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=filter_ds(self._substrait_rel, predicate)) + + def join(self, other: Self, on: bool) -> Self: + return DataFrame(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=join_ds(self._substrait_rel, other._substrait_rel, on)) + + def select(self) -> Self: + return DataFrame(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=select_ds(self._substrait_rel)) + + def group_by(self) -> Self: + return DataFrame(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=group_by_ds(self._substrait_rel)) + + def agg(self) -> Self: + return DataFrame(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=agg_ds(self._substrait_rel)) + + def order_by(self) -> Self: + return DataFrame(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=order_by_ds(self._substrait_rel)) + + def limit(self, n: int) -> Self: + return DataFrame(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=limit_ds(self._substrait_rel, n)) + + def explode(self) -> Self: + return DataFrame(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=explode_ds(self._substrait_rel)) + + +pub class LazyFrame[T with Clone] with BoundedDataSet: + pub _row_schema_marker: T + pub _substrait_rel: Rel + + def to_substrait_plan(self) -> Plan: + # TODO(#229): switch back to an empty root-name list once Incan preserves element type context for `[]`. 
+ return plan_from_root_relation(self._substrait_rel, [str("id")]) + + def filter(self, predicate: bool) -> Self: + return LazyFrame(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=filter_ds(self._substrait_rel, predicate)) + + def join(self, other: Self, on: bool) -> Self: + return LazyFrame(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=join_ds(self._substrait_rel, other._substrait_rel, on)) + + def select(self) -> Self: + return LazyFrame(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=select_ds(self._substrait_rel)) + + def group_by(self) -> Self: + return LazyFrame(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=group_by_ds(self._substrait_rel)) + + def agg(self) -> Self: + return LazyFrame(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=agg_ds(self._substrait_rel)) + + def order_by(self) -> Self: + return LazyFrame(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=order_by_ds(self._substrait_rel)) + + def limit(self, n: int) -> Self: + return LazyFrame(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=limit_ds(self._substrait_rel, n)) + + def explode(self) -> Self: + return LazyFrame(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=explode_ds(self._substrait_rel)) + + +# ---- UnboundedDataSet trait and concrete types ---- +pub trait UnboundedDataSet[T with Clone] with DataSet[T]: + pass + + +pub class DataStream[T with Clone] with UnboundedDataSet: + pub _row_schema_marker: T + pub _substrait_rel: Rel + + def to_substrait_plan(self) -> Plan: + # TODO(#229): switch back to an empty root-name list once Incan preserves element type context for `[]`. 
+ return plan_from_root_relation(self._substrait_rel, [str("id")]) + + def filter(self, predicate: bool) -> Self: + return DataStream(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=filter_ds(self._substrait_rel, predicate)) + + def join(self, other: Self, on: bool) -> Self: + return DataStream(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=join_ds(self._substrait_rel, other._substrait_rel, on)) + + def select(self) -> Self: + return DataStream(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=select_ds(self._substrait_rel)) + + def group_by(self) -> Self: + return DataStream(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=group_by_ds(self._substrait_rel)) + + def agg(self) -> Self: + return DataStream(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=agg_ds(self._substrait_rel)) + + def order_by(self) -> Self: + return DataStream(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=order_by_ds(self._substrait_rel)) + + def limit(self, n: int) -> Self: + return DataStream(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=limit_ds(self._substrait_rel, n)) + + def explode(self) -> Self: + return DataStream(_row_schema_marker=self._row_schema_marker.clone(), _substrait_rel=explode_ds(self._substrait_rel)) diff --git a/src/dataset/ops.incn b/src/dataset/ops.incn new file mode 100644 index 0000000..0f5f700 --- /dev/null +++ b/src/dataset/ops.incn @@ -0,0 +1,35 @@ +"""Canonical DataSet relation operation functions for RFC 002.""" + +from rust::substrait::proto import Rel +from substrait.plan import aggregate_rel, explode_extension_uri, extension_single_rel, fetch_rel, filter_rel, join_rel, project_rel, sort_rel + +pub def filter_ds(rel: Rel, predicate: bool) -> Rel: + return filter_rel(rel, predicate) + + +pub def join_ds(left_rel: Rel, right_rel: Rel, on: bool) -> Rel: + return join_rel(left_rel, right_rel, on) + + +pub def select_ds(rel: Rel) -> 
Rel: + return project_rel(rel) + + +pub def group_by_ds(rel: Rel) -> Rel: + return aggregate_rel(rel, false) + + +pub def agg_ds(rel: Rel) -> Rel: + return aggregate_rel(rel, true) + + +pub def order_by_ds(rel: Rel) -> Rel: + return sort_rel(rel) + + +pub def limit_ds(rel: Rel, n: int) -> Rel: + return fetch_rel(rel, 0, n) + + +pub def explode_ds(rel: Rel) -> Rel: + return extension_single_rel(rel, explode_extension_uri()) diff --git a/src/lib.incn b/src/lib.incn index 32d5f53..6e2b56b 100644 --- a/src/lib.incn +++ b/src/lib.incn @@ -6,5 +6,69 @@ Consumers depend on this package via `[dependencies]` and import with `from pub: """ pub from dataset import BoundedDataSet, DataFrame, DataSet, DataStream, LazyFrame, UnboundedDataSet +pub from dataset.ops import agg_ds, explode_ds, filter_ds, group_by_ds, join_ds, limit_ds, order_by_ds, select_ds pub from functions import count_rows, total pub from metadata import inql_version +pub from substrait.schema import ( + DemoCustomer, + RowColumnSpec, + RowShapeSpec, + SubstraitPrimitiveKind, + demo_customer_row_shape, + row_shape_to_named_struct, + row_shape_field_names, + row_shape_to_substrait_struct_type, + substrait_row_type_encoded_len, +) +pub from substrait.plan import ( + SubstraitJoinKind, + SubstraitSetOperation, + aggregate_rel, + cross_rel, + empty_plan, + explode_extension_uri, + extension_single_rel, + fetch_rel, + filter_rel, + join_rel, + join_rel_of_kind, + plan_contains_relation_kind, + plan_encoded_len, + plan_has_extension_urn, + plan_from_local_files, + plan_from_named_table, + plan_from_root_relation, + plan_from_virtual_table, + project_rel, + read_kind_name, + read_local_files_rel, + read_named_table_rel, + read_virtual_table_rel, + reference_subtree_ordinal, + relation_kind_name, + rel_contains_kind, + reference_rel, + root_rel, + registered_substrait_extension_uris, + set_rel, + set_operation_name, + set_rel_of_kind, + sort_rel, + substrait_producer_name, + substrait_release_tag, +) +pub from 
substrait.conformance import ( + ConformanceCapabilityTags, + ConformancePortability, + ConformanceProfileTag, + ConformanceRel, + ConformanceReferences, + ConformanceStatus, + CoreScenarioKey, + SubstraitConformanceScenario, + core_scenario_emission_matches, + core_scenario_plan, + core_scenario, + core_scenarios, + scenario_matches_root_shape, +) diff --git a/src/substrait/conformance.incn b/src/substrait/conformance.incn new file mode 100644 index 0000000..d06503f --- /dev/null +++ b/src/substrait/conformance.incn @@ -0,0 +1,276 @@ +"""Substrait conformance corpus represented as typed InQL models over real proto emission.""" + +from rust::substrait::proto import Plan +from substrait.plan import ( + SubstraitJoinKind, + SubstraitSetOperation, + aggregate_rel, + cross_rel, + fetch_rel, + filter_rel, + join_rel_of_kind, + plan_contains_relation_kind, + plan_encoded_len, + plan_from_local_files, + plan_from_named_table, + plan_from_root_relation, + plan_from_virtual_table, + project_rel, + read_kind_name, + read_named_table_rel, + reference_subtree_ordinal, + relation_kind_name, + reference_rel, + root_rel, + set_operation_name, + set_rel_of_kind, + sort_rel, +) + +def _clone_conformance_tag[T](value: T) -> T: + return value.clone() + + +pub enum ConformanceStatus: + Core + Extension + Gap + OptionalMutation + + +pub enum ConformanceProfileTag: + ReadQueryCore + OptionalMutation + GapPolicy + ReadBindingBoundary + + +@derive(Clone) +pub enum ConformanceRel: + Read + Filter + Project + Join + Cross + Aggregate + Sort + Fetch + Set + Reference + Write + Update + Ddl + ExtensionLeaf + ExtensionSingle + ExtensionMulti + + +pub enum ConformancePortability: + Portable + ConsumerConditional + NonPortable + + +pub enum CoreScenarioKey: + ReadNamedTable + ReadLocalFiles + ReadVirtualTable + FilterRows + ProjectComputedColumns + JoinRelVariants + CrossRelCartesian + AggregateGroupingSets + SortRelOrdering + FetchRelLimitOffset + SetRelOperations + ReferenceRelSharedSubplan + 
+ +@derive(Clone) +pub type ConformanceCapabilityTags = newtype list[str] + + +@derive(Clone) +pub type ConformanceReferences = newtype list[str] + + +def _capability_tags(tags: list[str]) -> ConformanceCapabilityTags: + return ConformanceCapabilityTags(tags) + + +def _references(paths: list[str]) -> ConformanceReferences: + return ConformanceReferences(paths) + + +def _refs_read_root_binding() -> ConformanceReferences: + return _references([str("docs/rfcs/002_apache_substrait_integration.md"), str("docs/language/reference/substrait/operator_catalog.md"), str("docs/language/reference/substrait/read_root_binding_contract.md")]) + + +def _refs_query_surface() -> ConformanceReferences: + return _references([str("docs/rfcs/002_apache_substrait_integration.md"), str("docs/rfcs/003_inql_query_blocks.md"), str("docs/language/reference/substrait/operator_catalog.md")]) + + +def _refs_core_operator() -> ConformanceReferences: + return _references([str("docs/rfcs/002_apache_substrait_integration.md"), str("docs/language/reference/substrait/operator_catalog.md")]) + + +pub model SubstraitConformanceScenario: + pub scenario_id: str + pub title: str + pub status: ConformanceStatus + pub profile_tags: list[ConformanceProfileTag] + pub capability_tags: ConformanceCapabilityTags + pub required_rels: list[ConformanceRel] + pub portability: ConformancePortability + pub intent: str + pub required_rel_shape: str + pub expected_constraints: str + pub references: ConformanceReferences + + +def _core_scenario(scenario_id: str, title: str, capability_tags: ConformanceCapabilityTags, required_rels: list[ConformanceRel], intent: str, required_rel_shape: str, expected_constraints: str, references: ConformanceReferences) -> SubstraitConformanceScenario: + return SubstraitConformanceScenario(scenario_id=scenario_id, title=title, status=ConformanceStatus.Core, profile_tags=[ConformanceProfileTag.ReadQueryCore], capability_tags=capability_tags, required_rels=required_rels, 
portability=ConformancePortability.Portable, intent=intent, required_rel_shape=required_rel_shape, expected_constraints=expected_constraints, references=references) + + +pub def core_scenario(key: CoreScenarioKey) -> SubstraitConformanceScenario: + """Lookup one core conformance scenario by enum key.""" + match key: + CoreScenarioKey.ReadNamedTable => + return _core_scenario(scenario_id=str("inql.substrait.core.read_named_table.001"), title=str("ReadRel named table without secret material"), capability_tags=_capability_tags([str("read"), str("named-table")]), required_rels=[ConformanceRel.Read], intent=str("Logical read from a registered table name without embedding execution secrets."), required_rel_shape=str("ReadRel(NamedTable) with model-compatible schema."), expected_constraints=str("No secrets or resolved execution-bound endpoint material in normative plan payload."), references=_refs_read_root_binding()) + CoreScenarioKey.ReadLocalFiles => + return _core_scenario(scenario_id=str("inql.substrait.core.read_local_files.001"), title=str("ReadRel local files with portable format fields"), capability_tags=_capability_tags([str("read"), str("local-files")]), required_rels=[ConformanceRel.Read], intent=str("Logical scan of file-backed sources with pinned-revision format descriptors."), required_rel_shape=str("ReadRel(LocalFiles) with supported file format fields."), expected_constraints=str("No credential or session-state material in normative plan payload."), references=_refs_read_root_binding()) + CoreScenarioKey.ReadVirtualTable => + return _core_scenario(scenario_id=str("inql.substrait.core.read_virtual_table.001"), title=str("ReadRel virtual table with schema-aligned rows"), capability_tags=_capability_tags([str("read"), str("virtual-table"), str("literal-rows")]), required_rels=[ConformanceRel.Read], intent=str("Inline row materialization through VirtualTable payload."), required_rel_shape=str("ReadRel(VirtualTable) with schema-consistent embedded rows."), 
expected_constraints=str("No external binding required to interpret inline rows."), references=_refs_read_root_binding()) + CoreScenarioKey.FilterRows => + return _core_scenario(scenario_id=str("inql.substrait.core.filter_rows.001"), title=str("FilterRel boolean row predicate"), capability_tags=_capability_tags([str("filter"), str("predicate")]), required_rels=[ConformanceRel.Filter], intent=str("Boolean predicate filtering over relation input."), required_rel_shape=str("FilterRel(predicate) over one child relation."), expected_constraints=str("Boundary coverage is relation-shape only; richer query-surface parity is deferred."), references=_refs_core_operator()) + CoreScenarioKey.ProjectComputedColumns => + return _core_scenario(scenario_id=str("inql.substrait.core.project_computed_columns.001"), title=str("ProjectRel boundary scaffold"), capability_tags=_capability_tags([str("project"), str("shape-scaffold")]), required_rels=[ConformanceRel.Project], intent=str("Boundary-level ProjectRel envelope for package-authored plans."), required_rel_shape=str("ProjectRel(expressions) over one child relation."), expected_constraints=str("Current package code verifies shape only; computed-column and window semantics are deferred."), references=_refs_core_operator()) + CoreScenarioKey.JoinRelVariants => + return _core_scenario(scenario_id=str("inql.substrait.core.join_rel_variants.001"), title=str("JoinRel variant emission boundary"), capability_tags=_capability_tags([str("join"), str("inner"), str("left"), str("semi"), str("anti"), str("single"), str("mark")]), required_rels=[ConformanceRel.Join], intent=str("Explicit JoinRel variant emission over two child inputs."), required_rel_shape=str("JoinRel(join_type, expression) with optional post_join_filter when needed."), expected_constraints=str("Variant selection is owned by the boundary helper; broader planning semantics are deferred."), references=_refs_core_operator()) + CoreScenarioKey.CrossRelCartesian => + return 
_core_scenario(scenario_id=str("inql.substrait.core.cross_rel_cartesian.001"), title=str("CrossRel cartesian product semantics"), capability_tags=_capability_tags([str("cross"), str("cartesian-product")]), required_rels=[ConformanceRel.Cross], intent=str("Cartesian multiplication of rows across two inputs."), required_rel_shape=str("CrossRel(left, right) without predicate fields."), expected_constraints=str("Row cardinality follows cartesian product semantics."), references=_refs_core_operator()) + CoreScenarioKey.AggregateGroupingSets => + return _core_scenario(scenario_id=str("inql.substrait.core.aggregate_grouping_sets.001"), title=str("AggregateRel boundary scaffold"), capability_tags=_capability_tags([str("aggregate"), str("shape-scaffold")]), required_rels=[ConformanceRel.Aggregate], intent=str("Boundary-level AggregateRel envelope for package-authored plans."), required_rel_shape=str("AggregateRel(groupings, measures) over child relation."), expected_constraints=str("Current package code verifies scaffold shape only; grouping-set and distinct semantics are deferred."), references=_refs_core_operator()) + CoreScenarioKey.SortRelOrdering => + return _core_scenario(scenario_id=str("inql.substrait.core.sort_rel_ordering.001"), title=str("SortRel deterministic ordering contract"), capability_tags=_capability_tags([str("sort"), str("order-by"), str("collation")]), required_rels=[ConformanceRel.Sort], intent=str("Deterministic collation semantics for ordered result sets."), required_rel_shape=str("SortRel(collation_fields) over child relation."), expected_constraints=str("Ordering behavior follows declared collation and null-order settings."), references=_refs_core_operator()) + CoreScenarioKey.FetchRelLimitOffset => + return _core_scenario(scenario_id=str("inql.substrait.core.fetch_rel_limit_offset.001"), title=str("FetchRel offset and count windowing"), capability_tags=_capability_tags([str("fetch"), str("limit"), str("offset")]), 
required_rels=[ConformanceRel.Fetch], intent=str("Top-N and offset-based windowing over ordered or unordered relation output."), required_rel_shape=str("FetchRel(offset,count) over child relation."), expected_constraints=str("Fetch changes row window only; it does not alter row values or schema."), references=_refs_core_operator()) + CoreScenarioKey.SetRelOperations => + return _core_scenario(scenario_id=str("inql.substrait.core.set_rel_operations.001"), title=str("SetRel operation emission boundary"), capability_tags=_capability_tags([str("set"), str("union"), str("intersect"), str("except")]), required_rels=[ConformanceRel.Set], intent=str("Explicit SetRel operation emission for schema-compatible inputs."), required_rel_shape=str("SetRel(operation, inputs) with compatible schemas."), expected_constraints=str("Operation selection is owned by the boundary helper; richer planning semantics are deferred."), references=_refs_core_operator()) + CoreScenarioKey.ReferenceRelSharedSubplan => + return _core_scenario(scenario_id=str("inql.substrait.core.reference_rel_shared_subplan.001"), title=str("ReferenceRel ordinal emission boundary"), capability_tags=_capability_tags([str("reference"), str("shared-subplan"), str("plan-dag")]), required_rels=[ConformanceRel.Reference], intent=str("Boundary-level ReferenceRel ordinal emission."), required_rel_shape=str("ReferenceRel(subtree_ordinal) inside one plan envelope."), expected_constraints=str("Current package code verifies ordinal preservation only; shared-subplan planning is deferred."), references=_refs_core_operator()) + + +pub def core_scenarios() -> list[SubstraitConformanceScenario]: + """Return all currently defined read/query core scenarios.""" + return [core_scenario(CoreScenarioKey.ReadNamedTable), core_scenario(CoreScenarioKey.ReadLocalFiles), core_scenario(CoreScenarioKey.ReadVirtualTable), core_scenario(CoreScenarioKey.FilterRows), core_scenario(CoreScenarioKey.ProjectComputedColumns), 
core_scenario(CoreScenarioKey.JoinRelVariants), core_scenario(CoreScenarioKey.CrossRelCartesian), core_scenario(CoreScenarioKey.AggregateGroupingSets), core_scenario(CoreScenarioKey.SortRelOrdering), core_scenario(CoreScenarioKey.FetchRelLimitOffset), core_scenario(CoreScenarioKey.SetRelOperations), core_scenario(CoreScenarioKey.ReferenceRelSharedSubplan)] + + +pub def core_scenario_plan(key: CoreScenarioKey) -> Plan: + """Build a deterministic proto-backed plan for a core scenario key.""" + # TODO(#229): most of these root-name lists can become `[]` again once empty list literals lower with type context. + match key: + CoreScenarioKey.ReadNamedTable => return plan_from_named_table(str("orders")) + CoreScenarioKey.ReadLocalFiles => return plan_from_local_files(str("file:///tmp/orders.parquet")) + CoreScenarioKey.ReadVirtualTable => return plan_from_virtual_table(str("inline_orders")) + CoreScenarioKey.FilterRows => return plan_from_root_relation(filter_rel(read_named_table_rel(str("orders")), true), [str("id")]) + CoreScenarioKey.ProjectComputedColumns => return plan_from_root_relation(project_rel(read_named_table_rel(str("orders"))), [str("id")]) + CoreScenarioKey.JoinRelVariants => return plan_from_root_relation(join_rel_of_kind(read_named_table_rel(str("orders")), read_named_table_rel(str("customers")), true, SubstraitJoinKind.Left), [str("id")]) + CoreScenarioKey.CrossRelCartesian => return plan_from_root_relation(cross_rel(read_named_table_rel(str("left_source")), read_named_table_rel(str("right_source"))), [str("id")]) + CoreScenarioKey.AggregateGroupingSets => return plan_from_root_relation(aggregate_rel(read_named_table_rel(str("orders")), true), [str("id")]) + CoreScenarioKey.SortRelOrdering => return plan_from_root_relation(sort_rel(read_named_table_rel(str("orders"))), [str("id")]) + CoreScenarioKey.FetchRelLimitOffset => return plan_from_root_relation(fetch_rel(read_named_table_rel(str("orders")), 10, 25), [str("id")]) + CoreScenarioKey.SetRelOperations 
=> return plan_from_root_relation(set_rel_of_kind(read_named_table_rel(str("orders_current")), read_named_table_rel(str("orders_archive")), SubstraitSetOperation.UnionDistinct), [str("id")]) + CoreScenarioKey.ReferenceRelSharedSubplan => return plan_from_root_relation(reference_rel(7), [str("id")]) + + +def _kind_to_conformance_rel(kind: str) -> ConformanceRel: + if kind == str("ReadRel"): + return ConformanceRel.Read + elif kind == str("FilterRel"): + return ConformanceRel.Filter + elif kind == str("ProjectRel"): + return ConformanceRel.Project + elif kind == str("JoinRel"): + return ConformanceRel.Join + elif kind == str("CrossRel"): + return ConformanceRel.Cross + elif kind == str("AggregateRel"): + return ConformanceRel.Aggregate + elif kind == str("SortRel"): + return ConformanceRel.Sort + elif kind == str("FetchRel"): + return ConformanceRel.Fetch + elif kind == str("SetRel"): + return ConformanceRel.Set + elif kind == str("ReferenceRel"): + return ConformanceRel.Reference + elif kind == str("WriteRel"): + return ConformanceRel.Write + elif kind == str("UpdateRel"): + return ConformanceRel.Update + elif kind == str("DdlRel"): + return ConformanceRel.Ddl + elif kind == str("ExtensionSingleRel"): + return ConformanceRel.ExtensionSingle + elif kind == str("ExtensionMultiRel"): + return ConformanceRel.ExtensionMulti + return ConformanceRel.ExtensionLeaf + + +pub def scenario_matches_root_shape(scenario: SubstraitConformanceScenario, plan: Plan) -> bool: + """Check whether the emitted plan satisfies the scenario root contract.""" + if plan_encoded_len(plan) == 0: + return false + # root_rel() substitutes a placeholder NamedTable read when a plan has no relations, and a versioned empty plan still encodes to a non-zero length; reject it here so it cannot satisfy a read scenario. + if len(plan.relations) == 0: + return false + root = root_rel(plan) + for expected in scenario.required_rels: + if expected == ConformanceRel.Read: + if str("named-table") in scenario.capability_tags.0: + return relation_kind_name(root) == str("ReadRel") and read_kind_name(root) == str("NamedTable") + elif str("local-files") in scenario.capability_tags.0: + return relation_kind_name(root) == str("ReadRel") and read_kind_name(root) == 
str("LocalFiles") + elif str("virtual-table") in scenario.capability_tags.0: + return relation_kind_name(root) == str("ReadRel") and read_kind_name(root) == str("VirtualTable") + return relation_kind_name(root) == str("ReadRel") + elif relation_kind_name(root) != relation_kind_name_from_conformance(expected): + return false + return true + return false + + +pub def core_scenario_emission_matches(key: CoreScenarioKey) -> bool: + """Validate one core scenario contract against the proto-backed emission output.""" + plan = core_scenario_plan(key) + scenario = core_scenario(key) + if scenario_matches_root_shape(scenario, plan) == false: + return false + for expected in scenario.required_rels: + if plan_contains_relation_kind(plan, relation_kind_name_from_conformance(expected)) == false: + return false + root = root_rel(plan) + match key: + CoreScenarioKey.SetRelOperations => + if set_operation_name(root) != str("UnionDistinct"): + return false + CoreScenarioKey.ReferenceRelSharedSubplan => + if reference_subtree_ordinal(root) != 7: + return false + _ => pass + return plan_encoded_len(plan) > 0 + + +def relation_kind_name_from_conformance(rel: ConformanceRel) -> str: + """Map a conformance relation tag to its relation-kind label, using the same labels that `_kind_to_conformance_rel` accepts.""" + # Exhaustive on purpose: a wildcard arm used to turn ExtensionMulti, Write, Update and Ddl into "UnknownRel", which never matched a real ExtensionMultiRel node and falsely matched any unrecognised relation. + match rel: + ConformanceRel.Read => return str("ReadRel") + ConformanceRel.Filter => return str("FilterRel") + ConformanceRel.Project => return str("ProjectRel") + ConformanceRel.Join => return str("JoinRel") + ConformanceRel.Cross => return str("CrossRel") + ConformanceRel.Aggregate => return str("AggregateRel") + ConformanceRel.Sort => return str("SortRel") + ConformanceRel.Fetch => return str("FetchRel") + ConformanceRel.Set => return str("SetRel") + ConformanceRel.Reference => return str("ReferenceRel") + ConformanceRel.Write => return str("WriteRel") + ConformanceRel.Update => return str("UpdateRel") + ConformanceRel.Ddl => return str("DdlRel") + ConformanceRel.ExtensionLeaf => return str("ExtensionLeafRel") + ConformanceRel.ExtensionSingle => return str("ExtensionSingleRel") + ConformanceRel.ExtensionMulti => return str("ExtensionMultiRel") diff --git a/src/substrait/mod.incn b/src/substrait/mod.incn new file mode 100644 index 0000000..2fe8ea0 --- /dev/null +++
b/src/substrait/mod.incn @@ -0,0 +1,10 @@ +""" +This module contains the Substrait implementation for InQL. +It uses the Substrait crate to be able to parse and serialize Substrait plans. + + https://docs.rs/substrait/latest/substrait/ + +Protobuf serialization and deserialization are provided via prost in the proto module. +""" +from rust::prost import Message +from rust::substrait::proto import Plan diff --git a/src/substrait/plan.incn b/src/substrait/plan.incn new file mode 100644 index 0000000..bc2b7c6 --- /dev/null +++ b/src/substrait/plan.incn @@ -0,0 +1,499 @@ +""" +Proto-backed Substrait plan construction for InQL RFC 002. + +This module is the canonical emission boundary for Substrait plans. It uses the real `substrait::proto` types rather +than a hand-rolled parallel plan model. + +TODO: (rfc-002) revisit this module's public surface once the proto-backed API stabilizes. + A thin wrapper class or classmethod-based entrypoint may read more cleanly than the current free-function-heavy + shape, as long as it remains a wrapper over real proto plans rather than a parallel semantic model. 
+""" + +from rust::prost import Message +from rust::prost_types import Any +from rust::std::boxed import Box +from rust::std::primitive import i32 as RustI32 +from rust::substrait::proto import AggregateRel, CrossRel, Expression, ExtensionSingleRel, FetchRel, FilterRel, JoinRel, NamedStruct, Plan, PlanRel, ProjectRel, ReadRel, ReferenceRel, Rel, RelCommon, RelRoot, SetRel, SortField, SortRel, Version +from rust::substrait::proto::aggregate_rel import Grouping, Measure +from rust::substrait::proto::expression import Literal, RexType +from rust::substrait::proto::expression::literal import LiteralType +from rust::substrait::proto::expression::nested import Struct as NestedStruct +from rust::substrait::proto::extensions import SimpleExtensionUrn +from rust::substrait::proto::fetch_rel import CountMode, OffsetMode +from rust::substrait::proto::join_rel import JoinType +from rust::substrait::proto::plan_rel import RelType as PlanRelType +from rust::substrait::proto::read_rel import LocalFiles +from rust::substrait::proto::read_rel import NamedTable as ReadNamedTable +from rust::substrait::proto::read_rel import ReadType, VirtualTable +from rust::substrait::proto::read_rel::local_files import FileOrFiles +from rust::substrait::proto::read_rel::local_files::file_or_files import FileFormat +from rust::substrait::proto::read_rel::local_files::file_or_files import ParquetReadOptions +from rust::substrait::proto::read_rel::local_files::file_or_files import PathType +from rust::substrait::proto::rel import RelType +from rust::substrait::proto::rel_common import Direct, EmitKind +from rust::substrait::proto::set_rel import SetOp +from rust::substrait::proto::sort_field import SortDirection, SortKind + +from substrait.schema import ( + RowColumnSpec, + RowShapeSpec, + SubstraitPrimitiveKind, + row_shape_to_named_struct, +) + +@derive(Clone) +pub type RelationKindName = newtype str + + +# Public join variants exposed by the thin RFC 002 Substrait boundary. 
+pub enum SubstraitJoinKind: + Inner + Left + Right + Outer + Semi + Anti + Single + Mark + + +# Public set-operation variants exposed by the thin RFC 002 Substrait boundary. +pub enum SubstraitSetOperation: + MinusPrimary + IntersectPrimary + IntersectMultiset + MinusMultiset + UnionDistinct + UnionAll + +# --- Internal helpers ------------------------------------------------------------------------------------------------ +# These keep the public RFC 002 surface compact by centralizing common proto defaults, relation wrapping, and temporary +# compatibility shims. + +def _logical_shape(name: str) -> RowShapeSpec: + return RowShapeSpec(model_name=name, columns=[RowColumnSpec(name=str("id"), kind=SubstraitPrimitiveKind.I64, nullable=false)]) + + +def _default_named_struct(name: str) -> NamedStruct: + return row_shape_to_named_struct(_logical_shape(name)) + + +def _default_release_tag() -> str: + return str("v0.63.0") + + +def _default_producer() -> str: + return str("inql-rfc002") + + +def _default_version() -> Version: + return Version(major_number=0, minor_number=63, patch_number=0, git_hash=str(""), producer=_default_producer()) + + +def _explode_extension_uri_value() -> str: + return str("https://inql.io/extensions/v0.1/unnest.yaml#explode") + + +def _default_extension_uris() -> list[str]: + return [_explode_extension_uri_value()] + + +def _direct_common() -> RelCommon: + return RelCommon(hint=None, advanced_extension=None, emit_kind=Some(EmitKind.Direct(Direct()))) + + +def _bool_expr(value: bool) -> Expression: + return Expression(rex_type=Some(RexType.Literal(Literal(literal_type=Some(LiteralType.Boolean(value)), nullable=false, type_variation_reference=0)))) + + +def _i64_expr(value: int) -> Expression: + return Expression(rex_type=Some(RexType.Literal(Literal(literal_type=Some(LiteralType.I64(value)), nullable=false, type_variation_reference=0)))) + + +def _string_expr(value: str) -> Expression: + return 
Expression(rex_type=Some(RexType.Literal(Literal(literal_type=Some(LiteralType.String(value.to_string())), nullable=true, type_variation_reference=0)))) + + +def _rel_read(read: ReadRel) -> Rel: + return Rel(rel_type=Some(RelType.Read(Box.new(read)))) + + +def _rel_filter(filter: FilterRel) -> Rel: + return Rel(rel_type=Some(RelType.Filter(Box.new(filter)))) + + +def _rel_project(project: ProjectRel) -> Rel: + return Rel(rel_type=Some(RelType.Project(Box.new(project)))) + + +def _rel_join(join: JoinRel) -> Rel: + return Rel(rel_type=Some(RelType.Join(Box.new(join)))) + + +def _rel_cross(cross: CrossRel) -> Rel: + return Rel(rel_type=Some(RelType.Cross(Box.new(cross)))) + + +def _rel_aggregate(aggregate: AggregateRel) -> Rel: + return Rel(rel_type=Some(RelType.Aggregate(Box.new(aggregate)))) + + +def _rel_sort(sort: SortRel) -> Rel: + return Rel(rel_type=Some(RelType.Sort(Box.new(sort)))) + + +def _rel_fetch(fetch: FetchRel) -> Rel: + return Rel(rel_type=Some(RelType.Fetch(Box.new(fetch)))) + + +def _rel_set(set_rel: SetRel) -> Rel: + return Rel(rel_type=Some(RelType.Set(set_rel))) + + +def _rel_reference(reference: ReferenceRel) -> Rel: + return Rel(rel_type=Some(RelType.Reference(reference))) + + +def _rel_extension_single(input: Rel, extension_uri: str) -> Rel: + detail = Any(type_url=extension_uri, value=[]) + rel = ExtensionSingleRel(common=Some(_direct_common()), input=Some(Box.new(input)), detail=Some(detail)) + return Rel(rel_type=Some(RelType.ExtensionSingle(Box.new(rel)))) + + +def _set_operation_from_legacy_name(operation: str) -> SubstraitSetOperation: + if operation == str("Union"): + return SubstraitSetOperation.UnionDistinct + elif operation == str("UnionAll"): + return SubstraitSetOperation.UnionAll + elif operation == str("Intersect"): + return SubstraitSetOperation.IntersectPrimary + return SubstraitSetOperation.MinusPrimary + + +def _join_type_from_kind(kind: SubstraitJoinKind) -> RustI32: + match kind: + SubstraitJoinKind.Inner => return 
JoinType.Inner.into() + SubstraitJoinKind.Left => return JoinType.Left.into() + SubstraitJoinKind.Right => return JoinType.Right.into() + SubstraitJoinKind.Outer => return JoinType.Outer.into() + SubstraitJoinKind.Semi => return JoinType.LeftSemi.into() + SubstraitJoinKind.Anti => return JoinType.LeftAnti.into() + SubstraitJoinKind.Single => return JoinType.LeftSingle.into() + SubstraitJoinKind.Mark => return JoinType.LeftMark.into() + + +def _set_op_from_kind(operation: SubstraitSetOperation) -> RustI32: + match operation: + SubstraitSetOperation.MinusPrimary => return SetOp.MinusPrimary.into() + SubstraitSetOperation.IntersectPrimary => return SetOp.IntersectionPrimary.into() + SubstraitSetOperation.IntersectMultiset => return SetOp.IntersectionMultiset.into() + SubstraitSetOperation.MinusMultiset => return SetOp.MinusMultiset.into() + SubstraitSetOperation.UnionDistinct => return SetOp.UnionDistinct.into() + SubstraitSetOperation.UnionAll => return SetOp.UnionAll.into() + + +def _plan_root(input: Rel, names: list[str]) -> PlanRel: + return PlanRel(rel_type=Some(PlanRelType.Root(RelRoot(input=Some(input), names=names)))) + + +def _plan_rel(input: Rel) -> PlanRel: + return PlanRel(rel_type=Some(PlanRelType.Rel(input))) + + +pub def empty_plan() -> Plan: + """Return an empty proto-backed Substrait plan with the RFC 002 default version.""" + return Plan(version=Some(_default_version()), extension_urns=[], extensions=[], relations=[], advanced_extensions=None, expected_type_urls=[], parameter_bindings=[], type_aliases=[]) + + +pub def substrait_release_tag() -> str: + """Return the current RFC 002 Substrait release tag exported by the package boundary.""" + return _default_release_tag() + + +pub def substrait_producer_name() -> str: + """Return the current RFC 002 producer label exported by the package boundary.""" + return _default_producer() + + +pub def registered_substrait_extension_uris() -> list[str]: + """Return the registered extension URIs used by current 
package-level Substrait lowering.""" + return _default_extension_uris() + + +pub def explode_extension_uri() -> str: + """Return the registered extension URI used for EXPLODE-style gap encoding.""" + return _explode_extension_uri_value() + + +pub def plan_from_root_relation(input: Rel, names: list[str]) -> Plan: + """Wrap one root relation in a complete Substrait `Plan` envelope.""" + return Plan(version=Some(_default_version()), extension_urns=_extension_urns_for_rel(input), extensions=[], relations=[_plan_root(input, names)], advanced_extensions=None, expected_type_urls=[], parameter_bindings=[], type_aliases=[]) + + +pub def read_named_table_rel(table_name: str) -> Rel: + """Construct a logical `ReadRel(NamedTable)` root for a registered table name.""" + read = ReadRel(common=Some(_direct_common()), base_schema=Some(_default_named_struct(table_name)), filter=None, best_effort_filter=None, projection=None, advanced_extension=None, read_type=Some(ReadType.NamedTable(ReadNamedTable(names=[table_name], advanced_extension=None)))) + return _rel_read(read) + + +pub def read_local_files_rel(uri: str) -> Rel: + """Construct a logical `ReadRel(LocalFiles)` for a file-backed scan.""" + file_item = FileOrFiles(partition_index=0, start=0, length=0, path_type=Some(PathType.UriFile(uri.to_string())), file_format=Some(FileFormat.Parquet(ParquetReadOptions()))) + read = ReadRel(common=Some(_direct_common()), base_schema=Some(_default_named_struct(str("LocalFiles"))), filter=None, best_effort_filter=None, projection=None, advanced_extension=None, read_type=Some(ReadType.LocalFiles(LocalFiles(items=[file_item], advanced_extension=None)))) + return _rel_read(read) + + +pub def read_virtual_table_rel(table_name: str) -> Rel: + """Construct a logical `ReadRel(VirtualTable)` with one placeholder inline row.""" + # The default base schema is a single non-nullable i64 `id` column, so the placeholder row carries an i64 literal to stay schema-consistent; `table_name` is still used as the schema's model name. + row = NestedStruct(fields=[_i64_expr(0)]) + read = ReadRel(common=Some(_direct_common()), base_schema=Some(_default_named_struct(table_name)), filter=None, 
best_effort_filter=None, projection=None, advanced_extension=None, read_type=Some(ReadType.VirtualTable(VirtualTable(values=[], expressions=[row])))) + return _rel_read(read) + + +pub def plan_from_named_table(table_name: str) -> Plan: + """Build a minimal plan rooted at a named-table logical read.""" + return plan_from_root_relation(read_named_table_rel(table_name), [str("id")]) + + +pub def plan_from_local_files(uri: str) -> Plan: + """Build a minimal plan rooted at a local-files logical read.""" + return plan_from_root_relation(read_local_files_rel(uri), [str("id")]) + + +pub def plan_from_virtual_table(table_name: str) -> Plan: + """Build a minimal plan rooted at a virtual-table logical read.""" + return plan_from_root_relation(read_virtual_table_rel(table_name), [str("id")]) + + +pub def filter_rel(input: Rel, predicate: bool) -> Rel: + """Wrap a child relation in `FilterRel` with a placeholder boolean predicate.""" + return _rel_filter(FilterRel(common=Some(_direct_common()), input=Some(Box.new(input)), condition=Some(Box.new(_bool_expr(predicate))), advanced_extension=None)) + + +pub def project_rel(input: Rel) -> Rel: + """Wrap a child relation in a shape-only `ProjectRel` scaffold with one placeholder expression.""" + return _rel_project(ProjectRel(common=Some(_direct_common()), input=Some(Box.new(input)), expressions=[_string_expr(str("__project__"))], advanced_extension=None)) + + +pub def join_rel(left: Rel, right: Rel, on_predicate: bool) -> Rel: + """Wrap two child relations in an inner `JoinRel`.""" + return join_rel_of_kind(left, right, on_predicate, SubstraitJoinKind.Inner) + + +pub def join_rel_of_kind(left: Rel, right: Rel, on_predicate: bool, kind: SubstraitJoinKind) -> Rel: + """Wrap two child relations in `JoinRel` using one explicit Substrait join variant.""" + return _rel_join(JoinRel(common=Some(_direct_common()), left=Some(Box.new(left)), right=Some(Box.new(right)), expression=Some(Box.new(_bool_expr(on_predicate))), post_join_filter=None, 
type=_join_type_from_kind(kind), advanced_extension=None)) + + +pub def cross_rel(left: Rel, right: Rel) -> Rel: + """Wrap two child relations in `CrossRel`.""" + return _rel_cross(CrossRel(common=Some(_direct_common()), left=Some(Box.new(left)), right=Some(Box.new(right)), advanced_extension=None)) + + +pub def aggregate_rel(input: Rel, include_measure: bool) -> Rel: + """Wrap a child relation in a shape-only `AggregateRel` scaffold using placeholder grouping metadata.""" + mut groupings: list[Grouping] = [Grouping(grouping_expressions=[], expression_references=[0])] + mut grouping_exprs: list[Expression] = [_string_expr(str("__group_key__"))] + mut measures: list[Measure] = [] + # NOTE(review): `include_measure` currently has no effect; this branch reassigns the same empty list, so the emitted AggregateRel never carries a Measure. + if include_measure: + measures = [] + return _rel_aggregate(AggregateRel(common=Some(_direct_common()), input=Some(Box.new(input)), groupings=groupings, measures=measures, grouping_expressions=grouping_exprs, advanced_extension=None)) + + +pub def sort_rel(input: Rel) -> Rel: + """Wrap a child relation in `SortRel` with one placeholder ascending sort field.""" + sort_field = SortField(expr=Some(_string_expr(str("__sort_key__"))), sort_kind=Some(SortKind.Direction(SortDirection.AscNullsFirst.into()))) + return _rel_sort(SortRel(common=Some(_direct_common()), input=Some(Box.new(input)), sorts=[sort_field], advanced_extension=None)) + + +pub def fetch_rel(input: Rel, offset: int, count: int) -> Rel: + """Wrap a child relation in `FetchRel` for offset/count windowing.""" + return _rel_fetch(FetchRel(common=Some(_direct_common()), input=Some(Box.new(input)), advanced_extension=None, offset_mode=Some(OffsetMode.Offset(offset)), count_mode=Some(CountMode.Count(count)))) + + +pub def set_rel(left: Rel, right: Rel, operation: str) -> Rel: + """Compatibility wrapper for `SetRel` using the historic string-based operation name.""" + return set_rel_of_kind(left, right, _set_operation_from_legacy_name(operation)) + + +pub def set_rel_of_kind(left: Rel, right: Rel, operation: SubstraitSetOperation) -> 
Rel: + """Wrap two child relations in `SetRel` using one explicit Substrait set-operation enum.""" + return _rel_set(SetRel(common=Some(_direct_common()), inputs=[left, right], op=_set_op_from_kind(operation), advanced_extension=None)) + + +pub def reference_rel(subtree_ordinal: RustI32) -> Rel: + """Construct a `ReferenceRel` using the provided subtree ordinal.""" + return _rel_reference(ReferenceRel(subtree_ordinal=subtree_ordinal)) + + +pub def extension_single_rel(input: Rel, extension_uri: str) -> Rel: + """Wrap a child relation in `ExtensionSingleRel` with the provided extension URI.""" + return _rel_extension_single(input, extension_uri) + + +pub def root_rel(plan: Plan) -> Rel: + """Return the logical root relation from a plan.""" + if len(plan.relations) == 0: + return read_named_table_rel(str("__malformed_plan__")) + relation = plan.relations[0] + match relation.rel_type: + Some(PlanRelType.Root(root)) => + match root.input: + Some(rel) => return rel + None => return read_named_table_rel(str("__malformed_plan__")) + Some(PlanRelType.Rel(rel)) => return rel + _ => return read_named_table_rel(str("__malformed_plan__")) + + +pub def relation_kind_name(rel: Rel) -> str: + """Return the public Substrait relation-kind label for one `Rel` node.""" + match rel.rel_type: + Some(RelType.Read(_)) => return str("ReadRel") + Some(RelType.Filter(_)) => return str("FilterRel") + Some(RelType.Project(_)) => return str("ProjectRel") + Some(RelType.Join(_)) => return str("JoinRel") + Some(RelType.Cross(_)) => return str("CrossRel") + Some(RelType.Aggregate(_)) => return str("AggregateRel") + Some(RelType.Sort(_)) => return str("SortRel") + Some(RelType.Fetch(_)) => return str("FetchRel") + Some(RelType.Set(_)) => return str("SetRel") + Some(RelType.Reference(_)) => return str("ReferenceRel") + Some(RelType.ExtensionSingle(_)) => return str("ExtensionSingleRel") + Some(RelType.ExtensionLeaf(_)) => return str("ExtensionLeafRel") + Some(RelType.ExtensionMulti(_)) => return 
str("ExtensionMultiRel") + _ => return str("UnknownRel") + + +pub def read_kind_name(rel: Rel) -> str: + """Return the read-root kind label for one relation: `NamedTable`, `LocalFiles`, or `VirtualTable`; `ReadRel` for any other read type and `Unknown` when the relation is not a read.""" + match rel.rel_type: + Some(RelType.Read(read)) => + match read.read_type: + Some(ReadType.NamedTable(_)) => return str("NamedTable") + Some(ReadType.LocalFiles(_)) => return str("LocalFiles") + Some(ReadType.VirtualTable(_)) => return str("VirtualTable") + _ => return str("ReadRel") + _ => return str("Unknown") + + +pub def set_operation_name(rel: Rel) -> str: + """Return the public Substrait set-operation label for one `SetRel` node: one of the six names matched below, `UnknownSetOp` for any other `op` value, or `NotSet` when the relation is not a `SetRel`.""" + match rel.rel_type: + Some(RelType.Set(set_rel)) => + if set_rel.op == _set_op_from_kind(SubstraitSetOperation.MinusPrimary): + return str("MinusPrimary") + elif set_rel.op == _set_op_from_kind(SubstraitSetOperation.IntersectPrimary): + return str("IntersectPrimary") + elif set_rel.op == _set_op_from_kind(SubstraitSetOperation.IntersectMultiset): + return str("IntersectMultiset") + elif set_rel.op == _set_op_from_kind(SubstraitSetOperation.MinusMultiset): + return str("MinusMultiset") + elif set_rel.op == _set_op_from_kind(SubstraitSetOperation.UnionDistinct): + return str("UnionDistinct") + elif set_rel.op == _set_op_from_kind(SubstraitSetOperation.UnionAll): + return str("UnionAll") + return str("UnknownSetOp") + _ => return str("NotSet") + + +pub def reference_subtree_ordinal(rel: Rel) -> RustI32: + """Return the subtree ordinal for one `ReferenceRel`, or `-1` when the input is not a reference.""" + match rel.rel_type: + Some(RelType.Reference(reference)) => return reference.subtree_ordinal + _ => return -1 + + +pub def plan_encoded_len(plan: Plan) -> int: + """Return the encoded protobuf byte length of a plan.""" + return len(plan.encode_to_vec()) + + +def _relation_children(rel: Rel) -> list[Rel]: + match rel.rel_type: + Some(RelType.Filter(filter)) => + match filter.input: + Some(child) => return [child.as_ref().clone()] + None => return []
Some(RelType.Project(project)) => + match project.input: + Some(child) => return [child.as_ref().clone()] + None => return [] + Some(RelType.Join(join)) => + mut children: list[Rel] = [] + match join.left: + Some(left) => children.append(left.as_ref().clone()) + None => pass + match join.right: + Some(right) => children.append(right.as_ref().clone()) + None => pass + return children + Some(RelType.Cross(cross)) => + mut children: list[Rel] = [] + match cross.left: + Some(left) => children.append(left.as_ref().clone()) + None => pass + match cross.right: + Some(right) => children.append(right.as_ref().clone()) + None => pass + return children + Some(RelType.Aggregate(aggregate)) => + match aggregate.input: + Some(child) => return [child.as_ref().clone()] + None => return [] + Some(RelType.Sort(sort)) => + match sort.input: + Some(child) => return [child.as_ref().clone()] + None => return [] + Some(RelType.Fetch(fetch)) => + match fetch.input: + Some(child) => return [child.as_ref().clone()] + None => return [] + Some(RelType.Set(set_rel)) => return set_rel.inputs + Some(RelType.ExtensionSingle(extension)) => + match extension.input: + Some(child) => return [child.as_ref().clone()] + None => return [] + _ => return [] + + +pub def plan_contains_relation_kind(plan: Plan, expected_kind: str) -> bool: + """Return whether the tree under `root_rel(plan)` contains a relation with the requested public kind label; only the plan's first relation entry is searched.""" + return rel_contains_kind(root_rel(plan), expected_kind) + + +pub def rel_contains_kind(rel: Rel, expected_kind: str) -> bool: + """Return whether `rel` itself or any descendant reached depth-first through `_relation_children` carries the requested public kind label; kinds for which `_relation_children` returns no children (e.g. `ExtensionMultiRel`) are not descended into.""" + if relation_kind_name(rel) == expected_kind: + return true + for child in _relation_children(rel): + if rel_contains_kind(child, expected_kind): + return true + return false + + +def _extension_urns_for_rel(rel: Rel) -> list[SimpleExtensionUrn]: + return _collect_extension_urns(rel) + + +def _collect_extension_urns(rel: Rel) -> list[SimpleExtensionUrn]: + mut urns:
list[SimpleExtensionUrn] = [] + match rel.rel_type.clone(): + Some(RelType.ExtensionSingle(extension)) => + match extension.detail: + Some(detail) => + if detail.type_url != str(""): + urns.append(SimpleExtensionUrn(extension_urn_anchor=1, urn=detail.type_url)) + None => pass + _ => pass + for child in _relation_children(rel): + for urn in _collect_extension_urns(child): + urns.append(SimpleExtensionUrn(extension_urn_anchor=1, urn=urn.urn)) + return urns + + +pub def plan_has_extension_urn(plan: Plan, extension_uri: str) -> bool: + """Return whether `plan.extension_urns` already holds an entry whose `urn` equals the requested extension URI exactly.""" + for urn in plan.extension_urns: + if urn.urn == extension_uri: + return true + return false diff --git a/src/substrait/schema.incn b/src/substrait/schema.incn new file mode 100644 index 0000000..5f30fa8 --- /dev/null +++ b/src/substrait/schema.incn @@ -0,0 +1,88 @@ +""" +Describe an Incan row shape and lower it to Substrait schema structures. + +The demo model `DemoCustomer` is hand-authored; `demo_customer_row_shape()` lists columns in the same declaration order. +A future compiler pass would derive `RowShapeSpec` from the `model` AST. + +`NamedStruct` (names + struct) is the full read-root shape; this module emits the inner column +types and exposes names separately until Incan can set prost's `r#struct` field ergonomically. + +See RFC 002: field order must match `NamedStruct` / field-reference indices.
+""" + +from rust::prost import Message +from rust::substrait::proto import NamedStruct, Type +from rust::substrait::proto::type import I64, Kind, Nullability, String as SubstraitString, Struct + +@derive(Clone) +pub model DemoCustomer: + pub id: int + pub name: str + + +@derive(Clone) +pub enum SubstraitPrimitiveKind: + I64 + String + + +@derive(Clone) +pub model RowColumnSpec: + pub name: str + pub kind: SubstraitPrimitiveKind + pub nullable: bool + + +@derive(Clone) +pub model RowShapeSpec: + pub model_name: str + pub columns: list[RowColumnSpec] + + +pub def demo_customer_row_shape() -> RowShapeSpec: + """Return the row shape for `DemoCustomer` (manual until compiler-derived): required I64 `id`, then nullable String `name`.""" + return RowShapeSpec(model_name=str("DemoCustomer"), columns=[RowColumnSpec(name=str("id"), kind=SubstraitPrimitiveKind.I64, nullable=false), RowColumnSpec(name=str("name"), kind=SubstraitPrimitiveKind.String, nullable=true)]) + + +pub def row_shape_field_names(shape: RowShapeSpec) -> list[str]: + """Field names in index order (flat row; no nested structs in this prototype).""" + return [col.name for col in shape.columns] + + +def _type_from_primitive(kind: SubstraitPrimitiveKind, nullable: bool) -> Type: + mut n = Nullability.Required + if nullable: + n = Nullability.Nullable + # `incan fmt` currently rewrites `match`/`case` into invalid Rust-style patterns; use branches (any non-I64 kind falls through to String). + if kind == SubstraitPrimitiveKind.I64: + return Type(kind=Some(Kind.I64(I64(type_variation_reference=0, nullability=n.into())))) + return Type(kind=Some(Kind.String(SubstraitString(type_variation_reference=0, nullability=n.into())))) + + +pub def row_shape_to_substrait_struct_type(shape: RowShapeSpec) -> Type: + """ + Build a Substrait row type as `Type.kind = Struct` (column types in `RowShapeSpec` order).
+ + Field **names** are not embedded here; pair with `row_shape_field_names(shape)` at the + `NamedStruct` boundary when wiring read roots (prost field `struct` is a Rust keyword and + is awkward to populate from Incan literals today). + """ + mut types_collector: list[Type] = [] + for col in shape.columns: + types_collector.append(_type_from_primitive(col.kind, col.nullable).clone()) + struct_inner = Struct(types=types_collector, type_variation_reference=0, nullability=Nullability.Required.into()) + return Type(kind=Some(Kind.Struct(struct_inner))) + + +pub def row_shape_to_named_struct(shape: RowShapeSpec) -> NamedStruct: + """Build a `NamedStruct` pairing `row_shape_field_names(shape)` with a required struct of the column types, both in `shape.columns` order (rows are flat, so this equals DFS field order). NOTE(review): sets prost's `struct` field directly, which the docstring above calls awkward from Incan; confirm this compiles.""" + mut types_collector: list[Type] = [] + for col in shape.columns: + types_collector.append(_type_from_primitive(col.kind, col.nullable).clone()) + struct_inner = Struct(types=types_collector, type_variation_reference=0, nullability=Nullability.Required.into()) + return NamedStruct(names=row_shape_field_names(shape), struct=Some(struct_inner)) + + +pub def substrait_row_type_encoded_len(row_ty: Type) -> int: + """Protobuf-encoded size of the row `Type` (for tests).""" + return len(row_ty.encode_to_vec()) diff --git a/tests/test_dataset.incn b/tests/test_dataset.incn index 1b697ff..06f71fe 100644 --- a/tests/test_dataset.incn +++ b/tests/test_dataset.incn @@ -3,58 +3,62 @@ from std.testing import assert_eq from metadata import inql_version from dataset import DataSet, BoundedDataSet, UnboundedDataSet, DataFrame, LazyFrame, DataStream +from dataset.ops import filter_ds, join_ds from functions import count_rows, total +from substrait.plan import explode_extension_uri, plan_encoded_len, plan_from_named_table, plan_from_root_relation, plan_has_extension_urn, read_named_table_rel, relation_kind_name, root_rel # ---- Helper functions and tooling ---- +@derive(Clone) model Order: id: int +@derive(Clone) model Event: id: int -def _accept_data_set_generic[T](data: DataSet[T]) ->
DataSet[T]: +def _accept_data_set_generic[T with Clone](data: DataSet[T]) -> DataSet[T]: return data -def _accept_bounded_generic[T](data: BoundedDataSet[T]) -> BoundedDataSet[T]: +def _accept_bounded_generic[T with Clone](data: BoundedDataSet[T]) -> BoundedDataSet[T]: return data -def _accept_unbounded_generic[T](data: UnboundedDataSet[T]) -> UnboundedDataSet[T]: +def _accept_unbounded_generic[T with Clone](data: UnboundedDataSet[T]) -> UnboundedDataSet[T]: return data -def _accept_data_frame_concrete[T](data: DataFrame[T]) -> DataFrame[T]: +def _accept_data_frame_concrete[T with Clone](data: DataFrame[T]) -> DataFrame[T]: return data -def _accept_lazy_frame_concrete[T](data: LazyFrame[T]) -> LazyFrame[T]: +def _accept_lazy_frame_concrete[T with Clone](data: LazyFrame[T]) -> LazyFrame[T]: return data -def _accept_data_stream_concrete[T](data: DataStream[T]) -> DataStream[T]: +def _accept_data_stream_concrete[T with Clone](data: DataStream[T]) -> DataStream[T]: return data -def _upcast_data_frame_to_bounded[T](data: DataFrame[T]) -> BoundedDataSet[T]: +def _upcast_data_frame_to_bounded[T with Clone](data: DataFrame[T]) -> BoundedDataSet[T]: return data -def _upcast_lazy_frame_to_bounded[T](data: LazyFrame[T]) -> BoundedDataSet[T]: +def _upcast_lazy_frame_to_bounded[T with Clone](data: LazyFrame[T]) -> BoundedDataSet[T]: return data -def _upcast_data_stream_to_unbounded[T](data: DataStream[T]) -> UnboundedDataSet[T]: +def _upcast_data_stream_to_unbounded[T with Clone](data: DataStream[T]) -> UnboundedDataSet[T]: return data -def _upcast_bounded_to_data_set[T](data: BoundedDataSet[T]) -> DataSet[T]: +def _upcast_bounded_to_data_set[T with Clone](data: BoundedDataSet[T]) -> DataSet[T]: return data -def _upcast_unbounded_to_data_set[T](data: UnboundedDataSet[T]) -> DataSet[T]: +def _upcast_unbounded_to_data_set[T with Clone](data: UnboundedDataSet[T]) -> DataSet[T]: return data @@ -62,7 +66,6 @@ def _touch[T](x: T) -> None: """Consume a value so assignment chains are 
not unused-variable warnings.""" pass - def _compile_hierarchy_assignability(order_frame: DataFrame[Order], order_lazy: LazyFrame[Order], event_stream: DataStream[Event]) -> None: """Compile-time shape checks for concrete -> trait -> supertrait assignments.""" sink0 = _upcast_bounded_to_data_set(_upcast_data_frame_to_bounded(order_frame)) @@ -83,9 +86,9 @@ def test_smoke__dataset_types_are_published() -> None: def test_type_contracts__signature_tiers_compile() -> None: """RFC 001: DataSet / BoundedDataSet / UnboundedDataSet accept concrete carriers.""" row = Order(id=1) - df: DataFrame[Order] = DataFrame(_row_schema_marker=row) + df: DataFrame[Order] = DataFrame(_row_schema_marker=row, _substrait_rel=read_named_table_rel(str("orders"))) ev = Event(id=2) - st: DataStream[Event] = DataStream(_row_schema_marker=ev) + st: DataStream[Event] = DataStream(_row_schema_marker=ev, _substrait_rel=read_named_table_rel(str("events"))) _touch(_accept_data_set_generic(df)) _touch(_accept_bounded_generic(df)) _touch(_accept_data_set_generic(st)) @@ -94,10 +97,10 @@ def test_type_contracts__signature_tiers_compile() -> None: def test_type_contracts__concrete_carriers_compile() -> None: """RFC 001: concrete carriers flow through generic helper signatures.""" - df: DataFrame[Order] = DataFrame(_row_schema_marker=Order(id=3)) - lf: LazyFrame[Order] = LazyFrame(_row_schema_marker=Order(id=3003)) + df: DataFrame[Order] = DataFrame(_row_schema_marker=Order(id=3), _substrait_rel=read_named_table_rel(str("orders"))) + lf: LazyFrame[Order] = LazyFrame(_row_schema_marker=Order(id=3003), _substrait_rel=read_named_table_rel(str("orders"))) ev = Event(id=4) - st: DataStream[Event] = DataStream(_row_schema_marker=ev) + st: DataStream[Event] = DataStream(_row_schema_marker=ev, _substrait_rel=read_named_table_rel(str("events"))) _touch(_accept_data_frame_concrete(df)) _touch(_accept_lazy_frame_concrete(lf)) _touch(_accept_data_stream_concrete(st)) @@ -105,23 +108,23 @@ def 
test_type_contracts__concrete_carriers_compile() -> None: def test_hierarchy__concrete_and_supertrait_assignability() -> None: """RFC 001: concrete -> bounded/unbounded trait -> DataSet chains compile.""" - df: DataFrame[Order] = DataFrame(_row_schema_marker=Order(id=5)) - lf: LazyFrame[Order] = LazyFrame(_row_schema_marker=Order(id=5005)) + df: DataFrame[Order] = DataFrame(_row_schema_marker=Order(id=5), _substrait_rel=read_named_table_rel(str("orders"))) + lf: LazyFrame[Order] = LazyFrame(_row_schema_marker=Order(id=5005), _substrait_rel=read_named_table_rel(str("orders"))) ev = Event(id=6) - st: DataStream[Event] = DataStream(_row_schema_marker=ev) + st: DataStream[Event] = DataStream(_row_schema_marker=ev, _substrait_rel=read_named_table_rel(str("events"))) _compile_hierarchy_assignability(df, lf, st) def test_type_contracts__concrete_and_trait_types_match_generic_arguments() -> None: """Generic parameter T matches for DataFrame/LazyFrame/DataStream carriers.""" - df: DataFrame[Order] = DataFrame(_row_schema_marker=Order(id=7)) - lf: LazyFrame[Order] = LazyFrame(_row_schema_marker=Order(id=7007)) + df: DataFrame[Order] = DataFrame(_row_schema_marker=Order(id=7), _substrait_rel=read_named_table_rel(str("orders"))) + lf: LazyFrame[Order] = LazyFrame(_row_schema_marker=Order(id=7007), _substrait_rel=read_named_table_rel(str("orders"))) bounded_df: BoundedDataSet[Order] = df bounded_lf: BoundedDataSet[Order] = lf _touch(bounded_df) _touch(bounded_lf) ev = Event(id=8) - st: DataStream[Event] = DataStream(_row_schema_marker=ev) + st: DataStream[Event] = DataStream(_row_schema_marker=ev, _substrait_rel=read_named_table_rel(str("events"))) ub: UnboundedDataSet[Event] = st _touch(ub) @@ -130,3 +133,30 @@ def test_version__inql_version_is_published() -> None: """InQL version should be available from metadata.""" version = inql_version() assert_eq(version, "0.1.0", "InQL version should be 0.1.0") + + +def test_dataset_ops__method_wrapper_matches_canonical_function() -> 
None: + """Method wrappers should preserve canonical dataset_ops semantics.""" + base_row = Order(id=9) + via_function = filter_ds(read_named_table_rel(str("orders")), true) + via_method = DataFrame(_row_schema_marker=base_row, _substrait_rel=read_named_table_rel(str("orders"))).filter(true) + assert_eq(relation_kind_name(via_function), str("FilterRel"), "canonical function should produce FilterRel root") + assert_eq(plan_encoded_len(via_method.to_substrait_plan()) > 0, true, "method wrapper should still emit a real proto plan") + assert_eq(relation_kind_name(root_rel(via_method.to_substrait_plan())), str("FilterRel"), "method wrapper plan should preserve filter root") + + +def test_dataset_ops__all_carriers_emit_real_plans() -> None: + df = DataFrame(_row_schema_marker=Order(id=10), _substrait_rel=read_named_table_rel(str("orders"))) + lf = LazyFrame(_row_schema_marker=Order(id=11), _substrait_rel=read_named_table_rel(str("orders"))) + st = DataStream(_row_schema_marker=Event(id=12), _substrait_rel=read_named_table_rel(str("events"))) + assert_eq(plan_encoded_len(df.to_substrait_plan()) > 0, true, "DataFrame should emit a real plan") + assert_eq(plan_encoded_len(lf.to_substrait_plan()) > 0, true, "LazyFrame should emit a real plan") + assert_eq(plan_encoded_len(st.to_substrait_plan()) > 0, true, "DataStream should emit a real plan") + + +def test_dataset_ops__api_lowered_boundary_facts_stay_stable() -> None: + left = DataFrame(_row_schema_marker=Order(id=13), _substrait_rel=read_named_table_rel(str("orders"))) + joined_plan = plan_from_root_relation(join_ds(read_named_table_rel(str("orders")), read_named_table_rel(str("orders_archive")), true), [str("id")]) + exploded_plan = left.explode().to_substrait_plan() + assert_eq(relation_kind_name(root_rel(joined_plan)), str("JoinRel"), "canonical join function should still lower to a JoinRel root") + assert_eq(plan_has_extension_urn(exploded_plan, explode_extension_uri()), true, "explode method should emit the registered 
extension URI") diff --git a/tests/test_model_substrait_schema.incn b/tests/test_model_substrait_schema.incn new file mode 100644 index 0000000..00668a1 --- /dev/null +++ b/tests/test_model_substrait_schema.incn @@ -0,0 +1,23 @@ +"""Tests for model → Substrait NamedStruct prototype.""" + +from std.testing import assert_eq +from substrait.schema import ( + demo_customer_row_shape, + row_shape_field_names, + row_shape_to_substrait_struct_type, + substrait_row_type_encoded_len, +) + +def test_demo_customer_row_shape__field_order() -> None: + shape = demo_customer_row_shape() + assert_eq(shape.model_name, str("DemoCustomer"), "model name") + names = row_shape_field_names(shape) + assert_eq(len(names), 2, "column count") + assert_eq(names[0], str("id"), "first field is declaration order") + assert_eq(names[1], str("name"), "second field") + + +def test_row_shape_to_substrait_struct_type__encodes() -> None: + row_ty = row_shape_to_substrait_struct_type(demo_customer_row_shape()) + encoded = substrait_row_type_encoded_len(row_ty) + assert_eq(encoded > 0, true, "protobuf encoding should be non-empty") diff --git a/tests/test_substrait_plan.incn b/tests/test_substrait_plan.incn new file mode 100644 index 0000000..3375e8c --- /dev/null +++ b/tests/test_substrait_plan.incn @@ -0,0 +1,116 @@ +"""Tests for RFC 002 proto-backed Substrait emission and conformance alignment.""" + +from std.testing import assert_eq +from substrait.plan import ( + SubstraitJoinKind, + SubstraitSetOperation, + explode_extension_uri, + extension_single_rel, + fetch_rel, + filter_rel, + join_rel_of_kind, + plan_has_extension_urn, + plan_contains_relation_kind, + plan_encoded_len, + plan_from_local_files, + plan_from_named_table, + plan_from_root_relation, + plan_from_virtual_table, + read_kind_name, + read_local_files_rel, + read_named_table_rel, + read_virtual_table_rel, + reference_rel, + reference_subtree_ordinal, + relation_kind_name, + registered_substrait_extension_uris, + root_rel, + set_rel, + 
set_operation_name, + set_rel_of_kind, + sort_rel, + substrait_producer_name, + substrait_release_tag, +) +from substrait.conformance import CoreScenarioKey, core_scenario_emission_matches, core_scenarios + +def _core_keys() -> list[CoreScenarioKey]: + return [CoreScenarioKey.ReadNamedTable, CoreScenarioKey.ReadLocalFiles, CoreScenarioKey.ReadVirtualTable, CoreScenarioKey.FilterRows, CoreScenarioKey.ProjectComputedColumns, CoreScenarioKey.JoinRelVariants, CoreScenarioKey.CrossRelCartesian, CoreScenarioKey.AggregateGroupingSets, CoreScenarioKey.SortRelOrdering, CoreScenarioKey.FetchRelLimitOffset, CoreScenarioKey.SetRelOperations, CoreScenarioKey.ReferenceRelSharedSubplan] + + +def test_plan__named_table_root_kind() -> None: + plan = plan_from_named_table(str("orders")) + assert_eq(plan_encoded_len(plan) > 0, true, "proto-backed plan should encode") + assert_eq(relation_kind_name(root_rel(plan)), str("ReadRel"), "named table plan should expose ReadRel root") + assert_eq(read_kind_name(root_rel(plan)), str("NamedTable"), "named table plan should expose NamedTable root kind") + + +def test_plan__read_root_kinds_are_distinguished() -> None: + named = plan_from_named_table(str("orders")) + local = plan_from_local_files(str("file:///tmp/orders.parquet")) + virtual = plan_from_virtual_table(str("inline_orders")) + assert_eq(read_kind_name(root_rel(named)), str("NamedTable"), "named table root should remain inspectable") + assert_eq(read_kind_name(root_rel(local)), str("LocalFiles"), "local files root should remain inspectable") + assert_eq(read_kind_name(root_rel(virtual)), str("VirtualTable"), "virtual table root should remain inspectable") + + +def test_plan__combinators_compose_deterministically() -> None: + base = read_named_table_rel(str("orders")) + with_filter = filter_rel(base, true) + with_sort = sort_rel(with_filter) + with_fetch = fetch_rel(with_sort, 0, 10) + plan = plan_from_root_relation(with_fetch, [str("id")]) + assert_eq(relation_kind_name(with_fetch), 
str("FetchRel"), "last combinator should become the root rel") + assert_eq(plan_encoded_len(plan) > 0, true, "composed plan should still encode") + assert_eq(plan_contains_relation_kind(plan, str("FilterRel")), true, "recursive traversal should find nested filter") + assert_eq(plan_contains_relation_kind(plan, str("SortRel")), true, "recursive traversal should find nested sort") + assert_eq(plan_contains_relation_kind(plan, str("FetchRel")), true, "recursive traversal should find fetch root") + + +def test_plan__set_rel_uses_operation_enum() -> None: + left = read_named_table_rel(str("orders_current")) + right = read_named_table_rel(str("orders_archive")) + union_rel = set_rel(left, right, str("Union")) + union_plan = plan_from_root_relation(union_rel, [str("id")]) + assert_eq(relation_kind_name(union_rel), str("SetRel"), "set combinator must produce SetRel root") + assert_eq(plan_encoded_len(union_plan) > 0, true, "set plan should still encode") + assert_eq(plan_contains_relation_kind(union_plan, str("ReadRel")), true, "set traversal should still reach child reads") + + +def test_plan__enum_backed_join_and_set_builders_expose_boundary_facts() -> None: + left_join_rel = join_rel_of_kind(read_named_table_rel(str("orders")), read_named_table_rel(str("customers")), true, SubstraitJoinKind.Left) + union_distinct_rel = set_rel_of_kind(read_named_table_rel(str("orders_current")), read_named_table_rel(str("orders_archive")), SubstraitSetOperation.UnionDistinct) + assert_eq(relation_kind_name(left_join_rel), str("JoinRel"), "enum-backed join builder should still emit a JoinRel root") + assert_eq(set_operation_name(union_distinct_rel), str("UnionDistinct"), "enum-backed set builder should preserve the selected set operation") + + +def test_plan__reference_rel_preserves_subtree_ordinal() -> None: + rel = reference_rel(7) + assert_eq(relation_kind_name(rel), str("ReferenceRel"), "reference builder should still expose ReferenceRel root kind") + 
assert_eq(reference_subtree_ordinal(rel), 7, "reference builder should preserve the requested subtree ordinal") + + +def test_plan__extension_urns_are_surfaced() -> None: + extension_uri = explode_extension_uri() + rel = extension_single_rel(read_named_table_rel(str("orders")), extension_uri) + plan = plan_from_root_relation(rel, [str("id")]) + assert_eq(plan_has_extension_urn(plan, extension_uri), true, "extension relation should populate extension URNs") + assert_eq(plan_contains_relation_kind(plan, str("ExtensionSingleRel")), true, "extension root should remain inspectable") + + +def test_plan__revision_pin_and_extension_registry_are_exported() -> None: + registered = registered_substrait_extension_uris() + assert_eq(substrait_release_tag(), str("v0.63.0"), "revision helpers should expose the currently targeted Substrait release tag") + assert_eq(substrait_producer_name(), str("inql-rfc002"), "revision helpers should expose the package producer label") + assert_eq(len(registered), 1, "current package boundary should register one extension URI") + assert_eq(registered[0], explode_extension_uri(), "registry should include the emitted explode extension URI") + + +def test_conformance__core_scenarios_validate_emission_output() -> None: + scenarios = core_scenarios() + assert_eq(len(scenarios), 12, "core scenario count should stay stable") + for scenario in scenarios: + assert_eq(len(scenario.capability_tags.0) > 0, true, "capability tags must remain non-empty") + assert_eq(len(scenario.references.0) > 0, true, "references must remain non-empty") + for key in _core_keys(): + assert_eq(core_scenario_emission_matches(key), true, "scenario should validate the emitted plan shape")