From 91c4a0532e09c138b60be8e16aed97d23b136365 Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Fri, 27 Mar 2026 17:02:07 +0100 Subject: [PATCH 1/4] feat: DataSet trait hierarchy, examples, and RFC 001 spec alignment --- docs/language/explanation/dataset_types.md | 122 +++++++++++++++++++ docs/language/reference/dataset_types.md | 94 +++++++++++++++ docs/rfcs/001_inql_dataset.md | 70 +++++++---- docs/rfcs/README.md | 18 +-- examples/README.md | 50 ++++++++ examples/bounded_vs_unbounded.incn | 81 +++++++++++++ examples/dataset_api.incn | 76 ++++++++++++ examples/models.incn | 31 +++++ examples/trait_hierarchy.incn | 90 ++++++++++++++ incan.lock | 12 +- src/dataset.incn | 125 +++++++++++++++++++ src/functions.incn | 18 +++ src/lib.incn | 4 +- tests/test_dataset.incn | 132 +++++++++++++++++++++ tests/test_inql.incn | 7 -- 15 files changed, 882 insertions(+), 48 deletions(-) create mode 100644 docs/language/explanation/dataset_types.md create mode 100644 docs/language/reference/dataset_types.md create mode 100644 examples/README.md create mode 100644 examples/bounded_vs_unbounded.incn create mode 100644 examples/dataset_api.incn create mode 100644 examples/models.incn create mode 100644 examples/trait_hierarchy.incn create mode 100644 src/dataset.incn create mode 100644 src/functions.incn create mode 100644 tests/test_dataset.incn delete mode 100644 tests/test_inql.incn diff --git a/docs/language/explanation/dataset_types.md b/docs/language/explanation/dataset_types.md new file mode 100644 index 0000000..5391209 --- /dev/null +++ b/docs/language/explanation/dataset_types.md @@ -0,0 +1,122 @@ +# Dataset types (Explanation) + +This page explains how to think about and use InQL's dataset types. + +## Why dataset types? + +Typed pipelines need a first-class carrier for columnar data indexed by `T`. Without `DataSet[T]`, relational authoring surfaces would lack a stable primary relation and schema flow for `FROM`-style entry points. 
+ +The **bounded/unbounded** distinction — inspired by Spark Structured Streaming's principle that a stream is an unbounded table — must be expressed at the **type level** so the compiler can enforce streaming constraints statically rather than at runtime. + +## The core idea + +A `DataSet[T]` is a **schema-parameterized tabular carrier**: + +- `T` is an Incan `model` — the row schema +- The carrier holds tabular data with that schema +- Operations like `filter`, `join`, `select` transform the carrier + +## Bounded vs unbounded + +The key insight is that **a stream is an unbounded table**. Rather than defining separate operation APIs for batch and streaming, `DataSet[T]` provides one relational operation surface. The bounded/unbounded property is expressed through the type system: + +- **`BoundedDataSet[T]`** — finite extent, all operations allowed +- **`UnboundedDataSet[T]`** — streaming/unbounded, unbounded-state operations rejected at compile time + +This enables **static capability gating**: operations that require unbounded state are rejected at compile time when the target is unbounded, without requiring a separate streaming API. + +## When to use which type + +### `DataFrame[T]` — materialized/eager + +Use `DataFrame[T]` when you have data in hand and want to inspect or manipulate it directly: + +```incan +from pub::inql import DataFrame +from models import Order + +def inspect_orders(orders: DataFrame[Order]) -> None: + # Work with materialized data + pass +``` + +`DataFrame[T]` is always bounded — it's the product of collecting or executing a `LazyFrame`. 
+ +### `LazyFrame[T]` — deferred plan + +Use `LazyFrame[T]` when you want to compose operations before execution: + +```incan +from pub::inql import LazyFrame +from models import Order + +def high_value_orders(orders: LazyFrame[Order]) -> LazyFrame[Order]: + # Intended when query syntax is available: return orders.filter(.amount > 100) + return orders +``` + +### `DataStream[T]` — streaming + +Use `DataStream[T]` for streaming/unbounded data: + +```incan +from pub::inql import DataStream +from models import Event + +def important_events(events: DataStream[Event]) -> DataStream[Event]: + # Intended when query syntax is available: return events.filter(.severity == "critical") + return events +``` + +`DataStream[T]` shares the same operation API as batch carriers, but signals that its source is unbounded. Static streaming constraints are specified in RFC 001 and enforced as the compiler gains analysis for `UnboundedDataSet[T]`. + +## Type signatures + +The trait hierarchy gives you three levels of specificity: + +```incan +from pub::inql import DataSet, BoundedDataSet, UnboundedDataSet +from models import Order, Event + +# Accepts any carrier — generic utilities +def row_count[T](data: DataSet[T]) -> int: + ... + +# Batch only — Parquet writers, batch sinks +def write_parquet(data: BoundedDataSet[Order]) -> None: + ... + +# Streaming only — Kafka sinks, event processors +def write_to_kafka(events: UnboundedDataSet[Event]) -> None: + ... +``` + +And three concrete carrier types for when you need a specific one: + +```incan +from pub::inql import DataFrame, LazyFrame, DataStream +from models import Order, Event + +# Materialized data in hand +def inspect(data: DataFrame[Order]) -> None: + ... + +def build_pipeline(orders: LazyFrame[Order]) -> LazyFrame[Order]: + ... + +def process_stream(events: DataStream[Event]) -> DataStream[Event]: + ... +``` + +## Aggregate helpers + +`.agg(...)` uses **imported** symbols from `pub::inql.functions` (for example `total`, `count_rows`).
+ + +> Note: ambient `sum` / `count` builtins are not available yet because those names clash with Incan/stdlib in ordinary expression positions; they become possible once Incan's RFC 045 is implemented. + +## What's next? + +- **Execution context**: How `DataSet` operations actually run (RFC 004) +- **Query DSL**: `query {}` blocks that produce plans (RFC 003) +- **Substrait**: Portable logical plans (RFC 002) diff --git a/docs/language/reference/dataset_types.md b/docs/language/reference/dataset_types.md new file mode 100644 index 0000000..679531c --- /dev/null +++ b/docs/language/reference/dataset_types.md @@ -0,0 +1,94 @@ +# Dataset types (Reference) + +This page documents the InQL dataset type hierarchy: the traits and concrete types that carry schema-parameterized tabular data through relational pipelines. + +## Type hierarchy + +```text +DataSet[T] (root trait — any tabular data) +├── BoundedDataSet[T] (trait — finite extent) +│ ├── DataFrame[T] (concrete — materialized/eager) +│ └── LazyFrame[T] (concrete — deferred plan, bounded source) +└── UnboundedDataSet[T] (trait — streaming/unbounded) + └── DataStream[T] (concrete — streaming) +``` + +### `DataSet[T]` + +Root trait for any schema-parameterized tabular data whose row shape is an Incan `model` `T`. + +All relational operations are defined on `DataSet[T]`. The compiler applies the **most restrictive** constraint set when the concrete kind is unknown at a call site (because the argument might be unbounded). + +### `BoundedDataSet[T]` + +Extends `DataSet[T]` — data with a finite, known extent. All relational operations are allowed. + +### `UnboundedDataSet[T]` + +Extends `DataSet[T]` — data from a streaming or unbounded source. Operations requiring unbounded state **must** be rejected at compile time. + +### `DataFrame[T]` + +Implements `BoundedDataSet[T]`. Materialized/eager result; always bounded. Conceptually the product of collecting or executing a `LazyFrame`. + +### `LazyFrame[T]` + +Implements `BoundedDataSet[T]`.
Holds a logical plan (or equivalent) until an explicit execute, collect, or write boundary. Always bounded. + +### `DataStream[T]` + +Implements `UnboundedDataSet[T]`. Shares the `DataSet[T]` operation API but signals that its source is unbounded. Static streaming constraints are specified in RFC 001 and enforced as the compiler gains analysis for `UnboundedDataSet[T]`. + +## Operation API + +The following instance methods are defined on `DataSet[T]`: + +| Method | Signature | Description | +| ---------- | ----------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | +| `filter` | `def filter(self, predicate: bool) -> Self` | Restrict rows by a boolean relational expression | +| `join` | `def join(self, other: Self, on: bool) -> Self` | Combine with another relation on a join condition (`other: Self` at the trait level; see RFC 001 **Shipped trait signatures (`Self`)**) | +| `select` | `def select(self) -> Self` | Project columns and expressions; logical output schema is tracked when lowering/typing (RFC 003) | +| `group_by` | `def group_by(self) -> Self` | Define grouping keys for aggregation | +| `agg` | `def agg(self) -> Self` | Apply aggregate functions over groups; use imported helpers from `pub::inql.functions` (e.g.
`total`, `count_rows`) | +| `order_by` | `def order_by(self) -> Self` | Define sort keys and directions | +| `limit` | `def limit(self, n: int) -> Self` | Cap the number of rows (after sort when both apply) | +| `explode` | `def explode(self) -> Self` | Expand a nested list column into rows | + +## Static capability gating + +| Trait bound in signature | Allowed operations | Constraint level | +| ------------------------ | ------------------------------------------------------ | ---------------------------------------- | +| `DataSet[T]` | Intersection of bounded + unbounded capabilities | Most restrictive (concrete kind unknown) | +| `BoundedDataSet[T]` | All relational operations | Unrestricted | +| `UnboundedDataSet[T]` | Relational operations minus unbounded-state operations | Streaming constraints enforced | + +## Usage + +```incan +from pub::inql import DataSet, BoundedDataSet, UnboundedDataSet, DataFrame, LazyFrame, DataStream +from models import Order, Event + +# Accept any carrier — generic utilities +def row_count[T](data: DataSet[T]) -> int: + ... + +# Batch only — Parquet writers, batch sinks +def write_parquet(data: BoundedDataSet[Order]) -> None: + ... + +# Streaming only — Kafka sinks, event processors +def write_to_kafka(events: UnboundedDataSet[Event]) -> None: + ... + +# Materialized data in hand +def inspect(data: DataFrame[Order]) -> None: + ... + +# Deferred plan — compose before execution +def build_pipeline(orders: LazyFrame[Order]) -> LazyFrame[Order]: + ... + +# Streaming specifically +def process_stream(events: DataStream[Event]) -> DataStream[Event]: + ...
+``` diff --git a/docs/rfcs/001_inql_dataset.md b/docs/rfcs/001_inql_dataset.md index a478379..042c3c1 100644 --- a/docs/rfcs/001_inql_dataset.md +++ b/docs/rfcs/001_inql_dataset.md @@ -1,14 +1,16 @@ # InQL RFC 001: Dataset types and carriers (`DataSet[T]`) -- **Status:** Planned +- **Status:** Implemented - **Created:** 2026-03-22 - **Author(s):** Danny Meijer - **Related:** - InQL RFC 000 (language specification — naming, schema shapes, layer boundaries) + - Incan compiler — static capability gating enforcement: [incan#187](https://github.com/dannys-code-corner/incan/issues/187) + - InQL follow-up when enforcement lands: [InQL #10](https://github.com/dannys-code-corner/InQL/issues/10) - **Issue:** [InQL #2](https://github.com/dannys-code-corner/InQL/issues/2) - **RFC PR:** - - **Written against:** Incan v0.2 -- **Shipped in:** - +- **Shipped in:** 0.1.0 ## Summary @@ -44,7 +46,7 @@ Typed pipelines need a first-class carrier for columnar data indexed by `T`. Wit - Apache Substrait `Rel`-level mapping and extension policy — InQL RFC 002. - Clause-based relational grammar, aggregate rules, Substrait lowering from that surface — InQL RFC 003. - Execution context, session, DataFusion — InQL RFC 004. -- Pipe-forward (`|>`) grammar — InQL RFC 005 (not in v0.1 scope). +- Pipe-forward (`|>`) grammar — InQL RFC 005 (deferred; outside the RFC 000–004 milestone). - Cluster-scale scheduling, shuffle, distributed fault tolerance — orchestration layer. - Drop-in API compatibility with Apache Beam, Flink, or Spark SDKs. @@ -111,12 +113,12 @@ from models import Order, Summary, Event, Alert def inspect(data: DataFrame[Order]) -> None: ... -# Deferred plan — compose before execution -def build_pipeline(orders: LazyFrame[Order]) -> LazyFrame[Summary]: +# Deferred plan — compose before execution (signatures use Self; logical Summary row shape via RFC 003) +def build_pipeline(orders: LazyFrame[Order]) -> LazyFrame[Order]: ... 
-# Streaming specifically -def process_stream(events: DataStream[Event]) -> DataStream[Alert]: +# Streaming specifically (signatures use Self; logical Alert row shape via RFC 003) +def process_stream(events: DataStream[Event]) -> DataStream[Event]: ... ``` @@ -157,28 +159,34 @@ The three concrete types **must not** imply three unrelated relational languages When a function accepts `DataSet[T]` (the root trait), the compiler **must** enforce streaming constraints because the input **might** be unbounded. Authors who want the full operation set **must** accept `BoundedDataSet[T]` or a concrete bounded type. -For `UnboundedDataSet[T]`, the governing rule is semantic rather than ad hoc: operations that require end-of-input semantics or unbounded retained state are not valid unless a later RFC gives them bounded-state semantics. In v0.1, the obvious disallowed examples include global `order_by`, global `limit`, unwindowed `group_by` / `agg`, eager materialization to a finite `DataFrame[T]`, and finite file writes. +For `UnboundedDataSet[T]`, the governing rule is semantic rather than ad hoc: operations that require end-of-input semantics or unbounded retained state are not valid unless a later RFC gives them bounded-state semantics. Typical disallowed examples include global `order_by`, global `limit`, unwindowed `group_by` / `agg`, eager materialization to a finite `DataFrame[T]`, and finite file writes. ### Operation API (for lowering and direct use) -The InQL library **must** expose the following instance methods on `DataSet[T]` (exact signatures may live in companion library docs; semantics **must** match this table and stay consistent with any normative lowering rules for the same logical operators elsewhere in InQL). Method names are illustrative; implementations **may** use equivalent spellings if the compiler maps them consistently. +The InQL library **must** expose the following instance methods on `DataSet[T]`. 
Method names are illustrative; implementations **may** use equivalent spellings if the compiler maps them consistently. Semantics **must** match this table and stay consistent with any normative lowering rules for the same logical operators elsewhere in InQL. -| Method | Role | -| -------------- | ----------------------------------------------------------------------------------------------------------- | -| **`filter`** | Restrict rows by a boolean relational expression (relational argument positions per InQL RFC 000). | -| **`join`** | Combine with another `DataSet[U]` on a join condition; named relations for `relation.column` | -| **`select`** | Project columns and expressions; output row type becomes a new schema `U` the typechecker can track. | -| **`group_by`** | Define grouping keys for aggregation; keys are relational expressions. | -| **`agg`** | Apply aggregate functions over groups (often chained after `group_by`); produces grouped/aggregated schema. | -| **`order_by`** | Define sort keys and directions. | -| **`limit`** | Cap the number of rows (after sort when both apply). | -| **`explode`** | Expand a nested list column into rows (or equivalent). | +#### Shipped trait signatures (`Self`) + +Earlier drafts of this RFC described some methods with a second type parameter `U` (for example `join` with `other: DataSet[U]`, or `select` / `agg` returning `DataSet[U]`). The **InQL library package** (see **Shipped in** in the header) instead declares these methods using Incan’s **`Self`** on the `DataSet[T]` trait: the peer carrier in `join` is `other: Self`, and `select` / `agg` return `Self`. That is the **normative contract** for the library package until a follow-up specifies richer generic method typing in Incan. + +**Semantic intent is unchanged:** `join` still combines two relations (with `relation.column` naming per InQL RFC 000); `select` and `agg` still denote projection and aggregation that may change the logical row shape. 
Tracking output schema `U` at the **typechecker** level for those operations is expected to align with InQL RFC 003 (clause / method-chain lowering), not with extra type parameters on this **`Self`**-based trait surface. Authors should treat the table’s “Role” column as the logical behavior; the “Declared signature” column as what the Incan sources declare today. + +| Method | Declared signature (InQL library) | Role | +| -------------- | ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ | +| **`filter`** | `def filter(self, predicate: bool) -> Self` | Restrict rows by a boolean relational expression (relational argument positions per InQL RFC 000). | +| **`join`** | `def join(self, other: Self, on: bool) -> Self` | Combine with another relation on a join condition; named relations for `relation.column`. | +| **`select`** | `def select(self) -> Self` | Project columns and expressions; logical output row type may differ and is tracked when lowering/typing (RFC 003). | +| **`group_by`** | `def group_by(self) -> Self` | Define grouping keys for aggregation; keys are relational expressions. | +| **`agg`** | `def agg(self) -> Self` | Apply aggregate functions over groups (often chained after `group_by`); logical schema per lowering (RFC 003). | +| **`order_by`** | `def order_by(self) -> Self` | Define sort keys and directions. | +| **`limit`** | `def limit(self, n: int) -> Self` | Cap the number of rows (after sort when both apply). | +| **`explode`** | `def explode(self) -> Self` | Expand a nested list column into rows (or equivalent). | Additional requirements: -- Operations **must** preserve or update `T` (or output model `U`) in a way the typechecker can verify. -- Operations that are statically invalid on `UnboundedDataSet[T]` (e.g. unbounded-state operations) **must** produce compile-time errors, not runtime failures. 
-- Aggregate helpers used with `.agg(...)` are imported library symbols (for example from `pub::inql.functions`), not ambient builtins. +- Over time, operations **must** preserve or refine row schema information in a way the typechecker and lowering can verify; the shipped **`Self`**-based signatures intentionally do not encode every schema transition on the trait surface yet. +- Operations that are statically invalid on `UnboundedDataSet[T]` (e.g. unbounded-state operations) **must** produce compile-time errors in the **Incan** typechecker once that enforcement exists (see **Static capability gating**). The InQL library package does not implement that analysis; scheduling and tracking belong on the **Incan** compiler side, not in the **Contract-complete checklist** below. +- Aggregate helpers used with `.agg(...)` are imported library symbols from `pub::inql.functions` (for example `total` for summation; names avoid clashing with Incan/stdlib `sum` / `count`), not ambient builtins. - This RFC defines the minimum required aggregate-function import model for `.agg(...)`; it is not an exhaustive catalog of all present or future InQL functions. Additional functions **may** be added later through additive library evolution or follow-up RFCs, provided they do not change the semantics of the required set defined by the InQL RFC suite. ### Execution backend boundary @@ -209,7 +217,7 @@ The design draws on Spark Structured Streaming's core insight: a stream is an un `UnboundedDataSet[T]` currently has one concrete implementor (`DataStream[T]`). The intermediate trait is justified by: clean symmetry with `BoundedDataSet[T]` in type signatures, and future extensibility (e.g. a `ChangeStream[T]` for CDC, a `WindowedStream[T]`, or other streaming specializations). -Future RFCs **may** add methods on `BoundedDataSet[T]` or `UnboundedDataSet[T]`, but only where the semantics are inherently boundedness-specific and remain backend-neutral. 
v0.1 does not require any additional core relational methods beyond the shared `DataSet[T]` surface. +Future RFCs **may** add methods on `BoundedDataSet[T]` or `UnboundedDataSet[T]`, but only where the semantics are inherently boundedness-specific and remain backend-neutral. This RFC does not require any additional core relational methods on those intermediate traits beyond the shared `DataSet[T]` surface. ### Compatibility @@ -237,8 +245,20 @@ Future RFCs **may** add methods on `BoundedDataSet[T]` or `UnboundedDataSet[T]`, ### Resolved -- **`UnboundedDataSet[T]` restrictions:** Operations requiring end-of-input semantics or unbounded retained state are not valid unless a later RFC gives them bounded-state semantics. In v0.1, disallowed examples include global `order_by`, global `limit`, unwindowed `group_by` / `agg`, eager materialization to a finite `DataFrame[T]`, and finite file writes. +- **`UnboundedDataSet[T]` restrictions:** Operations requiring end-of-input semantics or unbounded retained state are not valid unless a later RFC gives them bounded-state semantics. Typical disallowed examples include global `order_by`, global `limit`, unwindowed `group_by` / `agg`, eager materialization to a finite `DataFrame[T]`, and finite file writes. - **`collect` / `display`:** Not part of the `DataSet[T]` trait surface. Helpers such as `collect(data)` or `display(data)` belong to the execution context and concrete implementation model defined in InQL RFC 004, not in this RFC. -- **Intermediate traits:** `BoundedDataSet[T]` and `UnboundedDataSet[T]` do not add required core relational methods in v0.1. Future RFCs may add additional methods only where the semantics are inherently boundedness-specific and remain backend-neutral. +- **Intermediate traits:** `BoundedDataSet[T]` and `UnboundedDataSet[T]` do not add required core relational methods beyond what this RFC specifies for the shared `DataSet[T]` surface. 
Future RFCs may add additional methods only where the semantics are inherently boundedness-specific and remain backend-neutral. + +- **Static capability gating — compiler enforcement:** The **must** in **Additional requirements** (compile-time errors for invalid `UnboundedDataSet[T]` uses) is normative language for the language; the **InQL** package issue in the header tracks the **library** contract. Typechecker implementation is tracked in **[incan#187](https://github.com/dannys-code-corner/incan/issues/187)**; **[InQL #10](https://github.com/dannys-code-corner/InQL/issues/10)** is a chore to revisit this package and docs when that work lands. + +## Contract-complete checklist (library package) + +RFC 001 is **contract-complete** for the InQL package when all of the following hold (**execution** / materialization: InQL RFC 004; **static streaming enforcement** in the typechecker: Incan compiler work—separate from this checklist): + +- **Hierarchy:** `DataSet`, `BoundedDataSet`, `UnboundedDataSet`, `DataFrame`, `LazyFrame`, `DataStream` are public exports and match the type tree in this RFC. +- **Operation names:** The eight methods in **Operation API** exist on `DataSet[T]` with the **`Self`-based signatures** documented above and in companion library docs. +- **Aggregates:** `pub::inql.functions` exports at least the minimum aggregate helpers required for the import model (`total`, `count_rows` in the shipped package); bodies may be stubs until RFC 004. +- **Tests:** Package tests verify exports, trait assignability, and aggregate symbol importability without requiring runtime relational execution. +- **Docs:** This RFC, `docs/language/reference/dataset_types.md`, `docs/language/explanation/dataset_types.md`, and examples do not contradict the shipped `Self` signatures or stub status. 
diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md index 63a9643..cffa79b 100644 --- a/docs/rfcs/README.md +++ b/docs/rfcs/README.md @@ -4,18 +4,18 @@ InQL uses its **own** RFC series (starting at 000), independent of the [Incan la **New RFC:** copy [TEMPLATE.md], name the file `NNN_short_slug.md`, pick the next number from the table (or from open issues), and open a PR. Section order and header fields follow that template. For workflow and conventions, see [Writing InQL RFCs]. -| RFC | Status | Title | | -| -------------- | ------- | ------------------------------------------------------------------------------------- | ---------------------- | -| [000][rfc-000] | Planned | Language specification — core model, naming, schema shapes, layer boundaries | | -| [001][rfc-001] | Planned | Dataset types and carriers (`DataSet[T]`, `BoundedDataSet[T]`, `UnboundedDataSet[T]`) | | -| [002][rfc-002] | Planned | Apache Substrait — `Rel`-level contract, mapping catalog, binding boundaries | | -| [003][rfc-003] | Planned | `query {}` blocks — grammar, typing, Substrait lowering | | -| [004][rfc-004] | Planned | Execution context — session, DataFusion, read/transform/write | | -| [005][rfc-005] | Blocked | Pipe-forward relational syntax (`\ | >`) — optional surface | +| RFC | Status | Title | | +| -------------- | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | +| [000][rfc-000] | Planned | Language specification — core model, naming, schema shapes, layer boundaries | | +| [001][rfc-001] | Implemented | Dataset types and carriers (`DataSet[T]`, `BoundedDataSet[T]`, `UnboundedDataSet[T]`) — library package is **contract-complete** (types, `Self` method surface, `functions` imports); execution in RFC 004 | | +| [002][rfc-002] | Planned | Apache Substrait — `Rel`-level contract, mapping catalog, binding 
boundaries | | +| [003][rfc-003] | Planned | `query {}` blocks — grammar, typing, Substrait lowering | | +| [004][rfc-004] | Planned | Execution context — session, DataFusion, read/transform/write | | +| [005][rfc-005] | Blocked | Pipe-forward relational syntax (`\|>`) — optional surface | | -**Order:** [RFC 000][rfc-000] is the foundational language specification. [RFC 001][rfc-001] defines the dataset type hierarchy. [RFC 002][rfc-002] defines the Substrait interchange contract. [RFC 003][rfc-003] defines the `query {}` surface that lowers to Substrait per RFC 002 over carriers from RFC 001. [RFC 004][rfc-004] completes the end-to-end story: session, read, execute, write. [RFC 005][rfc-005] specifies optional pipe-forward syntax deferred from v0.1 and currently blocked on Incan RFC 040. +**Order:** [RFC 000][rfc-000] is the foundational language specification. [RFC 001][rfc-001] defines the dataset type hierarchy. [RFC 002][rfc-002] defines the Substrait interchange contract. [RFC 003][rfc-003] defines the `query {}` surface that lowers to Substrait per RFC 002 over carriers from RFC 001. [RFC 004][rfc-004] completes the end-to-end story: session, read, execute, write. [RFC 005][rfc-005] specifies optional pipe-forward syntax outside the RFC 000–004 milestone and currently blocked on Incan RFC 040. **v0.1 scope:** RFCs 000–004. When all five are resolved (Draft → Planned → Implemented), InQL v0.1 is complete: authors can read data, write typed queries, lower to Substrait, execute through DataFusion, and write results. diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..2835b88 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,50 @@ +# InQL examples + +Examples demonstrating InQL dataset types and patterns. + +## Current status + +These examples are **compile-safe signatures** that preserve RFC 001 type contracts.
+ +> Note: Relational expression syntax and executable backends are still evolving in sibling RFCs, so method-chain bodies are documented as comments where needed. + +## Example structure + +- `dataset_api.incn` — Demonstrates the DataSet[T] operation API +- `trait_hierarchy.incn` — Demonstrates trait hierarchy usage +- `bounded_vs_unbounded.incn` — Demonstrates bounded vs unbounded type signatures +- `models.incn` — Placeholder models for examples + +## Running examples + +```bash +incan run examples/dataset_api.incn +``` + +> Note: These examples primarily demonstrate type-shape contracts today. Execution semantics are defined by RFC 004. + +## What these examples show + +These examples document the **desired API patterns** for the initial InQL dataset surface: + +1. **RFC 001** contracts are represented as compile-safe signatures and trait assignments +2. Method-chain bodies show intended relational patterns in comments +3. **RFC 004** will provide execution behavior (DataFusion integration) + +Once those are in place, these examples will serve as: + +- **Regression tests** — verifying the patterns still work +- **Documentation** — showing users how to use the API +- **Examples** — providing starting points for real code + +## Incan status + +- **RFC 041** (First-Class Rust Interop Authoring): Implemented in Incan v0.2 +- **RFC 042** (Traits Are Always Abstract): Implemented in Incan v0.2 + +These RFCs provide the trait and interop foundation InQL builds on. + +What's still needed: + +- **Execution backend** — actual implementation of the operations (RFC 004) +- **Method-chain execution semantics** — examples still keep relational bodies as comments until runtime behavior lands diff --git a/examples/bounded_vs_unbounded.incn b/examples/bounded_vs_unbounded.incn new file mode 100644 index 0000000..923b11e --- /dev/null +++ b/examples/bounded_vs_unbounded.incn @@ -0,0 +1,81 @@ +""" +Example: Bounded vs unbounded type signatures (RFC 001). 
+ +This example focuses on compile-time type signatures and keeps runtime bodies as placeholders. + +## Patterns shown + +- BoundedDataSet[T]: All operations allowed +- UnboundedDataSet[T]: Unbounded-state operations rejected at compile time +- Static capability gating based on trait bounds +""" + +from dataset import DataSet, BoundedDataSet, UnboundedDataSet +from models import Order, Event + +# ---- Bounded-only functions ---- +def write_to_parquet(data: BoundedDataSet[Order]) -> None: + """Write to Parquet requires bounded data (finite extent).""" + pass + + +def materialize_to_dataframe(_data: BoundedDataSet[Order]) -> None: + """Materialize to DataFrame requires bounded data.""" + pass + + +def batch_aggregate(_data: BoundedDataSet[Order]) -> None: + """Batch aggregation requires bounded data.""" + pass + + +# ---- Streaming-only functions ---- +def write_to_kafka(events: UnboundedDataSet[Event]) -> None: + """Write to Kafka requires streaming data.""" + pass + + +def process_stream(events: UnboundedDataSet[Event]) -> UnboundedDataSet[Event]: + """Process streaming data.""" + return events + + +# ---- Generic functions with boundedness constraints ---- +def process_any[T](data: DataSet[T]) -> int: + """Generic function that works with any carrier.""" + return 0 + + +def process_batch[T](data: BoundedDataSet[T]) -> BoundedDataSet[T]: + """Function that works only with bounded data.""" + return data + + +# ---- Static capability gating examples ---- +def safe_filter[T](data: DataSet[T]) -> DataSet[T]: + """Filter is available on all carriers.""" + # Intended shape: data.filter(...) + return data + + +def safe_join[T](data: DataSet[T], other: DataSet[T]) -> DataSet[T]: + """Join uses `other: Self` (same carrier type at the trait level; logical row typing per RFC 003).""" + # Intended shape when stubs are executable: data.join(other, ...) 
+ _ = other + return data + + +def bounded_only_operation(data: BoundedDataSet[Order]) -> None: + """Function that requires bounded data.""" + pass + + +def streaming_safe_operation(data: UnboundedDataSet[Event]) -> None: + """Function that works with streaming data.""" + pass + + +# ---- Main ---- +def main() -> None: + """Example main function.""" + pass diff --git a/examples/dataset_api.incn b/examples/dataset_api.incn new file mode 100644 index 0000000..657ae30 --- /dev/null +++ b/examples/dataset_api.incn @@ -0,0 +1,76 @@ +""" +Example: DataSet[T] operation API patterns (RFC 001). + +This file is intentionally compile-safe today: it preserves the type signatures and documents the intended method-chain +shapes without requiring query-expression parsing. +""" + +from dataset import DataSet, BoundedDataSet, DataFrame, LazyFrame +from models import Order, Customer + +def filter_high_value_orders(orders: LazyFrame[Order]) -> LazyFrame[Order]: + # Intended shape: orders.filter(.amount > 100) + return orders + + +def filter_active_orders(orders: LazyFrame[Order]) -> LazyFrame[Order]: + # Intended shape: orders.filter(.status == "active") + return orders + + +def filter_combined(orders: LazyFrame[Order]) -> LazyFrame[Order]: + # Intended shape: orders.filter(...).filter(...) + return orders + + +def join_orders_with_customers(orders: LazyFrame[Order], customers: LazyFrame[Customer]) -> LazyFrame[Order]: + # Trait join uses `other: Self`; heterogeneous Order/Customer join is a logical/RFC 003 concern. + # Same-schema illustration: orders.join(orders, True) when both are LazyFrame[Order]. 
+ _ = customers + return orders + + +def select_basic_fields(orders: LazyFrame[Order]) -> LazyFrame[Order]: + # Intended shape: orders.select(.id, .amount, .status) + return orders + + +def select_with_expression(orders: LazyFrame[Order]) -> LazyFrame[Order]: + # Intended shape: orders.select(.id, .amount * 1.1 as total_with_tax) + return orders + + +def group_by_customer_sum_amount(orders: LazyFrame[Order]) -> LazyFrame[Order]: + # Intended shape: orders.group_by(.customer_id).agg(total(.amount)) + return orders + + +def group_by_multiple_keys(orders: LazyFrame[Order]) -> LazyFrame[Order]: + # Intended shape: orders.group_by(.customer_id, .status).agg(total(.amount)) + return orders + + +def order_by_amount_desc(orders: LazyFrame[Order]) -> LazyFrame[Order]: + # Intended shape: orders.order_by(.amount.desc()) + return orders + + +def order_by_limit_top_10(orders: LazyFrame[Order]) -> LazyFrame[Order]: + # Intended shape: orders.order_by(.amount.desc()).limit(10) + return orders + + +def process_orders_batch(orders: DataFrame[Order]) -> DataFrame[Order]: + return orders + + +def batch_only_operation(_data: BoundedDataSet[Order]) -> None: + pass + + +def generic_any_carrier[T](_data: DataSet[T]) -> None: + pass + + +def main() -> None: + pass diff --git a/examples/models.incn b/examples/models.incn new file mode 100644 index 0000000..94a882a --- /dev/null +++ b/examples/models.incn @@ -0,0 +1,31 @@ +""" +Placeholder models for InQL examples. + +These are simplified models for demonstration purposes. In real code, these would +be defined in a separate module with proper field definitions. 
+ +## Models + +- `Order`: Represents an order with id, customer_id, amount, status +- `Customer`: Represents a customer with id, name, email +- `Event`: Represents an event with id, event_type, severity +""" + +# In real code, these would have proper field definitions: +pub model Order: + pub id: int + pub customer_id: int + pub amount: float + pub status: str + + +pub model Customer: + pub id: int + pub name: str + pub email: str + + +pub model Event: + pub id: int + pub event_type: str + pub severity: str diff --git a/examples/trait_hierarchy.incn b/examples/trait_hierarchy.incn new file mode 100644 index 0000000..cd33970 --- /dev/null +++ b/examples/trait_hierarchy.incn @@ -0,0 +1,90 @@ +""" +Example: Trait hierarchy patterns (RFC 001). + +This example demonstrates compile-time trait hierarchy usage patterns. + +## Patterns shown + +- Trait inheritance: BoundedDataSet[T] extends DataSet[T] +- Trait inheritance: UnboundedDataSet[T] extends DataSet[T] +- Concrete type implementation: DataFrame[T] implements BoundedDataSet[T] +- Concrete type implementation: LazyFrame[T] implements BoundedDataSet[T] +- Concrete type implementation: DataStream[T] implements UnboundedDataSet[T] +""" + +from dataset import DataSet, BoundedDataSet, UnboundedDataSet, DataFrame, LazyFrame, DataStream +from models import Order, Event + +# ---- Trait hierarchy assignment patterns ---- +def assign_data_frame_to_bounded(data: DataFrame[Order]) -> BoundedDataSet[Order]: + """Concrete bounded carrier upcasts to its bounded trait.""" + return data + + +def assign_lazy_frame_to_bounded(data: LazyFrame[Order]) -> BoundedDataSet[Order]: + """Deferred bounded carrier upcasts to its bounded trait.""" + return data + + +def assign_data_stream_to_unbounded(data: DataStream[Event]) -> UnboundedDataSet[Event]: + """Streaming carrier upcasts to its unbounded trait.""" + return data + + +def assign_bounded_to_data_set(data: BoundedDataSet[Order]) -> DataSet[Order]: + """Bounded trait upcasts to the root 
DataSet trait.""" + return data + + +def assign_unbounded_to_data_set(data: UnboundedDataSet[Event]) -> DataSet[Event]: + """Unbounded trait upcasts to the root DataSet trait.""" + return data + + +def chain_data_frame_to_root(data: DataFrame[Order]) -> DataSet[Order]: + """Concrete-to-supertrait chain: DataFrame -> BoundedDataSet -> DataSet.""" + return assign_bounded_to_data_set(assign_data_frame_to_bounded(data)) + + +def chain_data_stream_to_root(data: DataStream[Event]) -> DataSet[Event]: + """Concrete-to-supertrait chain: DataStream -> UnboundedDataSet -> DataSet.""" + return assign_unbounded_to_data_set(assign_data_stream_to_unbounded(data)) + + +# ---- Function signature patterns ---- +def accept_any_data_set[T](data: DataSet[T]) -> None: + """Function accepts any dataset carrier (most general input, fewest available operations).""" + # Only intersection of bounded + unbounded operations available + pass + + +def accept_bounded_only(data: BoundedDataSet[Order]) -> None: + """Function accepts only bounded datasets.""" + # All operations available + pass + + +def accept_unbounded_only(data: UnboundedDataSet[Event]) -> None: + """Function accepts only unbounded datasets.""" + # All operations except unbounded-state operations available + pass + + +# ---- Generic function patterns ---- +def process_any[T](data: DataSet[T]) -> int: + """Generic function that works with any dataset carrier.""" + # Can only use operations available on DataSet[T] + return 0 + + +def process_batch_only(data: BoundedDataSet[Order]) -> int: + """Function that works only with bounded datasets.""" + # Can use all operations + return 0 + + +# ---- Main ---- +def main() -> None: + """Example main function.""" + pass + diff --git a/incan.lock b/incan.lock index e23fd9b..f501b25 100644 --- a/incan.lock +++ b/incan.lock @@ -3,8 +3,8 @@ [incan] format = 1 -incan-version = "0.2.0-dev.3" -generated = "2026-03-20T10:34:57.236108Z" +incan-version = "0.2.0-dev.5" +generated = "2026-03-27T13:23:49.961814Z" deps-fingerprint = 
"sha256:17f122844d2fa1c9756f9a1976d222f15255557e74d975b8d8ff46536ea82b87" cargo-features = [] cargo-no-default-features = false @@ -18,11 +18,11 @@ version = 4 [[package]] name = "incan_core" -version = "0.2.0-dev.3" +version = "0.2.0-dev.5" [[package]] name = "incan_derive" -version = "0.2.0-dev.3" +version = "0.2.0-dev.5" dependencies = [ "proc-macro2", "quote", @@ -31,7 +31,7 @@ dependencies = [ [[package]] name = "incan_stdlib" -version = "0.2.0-dev.3" +version = "0.2.0-dev.5" dependencies = [ "incan_core", "incan_derive", @@ -39,7 +39,7 @@ dependencies = [ [[package]] name = "inql" -version = "0.2.0-dev.3" +version = "0.2.0-dev.5" dependencies = [ "incan_derive", "incan_stdlib", diff --git a/src/dataset.incn b/src/dataset.incn new file mode 100644 index 0000000..ac100a0 --- /dev/null +++ b/src/dataset.incn @@ -0,0 +1,125 @@ +""" +Dataset carriers for InQL (RFC 001). + +This module defines the *author-facing* type hierarchy used to carry schema-parameterized tabular data through +relational pipelines: + +```text +DataSet[T] + ├─ BoundedDataSet[T] (finite / batch) + │ ├─ DataFrame[T] (eager / materialized) + │ └─ LazyFrame[T] (deferred / planned) + └─ UnboundedDataSet[T] (streaming / unbounded) + └─ DataStream[T] (streaming) +``` + +The implementation here is intentionally backend-neutral: these APIs are the public surface. + +## Type hierarchy + +The dataset type hierarchy is rooted in the `DataSet[T]` trait, split into `BoundedDataSet[T]` (finite extent) and +`UnboundedDataSet[T]` (streaming/unbounded), with three concrete types: `DataFrame[T]` (materialized/eager), +`LazyFrame[T]` (deferred plan), and `DataStream[T]` (streaming). + +## Operation API + +The following methods are defined on `DataSet[T]`. 
+ +- `filter(self, predicate: bool) -> Self` +- `join(self, other: Self, on: bool) -> Self` +- `select(self) -> Self` +- `group_by(self) -> Self` +- `agg(self) -> Self` +- `order_by(self) -> Self` +- `limit(self, n: int) -> Self` +- `explode(self) -> Self` + +## Documentation + +- **Explanation**: [docs/language/explanation/dataset_types.md](../docs/language/explanation/dataset_types.md) +- **Reference**: [docs/language/reference/dataset_types.md](../docs/language/reference/dataset_types.md) +- **Examples**: [examples/](../examples/) + +## Example patterns + +```incan +from pub::inql import LazyFrame, DataFrame, DataStream +from models import Order, Event + +# Filter and chain +orders.filter(.amount > 100).filter(.status == "active") + +# Join +orders.join(customers, .order.customer_id == .customer.id) + +# Group and aggregate +orders.group_by(.customer_id).agg(total(.amount)) + +# Order and limit +orders.order_by(.amount.desc()).limit(10) + +# Bounded vs unbounded signatures +def batch_only(data: BoundedDataSet[Order]) -> None: ... +def streaming_only(data: UnboundedDataSet[Event]) -> None: ... +def generic_any(data: DataSet[T]) -> None: ... +``` +""" + +from std.testing import fail_t as NotImplementedError + +# ---- DataSet trait ---- + +pub trait DataSet[T]: + """ + Root dataset trait (any tabular carrier with row schema `T`). 
+ """ + + def filter(self, predicate: bool) -> Self: + return NotImplementedError("InQL DataSet.filter is not implemented yet") + + def join(self, other: Self, on: bool) -> Self: + return NotImplementedError("InQL DataSet.join is not implemented yet") + + def select(self) -> Self: + return NotImplementedError("InQL DataSet.select is not implemented yet") + + def group_by(self) -> Self: + return NotImplementedError("InQL DataSet.group_by is not implemented yet") + + def agg(self) -> Self: + return NotImplementedError("InQL DataSet.agg is not implemented yet") + + def order_by(self) -> Self: + return NotImplementedError("InQL DataSet.order_by is not implemented yet") + + def limit(self, n: int) -> Self: + return NotImplementedError("InQL DataSet.limit is not implemented yet") + + def explode(self) -> Self: + return NotImplementedError("InQL DataSet.explode is not implemented yet") + + +# ---- BoundedDataSet trait and concrete types ---- + +pub trait BoundedDataSet[T] with DataSet[T]: + """Finite/batch dataset carrier (all relational ops are allowed).""" + + +pub class DataFrame[T] with BoundedDataSet: + """Materialized/eager dataset (always bounded).""" + pub _row_schema_marker: T + + +pub class LazyFrame[T] with BoundedDataSet: + """Deferred plan over a bounded source (always bounded).""" + pub _row_schema_marker: T + + +# ---- UnboundedDataSet trait and concrete types ---- + +pub trait UnboundedDataSet[T] with DataSet[T]: + """Streaming/unbounded dataset carrier (compile-time constraints enforced by compiler).""" + +pub class DataStream[T] with UnboundedDataSet: + """Streaming dataset (unbounded).""" + pub _row_schema_marker: T diff --git a/src/functions.incn b/src/functions.incn new file mode 100644 index 0000000..f87fe32 --- /dev/null +++ b/src/functions.incn @@ -0,0 +1,18 @@ +""" +Aggregate helpers for `.agg(...)` (RFC 001). + +These symbols **must** be imported by authors (for example `from pub::inql.functions import total`); they are not ambient +builtins. 
Names avoid `sum` / `count` clashes with Incan/stdlib builtins. Bodies are neutral stubs until InQL RFC 004 +defines execution; lowering and relational typing for aggregates live in companion RFCs. + +FIXME: review this behavior once Incan's RFC 045 is implemented; we would preferably migrate these to builtins. +""" + +pub def total(_x: int) -> int: + """Sum aggregate (stub; engine / lowering supplies semantics).""" + return 0 + + +pub def count_rows(_x: int) -> int: + """Count aggregate placeholder (stub).""" + return 0 diff --git a/src/lib.incn b/src/lib.incn index 0676d99..32d5f53 100644 --- a/src/lib.incn +++ b/src/lib.incn @@ -1,8 +1,10 @@ """ InQL — typed query surface (Incan library). -Add modules (e.g. `dataframe`, `query`) and re-export them here with `pub from ... import ...`. +Add modules (e.g. `dataset`, `functions`, `query`) and re-export them here with `pub from ... import ...`. Consumers depend on this package via `[dependencies]` and import with `from pub::inql import ...`. 
""" +pub from dataset import BoundedDataSet, DataFrame, DataSet, DataStream, LazyFrame, UnboundedDataSet +pub from functions import count_rows, total pub from metadata import inql_version diff --git a/tests/test_dataset.incn b/tests/test_dataset.incn new file mode 100644 index 0000000..1b697ff --- /dev/null +++ b/tests/test_dataset.incn @@ -0,0 +1,132 @@ +"""Test: dataset types and RFC 001 contract (types, hierarchy, functions import).""" + +from std.testing import assert_eq +from metadata import inql_version +from dataset import DataSet, BoundedDataSet, UnboundedDataSet, DataFrame, LazyFrame, DataStream +from functions import count_rows, total + +# ---- Helper functions and tooling ---- +model Order: + id: int + + +model Event: + id: int + + +def _accept_data_set_generic[T](data: DataSet[T]) -> DataSet[T]: + return data + + +def _accept_bounded_generic[T](data: BoundedDataSet[T]) -> BoundedDataSet[T]: + return data + + +def _accept_unbounded_generic[T](data: UnboundedDataSet[T]) -> UnboundedDataSet[T]: + return data + + +def _accept_data_frame_concrete[T](data: DataFrame[T]) -> DataFrame[T]: + return data + + +def _accept_lazy_frame_concrete[T](data: LazyFrame[T]) -> LazyFrame[T]: + return data + + +def _accept_data_stream_concrete[T](data: DataStream[T]) -> DataStream[T]: + return data + + +def _upcast_data_frame_to_bounded[T](data: DataFrame[T]) -> BoundedDataSet[T]: + return data + + +def _upcast_lazy_frame_to_bounded[T](data: LazyFrame[T]) -> BoundedDataSet[T]: + return data + + +def _upcast_data_stream_to_unbounded[T](data: DataStream[T]) -> UnboundedDataSet[T]: + return data + + +def _upcast_bounded_to_data_set[T](data: BoundedDataSet[T]) -> DataSet[T]: + return data + + +def _upcast_unbounded_to_data_set[T](data: UnboundedDataSet[T]) -> DataSet[T]: + return data + + +def _touch[T](x: T) -> None: + """Consume a value so assignment chains are not unused-variable warnings.""" + pass + + +def _compile_hierarchy_assignability(order_frame: DataFrame[Order], 
order_lazy: LazyFrame[Order], event_stream: DataStream[Event]) -> None: + """Compile-time shape checks for concrete -> trait -> supertrait assignments.""" + sink0 = _upcast_bounded_to_data_set(_upcast_data_frame_to_bounded(order_frame)) + sink1 = _upcast_bounded_to_data_set(_upcast_lazy_frame_to_bounded(order_lazy)) + sink2 = _upcast_unbounded_to_data_set(_upcast_data_stream_to_unbounded(event_stream)) + _touch(sink0) + _touch(sink1) + _touch(sink2) + + +# ---- Test cases ---- +def test_smoke__dataset_types_are_published() -> None: + """RFC 001: carrier types and aggregate helpers are importable.""" + assert_eq(total(42), 0, "total is a stub returning 0 until RFC 004") + assert_eq(count_rows(7), 0, "count_rows is a stub returning 0 until RFC 004") + + +def test_type_contracts__signature_tiers_compile() -> None: + """RFC 001: DataSet / BoundedDataSet / UnboundedDataSet accept concrete carriers.""" + row = Order(id=1) + df: DataFrame[Order] = DataFrame(_row_schema_marker=row) + ev = Event(id=2) + st: DataStream[Event] = DataStream(_row_schema_marker=ev) + _touch(_accept_data_set_generic(df)) + _touch(_accept_bounded_generic(df)) + _touch(_accept_data_set_generic(st)) + _touch(_accept_unbounded_generic(st)) + + +def test_type_contracts__concrete_carriers_compile() -> None: + """RFC 001: concrete carriers flow through generic helper signatures.""" + df: DataFrame[Order] = DataFrame(_row_schema_marker=Order(id=3)) + lf: LazyFrame[Order] = LazyFrame(_row_schema_marker=Order(id=3003)) + ev = Event(id=4) + st: DataStream[Event] = DataStream(_row_schema_marker=ev) + _touch(_accept_data_frame_concrete(df)) + _touch(_accept_lazy_frame_concrete(lf)) + _touch(_accept_data_stream_concrete(st)) + + +def test_hierarchy__concrete_and_supertrait_assignability() -> None: + """RFC 001: concrete -> bounded/unbounded trait -> DataSet chains compile.""" + df: DataFrame[Order] = DataFrame(_row_schema_marker=Order(id=5)) + lf: LazyFrame[Order] = LazyFrame(_row_schema_marker=Order(id=5005)) 
+ ev = Event(id=6) + st: DataStream[Event] = DataStream(_row_schema_marker=ev) + _compile_hierarchy_assignability(df, lf, st) + + +def test_type_contracts__concrete_and_trait_types_match_generic_arguments() -> None: + """Generic parameter T matches for DataFrame/LazyFrame/DataStream carriers.""" + df: DataFrame[Order] = DataFrame(_row_schema_marker=Order(id=7)) + lf: LazyFrame[Order] = LazyFrame(_row_schema_marker=Order(id=7007)) + bounded_df: BoundedDataSet[Order] = df + bounded_lf: BoundedDataSet[Order] = lf + _touch(bounded_df) + _touch(bounded_lf) + ev = Event(id=8) + st: DataStream[Event] = DataStream(_row_schema_marker=ev) + ub: UnboundedDataSet[Event] = st + _touch(ub) + + +def test_version__inql_version_is_published() -> None: + """InQL version should be available from metadata.""" + version = inql_version() + assert_eq(version, "0.1.0", "InQL version should be 0.1.0") diff --git a/tests/test_inql.incn b/tests/test_inql.incn deleted file mode 100644 index c56e554..0000000 --- a/tests/test_inql.incn +++ /dev/null @@ -1,7 +0,0 @@ -from std.testing import assert_eq -from metadata import inql_version - - -# TODO: this acts as a placeholder - remove this once we get more implementation service -def test_inql_version_is_published() -> None: - assert_eq(inql_version(), "0.1.0") From 913967d80910ccb640e64287ac132d5f922ba23c Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Fri, 27 Mar 2026 17:21:29 +0100 Subject: [PATCH 2/4] ci fix --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 277824e..d06c30f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,7 +22,7 @@ env: RUST_BACKTRACE: 1 INCAN_NO_BANNER: 1 # Pin the compiler checkout; bump when you intentionally require a newer Incan. 
- INCAN_REPO: dannys-code-corner/incan-programming-language + INCAN_REPO: dannys-code-corner/incan INCAN_REF: main jobs: From 93a52e5d8d49ab3937d520d6af042f0bba86dd00 Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Fri, 27 Mar 2026 18:14:21 +0100 Subject: [PATCH 3/4] update for CI --- .github/workflows/ci.yml | 2 +- AGENTS.md | 4 ++-- Makefile | 27 ++++++++++++++++++++------- docs/architecture.md | 2 +- examples/trait_hierarchy.incn | 1 - src/dataset.incn | 15 +++------------ 6 files changed, 27 insertions(+), 24 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d06c30f..8bb8be9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,7 +64,7 @@ jobs: incan --version rustc --version - - name: Format check (Incan sources) + - name: Format check (package .incn) run: make fmt-check - name: Build library diff --git a/AGENTS.md b/AGENTS.md index 8037ae9..47bd4fa 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -70,8 +70,8 @@ Normative behavior is defined in **`docs/rfcs/`**. 
If package code and an RFC di | `make help` | List targets | | `make ci` | Same as CI: `fmt-check`, `build`, `test` | | `make check` / `make pre-commit` | Alias-style gate: format check + build + test | -| `make fmt` | Format `.incn` sources (`incan fmt .`) | -| `make fmt-check` | Check formatting without writing | +| `make fmt` | Format package `.incn` sources (`src/`, `tests/`, `examples/` only) | +| `make fmt-check` | Check formatting without writing (same scope) | | `make build` | `incan build --lib` | | `make test` | `incan test` | | `make build-locked` / `make test-locked` | Stricter lockfile mode | diff --git a/Makefile b/Makefile index d223612..990750f 100644 --- a/Makefile +++ b/Makefile @@ -43,18 +43,31 @@ test-locked: ## Run tests with `--locked` @$(INCAN) test --locked # ============================================================================= -# Formatting (Incan source) +# Formatting (Incan source — package only) # ============================================================================= +# +# Scope to `src/`, `tests/`, and `examples/` only. CI checks out the Incan +# compiler under `./incan/`; formatting `.` would walk that tree and fail on +# stdlib snapshots and test fixtures that are not meant for `incan fmt`. + +INQL_FMT_DIRS := src tests examples .PHONY: fmt -fmt: ## Format `.incn` sources (`incan fmt`) - @echo "\033[1mFormatting Incan sources...\033[0m" - @$(INCAN) fmt . +fmt: ## Format package `.incn` sources (`incan fmt` per directory) + @echo "\033[1mFormatting Incan sources (package dirs)...\033[0m" + @for d in $(INQL_FMT_DIRS); do \ + if [ -d "$$d" ]; then $(INCAN) fmt "$$d"; fi; \ + done .PHONY: fmt-check -fmt-check: ## Check formatting without writing (`incan fmt --check`) - @echo "\033[1mChecking Incan source formatting...\033[0m" - @$(INCAN) fmt --check . 
+fmt-check: ## Check formatting without writing (`incan fmt --check` per directory) + @echo "\033[1mChecking Incan source formatting (package dirs)...\033[0m" + @for d in $(INQL_FMT_DIRS); do \ + if [ -d "$$d" ]; then \ + echo "\033[1m -> $$d/\033[0m"; \ + $(INCAN) fmt --check "$$d" || exit $$?; \ + fi; \ + done # ============================================================================= # Aggregates (local gates) diff --git a/docs/architecture.md b/docs/architecture.md index 91e923e..0b41142 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -76,7 +76,7 @@ From the repo root, with `incan` on your `PATH`: ```text make ci │ - └──► incan fmt --check . → incan build --lib → incan test + └──► incan fmt --check (package dirs) → incan build --lib → incan test ``` Equivalent raw commands: diff --git a/examples/trait_hierarchy.incn b/examples/trait_hierarchy.incn index cd33970..2f909f6 100644 --- a/examples/trait_hierarchy.incn +++ b/examples/trait_hierarchy.incn @@ -87,4 +87,3 @@ def process_batch_only(data: BoundedDataSet[Order]) -> int: def main() -> None: """Example main function.""" pass - diff --git a/src/dataset.incn b/src/dataset.incn index ac100a0..4e8310b 100644 --- a/src/dataset.incn +++ b/src/dataset.incn @@ -68,12 +68,7 @@ def generic_any(data: DataSet[T]) -> None: ... from std.testing import fail_t as NotImplementedError # ---- DataSet trait ---- - pub trait DataSet[T]: - """ - Root dataset trait (any tabular carrier with row schema `T`). 
- """ - def filter(self, predicate: bool) -> Self: return NotImplementedError("InQL DataSet.filter is not implemented yet") @@ -100,26 +95,22 @@ pub trait DataSet[T]: # ---- BoundedDataSet trait and concrete types ---- - pub trait BoundedDataSet[T] with DataSet[T]: - """Finite/batch dataset carrier (all relational ops are allowed).""" + pass pub class DataFrame[T] with BoundedDataSet: - """Materialized/eager dataset (always bounded).""" pub _row_schema_marker: T pub class LazyFrame[T] with BoundedDataSet: - """Deferred plan over a bounded source (always bounded).""" pub _row_schema_marker: T # ---- UnboundedDataSet trait and concrete types ---- - pub trait UnboundedDataSet[T] with DataSet[T]: - """Streaming/unbounded dataset carrier (compile-time constraints enforced by compiler).""" + pass + pub class DataStream[T] with UnboundedDataSet: - """Streaming dataset (unbounded).""" pub _row_schema_marker: T From e8bfe1dd046ed65b9851912732646ed075768fe1 Mon Sep 17 00:00:00 2001 From: Danny Meijer Date: Fri, 27 Mar 2026 18:27:54 +0100 Subject: [PATCH 4/4] another ci fix --- AGENTS.md | 2 +- CONTRIBUTING.md | 2 +- Makefile | 13 ++++++++++--- README.md | 2 +- docs/architecture.md | 6 +++--- 5 files changed, 16 insertions(+), 9 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 47bd4fa..d269845 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -73,7 +73,7 @@ Normative behavior is defined in **`docs/rfcs/`**. If package code and an RFC di | `make fmt` | Format package `.incn` sources (`src/`, `tests/`, `examples/` only) | | `make fmt-check` | Check formatting without writing (same scope) | | `make build` | `incan build --lib` | -| `make test` | `incan test` | +| `make test` | `incan test tests` (package `tests/` only; avoids picking up a sibling `./incan/` checkout) | | `make build-locked` / `make test-locked` | Stricter lockfile mode | Requires `incan` on `PATH`, or `make build INCAN=/path/to/incan`. 
CI builds Incan from source then runs `make ci` (see [.github/workflows/ci.yml][ci-workflow]). diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5e486e6..68fdf56 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -45,7 +45,7 @@ Thank you for your interest in InQL — the typed relational layer for [Incan][i make test ``` - With `incan` on your `PATH` you can call `incan build --lib` and `incan test` directly. Override the binary with `make build INCAN=/path/to/incan` if needed. + With `incan` on your `PATH` you can call `incan build --lib` and `incan test tests` directly (use the `tests/` path so a sibling Incan checkout under `./incan/` is not collected). Override the binary with `make build INCAN=/path/to/incan` if needed. ## Project structure diff --git a/Makefile b/Makefile index 990750f..62156e7 100644 --- a/Makefile +++ b/Makefile @@ -21,6 +21,13 @@ help: ## Show Make targets # ============================================================================= # Build & test (primary — Incan-first) # ============================================================================= +# +# Test discovery defaults to `.` and walks the whole tree. CI checks out the +# Incan compiler under `./incan/`; running `incan test` without a path would +# pick up compiler integration tests (e.g. `incan/tests/test_*.incn`), which +# are not InQL package tests. Scope to `tests/` only (see INQL_FMT_DIRS). 
+ +INQL_TEST_DIR := tests .PHONY: build build: ## Build the library (`incan build --lib`) @@ -28,9 +35,9 @@ build: ## Build the library (`incan build --lib`) @$(INCAN) build --lib .PHONY: test -test: ## Run package tests (`incan test`) +test: ## Run package tests (`incan test tests`) @echo "\033[1mRunning InQL tests...\033[0m" - @$(INCAN) test + @$(INCAN) test $(INQL_TEST_DIR) .PHONY: build-locked build-locked: ## Build with `--locked` (stricter; requires current `incan.lock`) @@ -40,7 +47,7 @@ build-locked: ## Build with `--locked` (stricter; requires current `incan.lock`) .PHONY: test-locked test-locked: ## Run tests with `--locked` @echo "\033[1mRunning InQL tests (locked)...\033[0m" - @$(INCAN) test --locked + @$(INCAN) test $(INQL_TEST_DIR) --locked # ============================================================================= # Formatting (Incan source — package only) diff --git a/README.md b/README.md index c42ca54..8d43ce9 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Or invoke the toolchain directly: ```bash incan build --lib -incan test +incan test tests ``` See `make help` for other targets (`fmt`, `fmt-check`, `build-locked`, …). Continuous integration builds **Incan from source** on each run, then runs the same `fmt-check`, `build`, and `test` steps (see [.github/workflows/ci.yml](.github/workflows/ci.yml)). 
diff --git a/docs/architecture.md b/docs/architecture.md index 0b41142..3f0d00b 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -76,7 +76,7 @@ From the repo root, with `incan` on your `PATH`: ```text make ci │ - └──► incan fmt --check (package dirs) → incan build --lib → incan test + └──► incan fmt --check (package dirs) → incan build --lib → incan test tests ``` Equivalent raw commands: @@ -87,9 +87,9 @@ incan build --lib └──► Incan frontend (parse, check, …) + backend emit a Rust crate for the library (same staged pipeline as application builds; see [Incan architecture docs][incan-architecture]) -incan test +incan test tests │ - └──► Discover and run tests under tests/ + └──► Discover and run tests under `tests/` only (not the whole repo; CI may have `./incan/` checked out) ``` **GitHub Actions** does not assume a preinstalled `incan` binary: the workflow checks out the [Incan compiler repository][incan-repo], runs `cargo build --release`, adds `target/release` to `PATH`, then runs `make ci` in this tree.