dannys-code-corner · dannymeijer · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -22,7 +22,7 @@ env:
   RUST_BACKTRACE: 1
   INCAN_NO_BANNER: 1
   # Pin the compiler checkout; bump when you intentionally require a newer Incan.
-  INCAN_REPO: dannys-code-corner/incan-programming-language
+  INCAN_REPO: dannys-code-corner/incan
   INCAN_REF: main
 
 jobs:
@@ -64,7 +64,7 @@ jobs:
           incan --version
           rustc --version
 
-      - name: Format check (Incan sources)
+      - name: Format check (package .incn)
         run: make fmt-check
 
       - name: Build library

diff --git a/AGENTS.md b/AGENTS.md
@@ -70,10 +70,10 @@ Normative behavior is defined in **`docs/rfcs/`**. If package code and an RFC di
 | `make help` | List targets |
 | `make ci` | Same as CI: `fmt-check`, `build`, `test` |
 | `make check` / `make pre-commit` | Alias-style gate: format check + build + test |
-| `make fmt` | Format `.incn` sources (`incan fmt .`) |
-| `make fmt-check` | Check formatting without writing |
+| `make fmt` | Format package `.incn` sources (`src/`, `tests/`, `examples/` only) |
+| `make fmt-check` | Check formatting without writing (same scope) |
 | `make build` | `incan build --lib` |
-| `make test` | `incan test` |
+| `make test` | `incan test tests` (package `tests/` only; avoids picking up a sibling `./incan/` checkout) |
 | `make build-locked` / `make test-locked` | Stricter lockfile mode |
 
 Requires `incan` on `PATH`, or `make build INCAN=/path/to/incan`. CI builds Incan from source then runs `make ci` (see [.github/workflows/ci.yml][ci-workflow]).

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -45,7 +45,7 @@ Thank you for your interest in InQL — the typed relational layer for [Incan][i
    make test
    ```
 
-   With `incan` on your `PATH` you can call `incan build --lib` and `incan test` directly. Override the binary with `make build INCAN=/path/to/incan` if needed.
+   With `incan` on your `PATH` you can call `incan build --lib` and `incan test tests` directly (use the `tests/` path so a sibling Incan checkout under `./incan/` is not collected). Override the binary with `make build INCAN=/path/to/incan` if needed.
 
 ## Project structure
 

diff --git a/Makefile b/Makefile
@@ -21,16 +21,23 @@ help: ## Show Make targets
 # =============================================================================
 # Build & test (primary — Incan-first)
 # =============================================================================
+#
+# Test discovery defaults to `.` and walks the whole tree. CI checks out the
+# Incan compiler under `./incan/`; running `incan test` without a path would
+# pick up compiler integration tests (e.g. `incan/tests/test_*.incn`), which
+# are not InQL package tests. Scope to `tests/` only (formatting is scoped likewise via INQL_FMT_DIRS).
+
+INQL_TEST_DIR := tests
 
 .PHONY: build
 build: ## Build the library (`incan build --lib`)
 	@echo "\033[1mBuilding InQL library...\033[0m"
 	@$(INCAN) build --lib
 
 .PHONY: test
-test: ## Run package tests (`incan test`)
+test: ## Run package tests (`incan test tests`)
 	@echo "\033[1mRunning InQL tests...\033[0m"
-	@$(INCAN) test
+	@$(INCAN) test $(INQL_TEST_DIR)
 
 .PHONY: build-locked
 build-locked: ## Build with `--locked` (stricter; requires current `incan.lock`)
@@ -40,21 +47,34 @@ build-locked: ## Build with `--locked` (stricter; requires current `incan.lock`)
 .PHONY: test-locked
 test-locked: ## Run tests with `--locked`
 	@echo "\033[1mRunning InQL tests (locked)...\033[0m"
-	@$(INCAN) test --locked
+	@$(INCAN) test $(INQL_TEST_DIR) --locked
 
 # =============================================================================
-# Formatting (Incan source)
+# Formatting (Incan source — package only)
 # =============================================================================
+#
+# Scope to `src/`, `tests/`, and `examples/` only. CI checks out the Incan
+# compiler under `./incan/`; formatting `.` would walk that tree and fail on
+# stdlib snapshots and test fixtures that are not meant for `incan fmt`.
+
+INQL_FMT_DIRS := src tests examples
 
 .PHONY: fmt
-fmt: ## Format `.incn` sources (`incan fmt`)
-	@echo "\033[1mFormatting Incan sources...\033[0m"
-	@$(INCAN) fmt .
+fmt: ## Format package `.incn` sources (`incan fmt` per directory)
+	@echo "\033[1mFormatting Incan sources (package dirs)...\033[0m"
+	@for d in $(INQL_FMT_DIRS); do \
+	  if [ -d "$$d" ]; then $(INCAN) fmt "$$d" || exit $$?; fi; \
+	done
 
 .PHONY: fmt-check
-fmt-check: ## Check formatting without writing (`incan fmt --check`)
-	@echo "\033[1mChecking Incan source formatting...\033[0m"
-	@$(INCAN) fmt --check .
+fmt-check: ## Check formatting without writing (`incan fmt --check` per directory)
+	@echo "\033[1mChecking Incan source formatting (package dirs)...\033[0m"
+	@for dir in $(INQL_FMT_DIRS); do \
+	  [ -d "$$dir" ] || continue; \
+	  echo "\033[1m  -> $$dir/\033[0m"; \
+	  $(INCAN) fmt --check "$$dir" \
+	    || exit $$?; \
+	done
 
 # =============================================================================
 # Aggregates (local gates)

diff --git a/README.md b/README.md
@@ -53,7 +53,7 @@ Or invoke the toolchain directly:
 
 ```bash
 incan build --lib
-incan test
+incan test tests
 ```
 
 See `make help` for other targets (`fmt`, `fmt-check`, `build-locked`, …). Continuous integration builds **Incan from source** on each run, then runs the same `fmt-check`, `build`, and `test` steps (see [.github/workflows/ci.yml](.github/workflows/ci.yml)).
diff --git a/docs/architecture.md b/docs/architecture.md
@@ -76,7 +76,7 @@ From the repo root, with `incan` on your `PATH`:
 ```text
 make ci
   │
-  └──►  incan fmt --check .  →  incan build --lib  →  incan test
+  └──►  incan fmt --check (package dirs)  →  incan build --lib  →  incan test tests
 ```
 
 Equivalent raw commands:
@@ -87,9 +87,9 @@ incan build --lib
   └──►  Incan frontend (parse, check, …) + backend emit a Rust crate for the library
         (same staged pipeline as application builds; see [Incan architecture docs][incan-architecture])
 
-incan test
+incan test tests
   │
-  └──►  Discover and run tests under tests/
+  └──►  Discover and run tests under `tests/` only (not the whole repo; CI may have `./incan/` checked out)
 ```
 
 **GitHub Actions** does not assume a preinstalled `incan` binary: the workflow checks out the [Incan compiler repository][incan-repo], runs `cargo build --release`, adds `target/release` to `PATH`, then runs `make ci` in this tree.

diff --git a/docs/language/explanation/dataset_types.md b/docs/language/explanation/dataset_types.md
@@ -0,0 +1,122 @@
+# Dataset types (Explanation)
+
+This page explains how to think about and use InQL's dataset types.
+
+## Why dataset types?
+
+Typed pipelines need a first-class carrier for columnar data indexed by `T`. Without `DataSet[T]`, relational authoring surfaces would lack a stable primary relation and schema flow for `FROM`-style entry points.
+
+The **bounded/unbounded** distinction — inspired by Spark Structured Streaming's principle that a stream is an unbounded table — must be expressed at the **type level** so the compiler can enforce streaming constraints statically rather than at runtime.
+
+## The core idea
+
+A `DataSet[T]` is a **schema-parameterized tabular carrier**:
+
+- `T` is an Incan `model` — the row schema
+- The carrier holds tabular data with that schema
+- Operations like `filter`, `join`, `select` transform the carrier
+
+## Bounded vs unbounded
+
+The key insight is that **a stream is an unbounded table**. Rather than defining separate operation APIs for batch and streaming, `DataSet[T]` provides one relational operation surface. The bounded/unbounded property is expressed through the type system:
+
+- **`BoundedDataSet[T]`** — finite extent, all operations allowed
+- **`UnboundedDataSet[T]`** — streaming/unbounded, unbounded-state operations rejected at compile time
+
+This enables **static capability gating**: operations that require unbounded state are rejected at compile time when the target is unbounded, without requiring a separate streaming API.
+
+## When to use which type
+
+### `DataFrame[T]` — materialized/eager
+
+Use `DataFrame[T]` when you have data in hand and want to inspect or manipulate it directly:
+
+```incan
+from pub::inql import DataFrame
+from models import Order
+
+def inspect_orders(orders: DataFrame[Order]) -> None:
+    # Work with materialized data
+    pass
+```
+
+`DataFrame[T]` is always bounded — it's the product of collecting or executing a `LazyFrame`.
+
+### `LazyFrame[T]` — deferred plan
+
+Use `LazyFrame[T]` when you want to compose operations before execution:
+
+```incan
+from pub::inql import LazyFrame
+from models import Order
+
+def high_value_orders(orders: LazyFrame[Order]) -> LazyFrame[Order]:
+    # Intended when query syntax is available: return orders.filter(.amount > 100)
+    return orders
+```
+
+### `DataStream[T]` — streaming
+
+Use `DataStream[T]` for streaming/unbounded data:
+
+```incan
+from pub::inql import DataStream
+from models import Event
+
+def important_events(events: DataStream[Event]) -> DataStream[Event]:
+    # Intended when query syntax is available: return events.filter(.severity == "critical")
+    return events
+```
+
+`DataStream[T]` shares the same operation API as batch carriers, but signals that its source is unbounded. Static streaming constraints are specified in RFC 001 and enforced as the compiler gains analysis for `UnboundedDataSet[T]`.
+
+## Type signatures
+
+The trait hierarchy gives you three trait bounds to choose from:
+
+```incan
+from pub::inql import DataSet, BoundedDataSet, UnboundedDataSet
+from models import Order, Event
+
+# Accepts any carrier — generic utilities
+def row_count[T](data: DataSet[T]) -> int:
+    ...
+
+# Batch only — Parquet writers, batch sinks
+def write_parquet(data: BoundedDataSet[Order]) -> None:
+    ...
+
+# Streaming only — Kafka sinks, event processors
+def write_to_kafka(events: UnboundedDataSet[Event]) -> None:
+    ...
+```
+
+And three concrete types for when you need a specific carrier:
+
+```incan
+from pub::inql import DataFrame, LazyFrame, DataStream
+from models import Order, Event
+
+# Materialized data in hand
+def inspect(data: DataFrame[Order]) -> None:
+    ...
+
+def build_pipeline(orders: LazyFrame[Order]) -> LazyFrame[Order]:
+    ...
+
+def process_stream(events: DataStream[Event]) -> DataStream[Event]:
+    ...
+```
+
+## Aggregate helpers
+
+`.agg(...)` uses **imported** symbols from `pub::inql.functions` (for example `total`, `count_rows`).
+
+<!-- FIXME: remove this note once Incan RFC 045 is implemented -->
+> Note: ambient `sum` / `count` builtins are not available yet because those names clash with Incan/stdlib in ordinary expression positions; they will become possible once Incan's RFC 045 is implemented.
+
+## What's next?
+
+- **Execution context**: How `DataSet` operations actually run (RFC 004)
+- **Query DSL**: `query {}` blocks that produce plans (RFC 003)
+- **Substrait**: Portable logical plans (RFC 002)
diff --git a/docs/language/reference/dataset_types.md b/docs/language/reference/dataset_types.md
@@ -0,0 +1,94 @@
+# Dataset types (Reference)
+
+This page documents the InQL dataset type hierarchy: the traits and concrete types that carry schema-parameterized tabular data through relational pipelines.
+
+## Type hierarchy
+
+```text
+DataSet[T]                       (root trait — any tabular data)
+├── BoundedDataSet[T]            (trait — finite extent)
+│   ├── DataFrame[T]             (concrete — materialized/eager)
+│   └── LazyFrame[T]             (concrete — deferred plan, bounded source)
+└── UnboundedDataSet[T]          (trait — streaming/unbounded)
+    └── DataStream[T]            (concrete — streaming)
+```
+
+### `DataSet[T]`
+
+Root trait for any schema-parameterized tabular data whose row shape is an Incan `model` `T`.
+
+All relational operations are defined on `DataSet[T]`. The compiler applies the **most restrictive** constraint set when the concrete kind is unknown at a call site (because the argument might be unbounded).
+
+### `BoundedDataSet[T]`
+
+Extends `DataSet[T]` — data with a finite, known extent. All relational operations are allowed.
+
+### `UnboundedDataSet[T]`
+
+Extends `DataSet[T]` — data from a streaming or unbounded source. Operations requiring unbounded state **must** be rejected at compile time.
+
+### `DataFrame[T]`
+
+Implements `BoundedDataSet[T]`. Materialized/eager result; always bounded. Conceptually the product of collecting or executing a `LazyFrame`.
+
+### `LazyFrame[T]`
+
+Implements `BoundedDataSet[T]`. Holds a logical plan (or equivalent) until an explicit execute, collect, or write boundary. Always bounded.
+
+### `DataStream[T]`
+
+Implements `UnboundedDataSet[T]`. Shares the `DataSet[T]` operation API but signals that its source is unbounded. The compiler applies static streaming constraints.
+
+## Operation API
+
+The following instance methods are defined on `DataSet[T]`:
+
+| Method     | Signature                                       | Description                                                                                                                             |
+| ---------- | ----------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- |
+| `filter`   | `def filter(self, predicate: bool) -> Self`     | Restrict rows by a boolean relational expression                                                                                        |
+| `join`     | `def join(self, other: Self, on: bool) -> Self` | Combine with another relation on a join condition (`other: Self` at the trait level; see RFC 001 **Shipped trait signatures (`Self`)**) |
+| `select`   | `def select(self) -> Self`                      | Project columns and expressions; logical output schema is tracked when lowering/typing (RFC 003)                                        |
+| `group_by` | `def group_by(self) -> Self`                    | Define grouping keys for aggregation                                                                                                    |
+| `agg`      | `def agg(self) -> Self`                         | Apply aggregate functions over groups; use imported helpers from `pub::inql.functions` (e.g. `total`, `count_rows`)                     |
+| `order_by` | `def order_by(self) -> Self`                    | Define sort keys and directions                                                                                                         |
+| `limit`    | `def limit(self, n: int) -> Self`               | Cap the number of rows (after sort when both apply)                                                                                     |
+| `explode`  | `def explode(self) -> Self`                     | Expand a nested list column into rows                                                                                                   |
+
+## Static capability gating
+
+| Trait bound in signature | Allowed operations                                     | Constraint level                         |
+| ------------------------ | ------------------------------------------------------ | ---------------------------------------- |
+| `DataSet[T]`             | Intersection of bounded + unbounded capabilities       | Most restrictive (concrete kind unknown) |
+| `BoundedDataSet[T]`      | All relational operations                              | Unrestricted                             |
+| `UnboundedDataSet[T]`    | Relational operations minus unbounded-state operations | Streaming constraints enforced           |
+
+## Usage
+
+```incan
+from pub::inql import DataSet, BoundedDataSet, UnboundedDataSet, DataFrame, LazyFrame, DataStream
+from models import Order, Event
+
+# Accept any carrier — generic utilities
+def row_count[T](data: DataSet[T]) -> int:
+    ...
+
+# Batch only — Parquet writers, batch sinks
+def write_parquet(data: BoundedDataSet[Order]) -> None:
+    ...
+
+# Streaming only — Kafka sinks, event processors
+def write_to_kafka(events: UnboundedDataSet[Event]) -> None:
+    ...
+
+# Materialized data in hand
+def inspect(data: DataFrame[Order]) -> None:
+    ...
+
+# Deferred plan — compose before execution
+def build_pipeline(orders: LazyFrame[Order]) -> LazyFrame[Order]:
+    ...
+
+# Streaming specifically
+def process_stream(events: DataStream[Event]) -> DataStream[Event]:
+    ...
+```