From d8ec06d8e81c818748e760b27835ed8355a22687 Mon Sep 17 00:00:00 2001 From: Taylor Date: Fri, 19 Jun 2026 11:30:26 -0400 Subject: [PATCH 1/3] feat: deliver Phase 1 IR core compiler artifacts Add the ferro-schema-ir crate, deterministic Python->SchemaIR compilation, and model-set fingerprint/snapshot gates so Phase 1 exit criteria are enforceable in CI while preserving current user-visible behavior. Co-authored-by: Cursor --- Cargo.toml | 4 + crates/ferro-schema-ir/Cargo.toml | 8 + crates/ferro-schema-ir/src/lib.rs | 186 +++++++++++ docs/plans/2026-06-19-001-ir-first-roadmap.md | 23 +- docs/plans/ir-first-migration-guide.md | 8 +- src/ferro/ir/__init__.py | 11 + src/ferro/ir/compiler.py | 309 ++++++++++++++++++ src/ferro/metaclass.py | 3 + src/ferro/relations/__init__.py | 2 + src/ferro/state.py | 9 + .../schema_phase1_fixture_models_v1.json | 215 ++++++++++++ tests/test_ir_vectors_contract.py | 59 ++++ 12 files changed, 830 insertions(+), 7 deletions(-) create mode 100644 crates/ferro-schema-ir/Cargo.toml create mode 100644 crates/ferro-schema-ir/src/lib.rs create mode 100644 src/ferro/ir/__init__.py create mode 100644 src/ferro/ir/compiler.py create mode 100644 tests/fixtures/ir_vectors/schema_phase1_fixture_models_v1.json diff --git a/Cargo.toml b/Cargo.toml index 1769640..fb6af0e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,6 @@ +[workspace] +members = [".", "crates/ferro-schema-ir"] + [package] name = "ferro" version = "0.11.0" @@ -31,3 +34,4 @@ sea-query = { version = "0.32", features = ["with-uuid"] } tokio = { version = "1.49", features = ["full"] } pyo3-async-runtimes = { version = "0.27", features = ["tokio-runtime"] } uuid = { version = "1.11", features = ["v4"] } +ferro-schema-ir = { path = "crates/ferro-schema-ir" } diff --git a/crates/ferro-schema-ir/Cargo.toml b/crates/ferro-schema-ir/Cargo.toml new file mode 100644 index 0000000..1432833 --- /dev/null +++ b/crates/ferro-schema-ir/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "ferro-schema-ir" +version = "0.1.0" +edition = "2024" + +[dependencies] +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" diff --git a/crates/ferro-schema-ir/src/lib.rs b/crates/ferro-schema-ir/src/lib.rs new file mode 100644 index 0000000..b0c13a6 --- /dev/null +++ b/crates/ferro-schema-ir/src/lib.rs @@ -0,0 +1,186 @@ +use serde::{Deserialize, Serialize}; +use serde_json::Value; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct IrEnvelope { + pub ir_kind: String, + pub ir_version: u32, + pub payload: T, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SchemaIrPayload { + pub dialect_agnostic: bool, + pub models: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SchemaModel { + pub model_name: String, + pub table_name: String, + pub columns: Vec, + pub foreign_keys: Vec, + pub indexes: Vec, + pub uniques: Vec, + pub checks: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SchemaColumn { + pub name: String, + pub logical_type: String, + pub db_type: String, + pub nullable: bool, + pub primary_key: bool, + pub autoincrement: bool, + pub unique: bool, + pub index: bool, + pub default: Option, + pub format: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SchemaForeignKey { + pub column: String, + pub to_table: String, + pub to_column: String, + pub on_delete: Option, + pub name: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SchemaIndex { + pub name: String, + pub columns: Vec, + pub unique: bool, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SchemaUnique { + pub name: String, + pub columns: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SchemaCheck { + pub name: String, + pub expression: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct QueryIrPayload { + pub model_name: String, + #[serde(rename = "where")] + pub where_clause: Vec, + pub order_by: Vec, + pub limit: Option, + pub offset: Option, + pub m2m: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct QueryOrderBy { + pub column: String, + pub direction: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(tag = "node_kind")] +pub enum QueryNode { + #[serde(rename = "leaf")] + Leaf { + operator: String, + column: String, + value: QueryValue, + }, + #[serde(rename = "compound")] + Compound { + operator: String, + left: Box, + right: Box, + }, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct QueryValue { + pub kind: String, + pub value: Value, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct CodecIrPayload { + pub bind_rules: Vec, + pub fetch_rules: Vec, + pub hydration_abi: HydrationAbi, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct CodecBindRule { + pub logical_type: String, + pub db_type: String, + pub non_null_wire_kind: String, + pub null_wire_kind: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct CodecFetchRule { + pub db_type: String, + pub wire_kind: String, + pub python_kind: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct HydrationAbi { + pub constructor_mode: String, + pub required_slots: Vec, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn schema_fixture_roundtrip() { + let fixture = include_str!("../../../tests/fixtures/ir_vectors/schema_invoice_baseline_v1.json"); + let parsed: serde_json::Value = + serde_json::from_str(fixture).expect("schema fixture must parse"); + let ir = parsed + .get("ir") + .cloned() + .expect("fixture must contain ir envelope"); + let envelope: IrEnvelope = + serde_json::from_value(ir.clone()).expect("schema IR must deserialize"); + let encoded = serde_json::to_value(&envelope).expect("schema IR must serialize"); + assert_eq!(encoded, ir, "schema round-trip must not drift"); + } + + #[test] + fn query_fixture_roundtrip() { + let fixture = include_str!("../../../tests/fixtures/ir_vectors/query_user_compound_v1.json"); + let parsed: serde_json::Value = + serde_json::from_str(fixture).expect("query fixture must parse"); + let ir = parsed + .get("ir") + .cloned() + .expect("fixture must contain ir envelope"); + let envelope: IrEnvelope = + serde_json::from_value(ir.clone()).expect("query IR must deserialize"); + let encoded = serde_json::to_value(&envelope).expect("query IR must serialize"); + assert_eq!(encoded, ir, "query round-trip must not drift"); + } + + #[test] + fn codec_fixture_roundtrip() { + let fixture = include_str!("../../../tests/fixtures/ir_vectors/codec_registry_core_v1.json"); + let parsed: serde_json::Value = + serde_json::from_str(fixture).expect("codec fixture must parse"); + let ir = parsed + .get("ir") + .cloned() + .expect("fixture must contain ir envelope"); + let envelope: IrEnvelope = + serde_json::from_value(ir.clone()).expect("codec IR must deserialize"); + let encoded = serde_json::to_value(&envelope).expect("codec IR must serialize"); + assert_eq!(encoded, ir, "codec round-trip must not drift"); + } +} diff --git a/docs/plans/2026-06-19-001-ir-first-roadmap.md b/docs/plans/2026-06-19-001-ir-first-roadmap.md index 0d2db3a..3b31496 100644 --- a/docs/plans/2026-06-19-001-ir-first-roadmap.md +++ b/docs/plans/2026-06-19-001-ir-first-roadmap.md @@ -186,7 +186,7 @@ Issue references: ### Phase 1 - Build IR core and compiler -Status: `Not started` +Status: `In progress` Issue references: @@ -197,13 +197,23 @@ Issue references: - Introduce a Rust-owned IR crate and compile Python model metadata into deterministic IR artifacts. **Deliverables** -- [ ] `ferro-schema-ir` crate added with versioned serde types. -- [ ] Python -> SchemaIR compiler path added. -- [ ] IR hashing/fingerprinting persisted for model sets. +- [x] `ferro-schema-ir` crate added with versioned serde types. +- [x] Python -> SchemaIR compiler path added. +- [x] IR hashing/fingerprinting persisted for model sets. **Exit gate** -- [ ] Existing representative models compile to stable IR snapshots in CI. -- [ ] No user-visible behavior changes yet. +- [x] Existing representative models compile to stable IR snapshots in CI. +- [x] No user-visible behavior changes yet. + +**Evidence (working branch; pending merge to `feat/ir-first`)** +- IR crate: `crates/ferro-schema-ir/` (versioned serde types + RFC vector round-trip tests) +- Compiler + persistence: `src/ferro/ir/compiler.py`, `src/ferro/ir/__init__.py`, `src/ferro/metaclass.py`, `src/ferro/relations/__init__.py`, `src/ferro/state.py` +- Stable representative snapshot fixture: `tests/fixtures/ir_vectors/schema_phase1_fixture_models_v1.json` +- CI gate extension: `tests/test_ir_vectors_contract.py` (snapshot-compare + determinism tests) +- Verification commands: + - `cargo test -p ferro-schema-ir` + - `uv run pytest tests/test_ir_vectors_contract.py -q` + - `uv run pytest tests/test_cross_emitter_parity.py -q` --- @@ -473,6 +483,7 @@ Append updates as concise entries. - `2026-06-19` - Roadmap initialized. - `2026-06-19` - Branching policy set: phase work branches from `feat/ir-first` and merges back into `feat/ir-first` until final promotion to `main`. - `2026-06-19` - Phase 0 completed and merged via [#75](https://github.com/syn54x/ferro-orm/pull/75). +- `2026-06-19` - Phase 1 implementation landed on working branch: added `ferro-schema-ir`, Python->SchemaIR compiler, model-set fingerprinting, and stable representative snapshot checks. ## Immediate next actions diff --git a/docs/plans/ir-first-migration-guide.md b/docs/plans/ir-first-migration-guide.md index 0c07e49..17da8c3 100644 --- a/docs/plans/ir-first-migration-guide.md +++ b/docs/plans/ir-first-migration-guide.md @@ -33,7 +33,13 @@ No user-facing runtime behavior changes expected. ### Phase 1 -_TBD_ +No user-facing runtime behavior changes expected. + +| Issue | Change | Impact | User action | Notes | +| --- | --- | --- | --- | --- | +| [#77](https://github.com/syn54x/ferro-orm/issues/77) | Add `ferro-schema-ir` crate with versioned serde IR contracts | none | none | Internal contract crate only; artifacts: `crates/ferro-schema-ir/`, RFC vector round-trip tests | +| [#78](https://github.com/syn54x/ferro-orm/issues/78) | Add deterministic Python -> SchemaIR compiler path | none | none | Internal compiler path only; artifacts: `src/ferro/ir/compiler.py`, model registration + relationship-resolution hooks | +| [#79](https://github.com/syn54x/ferro-orm/issues/79) | Persist model-set fingerprints and stable representative snapshots | none | none | Infra/test only; artifacts: `tests/fixtures/ir_vectors/schema_phase1_fixture_models_v1.json`, `tests/test_ir_vectors_contract.py` | ### Phase 2 diff --git a/src/ferro/ir/__init__.py b/src/ferro/ir/__init__.py new file mode 100644 index 0000000..6df04be --- /dev/null +++ b/src/ferro/ir/__init__.py @@ -0,0 +1,11 @@ +from .compiler import ( + compile_model_schema_ir, + compile_registry_schema_ir, + schema_ir_fingerprint, +) + +__all__ = [ + "compile_model_schema_ir", + "compile_registry_schema_ir", + "schema_ir_fingerprint", +] diff --git a/src/ferro/ir/compiler.py b/src/ferro/ir/compiler.py new file mode 100644 index 0000000..5c57563 --- /dev/null +++ b/src/ferro/ir/compiler.py @@ -0,0 +1,309 @@ +from __future__ import annotations + +import hashlib +import json +from typing import Any + +from ..schema_metadata import build_model_schema +from ..state import ( + _MODEL_REGISTRY_PY, + _SCHEMA_IR_BY_MODEL, + _SCHEMA_IR_FINGERPRINT_BY_MODEL, + _SCHEMA_IR_MODELSET, + _SCHEMA_IR_MODELSET_FINGERPRINT, +) + +_IR_VERSION = 1 + + +def _canonical_json(value: dict[str, Any]) -> str: + return json.dumps( + value, + sort_keys=True, + separators=(",", ":"), + ensure_ascii=True, + ) + + +def _fingerprint(value: dict[str, Any]) -> str: + return hashlib.sha256(_canonical_json(value).encode("utf-8")).hexdigest() + + +def _resolve_ref(schema: dict[str, Any], col_info: dict[str, Any]) -> dict[str, Any]: + ref_path = col_info.get("$ref") + if not isinstance(ref_path, str): + return col_info + if not ref_path.startswith("#/$defs/"): + return col_info + def_name = ref_path.split("/")[-1] + resolved = schema.get("$defs", {}).get(def_name) + if not isinstance(resolved, dict): + return col_info + return { + **resolved, + **{k: v for k, v in col_info.items() if k != "$ref"}, + } + + +def _logical_type(col_info: dict[str, Any]) -> str: + field_type, field_format = _effective_type_and_format(col_info) + if field_type == "integer": + return "integer" + if field_type == "number": + return "decimal" if field_format == "decimal" else "number" + if field_type == "boolean": + return "boolean" + if field_type == "string": + if field_format == "date-time": + return "datetime" + if field_format == "date": + return "date" + if field_format == "time": + return "time" + if field_format == "uuid": + return "uuid" + return "string" + return "unknown" + + +def _default_db_type(col_info: dict[str, Any]) -> str: + field_type, field_format = _effective_type_and_format(col_info) + if field_type == "integer": + return "bigint" + if field_type == "number": + return "text" + if field_type == "string": + if field_format == "date-time": + return "timestamptz" + if field_format == "date": + return "date" + if field_format == "time": + return "time" + if field_format == "uuid": + return "uuid" + return "text" + return "text" + + +def _effective_type_and_format(col_info: dict[str, Any]) -> tuple[Any, Any]: + field_type = col_info.get("type") + field_format = col_info.get("format") + if field_type is not None: + return field_type, field_format + any_of = col_info.get("anyOf") + if isinstance(any_of, list): + for candidate in any_of: + if not isinstance(candidate, dict): + continue + candidate_type = candidate.get("type") + if candidate_type is None or candidate_type == "null": + continue + return candidate_type, candidate.get("format") + return field_type, field_format + + +def _is_nullable(col_name: str, col_info: dict[str, Any], required_fields: set[str]) -> bool: + nullable_hint = col_info.get("ferro_nullable") + if isinstance(nullable_hint, bool): + return nullable_hint + return col_name not in required_fields + + +def _column_ir( + col_name: str, col_info: dict[str, Any], required_fields: set[str] +) -> dict[str, Any]: + return { + "name": col_name, + "logical_type": _logical_type(col_info), + "db_type": col_info.get("db_type") or _default_db_type(col_info), + "nullable": _is_nullable(col_name, col_info, required_fields), + "primary_key": bool(col_info.get("primary_key", False)), + "autoincrement": bool(col_info.get("autoincrement", False)), + "unique": bool(col_info.get("unique", False)), + "index": bool(col_info.get("index", False)), + "default": col_info.get("default"), + "format": col_info.get("format"), + } + + +def _fk_name(table_name: str, col_name: str, to_table: str) -> str: + return f"fk_{table_name}_{col_name}_{to_table}" + + +def _single_index_name(table_name: str, col_name: str) -> str: + return f"idx_{table_name}_{col_name}" + + +def _single_unique_name(table_name: str, col_name: str) -> str: + return f"uq_{table_name}_{col_name}" + + +def _composite_index_name(table_name: str, columns: list[str]) -> str: + return f"idx_{table_name}_{'_'.join(columns)}" + + +def _composite_unique_name(table_name: str, columns: list[str]) -> str: + return f"uq_{table_name}_{'_'.join(columns)}" + + +def _checks_from_columns(table_name: str, columns: list[dict[str, Any]]) -> list[dict[str, Any]]: + checks: list[dict[str, Any]] = [] + for col in columns: + if col.get("db_check") is not True: + continue + col_name = col.get("name") + if not isinstance(col_name, str) or not col_name: + continue + checks.append( + { + "name": f"ck_{table_name}_{col_name}", + "expression": f"{col_name} IS NOT NULL", + } + ) + return checks + + +def compile_schema_ir_payload(model_name: str, schema: dict[str, Any]) -> dict[str, Any]: + table_name = model_name.lower() + properties = schema.get("properties", {}) + if not isinstance(properties, dict): + properties = {} + required_fields = schema.get("required", []) + required = set(required_fields) if isinstance(required_fields, list) else set() + + ordered_props = sorted(properties.items(), key=lambda item: item[0]) + resolved_columns: list[dict[str, Any]] = [] + for col_name, col_info in ordered_props: + if not isinstance(col_info, dict): + continue + resolved = _resolve_ref(schema, col_info) + resolved_with_name = {"name": col_name, **resolved} + resolved_columns.append(resolved_with_name) + + columns = [ + _column_ir(col["name"], col, required) + for col in resolved_columns + if isinstance(col.get("name"), str) + ] + + foreign_keys: list[dict[str, Any]] = [] + indexes: list[dict[str, Any]] = [] + uniques: list[dict[str, Any]] = [] + + for col in resolved_columns: + col_name = col.get("name") + if not isinstance(col_name, str) or not col_name: + continue + fk = col.get("foreign_key") + if isinstance(fk, dict): + to_table = fk.get("to_table") + if isinstance(to_table, str) and to_table: + foreign_keys.append( + { + "column": col_name, + "to_table": to_table, + "to_column": "id", + "on_delete": fk.get("on_delete"), + "name": _fk_name(table_name, col_name, to_table), + } + ) + if bool(col.get("index", False)): + indexes.append( + { + "name": _single_index_name(table_name, col_name), + "columns": [col_name], + "unique": False, + } + ) + if bool(col.get("unique", False)): + uniques.append( + { + "name": _single_unique_name(table_name, col_name), + "columns": [col_name], + } + ) + + for composite in schema.get("ferro_composite_indexes") or []: + if not isinstance(composite, list) or not composite: + continue + cols = [c for c in composite if isinstance(c, str) and c] + if len(cols) != len(composite): + continue + indexes.append( + { + "name": _composite_index_name(table_name, cols), + "columns": cols, + "unique": False, + } + ) + + for composite in schema.get("ferro_composite_uniques") or []: + if not isinstance(composite, list) or not composite: + continue + cols = [c for c in composite if isinstance(c, str) and c] + if len(cols) != len(composite): + continue + uniques.append({"name": _composite_unique_name(table_name, cols), "columns": cols}) + + model_payload = { + "model_name": model_name, + "table_name": table_name, + "columns": columns, + "foreign_keys": sorted( + foreign_keys, + key=lambda item: (item["column"], item["to_table"], item["to_column"]), + ), + "indexes": sorted(indexes, key=lambda item: item["name"]), + "uniques": sorted(uniques, key=lambda item: item["name"]), + "checks": sorted( + _checks_from_columns(table_name, resolved_columns), + key=lambda item: item["name"], + ), + } + return {"dialect_agnostic": True, "models": [model_payload]} + + +def wrap_schema_ir(payload: dict[str, Any]) -> dict[str, Any]: + return { + "ir_kind": "schema", + "ir_version": _IR_VERSION, + "payload": payload, + } + + +def compile_model_schema_ir(model_name: str, model_cls: type[Any]) -> dict[str, Any]: + schema = build_model_schema(model_cls) + payload = compile_schema_ir_payload(model_name, schema) + envelope = wrap_schema_ir(payload) + _SCHEMA_IR_BY_MODEL[model_name] = envelope + _SCHEMA_IR_FINGERPRINT_BY_MODEL[model_name] = _fingerprint(envelope) + return envelope + + +def compile_registry_schema_ir() -> dict[str, Any]: + models: list[dict[str, Any]] = [] + for model_name, model_cls in sorted(_MODEL_REGISTRY_PY.items(), key=lambda item: item[0]): + if model_name == "Model": + continue + model_envelope = compile_model_schema_ir(model_name, model_cls) + model_payload = model_envelope["payload"]["models"][0] + models.append(model_payload) + + envelope = { + "ir_kind": "schema", + "ir_version": _IR_VERSION, + "payload": { + "dialect_agnostic": True, + "models": models, + }, + } + + global _SCHEMA_IR_MODELSET, _SCHEMA_IR_MODELSET_FINGERPRINT + _SCHEMA_IR_MODELSET = envelope + _SCHEMA_IR_MODELSET_FINGERPRINT = _fingerprint(envelope) + return envelope + + +def schema_ir_fingerprint(ir_envelope: dict[str, Any]) -> str: + return _fingerprint(ir_envelope) + diff --git a/src/ferro/metaclass.py b/src/ferro/metaclass.py index 16c8ca6..0dfd0ec 100644 --- a/src/ferro/metaclass.py +++ b/src/ferro/metaclass.py @@ -25,6 +25,7 @@ from ._shadow_fk_types import shadow_annotation_for_foreign_key from .base import FerroField, ForeignKey, ManyToManyRelation from .fields import FERRO_FIELD_EXTRA_KEY +from .ir import compile_model_schema_ir, compile_registry_schema_ir from .query import FieldProxy, Relation from .relations.descriptors import ForwardDescriptor from .schema_metadata import _enum_subclass_from_annotation, build_model_schema @@ -535,5 +536,7 @@ def _generate_and_register_schema( if schema: setattr(cls, "__ferro_schema__", schema) register_model_schema(name, json.dumps(schema)) + compile_model_schema_ir(name, cls) + compile_registry_schema_ir() except Exception as e: raise RuntimeError(f"Ferro failed to register model '{name}': {e}") diff --git a/src/ferro/relations/__init__.py b/src/ferro/relations/__init__.py index 670daec..ce6f8f3 100644 --- a/src/ferro/relations/__init__.py +++ b/src/ferro/relations/__init__.py @@ -8,6 +8,7 @@ schema_fragment_for_pk, ) from ..base import ForeignKey, ManyToManyRelation +from ..ir import compile_registry_schema_ir from ..schema_metadata import build_model_schema from ..state import ( # noqa: F401 _JOIN_TABLE_REGISTRY, @@ -148,4 +149,5 @@ def resolve_relationships(): except Exception: pass + compile_registry_schema_ir() _PENDING_RELATIONS.clear() diff --git a/src/ferro/state.py b/src/ferro/state.py index 850e3a2..d124777 100644 --- a/src/ferro/state.py +++ b/src/ferro/state.py @@ -1,4 +1,5 @@ from contextvars import ContextVar +from typing import Any # Context variable to store the active transaction ID for the current task _CURRENT_TRANSACTION: ContextVar[str | None] = ContextVar( @@ -17,3 +18,11 @@ # Global registry for automatically generated join tables _JOIN_TABLE_REGISTRY = {} + +# Latest compiled SchemaIR model-set artifact and fingerprint. +_SCHEMA_IR_MODELSET: dict[str, Any] | None = None +_SCHEMA_IR_MODELSET_FINGERPRINT: str | None = None + +# Per-model compiled SchemaIR artifacts and fingerprints. +_SCHEMA_IR_BY_MODEL: dict[str, dict[str, Any]] = {} +_SCHEMA_IR_FINGERPRINT_BY_MODEL: dict[str, str] = {} diff --git a/tests/fixtures/ir_vectors/schema_phase1_fixture_models_v1.json b/tests/fixtures/ir_vectors/schema_phase1_fixture_models_v1.json new file mode 100644 index 0000000..5cc9ce6 --- /dev/null +++ b/tests/fixtures/ir_vectors/schema_phase1_fixture_models_v1.json @@ -0,0 +1,215 @@ +{ + "vector_name": "schema_phase1_fixture_models_v1", + "domain": "schema", + "expect_valid": true, + "ir": { + "ir_kind": "schema", + "ir_version": 1, + "payload": { + "dialect_agnostic": true, + "models": [ + { + "model_name": "Member", + "table_name": "member", + "columns": [ + { + "name": "email", + "logical_type": "string", + "db_type": "text", + "nullable": false, + "primary_key": false, + "autoincrement": false, + "unique": true, + "index": false, + "default": null, + "format": null + }, + { + "name": "id", + "logical_type": "integer", + "db_type": "bigint", + "nullable": true, + "primary_key": true, + "autoincrement": true, + "unique": false, + "index": false, + "default": null, + "format": null + }, + { + "name": "org_id", + "logical_type": "integer", + "db_type": "bigint", + "nullable": false, + "primary_key": false, + "autoincrement": false, + "unique": false, + "index": true, + "default": null, + "format": null + } + ], + "foreign_keys": [ + { + "column": "org_id", + "to_table": "org", + "to_column": "id", + "on_delete": "CASCADE", + "name": "fk_member_org_id_org" + } + ], + "indexes": [ + { + "name": "idx_member_org_id", + "columns": [ + "org_id" + ], + "unique": false + } + ], + "uniques": [ + { + "name": "uq_member_email", + "columns": [ + "email" + ] + } + ], + "checks": [] + }, + { + "model_name": "Org", + "table_name": "org", + "columns": [ + { + "name": "id", + "logical_type": "integer", + "db_type": "bigint", + "nullable": true, + "primary_key": true, + "autoincrement": true, + "unique": false, + "index": false, + "default": null, + "format": null + }, + { + "name": "name", + "logical_type": "string", + "db_type": "text", + "nullable": false, + "primary_key": false, + "autoincrement": false, + "unique": false, + "index": true, + "default": null, + "format": null + }, + { + "name": "slug", + "logical_type": "string", + "db_type": "text", + "nullable": false, + "primary_key": false, + "autoincrement": false, + "unique": true, + "index": false, + "default": null, + "format": null + } + ], + "foreign_keys": [], + "indexes": [ + { + "name": "idx_org_name", + "columns": [ + "name" + ], + "unique": false + } + ], + "uniques": [ + { + "name": "uq_org_slug", + "columns": [ + "slug" + ] + } + ], + "checks": [] + }, + { + "model_name": "Project", + "table_name": "project", + "columns": [ + { + "name": "id", + "logical_type": "integer", + "db_type": "bigint", + "nullable": true, + "primary_key": true, + "autoincrement": true, + "unique": false, + "index": false, + "default": null, + "format": null + }, + { + "name": "name", + "logical_type": "string", + "db_type": "text", + "nullable": false, + "primary_key": false, + "autoincrement": false, + "unique": false, + "index": false, + "default": null, + "format": null + }, + { + "name": "org_id", + "logical_type": "integer", + "db_type": "bigint", + "nullable": false, + "primary_key": false, + "autoincrement": false, + "unique": false, + "index": true, + "default": null, + "format": null + } + ], + "foreign_keys": [ + { + "column": "org_id", + "to_table": "org", + "to_column": "id", + "on_delete": "CASCADE", + "name": "fk_project_org_id_org" + } + ], + "indexes": [ + { + "name": "idx_project_org_id", + "columns": [ + "org_id" + ], + "unique": false + }, + { + "name": "idx_project_org_id_name", + "columns": [ + "org_id", + "name" + ], + "unique": false + } + ], + "uniques": [], + "checks": [] + } + ] + } + }, + "fingerprint": "e32e8b0dbb67269dc456904d344dca7cecd506aabe1837a544fc9fba5efc3d3d" +} diff --git a/tests/test_ir_vectors_contract.py b/tests/test_ir_vectors_contract.py index f406495..7444146 100644 --- a/tests/test_ir_vectors_contract.py +++ b/tests/test_ir_vectors_contract.py @@ -4,6 +4,10 @@ from pathlib import Path from typing import Any +import pytest + +from ferro import clear_registry, reset_engine + VECTORS_DIR = Path(__file__).parent / "fixtures" / "ir_vectors" SUPPORTED_DOMAINS = {"schema", "query", "codec"} @@ -158,3 +162,58 @@ def test_ir_vectors_match_phase0_contract_envelope() -> None: ) assert isinstance(ir["payload"], dict), f"{label}.ir.payload must be object" _validate_domain_payload(vector["domain"], ir["payload"], f"{label}.ir.payload") + + +@pytest.fixture() +def clean_model_registry() -> None: + from ferro.state import _JOIN_TABLE_REGISTRY, _MODEL_REGISTRY_PY, _PENDING_RELATIONS + + reset_engine() + clear_registry() + _MODEL_REGISTRY_PY.clear() + _PENDING_RELATIONS.clear() + _JOIN_TABLE_REGISTRY.clear() + yield + reset_engine() + clear_registry() + _MODEL_REGISTRY_PY.clear() + _PENDING_RELATIONS.clear() + _JOIN_TABLE_REGISTRY.clear() + + +def _load_vector(path: Path) -> dict[str, Any]: + return json.loads(path.read_text(encoding="utf-8")) + + +def test_phase1_schema_compiler_matches_snapshot(clean_model_registry: None) -> None: + from ferro.ir import compile_registry_schema_ir, schema_ir_fingerprint + from ferro.relations import resolve_relationships + + from tests.test_cross_emitter_parity import _build_fixture_models + + _build_fixture_models() + resolve_relationships() + + compiled = compile_registry_schema_ir() + snapshot = _load_vector(VECTORS_DIR / "schema_phase1_fixture_models_v1.json") + + assert compiled == snapshot["ir"] + assert schema_ir_fingerprint(compiled) == snapshot["fingerprint"] + + +def test_phase1_schema_compiler_is_deterministic(clean_model_registry: None) -> None: + from ferro.ir import compile_registry_schema_ir, schema_ir_fingerprint + from ferro.relations import resolve_relationships + + from tests.test_cross_emitter_parity import _build_fixture_models + + _build_fixture_models() + resolve_relationships() + + first = compile_registry_schema_ir() + first_fp = schema_ir_fingerprint(first) + second = compile_registry_schema_ir() + second_fp = schema_ir_fingerprint(second) + + assert first == second + assert first_fp == second_fp From a4142ec277a30f08c32da576e4ca0d31fff836d8 Mon Sep 17 00:00:00 2001 From: Taylor Date: Fri, 19 Jun 2026 11:48:16 -0400 Subject: [PATCH 2/3] chore: satisfy eof fixer for IR compiler Apply the pre-commit end-of-file normalization in src/ferro/ir/compiler.py so the lint-and-format CI job passes. Co-authored-by: Cursor --- src/ferro/ir/compiler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ferro/ir/compiler.py b/src/ferro/ir/compiler.py index 5c57563..bb18bde 100644 --- a/src/ferro/ir/compiler.py +++ b/src/ferro/ir/compiler.py @@ -306,4 +306,3 @@ def compile_registry_schema_ir() -> dict[str, Any]: def schema_ir_fingerprint(ir_envelope: dict[str, Any]) -> str: return _fingerprint(ir_envelope) - From 5df32b791aa25729502ffe46aec3c3e221d0e258 Mon Sep 17 00:00:00 2001 From: Taylor Date: Fri, 19 Jun 2026 11:51:35 -0400 Subject: [PATCH 3/3] docs: add SchemaIR compiler docstrings Document the new Phase 1 SchemaIR compiler module and APIs with concise docstrings describing contract, arguments, and return values. Co-authored-by: Cursor --- src/ferro/ir/__init__.py | 2 ++ src/ferro/ir/compiler.py | 46 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/src/ferro/ir/__init__.py b/src/ferro/ir/__init__.py index 6df04be..3206a66 100644 --- a/src/ferro/ir/__init__.py +++ b/src/ferro/ir/__init__.py @@ -1,3 +1,5 @@ +"""Public SchemaIR compilation API for the Python runtime.""" + from .compiler import ( compile_model_schema_ir, compile_registry_schema_ir, diff --git a/src/ferro/ir/compiler.py b/src/ferro/ir/compiler.py index bb18bde..984650e 100644 --- a/src/ferro/ir/compiler.py +++ b/src/ferro/ir/compiler.py @@ -1,3 +1,10 @@ +"""SchemaIR compilation and fingerprint helpers for Phase 1. + +This module compiles the canonical Ferro-enriched JSON schema metadata into +RFC-shaped SchemaIR envelopes and persists deterministic fingerprints for +individual models and full model sets. +""" + from __future__ import annotations import hashlib @@ -17,6 +24,7 @@ def _canonical_json(value: dict[str, Any]) -> str: + """Serialize an IR artifact with deterministic key ordering.""" return json.dumps( value, sort_keys=True, @@ -26,10 +34,12 @@ def _canonical_json(value: dict[str, Any]) -> str: def _fingerprint(value: dict[str, Any]) -> str: + """Return the canonical SHA-256 fingerprint for an IR artifact.""" return hashlib.sha256(_canonical_json(value).encode("utf-8")).hexdigest() def _resolve_ref(schema: dict[str, Any], col_info: dict[str, Any]) -> dict[str, Any]: + """Inline a local ``#/$defs/...`` reference into a property schema.""" ref_path = col_info.get("$ref") if not isinstance(ref_path, str): return col_info @@ -46,6 +56,7 @@ def _resolve_ref(schema: dict[str, Any], col_info: dict[str, Any]) -> dict[str, def _logical_type(col_info: dict[str, Any]) -> str: + """Map schema type metadata to SchemaIR ``logical_type``.""" field_type, field_format = _effective_type_and_format(col_info) if field_type == "integer": return "integer" @@ -67,6 +78,7 @@ def _logical_type(col_info: dict[str, Any]) -> str: def _default_db_type(col_info: dict[str, Any]) -> str: + """Map schema metadata to the default canonical Ferro ``db_type`` token.""" field_type, field_format = _effective_type_and_format(col_info) if field_type == "integer": return "bigint" @@ -86,6 +98,7 @@ def _default_db_type(col_info: dict[str, Any]) -> str: def _effective_type_and_format(col_info: dict[str, Any]) -> tuple[Any, Any]: + """Resolve concrete type/format from direct fields or ``anyOf`` unions.""" field_type = col_info.get("type") field_format = col_info.get("format") if field_type is not None: @@ -103,6 +116,7 @@ def _effective_type_and_format(col_info: dict[str, Any]) -> tuple[Any, Any]: def _is_nullable(col_name: str, col_info: dict[str, Any], required_fields: set[str]) -> bool: + """Determine nullability from explicit Ferro hint or required-field fallback.""" nullable_hint = col_info.get("ferro_nullable") if isinstance(nullable_hint, bool): return nullable_hint @@ -112,6 +126,7 @@ def _is_nullable(col_name: str, col_info: dict[str, Any], required_fields: set[s def _column_ir( col_name: str, col_info: dict[str, Any], required_fields: set[str] ) -> dict[str, Any]: + """Compile one schema property into a SchemaIR ``columns[]`` entry.""" return { "name": col_name, "logical_type": _logical_type(col_info), @@ -127,26 +142,32 @@ def _column_ir( def _fk_name(table_name: str, col_name: str, to_table: str) -> str: + """Build canonical foreign-key name for SchemaIR metadata.""" return f"fk_{table_name}_{col_name}_{to_table}" def _single_index_name(table_name: str, col_name: str) -> str: + """Build canonical single-column index name.""" return f"idx_{table_name}_{col_name}" def _single_unique_name(table_name: str, col_name: str) -> str: + """Build canonical single-column unique-constraint name.""" return f"uq_{table_name}_{col_name}" def _composite_index_name(table_name: str, columns: list[str]) -> str: + """Build canonical composite index name.""" return f"idx_{table_name}_{'_'.join(columns)}" def _composite_unique_name(table_name: str, columns: list[str]) -> str: + """Build canonical composite unique-constraint name.""" return f"uq_{table_name}_{'_'.join(columns)}" def _checks_from_columns(table_name: str, columns: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Compile per-column ``db_check`` markers into SchemaIR ``checks[]`` entries.""" checks: list[dict[str, Any]] = [] for col in columns: if col.get("db_check") is not True: @@ -164,6 +185,15 @@ def _checks_from_columns(table_name: str, columns: list[dict[str, Any]]) -> list def compile_schema_ir_payload(model_name: str, schema: dict[str, Any]) -> dict[str, Any]: + """Compile one model schema dict into a SchemaIR payload object. + + Args: + model_name: Registered model class name. + schema: Canonical Ferro-enriched model schema. + + Returns: + A SchemaIR payload object ready to be wrapped in an IR envelope. + """ table_name = model_name.lower() properties = schema.get("properties", {}) if not isinstance(properties, dict): @@ -264,6 +294,7 @@ def compile_schema_ir_payload(model_name: str, schema: dict[str, Any]) -> dict[s def wrap_schema_ir(payload: dict[str, Any]) -> dict[str, Any]: + """Wrap a SchemaIR payload with the standard IR envelope fields.""" return { "ir_kind": "schema", "ir_version": _IR_VERSION, @@ -272,6 +303,15 @@ def wrap_schema_ir(payload: dict[str, Any]) -> dict[str, Any]: def compile_model_schema_ir(model_name: str, model_cls: type[Any]) -> dict[str, Any]: + """Compile and persist a single model's SchemaIR envelope + fingerprint. + + Args: + model_name: Registry key / model class name. + model_cls: Python model class to compile. + + Returns: + The compiled SchemaIR envelope for ``model_cls``. + """ schema = build_model_schema(model_cls) payload = compile_schema_ir_payload(model_name, schema) envelope = wrap_schema_ir(payload) @@ -281,6 +321,11 @@ def compile_model_schema_ir(model_name: str, model_cls: type[Any]) -> dict[str, def compile_registry_schema_ir() -> dict[str, Any]: + """Compile and persist a deterministic SchemaIR envelope for all models. + + Returns: + The compiled model-set SchemaIR envelope, sorted by model name. + """ models: list[dict[str, Any]] = [] for model_name, model_cls in sorted(_MODEL_REGISTRY_PY.items(), key=lambda item: item[0]): if model_name == "Model": @@ -305,4 +350,5 @@ def compile_registry_schema_ir() -> dict[str, Any]: def schema_ir_fingerprint(ir_envelope: dict[str, Any]) -> str: + """Return a deterministic SHA-256 fingerprint for a SchemaIR envelope.""" return _fingerprint(ir_envelope)