From 489343e143a94dbcbe9d18f0292464cdbaa1bc5a Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Fri, 19 Jun 2026 19:47:05 +0800 Subject: [PATCH] indent: un-overload YAML semantics off generic flags/literals (#44) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A non-YAML indentation grammar inherited three YAML behaviors derived from flags/literals that mean something else, with no opt-out short of mis-declaring the grammar. Detach each onto its own explicit, mode-neutral IndentConfig field that defaults OFF; yaml.ts opts into each. (A) Flow `:` key/value separator carve-out was derived from the `string` flag (`stringTokenNames`), silently enlisting every string-region token. New `flowSeparatorAfterTokens: string[]` names the membership explicitly (carve-out OFF when empty); `string: true` keeps its region-scoping / auto-close-derivation jobs without dragging a token into separator emission. PR #41's wholesale `flowColonSeparator` boolean is removed — an empty list is the neutral-off it provided, without re-overloading. (B) Plain-scalar continuation folding was derived from `blockPattern`, giving YAML folding to any block-pattern token. New `foldTokens: string[]` names the fold participants explicitly (folding OFF when empty); the last-named token is the catch-all continuation type. A grammar can now carry a `blockPattern` token without inheriting the fold. (C) `keyValueSeparator` was honored by gen-tm but the lexer hardcoded `:` (and `-`/`?`) in its key-line sniffs, a latent parser/highlighter split. Route every lexer key-separator sniff through `indent.keyValueSeparator` (via a shared `keySepAt` helper) and every compact-indicator sniff through `compactIndicators`, so the lexer and gen-tm share one source of truth for the separator for any value. 
Deferred: (D) the §6.1 tab-in-indentation errors and the value/item-position classification (seq-item `-` vs explicit-key `?`) still hardcode a few YAML indicators; cleanly splitting them needs `startsBlockStructuralNode`'s property/flow/alias indicator set parameterized — a larger sub-task, noted in-code at each site. yaml.ts opts in field-by-field (flowSeparatorAfterTokens + foldTokens) and tokenizes byte-identically: `npm run gen` produces zero generated-file diff across yaml + ts/js/jsx/tsx/html. test/indent-extensions.ts gains toy non-YAML grammars proving each un-overload (a `string:true` token that keeps its `:name` after values; a `blockPattern` token that does not fold; a `keyValueSeparator:'='` grammar whose lexer treats `=` as structural). --- src/gen-lexer.ts | 104 ++++++++++++++++---------- src/types.ts | 37 +++++++-- test/indent-extensions.ts | 153 ++++++++++++++++++++++++++++++-------- yaml.ts | 9 +++ 4 files changed, 224 insertions(+), 79 deletions(-) diff --git a/src/gen-lexer.ts b/src/gen-lexer.ts index 9aac269..7fc06ea 100644 --- a/src/gen-lexer.ts +++ b/src/gen-lexer.ts @@ -256,25 +256,28 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { // exclusions, filtered ONCE here instead of re-tested per matcher per position. const scanMatchers = tokenMatchers.filter(tm => tm.name !== templateTokenName && !markupTokenNames.has(tm.name) && !indentTokenNames.has(tm.name)); - // String-literal token names (the `string`-flagged tokens — quoted scalars in YAML). Used by the - // flow mapping-separator guard below: a quoted scalar can never run past its closing quote, so a - // `:` immediately after one (inside flow) is ALWAYS the mapping `key: value` separator, never the - // start of a plain scalar — derived from the `string` flag, not a hardcoded token name. 
- const stringTokenNames = new Set(grammar.tokens.filter(t => t.string).map(t => t.name)); - // Plain-scalar token names: the tokens carrying a block-context pattern variant (`blockPattern`). - // In YAML these are exactly the UNQUOTED scalar family (plain / key / number / boolean-null) — the - // ones whose flow-vs-block forms differ because flow indicators are content in block. Used by the - // flow multi-line-plain FOLD post-pass: a plain scalar folded across a flow-internal newline arrives - // as ADJACENT plain tokens (a space-separated plain is already one token; only a NEWLINE splits it), - // which the post-pass re-merges. Derived from `blockPattern`, not a hardcoded token name. - const plainScalarTokenNames = new Set(grammar.tokens.filter(t => tokenBlockPatternSource(t)).map(t => t.name)); - // The generic (catch-all) plain-scalar token: the LAST-declared blockPattern token. Declaration - // order is specific-before-general (YAML: Key, Num, BoolNull, Plain — the typed/key shapes win - // earlier, so the broadest string-valued plain is necessarily last). Used as the type emitted for - // a folded plain-scalar CONTINUATION line — a more-indented line after a plain LEAF whose leading - // glyph (`-`/`&`/`!`/`[`/`?`/`*`) is plain CONTENT here, not structure (so it can't be lexed by - // the plain head pattern, which forbids those starts). Null when no blockPattern token exists. - const plainContinuationTokenName = [...grammar.tokens].reverse().find(t => tokenBlockPatternSource(t))?.name ?? null; + // Flow mapping-separator carve-out MEMBERSHIP (IndentConfig.flowSeparatorAfterTokens). A `:` glued + // (inside flow) right after one of these tokens is ALWAYS the mapping `key: value` separator, never + // the start of a `:`-led plain scalar — a quoted scalar / flow-close can never run past its closer. + // EXPLICIT list now (was derived from the `string` flag, which silently enlisted every string-region + // token); the carve-out is OFF when the list is absent. 
See the flow `:` guard below. + const flowSeparatorAfterTokens = new Set(indent?.flowSeparatorAfterTokens ?? []); + // Plain-scalar FOLD MEMBERSHIP (IndentConfig.foldTokens). The token TYPES that participate in YAML's + // plain-scalar continuation folding — in YAML the UNQUOTED scalar family (plain / key / number / + // boolean-null). EXPLICIT list now (was derived from `blockPattern`, which gave folding to ANY + // block-pattern token); folding is OFF when the list is absent. Used by: the block-context fold (a + // deeper line after a plain leaf), the flow illegal-head continuation, and the flow multi-line merge + // post-pass — a plain scalar folded across a flow-internal newline arrives as ADJACENT plain tokens + // (a NEWLINE splits it), which the post-pass re-merges. + const foldTokens = indent?.foldTokens ?? []; + const plainScalarTokenNames = new Set(foldTokens); + // The generic (catch-all) plain-scalar token: the LAST-named fold token. Declaration order is + // specific-before-general (YAML: Key, Num, BoolNull, Plain — the typed/key shapes win earlier, so + // the broadest string-valued plain is necessarily last). Used as the type emitted for a folded + // plain-scalar CONTINUATION line — a more-indented line after a plain LEAF whose leading glyph + // (`-`/`&`/`!`/`[`/`?`/`*`) is plain CONTENT here, not structure (so it can't be lexed by the plain + // head pattern, which forbids those starts). Null when no fold token is declared. + const plainContinuationTokenName = foldTokens.length ? foldTokens[foldTokens.length - 1] : null; // The generic plain token's FLOW pattern (its `pattern`, not the block variant) — used by the flow // illegal-head continuation fallback: a char that no token can START here (e.g. 
YAML's `%`/`@`/backtick, // illegal as a plain START) is, when it follows a plain scalar inside a flow collection, mid-scalar @@ -282,7 +285,8 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { // pattern at the next position), emit it as a plain-continuation token, and let the flow fold post-pass // merge it with the preceding scalar. Compiled once; null when no generic plain token exists. const plainFlowRe = (() => { - const t = [...grammar.tokens].reverse().find(t => tokenBlockPatternSource(t)); + if (!plainContinuationTokenName) return null; + const t = grammar.tokens.find(t => t.name === plainContinuationTokenName); return t ? new RegExp(`^(?:${tokenPatternSource(t)})`) : null; })(); // Does the line content starting at `start` carry a KEY SEPARATOR — an unquoted `:` followed by @@ -306,7 +310,22 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { const kBlockScalarTok = kOf(indent?.blockScalar?.token ?? null); const kRawBlockTok = kOf(indent?.rawBlock?.token ?? null); const kPlainCont = kOf(plainContinuationTokenName); - const tColon = puLitOf.get(':') ?? 0; + // The mapping KEY/VALUE separator (IndentConfig.keyValueSeparator, default `:`) — the ONE source of + // truth shared with gen-tm for every "is this a mapping-key line" sniff in the lexer. `kKvSep` is its + // punctuation-literal intern, for the flow-`:` carve-out push. + const keyValueSep = indent?.keyValueSeparator ?? ':'; + const kKvSep = puLitOf.get(keyValueSep) ?? 0; + // Is `src` at `i` a mapping KEY separator — the `keyValueSeparator` literal followed by whitespace / + // EOL / a flow indicator (`,`/`[`/`]`/`{`/`}`)? The single shared test behind every key-line sniff + // (`lineHasKeySeparator`, `startsBlockStructuralNode`) so they read the separator from ONE place + // (the config) rather than each hardcoding `:`. Returns the index PAST the separator on a hit (so + // the caller can resume), or -1 on no match. 
+ function keySepAt(src: string, i: number): number { + if (!src.startsWith(keyValueSep, i)) return -1; + const n = src[i + keyValueSep.length]; + return (n === undefined || n === ' ' || n === '\t' || n === '\n' || n === '\r' + || n === ',' || n === '[' || n === ']' || n === '{' || n === '}') ? i + keyValueSep.length : -1; + } function lineHasKeySeparator(src: string, start: number): boolean { for (let i = start; i < src.length; i++) { @@ -327,7 +346,7 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { if (src[i] !== "'") break; continue; } if ((ch === ' ' || ch === '\t') && src[i + 1] === '#') break; // trailing comment → any sep would be earlier - if (ch === ':') { const n = src[i + 1]; if (n === undefined || n === ' ' || n === '\t' || n === '\n' || n === '\r' || n === ',' || n === '[' || n === ']' || n === '{' || n === '}') return true; } + if (keySepAt(src, i) >= 0) return true; } return false; } @@ -392,18 +411,19 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { function startsBlockStructuralNode(src: string, start: number, allowProperty = true): boolean { const c0 = src[start]; if (c0 === '[' || c0 === '{' || c0 === '*') return false; // flow collection / alias → not indentation - if ((c0 === '-' || c0 === '?' 
|| c0 === ':') && sepAfter(src[start + 1])) return true; // indicator / empty key + if (c0 !== undefined && compactIndicatorSet.has(c0) && sepAfter(src[start + 1])) return true; // compact indicator (`-`/`?`) + if (src.startsWith(keyValueSep, start) && sepAfter(src[start + keyValueSep.length])) return true; // empty key (`:` then ws/EOL) if ((c0 === '&' || c0 === '!') && allowProperty) return true; // node property → establishes a node here if (c0 === '&' || c0 === '!') return false; // property after `:` → inline value, legal - // Scalar key sniff: scan the line for an unquoted `:` followed by ws/EOL/flow-indicator (a - // block key separator), skipping over "…"/'…' regions and stopping at a ` #` comment / EOL. + // Scalar key sniff: scan the line for an unquoted key separator followed by ws/EOL/flow-indicator, + // skipping over "…"/'…' regions and stopping at a ` #` comment / EOL. for (let i = start; i < src.length; i++) { const ch = src[i]; if (ch === '\n' || ch === '\r') break; if (ch === '"') { i++; while (i < src.length && src[i] !== '"' && src[i] !== '\n') { if (src[i] === '\\') i++; i++; } continue; } if (ch === "'") { i++; while (i < src.length && src[i] !== '\n') { if (src[i] === "'" && src[i + 1] !== "'") break; if (src[i] === "'") i++; i++; } continue; } if ((ch === ' ' || ch === '\t') && src[i + 1] === '#') break; // trailing comment → key sep would be earlier - if (ch === ':') { const n = src[i + 1]; if (n === undefined || n === ' ' || n === '\t' || n === '\n' || n === '\r' || n === ',' || n === '[' || n === ']' || n === '{' || n === '}') return true; } + if (keySepAt(src, i) >= 0) return true; } return false; } @@ -424,8 +444,8 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { } else break; } if (i >= src.length || src[i] === '\n' || src[i] === '\r') return false; // property alone on the line → no nest - if ((src[i] === '-' || src[i] === '?') && sepAfter(src[i + 1])) return true; // nested indicator - return 
startsBlockStructuralNode(src, i, false); // a mapping key (the `:`-sniff) + if (src[i] !== undefined && compactIndicatorSet.has(src[i]) && sepAfter(src[i + 1])) return true; // nested compact indicator + return startsBlockStructuralNode(src, i, false); // a mapping key (the key-separator sniff) } // Scan from inside a template span to its next boundary: an interpolation hole @@ -544,7 +564,11 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { // The §7.4 / multi-line-flow bookkeeping is indent-only (a newline grammar has no stack). if (flowDepth === 0 && indent) { const prevTok = tokens[tokens.length - 2]; // the token before this just-pushed open - flowValueIndent = (prevTok && prevTok.type === '' && (prevTok.text === ':' || prevTok.text === '-')) + // value/item position: a flow opened right after the key/value separator (map value) or a + // sequence-item indicator. The `-` here is the seq-item lead specifically (NOT every + // compactIndicator — `?` is an explicit KEY, not a value position); classifying it from + // config is the (D) indicator-role split, deferred — see issue #44. + flowValueIndent = (prevTok && prevTok.type === '' && (prevTok.text === keyValueSep || prevTok.text === '-')) ? indentStack[indentStack.length - 1] : -1; flowSawNewline = false; // start tracking whether this flow spans >1 line } @@ -776,10 +800,10 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { // while `-`/`?` include them (`-\t&a x` IS an error). Block context only (flowDepth===0). if (indent && flowDepth === 0) { // §6.1 tab-after-indicator error is YAML-specific const prev = tokens[tokens.length - 1]; - const isIndicator = prev && prev.type === '' && (prev.text === '-' || prev.text === '?' 
|| prev.text === ':'); + const isIndicator = prev && prev.type === '' && (compactIndicatorSet.has(prev.text) || prev.text === keyValueSep); if (isIndicator) { let q = pos; while (q < source.length && (source[q] === ' ' || source[q] === '\t')) q++; - if (source.slice(pos, q).includes('\t') && startsBlockStructuralNode(source, q, prev!.text !== ':')) { + if (source.slice(pos, q).includes('\t') && startsBlockStructuralNode(source, q, prev!.text !== keyValueSep)) { throw new Error(`Tab character used in indentation at offset ${pos}`); } } @@ -1046,14 +1070,16 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { // the separator — emit it as the `:` punctuation literal here. Gated on flow (block-context `:` // separators are handled by the KEY-position lookaheads). yaml-test-suite 5MUD / 5T43 / 9MMW // / C2DT / K3WX (quoted key) and the flow-collection-key cohort. - // flowColonSeparator: false disables the YAML `"key":value` / `}: value` flow - // separator carve-out, for indentation grammars with `:name`-shaped tokens that - // may legally follow a quoted value or a flow-close delimiter. - if (indent && indent.flowColonSeparator !== false && flowDepth > 0 && source[pos] === ':') { + // Declaring flowSeparatorAfterTokens (a non-empty list — YAML: the quoted-key tokens) ENABLES + // the carve-out; it then fires after a NAMED token OR after any flow-CLOSE delimiter (`]`/`}`, + // which structurally can't run past its closer either). An indentation grammar that declares no + // such tokens gets no carve-out at all, so a `:name`-shaped token survives after values in flow. + // The separator glyph is keyValueSeparator (default `:`). 
+ if (indent && flowDepth > 0 && flowSeparatorAfterTokens.size && source.startsWith(keyValueSep, pos)) { const prevTok = tokens[tokens.length - 1]; - if (prevTok && (stringTokenNames.has(prevTok.type) || (prevTok.type === '' && flowCloseSet.has(prevTok.text)))) { - push(mkPu(':', pos, tColon)); - pos += 1; + if (prevTok && (flowSeparatorAfterTokens.has(prevTok.type) || (prevTok.type === '' && flowCloseSet.has(prevTok.text)))) { + push(mkPu(keyValueSep, pos, kKvSep)); + pos += keyValueSep.length; continue; } } @@ -1213,7 +1239,7 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { let q = i; while (q < source.length && source[q] === ' ') q++; return q > i && source[q] === '-' && sepAfter(source[q + 1]); }; - const colonPairsExplicit = wasLineLead && lit === ':' && currentLineCol === lastExplicitKeyCol; + const colonPairsExplicit = wasLineLead && lit === keyValueSep && currentLineCol === lastExplicitKeyCol; const compactColon = colonPairsExplicit && dashAfter(pos); // A line-lead `:` at its `?`'s column USES UP that pairing — the explicit entry now has its // value, so a SECOND `: …` at the same column (`? a\n: - b\n: - c`, yaml-test-suite cousin) is diff --git a/src/types.ts b/src/types.ts index 123b042..21186b0 100644 --- a/src/types.ts +++ b/src/types.ts @@ -324,10 +324,12 @@ export interface IndentConfig { explicitKey?: string; // the flow `?` explicit-key indicator (e.g. punctuation.definition.key-value) }; comment?: string; // line-comment introducer ignored for indentation (e.g. '#') - // The mapping KEY/VALUE separator literal (YAML `:`). Used by the derived highlighter's multi-line - // plain-scalar fold regions (gen-tm §2a′/§2a″) to recognise a `key:`-led line as STRUCTURAL (a - // sibling that ends a fold) vs a bare plain-scalar continuation. Declared here (not hardcoded in the - // generator) so the YAML region code stays data-driven. Absent → defaults to ':'. + // The mapping KEY/VALUE separator literal (YAML `:`). 
The ONE source of truth for "what glyph + // separates a mapping key from its value": BOTH the lexer's key-line sniffs (`lineHasKeySeparator`, + // `startsBlockStructuralNode`, the compact-key pairing) AND the derived highlighter's multi-line + // plain-scalar fold regions (gen-tm §2a′/§2a″) recognise a `key:`-led line as STRUCTURAL from this + // field — so parser and highlighter agree for ANY separator. Declared here (not hardcoded) so the + // region code stays data-driven. Absent → defaults to ':'. keyValueSeparator?: string; // Block scalars (YAML `|` / `>`): when the rest of a line is an introducer + indicators, the // following more-indented lines are verbatim content emitted as ONE token (like raw-text, but @@ -339,10 +341,29 @@ export interface IndentConfig { // control sigil, not content; absent → the block-scalar token's own scope (introducer reads as the // body string). The body always keeps the token scope; only the introducer capture is re-scoped. blockScalar?: { introducers: string[]; token: string; documentMarkers?: string[]; indicatorScope?: string }; - // Set false to disable the YAML flow `:` key-separator carve-out (a `:` glued after a quoted - // scalar / flow-close is forced punctuation). Indentation grammars with `:name`-shaped tokens - // (bound-attribute shorthand) need those to survive after values. Default true (YAML behavior). - flowColonSeparator?: boolean; + // Flow `:` key-separator carve-out MEMBERSHIP: token TYPES after which a `:` glued inside a flow + // collection is the `key: value` SEPARATOR (forced `:` punctuation), never the start of a `:`-led + // plain scalar. A quoted scalar / flow-close can never run past its closer, so a `:` immediately + // after one is unambiguously the separator (YAML: the quoted-key tokens). This is an EXPLICIT, + // mode-neutral list — the carve-out is OFF unless a token is named here. 
(Was derived from the + // `string` flag, which silently enlisted every string-region token; an indentation grammar with + // `:name`-shaped tokens after values keeps `string: true` for region scoping / auto-close + // derivation WITHOUT being dragged into separator emission.) Flow-CLOSE delimiters (`flowClose`) + // join the carve-out whenever it is enabled — a `:` after `]`/`}` is then the separator as well. + // Absent / empty → no carve-out at all, not even after a flow-close (the `:` lexes normally). The separator glyph itself is + // `keyValueSeparator`. yaml-test-suite 5MUD / 5T43 / 9MMW / C2DT / K3WX. + flowSeparatorAfterTokens?: string[]; + // Plain-scalar CONTINUATION fold MEMBERSHIP: the token TYPES that participate in YAML's plain-scalar + // folding — a more-indented line right after one of these LEAF scalars (or an adjacent one inside a + // flow collection) is a CONTINUATION of that scalar, not a new node. Drives the block-context fold + // (a deeper line after a plain leaf), the flow illegal-head continuation, and the flow multi-line + // merge post-pass. The LAST-named token is the generic catch-all used as the emitted CONTINUATION + // token type and whose `pattern` matches a folded body (declaration order is specific-before-general, + // so the broadest plain is last). This is an EXPLICIT, mode-neutral list — folding is OFF unless a + // token is named here. (Was derived from `blockPattern`, which gave YAML plain-scalar folding to ANY + // block-pattern token; an indentation grammar can now carry a `blockPattern` token WITHOUT inheriting + // the fold.) Absent / empty → no folding. yaml-test-suite 3MYT / A2M4 / AB8U / FBC9 / JTV5 / UT92. + foldTokens?: string[]; // A comment introducer immediately followed by this string is NOT a comment line — it falls // through to ordinary tokenization (e.g. comment '//' + commentExcept '!' → `//!` doc-comment // lines lex as real tokens and stay visible to the indent stack, while `//` lines vanish). 
diff --git a/test/indent-extensions.ts b/test/indent-extensions.ts index dc6526e..003041b 100644 --- a/test/indent-extensions.ts +++ b/test/indent-extensions.ts @@ -2,22 +2,27 @@ // engine behavior over TOY grammars (token names and introducer characters // deliberately unlike any real language — the behaviors are grammar DATA). // -// Three opt-in IndentConfig fields, each motivated by a Pug-like indentation -// language (one that nests HTML-ish tag lines rather than key/value scalars): +// Opt-in IndentConfig fields, each motivated by a non-YAML indentation language +// (one that nests HTML-ish tag lines or `k=v` entries rather than YAML scalars): // -// 1. `commentExcept` — two-tier comments: `--` lines vanish (invisible to -// the indent stack, like YAML `#`), but `--!` lines -// are REAL tokens (doc comments that ship to output). -// 2. `rawBlock` — verbatim capture introduced from the END of a line -// (`tag:mode` filters/content modes, Pug-style); the -// mirror image of YAML's leading `|`/`>` blockScalar. -// 3. `flowColonSeparator: false` — languages with `:name`-shaped tokens -// (bound-attribute shorthand) need a `:` after a -// quoted value / flow-close to stay a token start, -// not YAML's forced `key: value` separator punct. +// 1. `commentExcept` — two-tier comments: `--` lines vanish (like +// YAML `#`), but `--!` lines are REAL tokens. +// 2. `rawBlock` — verbatim capture introduced from the END of +// a line (Pug-style); mirror of `blockScalar`. +// 3. `flowSeparatorAfterTokens` — EXPLICIT membership for the flow `:` key/ +// value carve-out, decoupled from `string` +// (issue #44 (A)). OFF unless declared. +// 4. `foldTokens` — EXPLICIT membership for plain-scalar +// continuation folding, decoupled from +// `blockPattern` (issue #44 (B)). OFF unless +// declared. +// 5. `keyValueSeparator` — the separator glyph the LEXER (not just +// gen-tm) reads for key-line sniffs; a non-`:` +// value is recognized structurally (issue +// #44 (C)). 
Default ':'. // -// All three default OFF — a grammar declaring none (YAML) tokenizes -// byte-identically, which the yaml gates already enforce. +// All default OFF / neutral — a grammar declaring none (YAML opts in field-by- +// field) tokenizes byte-identically, which the yaml gates already enforce. import { token, rule, defineGrammar, alt, many, many1, opt, seq, oneOf, noneOf, range, star, plus, never } from '../src/api.ts'; import type { IndentConfig } from '../src/types.ts'; import { createLexer } from '../src/gen-lexer.ts'; @@ -153,7 +158,8 @@ const Str = token(seq('"', star(noneOf('"')), '"'), { string: true }); } // ───────────────────────────────────────────────────────────────────────────── -// 3. flowColonSeparator: false — `:name` tokens survive after values in flow +// 3. flowSeparatorAfterTokens — flow `:` carve-out is EXPLICIT membership, OFF by +// default, and DECOUPLED from the `string` flag (issue #44 (A) un-overload). // ───────────────────────────────────────────────────────────────────────────── { const Indent = token(never(), {}); @@ -172,25 +178,108 @@ const Str = token(seq('"', star(noneOf('"')), '"'), { string: true }); indentToken: 'Indent', dedentToken: 'Dedent', newlineToken: 'Newline', flowOpen: ['('], flowClose: [')'], }; - const gYaml = mk(base); // default: YAML behavior - const gOff = mk({ ...base, flowColonSeparator: false }); - - // Default (YAML): a `:` after a quoted value in flow is forced separator punctuation. - check('flowColonSeparator default: `:` after a string is separator punct (YAML behavior)', - lexed(gYaml, 'tag("v" :k)').some(t => t.type === '' && t.text === ':')); - - // Disabled: the same `:` starts the BoundName token. - check('flowColonSeparator false: `:name` after a string lexes as one token', - lexed(gOff, 'tag("v" :k)').some(t => t.type === 'BoundName' && t.text === ':k')); - - // Same carve-out after a flow-CLOSE delimiter, nested so flow depth stays > 0. 
- check('flowColonSeparator false: `:name` after `)` (still in flow) lexes as one token', - lexed(gOff, 'tag((aa) :k)').some(t => t.type === 'BoundName' && t.text === ':k')); - check('flowColonSeparator default: `:` after `)` splits (YAML behavior preserved)', - !lexed(gYaml, 'tag((aa) :k)').some(t => t.type === 'BoundName')); + // `Str` carries `string: true` (region scoping). NEUTRAL grammar = no flowSeparatorAfterTokens + // declared. ROOT-CAUSE PROOF: under the old derivation `string: true` alone enlisted `Str` into the + // carve-out; after the un-overload it does NOT — the `:k` survives as one BoundName token. + const gNeutral = mk(base); + // Opt IN explicitly: name `Str` (and flow-close is then active too). + const gOn = mk({ ...base, flowSeparatorAfterTokens: ['Str'] }); + + check('flowSeparatorAfterTokens: a `string:true` token is NOT auto-enlisted — `:name` survives after a string', + lexed(gNeutral, 'tag("v" :k)').some(t => t.type === 'BoundName' && t.text === ':k')); + check('flowSeparatorAfterTokens: neutral grammar — `:name` survives after a flow-close `)` too', + lexed(gNeutral, 'tag((aa) :k)').some(t => t.type === 'BoundName' && t.text === ':k')); + + // Declared: the same `:` after the named token is now forced separator punctuation. + check('flowSeparatorAfterTokens: declared → `:` after the named token is separator punct', + lexed(gOn, 'tag("v" :k)').some(t => t.type === '' && t.text === ':') && + !lexed(gOn, 'tag("v" :k)').some(t => t.type === 'BoundName')); + // Declaring the carve-out also activates it after a flow-CLOSE delimiter. + check('flowSeparatorAfterTokens: declared → `:` after flow-close `)` also splits', + lexed(gOn, 'tag((aa) :k)').some(t => t.type === '' && t.text === ':') && + !lexed(gOn, 'tag((aa) :k)').some(t => t.type === 'BoundName')); +} + +// ───────────────────────────────────────────────────────────────────────────── +// 4. 
foldTokens — plain-scalar continuation fold is EXPLICIT membership, OFF by +// default, and DECOUPLED from the `blockPattern` flag (issue #44 (B) un-overload). +// ───────────────────────────────────────────────────────────────────────────── +{ + const Indent = token(never(), {}); + const Dedent = token(never(), {}); + const Newline = token(never(), {}); + // A token that DECLARES a block-context variant via `blockPattern`. Its head pattern is `[a-z]+`, so + // a line that STARTS with `-` cannot be lexed as a Scalar. Under the old derivation any blockPattern + // token got YAML plain-scalar folding; after the un-overload it does not, unless named in foldTokens. + const Scalar = token(plus(lower), { scope: 'scalar', blockPattern: plus(lower) }); + const Line = rule(() => [[Scalar, many(Newline, Scalar)]]); + const Lines = rule(() => [[Line, many(Newline, Line)]]); + const Doc = rule(() => [[opt(Lines), opt(Indent), opt(Lines), opt(Dedent)]]); + + const mk = (indent: IndentConfig) => defineGrammar({ + name: 'tinyfold', tokens: { Indent, Dedent, Newline, Scalar }, rules: { Line, Lines, Doc }, entry: Doc, indent, + }); + const base: IndentConfig = { indentToken: 'Indent', dedentToken: 'Dedent', newlineToken: 'Newline' }; + const gNeutral = mk(base); // blockPattern present, foldTokens absent + const gFold = mk({ ...base, foldTokens: ['Scalar'] }); // opt IN + + // ROOT-CAUSE PROOF: a deeper line `- bbb` (leading `-`, not a Scalar head) after a `blockPattern` + // LEAF. With folding OFF (the un-overload — blockPattern alone no longer triggers it) the `-` is an + // unlexable char → a hard lex error. So the block-context fold genuinely does NOT fire here. 
+ let neutralThrew = false; + try { lexed(gNeutral, 'aaa\n - bbb'); } catch { neutralThrew = true; } + check('foldTokens: a `blockPattern` token is NOT auto-folded — an illegal-head deeper line errors', + neutralThrew); + + // Declared: the same input folds — the whole deeper line (its leading `-` is now scalar content) is + // absorbed as ONE continuation Scalar token. Observed: `Scalar:aaa Indent Scalar:"- bbb" Dedent`. + const tFold = lexed(gFold, 'aaa\n - bbb'); + check('foldTokens: declared → the illegal-head deeper line folds into one continuation Scalar', + tFold.some(t => t.type === 'Scalar' && t.text === '- bbb')); +} + +// ───────────────────────────────────────────────────────────────────────────── +// 5. keyValueSeparator — the lexer (not just gen-tm) reads the separator glyph; +// a non-`:` separator is recognized structurally by the parser (issue #44 (C)). +// ───────────────────────────────────────────────────────────────────────────── +{ + const Indent = token(never(), {}); + const Dedent = token(never(), {}); + const Newline = token(never(), {}); + // A key scalar whose block variant ends at the `=` separator, and a plain value scalar. The fold + // sniff (`lineHasKeySeparator`) must treat `=` (not `:`) as the structural key separator so a + // `k= v` line is a mapping line, not a foldable plain continuation. 
+ const Key = token(plus(lower), { scope: 'key', blockPattern: seq(plus(lower), '=') }); + const Val = token(plus(lower), { scope: 'val', blockPattern: plus(lower) }); + const Sep = token('=', {}); + const Entry = rule(() => [[Key, Sep, Val]]); + const Lines = rule(() => [[Entry, many(Newline, Entry)]]); + const Doc = rule(() => [[opt(Lines), opt(Indent), opt(Lines), opt(Dedent)]]); + + const mk = (indent: IndentConfig) => defineGrammar({ + name: 'tinykv', tokens: { Indent, Dedent, Newline, Key, Val, Sep }, rules: { Entry, Lines, Doc }, entry: Doc, indent, + }); + // Declare `=` as the separator AND a fold-token list, so a deeper `k= v` is recognized as a key line, not a fold. + const g = mk({ indentToken: 'Indent', dedentToken: 'Dedent', newlineToken: 'Newline', keyValueSeparator: '=', foldTokens: ['Key', 'Val'] }); + + // A deeper line carrying the `=` separator after a plain leaf is a mapping line, so it must NOT fold — + // the lexer's key-separator sniff has to read `=`, not `:`. (If it still hardcoded `:`, the `b= c` + // line would be seen as separator-less plain content and wrongly fold into one continuation token.) + // Observed: `Val:a Indent Key:"b=" Val:c Dedent` — `b=` lexes via the block variant as a Key. + const t = lexed(g, 'a\n b= c'); + check('keyValueSeparator: the lexer recognizes `=` as the structural separator (`b=` is a Key line, no fold)', + t.some(tk => tk.type === 'Key' && tk.text === 'b=') && + t.some(tk => tk.type === 'Val' && tk.text === 'a') && // `a` stayed its own leaf — not folded + !t.some(tk => tk.type === 'Val' && tk.text.includes('b'))); // `b=` did not fold into a Val continuation + // Sanity: with NO `=`, the same-shape deeper plain line IS a foldable continuation — proving the + // first case's non-fold is the `=` separator's doing, not folding being off. The block-context fold + // emits the deeper line as ONE continuation token (`Val:"b c"`), so it is not two Val tokens. 
+ const t2 = lexed(g, 'a\n b c'); + check('keyValueSeparator: a separator-less deeper line still folds into one continuation token', + t2.some(tk => tk.type === 'Val' && tk.text === 'b c')); } console.log(fail === 0 - ? `\n${ok}/${ok} indent-extension checks pass — commentExcept / rawBlock / flowColonSeparator behave as specified` + ? `\n${ok}/${ok} indent-extension checks pass — commentExcept / rawBlock / flowSeparatorAfterTokens / foldTokens / keyValueSeparator behave as specified` : `\n${fail} FAILED`); process.exit(fail === 0 ? 0 : 1); diff --git a/yaml.ts b/yaml.ts index 847d281..84fa6c8 100644 --- a/yaml.ts +++ b/yaml.ts @@ -628,6 +628,15 @@ const indent: IndentConfig = { }, comment: '#', keyValueSeparator: ':', + // Flow `:` carve-out: a `:` glued (inside flow) after a quoted scalar / quoted key — or after a + // flow-close `]`/`}` — is the `key: value` separator, never a `:`-led plain. The quoted-key tokens + // opt IN explicitly (decoupled from the `string` flag, which scopes string regions / derives + // auto-close delimiters and no longer enlists a token here). + flowSeparatorAfterTokens: ['DQuote', 'SQuote', 'DQuoteKey', 'SQuoteKey'], + // Plain-scalar continuation fold participants — the UNQUOTED scalar family. The LAST (Plain) is the + // generic catch-all (folded-continuation token type + flow body pattern). Opt IN explicitly + // (decoupled from the `blockPattern` flag, which now only selects the block-context match variant). + foldTokens: ['Key', 'Num', 'BoolNull', 'Plain'], blockScalar: { introducers: ['|', '>'], token: 'BlockScalar', documentMarkers: ['---', '...'], indicatorScope: 'keyword.control.flow.block-scalar' }, compactIndicators: ['-', '?'], // Tag-handle per-document membership (§6.8.2 / §6.9.1): a named handle `!h!` used by a Tag must