-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathhtml-issue-cases.ts
More file actions
99 lines (94 loc) · 12.5 KB
/
Copy pathhtml-issue-cases.ts
File metadata and controls
99 lines (94 loc) · 12.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
// html-issue-cases.ts — REAL bugs documented against the official html.tmbundle / VS Code
// HTML grammar, as DATA (no side effects). Each snippet is valid HTML (parse5 parses it);
// the question is whether the grammar scopes the marked span correctly. Shared by the
// README cross-language ✓ table (test/issue-table.ts). The companion html-bench.ts grades
// the same snippets per-char against parse5; here each carries an explicit checkpoint so the
// grammars can be graded uniformly with TS/Vue. ids cite the upstream trackers.
/**
 * One documented grammar bug, expressed as data: an HTML snippet plus the checkpoint
 * inside it whose scope is graded.
 */
export interface HtmlCase {
  /** Upstream tracker citation, e.g. `tmbundle#118` or `vscode#140360`. */
  id: string;
  /** Human-readable description of what the case exercises. */
  title: string;
  /** The HTML snippet to tokenize. */
  src: string;
  /** The checkpoint: a substring of `src` whose scope is graded. */
  at: string;
  /**
   * Which occurrence of `at` to grade, 0-based (`nth: 1` is the 2nd occurrence).
   * NOTE(review): presumably the first occurrence when omitted — confirm against the grader.
   */
  nth?: number;
  /**
   * Verdict predicate: returns true when the scope at the checkpoint is correct.
   * NOTE(review): `scope` looks like the token's scope names as one string (the predicates
   * in this file only do substring checks) — confirm against the grader.
   */
  want: (scope: string) => boolean;
}
/** True when the scope contains `entity.name.tag`, i.e. the checkpoint is scoped as a tag name. */
const isTag = (s: string): boolean => s.indexOf('entity.name.tag') !== -1;
/** True when the scope contains `string` — a quoted/unquoted attr VALUE. */
const isString = (s: string): boolean => s.indexOf('string') >= 0;
/** True when the scope does NOT contain `entity.other.attribute-name`. */
const notAttrName = (s: string): boolean => s.indexOf('entity.other.attribute-name') === -1;
/** True when the scope contains `text` and neither `entity` nor `string`. */
const isText = (s: string): boolean => {
  if (!s.includes('text')) return false;
  return !(s.includes('entity') || s.includes('string'));
};
/** True when the scope contains `source.js` — delegated to the JS grammar. */
const isJs = (s: string): boolean => s.indexOf('source.js') !== -1;
/** True when the scope contains `source.css` — delegated to the CSS grammar. */
const isCss = (s: string): boolean => s.indexOf('source.css') !== -1;
/** True when the scope contains `punctuation.definition.tag` — a `<` `>` `/` delimiter. */
const isTagPunct = (s: string): boolean => s.indexOf('punctuation.definition.tag') >= 0;
/**
 * The documented upstream grammar bugs, as data. Each entry pairs an HTML snippet (`src`)
 * with a checkpoint (`at`, optionally the 0-based `nth` occurrence) and a `want` predicate
 * that decides whether the scope found at that checkpoint is correct. `id` cites the
 * upstream tracker issue.
 */
export const cases: HtmlCase[] = [
{ id: 'tmbundle#118', title: 'trailing `/` in an unquoted URL value', src: '<a href=https://example.org/>foo</a>',
at: '/', nth: 2, want: isString }, // the trailing `/` is still the VALUE (official breaks before it)
{ id: 'tmbundle#124', title: 'slash in unquoted value `foo/`', src: '<img class=foo/>',
at: 'foo', want: s => isString(s) && notAttrName(s) },
{ id: 'vscode#140360', title: '`/` inside an unquoted value (path)', src: '<link rel=stylesheet href=/css/app.css>',
at: '/', want: isString }, // the slash is part of the VALUE, not punctuation
{ id: 'tmbundle#84', title: 'tag name a prefix of a sibling (`<i>`/`<input>`)', src: '<div><i><input></i></div>',
at: 'input', want: isTag }, // <input> is a tag, not swallowed by <i>
{ id: 'tmbundle#117', title: 'SVG camelCase tag name', src: '<svg><animateTransform attributeName="x"/></svg>',
at: 'animateTransform', want: isTag },
{ id: 'tmbundle#122', title: '`<` inside a quoted attr value', src: '<a data-q="a < b">y</a>',
at: 'b', want: isString }, // still inside the string, not a new tag
{ id: 'vscode#130284', title: '`>` inside a quoted attr value does not close the tag early', src: '<button title="a > b">go</button>',
at: 'go', want: isText }, // the `>` is inside the VALUE, so the start tag is not closed at it and `go` is the element's text (NOT swallowed). RELABELLED from tmbundle#115 — that earlier cite was wrong: textmate/html.tmbundle#115 is a FEATURE REQUEST to flag an end tag carrying attributes (`</div id="x">`) as invalid (which neither grammar does, and isn't a tokenization bug); this snippet actually exercises the `>`-in-attribute-value case = microsoft/vscode#130284
{ id: 'tmbundle#97', title: 'whitespace (incl. a line feed) before `>` in a raw-text end tag', src: '<script src="d"></script\n><p>zz</p>',
at: 'zz', want: s => isText(s) && !isJs(s) }, // HTML5 allows ws (incl. line feeds) before `>` in an end tag → parse5 CLOSES the <script> (it is empty) and `<p>zz</p>` is a sibling. The text `zz` must therefore be HTML text, NOT leaked into the embedded source.js (which is what Monogram did before the deferred-`>` close rules: `</script`→keyword.operator, `<p>zz<`→string.regexp.js). Both engines now close it (a former Monogram-only gap vs parse5)
{ id: 'tmbundle#108', title: 'nested `<svg>` is a valid tag, not flagged invalid', src: '<svg><svg></svg></svg>',
at: 'svg', nth: 1, want: s => isTag(s) && !s.includes('invalid') }, // official's SVG-child whitelist marks a nested <svg> invalid.illegal; Monogram's generic nesting accepts it
// ── #81 (entities), #102 (`<style>`/`<script>` embedding) and #113 (`on*` JS) WERE Monogram-only
// gaps, now CLOSED: html.ts gained `markup.entity`, a `rawText.embed` map (delegating CSS — and
// Monogram's OWN JS — to the platform grammars), and `markup.attributeEmbed` (`on*`→source.js).
// All graded against the REAL embedded JS/CSS so a ✓ means *correctly highlighted*, not merely
// delegated — for #113 that's the whole point: the official DOES embed JS in `on*` yet still
// mis-reads `//` in the string as a comment (its inline-JS value rule hand-rolls a `//`
// splitter), so it can't win even with the embed; Monogram delegates the whole value to its
// own source.js (capture-bounded, the same helper Vue directive values use) and reads it right.
{ id: 'tmbundle#113', title: '`//` in an `onclick=` JS string read as a comment', src: `<input onclick="location.href='https://x.org/'">`,
at: '//', want: s => s.includes('source.js') && !s.includes('comment') }, // official: hand-rolled // splitter reads it as a comment (bug); Monogram: capture-embedded source.js keeps it a string
{ id: 'tmbundle#81', title: 'character entity `&amp;` in text', src: '<p>x &amp; z</p>',
at: '&amp;', want: s => s.includes('constant.character.entity') }, // both scope it now — Monogram via markup.entity (was a Text blob), official natively. The entity must be spelled out as `&amp;`: a bare `&` followed by a space is NOT a character reference, so it would never be scoped as an entity
{ id: 'tmbundle#102', title: '`<style>` element CSS is tokenized, not a flat blob', src: '<style>.a{color:red}</style>',
at: 'color', want: s => s.includes('support.type.property-name.css') }, // both embed real CSS (color = property-name) — Monogram now delegates source.css like the official (was an untokenized blob)
// ── Embedded-language boundaries & inline-language attributes. Graded against the REAL
// embeds (Monogram's own source.js, VS Code's source.css) so a ✓ means correctly
// highlighted, not merely delegated. These mix the honest verdicts — only-Monogram and
// both-pass. #85 was EARLIER recorded as only-official and framed as a proven tradeoff; that
// framing was wrong: it is now fixed agnostically (a whole-line `while` lookahead) while
// keeping the #5538 unwind and the clean close `<` (#65/#74). The full mechanism + the
// measured (and rejected) begin/end experiment are at #85 below.
{ id: 'tmbundle#104', title: 'mixed-case `onChange=` event handler still reads as JS', src: '<div onChange="cb()"></div>',
at: 'cb', want: isJs }, // official: case-sensitive `on*` list → `onChange` is meta.attribute.unrecognized, value stays a plain string; Monogram lower-cases the `on*` test so the value delegates to source.js like `onchange`
{ id: 'tmbundle#50', title: '`onclick=` event-handler value is colored as JS', src: '<button onclick="run(1)">x</button>',
at: 'run', want: isJs }, // both embed source.js in the (lower-case) handler value now (was a flat string) — `run` is entity.name.function.js
{ id: 'tmbundle#88', title: 'inline `style=` value embeds CSS', src: '<div style="color:red"></div>',
at: 'color', want: s => s.includes('support.type.property-name.css') }, // ONLY-MONOGRAM: a granular embed — `color` is a CSS property-name, not a flat blob. Monogram embeds the value via source.css#rule-list-innards (the declaration-list context, markup.attributeEmbed `style`); VS Code's HTML grammar embeds the stylesheet ROOT, so it mis-reads `color:red` as a selector and `color` stays an undifferentiated `source.css` span. (The hand-written Vue grammar did tokenize granularly via its `#vue-directives-style-attr` copy of rule-list-innards — Monogram now MATCHES that and beats plain HTML.)
{ id: 'tmbundle#65', title: '`<` of `</script>` is HTML punctuation, not `source.js`', src: '<script>var a=1;</script>',
at: '<', nth: 1, want: s => isTagPunct(s) && !isJs(s) }, // the close-tag `<` is the 2nd `<` (nth:1). official leaks the embedded source.js scope onto it (vscode-textmate force-pops as `source.js-ignored-vscode`), miscoloring it under a JS theme; Monogram closes the embed before the `<` so it stays clean tag punctuation
{ id: 'tmbundle#74', title: '`<` of `</style>` is HTML punctuation, not `source.css`', src: '<style>.a{}</style>',
at: '<', nth: 1, want: s => isTagPunct(s) && !isCss(s) }, // same leak for CSS (close `<` = 2nd `<`, nth:1): official tags the `</style>` `<` with source.css-ignored-vscode; Monogram closes the embed first, keeping it clean tag punctuation
{ id: 'tmbundle#85', title: '`//</script>` on its own line still closes the script', src: '<script>\n//</script>\n<p>z</p>',
at: 'z', want: s => !isJs(s) }, // MONOGRAM ✓ — fixed AGNOSTICALLY (no JS-syntax knowledge), DISPROVING the earlier "tradeoff":
// Monogram's multi-line raw-text region is `begin/while`; the `while` re-checks each line at `^`
// and DROPS the region (force-unwinding any still-open embedded construct) BEFORE that line
// tokenizes. The fix WIDENED its negative lookahead from the line START (`^(?!\s*</tag…)`) to the
// WHOLE line (`^(?!.*</tag…)`): the region now drops at the start of ANY line CONTAINING `</script>`,
// not only one that begins with it. That catches the mid-line close in `//</script>` (the embed's
// `//` line-comment would otherwise claim the whole line at col 0, which no host `end`/sibling rule
// can preempt — a leftmost embedded match wins). It stays AGNOSTIC: the lookahead keys only on the
// configured tag + `<`/`/`/`>` delimiters (DATA), never on the embed's `//`,`/*…*/`, backtick or
// regex. And it is MORE faithful to the markup oracle — parse5 closes <script> at the FIRST
// `</script>` regardless of JS context (even inside a string/template/comment), so dropping on any
// line containing it IS the spec, not an over-drop. Because the embed stays ONE continuous region
// (the `while` only TESTS), a multi-line template/comment/string with no `</script>` inside is
// untouched (`<script>\nconst x=\`a\nb\`\n</script>` stays one continuous string.template); and the
// line-START drop still force-unwinds an open TS type-body, so the trailing-type unwind (#5538/#2060)
// and the clean close `<` (#65/#74) are preserved. `</tag[\s>]` needs the slash, so a bare `<` in
// the body (`a < b`, `x<y>`) does NOT drop — it stays embedded, matching parse5.
// EARLIER (and wrongly framed as a proven tradeoff): a `begin/end` form (end = `(?=^\s*</tag)|(<)(?=
// /tag)`, the official's shape) was measured + REJECTED — it FAILED #85 (source.js `//[^\n]*` claims
// col 0 before the host `end` at col 2) AND regressed #5538 (begin/end doesn't unwind the open type-
// body at the close line — only `while` does). The whole-line `while` keeps that unwind while ALSO
// reaching the mid-line close, winning both. (VS Code instead HAND-PATCHES JS's own comment/string
// rules with a baked-in `end:(?=</script)|\n` — JS-syntax-specific, and it PAYS with a
// `source.js-ignored-vscode` leak on EVERY close `<`, the #65/#74 cases Monogram keeps clean.)
{ id: 'tmbundle#51', title: 'self-closing `/` is tag punctuation', src: '<img src="a.png" />',
at: '/', want: isTagPunct }, // both scope the `/` of `/>` as punctuation.definition.tag (was plain text in old TextMate)
{ id: 'tmbundle#82', title: 'a `/>`-style `<script src=… />` does NOT self-close — its body is the script content', src: '<script src="x" /></head><body>hi</body>',
at: 'hi', want: s => isJs(s) && !isTag(s) }, // `<script>` is a RAW-TEXT element: per HTML5 (and parse5, the oracle) a trailing `/>` is NOT self-closing, so parse5 keeps the script OPEN and everything after it — `</head><body>hi</body>` — is its TEXT CONTENT (parse5: <body> parses EMPTY, "hi" lives inside the script node). Monogram cascades that content into its embedded source.js BY CONSTRUCTION (parse5-faithful: `hi` is source.js, NOT an HTML tag). VS Code's grammar does the same here (it does not honour `/>` on a raw-text element either) → both engines are parse5-correct. This REPLACES an unrelated paired-`<script>…</script>` JSON proxy that never exercised the self-close at all.
];