From 1052dc8f242e0c745c94db2703173d45b231313e Mon Sep 17 00:00:00 2001 From: Farnabaz Date: Fri, 6 Mar 2026 14:15:30 +0100 Subject: [PATCH 1/4] feat(parse): parse html to Comark Nodes --- packages/comark/SPEC/HTML/block+component.md | 58 ++++ packages/comark/SPEC/HTML/block.md | 53 +++ packages/comark/SPEC/HTML/inline.md | 53 +++ packages/comark/package.json | 3 + packages/comark/src/index.ts | 12 + .../src/internal/parse/token-processor.ts | 305 ++++++++++++++---- .../src/internal/stringify/attributes.ts | 1 - .../src/internal/stringify/handlers/html.ts | 5 +- .../src/internal/stringify/handlers/mdc.ts | 5 +- pnpm-lock.yaml | 13 + 10 files changed, 442 insertions(+), 66 deletions(-) create mode 100644 packages/comark/SPEC/HTML/block+component.md create mode 100644 packages/comark/SPEC/HTML/block.md create mode 100644 packages/comark/SPEC/HTML/inline.md diff --git a/packages/comark/SPEC/HTML/block+component.md b/packages/comark/SPEC/HTML/block+component.md new file mode 100644 index 0000000..4471d5a --- /dev/null +++ b/packages/comark/SPEC/HTML/block+component.md @@ -0,0 +1,58 @@ +--- +timeout: + parse: 5ms + html: 5ms + markdown: 5ms +--- + +## Input + +```md + +::component +Default Slot +:: + +``` + +## AST + +```json +{ + "frontmatter": {}, + "meta": {}, + "nodes": [ + [ + "hello", + { + "$comark": { "html": 1 } + }, + [ + "component", + {}, + "Default Slot" + ] + ] + ] +} +``` + +## HTML + +```html + + + Default Slot + + +``` + +## Markdown + +```md +::hello + :::component + Default Slot + ::: +:: +``` diff --git a/packages/comark/SPEC/HTML/block.md b/packages/comark/SPEC/HTML/block.md new file mode 100644 index 0000000..43cfdf4 --- /dev/null +++ b/packages/comark/SPEC/HTML/block.md @@ -0,0 +1,53 @@ +--- +timeout: + parse: 5ms + html: 5ms + markdown: 5ms +--- + +## Input + +```md + +Hello **World** + +``` + +## AST + +```json +{ + "frontmatter": {}, + "meta": {}, + "nodes": [ + [ + "hello", + { + "$comark": { "html": 1} + }, + "Hello ", + [ + "strong", + {}, + 
"World" + ] + ] + ] +} +``` + +## HTML + +```html + + Hello World + +``` + +## Markdown + +```md + +Hello **World** + +``` diff --git a/packages/comark/SPEC/HTML/inline.md b/packages/comark/SPEC/HTML/inline.md new file mode 100644 index 0000000..2f13b48 --- /dev/null +++ b/packages/comark/SPEC/HTML/inline.md @@ -0,0 +1,53 @@ +--- +timeout: + parse: 5ms + html: 5ms + markdown: 5ms +--- + +## Input + +```md +Hello **World** +``` + +## AST + +```json +{ + "frontmatter": {}, + "meta": {}, + "nodes": [ + [ + "p", + {}, + [ + "hello", + { + "$comark": { "html": 1} + }, + "Hello ", + [ + "strong", + {}, + "World" + ] + ] + ] + ] +} +``` + +## HTML + +```html +

+ Hello World +

+``` + +## Markdown + +```md +Hello **World** +``` diff --git a/packages/comark/package.json b/packages/comark/package.json index 38e41c7..c0e74e5 100644 --- a/packages/comark/package.json +++ b/packages/comark/package.json @@ -58,6 +58,9 @@ "optional": true } }, + "dependencies": { + "htmlparser2": "^9.0.0" + }, "devDependencies": { "@comark/cjk": "workspace:*", "@comark/markdown-it": "^0.3.1", diff --git a/packages/comark/src/index.ts b/packages/comark/src/index.ts index 5c17a05..1d828be 100644 --- a/packages/comark/src/index.ts +++ b/packages/comark/src/index.ts @@ -102,6 +102,18 @@ export function createParse(options: ParseOptions = {}): ComarkParseFn { let nodes = marmdownItTokensToComarkTree(state.tokens, { startLine: state.parsedLines, preservePositions: opts.streaming ?? false, + parseInlineMarkdown: (text: string) => { + const blockTokens = parser.parse(text, {}) + // Always unwrap single-paragraph wrappers from inner content — they are + // an artifact of block-parsing text that lives inside an HTML element + let blockNodes = marmdownItTokensToComarkTree(blockTokens, { startLine: 0, preservePositions: false }) + .map(node => applyAutoUnwrap(node)) + // Single paragraph → unwrap to bare inline children (e.g. 
"Hello **World**") + if (blockNodes.length === 1 && Array.isArray(blockNodes[0]) && blockNodes[0][0] === 'p') { + return blockNodes[0].slice(2) as ComarkNode[] + } + return blockNodes + }, }) if (autoUnwrap) { diff --git a/packages/comark/src/internal/parse/token-processor.ts b/packages/comark/src/internal/parse/token-processor.ts index c64ea80..225a1f3 100644 --- a/packages/comark/src/internal/parse/token-processor.ts +++ b/packages/comark/src/internal/parse/token-processor.ts @@ -1,4 +1,8 @@ import type { ComarkNode } from 'comark/ast' +import { Parser } from 'htmlparser2' + +// Set for the duration of each marmdownItTokensToComarkTree call (safe: JS is single-threaded) +let _parseInlineMarkdown: ((text: string) => ComarkNode[]) | undefined // Mapping from token types to tag names const BLOCK_TAG_MAP: Record = { @@ -22,21 +26,189 @@ const INLINE_TAG_MAP: Record = { sub_open: 'del', } +const VOID_ELEMENTS = new Set([ + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', + 'link', 'meta', 'param', 'source', 'track', 'wbr', +]) + +// ─── htmlparser2 helpers ──────────────────────────────────────────────────── + +function attribsToComarkAttrs(attribs: Record): Record { + const attrs: Record = { + $comark: { + html: 1 + } + } + for (const key in attribs) { + const value = attribs[key] + if (value === '') { + attrs[`:${key}`] = 'true' + } + else { + attrs[key] = value + } + } + return attrs +} + +/** + * Parse a full HTML string into ComarkNodes using htmlparser2. + * Handles nested elements, text, void elements, and comments. 
+ */ +function htmlToComarkNodes(html: string): ComarkNode[] { + const root: ComarkNode[] = [] + const stack: { tag: string, attrs: Record, children: ComarkNode[] }[] = [] + + const parser = new Parser({ + onopentag(name, attribs) { + const attrs = attribsToComarkAttrs(attribs) + if (VOID_ELEMENTS.has(name)) { + const node = [name, attrs] as ComarkNode + if (stack.length > 0) { + stack[stack.length - 1].children.push(node) + } + else { + root.push(node) + } + return + } + stack.push({ tag: name, attrs, children: [] }) + }, + + ontext(text) { + const trimmed = text.trim() + if (!trimmed) return + const nodes: ComarkNode[] = _parseInlineMarkdown ? _parseInlineMarkdown(trimmed) : [trimmed] + if (stack.length > 0) { + stack[stack.length - 1].children.push(...nodes) + } + else { + root.push(...nodes) + } + }, + + onclosetag(name) { + if (VOID_ELEMENTS.has(name)) { + return + } + // Find matching frame (handles mismatched tags gracefully) + let idx = stack.length - 1 + while (idx >= 0 && stack[idx].tag !== name) { + idx-- + } + if (idx >= 0) { + while (stack.length > idx) { + const frame = stack.pop()! + const node = frame.children.length > 0 + ? [frame.tag, frame.attrs, ...frame.children] as ComarkNode + : [frame.tag, frame.attrs] as ComarkNode + if (stack.length > 0) { + stack[stack.length - 1].children.push(node) + } + else { + root.push(node) + } + } + } + }, + + oncomment(data) { + const node = [null, {}, data] as unknown as ComarkNode + if (stack.length > 0) { + stack[stack.length - 1].children.push(node) + } + else { + root.push(node) + } + }, + }, { decodeEntities: true }) + + parser.write(html.trim()) + parser.end() + + return root +} + +interface HtmlTagInfo { + tag: string + attrs: Record + isVoid: boolean + isClose: boolean +} + +/** + * Parse a single inline HTML tag fragment (opening, closing, or void). + * Returns null if the content is not a recognisable HTML tag. 
+ */ +function parseHtmlTag(html: string): HtmlTagInfo | null { + const trimmed = html.trim() + if (!trimmed.startsWith('<')) return null + + // Fast path: closing tag + const closeMatch = trimmed.match(/^<\/([a-z][a-z0-9]*)\s*>/i) + if (closeMatch) { + return { tag: closeMatch[1].toLowerCase(), attrs: {}, isVoid: false, isClose: true } + } + + let info: HtmlTagInfo | null = null + const parser = new Parser({ + onopentag(name, attribs) { + info = { + tag: name, + attrs: attribsToComarkAttrs(attribs), + isVoid: VOID_ELEMENTS.has(name), + isClose: false, + } + }, + }, { decodeEntities: false }) + + parser.write(trimmed) + parser.end() + return info +} + +// ─── html_block helper ────────────────────────────────────────────────────── + +function processHtmlBlock(content: string): ComarkNode[] { + const trimmed = content.trim() + if (trimmed.startsWith('') ? trimmed.slice(4, -3) : trimmed.slice(4) + return [[null, {}, inner] as unknown as ComarkNode] + } + return htmlToComarkNodes(content) +} + +// ─── main entry point ─────────────────────────────────────────────────────── + /** * Convert Markdown-It tokens to a Comark tree - * @param tokens - The tokens to convert - * @returns The Comark tree */ -export function marmdownItTokensToComarkTree(tokens: any[], options: { startLine: number, preservePositions: boolean } = { startLine: 0, preservePositions: false }): ComarkNode[] { +export function marmdownItTokensToComarkTree(tokens: any[], options: { startLine: number, preservePositions: boolean, parseInlineMarkdown?: (text: string) => ComarkNode[] } = { startLine: 0, preservePositions: false }): ComarkNode[] { + _parseInlineMarkdown = options.parseInlineMarkdown const nodes: ComarkNode[] = [] let i = 0 let endLine = options.startLine while (i < tokens.length) { + // html_block can produce multiple root nodes — handle before processBlockToken + if (tokens[i].type === 'html_block') { + const htmlNodes = processHtmlBlock(tokens[i].content) + if (options.preservePositions && 
tokens[i].map?.[1]) { + endLine = (tokens[i].map[1] as number) + options.startLine + for (const node of htmlNodes) { + if (Array.isArray(node) && node[1]) { + (node[1] as Record).$comark = { line: endLine } + } + } + } + nodes.push(...htmlNodes) + i++ + continue + } + const result = processBlockToken(tokens, i, false) if (result.node) { if (options.preservePositions) { - // find end line of node from token.map for (let j = i; j < result.nextIndex; j++) { if (tokens[j].map && tokens[j].map[1]) { endLine = (tokens[j].map[1] as number) + options.startLine @@ -115,48 +287,6 @@ function processAttributes( return attrs } -/** - * Parse HTML inline content to extract tag and attributes - * Example: '' - * Returns: { tag: 'input', attrs: { class: 'foo', checked: true, disabled: true, type: 'checkbox' } } - */ -function parseHtmlInline(html: string): { tag: string, attrs: Record, selfClosing: boolean } | null { - // Match opening or self-closing tags - // Use \s[^>]* to ensure attributes start with whitespace, preventing overlap with tag name - const tagMatch = html.match(/^<(\w+)(\s[^>]*)?(\/?)>/) - if (!tagMatch) { - return null - } - - const tag = tagMatch[1] - const attrsString = tagMatch[2] - const selfClosing = tagMatch[3] === '/' || tag === 'input' || tag === 'br' || tag === 'img' || tag === 'hr' - - const attrs: Record = {} - - // Parse attributes from the string - // Match: attr="value" or attr='' or attr (boolean) - const attrRegex = /(\w+)(?:="([^"]*)"|='([^']*)'|=(\S+)|(?=\s|$))/g - let match - - while ((match = attrRegex.exec(attrsString)) !== null) { - const attrName = match[1] - // Get value from whichever capture group matched (quotes or unquoted) - const attrValue = match[2] !== undefined ? match[2] : (match[3] !== undefined ? 
match[3] : (match[4] || '')) - - // Handle boolean attributes - if value is empty string, it's a boolean true - if (attrValue === '') { - attrs[`:${attrName}`] = 'true' - } - else { - // Regular attribute - attrs[attrName] = attrValue - } - } - - return { tag, attrs, selfClosing } -} - /** * Parse codeblock info string to extract language, highlights, filename, and meta * Example: "javascript {1-3} [filename.ts] meta=value" @@ -265,10 +395,6 @@ function parseCodeblockInfo(info: string): { /** * Extract Comark attributes from mdc_inline_props token - * @param tokens - Array of tokens - * @param startIndex - Index to start searching from (after the element token) - * @param skipEmptyText - Whether to skip empty text tokens before props token - * @returns Object with attrs and nextIndex */ function extractAttributes( tokens: any[], @@ -301,10 +427,12 @@ function processBlockToken(tokens: any[], startIndex: number, insideNestedContex return { node: ['hr', {}] as ComarkNode, nextIndex: startIndex + 1 } } + // html_block is now handled upstream (in marmdownItTokensToComarkTree / + // processBlockChildren / processBlockChildrenWithSlots) before reaching here. + // This branch is kept as a safety fallback. 
if (token.type === 'html_block') { - if (token.content.startsWith('/, true], + [/^<\?/, /\?>/, true], + [/^/, true], + [/^/, true], + [new RegExp(`^|$))`, 'i'), /^$/, true], + [new RegExp(`${HTML_OPEN_CLOSE_TAG_RE.source}\\s*$`), /^$/, false], +] + +export default function html_block(state: StateBlock, startLine: number, endLine: number, silent: boolean) { + let pos = state.bMarks[startLine] + state.tShift[startLine] + let max = state.eMarks[startLine] + + // if it's indented more than 3 spaces, it should be a code block + if (state.sCount[startLine] - state.blkIndent >= 4) + return false + + if (state.src.charCodeAt(pos) !== 0x3C/* < */) + return false + + let lineText = state.src.slice(pos, max) + + let i = 0 + for (; i < HTML_SEQUENCES.length; i++) { + if (HTML_SEQUENCES[i][0].test(lineText)) + break + } + + if (i === HTML_SEQUENCES.length) + return false + + if (silent) { + // true if this sequence can be a terminator, false otherwise + return HTML_SEQUENCES[i][2] + } + + let nextLine = startLine + 1 + + // If we are here - we detected HTML block. + // Let's roll down till block end. 
+ if (i !== 0 && !HTML_SEQUENCES[i][1].test(lineText)) { + for (; nextLine < endLine; nextLine++) { + if (state.sCount[nextLine] < state.blkIndent) { + break + } + + pos = state.bMarks[nextLine] + state.tShift[nextLine] + max = state.eMarks[nextLine] + lineText = state.src.slice(pos, max) + + if (HTML_SEQUENCES[i][1].test(lineText)) { + if (lineText.length !== 0) + nextLine++ + break + } + } + } + state.line = nextLine + + const token = lineText.startsWith('\s]/i.test(str) +} +function isLinkClose(str: string) { + return /^<\/a\s*>/i.test(str) +} + +function isLetter(ch: number) { + /* eslint no-bitwise:0 */ + const lc = ch | 0x20 // to lower case + return (lc >= 0x61/* a */) && (lc <= 0x7A/* z */) +} + +export default function html_inline(state: StateInline, silent: boolean) { + // Check start + const max = state.posMax + const pos = state.pos + if (state.src.charCodeAt(pos) !== 0x3C + ||/* < */ pos + 2 >= max) { + return false + } + + // Quick fail on second char + const ch = state.src.charCodeAt(pos + 1) + if (ch !== 0x21 + &&/* ! */ ch !== 0x3F + &&/* ? 
*/ ch !== 0x2F + &&/* / */ !isLetter(ch)) { + return false + } + + const match = state.src.slice(pos).match(HTML_TAG_RE) + if (!match) + return false + + if (!silent) { + const token = state.push('html_inline', '', 0) + token.content = match[0] + + if (isLinkOpen(token.content)) + state.linkLevel++ + if (isLinkClose(token.content)) + state.linkLevel-- + } + state.pos += match[0].length + return true +} diff --git a/packages/comark/src/internal/parse/html_re.ts b/packages/comark/src/internal/parse/html_re.ts new file mode 100644 index 0000000..cb87544 --- /dev/null +++ b/packages/comark/src/internal/parse/html_re.ts @@ -0,0 +1,27 @@ +// Regexps to match html elements + +const attr_name = '[a-zA-Z_:][a-zA-Z0-9:._-]*' + +const unquoted = '[^"\'=<>`\\x00-\\x20]+' +const single_quoted = '\'[^\']*\'' +const double_quoted = '"[^"]*"' + +const attr_value = `(?:${unquoted}|${single_quoted}|${double_quoted})` + +const attribute = `(?:\\s+${attr_name}(?:\\s*=\\s*${attr_value})?)` + +const open_tag = `<[A-Za-z][A-Za-z0-9\\-]*${attribute}*\\s*\\/?>` + +const close_tag = '<\\/[A-Za-z][A-Za-z0-9\\-]*\\s*>' +const comment = '' +const processing = '<\\?[\\s\\S]*?\\?>' +const declaration = ']*>' +const cdata = '' + +// eslint-disable-next-line regexp/no-super-linear-backtracking, regexp/prefer-w +const HTML_TAG_RE = new RegExp(`^(?:${open_tag}|${close_tag}|${comment}|${processing}|${declaration}|${cdata})`) + +// eslint-disable-next-line regexp/use-ignore-case, regexp/no-super-linear-backtracking, regexp/prefer-w +const HTML_OPEN_CLOSE_TAG_RE = new RegExp(`^(?:${open_tag}|${close_tag})`) + +export { HTML_OPEN_CLOSE_TAG_RE, HTML_TAG_RE } diff --git a/packages/comark/src/internal/parse/incremental.ts b/packages/comark/src/internal/parse/incremental.ts index 98dfc5b..218027a 100644 --- a/packages/comark/src/internal/parse/incremental.ts +++ b/packages/comark/src/internal/parse/incremental.ts @@ -12,7 +12,7 @@ export function extractReusableNodes(markdown: string, lastOutput: 
ComarkTree) { let lastNodeIgnored = false while (i >= 0) { const node = lastOutput.nodes[i] as ComarkElement - if (node[1] && node[1].$comark?.line) { + if (node[1] && node[1].$?.line) { if (lastNodeIgnored) { lastValidNodeIndex = i break @@ -25,7 +25,7 @@ export function extractReusableNodes(markdown: string, lastOutput: ComarkTree) { } const lastNode = lastValidNodeIndex !== -1 ? lastOutput.nodes[lastValidNodeIndex] : null if (lastNode) { - const remainingMarkdownStartLine = (lastNode[1] as ComarkElementAttributes).$comark?.line ?? 0 + const remainingMarkdownStartLine = (lastNode[1] as ComarkElementAttributes).$?.line ?? 0 return { remainingMarkdownStartLine, reusedNodes: lastOutput.nodes.slice(0, lastValidNodeIndex + 1), diff --git a/packages/comark/src/internal/parse/token-processor.ts b/packages/comark/src/internal/parse/token-processor.ts index 09baa76..5533af1 100644 --- a/packages/comark/src/internal/parse/token-processor.ts +++ b/packages/comark/src/internal/parse/token-processor.ts @@ -1,9 +1,6 @@ -import type { ComarkNode } from 'comark/ast' +import type { ComarkElementAttributes, ComarkNode } from 'comark/ast' import { Parser } from 'htmlparser2' -// Set for the duration of each marmdownItTokensToComarkTree call (safe: JS is single-threaded) -let _parseInlineMarkdown: ((text: string) => ComarkNode[]) | undefined - // Mapping from token types to tag names const BLOCK_TAG_MAP: Record = { blockquote_open: 'blockquote', @@ -33,11 +30,12 @@ const VOID_ELEMENTS = new Set([ // ─── htmlparser2 helpers ──────────────────────────────────────────────────── -function attribsToComarkAttrs(attribs: Record): Record { +function attribsToComarkAttrs(attribs: Record, isInline: boolean = false): Record { const attrs: Record = { - $comark: { - html: 1 - } + $: { + html: 1, + block: isInline ? 
0 : 1, + }, } for (const key in attribs) { const value = attribs[key] @@ -78,12 +76,11 @@ function htmlToComarkNodes(html: string): ComarkNode[] { ontext(text) { const trimmed = text.trim() if (!trimmed) return - const nodes: ComarkNode[] = _parseInlineMarkdown ? _parseInlineMarkdown(trimmed) : [trimmed] if (stack.length > 0) { - stack[stack.length - 1].children.push(...nodes) + stack[stack.length - 1].children.push(trimmed) } else { - root.push(...nodes) + root.push(trimmed) } }, @@ -140,7 +137,7 @@ interface HtmlTagInfo { * Parse a single inline HTML tag fragment (opening, closing, or void). * Returns null if the content is not a recognisable HTML tag. */ -function parseHtmlTag(html: string): HtmlTagInfo | null { +function parseInlineHtmlTag(html: string): HtmlTagInfo | null { const trimmed = html.trim() if (!trimmed.startsWith('<')) return null @@ -155,7 +152,7 @@ function parseHtmlTag(html: string): HtmlTagInfo | null { onopentag(name, attribs) { info = { tag: name, - attrs: attribsToComarkAttrs(attribs), + attrs: attribsToComarkAttrs(attribs, true), isVoid: VOID_ELEMENTS.has(name), isClose: false, } @@ -167,45 +164,17 @@ function parseHtmlTag(html: string): HtmlTagInfo | null { return info } -// ─── html_block helper ────────────────────────────────────────────────────── - -function processHtmlBlock(content: string): ComarkNode[] { - const trimmed = content.trim() - if (trimmed.startsWith('') ? 
trimmed.slice(4, -3) : trimmed.slice(4) - return [[null, {}, inner] as unknown as ComarkNode] - } - return htmlToComarkNodes(content) -} - // ─── main entry point ─────────────────────────────────────────────────────── /** * Convert Markdown-It tokens to a Comark tree */ -export function marmdownItTokensToComarkTree(tokens: any[], options: { startLine: number, preservePositions: boolean, parseInlineMarkdown?: (text: string) => ComarkNode[] } = { startLine: 0, preservePositions: false }): ComarkNode[] { - _parseInlineMarkdown = options.parseInlineMarkdown +export function marmdownItTokensToComarkTree(tokens: any[], options: { startLine: number, preservePositions: boolean } = { startLine: 0, preservePositions: false }): ComarkNode[] { const nodes: ComarkNode[] = [] let i = 0 let endLine = options.startLine while (i < tokens.length) { - // html_block can produce multiple root nodes — handle before processBlockToken - if (tokens[i].type === 'html_block') { - const htmlNodes = processHtmlBlock(tokens[i].content) - if (options.preservePositions && tokens[i].map?.[1]) { - endLine = (tokens[i].map[1] as number) + options.startLine - for (const node of htmlNodes) { - if (Array.isArray(node) && node[1]) { - (node[1] as Record).$comark = { line: endLine } - } - } - } - nodes.push(...htmlNodes) - i++ - continue - } - const result = processBlockToken(tokens, i, false) if (result.node) { if (options.preservePositions) { @@ -214,10 +183,10 @@ export function marmdownItTokensToComarkTree(tokens: any[], options: { startLine endLine = (tokens[j].map[1] as number) + options.startLine } } - ;(result.node[1] as Record).$comark = { - ...((result.node[1] as Record).$comark || {}), - line: endLine, + if (!(result.node[1] as Record).$) { + (result.node[1] as Record).$ = {} } + ;((result.node[1] as Record).$ as Record).line = endLine } nodes.push(result.node) } @@ -438,8 +407,20 @@ function processBlockToken(tokens: any[], startIndex: number, insideNestedContex // processBlockChildren / 
processBlockChildrenWithSlots) before reaching here. // This branch is kept as a safety fallback. if (token.type === 'html_block') { - const nodes = processHtmlBlock(token.content) - return { node: nodes[0] ?? null, nextIndex: startIndex + 1 } + const content = token.content?.trim() || '' + if (content.startsWith('') ? content.slice(4, -3) : content.slice(4) + return { node: [null, {}, inner] as unknown as ComarkNode, nextIndex: startIndex + 1 } + } + + const children = processBlockChildren(tokens, startIndex + 1, 'html_block_close', false, false, false) + const [node1] = htmlToComarkNodes(content) + if (!node1) { + return { node: null, nextIndex: startIndex + 1 } + } + const node = [node1[0]!, node1[1]! as ComarkElementAttributes, ...children.nodes] as ComarkNode + + return { node, nextIndex: children.nextIndex + 1 } } // Handle Comark block components (e.g., ::component ... ::) @@ -624,7 +605,7 @@ function processBlockChildrenWithSlots( // html_block can produce multiple nodes — handle before processBlockToken if (token.type === 'html_block') { - const htmlNodes = processHtmlBlock(token.content) + const htmlNodes = htmlToComarkNodes(token.content) if (currentSlotName !== null) { currentSlotChildren.push(...htmlNodes) } @@ -705,7 +686,7 @@ function processBlockChildren( // html_block can produce multiple nodes — handle before processBlockToken if (token.type === 'html_block') { - nodes.push(...processHtmlBlock(token.content)) + nodes.push(...htmlToComarkNodes(token.content)) i++ continue } @@ -860,7 +841,7 @@ function processInlineToken(tokens: any[], startIndex: number, inHeading: boolea // Handle html_inline tokens using htmlparser2 if (token.type === 'html_inline') { const content = token.content || '' - const tagInfo = parseHtmlTag(content) + const tagInfo = parseInlineHtmlTag(content) if (!tagInfo) { // Not a recognisable tag — return as raw text @@ -884,7 +865,7 @@ function processInlineToken(tokens: any[], startIndex: number, inHeading: boolea while (j < 
tokens.length) { const nextToken = tokens[j] if (nextToken.type === 'html_inline') { - const nextInfo = parseHtmlTag(nextToken.content || '') + const nextInfo = parseInlineHtmlTag(nextToken.content || '') if (nextInfo?.isClose && nextInfo.tag === tagInfo.tag) { j++ // consume the closing tag break diff --git a/packages/comark/src/internal/stringify/handlers/html.ts b/packages/comark/src/internal/stringify/handlers/html.ts index 4bdf646..f29f490 100644 --- a/packages/comark/src/internal/stringify/handlers/html.ts +++ b/packages/comark/src/internal/stringify/handlers/html.ts @@ -10,12 +10,12 @@ const blockTags = new Set(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'ul', export function html(node: ComarkElement, state: State, parent?: ComarkElement) { const [tag, attr, ...children] = node - const { $comark, ...attributes } = attr + const { $ = {}, ...attributes } = attr const hasOnlyTextChildren = children.every(child => typeof child === 'string' || inlineTags.has(String(child?.[0]))) const hasTextSibling = children.some(child => typeof child === 'string') const isBlock = textBlocks.has(String(tag)) - const isInline = inlineTags.has(String(tag)) + const isInline = inlineTags.has(String(tag)) && $.block === 0 let oneLiner = isBlock && hasOnlyTextChildren @@ -31,6 +31,10 @@ export function html(node: ComarkElement, state: State, parent?: ComarkElement) oneLiner = true } + if ($.block === 0) { + oneLiner = true + } + const isSelfClose = selfCloseTags.has(String(tag)) // Do not modify context if we are already in html mode @@ -70,7 +74,7 @@ export function html(node: ComarkElement, state: State, parent?: ComarkElement) } if (!oneLiner && content) { - content = '\n' + paddNoneHtmlContent(content, state) + '\n' + content = '\n' + paddNoneHtmlContent(content, state).trimEnd() + '\n' } return `<${tag}${attrs}>${content}` @@ -83,8 +87,8 @@ function paddNoneHtmlContent(content: string, state: State) { } return ( - (content.trim().startsWith('<') ? 
'' : '\n') + (content.trim().startsWith('<') ? '' : '') + content - + (content.trim().endsWith('>') ? '' : '\n') + + (content.trim().endsWith('>') ? '' : '') ) } diff --git a/packages/comark/src/internal/stringify/handlers/mdc.ts b/packages/comark/src/internal/stringify/handlers/mdc.ts index ee2a1a3..5e4d1cc 100644 --- a/packages/comark/src/internal/stringify/handlers/mdc.ts +++ b/packages/comark/src/internal/stringify/handlers/mdc.ts @@ -9,7 +9,7 @@ const INLINE_HTML_ELEMENTS = new Set(['a', 'strong', 'em', 'span']) export function mdc(node: ComarkElement, state: State, parent?: ComarkElement) { const [tag, attr, ...children] = node - const { $comark, ...attributes } = attr + const { $, ...attributes } = attr if (tag === 'table') { return html(node, state) diff --git a/packages/comark/src/internal/stringify/state.ts b/packages/comark/src/internal/stringify/state.ts index ec1cd50..6e0fd3b 100644 --- a/packages/comark/src/internal/stringify/state.ts +++ b/packages/comark/src/internal/stringify/state.ts @@ -26,7 +26,7 @@ export function one(node: ComarkNode, state: State, parent?: ComarkElement) { return userHandler(node, state, parent) } - if (state.context.html) { + if (state.context.html || node[1].$?.html === 1) { return state.handlers.html(node, state, parent) } diff --git a/packages/comark/test/streaming.test.ts b/packages/comark/test/streaming.test.ts index a21bbdd..6268000 100644 --- a/packages/comark/test/streaming.test.ts +++ b/packages/comark/test/streaming.test.ts @@ -3,23 +3,23 @@ import { createParse } from 'comark' import type { ComarkElement } from 'comark/ast' describe('streaming mode', () => { - describe('$comark.line metadata', () => { + describe('$.line metadata', () => { it('preserves position metadata on nodes in streaming mode', async () => { const parse = createParse() const result = await parse('# Hello\n\nParagraph one.\n\nParagraph two.\n', { streaming: true }) const nodes = result.nodes as ComarkElement[] - 
expect(nodes[0][1].$comark?.line).toBeDefined() - expect(nodes[1][1].$comark?.line).toBeDefined() - expect(nodes[2][1].$comark?.line).toBeDefined() + expect(nodes[0][1].$?.line).toBeDefined() + expect(nodes[1][1].$?.line).toBeDefined() + expect(nodes[2][1].$?.line).toBeDefined() }) - it('does NOT add $comark.line metadata without streaming', async () => { + it('does NOT add $.line metadata without streaming', async () => { const parse = createParse() const result = await parse('# Hello\n\nParagraph one.\n') const nodes = result.nodes as ComarkElement[] - expect(nodes[0][1].$comark).toBeUndefined() + expect(nodes[0][1].$).toBeUndefined() }) it('line numbers are monotonically increasing', async () => { @@ -27,7 +27,7 @@ describe('streaming mode', () => { const result = await parse('# Heading\n\nPara 1\n\nPara 2\n\nPara 3\n', { streaming: true }) const nodes = result.nodes as ComarkElement[] - const lines = nodes.map(n => n[1].$comark?.line ?? 0) + const lines = nodes.map(n => n[1].$?.line ?? 
0) for (let i = 1; i < lines.length; i++) { expect(lines[i]).toBeGreaterThan(lines[i - 1]) } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b3d1d82..0d27333 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -417,6 +417,9 @@ importers: entities: specifier: ^4.5.0 version: 4.5.0 + htmlparser2: + specifier: ^9.0.0 + version: 9.1.0 js-yaml: specifier: ^4.1.1 version: 4.1.1 @@ -6627,6 +6630,9 @@ packages: html-whitespace-sensitive-tag-names@3.0.1: resolution: {integrity: sha512-q+310vW8zmymYHALr1da4HyXUQ0zgiIwIicEfotYPWGN0OJVEN/58IJ3A4GBYcEq3LGAZqKb+ugvP0GNB9CEAA==} + htmlparser2@9.1.0: + resolution: {integrity: sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==} + http-cache-semantics@4.2.0: resolution: {integrity: sha512-dTxcvPXqPvXBQpq5dUr6mEMJX4oIEFv6bwom3FDwKRDsuIjjJGANqhBuoAn9c1RQJIdAKav33ED65E2ys+87QQ==} @@ -17125,6 +17131,13 @@ snapshots: html-whitespace-sensitive-tag-names@3.0.1: {} + htmlparser2@9.1.0: + dependencies: + domelementtype: 2.3.0 + domhandler: 5.0.3 + domutils: 3.2.2 + entities: 4.5.0 + http-cache-semantics@4.2.0: {} http-errors@2.0.1: From e262efd2d5cd3aa2fc4a9ba3c3d3fef92f3494f7 Mon Sep 17 00:00:00 2001 From: Farnabaz Date: Wed, 11 Mar 2026 11:16:48 +0100 Subject: [PATCH 3/4] feat: option to disable html parsing --- docs/content/5.api/1.parse.md | 40 +++++++++++++++++++++++++++++++ docs/content/5.api/3.reference.md | 4 +++- packages/comark/src/index.ts | 20 ++++++++++++---- packages/comark/src/types.ts | 23 ++++++++++++++++++ 4 files changed, 82 insertions(+), 5 deletions(-) diff --git a/docs/content/5.api/1.parse.md b/docs/content/5.api/1.parse.md index 54578f3..4a6f79e 100644 --- a/docs/content/5.api/1.parse.md +++ b/docs/content/5.api/1.parse.md @@ -391,6 +391,46 @@ console.log(result.meta.summary) // ComarkNode[] with only the content before ``` +## HTML Parsing + +HTML tags embedded in Comark content are parsed into AST nodes by default and can be mixed freely with Comark components and 
markdown syntax. + +::code-group + +```typescript [parse.ts] +const content = ` +
+ ::alert{type="info"} + Hello world + :: +
+` + +const result = await parse(content) +console.log(result.nodes) +``` + +```json [Output] +[ + ["div", { "class": "note" }, + ["alert", { "type": "info" }, + "Hello ", + ["strong", { "class": "text-red-500" }, "world"] + ] + ] +] +``` + +:: + +To disable HTML parsing and treat tags as plain text, set `html: false`: + +```typescript [parse.ts] +const result = await parse(content, { html: false }) +``` + +--- + ## Error Handling ```typescript [parse.ts] diff --git a/docs/content/5.api/3.reference.md b/docs/content/5.api/3.reference.md index 5237f03..84b77e5 100644 --- a/docs/content/5.api/3.reference.md +++ b/docs/content/5.api/3.reference.md @@ -27,7 +27,8 @@ import { parse } from 'comark' const result = await parse(markdownContent, { autoUnwrap: true, // Remove

wrappers from single-paragraph containers - autoClose: true // Auto-close incomplete syntax + autoClose: true, // Auto-close incomplete syntax + html: true // Parse embedded HTML tags into AST nodes (default: true) }) // Returns: ComarkTree @@ -168,6 +169,7 @@ interface ComarkTree { interface ParseOptions { autoUnwrap?: boolean // Remove unnecessary

wrappers (default: true) autoClose?: boolean // Auto-close incomplete syntax (default: true) + html?: boolean // Parse embedded HTML tags into AST nodes (default: true) plugins?: ComarkPlugin[] // Array of plugins to apply } ``` diff --git a/packages/comark/src/index.ts b/packages/comark/src/index.ts index 2059bd0..09e0d31 100644 --- a/packages/comark/src/index.ts +++ b/packages/comark/src/index.ts @@ -42,6 +42,15 @@ export type * from './types' * const tree = await parse('# Hello **World**\n::alert\nhi\n::') * console.log(tree.nodes) * // → [ ['h1', { id: 'hello-world' }, 'Hello ', ['strong', {}, 'World'] ], ['alert', {}, 'hi'] ] + * + * // Enable HTML parsing (on by default) — HTML tags are included in the AST + * const parseWithHtml = createParse({ html: true }) + * const tree2 = await parseWithHtml('Hello _world_') + * console.log(tree2.nodes) + * // → [ ['strong', { class: 'bold' }, 'Hello'], ' ', ['em', {}, 'world'] ] + * + * // Disable HTML parsing — HTML tags are treated as plain text + * const parseNoHtml = createParse({ html: false }) * ``` */ export function createParse(options: ParseOptions = {}): ComarkParseFn { @@ -56,10 +65,13 @@ export function createParse(options: ParseOptions = {}): ComarkParseFn { }) .enable(['table', 'strikethrough']) .use(pluginMdc) - parser.inline.ruler.before('text', 'comark_html_inline', html_inline) - parser.block.ruler.before('html_block', 'comark_html_block', html_block, { - alt: ['paragraph', 'reference', 'blockquote'], - }) + + if (options.html !== false) { + parser.inline.ruler.before('text', 'comark_html_inline', html_inline) + parser.block.ruler.before('html_block', 'comark_html_block', html_block, { + alt: ['paragraph', 'reference', 'blockquote'], + }) + } for (const plugin of plugins) { for (const markdownItPlugin of (plugin.markdownItPlugins || [])) { diff --git a/packages/comark/src/types.ts b/packages/comark/src/types.ts index 05ac6c7..d2ade1b 100644 --- a/packages/comark/src/types.ts +++ 
b/packages/comark/src/types.ts @@ -56,6 +56,29 @@ export interface ParseOptions { */ autoClose?: boolean + /** + * Whether to parse HTML tags embedded in Comark/markdown content. + * When enabled, HTML block and inline elements are parsed into AST nodes and can be + * mixed freely with Comark components and markdown syntax. + * + * @default true + * @example + * // With html: true (default) — HTML is parsed into AST nodes + * // Input: `<strong class="bold">text</strong>` + * // AST: ['strong', { class: 'bold' }, 'text'] + * + * // HTML can be mixed with Comark components: + * // Input: + * //

+ * // ::alert + * // Hello world + * // :: + * //
+ * + * // With html: false — HTML tags are left as raw text / ignored + */ + html?: boolean + /** * Additional plugins to use * @default [] From 76b6a1731614c30a40934b0eab865e07d2e82e7b Mon Sep 17 00:00:00 2001 From: Farnabaz Date: Wed, 11 Mar 2026 11:25:42 +0100 Subject: [PATCH 4/4] up --- packages/comark/src/index.ts | 4 +- .../parse/{ => html}/html_block_rule.ts | 0 .../internal/parse/{ => html}/html_blocks.ts | 0 .../parse/{ => html}/html_inline_rule.ts | 0 .../src/internal/parse/{ => html}/html_re.ts | 0 .../comark/src/internal/parse/html/index.ts | 141 +++++++++++++++++ .../src/internal/parse/token-processor.ts | 143 +----------------- 7 files changed, 144 insertions(+), 144 deletions(-) rename packages/comark/src/internal/parse/{ => html}/html_block_rule.ts (100%) rename packages/comark/src/internal/parse/{ => html}/html_blocks.ts (100%) rename packages/comark/src/internal/parse/{ => html}/html_inline_rule.ts (100%) rename packages/comark/src/internal/parse/{ => html}/html_re.ts (100%) create mode 100644 packages/comark/src/internal/parse/html/index.ts diff --git a/packages/comark/src/index.ts b/packages/comark/src/index.ts index 09e0d31..8bdcac0 100644 --- a/packages/comark/src/index.ts +++ b/packages/comark/src/index.ts @@ -9,8 +9,8 @@ import { marmdownItTokensToComarkTree } from './internal/parse/token-processor' import { autoCloseMarkdown } from './internal/parse/auto-close/index' import { parseFrontmatter } from './internal/front-matter' import { extractReusableNodes } from './internal/parse/incremental' -import html_block from './internal/parse/html_block_rule' -import html_inline from './internal/parse/html_inline_rule' +import html_block from './internal/parse/html/html_block_rule' +import html_inline from './internal/parse/html/html_inline_rule' // Re-export ComarkTree and ComarkNode for convenience export type { ComarkTree, ComarkNode } from 'comark/ast' diff --git a/packages/comark/src/internal/parse/html_block_rule.ts 
b/packages/comark/src/internal/parse/html/html_block_rule.ts similarity index 100% rename from packages/comark/src/internal/parse/html_block_rule.ts rename to packages/comark/src/internal/parse/html/html_block_rule.ts diff --git a/packages/comark/src/internal/parse/html_blocks.ts b/packages/comark/src/internal/parse/html/html_blocks.ts similarity index 100% rename from packages/comark/src/internal/parse/html_blocks.ts rename to packages/comark/src/internal/parse/html/html_blocks.ts diff --git a/packages/comark/src/internal/parse/html_inline_rule.ts b/packages/comark/src/internal/parse/html/html_inline_rule.ts similarity index 100% rename from packages/comark/src/internal/parse/html_inline_rule.ts rename to packages/comark/src/internal/parse/html/html_inline_rule.ts diff --git a/packages/comark/src/internal/parse/html_re.ts b/packages/comark/src/internal/parse/html/html_re.ts similarity index 100% rename from packages/comark/src/internal/parse/html_re.ts rename to packages/comark/src/internal/parse/html/html_re.ts diff --git a/packages/comark/src/internal/parse/html/index.ts b/packages/comark/src/internal/parse/html/index.ts new file mode 100644 index 0000000..44d24ad --- /dev/null +++ b/packages/comark/src/internal/parse/html/index.ts @@ -0,0 +1,141 @@ +import { Parser } from 'htmlparser2' +import type { ComarkNode } from '../../../ast' + +const VOID_ELEMENTS = new Set([ + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', + 'link', 'meta', 'param', 'source', 'track', 'wbr', +]) + +function attribsToComarkAttrs(attribs: Record, isInline: boolean = false): Record { + const attrs: Record = { + $: { + html: 1, + block: isInline ? 
0 : 1, + }, + } + for (const key in attribs) { + const value = attribs[key] + if (value === '') { + attrs[`:${key}`] = 'true' + } + else { + attrs[key] = value + } + } + return attrs +} + +interface HtmlTagInfo { + tag: string + attrs: Record + isVoid: boolean + isClose: boolean +} + +/** + * Parse a single inline HTML tag fragment (opening, closing, or void). + * Returns null if the content is not a recognisable HTML tag. + */ +export function parseInlineHtmlTag(html: string): HtmlTagInfo | null { + const trimmed = html.trim() + if (!trimmed.startsWith('<')) return null + + // Fast path: closing tag + const closeMatch = trimmed.match(/^<\/([a-z][a-z0-9]*)\s*>/i) + if (closeMatch) { + return { tag: closeMatch[1].toLowerCase(), attrs: {}, isVoid: false, isClose: true } + } + + let info: HtmlTagInfo | null = null + const parser = new Parser({ + onopentag(name, attribs) { + info = { + tag: name, + attrs: attribsToComarkAttrs(attribs, true), + isVoid: VOID_ELEMENTS.has(name), + isClose: false, + } + }, + }, { decodeEntities: false }) + + parser.write(trimmed) + parser.end() + return info +} + +/** + * Parse a full HTML string into ComarkNodes using htmlparser2. + * Handles nested elements, text, void elements, and comments. 
+ */ +export function htmlToComarkNodes(html: string): ComarkNode[] { + const root: ComarkNode[] = [] + const stack: { tag: string, attrs: Record, children: ComarkNode[] }[] = [] + + const parser = new Parser({ + onopentag(name, attribs) { + const attrs = attribsToComarkAttrs(attribs) + if (VOID_ELEMENTS.has(name)) { + const node = [name, attrs] as ComarkNode + if (stack.length > 0) { + stack[stack.length - 1].children.push(node) + } + else { + root.push(node) + } + return + } + stack.push({ tag: name, attrs, children: [] }) + }, + + ontext(text) { + const trimmed = text.trim() + if (!trimmed) return + if (stack.length > 0) { + stack[stack.length - 1].children.push(trimmed) + } + else { + root.push(trimmed) + } + }, + + onclosetag(name) { + if (VOID_ELEMENTS.has(name)) { + return + } + // Find matching frame (handles mismatched tags gracefully) + let idx = stack.length - 1 + while (idx >= 0 && stack[idx].tag !== name) { + idx-- + } + if (idx >= 0) { + while (stack.length > idx) { + const frame = stack.pop()! + const node = frame.children.length > 0 + ? 
[frame.tag, frame.attrs, ...frame.children] as ComarkNode + : [frame.tag, frame.attrs] as ComarkNode + if (stack.length > 0) { + stack[stack.length - 1].children.push(node) + } + else { + root.push(node) + } + } + } + }, + + oncomment(data) { + const node = [null, {}, data] as unknown as ComarkNode + if (stack.length > 0) { + stack[stack.length - 1].children.push(node) + } + else { + root.push(node) + } + }, + }, { decodeEntities: true }) + + parser.write(html.trim()) + parser.end() + + return root +} diff --git a/packages/comark/src/internal/parse/token-processor.ts b/packages/comark/src/internal/parse/token-processor.ts index 5533af1..fa6fccc 100644 --- a/packages/comark/src/internal/parse/token-processor.ts +++ b/packages/comark/src/internal/parse/token-processor.ts @@ -1,5 +1,5 @@ import type { ComarkElementAttributes, ComarkNode } from 'comark/ast' -import { Parser } from 'htmlparser2' +import { htmlToComarkNodes, parseInlineHtmlTag } from './html' // Mapping from token types to tag names const BLOCK_TAG_MAP: Record = { @@ -23,147 +23,6 @@ const INLINE_TAG_MAP: Record = { sub_open: 'del', } -const VOID_ELEMENTS = new Set([ - 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', - 'link', 'meta', 'param', 'source', 'track', 'wbr', -]) - -// ─── htmlparser2 helpers ──────────────────────────────────────────────────── - -function attribsToComarkAttrs(attribs: Record, isInline: boolean = false): Record { - const attrs: Record = { - $: { - html: 1, - block: isInline ? 0 : 1, - }, - } - for (const key in attribs) { - const value = attribs[key] - if (value === '') { - attrs[`:${key}`] = 'true' - } - else { - attrs[key] = value - } - } - return attrs -} - -/** - * Parse a full HTML string into ComarkNodes using htmlparser2. - * Handles nested elements, text, void elements, and comments. 
- */ -function htmlToComarkNodes(html: string): ComarkNode[] { - const root: ComarkNode[] = [] - const stack: { tag: string, attrs: Record, children: ComarkNode[] }[] = [] - - const parser = new Parser({ - onopentag(name, attribs) { - const attrs = attribsToComarkAttrs(attribs) - if (VOID_ELEMENTS.has(name)) { - const node = [name, attrs] as ComarkNode - if (stack.length > 0) { - stack[stack.length - 1].children.push(node) - } - else { - root.push(node) - } - return - } - stack.push({ tag: name, attrs, children: [] }) - }, - - ontext(text) { - const trimmed = text.trim() - if (!trimmed) return - if (stack.length > 0) { - stack[stack.length - 1].children.push(trimmed) - } - else { - root.push(trimmed) - } - }, - - onclosetag(name) { - if (VOID_ELEMENTS.has(name)) { - return - } - // Find matching frame (handles mismatched tags gracefully) - let idx = stack.length - 1 - while (idx >= 0 && stack[idx].tag !== name) { - idx-- - } - if (idx >= 0) { - while (stack.length > idx) { - const frame = stack.pop()! - const node = frame.children.length > 0 - ? [frame.tag, frame.attrs, ...frame.children] as ComarkNode - : [frame.tag, frame.attrs] as ComarkNode - if (stack.length > 0) { - stack[stack.length - 1].children.push(node) - } - else { - root.push(node) - } - } - } - }, - - oncomment(data) { - const node = [null, {}, data] as unknown as ComarkNode - if (stack.length > 0) { - stack[stack.length - 1].children.push(node) - } - else { - root.push(node) - } - }, - }, { decodeEntities: true }) - - parser.write(html.trim()) - parser.end() - - return root -} - -interface HtmlTagInfo { - tag: string - attrs: Record - isVoid: boolean - isClose: boolean -} - -/** - * Parse a single inline HTML tag fragment (opening, closing, or void). - * Returns null if the content is not a recognisable HTML tag. 
- */ -function parseInlineHtmlTag(html: string): HtmlTagInfo | null { - const trimmed = html.trim() - if (!trimmed.startsWith('<')) return null - - // Fast path: closing tag - const closeMatch = trimmed.match(/^<\/([a-z][a-z0-9]*)\s*>/i) - if (closeMatch) { - return { tag: closeMatch[1].toLowerCase(), attrs: {}, isVoid: false, isClose: true } - } - - let info: HtmlTagInfo | null = null - const parser = new Parser({ - onopentag(name, attribs) { - info = { - tag: name, - attrs: attribsToComarkAttrs(attribs, true), - isVoid: VOID_ELEMENTS.has(name), - isClose: false, - } - }, - }, { decodeEntities: false }) - - parser.write(trimmed) - parser.end() - return info -} - // ─── main entry point ─────────────────────────────────────────────────────── /**