diff --git a/docs/content/5.api/1.parse.md b/docs/content/5.api/1.parse.md index 54578f3..4a6f79e 100644 --- a/docs/content/5.api/1.parse.md +++ b/docs/content/5.api/1.parse.md @@ -391,6 +391,46 @@ console.log(result.meta.summary) // ComarkNode[] with only the content before ``` +## HTML Parsing + +HTML tags embedded in Comark content are parsed into AST nodes by default and can be mixed freely with Comark components and markdown syntax. + +::code-group + +```typescript [parse.ts] +const content = ` +
<div class="note"> + ::alert{type="info"} + Hello <strong class="text-red-500">world</strong> + :: + </div>
+` + +const result = await parse(content) +console.log(result.nodes) +``` + +```json [Output] +[ + ["div", { "class": "note" }, + ["alert", { "type": "info" }, + "Hello ", + ["strong", { "class": "text-red-500" }, "world"] + ] + ] +] +``` + +:: + +To disable HTML parsing and treat tags as plain text, set `html: false`: + +```typescript [parse.ts] +const result = await parse(content, { html: false }) +``` + +--- + ## Error Handling ```typescript [parse.ts] diff --git a/docs/content/5.api/3.reference.md b/docs/content/5.api/3.reference.md index 5237f03..84b77e5 100644 --- a/docs/content/5.api/3.reference.md +++ b/docs/content/5.api/3.reference.md @@ -27,7 +27,8 @@ import { parse } from 'comark' const result = await parse(markdownContent, { autoUnwrap: true, // Remove

<p> wrappers from single-paragraph containers - autoClose: true // Auto-close incomplete syntax + autoClose: true, // Auto-close incomplete syntax + html: true // Parse embedded HTML tags into AST nodes (default: true) }) // Returns: ComarkTree @@ -168,6 +169,7 @@ interface ComarkTree { interface ParseOptions { autoUnwrap?: boolean // Remove unnecessary <p>

wrappers (default: true) autoClose?: boolean // Auto-close incomplete syntax (default: true) + html?: boolean // Parse embedded HTML tags into AST nodes (default: true) plugins?: ComarkPlugin[] // Array of plugins to apply } ``` diff --git a/packages/comark-react/src/components/ComarkRenderer.tsx b/packages/comark-react/src/components/ComarkRenderer.tsx index ba07c47..ee87006 100644 --- a/packages/comark-react/src/components/ComarkRenderer.tsx +++ b/packages/comark-react/src/components/ComarkRenderer.tsx @@ -159,7 +159,7 @@ function renderNode( // Parse special prop values (props starting with :) for (const [propKey, value] of Object.entries(nodeProps)) { - if (propKey === '$comark') { + if (propKey === '$') { Reflect.deleteProperty(props, propKey) } if (propKey === 'style') { diff --git a/packages/comark-vue/src/components/ComarkRenderer.ts b/packages/comark-vue/src/components/ComarkRenderer.ts index 4be8124..a5630d4 100644 --- a/packages/comark-vue/src/components/ComarkRenderer.ts +++ b/packages/comark-vue/src/components/ComarkRenderer.ts @@ -123,7 +123,7 @@ function renderNode( // Prepare props — use for...in instead of Object.entries() to avoid intermediate array allocation const props: Record = {} for (const k in nodeProps) { - if (k === '$comark') { + if (k === '$') { continue } if (k === 'className') { diff --git a/packages/comark/SPEC/HTML/block+component.md b/packages/comark/SPEC/HTML/block+component.md new file mode 100644 index 0000000..a83920c --- /dev/null +++ b/packages/comark/SPEC/HTML/block+component.md @@ -0,0 +1,58 @@ +--- +timeout: + parse: 5ms + html: 5ms + markdown: 5ms +--- + +## Input + +```md + +::component +Default Slot +:: + +``` + +## AST + +```json +{ + "frontmatter": {}, + "meta": {}, + "nodes": [ + [ + "hello", + { + "$": { "html": 1, "block": 1 } + }, + [ + "component", + {}, + "Default Slot" + ] + ] + ] +} +``` + +## HTML + +```html + + + Default Slot + + +``` + +## Markdown + +```md + + ::component + Default Slot + :: + +``` diff 
--git a/packages/comark/SPEC/HTML/block.md b/packages/comark/SPEC/HTML/block.md new file mode 100644 index 0000000..161875e --- /dev/null +++ b/packages/comark/SPEC/HTML/block.md @@ -0,0 +1,53 @@ +--- +timeout: + parse: 5ms + html: 5ms + markdown: 5ms +--- + +## Input + +```md + +Hello **World** + +``` + +## AST + +```json +{ + "frontmatter": {}, + "meta": {}, + "nodes": [ + [ + "hello", + { + "$": { "html": 1, "block": 1 } + }, + "Hello ", + [ + "strong", + {}, + "World" + ] + ] + ] +} +``` + +## HTML + +```html + + Hello World + +``` + +## Markdown + +```md + +Hello **World** + +``` diff --git a/packages/comark/SPEC/HTML/inline.md b/packages/comark/SPEC/HTML/inline.md new file mode 100644 index 0000000..374ccf3 --- /dev/null +++ b/packages/comark/SPEC/HTML/inline.md @@ -0,0 +1,53 @@ +--- +timeout: + parse: 5ms + html: 5ms + markdown: 5ms +--- + +## Input + +```md +Hello **World** +``` + +## AST + +```json +{ + "frontmatter": {}, + "meta": {}, + "nodes": [ + [ + "p", + {}, + [ + "hello", + { + "$": { "html": 1, "block": 0 } + }, + "Hello ", + [ + "strong", + {}, + "World" + ] + ] + ] + ] +} +``` + +## HTML + +```html +

<p> + <hello>Hello <strong>World</strong></hello> + </p>

+``` + +## Markdown + +```md +Hello **World** +``` diff --git a/packages/comark/SPEC/HTML/mix+paragraph.md b/packages/comark/SPEC/HTML/mix+paragraph.md new file mode 100644 index 0000000..d5bb521 --- /dev/null +++ b/packages/comark/SPEC/HTML/mix+paragraph.md @@ -0,0 +1,62 @@ +--- +timeout: + parse: 5ms + html: 5ms + markdown: 5ms +--- + +## Input + +```md + +Hello **World** + +Another Pragraph +``` + +## AST + +```json +{ + "frontmatter": {}, + "meta": {}, + "nodes": [ + [ + "hello", + { + "$": { "html": 1, "block": 1 } + }, + "Hello ", + [ + "strong", + {}, + "World" + ] + ], + [ + "p", + {}, + "Another Pragraph" + ] + ] +} +``` + +## HTML + +```html + + Hello World + +

<p>Another Pragraph</p>

+``` + +## Markdown + +```md + +Hello **World** + + +Another Pragraph +``` diff --git a/packages/comark/SPEC/HTML/mix.md b/packages/comark/SPEC/HTML/mix.md new file mode 100644 index 0000000..47690f0 --- /dev/null +++ b/packages/comark/SPEC/HTML/mix.md @@ -0,0 +1,132 @@ +--- +timeout: + parse: 5ms + html: 5ms + markdown: 5ms +--- + +## Input + +```md + + ::comp2 + #title + This is the title of `comp2` component + + #default + In this paragraph, we [mix html and _markdonw_] + :: + +``` + +## AST + +```json +{ + "frontmatter": {}, + "meta": {}, + "nodes": [ + [ + "comp1", + { "$": { "html": 1, "block": 1 } }, + [ + "comp2", + {}, + [ + "template", + { "name": "title" }, + "This is the title of ", + [ + "code", + {}, + "comp2" + ], + " component" + ], + [ + "template", + { + "name": "default" + }, + "In this ", + [ + "strong", + { + "$": { + "block": 0, + "html": 1 + }, + "class": "text-red-500" + }, + "paragraph" + ], + ", we ", + [ + "span", + {}, + "mix ", + [ + "sub", + { + "$": { + "block": 0, + "html": 1 + } + }, + "html" + ], + " and ", + [ + "em", + {}, + [ + "sub", + { + "$": { + "block": 0, + "html": 1 + } + }, + "markdonw" + ] + ] + ] + ] + ] + ] + ] +} +``` + +## HTML + +```html + + + + + + +``` + +## Markdown + +```md + + ::comp2 + #title + This is the title of `comp2` component + + #default + In this paragraph, we [mix html and *markdonw*] + :: + +``` diff --git a/packages/comark/package.json b/packages/comark/package.json index 7b99774..976a096 100644 --- a/packages/comark/package.json +++ b/packages/comark/package.json @@ -71,6 +71,7 @@ "@comark/markdown-it": "^0.3.2", "entities": "^4.5.0", "js-yaml": "^4.1.1", + "htmlparser2": "^9.0.0", "markdown-exit": "1.0.0-beta.9" } } diff --git a/packages/comark/src/ast/types.ts b/packages/comark/src/ast/types.ts index 406bcac..fc32a86 100644 --- a/packages/comark/src/ast/types.ts +++ b/packages/comark/src/ast/types.ts @@ -5,7 +5,11 @@ export type ComarkComment = [null, {}, string] export type 
ComarkElementAttributes = { [key: string]: unknown - $comark?: { line?: number } + $?: { + line?: number + html?: 0 | 1 + block?: 0 | 1 + } } export type ComarkElement = [string, ComarkElementAttributes, ...ComarkNode[]] diff --git a/packages/comark/src/index.ts b/packages/comark/src/index.ts index ab0e457..8bdcac0 100644 --- a/packages/comark/src/index.ts +++ b/packages/comark/src/index.ts @@ -9,6 +9,8 @@ import { marmdownItTokensToComarkTree } from './internal/parse/token-processor' import { autoCloseMarkdown } from './internal/parse/auto-close/index' import { parseFrontmatter } from './internal/front-matter' import { extractReusableNodes } from './internal/parse/incremental' +import html_block from './internal/parse/html/html_block_rule' +import html_inline from './internal/parse/html/html_inline_rule' // Re-export ComarkTree and ComarkNode for convenience export type { ComarkTree, ComarkNode } from 'comark/ast' @@ -40,6 +42,15 @@ export type * from './types' * const tree = await parse('# Hello **World**\n::alert\nhi\n::') * console.log(tree.nodes) * // → [ ['h1', { id: 'hello-world' }, 'Hello ', ['strong', {}, 'World'] ], ['alert', {}, 'hi'] ] + * + * // Enable HTML parsing (on by default) — HTML tags are included in the AST + * const parseWithHtml = createParse({ html: true }) + * const tree2 = await parseWithHtml('Hello _world_') + * console.log(tree2.nodes) + * // → [ ['strong', { class: 'bold' }, 'Hello'], ' ', ['em', {}, 'world'] ] + * + * // Disable HTML parsing — HTML tags are treated as plain text + * const parseNoHtml = createParse({ html: false }) * ``` */ export function createParse(options: ParseOptions = {}): ComarkParseFn { @@ -49,12 +60,19 @@ export function createParse(options: ParseOptions = {}): ComarkParseFn { plugins.unshift(alert()) const parser = new MarkdownExit({ - html: true, + html: false, linkify: true, }) .enable(['table', 'strikethrough']) .use(pluginMdc) + if (options.html !== false) { + parser.inline.ruler.before('text', 
'comark_html_inline', html_inline) + parser.block.ruler.before('html_block', 'comark_html_block', html_block, { + alt: ['paragraph', 'reference', 'blockquote'], + }) + } + for (const plugin of plugins) { for (const markdownItPlugin of (plugin.markdownItPlugins || [])) { parser.use(markdownItPlugin as unknown as MarkdownExitPlugin) diff --git a/packages/comark/src/internal/parse/html/html_block_rule.ts b/packages/comark/src/internal/parse/html/html_block_rule.ts new file mode 100644 index 0000000..f63b036 --- /dev/null +++ b/packages/comark/src/internal/parse/html/html_block_rule.ts @@ -0,0 +1,76 @@ +// BASED ON https://github.com/serkodev/markdown-exit/blob/fe1351070a5841426223ab4a0a5c7874ba2b1257/packages/markdown-exit/src/parser/block/rules/html_block.ts + +import type { StateBlock } from 'markdown-exit' +import block_names from './html_blocks' +import { HTML_OPEN_CLOSE_TAG_RE } from './html_re' + +// An array of opening and corresponding closing sequences for html tags, +// last argument defines whether it can terminate a paragraph or not +// +const HTML_SEQUENCES: [RegExp, RegExp, boolean][] = [ + [new RegExp(`${HTML_OPEN_CLOSE_TAG_RE.source}\\s*$`), /^<\/[^>]+>$/, true], + [/^<(script|pre|style|textarea)(?=(\s|>|$))/i, /<\/(script|pre|style|textarea)>/i, true], + [/^/, true], + [/^<\?/, /\?>/, true], + [/^/, true], + [/^/, true], + [new RegExp(`^|$))`, 'i'), /^$/, true], + [new RegExp(`${HTML_OPEN_CLOSE_TAG_RE.source}\\s*$`), /^$/, false], +] + +export default function html_block(state: StateBlock, startLine: number, endLine: number, silent: boolean) { + let pos = state.bMarks[startLine] + state.tShift[startLine] + let max = state.eMarks[startLine] + + // if it's indented more than 3 spaces, it should be a code block + if (state.sCount[startLine] - state.blkIndent >= 4) + return false + + if (state.src.charCodeAt(pos) !== 0x3C/* < */) + return false + + let lineText = state.src.slice(pos, max) + + let i = 0 + for (; i < HTML_SEQUENCES.length; i++) { + if 
(HTML_SEQUENCES[i][0].test(lineText)) + break + } + + if (i === HTML_SEQUENCES.length) + return false + + if (silent) { + // true if this sequence can be a terminator, false otherwise + return HTML_SEQUENCES[i][2] + } + + let nextLine = startLine + 1 + + // If we are here - we detected HTML block. + // Let's roll down till block end. + if (i !== 0 && !HTML_SEQUENCES[i][1].test(lineText)) { + for (; nextLine < endLine; nextLine++) { + if (state.sCount[nextLine] < state.blkIndent) { + break + } + + pos = state.bMarks[nextLine] + state.tShift[nextLine] + max = state.eMarks[nextLine] + lineText = state.src.slice(pos, max) + + if (HTML_SEQUENCES[i][1].test(lineText)) { + if (lineText.length !== 0) + nextLine++ + break + } + } + } + state.line = nextLine + + const token = lineText.startsWith('\s]/i.test(str) +} +function isLinkClose(str: string) { + return /^<\/a\s*>/i.test(str) +} + +function isLetter(ch: number) { + /* eslint no-bitwise:0 */ + const lc = ch | 0x20 // to lower case + return (lc >= 0x61/* a */) && (lc <= 0x7A/* z */) +} + +export default function html_inline(state: StateInline, silent: boolean) { + // Check start + const max = state.posMax + const pos = state.pos + if (state.src.charCodeAt(pos) !== 0x3C + ||/* < */ pos + 2 >= max) { + return false + } + + // Quick fail on second char + const ch = state.src.charCodeAt(pos + 1) + if (ch !== 0x21 + &&/* ! */ ch !== 0x3F + &&/* ? 
*/ ch !== 0x2F + &&/* / */ !isLetter(ch)) { + return false + } + + const match = state.src.slice(pos).match(HTML_TAG_RE) + if (!match) + return false + + if (!silent) { + const token = state.push('html_inline', '', 0) + token.content = match[0] + + if (isLinkOpen(token.content)) + state.linkLevel++ + if (isLinkClose(token.content)) + state.linkLevel-- + } + state.pos += match[0].length + return true +} diff --git a/packages/comark/src/internal/parse/html/html_re.ts b/packages/comark/src/internal/parse/html/html_re.ts new file mode 100644 index 0000000..cb87544 --- /dev/null +++ b/packages/comark/src/internal/parse/html/html_re.ts @@ -0,0 +1,27 @@ +// Regexps to match html elements + +const attr_name = '[a-zA-Z_:][a-zA-Z0-9:._-]*' + +const unquoted = '[^"\'=<>`\\x00-\\x20]+' +const single_quoted = '\'[^\']*\'' +const double_quoted = '"[^"]*"' + +const attr_value = `(?:${unquoted}|${single_quoted}|${double_quoted})` + +const attribute = `(?:\\s+${attr_name}(?:\\s*=\\s*${attr_value})?)` + +const open_tag = `<[A-Za-z][A-Za-z0-9\\-]*${attribute}*\\s*\\/?>` + +const close_tag = '<\\/[A-Za-z][A-Za-z0-9\\-]*\\s*>' +const comment = '' +const processing = '<\\?[\\s\\S]*?\\?>' +const declaration = ']*>' +const cdata = '' + +// eslint-disable-next-line regexp/no-super-linear-backtracking, regexp/prefer-w +const HTML_TAG_RE = new RegExp(`^(?:${open_tag}|${close_tag}|${comment}|${processing}|${declaration}|${cdata})`) + +// eslint-disable-next-line regexp/use-ignore-case, regexp/no-super-linear-backtracking, regexp/prefer-w +const HTML_OPEN_CLOSE_TAG_RE = new RegExp(`^(?:${open_tag}|${close_tag})`) + +export { HTML_OPEN_CLOSE_TAG_RE, HTML_TAG_RE } diff --git a/packages/comark/src/internal/parse/html/index.ts b/packages/comark/src/internal/parse/html/index.ts new file mode 100644 index 0000000..44d24ad --- /dev/null +++ b/packages/comark/src/internal/parse/html/index.ts @@ -0,0 +1,141 @@ +import { Parser } from 'htmlparser2' +import type { ComarkNode } from '../../../ast' + 
+const VOID_ELEMENTS = new Set([ + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', + 'link', 'meta', 'param', 'source', 'track', 'wbr', +]) + +function attribsToComarkAttrs(attribs: Record, isInline: boolean = false): Record { + const attrs: Record = { + $: { + html: 1, + block: isInline ? 0 : 1, + }, + } + for (const key in attribs) { + const value = attribs[key] + if (value === '') { + attrs[`:${key}`] = 'true' + } + else { + attrs[key] = value + } + } + return attrs +} + +interface HtmlTagInfo { + tag: string + attrs: Record + isVoid: boolean + isClose: boolean +} + +/** + * Parse a single inline HTML tag fragment (opening, closing, or void). + * Returns null if the content is not a recognisable HTML tag. + */ +export function parseInlineHtmlTag(html: string): HtmlTagInfo | null { + const trimmed = html.trim() + if (!trimmed.startsWith('<')) return null + + // Fast path: closing tag + const closeMatch = trimmed.match(/^<\/([a-z][a-z0-9]*)\s*>/i) + if (closeMatch) { + return { tag: closeMatch[1].toLowerCase(), attrs: {}, isVoid: false, isClose: true } + } + + let info: HtmlTagInfo | null = null + const parser = new Parser({ + onopentag(name, attribs) { + info = { + tag: name, + attrs: attribsToComarkAttrs(attribs, true), + isVoid: VOID_ELEMENTS.has(name), + isClose: false, + } + }, + }, { decodeEntities: false }) + + parser.write(trimmed) + parser.end() + return info +} + +/** + * Parse a full HTML string into ComarkNodes using htmlparser2. + * Handles nested elements, text, void elements, and comments. 
+ */ +export function htmlToComarkNodes(html: string): ComarkNode[] { + const root: ComarkNode[] = [] + const stack: { tag: string, attrs: Record, children: ComarkNode[] }[] = [] + + const parser = new Parser({ + onopentag(name, attribs) { + const attrs = attribsToComarkAttrs(attribs) + if (VOID_ELEMENTS.has(name)) { + const node = [name, attrs] as ComarkNode + if (stack.length > 0) { + stack[stack.length - 1].children.push(node) + } + else { + root.push(node) + } + return + } + stack.push({ tag: name, attrs, children: [] }) + }, + + ontext(text) { + const trimmed = text.trim() + if (!trimmed) return + if (stack.length > 0) { + stack[stack.length - 1].children.push(trimmed) + } + else { + root.push(trimmed) + } + }, + + onclosetag(name) { + if (VOID_ELEMENTS.has(name)) { + return + } + // Find matching frame (handles mismatched tags gracefully) + let idx = stack.length - 1 + while (idx >= 0 && stack[idx].tag !== name) { + idx-- + } + if (idx >= 0) { + while (stack.length > idx) { + const frame = stack.pop()! + const node = frame.children.length > 0 + ? 
[frame.tag, frame.attrs, ...frame.children] as ComarkNode + : [frame.tag, frame.attrs] as ComarkNode + if (stack.length > 0) { + stack[stack.length - 1].children.push(node) + } + else { + root.push(node) + } + } + } + }, + + oncomment(data) { + const node = [null, {}, data] as unknown as ComarkNode + if (stack.length > 0) { + stack[stack.length - 1].children.push(node) + } + else { + root.push(node) + } + }, + }, { decodeEntities: true }) + + parser.write(html.trim()) + parser.end() + + return root +} diff --git a/packages/comark/src/internal/parse/incremental.ts b/packages/comark/src/internal/parse/incremental.ts index 98dfc5b..218027a 100644 --- a/packages/comark/src/internal/parse/incremental.ts +++ b/packages/comark/src/internal/parse/incremental.ts @@ -12,7 +12,7 @@ export function extractReusableNodes(markdown: string, lastOutput: ComarkTree) { let lastNodeIgnored = false while (i >= 0) { const node = lastOutput.nodes[i] as ComarkElement - if (node[1] && node[1].$comark?.line) { + if (node[1] && node[1].$?.line) { if (lastNodeIgnored) { lastValidNodeIndex = i break @@ -25,7 +25,7 @@ export function extractReusableNodes(markdown: string, lastOutput: ComarkTree) { } const lastNode = lastValidNodeIndex !== -1 ? lastOutput.nodes[lastValidNodeIndex] : null if (lastNode) { - const remainingMarkdownStartLine = (lastNode[1] as ComarkElementAttributes).$comark?.line ?? 0 + const remainingMarkdownStartLine = (lastNode[1] as ComarkElementAttributes).$?.line ?? 
0 return { remainingMarkdownStartLine, reusedNodes: lastOutput.nodes.slice(0, lastValidNodeIndex + 1), diff --git a/packages/comark/src/internal/parse/token-processor.ts b/packages/comark/src/internal/parse/token-processor.ts index 0b6d7a0..fa6fccc 100644 --- a/packages/comark/src/internal/parse/token-processor.ts +++ b/packages/comark/src/internal/parse/token-processor.ts @@ -1,4 +1,5 @@ -import type { ComarkNode } from 'comark/ast' +import type { ComarkElementAttributes, ComarkNode } from 'comark/ast' +import { htmlToComarkNodes, parseInlineHtmlTag } from './html' // Mapping from token types to tag names const BLOCK_TAG_MAP: Record = { @@ -22,10 +23,10 @@ const INLINE_TAG_MAP: Record = { sub_open: 'del', } +// ─── main entry point ─────────────────────────────────────────────────────── + /** * Convert Markdown-It tokens to a Comark tree - * @param tokens - The tokens to convert - * @returns The Comark tree */ export function marmdownItTokensToComarkTree(tokens: any[], options: { startLine: number, preservePositions: boolean } = { startLine: 0, preservePositions: false }): ComarkNode[] { const nodes: ComarkNode[] = [] @@ -36,16 +37,15 @@ export function marmdownItTokensToComarkTree(tokens: any[], options: { startLine const result = processBlockToken(tokens, i, false) if (result.node) { if (options.preservePositions) { - // find end line of node from token.map for (let j = i; j < result.nextIndex; j++) { if (tokens[j].map && tokens[j].map[1]) { endLine = (tokens[j].map[1] as number) + options.startLine } } - ;(result.node[1] as Record).$comark = { - ...((result.node[1] as Record).$comark || {}), - line: endLine, + if (!(result.node[1] as Record).$) { + (result.node[1] as Record).$ = {} } + ;((result.node[1] as Record).$ as Record).line = endLine } nodes.push(result.node) } @@ -122,48 +122,6 @@ function processAttributes( return attrs } -/** - * Parse HTML inline content to extract tag and attributes - * Example: '' - * Returns: { tag: 'input', attrs: { class: 
'foo', checked: true, disabled: true, type: 'checkbox' } } - */ -function parseHtmlInline(html: string): { tag: string, attrs: Record, selfClosing: boolean } | null { - // Match opening or self-closing tags - // Use \s[^>]* to ensure attributes start with whitespace, preventing overlap with tag name - const tagMatch = html.match(/^<(\w+)(\s[^>]*)?(\/?)>/) - if (!tagMatch) { - return null - } - - const tag = tagMatch[1] - const attrsString = tagMatch[2] - const selfClosing = tagMatch[3] === '/' || tag === 'input' || tag === 'br' || tag === 'img' || tag === 'hr' - - const attrs: Record = {} - - // Parse attributes from the string - // Match: attr="value" or attr='' or attr (boolean) - const attrRegex = /(\w+)(?:="([^"]*)"|='([^']*)'|=(\S+)|(?=\s|$))/g - let match - - while ((match = attrRegex.exec(attrsString)) !== null) { - const attrName = match[1] - // Get value from whichever capture group matched (quotes or unquoted) - const attrValue = match[2] !== undefined ? match[2] : (match[3] !== undefined ? 
match[3] : (match[4] || '')) - - // Handle boolean attributes - if value is empty string, it's a boolean true - if (attrValue === '') { - attrs[`:${attrName}`] = 'true' - } - else { - // Regular attribute - attrs[attrName] = attrValue - } - } - - return { tag, attrs, selfClosing } -} - /** * Parse codeblock info string to extract language, highlights, filename, and meta * Example: "javascript {1-3} [filename.ts] meta=value" @@ -272,10 +230,6 @@ function parseCodeblockInfo(info: string): { /** * Extract Comark attributes from mdc_inline_props token - * @param tokens - Array of tokens - * @param startIndex - Index to start searching from (after the element token) - * @param skipEmptyText - Whether to skip empty text tokens before props token - * @returns Object with attrs and nextIndex */ function extractAttributes( tokens: any[], @@ -308,10 +262,24 @@ function processBlockToken(tokens: any[], startIndex: number, insideNestedContex return { node: ['hr', {}] as ComarkNode, nextIndex: startIndex + 1 } } + // html_block is now handled upstream (in marmdownItTokensToComarkTree / + // processBlockChildren / processBlockChildrenWithSlots) before reaching here. + // This branch is kept as a safety fallback. if (token.type === 'html_block') { - if (token.content.startsWith('') ? content.slice(4, -3) : content.slice(4) + return { node: [null, {}, inner] as unknown as ComarkNode, nextIndex: startIndex + 1 } + } + + const children = processBlockChildren(tokens, startIndex + 1, 'html_block_close', false, false, false) + const [node1] = htmlToComarkNodes(content) + if (!node1) { + return { node: null, nextIndex: startIndex + 1 } } + const node = [node1[0]!, node1[1]! as ComarkElementAttributes, ...children.nodes] as ComarkNode + + return { node, nextIndex: children.nextIndex + 1 } } // Handle Comark block components (e.g., ::component ... 
::) @@ -494,6 +462,19 @@ function processBlockChildrenWithSlots( while (i < tokens.length && tokens[i].type !== closeType) { const token = tokens[i] + // html_block can produce multiple nodes — handle before processBlockToken + if (token.type === 'html_block') { + const htmlNodes = htmlToComarkNodes(token.content) + if (currentSlotName !== null) { + currentSlotChildren.push(...htmlNodes) + } + else { + nodes.push(...htmlNodes) + } + i++ + continue + } + // Check for slot marker: #slotname creates mdc_block_slot tokens if (token.type === 'mdc_block_slot') { // Extract slot name from token.attrs @@ -562,6 +543,13 @@ function processBlockChildren( while (i < tokens.length && tokens[i].type !== closeType) { const token = tokens[i] + // html_block can produce multiple nodes — handle before processBlockToken + if (token.type === 'html_block') { + nodes.push(...htmlToComarkNodes(token.content)) + i++ + continue + } + if (token.type === 'inline') { const inlineNodes = processInlineTokens(token.children || [], inHeading) nodes.push(...inlineNodes) @@ -670,7 +658,7 @@ function slugify(text: string): string { return slug } -function processInlineTokens(tokens: any[], inHeading: boolean = false): ComarkNode[] { +export function processInlineTokens(tokens: any[], inHeading: boolean = false): ComarkNode[] { const nodes: ComarkNode[] = [] let i = 0 @@ -709,15 +697,50 @@ function processInlineToken(tokens: any[], startIndex: number, inHeading: boolea return { node: token.content || null, nextIndex: startIndex + 1 } } - // Handle html_inline tokens (e.g., task list checkboxes) + // Handle html_inline tokens using htmlparser2 if (token.type === 'html_inline') { - const parsed = parseHtmlInline(token.content || '') - if (parsed && parsed.selfClosing) { - // Self-closing tags like ,
, - return { node: [parsed.tag, parsed.attrs] as ComarkNode, nextIndex: startIndex + 1 } + const content = token.content || '' + const tagInfo = parseInlineHtmlTag(content) + + if (!tagInfo) { + // Not a recognisable tag — return as raw text + return { node: content || null, nextIndex: startIndex + 1 } } - // For non-self-closing HTML or unparseable HTML, return as text - return { node: token.content || null, nextIndex: startIndex + 1 } + + if (tagInfo.isClose) { + // Orphaned closing tag — skip (handled by the opener's lookahead) + return { node: null, nextIndex: startIndex + 1 } + } + + if (tagInfo.isVoid) { + // Self-closing void element:
, , , … + return { node: [tagInfo.tag, tagInfo.attrs] as ComarkNode, nextIndex: startIndex + 1 } + } + + // Non-void opening tag — look ahead for the matching closing tag + const children: ComarkNode[] = [] + let j = startIndex + 1 + + while (j < tokens.length) { + const nextToken = tokens[j] + if (nextToken.type === 'html_inline') { + const nextInfo = parseInlineHtmlTag(nextToken.content || '') + if (nextInfo?.isClose && nextInfo.tag === tagInfo.tag) { + j++ // consume the closing tag + break + } + } + const result = processInlineToken(tokens, j, inHeading) + j = result.nextIndex + if (result.node) { + children.push(result.node as ComarkNode) + } + } + + const node = children.length > 0 + ? [tagInfo.tag, tagInfo.attrs, ...children] as ComarkNode + : [tagInfo.tag, tagInfo.attrs] as ComarkNode + return { node, nextIndex: j } } // Handle Comark inline span (e.g., [text]{attr}) diff --git a/packages/comark/src/internal/stringify/attributes.ts b/packages/comark/src/internal/stringify/attributes.ts index e068355..342e8a3 100644 --- a/packages/comark/src/internal/stringify/attributes.ts +++ b/packages/comark/src/internal/stringify/attributes.ts @@ -25,7 +25,6 @@ export function comarkAttributes(attributes: Record) { return `${key}="${value}"` }) - .join(' ') return attrs.length > 0 ? 
`{${attrs}}` : '' diff --git a/packages/comark/src/internal/stringify/handlers/html.ts b/packages/comark/src/internal/stringify/handlers/html.ts index 8150e8f..f29f490 100644 --- a/packages/comark/src/internal/stringify/handlers/html.ts +++ b/packages/comark/src/internal/stringify/handlers/html.ts @@ -9,12 +9,13 @@ const inlineTags = new Set(['strong', 'em', 'code', 'a', 'br', 'span', 'img']) const blockTags = new Set(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'ul', 'ol', 'blockquote', 'hr', 'table', 'td', 'th']) export function html(node: ComarkElement, state: State, parent?: ComarkElement) { - const [tag, attributes, ...children] = node + const [tag, attr, ...children] = node + const { $ = {}, ...attributes } = attr const hasOnlyTextChildren = children.every(child => typeof child === 'string' || inlineTags.has(String(child?.[0]))) const hasTextSibling = children.some(child => typeof child === 'string') const isBlock = textBlocks.has(String(tag)) - const isInline = inlineTags.has(String(tag)) + const isInline = inlineTags.has(String(tag)) && $.block === 0 let oneLiner = isBlock && hasOnlyTextChildren @@ -30,10 +31,14 @@ export function html(node: ComarkElement, state: State, parent?: ComarkElement) oneLiner = true } + if ($.block === 0) { + oneLiner = true + } + const isSelfClose = selfCloseTags.has(String(tag)) // Do not modify context if we are already in html mode - const revert = state.applyContext({ html: true, inline: oneLiner }) + const revert = state.applyContext({ inline: oneLiner }) const childrenContent = children.map(child => state.one(child, state, node)) @@ -69,7 +74,7 @@ export function html(node: ComarkElement, state: State, parent?: ComarkElement) } if (!oneLiner && content) { - content = '\n' + paddNoneHtmlContent(content, state) + '\n' + content = '\n' + paddNoneHtmlContent(content, state).trimEnd() + '\n' } return `<${tag}${attrs}>${content}` @@ -82,8 +87,8 @@ function paddNoneHtmlContent(content: string, state: State) { } return ( - 
(content.trim().startsWith('<') ? '' : '\n') + (content.trim().startsWith('<') ? '' : '') + content - + (content.trim().endsWith('>') ? '' : '\n') + + (content.trim().endsWith('>') ? '' : '') ) } diff --git a/packages/comark/src/internal/stringify/handlers/mdc.ts b/packages/comark/src/internal/stringify/handlers/mdc.ts index c6d21f7..5e4d1cc 100644 --- a/packages/comark/src/internal/stringify/handlers/mdc.ts +++ b/packages/comark/src/internal/stringify/handlers/mdc.ts @@ -8,7 +8,8 @@ import { html } from './html' const INLINE_HTML_ELEMENTS = new Set(['a', 'strong', 'em', 'span']) export function mdc(node: ComarkElement, state: State, parent?: ComarkElement) { - const [tag, attributes, ...children] = node + const [tag, attr, ...children] = node + const { $, ...attributes } = attr if (tag === 'table') { return html(node, state) diff --git a/packages/comark/src/internal/stringify/state.ts b/packages/comark/src/internal/stringify/state.ts index ec1cd50..6e0fd3b 100644 --- a/packages/comark/src/internal/stringify/state.ts +++ b/packages/comark/src/internal/stringify/state.ts @@ -26,7 +26,7 @@ export function one(node: ComarkNode, state: State, parent?: ComarkElement) { return userHandler(node, state, parent) } - if (state.context.html) { + if (state.context.html || node[1].$?.html === 1) { return state.handlers.html(node, state, parent) } diff --git a/packages/comark/src/types.ts b/packages/comark/src/types.ts index 05ac6c7..d2ade1b 100644 --- a/packages/comark/src/types.ts +++ b/packages/comark/src/types.ts @@ -56,6 +56,29 @@ export interface ParseOptions { */ autoClose?: boolean + /** + * Whether to parse HTML tags embedded in Comark/markdown content. + * When enabled, HTML block and inline elements are parsed into AST nodes and can be + * mixed freely with Comark components and markdown syntax. 
+ * + * @default true + * @example + * // With html: true (default) — HTML is parsed into AST nodes + * // Input: `text` + * // AST: ['strong', { class: 'bold' }, 'text'] + * + * // HTML can be mixed with Comark components: + * // Input: + * //
<div> + * // ::alert + * // Hello world + * // :: + * // </div>
+ * + * // With html: false — HTML tags are left as raw text / ignored + */ + html?: boolean + /** * Additional plugins to use * @default [] diff --git a/packages/comark/test/streaming.test.ts b/packages/comark/test/streaming.test.ts index a21bbdd..6268000 100644 --- a/packages/comark/test/streaming.test.ts +++ b/packages/comark/test/streaming.test.ts @@ -3,23 +3,23 @@ import { createParse } from 'comark' import type { ComarkElement } from 'comark/ast' describe('streaming mode', () => { - describe('$comark.line metadata', () => { + describe('$.line metadata', () => { it('preserves position metadata on nodes in streaming mode', async () => { const parse = createParse() const result = await parse('# Hello\n\nParagraph one.\n\nParagraph two.\n', { streaming: true }) const nodes = result.nodes as ComarkElement[] - expect(nodes[0][1].$comark?.line).toBeDefined() - expect(nodes[1][1].$comark?.line).toBeDefined() - expect(nodes[2][1].$comark?.line).toBeDefined() + expect(nodes[0][1].$?.line).toBeDefined() + expect(nodes[1][1].$?.line).toBeDefined() + expect(nodes[2][1].$?.line).toBeDefined() }) - it('does NOT add $comark.line metadata without streaming', async () => { + it('does NOT add $.line metadata without streaming', async () => { const parse = createParse() const result = await parse('# Hello\n\nParagraph one.\n') const nodes = result.nodes as ComarkElement[] - expect(nodes[0][1].$comark).toBeUndefined() + expect(nodes[0][1].$).toBeUndefined() }) it('line numbers are monotonically increasing', async () => { @@ -27,7 +27,7 @@ describe('streaming mode', () => { const result = await parse('# Heading\n\nPara 1\n\nPara 2\n\nPara 3\n', { streaming: true }) const nodes = result.nodes as ComarkElement[] - const lines = nodes.map(n => n[1].$comark?.line ?? 0) + const lines = nodes.map(n => n[1].$?.line ?? 
0) for (let i = 1; i < lines.length; i++) { expect(lines[i]).toBeGreaterThan(lines[i - 1]) } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b3d1d82..0d27333 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -417,6 +417,9 @@ importers: entities: specifier: ^4.5.0 version: 4.5.0 + htmlparser2: + specifier: ^9.0.0 + version: 9.1.0 js-yaml: specifier: ^4.1.1 version: 4.1.1 @@ -6627,6 +6630,9 @@ packages: html-whitespace-sensitive-tag-names@3.0.1: resolution: {integrity: sha512-q+310vW8zmymYHALr1da4HyXUQ0zgiIwIicEfotYPWGN0OJVEN/58IJ3A4GBYcEq3LGAZqKb+ugvP0GNB9CEAA==} + htmlparser2@9.1.0: + resolution: {integrity: sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==} + http-cache-semantics@4.2.0: resolution: {integrity: sha512-dTxcvPXqPvXBQpq5dUr6mEMJX4oIEFv6bwom3FDwKRDsuIjjJGANqhBuoAn9c1RQJIdAKav33ED65E2ys+87QQ==} @@ -17125,6 +17131,13 @@ snapshots: html-whitespace-sensitive-tag-names@3.0.1: {} + htmlparser2@9.1.0: + dependencies: + domelementtype: 2.3.0 + domhandler: 5.0.3 + domutils: 3.2.2 + entities: 4.5.0 + http-cache-semantics@4.2.0: {} http-errors@2.0.1: