Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 44 additions & 33 deletions Sources/DocCHTML/MarkdownRenderer.swift
Original file line number Diff line number Diff line change
Expand Up @@ -675,58 +675,69 @@ package struct MarkdownRenderer<Provider: LinkProvider> {
// - An empty element like `<br />` or `<hr />` that's complete on its own.
// - An element with children like `<span style="color: red;">Something</span>` that needs to be created out of multiple markup elements.
//
// FIXME: See if this can be extracted into 2 private functions to make the code easier to read.
// Because it may take multiple markdown elements to create an HTML element, we pop elements rather than iterating
var elements = Array(container)
outer: while !elements.isEmpty {
let element = elements.removeFirst()

guard let start = element as? InlineHTML else {
var remainder = Array(container)[...]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since you're using a slice here, I imagine this is quite efficient, even though we are mutating remainder.

Copy link
Contributor Author

@d-ronnqvist d-ronnqvist Dec 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, because a slice is a portion of a buffer that's owned by another object, mutating the slice can be thought of as updating the start and end indices, which is very cheap.

If you use the Godbolt Compiler Explorer you can see that these two implementations produce rather comparable assembly code:

// Iterating over an array
func iterate(over numbers: [Int], doing work: (Int) -> Void) {
    for number in numbers {
        work(number)
    }
}
// Mutating a slice by repeatedly popping the first element
func iterate(over numbers: [Int], doing work: (Int) -> Void) {
    var remainder = numbers[...]
    while let number = remainder.popFirst() {
        work(number)
    }
}

Further; if the "work" is known to the compiler, like in the comparison below, then the Swift compiler is more likely to be able to optimize away those extra retain and release calls:

// Iterating over an array
func sum(numbers: [Int]) -> Int {
    var total = 0
    for number in numbers {
        total += number
    }
    return total
}
// Mutating a slice by repeatedly popping the first element
func sum(numbers: [Int]) -> Int {
    var total = 0
    var remainder = numbers[...]
    while let number = remainder.popFirst() {
        total += number
    }
    return total
}

while let element = remainder.popFirst() {
guard let openingHTML = element as? InlineHTML else {
// If the markup _isn't_ inline HTML we can simply visit it to transform it.
children.append(visit(element))
continue
}

// Otherwise, we need to determine how long this markdown element it.
var rawHTML = start.rawHTML
// Otherwise, we need to determine how long this markdown element is.
let rawHTML = openingHTML.rawHTML
// Simply skip any HTML/XML comments.
guard !rawHTML.hasPrefix("<!--") else {
// If it's a basic link, simply skip it.
continue
}

// Next, check if its empty element (for example `<br />` or `<hr />`) that's complete on its own.
if let parsed = try? XMLElement(xmlString: rawHTML) {

// On non-Darwin platforms, `XMLElement(xmlString:)` sometimes crashes for certain invalid / incomplete XML strings.
// To minimize the risk of this happening, don't try to parse the XML string as an empty HTML element unless it ends with "/>"
if rawHTML.hasSuffix("/>"), let parsed = try? XMLElement(xmlString: rawHTML) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you run into some crashes while running your new tests in Linux CI?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, the Linux CI doesn't encounter any crashes with these changes. Locally, a Swift 6.0.3 container also didn't have any issues with the implementation in either 84dbf48 or e540ba3.

children.append(parsed)
continue
}

// This could be an HTML element with content or it could be invalid HTML.
// Don't modify `elements` until we know that we've parsed a valid HTML element.
var copy = elements

// Gradually check a longer and longer series of markup elements to see if they form a valid HTML element.
inner: while !copy.isEmpty, let next = copy.first as? any InlineMarkup {
_ = copy.removeFirst()

// Skip any HTML/XML comments _inside_ this HTML tag
if let html = next as? InlineHTML, html.rawHTML.hasPrefix("<!--") {
continue inner
}

rawHTML += next.format()
if let parsed = try? XMLElement(xmlString: rawHTML) {
children.append(parsed) // Include the valid HTML element in the output.
elements = copy // Skip over all the elements that were used to create that HTML element.
continue outer
}
// Lastly, check if this is the start of an HTML element that needs to be constructed out of more than one markup element
else if let parsed = _findMultiMarkupHTMLElement(in: &remainder, openingRawHTML: rawHTML) {
children.append(parsed)
}
// If we reached the end of the inline elements without parsing a valid HTML element, skip that first InlineHTML markup and continue from there.
continue
}

return children
}

private func _findMultiMarkupHTMLElement(in remainder: inout ArraySlice<any Markup>, openingRawHTML: String) -> XMLNode? {
// Don't modify `remainder` until we know that we've parsed a valid HTML element.
var copy = remainder

var rawHTML = openingRawHTML
let tagName = rawHTML.dropFirst(/* the opening "<" */).prefix(while: \.isLetter)
let expectedClosingTag = "</\(tagName)>"

// Only iterate as long the markup is _inline_ markup.
while let next = copy.first as? any InlineMarkup {
_ = copy.removeFirst()
let html = next as? InlineHTML

// Skip any HTML/XML comments _inside_ this HTML tag
if let html, html.rawHTML.hasPrefix("<!--") {
continue
}

// If this wasn't a comment, accumulate more raw HTML to try and parse
rawHTML += next.format()
// On non-Darwin platforms, `XMLElement(xmlString:)` sometimes crashes for certain invalid / incomplete XML strings.
// To minimize the risk of this happening, don't try to parse the XML string as an empty HTML element unless it ends with "/>"
if html?.rawHTML == expectedClosingTag, let parsed = try? XMLElement(xmlString: rawHTML) {
remainder = copy // Skip over all the elements that were used to create that HTML element.
return parsed // Include the valid HTML element in the output.
}
}
// If we reached the end of the _inline_ markup without parsing a valid HTML element, skip just that opening markup without updating `remainder`
return nil
}

// MARK: Directives

func visit(_: BlockDirective) -> XMLNode {
Expand Down
48 changes: 48 additions & 0 deletions Tests/DocCHTMLTests/MarkdownRendererTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,54 @@ struct MarkdownRendererTests {
)
}

@Test
func testParsesAndPreservesHTMLExceptComments() {
assert(
rendering: "This is a <!-- inline comment --><strong>formatted</strong> paragraph.",
matches: "<p>This is a <strong>formatted</strong> paragraph.</p>"
)

assert(
rendering: "This<br/> is a <em><!-- multi\n line\n comment-->formatted</em>paragraph.",
matches: "<p>This<br/> is a <em>formatted</em> paragraph.</p>"
)

assert(
rendering: "This is a <span style=\"color: red\"><!-- before -->custom formatted<!-- after --></span> paragraph.",
matches: "<p>This is a <span style=\"color: red\">custom formatted</span> paragraph.</p>"
)

// This markup doesn't properly close the `<strong>` tag (it uses an `</em>` tag.
// In this case we drop both tags but not their content in between. This matches what DocC does for inline HTML with regards to the Render JSON output.
assert(
rendering: "This is a <strong>custom formatted</em> paragraph.",
matches: "<p>This is a custom formatted paragraph.</p>"
)

// Any content _within_ HTML tags in the markdown isn't parsed as markdown content.
assert(
rendering: "This is a <span>custom **not** formatted</span> paragraph.",
matches: "<p>This is a <span>custom **not** formatted</span> paragraph.</p>"
)

assert(
rendering: """
<details>
<summary>Some summary<!-- comment in summary--></summary>
<!-- comment between elements -->
<p><!-- comment before -->Some longer<!-- comment between words --> description<!-- comment after --></p>
</details>
<!-- comment after block element -->
""",
matches: """
<details>
<summary>Some summary</summary>
<p>Some longer description</p>
</details>
"""
)
}

private func assert(
rendering markdownContent: String,
elementToReturn: LinkedElement? = nil,
Expand Down