Skip to content

Commit f561a86

Browse files
Add tests for parsing of inline HTML when rendering markdown as HTML (#1379)
* Add test about parsing inline HTML except for comments * Extract inner HTML parsing code into a private function * Fix minor spelling in code comment Co-authored-by: Pat Shaughnessy <pat_shaughnessy@apple.com> --------- Co-authored-by: Pat Shaughnessy <pat_shaughnessy@apple.com>
1 parent 56cb686 commit f561a86

File tree

2 files changed

+92
-33
lines changed

2 files changed

+92
-33
lines changed

Sources/DocCHTML/MarkdownRenderer.swift

Lines changed: 44 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -675,58 +675,69 @@ package struct MarkdownRenderer<Provider: LinkProvider> {
675675
// - An empty element like `<br />` or `<hr />` that's complete on its own.
676676
// - An element with children like `<span style="color: red;">Something</span>` that needs to be created out of multiple markup elements.
677677
//
678-
// FIXME: See if this can be extracted into 2 private functions to make the code easier to read.
679678
// Because it may take multiple markdown elements to create an HTML element, we pop elements rather than iterating
680-
var elements = Array(container)
681-
outer: while !elements.isEmpty {
682-
let element = elements.removeFirst()
683-
684-
guard let start = element as? InlineHTML else {
679+
var remainder = Array(container)[...]
680+
while let element = remainder.popFirst() {
681+
guard let openingHTML = element as? InlineHTML else {
685682
// If the markup _isn't_ inline HTML we can simply visit it to transform it.
686683
children.append(visit(element))
687684
continue
688685
}
689686

690-
// Otherwise, we need to determine how long this markdown element it.
691-
var rawHTML = start.rawHTML
687+
// Otherwise, we need to determine how long this markdown element is.
688+
let rawHTML = openingHTML.rawHTML
689+
// Simply skip any HTML/XML comments.
692690
guard !rawHTML.hasPrefix("<!--") else {
693-
// If it's a basic link, simply skip it.
694691
continue
695692
}
696693

697694
// Next, check if its empty element (for example `<br />` or `<hr />`) that's complete on its own.
698-
if let parsed = try? XMLElement(xmlString: rawHTML) {
695+
696+
// On non-Darwin platforms, `XMLElement(xmlString:)` sometimes crashes for certain invalid / incomplete XML strings.
697+
// To minimize the risk of this happening, don't try to parse the XML string as an empty HTML element unless it ends with "/>"
698+
if rawHTML.hasSuffix("/>"), let parsed = try? XMLElement(xmlString: rawHTML) {
699699
children.append(parsed)
700-
continue
701700
}
702-
703-
// This could be an HTML element with content or it could be invalid HTML.
704-
// Don't modify `elements` until we know that we've parsed a valid HTML element.
705-
var copy = elements
706-
707-
// Gradually check a longer and longer series of markup elements to see if they form a valid HTML element.
708-
inner: while !copy.isEmpty, let next = copy.first as? any InlineMarkup {
709-
_ = copy.removeFirst()
710-
711-
// Skip any HTML/XML comments _inside_ this HTML tag
712-
if let html = next as? InlineHTML, html.rawHTML.hasPrefix("<!--") {
713-
continue inner
714-
}
715-
716-
rawHTML += next.format()
717-
if let parsed = try? XMLElement(xmlString: rawHTML) {
718-
children.append(parsed) // Include the valid HTML element in the output.
719-
elements = copy // Skip over all the elements that were used to create that HTML element.
720-
continue outer
721-
}
701+
// Lastly, check if this is the start of an HTML element that needs to be constructed out of more than one markup element
702+
else if let parsed = _findMultiMarkupHTMLElement(in: &remainder, openingRawHTML: rawHTML) {
703+
children.append(parsed)
722704
}
723-
// If we reached the end of the inline elements without parsing a valid HTML element, skip that first InlineHTML markup and continue from there.
724-
continue
725705
}
726706

727707
return children
728708
}
729709

710+
private func _findMultiMarkupHTMLElement(in remainder: inout ArraySlice<any Markup>, openingRawHTML: String) -> XMLNode? {
711+
// Don't modify `remainder` until we know that we've parsed a valid HTML element.
712+
var copy = remainder
713+
714+
var rawHTML = openingRawHTML
715+
let tagName = rawHTML.dropFirst(/* the opening "<" */).prefix(while: \.isLetter)
716+
let expectedClosingTag = "</\(tagName)>"
717+
718+
// Only iterate as long the markup is _inline_ markup.
719+
while let next = copy.first as? any InlineMarkup {
720+
_ = copy.removeFirst()
721+
let html = next as? InlineHTML
722+
723+
// Skip any HTML/XML comments _inside_ this HTML tag
724+
if let html, html.rawHTML.hasPrefix("<!--") {
725+
continue
726+
}
727+
728+
// If this wasn't a comment, accumulate more raw HTML to try and parse
729+
rawHTML += next.format()
730+
// On non-Darwin platforms, `XMLElement(xmlString:)` sometimes crashes for certain invalid / incomplete XML strings.
731+
// To minimize the risk of this happening, don't try to parse the XML string as an empty HTML element unless it ends with "/>"
732+
if html?.rawHTML == expectedClosingTag, let parsed = try? XMLElement(xmlString: rawHTML) {
733+
remainder = copy // Skip over all the elements that were used to create that HTML element.
734+
return parsed // Include the valid HTML element in the output.
735+
}
736+
}
737+
// If we reached the end of the _inline_ markup without parsing a valid HTML element, skip just that opening markup without updating `remainder`
738+
return nil
739+
}
740+
730741
// MARK: Directives
731742

732743
func visit(_: BlockDirective) -> XMLNode {

Tests/DocCHTMLTests/MarkdownRendererTests.swift

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,54 @@ struct MarkdownRendererTests {
548548
)
549549
}
550550

551+
@Test
552+
func testParsesAndPreservesHTMLExceptComments() {
553+
assert(
554+
rendering: "This is a <!-- inline comment --><strong>formatted</strong> paragraph.",
555+
matches: "<p>This is a <strong>formatted</strong> paragraph.</p>"
556+
)
557+
558+
assert(
559+
rendering: "This<br/> is a <em><!-- multi\n line\n comment-->formatted</em>paragraph.",
560+
matches: "<p>This<br/> is a <em>formatted</em> paragraph.</p>"
561+
)
562+
563+
assert(
564+
rendering: "This is a <span style=\"color: red\"><!-- before -->custom formatted<!-- after --></span> paragraph.",
565+
matches: "<p>This is a <span style=\"color: red\">custom formatted</span> paragraph.</p>"
566+
)
567+
568+
// This markup doesn't properly close the `<strong>` tag (it uses an `</em>` tag.
569+
// In this case we drop both tags but not their content in between. This matches what DocC does for inline HTML with regards to the Render JSON output.
570+
assert(
571+
rendering: "This is a <strong>custom formatted</em> paragraph.",
572+
matches: "<p>This is a custom formatted paragraph.</p>"
573+
)
574+
575+
// Any content _within_ HTML tags in the markdown isn't parsed as markdown content.
576+
assert(
577+
rendering: "This is a <span>custom **not** formatted</span> paragraph.",
578+
matches: "<p>This is a <span>custom **not** formatted</span> paragraph.</p>"
579+
)
580+
581+
assert(
582+
rendering: """
583+
<details>
584+
<summary>Some summary<!-- comment in summary--></summary>
585+
<!-- comment between elements -->
586+
<p><!-- comment before -->Some longer<!-- comment between words --> description<!-- comment after --></p>
587+
</details>
588+
<!-- comment after block element -->
589+
""",
590+
matches: """
591+
<details>
592+
<summary>Some summary</summary>
593+
<p>Some longer description</p>
594+
</details>
595+
"""
596+
)
597+
}
598+
551599
private func assert(
552600
rendering markdownContent: String,
553601
elementToReturn: LinkedElement? = nil,

0 commit comments

Comments
 (0)