Review notes on this revision of the patch:
- protectCodeAngles: a fence-like line with trailing text no longer closes a
  code block, and a line whose info string contains a backtick no longer
  opens one (CommonMark fenced-code rules). Two test cases cover this.
- Test fixtures: angle-bracketed text was missing from the fixtures in the
  copy under review. Inputs inside fences in TestProtectCodeAngles and the
  expected value in TestRestoreCodeAngles were restored from the other half
  of each fixture. Text outside fences, and every fixture in
  TestSanitizePreservesAngleBracketsInCodeBlocks, is unchanged and still
  lacks markup (input equals expected in each case); restore those from the
  original commit.
- The index lines below are unchanged from the copy under review and do not
  match the edited content.

diff --git a/pkg/sanitize/sanitize.go b/pkg/sanitize/sanitize.go
index e6401e4fb..c646f21c6 100644
--- a/pkg/sanitize/sanitize.go
+++ b/pkg/sanitize/sanitize.go
@@ -12,7 +12,17 @@ var policy *bluemonday.Policy
 var policyOnce sync.Once
 
+// Sanitize applies the package's filters to input in a fixed order: invisible
+// characters are removed, code-fence metadata is filtered, and HTML tags are
+// filtered. Angle brackets inside fenced code blocks are replaced with
+// placeholders before the HTML pass and restored after it, so that pass does
+// not strip them as tags.
 func Sanitize(input string) string {
-	return FilterHTMLTags(FilterCodeFenceMetadata(FilterInvisibleCharacters(input)))
+	s := FilterInvisibleCharacters(input)
+	s = FilterCodeFenceMetadata(s)
+	s = protectCodeAngles(s)
+	s = FilterHTMLTags(s)
+	s = restoreCodeAngles(s)
+	return s
 }
 
 // FilterInvisibleCharacters removes invisible or control characters that should not appear
@@ -207,3 +217,81 @@ func shouldRemoveRune(r rune) bool {
 
 	return false
 }
+
+// Placeholders used to shield angle brackets inside code regions from
+// the HTML sanitizer. They must not look like HTML tags themselves and
+// must be unlikely to appear in real content.
+const (
+	codeLtPlaceholder = "\x00CODELT\x00"
+	codeGtPlaceholder = "\x00CODEGT\x00"
+)
+
+// protectCodeAngles replaces < and > with placeholders on every line inside
+// a backtick-fenced code block so that bluemonday does not strip them as HTML
+// tags. Fence lines themselves are left untouched, and an unclosed fence
+// protects every remaining line. This must run after FilterCodeFenceMetadata
+// (which cleans fence info strings) and before FilterHTMLTags.
+//
+// Fence detection follows CommonMark where it decides what counts as code:
+// an opening fence's info string may not contain a backtick, and a closing
+// fence must be at least as long as the opening fence and be followed only
+// by whitespace. A backtick run that meets neither rule is ordinary text
+// outside a block and code inside one.
+func protectCodeAngles(input string) string {
+	if input == "" {
+		return input
+	}
+
+	lines := strings.Split(input, "\n")
+	insideFence := false
+	currentFenceLen := 0
+
+	for i, line := range lines {
+		fenceLen, rest := leadingBacktickFence(line)
+
+		switch {
+		case !insideFence && fenceLen > 0 && !strings.Contains(rest, "`"):
+			// Opening fence.
+			insideFence = true
+			currentFenceLen = fenceLen
+		case insideFence && fenceLen >= currentFenceLen && strings.TrimSpace(rest) == "":
+			// Closing fence.
+			insideFence = false
+			currentFenceLen = 0
+		case insideFence:
+			// Code line. This includes fence-like lines that are too short
+			// or carry trailing text and therefore do not close the block.
+			lines[i] = strings.ReplaceAll(
+				strings.ReplaceAll(line, "<", codeLtPlaceholder),
+				">", codeGtPlaceholder,
+			)
+		}
+	}
+
+	return strings.Join(lines, "\n")
+}
+
+// leadingBacktickFence returns the length of the run of three or more
+// backticks that starts line, allowing only whitespace (as judged by
+// hasNonWhitespace) before it, and the text after the run. It returns 0 and
+// "" when line does not start with such a run.
+func leadingBacktickFence(line string) (fenceLen int, rest string) {
+	fenceIdx := strings.Index(line, "```")
+	if fenceIdx == -1 || hasNonWhitespace(line[:fenceIdx]) {
+		return 0, ""
+	}
+
+	fenceEnd := fenceIdx
+	for fenceEnd < len(line) && line[fenceEnd] == '`' {
+		fenceEnd++
+	}
+	return fenceEnd - fenceIdx, line[fenceEnd:]
+}
+
+// restoreCodeAngles reverses the placeholder substitution performed by
+// protectCodeAngles.
+func restoreCodeAngles(input string) string {
+	s := strings.ReplaceAll(input, codeLtPlaceholder, "<")
+	s = strings.ReplaceAll(s, codeGtPlaceholder, ">")
+	return s
+}
diff --git a/pkg/sanitize/sanitize_test.go b/pkg/sanitize/sanitize_test.go
index 35b23e6ab..248f4fdea 100644
--- a/pkg/sanitize/sanitize_test.go
+++ b/pkg/sanitize/sanitize_test.go
@@ -300,3 +300,161 @@ func TestSanitizeRemovesInvisibleCodeFenceMetadata(t *testing.T) {
 	result := Sanitize(input)
 	assert.Equal(t, expected, result)
 }
+
+func TestProtectCodeAngles(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "empty string",
+			input:    "",
+			expected: "",
+		},
+		{
+			name:     "no code blocks",
+			input:    "Hello World",
+			expected: "Hello World",
+		},
+		{
+			name:     "fenced code block with angle brackets",
+			input:    "```\nvector<int> v;\n```",
+			expected: "```\nvector" + codeLtPlaceholder + "int" + codeGtPlaceholder + " v;\n```",
+		},
+		{
+			name:     "fenced code block with language tag",
+			input:    "```cpp\nmap<string, int> m;\n```",
+			expected: "```cpp\nmap" + codeLtPlaceholder + "string, int" + codeGtPlaceholder + " m;\n```",
+		},
+		{
+			name:     "multiple code blocks",
+			input:    "text\n```\na<b>c\n```\nmiddle\n```\nd<e>f\n```",
+			expected: "text\n```\na" + codeLtPlaceholder + "b" + codeGtPlaceholder + "c\n```\nmiddle\n```\nd" + codeLtPlaceholder + "e" + codeGtPlaceholder + "f\n```",
+		},
+		{
+			name:     "angle brackets outside code blocks preserved as-is",
+			input:    "Use bold\n```\ncode<T>\n```\nMore text",
+			expected: "Use bold\n```\ncode" + codeLtPlaceholder + "T" + codeGtPlaceholder + "\n```\nMore text",
+		},
+		{
+			name:     "four-backtick fence",
+			input:    "````\nfn foo<T>()\n````",
+			expected: "````\nfn foo" + codeLtPlaceholder + "T" + codeGtPlaceholder + "()\n````",
+		},
+		{
+			name:     "shorter fence inside code does not close block",
+			input:    "````\nline<A>\n```\nstill<B>\n````",
+			expected: "````\nline" + codeLtPlaceholder + "A" + codeGtPlaceholder + "\n```\nstill" + codeLtPlaceholder + "B" + codeGtPlaceholder + "\n````",
+		},
+		{
+			name:     "longer closing fence closes the block (CommonMark)",
+			input:    "```\ncode<T>\n````\noutsidetext",
+			expected: "```\ncode" + codeLtPlaceholder + "T" + codeGtPlaceholder + "\n````\noutsidetext",
+		},
+		{
+			name:     "unclosed fence protects remaining lines",
+			input:    "```\na<b>c\nmore<d>",
+			expected: "```\na" + codeLtPlaceholder + "b" + codeGtPlaceholder + "c\nmore" + codeLtPlaceholder + "d" + codeGtPlaceholder,
+		},
+		{
+			name:     "fence line with trailing text does not close the block",
+			input:    "```\na<b>\n```js\nc<d>\n```\n<e>",
+			expected: "```\na" + codeLtPlaceholder + "b" + codeGtPlaceholder + "\n```js\nc" + codeLtPlaceholder + "d" + codeGtPlaceholder + "\n```\n<e>",
+		},
+		{
+			name:     "backtick in info string does not open a block",
+			input:    "```x```\n<b>",
+			expected: "```x```\n<b>",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := protectCodeAngles(tt.input)
+			assert.Equal(t, tt.expected, result)
+		})
+	}
+}
+
+func TestRestoreCodeAngles(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "empty string",
+			input:    "",
+			expected: "",
+		},
+		{
+			name:     "no placeholders",
+			input:    "Hello World",
+			expected: "Hello World",
+		},
+		{
+			name:     "restores lt and gt",
+			input:    "vector" + codeLtPlaceholder + "int" + codeGtPlaceholder,
+			expected: "vector<int>",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := restoreCodeAngles(tt.input)
+			assert.Equal(t, tt.expected, result)
+		})
+	}
+}
+
+func TestSanitizePreservesAngleBracketsInCodeBlocks(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "issue 2202: template parameter in code block",
+			input:    "```\nlet ptr: mut_raw_ptr = raw_new int;\n```",
+			expected: "```\nlet ptr: mut_raw_ptr = raw_new int;\n```",
+		},
+		{
+			name:     "C++ template in code block",
+			input:    "```cpp\nstd::vector items;\n```",
+			expected: "```cpp\nstd::vector items;\n```",
+		},
+		{
+			name:     "HTML-like tags outside code blocks still sanitized",
+			input:    "\n```\nvector v;\n```",
+			expected: "\n```\nvector v;\n```",
+		},
+		{
+			name:     "allowed HTML outside code blocks preserved",
+			input:    "bold\n```\nfoo()\n```",
+			expected: "bold\n```\nfoo()\n```",
+		},
+		{
+			name:     "multiple angle brackets in code",
+			input:    "```\nMap> m;\n```",
+			expected: "```\nMap> m;\n```",
+		},
+		{
+			name:     "script tags after code block still sanitized",
+			input:    "```\nvector v;\n```\n",
+			expected: "```\nvector v;\n```\n",
+		},
+		{
+			name:     "longer closing fence does not leak protection",
+			input:    "```\ncode\n````\n",
+			expected: "```\ncode\n````\n",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := Sanitize(tt.input)
+			assert.Equal(t, tt.expected, result)
+		})
+	}
+}