From 8dc44046abc00345ecde751452761f82e6ecab66 Mon Sep 17 00:00:00 2001 From: Stephen Young Date: Thu, 19 Feb 2026 23:23:17 -0500 Subject: [PATCH 1/2] fix some whitespace handling --- setup_test.go | 6 +++++- textplain_test.go | 7 ++++++ tree.go | 55 +++++++++++++++++++++++++++++++---------------- 3 files changed, 49 insertions(+), 19 deletions(-) diff --git a/setup_test.go b/setup_test.go index 09c5a71..2c5a364 100644 --- a/setup_test.go +++ b/setup_test.go @@ -9,6 +9,7 @@ import ( ) func runTestCases(t *testing.T, testCases []testCase) { + t.Helper() for _, tc := range testCases { t.Run(tc.name, func(tt *testing.T) { @@ -18,12 +19,15 @@ func runTestCases(t *testing.T, testCases []testCase) { } func runTestCase(t *testing.T, tc testCase, converters ...textplain.Converter) { - + t.Helper() if len(converters) == 0 { converters = []textplain.Converter{textplain.NewRegexpConverter(), textplain.NewTreeConverter()} } for _, converter := range converters { + if tc.skipRegexp && reflect.TypeOf(converter) == reflect.TypeOf(&textplain.RegexpConverter{}) { + continue + } t.Run(reflect.TypeOf(converter).Elem().Name(), func(tt *testing.T) { result, err := converter.Convert(tc.body, textplain.DefaultLineLength) assert.Nil(tt, err) diff --git a/textplain_test.go b/textplain_test.go index 5dc1013..7b1742e 100644 --- a/textplain_test.go +++ b/textplain_test.go @@ -12,6 +12,7 @@ type testCase struct { name string body string expect string + skipRegexp bool } func TestConvert(t *testing.T) { @@ -69,6 +70,12 @@ func TestStrippingWhitespace(t *testing.T) { body: "test text ", expect: "test text", }, + { + name: "preheader block", + body: "test text  ͏  ͏  ͏ ­ ­ ­\n\nhello", + expect: "test text\n\nhello", + skipRegexp: true, + }, { name: "infix repeated space", body: "test text", diff --git a/tree.go b/tree.go index 239f926..8f90269 100644 --- a/tree.go +++ b/tree.go @@ -3,6 +3,7 @@ package textplain import ( "strconv" "strings" + "unicode" "golang.org/x/net/html" 
"golang.org/x/net/html/atom" @@ -77,7 +78,7 @@ func (t *TreeConverter) doConvert(n *html.Node) ([]string, error) { switch c.DataAtom { case atom.Script, atom.Style: continue - case atom.P: + case atom.P, atom.Div: more, err := t.doConvert(c) if err != nil { return nil, err @@ -306,67 +307,85 @@ func (t *TreeConverter) wrapSpans(n *html.Node) (*html.Node, []string, error) { return c, parts, nil } -func (t *TreeConverter) fixSpacing(text string) string { +func (t *TreeConverter) fixSpacing(rt string) string { - if len(text) < 2 { - return text + runes := []rune(rt) + + if len(runes) < 2 { + return rt } - processed := make([]byte, 0, len(text)) - processed = append(processed, text[:2]...) + processed := make([]rune, 0, len(runes)) + processed = append(processed, runes[:2]...) idx := 1 var inList = (processed[0] == '*' && processed[1] == ' ') tidyLoop: - for i := 2; i < len(text); i++ { + for i := 2; i < len(runes); i++ { + + v := safeSpace(runes[i]) switch processed[idx] { case '\n': - if text[i] == '\t' || text[i] == ' ' { + if v == '\t' || v == ' ' { continue } - if processed[idx-1] == '\n' && text[i] == '\n' { + if processed[idx-1] == '\n' && v == '\n' { continue } - if inList && text[i] == '\n' { + if inList && v == '\n' { // lookahead through any whitespace to make sure we are still in a list - for j := i; j < len(text); j++ { - if text[j] == '\t' || text[j] == ' ' || text[j] == '\n' { + for j := i; j < len(runes); j++ { + vj := safeSpace(runes[j]) + if vj == '\t' || vj == ' ' || vj == '\n' { continue } - if text[j] == '*' && j+1 < len(text) && text[j+1] == ' ' { + if vj == '*' && j+1 < len(runes) && safeSpace(runes[j+1]) == ' ' { continue tidyLoop } } } - if text[i-1] == '*' && text[i] == ' ' { + if runes[i-1] == '*' && v == ' ' { inList = true } else { inList = false } case ' ': - if text[i] == ' ' { - continue + if v == ' ' { + continue } - if text[i] == '\t' || text[i] == '\n' { + if v == '\t' || v == '\n' { processed[idx] = '\n' continue } } - processed = 
append(processed, text[i]) + processed = append(processed, v) idx++ } return string(processed) } +func safeSpace(r rune) rune { + switch r { + case '\t', '\n', ' ': + return r + case '\u00ad', '\u034f': + return ' ' + } + if unicode.IsSpace(r) { + return ' ' + } + return r +} + func getAttr(n *html.Node, name string) string { for _, a := range n.Attr { if a.Key == name { From dacbbc1620babbfe3d04f24b1ec70565ffb1d938 Mon Sep 17 00:00:00 2001 From: Stephen Young Date: Fri, 20 Feb 2026 10:11:09 -0500 Subject: [PATCH 2/2] this is the better way, don't just eat all unicode spaces --- tree.go | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/tree.go b/tree.go index 8f90269..e129ed3 100644 --- a/tree.go +++ b/tree.go @@ -3,7 +3,6 @@ package textplain import ( "strconv" "strings" - "unicode" "golang.org/x/net/html" "golang.org/x/net/html/atom" @@ -324,7 +323,7 @@ func (t *TreeConverter) fixSpacing(rt string) string { tidyLoop: for i := 2; i < len(runes); i++ { - v := safeSpace(runes[i]) + v := runes[i] switch processed[idx] { case '\n': @@ -340,12 +339,13 @@ tidyLoop: if inList && v == '\n' { // lookahead through any whitespace to make sure we are still in a list for j := i; j < len(runes); j++ { - vj := safeSpace(runes[j]) - if vj == '\t' || vj == ' ' || vj == '\n' { + switch runes[j] { + case '\t', ' ', '\n': continue - } - if vj == '*' && j+1 < len(runes) && safeSpace(runes[j+1]) == ' ' { - continue tidyLoop + case '*': + if j+1 < len(runes) && runes[j+1] == ' ' { + continue tidyLoop + } } } } @@ -366,6 +366,23 @@ tidyLoop: } } + + // handle whitespace characters being used for preheader blocks to produce a cleaner plaintext output + switch v { + case '\u034f','\u00ad','\u2007': + for j := i; j < len(runes); j++ { + switch runes[j] { + case ' ': + continue + case '\u034f','\u00ad','\u2007': + i = j + continue tidyLoop + default: + break + } + } + } + processed = append(processed, v) idx++ } @@ -373,18 +390,6
@@ tidyLoop: return string(processed) } -func safeSpace(r rune) rune { - switch r { - case '\t', '\n', ' ': - return r - case '\u00ad', '\u034f': - return ' ' - } - if unicode.IsSpace(r) { - return ' ' - } - return r -} func getAttr(n *html.Node, name string) string { for _, a := range n.Attr {