From 8dc44046abc00345ecde751452761f82e6ecab66 Mon Sep 17 00:00:00 2001
From: Stephen Young <stephen@customer.io>
Date: Thu, 19 Feb 2026 23:23:17 -0500
Subject: [PATCH 1/2] fix some whitespace handling

---
 setup_test.go     |  6 +++++-
 textplain_test.go |  7 ++++++
 tree.go           | 55 +++++++++++++++++++++++++++++++----------------
 3 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/setup_test.go b/setup_test.go
index 09c5a71..2c5a364 100644
--- a/setup_test.go
+++ b/setup_test.go
@@ -9,6 +9,7 @@ import (
 )
 
 func runTestCases(t *testing.T, testCases []testCase) {
+	t.Helper()
 
 	for _, tc := range testCases {
 		t.Run(tc.name, func(tt *testing.T) {
@@ -18,12 +19,15 @@ func runTestCases(t *testing.T, testCases []testCase) {
 }
 
 func runTestCase(t *testing.T, tc testCase, converters ...textplain.Converter) {
-
+	t.Helper()
 	if len(converters) == 0 {
 		converters = []textplain.Converter{textplain.NewRegexpConverter(), textplain.NewTreeConverter()}
 	}
 
 	for _, converter := range converters {
+		if tc.skipRegexp && reflect.TypeOf(converter) == reflect.TypeOf(&textplain.RegexpConverter{}) {
+			continue
+		}
 		t.Run(reflect.TypeOf(converter).Elem().Name(), func(tt *testing.T) {
 			result, err := converter.Convert(tc.body, textplain.DefaultLineLength)
 			assert.Nil(tt, err)
diff --git a/textplain_test.go b/textplain_test.go
index 5dc1013..7b1742e 100644
--- a/textplain_test.go
+++ b/textplain_test.go
@@ -12,6 +12,7 @@ type testCase struct {
 	name   string
 	body   string
 	expect string
+	skipRegexp bool
 }
 
 func TestConvert(t *testing.T) {
@@ -69,6 +70,12 @@ func TestStrippingWhitespace(t *testing.T) {
 			body:   "test text&nbsp;",
 			expect: "test text",
 		},
+		{
+			name:   "preheader block",
+			body:   "test text &#8199;&#847; &#8199;&#847; &#8199;&#847; &shy; &shy; &shy;\n\nhello",
+			expect: "test text\n\nhello",
+			skipRegexp: true,
+		},
 		{
 			name:   "infix repeated space",
 			body:   "test        text",
diff --git a/tree.go b/tree.go
index 239f926..8f90269 100644
--- a/tree.go
+++ b/tree.go
@@ -3,6 +3,7 @@ package textplain
 import (
 	"strconv"
 	"strings"
+	"unicode"
 
 	"golang.org/x/net/html"
 	"golang.org/x/net/html/atom"
@@ -77,7 +78,7 @@ func (t *TreeConverter) doConvert(n *html.Node) ([]string, error) {
 			switch c.DataAtom {
 			case atom.Script, atom.Style:
 				continue
-			case atom.P:
+			case atom.P, atom.Div:
 				more, err := t.doConvert(c)
 				if err != nil {
 					return nil, err
@@ -306,67 +307,85 @@ func (t *TreeConverter) wrapSpans(n *html.Node) (*html.Node, []string, error) {
 	return c, parts, nil
 }
 
-func (t *TreeConverter) fixSpacing(text string) string {
+func (t *TreeConverter) fixSpacing(rt string) string {
 
-	if len(text) < 2 {
-		return text
+	runes := []rune(rt)
+
+	if len(runes) < 2 {
+		return rt
 	}
 
-	processed := make([]byte, 0, len(text))
-	processed = append(processed, text[:2]...)
+	processed := make([]rune, 0, len(runes))
+	processed = append(processed, runes[:2]...)
 	idx := 1
 
 	var inList = (processed[0] == '*' && processed[1] == ' ')
 
 tidyLoop:
-	for i := 2; i < len(text); i++ {
+	for i := 2; i < len(runes); i++ {
+		
+		v := safeSpace(runes[i])
 
 		switch processed[idx] {
 		case '\n':
 
-			if text[i] == '\t' || text[i] == ' ' {
+			if v == '\t' || v == ' ' {
 				continue
 			}
 
-			if processed[idx-1] == '\n' && text[i] == '\n' {
+			if processed[idx-1] == '\n' && v == '\n' {
 				continue
 			}
 
-			if inList && text[i] == '\n' {
+			if inList && v == '\n' {
 				// lookahead through any whitespace to make sure we are still in a list
-				for j := i; j < len(text); j++ {
-					if text[j] == '\t' || text[j] == ' ' || text[j] == '\n' {
+				for j := i; j < len(runes); j++ {
+					vj := safeSpace(runes[j])
+					if vj == '\t' || vj == ' ' || vj == '\n' {
 						continue
 					}
-					if text[j] == '*' && j+1 < len(text) && text[j+1] == ' ' {
+					if vj == '*' && j+1 < len(runes) && safeSpace(runes[j+1]) == ' ' {
 						continue tidyLoop
 					}
 				}
 			}
 
-			if text[i-1] == '*' && text[i] == ' ' {
+			if runes[i-1] == '*' && v == ' ' {
 				inList = true
 			} else {
 				inList = false
 			}
 
 		case ' ':
-			if text[i] == ' ' {
-				continue
+			if v == ' ' {
+				continue 
 			}
-			if text[i] == '\t' || text[i] == '\n' {
+			if v == '\t' || v == '\n' {
 				processed[idx] = '\n'
 				continue
 			}
 		}
 
-		processed = append(processed, text[i])
+		processed = append(processed, v)
 		idx++
 	}
 
 	return string(processed)
 }
 
+func safeSpace(r rune) rune {
+	switch r {
+	case '\t', '\n', ' ':
+		return r
+	case '\u00ad', '\u034f':
+		return ' '
+	}
+	if unicode.IsSpace(r) {
+		return ' '
+	}
+	return r
+}
+
 func getAttr(n *html.Node, name string) string {
 	for _, a := range n.Attr {
 		if a.Key == name {

From dacbbc1620babbfe3d04f24b1ec70565ffb1d938 Mon Sep 17 00:00:00 2001
From: Stephen Young <stephen@customer.io>
Date: Fri, 20 Feb 2026 10:11:09 -0500
Subject: [PATCH 2/2] this is the better way, don't just eat all unicode
 spaces

---
 tree.go | 43 ++++++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/tree.go b/tree.go
index 8f90269..e129ed3 100644
--- a/tree.go
+++ b/tree.go
@@ -3,7 +3,6 @@ package textplain
 import (
 	"strconv"
 	"strings"
-	"unicode"
 
 	"golang.org/x/net/html"
 	"golang.org/x/net/html/atom"
@@ -324,7 +323,7 @@ func (t *TreeConverter) fixSpacing(rt string) string {
 tidyLoop:
 	for i := 2; i < len(runes); i++ {
 		
-		v := safeSpace(runes[i])
+		v := runes[i]
 
 		switch processed[idx] {
 		case '\n':
@@ -340,12 +339,13 @@ tidyLoop:
 			if inList && v == '\n' {
 				// lookahead through any whitespace to make sure we are still in a list
 				for j := i; j < len(runes); j++ {
-					vj := safeSpace(runes[j])
-					if vj == '\t' || vj == ' ' || vj == '\n' {
+					switch runes[j] {
+					case '\t', ' ', '\n':
 						continue
-					}
-					if vj == '*' && j+1 < len(runes) && safeSpace(runes[j+1]) == ' ' {
-						continue tidyLoop
+					case '*':
+						if j+1 < len(runes) && runes[j+1] == ' ' {
+							continue tidyLoop
+						}
 					}
 				}
 			}
@@ -366,6 +366,23 @@ tidyLoop:
 			}
 		}
 
+
+		// handle whitespace characters being used for preheader blocks to produce a cleaner plaintext output
+		switch v {
+		case '\u034f','\u00ad','\u2007':
+			for j := i; j < len(runes); j++ {
+				switch runes[j] {
+				case ' ':
+					continue
+				case '\u034f','\u00ad','\u2007':
+					i = j
+					continue tidyLoop
+				default:
+					break
+				}
+			}
+		}
+
 		processed = append(processed, v)
 		idx++
 	}
@@ -373,18 +390,6 @@ tidyLoop:
 	return string(processed)
 }
 
-func safeSpace(r rune) rune {
-	switch r {
-	case '\t', '\n', ' ':
-		return r
-	case '\u00ad', '\u034f':
-		return ' '
-	}
-	if unicode.IsSpace(r) {
-		return ' '
-	}
-	return r
-}
 
 func getAttr(n *html.Node, name string) string {
 	for _, a := range n.Attr {