diff --git a/setup_test.go b/setup_test.go index 09c5a71..2c5a364 100644 --- a/setup_test.go +++ b/setup_test.go @@ -9,6 +9,7 @@ import ( ) func runTestCases(t *testing.T, testCases []testCase) { + t.Helper() for _, tc := range testCases { t.Run(tc.name, func(tt *testing.T) { @@ -18,12 +19,15 @@ func runTestCases(t *testing.T, testCases []testCase) { } func runTestCase(t *testing.T, tc testCase, converters ...textplain.Converter) { - + t.Helper() if len(converters) == 0 { converters = []textplain.Converter{textplain.NewRegexpConverter(), textplain.NewTreeConverter()} } for _, converter := range converters { + if tc.skipRegexp && reflect.TypeOf(converter) == reflect.TypeOf(&textplain.RegexpConverter{}) { + continue + } t.Run(reflect.TypeOf(converter).Elem().Name(), func(tt *testing.T) { result, err := converter.Convert(tc.body, textplain.DefaultLineLength) assert.Nil(tt, err) diff --git a/textplain_test.go b/textplain_test.go index 5dc1013..7b1742e 100644 --- a/textplain_test.go +++ b/textplain_test.go @@ -12,6 +12,7 @@ type testCase struct { name string body string expect string + skipRegexp bool } func TestConvert(t *testing.T) { @@ -69,6 +70,12 @@ func TestStrippingWhitespace(t *testing.T) { body: "test text ", expect: "test text", }, + { + name: "preheader block", + body: "test text  ͏  ͏  ͏ ­ ­ ­\n\nhello", + expect: "test text\n\nhello", + skipRegexp: true, + }, { name: "infix repeated space", body: "test text", diff --git a/tree.go b/tree.go index 239f926..e129ed3 100644 --- a/tree.go +++ b/tree.go @@ -77,7 +77,7 @@ func (t *TreeConverter) doConvert(n *html.Node) ([]string, error) { switch c.DataAtom { case atom.Script, atom.Style: continue - case atom.P: + case atom.P, atom.Div: more, err := t.doConvert(c) if err != nil { return nil, err @@ -306,67 +306,91 @@ func (t *TreeConverter) wrapSpans(n *html.Node) (*html.Node, []string, error) { return c, parts, nil } -func (t *TreeConverter) fixSpacing(text string) string { +func (t *TreeConverter) fixSpacing(rt string) string { - if len(text) < 2 { - return text + runes := []rune(rt) + + if len(runes) < 2 { + return rt } - processed := make([]byte, 0, len(text)) - processed = append(processed, text[:2]...) + processed := make([]rune, 0, len(runes)) + processed = append(processed, runes[:2]...) idx := 1 var inList = (processed[0] == '*' && processed[1] == ' ') tidyLoop: - for i := 2; i < len(text); i++ { + for i := 2; i < len(runes); i++ { + + v := runes[i] switch processed[idx] { case '\n': - if text[i] == '\t' || text[i] == ' ' { + if v == '\t' || v == ' ' { continue } - if processed[idx-1] == '\n' && text[i] == '\n' { + if processed[idx-1] == '\n' && v == '\n' { continue } - if inList && text[i] == '\n' { + if inList && v == '\n' { // lookahead through any whitespace to make sure we are still in a list - for j := i; j < len(text); j++ { - if text[j] == '\t' || text[j] == ' ' || text[j] == '\n' { + for j := i; j < len(runes); j++ { + switch runes[j] { + case '\t', ' ', '\n': continue - } - if text[j] == '*' && j+1 < len(text) && text[j+1] == ' ' { - continue tidyLoop + case '*': + if j+1 < len(runes) && runes[j+1] == ' ' { + continue tidyLoop + } } } } - if text[i-1] == '*' && text[i] == ' ' { + if runes[i-1] == '*' && v == ' ' { inList = true } else { inList = false } case ' ': - if text[i] == ' ' { - continue + if v == ' ' { + continue } - if text[i] == '\t' || text[i] == '\n' { + if v == '\t' || v == '\n' { processed[idx] = '\n' continue } } - processed = append(processed, text[i]) + + // handle whitespace characters being used for preheader blocks to produce a cleaner plaintext output + switch v { + case '\u034f','\u00ad','\u2007': + for j := i; j < len(runes); j++ { + switch runes[j] { + case ' ': + continue + case '\u034f','\u00ad','\u2007': + i = j + continue tidyLoop + default: + break + } + } + } + + processed = append(processed, v) idx++ } return string(processed) } + func getAttr(n *html.Node, name string) string { for _, a := range n.Attr { if a.Key == name {