Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion setup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
)

func runTestCases(t *testing.T, testCases []testCase) {
t.Helper()

for _, tc := range testCases {
t.Run(tc.name, func(tt *testing.T) {
Expand All @@ -18,12 +19,15 @@ func runTestCases(t *testing.T, testCases []testCase) {
}

func runTestCase(t *testing.T, tc testCase, converters ...textplain.Converter) {

t.Helper()
if len(converters) == 0 {
converters = []textplain.Converter{textplain.NewRegexpConverter(), textplain.NewTreeConverter()}
}

for _, converter := range converters {
if tc.skipRegexp && reflect.TypeOf(converter) == reflect.TypeOf(&textplain.RegexpConverter{}) {
continue
}
t.Run(reflect.TypeOf(converter).Elem().Name(), func(tt *testing.T) {
result, err := converter.Convert(tc.body, textplain.DefaultLineLength)
assert.Nil(tt, err)
Expand Down
7 changes: 7 additions & 0 deletions textplain_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ type testCase struct {
name string
body string
expect string
skipRegexp bool
}

func TestConvert(t *testing.T) {
Expand Down Expand Up @@ -69,6 +70,12 @@ func TestStrippingWhitespace(t *testing.T) {
body: "test text ",
expect: "test text",
},
{
name: "preheader block",
body: "test text  ͏  ͏  ͏ ­ ­ ­\n\nhello",
expect: "test text\n\nhello",
skipRegexp: true,
},
{
name: "infix repeated space",
body: "test text",
Expand Down
64 changes: 44 additions & 20 deletions tree.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func (t *TreeConverter) doConvert(n *html.Node) ([]string, error) {
switch c.DataAtom {
case atom.Script, atom.Style:
continue
case atom.P:
case atom.P, atom.Div:
more, err := t.doConvert(c)
if err != nil {
return nil, err
Expand Down Expand Up @@ -306,67 +306,91 @@ func (t *TreeConverter) wrapSpans(n *html.Node) (*html.Node, []string, error) {
return c, parts, nil
}

func (t *TreeConverter) fixSpacing(text string) string {
func (t *TreeConverter) fixSpacing(rt string) string {

if len(text) < 2 {
return text
runes := []rune(rt)

if len(runes) < 2 {
return rt
}

processed := make([]byte, 0, len(text))
processed = append(processed, text[:2]...)
processed := make([]rune, 0, len(runes))
processed = append(processed, runes[:2]...)
idx := 1

var inList = (processed[0] == '*' && processed[1] == ' ')

tidyLoop:
for i := 2; i < len(text); i++ {
for i := 2; i < len(runes); i++ {

v := runes[i]

switch processed[idx] {
case '\n':

if text[i] == '\t' || text[i] == ' ' {
if v == '\t' || v == ' ' {
continue
}

if processed[idx-1] == '\n' && text[i] == '\n' {
if processed[idx-1] == '\n' && v == '\n' {
continue
}

if inList && text[i] == '\n' {
if inList && v == '\n' {
// lookahead through any whitespace to make sure we are still in a list
for j := i; j < len(text); j++ {
if text[j] == '\t' || text[j] == ' ' || text[j] == '\n' {
for j := i; j < len(runes); j++ {
switch runes[j] {
case '\t', ' ', '\n':
continue
}
if text[j] == '*' && j+1 < len(text) && text[j+1] == ' ' {
continue tidyLoop
case '*':
if j+1 < len(runes) && runes[j+1] == ' ' {
continue tidyLoop
}
}
}
}

if text[i-1] == '*' && text[i] == ' ' {
if runes[i-1] == '*' && v == ' ' {
inList = true
} else {
inList = false
}

case ' ':
if text[i] == ' ' {
continue
if v == ' ' {
continue
}
if text[i] == '\t' || text[i] == '\n' {
if v == '\t' || v == '\n' {
processed[idx] = '\n'
continue
}
}

processed = append(processed, text[i])

// handle whitespace characters being used for preheader blocks to produce a cleaner plaintext output
switch v {
case '\u034f','\u00ad','\u2007':
for j := i; j < len(runes); j++ {
switch runes[j] {
case ' ':
continue
case '\u034f','\u00ad','\u2007':
i = j
continue tidyLoop
default:
break
}
}
}

processed = append(processed, v)
idx++
}

return string(processed)
}


func getAttr(n *html.Node, name string) string {
for _, a := range n.Attr {
if a.Key == name {
Expand Down