-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathselection.go
More file actions
186 lines (171 loc) · 5.73 KB
/
selection.go
File metadata and controls
186 lines (171 loc) · 5.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
package htmlbag
import (
"fmt"
"regexp"
"strings"
"unicode"
"github.com/boxesandglue/boxesandglue/frontend"
"github.com/boxesandglue/csshtml"
"golang.org/x/net/html"
)
var (
// Whitespace patterns used when collapsing text nodes. They spell out
// [ \t\n\r\f] instead of using \s so that it is obvious that NBSP (U+00A0)
// is never matched: CSS treats NBSP as non-collapsible whitespace.
// NOTE(review): Go's regexp \s is already the ASCII class [\t\n\f\r ], so
// the explicit class is equivalent; it documents the intent.

// isSpace matches a string that is empty or consists solely of space,
// tab, newline, carriage return or form feed characters.
isSpace = regexp.MustCompile(`^[ \t\n\r\f]*$`)
// reLeadcloseWhtsp matches a run of such whitespace at the very start or
// at the very end of a string.
reLeadcloseWhtsp = regexp.MustCompile(`^[ \t\n\r\f]+|[ \t\n\r\f]+$`)
// reInsideWS matches a single newline or a run of two or more such
// whitespace characters anywhere in a string.
reInsideWS = regexp.MustCompile(`\n|[ \t\n\r\f]{2,}`)
)
// isCollapsibleSpace reports whether r is a whitespace rune that may be
// collapsed. Everything unicode.IsSpace accepts qualifies, with the single
// exception of NBSP (U+00A0), which CSS never collapses.
func isCollapsibleSpace(r rune) bool {
	if r == '\u00A0' {
		// NBSP must survive trimming and collapsing.
		return false
	}
	return unicode.IsSpace(r)
}
// Mode is the progression direction of the current HTML element.
type Mode int

const (
	// ModeHorizontal represents inline progression direction.
	ModeHorizontal Mode = iota
	// ModeVertical represents block progression direction.
	ModeVertical
)

// String renders the mode as an arrow: "→" for ModeHorizontal and "↓" for
// every other value.
func (m Mode) String() string {
	switch m {
	case ModeHorizontal:
		return "→"
	default:
		return "↓"
	}
}
// preserveWhitespace is a stack of whitespace-preservation flags. Its last
// entry is the setting in effect for the node currently being processed; the
// initial entry is false. GetHTMLItemFromHTMLNode pushes a flag before
// descending into the children of an element and pops it afterwards.
// NOTE(review): this is mutable package-level state, so concurrent
// conversions presumably must not run in parallel — confirm with callers.
var preserveWhitespace = []bool{false}
// HTMLItem is a struct which represents a HTML element or a text node. Items
// are linked into a tree through Children.
type HTMLItem struct {
// Typ is the node type. GetHTMLItemFromHTMLNode creates items of type
// html.TextNode and html.ElementNode.
Typ html.NodeType
// Data is the (whitespace-processed) text of a text node or the tag name
// of an element node.
Data string
// Dir is the progression direction GetHTMLItemFromHTMLNode assigned to an
// element. Text items keep the zero value, ModeHorizontal.
Dir Mode
// Attributes maps attribute keys to values for the attributes that
// csshtml.ResolveAttributes hands back for an element.
Attributes map[string]string
// Styles holds the style map returned by csshtml.ResolveAttributes. It is
// only set for elements that carry at least one attribute.
Styles map[string]string
// Children are the child items in document order.
Children []*HTMLItem
}
// String returns a short, human-readable representation of the item: the
// Go-quoted text for text nodes, the tag name in angle brackets for element
// nodes and the raw Data for every other node type.
func (itm *HTMLItem) String() string {
	switch itm.Typ {
	case html.TextNode:
		return fmt.Sprintf("%q", itm.Data)
	case html.ElementNode:
		return fmt.Sprintf("<%s>", itm.Data)
	default:
		// Data already is a string, there is nothing to format.
		return itm.Data
	}
}
// isCustomVoidElement reports whether name is a custom element that has to be
// handled as a void (self-closing) element. The HTML5 parser does not know
// custom tags are void, so <barcode ... /> is parsed as an opening tag and the
// siblings that follow end up as its children.
func isCustomVoidElement(name string) bool {
	switch name {
	case "barcode":
		return true
	default:
		return false
	}
}
// GetHTMLItemFromHTMLNode fills firstItem with the contents of thisNode. It
// walks thisNode and all of its following siblings and appends one HTMLItem
// per text and element node to firstItem.Children; the children of an element
// are converted recursively. Comment and doctype nodes are ignored, document
// nodes are passed through (their children are appended to firstItem).
//
// direction is the progression direction in effect when the walk starts. Block
// level elements switch it to ModeVertical, inline elements and non-blank text
// switch it to ModeHorizontal; the updated direction applies to the siblings
// that follow.
//
// Whitespace in text nodes is collapsed unless the top of the package-level
// preserveWhitespace stack is true, which is the case below an element whose
// resolved styles contain white-space: pre.
//
// An error is returned as soon as a node of an unknown type is found, no
// matter how deeply it is nested. Items appended before that point stay in
// firstItem.
func GetHTMLItemFromHTMLNode(thisNode *html.Node, direction Mode, firstItem *HTMLItem) error {
	newDir := direction
	for thisNode != nil {
		switch thisNode.Type {
		case html.CommentNode, html.DoctypeNode:
			// ignore
		case html.TextNode:
			keepWS := preserveWhitespace[len(preserveWhitespace)-1]
			txt := thisNode.Data
			// When turning from vertical to horizontal (a text is always
			// horizontal material), trim the left space. TODO: honor preserve
			// whitespace setting
			if direction == ModeVertical {
				txt = strings.TrimLeftFunc(txt, isCollapsibleSpace)
			}
			// A blank (or empty) text collapses to a single space.
			if !keepWS && isSpace.MatchString(txt) {
				txt = " "
			}
			// Only text with visible content starts horizontal material.
			if direction == ModeVertical && !isSpace.MatchString(txt) {
				newDir = ModeHorizontal
			}
			if txt != "" && !keepWS {
				txt = reLeadcloseWhtsp.ReplaceAllString(txt, " ")
				txt = reInsideWS.ReplaceAllString(txt, " ")
			}
			firstItem.Children = append(firstItem.Children, &HTMLItem{
				Typ:  html.TextNode,
				Data: txt,
			})
		case html.ElementNode:
			// The children inherit the current whitespace setting unless
			// the element overrides it with a white-space style.
			ws := preserveWhitespace[len(preserveWhitespace)-1]
			eltname := thisNode.Data
			switch eltname {
			case "body", "address", "article", "aside", "blockquote", "canvas", "col", "colgroup", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "p", "pre", "section", "table", "tfoot", "thead", "tbody", "tr", "td", "th", "ul", "video":
				newDir = ModeVertical
			case "b", "big", "i", "small", "tt", "abbr", "acronym", "cite", "code", "dfn", "em", "kbd", "strong", "samp", "var", "a", "barcode", "bdo", "img", "map", "object", "q", "script", "span", "sub", "sup", "button", "input", "label", "select", "textarea":
				newDir = ModeHorizontal
			default:
				// keep dir
			}
			itm := &HTMLItem{
				Typ:        html.ElementNode,
				Data:       thisNode.Data,
				Dir:        newDir,
				Attributes: map[string]string{},
			}
			firstItem.Children = append(firstItem.Children, itm)
			attributes := thisNode.Attr
			if len(attributes) > 0 {
				itm.Styles, attributes = csshtml.ResolveAttributes(attributes)
				for _, attr := range attributes {
					itm.Attributes[attr.Key] = attr.Val
				}
				// Any white-space value other than "pre" switches
				// preservation off again.
				if value, ok := itm.Styles["white-space"]; ok {
					ws = value == "pre"
				}
			}
			if thisNode.FirstChild != nil {
				parent, childDir := itm, newDir
				if isCustomVoidElement(eltname) {
					// Custom void elements like <barcode> are not
					// recognized as self-closing by the HTML5 parser,
					// so subsequent siblings get incorrectly nested as
					// children. Promote them back to the parent level.
					parent, childDir = firstItem, direction
				}
				preserveWhitespace = append(preserveWhitespace, ws)
				err := GetHTMLItemFromHTMLNode(thisNode.FirstChild, childDir, parent)
				// Pop before looking at the error so that the stack stays
				// balanced on the error path as well.
				preserveWhitespace = preserveWhitespace[:len(preserveWhitespace)-1]
				if err != nil {
					return err
				}
			}
		case html.DocumentNode:
			// just passthrough
			if err := GetHTMLItemFromHTMLNode(thisNode.FirstChild, newDir, firstItem); err != nil {
				return err
			}
		default:
			// %d prints the numeric node type; %T would only ever print
			// the type name.
			return fmt.Errorf("Output: unknown node type %d", thisNode.Type)
		}
		thisNode = thisNode.NextSibling
		direction = newDir
	}
	return nil
}
// HTMLNodeToText converts an HTML node to a *frontend.Text element. The node
// (together with its following siblings) is first converted to an HTMLItem
// tree rooted at a vertical-mode item, which is then handed to Output along
// with ss and df.
//
// If the node tree cannot be converted, that error is returned and the
// *frontend.Text result is nil.
func HTMLNodeToText(n *html.Node, ss StylesStack, df *frontend.Document) (*frontend.Text, error) {
	h := &HTMLItem{Dir: ModeVertical}
	if err := GetHTMLItemFromHTMLNode(n, ModeVertical, h); err != nil {
		return nil, err
	}
	return Output(h, ss, df)
}