go-htmldate/utils.go at master · markusmobius/go-htmldate · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
// Copyright (C) 2022 Markus Mobius
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Code in this file is ported from <https://github.com/adbar/htmldate>,
// which is available under the Apache 2.0 license.

package htmldate

import (
	"bytes"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/go-shiori/dom"
	"golang.org/x/net/html"
)

// cleanDocument returns a clone of doc from which elements that are not
// wanted for date extraction (embeds, frames, media and a few others) have
// been detached. All removals are performed on the clone returned by
// dom.Clone; doc itself is not passed to any removal call.
func cleanDocument(doc *html.Node) *html.Node {
	// Tags whose elements are dropped from the cloned tree.
	unwantedTags := []string{
		// Embed elements
		"object", "embed", "applet",
		// Frame elements
		"frame", "frameset", "noframes", "iframe",
		// Others
		"label", "map", "math",
		"audio", "canvas", "datalist",
		"picture", "rdf", "svg", "track", "video",
		// TODO: to be considered
		// "figure", "input", "layer", "param", "source"
	}

	cleaned := dom.Clone(doc, true)

	// Comment removal is currently disabled:
	// removeHtmlCommentNode(cleaned)

	for _, unwanted := range dom.GetAllNodesWithTag(cleaned, unwantedTags...) {
		parent := unwanted.Parent
		if parent == nil {
			// Nothing to detach from.
			continue
		}
		parent.RemoveChild(unwanted)
	}

	return cleaned
}

// removeHtmlCommentNode removes every `html.CommentNode` found among the
// descendants of doc. The doc node itself is never inspected, only the
// nodes below it.
func removeHtmlCommentNode(doc *html.Node) {
	var comments []*html.Node

	// collect walks the descendants of parent in document order and records
	// each comment node it meets.
	var collect func(parent *html.Node)
	collect = func(parent *html.Node) {
		for n := parent.FirstChild; n != nil; n = n.NextSibling {
			if n.Type == html.CommentNode {
				comments = append(comments, n)
			}
			collect(n)
		}
	}
	collect(doc)

	// Removal happens after the walk so the tree is not mutated while it is
	// being traversed.
	dom.RemoveNodes(comments, nil)
}

// isDigit reports whether every rune in s is a digit according to
// unicode.IsDigit. An empty string contains no non-digit rune, so it yields
// true as well.
func isDigit(s string) bool {
	onlyDigits := true
	for _, ch := range s {
		if !unicode.IsDigit(ch) {
			onlyDigits = false
			break
		}
	}
	return onlyDigits
}

// getDigitCount returns how many runes in s are digits according to
// unicode.IsDigit.
func getDigitCount(s string) int {
	total := 0
	for _, ch := range s {
		if !unicode.IsDigit(ch) {
			continue
		}
		total++
	}
	return total
}

// etreeText returns the concatenated data of the text nodes that come before
// the first child element of element. Children that are neither text nor
// element nodes are skipped. If element is nil or has no such text, an empty
// string is returned.
func etreeText(element *html.Node) string {
	if element == nil {
		return ""
	}

	var text bytes.Buffer
	for n := element.FirstChild; n != nil; n = n.NextSibling {
		switch n.Type {
		case html.ElementNode:
			// First sub-element reached: everything after it is ignored.
			return text.String()
		case html.TextNode:
			text.WriteString(n.Data)
		}
	}

	return text.String()
}

// inMap reports whether key is present in mapString. Looking up a key in a
// nil map is allowed and simply yields false.
func inMap(key string, mapString map[string]struct{}) bool {
	if _, found := mapString[key]; found {
		return true
	}
	return false
}

// strLimit truncates s to at most limit runes (not bytes), so multi-byte
// characters are never split. Strings that already fit are returned
// unchanged. A zero or negative limit yields an empty string instead of
// panicking on an out-of-range slice bound.
func strLimit(s string, limit int) string {
	// Guard first: a negative limit would otherwise pass the length check
	// below and panic when used as a slice bound.
	if limit <= 0 {
		return ""
	}

	if utf8.RuneCountInString(s) <= limit {
		return s
	}

	return string([]rune(s)[:limit])
}

// normalizeSpaces splits s on runs of whitespace and joins the resulting
// words with a single regular space. Because strings.Fields never yields
// empty or whitespace-only items, the result has no leading, trailing or
// repeated whitespace.
func normalizeSpaces(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}

// rxFindNamedStringSubmatch runs rx against s and returns the non-empty
// capture groups of the first match keyed by their group name, together with
// the name of the last (right-most in the pattern) group that captured
// non-empty text. Unnamed groups are stored under the empty-string key.
// When rx does not match, the map is empty and the name is "".
func rxFindNamedStringSubmatch(rx *regexp.Regexp, s string) (map[string]string, string) {
	captured := map[string]string{}
	lastName := ""

	submatches := rx.FindStringSubmatch(s)
	groupNames := rx.SubexpNames()

	// Index 0 is the whole match, so capture groups start at 1.
	for idx := 1; idx < len(submatches); idx++ {
		text := submatches[idx]
		if text == "" {
			continue
		}

		lastName = groupNames[idx]
		captured[lastName] = text
	}

	return captured, lastName
}

// isLeapYear reports whether year is a leap year under the Gregorian rule:
// divisible by 4, except century years, which must also be divisible by 400.
func isLeapYear(year int) bool {
	divisibleBy4 := year%4 == 0
	centuryYear := year%100 == 0
	divisibleBy400 := year%400 == 0

	return divisibleBy4 && (!centuryYear || divisibleBy400)
}