-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess.go
More file actions
176 lines (142 loc) · 4.01 KB
/
process.go
File metadata and controls
176 lines (142 loc) · 4.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
package main
import (
"fmt"
"net/url"
"regexp"
"strings"
"time"
)
// ProcessURL crawls the seed URL described by input and returns the root
// node of the resulting link tree, descending at most input.SearchDepth
// levels from the seed.
func ProcessURL(input Input) (*Node, error) {
	root, err := process(input.URL, 0, input.SearchDepth)
	if err != nil {
		// wrap with %w so callers can inspect the cause via errors.Is/As
		return nil, fmt.Errorf("processing seed URL (%s): %w", input.URL.String(), err)
	}
	return root, nil
}
// visitedUrls records every URL already crawled so repeated links are
// skipped. Package-level mutable state: not safe for concurrent use, and
// nothing in this file resets it between ProcessURL calls.
var visitedUrls = make(map[string]bool)
// process crawls a single URL: it checks depth and visited-set limits,
// honors the site's robots.txt rules (allow/deny and crawl delay), builds
// a Node for the page, then recursively processes the page's child links.
// It returns (nil, nil) when the URL is skipped (already visited or
// disallowed by robots.txt) and a non-nil error when the crawl fails.
func process(url url.URL, depth, maxDepth int) (*Node, error) {
	if depth > maxDepth {
		return nil, fmt.Errorf("max depth reached at URL: %s. breaking search", url.String())
	}
	// skip already visited URLs
	if visitedUrls[url.String()] {
		return nil, nil
	}
	// mark visited BEFORE recursing so link cycles (A -> B -> A) terminate
	// immediately instead of re-crawling pages until maxDepth is hit
	visitedUrls[url.String()] = true
	// first check robots.txt cache
	rootUrl := url.Scheme + "://" + url.Host
	robotsTxt, found := GetRobotsTxtCache(rootUrl)
	// when crawler rules not found, fetch/parse them into the cache
	if !found {
		robotsUrl := rootUrl + "/robots.txt"
		content, err := FetchURLContent(robotsUrl)
		if err != nil {
			fmt.Printf("error fetching robots.txt: %v\n", err)
		} else {
			parsed, err := ReadRobotsTxt(content)
			if err != nil {
				// parse failed: parsed may be nil, so do not cache or
				// dereference it (the original code panicked here)
				fmt.Printf("error parsing robots.txt: %v\n", err)
			} else {
				AddRobotsTxtCache(rootUrl, *parsed)
				robotsTxt = *parsed
			}
		}
	}
	// before proceeding to crawl, ensure abiding by robots.txt rules
	ok := CheckCrawlOk(url)
	if !ok {
		fmt.Printf("crawling URL %s is not allowed by robots.txt. skipping\n", url.String())
		return nil, nil
	}
	// observe crawl delay if specified
	if robotsTxt.CrawlDelay > 0 {
		fmt.Printf("delay crawl %d seconds for site %s\n", robotsTxt.CrawlDelay, rootUrl)
		time.Sleep(time.Duration(robotsTxt.CrawlDelay) * time.Second)
	}
	fmt.Printf("crawling URL: %s at depth %d\n", url.String(), depth)
	// allowed to crawl - fetch the URL contents
	body, err := FetchURLContent(url.String())
	if err != nil {
		return nil, err
	}
	// pages without <title> tags get default "Untitled" value
	title, err := findTitle(body)
	if err != nil {
		title = "Untitled"
	}
	node, err := MakeNode(title, url)
	if err != nil {
		fmt.Printf("error creating node: %v\n", err)
		return nil, err
	}
	links, err := findChildLinks(body)
	// when errors occur finding links, return node without children
	if err != nil {
		fmt.Printf("error finding child links for page: %v\n", err)
		return node, nil
	}
	// iterate child links and recursively process
	parentUrl := url.String()
	for _, link := range links {
		// skip non http schemes
		if strings.HasPrefix(link, "mailto:") || strings.HasPrefix(link, "tel:") || strings.HasPrefix(link, "javascript:") || strings.HasPrefix(link, "sms:") {
			continue
		}
		// skip links within page
		if strings.HasPrefix(link, "#") {
			continue
		}
		// handle relative links
		if strings.HasPrefix(link, "/") {
			link = rootUrl + link
		} else if strings.HasPrefix(link, "./") || strings.HasPrefix(link, "../") {
			link = parentUrl + link
		}
		link = strings.TrimSuffix(link, "/")
		// skip circular references (i.e. page links to self)
		if link == parentUrl {
			continue
		}
		// note: url here is the parameter (it shadows package net/url), so
		// this is url.URL's Parse method, resolving link relative to url
		childUrl, err := url.Parse(link)
		if err != nil {
			continue
		}
		child, err := process(*childUrl, depth+1, maxDepth)
		if err != nil {
			continue
		}
		// process returns (nil, nil) for visited/disallowed URLs; do not
		// attach nil children to the tree
		if child != nil {
			node.AddChild(child)
		}
	}
	return node, nil
}
// titleRegexp matches the contents of the first <title> element. Compiled
// once at package scope rather than on every call. (?is) makes the match
// case-insensitive (<TITLE> works) and lets the title span multiple lines;
// [^>]* tolerates attributes on the opening tag.
var titleRegexp = regexp.MustCompile(`(?is)<title[^>]*>(.*?)</title>`)

// findTitle extracts the text of the first <title> tag in body.
// It returns an error when no title tag is present (callers fall back to
// a default title).
func findTitle(body string) (string, error) {
	match := titleRegexp.FindStringSubmatch(body)
	// no title found
	if len(match) < 2 {
		return "", fmt.Errorf("no title tag found in the body")
	}
	return match[1], nil
}
// anchorRegexp matches complete <a>...</a> elements; hrefRegexp extracts
// the href attribute value from a matched anchor. Both are compiled once
// at package scope instead of on every call.
var (
	anchorRegexp = regexp.MustCompile(`<a[^>]*>(.*?)</a>`)
	hrefRegexp   = regexp.MustCompile(`href=["'](.*?)["']`)
)

// findChildLinks returns the href values of every anchor tag in body, in
// document order. Anchors without an href attribute are skipped; a nil
// slice is returned when the page has no links. The error result is
// always nil and exists for interface stability with callers.
func findChildLinks(body string) ([]string, error) {
	var links []string
	for _, anchor := range anchorRegexp.FindAllString(body, -1) {
		if m := hrefRegexp.FindStringSubmatch(anchor); len(m) > 1 {
			links = append(links, m[1])
		}
	}
	return links, nil
}