-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess.go
More file actions
176 lines (142 loc) · 4.01 KB
/
process.go
File metadata and controls
176 lines (142 loc) · 4.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
package main
import (
"fmt"
"net/url"
"regexp"
"strings"
"time"
)
// ProcessURL crawls the seed URL described by input and returns the root
// node of the resulting link tree, descending at most input.SearchDepth
// levels from the seed.
func ProcessURL(input Input) (*Node, error) {
	root, err := process(input.URL, 0, input.SearchDepth)
	if err != nil {
		// wrap with %w so callers can inspect the cause via errors.Is/As
		return nil, fmt.Errorf("processing seed URL (%s): %w", input.URL.String(), err)
	}
	return root, nil
}
// visitedUrls records every URL already crawled so repeated links are
// skipped. Package-level mutable state: not safe for concurrent use, and
// nothing in this file resets it between ProcessURL calls.
var visitedUrls = make(map[string]bool)
// process crawls a single URL: it checks depth and visited-set limits,
// honors the site's robots.txt rules (allow/deny and crawl delay), builds
// a Node for the page, then recursively processes the page's child links.
// It returns (nil, nil) when the URL is skipped (already visited or
// disallowed by robots.txt) and a non-nil error when the crawl fails.
func process(url url.URL, depth, maxDepth int) (*Node, error) {
	if depth > maxDepth {
		return nil, fmt.Errorf("max depth reached at URL: %s. breaking search", url.String())
	}
	// skip already visited URLs
	if visitedUrls[url.String()] {
		return nil, nil
	}
	// mark visited BEFORE recursing so link cycles (A -> B -> A) terminate
	// immediately instead of re-crawling pages until maxDepth is hit
	visitedUrls[url.String()] = true
	// first check robots.txt cache
	rootUrl := url.Scheme + "://" + url.Host
	robotsTxt, found := GetRobotsTxtCache(rootUrl)
	// when crawler rules not found, fetch/parse them into the cache
	if !found {
		robotsUrl := rootUrl + "/robots.txt"
		content, err := FetchURLContent(robotsUrl)
		if err != nil {
			fmt.Printf("error fetching robots.txt: %v\n", err)
		} else {
			parsed, err := ReadRobotsTxt(content)
			if err != nil {
				// parse failed: parsed may be nil, so do not cache or
				// dereference it (the original code panicked here)
				fmt.Printf("error parsing robots.txt: %v\n", err)
			} else {
				AddRobotsTxtCache(rootUrl, *parsed)
				robotsTxt = *parsed
			}
		}
	}
	// before proceeding to crawl, ensure abiding by robots.txt rules
	ok := CheckCrawlOk(url)
	if !ok {
		fmt.Printf("crawling URL %s is not allowed by robots.txt. skipping\n", url.String())
		return nil, nil
	}
	// observe crawl delay if specified
	if robotsTxt.CrawlDelay > 0 {
		fmt.Printf("delay crawl %d seconds for site %s\n", robotsTxt.CrawlDelay, rootUrl)
		time.Sleep(time.Duration(robotsTxt.CrawlDelay) * time.Second)
	}
	fmt.Printf("crawling URL: %s at depth %d\n", url.String(), depth)
	// allowed to crawl - fetch the URL contents
	body, err := FetchURLContent(url.String())
	if err != nil {
		return nil, err
	}
	// pages without <title> tags get default "Untitled" value
	title, err := findTitle(body)
	if err != nil {
		title = "Untitled"
	}
	node, err := MakeNode(title, url)
	if err != nil {
		fmt.Printf("error creating node: %v\n", err)
		return nil, err
	}
	links, err := findChildLinks(body)
	// when errors occur finding links, return node without children
	if err != nil {
		fmt.Printf("error finding child links for page: %v\n", err)
		return node, nil
	}
	// iterate child links and recursively process
	parentUrl := url.String()
	for _, link := range links {
		// skip non http schemes
		if strings.HasPrefix(link, "mailto:") || strings.HasPrefix(link, "tel:") || strings.HasPrefix(link, "javascript:") || strings.HasPrefix(link, "sms:") {
			continue
		}
		// skip links within page
		if strings.HasPrefix(link, "#") {
			continue
		}
		// handle relative links
		if strings.HasPrefix(link, "/") {
			link = rootUrl + link
		} else if strings.HasPrefix(link, "./") || strings.HasPrefix(link, "../") {
			link = parentUrl + link
		}
		link = strings.TrimSuffix(link, "/")
		// skip circular references (i.e. page links to self)
		if link == parentUrl {
			continue
		}
		// note: url here is the parameter (it shadows package net/url), so
		// this is url.URL's Parse method, resolving link relative to url
		childUrl, err := url.Parse(link)
		if err != nil {
			continue
		}
		child, err := process(*childUrl, depth+1, maxDepth)
		if err != nil {
			continue
		}
		// process returns (nil, nil) for visited/disallowed URLs; do not
		// attach nil children to the tree
		if child != nil {
			node.AddChild(child)
		}
	}
	return node, nil
}
// titleRegexp matches the contents of the first <title> element. Compiled
// once at package scope rather than on every call. (?is) makes the match
// case-insensitive (<TITLE> works) and lets the title span multiple lines;
// [^>]* tolerates attributes on the opening tag.
var titleRegexp = regexp.MustCompile(`(?is)<title[^>]*>(.*?)</title>`)

// findTitle extracts the text of the first <title> tag in body.
// It returns an error when no title tag is present (callers fall back to
// a default title).
func findTitle(body string) (string, error) {
	match := titleRegexp.FindStringSubmatch(body)
	// no title found
	if len(match) < 2 {
		return "", fmt.Errorf("no title tag found in the body")
	}
	return match[1], nil
}
// anchorRegexp matches complete <a>...</a> elements; hrefRegexp extracts
// the href attribute value from a matched anchor. Both are compiled once
// at package scope instead of on every call.
var (
	anchorRegexp = regexp.MustCompile(`<a[^>]*>(.*?)</a>`)
	hrefRegexp   = regexp.MustCompile(`href=["'](.*?)["']`)
)

// findChildLinks returns the href values of every anchor tag in body, in
// document order. Anchors without an href attribute are skipped; a nil
// slice is returned when the page has no links. The error result is
// always nil and exists for interface stability with callers.
func findChildLinks(body string) ([]string, error) {
	var links []string
	for _, anchor := range anchorRegexp.FindAllString(body, -1) {
		if m := hrefRegexp.FindStringSubmatch(anchor); len(m) > 1 {
			links = append(links, m[1])
		}
	}
	return links, nil
}