// robots.go
package main

import (
	"fmt"
	"net/url"
	"regexp"
	"strings"
)

// RobotsTxt holds the crawl rules parsed from a site's robots.txt for
// the wildcard user agent ("User-agent: *").
type RobotsTxt struct {
	AllowedPaths    []string
	DisallowedPaths []string
	CrawlDelay      int // seconds between requests; 0 if unspecified
}

// Cache of parsed robots.txt rules, keyed by root URL (scheme://host),
// to avoid refetching and reparsing. Not safe for concurrent use.
var robotsTxtCache = make(map[string]RobotsTxt)

// AddRobotsTxtCache stores the parsed rules for a site's root URL.
func AddRobotsTxtCache(rootUrl string, parsed RobotsTxt) {
	robotsTxtCache[rootUrl] = parsed
	fmt.Println("cached robots.txt rules for site", rootUrl)
}

// GetRobotsTxtCache returns the cached rules for a root URL, if present.
func GetRobotsTxtCache(rootUrl string) (RobotsTxt, bool) {
	robotsTxt, found := robotsTxtCache[rootUrl]
	return robotsTxt, found
}

// ReadRobotsTxt parses the rules in the "User-agent: *" group of a
// robots.txt body: it starts at the wildcard user-agent line and stops
// at the blank line that terminates that group.
func ReadRobotsTxt(content string) (*RobotsTxt, error) {
	allowedPaths := []string{}
	disallowedPaths := []string{}
	crawlDelay := 0
	userAgentRegex := regexp.MustCompile(`User-[Aa]gent: \*`)

	parse := false
	for _, raw := range strings.Split(content, "\n") {
		line := strings.TrimSpace(raw)
		// begin parsing once the "User-agent: *" group is found
		if !parse && userAgentRegex.MatchString(line) {
			parse = true
		}
		if !parse {
			continue
		}
		// a blank line ends the user-agent group; stop parsing
		if line == "" {
			break
		}
		// collect Allow, Disallow, and Crawl-delay rules
		switch {
		case strings.HasPrefix(line, "Allow: "):
			allowedPaths = append(allowedPaths, strings.TrimPrefix(line, "Allow: "))
		case strings.HasPrefix(line, "Disallow: "):
			disallowedPaths = append(disallowedPaths, strings.TrimPrefix(line, "Disallow: "))
		case strings.HasPrefix(line, "Crawl-delay: "):
			// a non-numeric value leaves crawlDelay at 0
			fmt.Sscanf(line, "Crawl-delay: %d", &crawlDelay)
		}
	}

	return &RobotsTxt{
		AllowedPaths:    allowedPaths,
		DisallowedPaths: disallowedPaths,
		CrawlDelay:      crawlDelay,
	}, nil
}

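// ExampleReadRobotsTxt is a minimal usage sketch, not part of the
// original file, showing what ReadRobotsTxt extracts from a small
// wildcard group.
func ExampleReadRobotsTxt() {
	const sample = "User-agent: *\n" +
		"Allow: /public/\n" +
		"Disallow: /private/\n" +
		"Crawl-delay: 5\n"
	rules, _ := ReadRobotsTxt(sample)
	fmt.Println(rules.AllowedPaths, rules.DisallowedPaths, rules.CrawlDelay)
	// prints: [/public/] [/private/] 5
}
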
// CheckCrawlOk reports whether the cached robots.txt rules permit
// crawling the given URL. An explicit Allow match wins over a Disallow
// match, and a URL matching neither list is allowed by default.
func CheckCrawlOk(u url.URL) bool {
	rootUrl := u.Scheme + "://" + u.Host
	robotsTxt, found := GetRobotsTxtCache(rootUrl)
	// if no rules are cached for the site, assume it is free to crawl
	if !found {
		fmt.Printf("no robots.txt rules found for site %s, free to crawl\n", rootUrl)
		return true
	}
	// check if the URL is explicitly allowed
	for _, allowedPath := range robotsTxt.AllowedPaths {
		if strings.HasPrefix(u.Path, allowedPath) {
			return true
		}
	}
	// check if the URL is disallowed
	for _, disallowedPath := range robotsTxt.DisallowedPaths {
		if strings.HasPrefix(u.Path, disallowedPath) {
			return false
		}
	}
	return true
}
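
// fetchAndCheck is a hypothetical end-to-end sketch, not part of the
// original file: it fetches a site's robots.txt on first sight, caches
// the parsed rules via AddRobotsTxtCache, and then consults them for
// the given page URL. It assumes two extra imports, "io" and "net/http",
// and skips non-200 status handling for brevity.
func fetchAndCheck(pageURL string) (bool, error) {
	u, err := url.Parse(pageURL)
	if err != nil {
		return false, err
	}
	rootUrl := u.Scheme + "://" + u.Host
	// fetch and cache the rules only if this site has not been seen yet
	if _, found := GetRobotsTxtCache(rootUrl); !found {
		resp, err := http.Get(rootUrl + "/robots.txt")
		if err != nil {
			return false, err
		}
		defer resp.Body.Close()
		body, err := io.ReadAll(resp.Body)
		if err != nil {
			return false, err
		}
		rules, err := ReadRobotsTxt(string(body))
		if err != nil {
			return false, err
		}
		AddRobotsTxtCache(rootUrl, *rules)
	}
	return CheckCrawlOk(*u), nil
}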