// robots.go
package main

import (
	"fmt"
	"net/url"
	"regexp"
	"strings"
)

// RobotsTxt holds the crawl rules parsed from a site's robots.txt for
// the wildcard user agent ("User-agent: *").
type RobotsTxt struct {
	AllowedPaths    []string
	DisallowedPaths []string
	CrawlDelay      int // seconds between requests; 0 if unspecified
}

// Cache of parsed robots.txt rules, keyed by root URL (scheme://host),
// to avoid refetching and reparsing. Not safe for concurrent use.
var robotsTxtCache = make(map[string]RobotsTxt)

// AddRobotsTxtCache stores the parsed rules for a site's root URL.
func AddRobotsTxtCache(rootUrl string, parsed RobotsTxt) {
	robotsTxtCache[rootUrl] = parsed
	fmt.Println("cached robots.txt rules for site", rootUrl)
}

// GetRobotsTxtCache returns the cached rules for a root URL, if present.
func GetRobotsTxtCache(rootUrl string) (RobotsTxt, bool) {
	robotsTxt, found := robotsTxtCache[rootUrl]
	return robotsTxt, found
}

// ReadRobotsTxt parses the rules in the "User-agent: *" group of a
// robots.txt body: it starts at the wildcard user-agent line and stops
// at the blank line that terminates that group.
func ReadRobotsTxt(content string) (*RobotsTxt, error) {
	allowedPaths := []string{}
	disallowedPaths := []string{}
	crawlDelay := 0
	userAgentRegex := regexp.MustCompile(`User-[Aa]gent: \*`)

	parse := false
	for _, raw := range strings.Split(content, "\n") {
		line := strings.TrimSpace(raw)
		// begin parsing once the "User-agent: *" group is found
		if !parse && userAgentRegex.MatchString(line) {
			parse = true
		}
		if !parse {
			continue
		}
		// a blank line ends the user-agent group; stop parsing
		if line == "" {
			break
		}
		// collect Allow, Disallow, and Crawl-delay rules
		switch {
		case strings.HasPrefix(line, "Allow: "):
			allowedPaths = append(allowedPaths, strings.TrimPrefix(line, "Allow: "))
		case strings.HasPrefix(line, "Disallow: "):
			disallowedPaths = append(disallowedPaths, strings.TrimPrefix(line, "Disallow: "))
		case strings.HasPrefix(line, "Crawl-delay: "):
			// a non-numeric value leaves crawlDelay at 0
			fmt.Sscanf(line, "Crawl-delay: %d", &crawlDelay)
		}
	}

	return &RobotsTxt{
		AllowedPaths:    allowedPaths,
		DisallowedPaths: disallowedPaths,
		CrawlDelay:      crawlDelay,
	}, nil
}

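// ExampleReadRobotsTxt is a minimal usage sketch, not part of the
// original file, showing what ReadRobotsTxt extracts from a small
// wildcard group.
func ExampleReadRobotsTxt() {
	const sample = "User-agent: *\n" +
		"Allow: /public/\n" +
		"Disallow: /private/\n" +
		"Crawl-delay: 5\n"
	rules, _ := ReadRobotsTxt(sample)
	fmt.Println(rules.AllowedPaths, rules.DisallowedPaths, rules.CrawlDelay)
	// prints: [/public/] [/private/] 5
}
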
// CheckCrawlOk reports whether the cached robots.txt rules permit
// crawling the given URL. An explicit Allow match wins over a Disallow
// match, and a URL matching neither list is allowed by default.
func CheckCrawlOk(u url.URL) bool {
	rootUrl := u.Scheme + "://" + u.Host
	robotsTxt, found := GetRobotsTxtCache(rootUrl)
	// if no rules are cached for the site, assume it is free to crawl
	if !found {
		fmt.Printf("no robots.txt rules found for site %s, free to crawl\n", rootUrl)
		return true
	}
	// check if the URL is explicitly allowed
	for _, allowedPath := range robotsTxt.AllowedPaths {
		if strings.HasPrefix(u.Path, allowedPath) {
			return true
		}
	}
	// check if the URL is disallowed
	for _, disallowedPath := range robotsTxt.DisallowedPaths {
		if strings.HasPrefix(u.Path, disallowedPath) {
			return false
		}
	}
	return true
}
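
// fetchAndCheck is a hypothetical end-to-end sketch, not part of the
// original file: it fetches a site's robots.txt on first sight, caches
// the parsed rules via AddRobotsTxtCache, and then consults them for
// the given page URL. It assumes two extra imports, "io" and "net/http",
// and skips non-200 status handling for brevity.
func fetchAndCheck(pageURL string) (bool, error) {
	u, err := url.Parse(pageURL)
	if err != nil {
		return false, err
	}
	rootUrl := u.Scheme + "://" + u.Host
	// fetch and cache the rules only if this site has not been seen yet
	if _, found := GetRobotsTxtCache(rootUrl); !found {
		resp, err := http.Get(rootUrl + "/robots.txt")
		if err != nil {
			return false, err
		}
		defer resp.Body.Close()
		body, err := io.ReadAll(resp.Body)
		if err != nil {
			return false, err
		}
		rules, err := ReadRobotsTxt(string(body))
		if err != nil {
			return false, err
		}
		AddRobotsTxtCache(rootUrl, *rules)
	}
	return CheckCrawlOk(*u), nil
}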