// Main package containing the scraper logic and CLI.
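//
// Typical invocation (flag values here are only illustrative):
//
//	go run crawler.go -b https://example.com -d 3
//
// The resulting sitemap is written to ./output and logs to ./output/logs.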
package main

import (
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/html"
)

// Page represents a single crawled page and all links found on it.
type Page struct {
	URL   *url.URL
	links []*Page
}

func main() {
	// setup logger: log to both stdout and a timestamped file under output/logs
	createDirIfNotExist("./output")
	createDirIfNotExist("output/logs")
	logFileName := generateDateFileName("output/logs/log_")
	f, err := os.OpenFile(logFileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		log.Fatalf("Error opening file: %v", err)
	}
	defer func() {
		if cerr := f.Close(); cerr != nil {
			fmt.Fprintf(os.Stderr, "failed to close log file: %v\n", cerr)
		}
	}()
	mw := io.MultiWriter(os.Stdout, f)
	log.SetOutput(mw)
	// setup CLI flags
	baseURLStr := flag.String("b", "https://example.com", "Starting URL to crawl from")
	var searchDepth int
	flag.IntVar(&searchDepth, "d", 5, "Number of levels you want to traverse (depth)")
	flag.Parse()
	// start crawling
	log.Printf("------- STARTING NEW CRAWL FOR: %s -------", *baseURLStr)
	baseURL, err := parseURL(*baseURLStr)
	if err != nil {
		logger("e", "Base URL could not be parsed")
		panic(err)
	}
	// start crawl from the base page
	basePage := &Page{URL: baseURL}
	var globalWait sync.WaitGroup
	globalWait.Add(1)
	go crawl(basePage, searchDepth, baseURL, &globalWait)
	globalWait.Wait()
	// all routines have returned, so we can now write the textual sitemap
	// create the output file, named after the hostname (minus any leading "www")
	createDirIfNotExist("./output")
	var filePrefix string
	hostParts := strings.Split(baseURL.Hostname(), ".")
	switch hostParts[0] {
	case "www":
		filePrefix = hostParts[1]
	default:
		filePrefix = hostParts[0]
	}
	outputFile, err := os.Create(generateDateFileName(fmt.Sprintf("output/%s_", filePrefix)))
	if err != nil {
		panic(err)
	}
	defer func() {
		if cerr := outputFile.Close(); cerr != nil {
			logger("e", fmt.Sprintf("Failed to close output file: %v", cerr))
		}
	}()
	// write sitemap to the output file
	logger("i", "Writing textual sitemap...")
	if err = printSitemap(basePage, 0, outputFile); err != nil {
		logger("e", fmt.Sprintf("Failed to write sitemap: %v", err))
		return
	}
	logger("i", "Crawl finished!")
}

/*
crawl crawls one page:
 1. check if the maximum depth has been reached, and return if it has
 2. get all links on the page
 3. for each link, recursively start a new goroutine to crawl it
*/
func crawl(page *Page, depth int, initialBaseURL *url.URL, globalWait *sync.WaitGroup) {
	logger("i", fmt.Sprintf("Starting crawl for: %s || depth of %d", page.URL.String(), depth))
	// mark this goroutine as done in the wait group when we return
	defer globalWait.Done()
	// check depth and return if max depth reached
	if depth == 0 {
		return
	}
	// get all links on the given page
	newLinks, err := getLinksFromURL(page.URL, initialBaseURL)
	if err != nil {
		logger("e", fmt.Sprintf("Error getting links for: %s", page.URL.String()))
		return
	}
	for _, newL := range newLinks {
		// add to the current page's links slice
		page.links = append(page.links, newL)
		// only recurse if the next depth is still > 0
		if depth-1 > 0 {
			// crawl each page found on this page in its own goroutine
			globalWait.Add(1)
			go crawl(newL, depth-1, initialBaseURL, globalWait)
		}
	}
}

// getLinksFromURL fetches the page at link and returns the unique same-domain
// links found in its <a> tags.
func getLinksFromURL(link *url.URL, baseURL *url.URL) ([]*Page, error) {
	// get response
	resp, err := http.Get(link.String())
	if err != nil {
		logger("e", fmt.Sprintf("Error getting response from: %s", link.String()))
		return nil, err
	}
	defer resp.Body.Close()
	// parse for <a> tags
	var newPages []*Page
	z := html.NewTokenizer(resp.Body)
	// used to avoid duplicate links being returned for this page
	seen := make(map[string]bool)
	for {
		token := z.Next()
		switch {
		case token == html.ErrorToken:
			// end of page, return
			return newPages, nil
		case token == html.StartTagToken:
			// check if an anchor tag was found
			tag := z.Token()
			// worth noting here that this does not guarantee 100% of links; some may be generated by JavaScript
			isAnchor := tag.Data == "a" //|| tag.Data == "link"
			if isAnchor {
				// get the href attribute
				for _, a := range tag.Attr {
					if a.Key == "href" {
						// link found, check if it belongs to the current subdomain
						l, err := parseURL(a.Val)
						if err != nil {
							logger("e", fmt.Sprintf("Error parsing link: %s", a.Val))
							return nil, err
						}
						if l.Hostname() == baseURL.Hostname() || !l.IsAbs() {
							// if the link is a relative path, resolve it against the base URL
							if !l.IsAbs() {
								l = baseURL.ResolveReference(l)
							}
							// check if the link has been seen on this page already
							if _, ok := seen[l.String()]; !ok {
								// link has not been seen before and is on the right domain
								lPage := &Page{URL: l}
								newPages = append(newPages, lPage)
								// add to the seen map
								seen[l.String()] = true
								break
							}
						} else {
							logger("i", fmt.Sprintf("Discarding url: %s", l.String()))
						}
					}
				}
			}
		}
	}
}

// parseURL takes a URL string and returns a *url.URL if it is valid.
func parseURL(URLString string) (*url.URL, error) {
	resultURL, err := url.Parse(URLString)
	if err != nil {
		logger("e", fmt.Sprintf("Could not parse URL: %s", URLString))
	}
	return resultURL, err
}

// printSitemap writes all links in basePage to outputFile at the given indent
// level, recursing into each child page.
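// The output looks roughly like this (URLs here are purely illustrative):
//
//	https://example.com
//	 - https://example.com/about
//	  - https://example.com/about/team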
func printSitemap(basePage *Page, indent int, outputFile *os.File) error {
	indentString := " "
	// print the basePage URL itself on the first call
	if indent == 0 {
		_, err := io.WriteString(outputFile, fmt.Sprintf("%s\n", basePage.URL.String()))
		if err != nil {
			logger("e", fmt.Sprintf("Failed to write to file for: %s", basePage.URL.String()))
		}
	}
	// print the basePage links recursively
	for _, l := range basePage.links {
		_, err := io.WriteString(outputFile, fmt.Sprintf("%s - %s\n", strings.Repeat(indentString, indent), l.URL.String()))
		if err != nil {
			logger("e", fmt.Sprintf("Failed to write to file for: %s", l.URL.String()))
		}
		// print children links
		err = printSitemap(l, indent+1, outputFile)
		if err != nil {
			return err
		}
	}
	return nil
}

// logger allows for levels of severity in our log output.
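// For example, logger("e", "request failed") logs "[ERROR] request failed"
// and logger("i", "done") logs "[INFO] done".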
func logger(severity string, message string) {
	switch severity {
	case "e":
		log.Printf("[ERROR] %s", message)
	case "i":
		log.Printf("[INFO] %s", message)
	}
}

// createDirIfNotExist creates the directory at path if it does not already exist.
func createDirIfNotExist(path string) {
	if _, err := os.Stat(path); os.IsNotExist(err) {
		err := os.Mkdir(path, 0755)
		if err != nil {
			log.Fatal(err)
		}
	}
}

// generateDateFileName generates a filename from the given prefix and the current UTC time.
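// For example, the prefix "output/logs/log_" yields something like
// "output/logs/log_2006-01-02_15-04-05" (the timestamp shown is the Go layout string).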
func generateDateFileName(datePrefix string) string {
	now := time.Now().UTC().Format("2006-01-02_15-04-05")
	// the callers' prefixes already end with "_", so just append the timestamp
	return fmt.Sprintf("%s%s", datePrefix, now)
}