-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmapper.go
More file actions
144 lines (115 loc) · 3.75 KB
/
mapper.go
File metadata and controls
144 lines (115 loc) · 3.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
package main
import (
"net/url"
log "github.com/sirupsen/logrus"
"fmt"
)
// Utility function, checks if two addresses pertain to the same host
func isSameHost(root *url.URL, other *url.URL) bool {
return other.Host == root.Host
}
// The MapSite will start by pushing the specified root down the addressChan,
// will wait the related links on the link chan and (if those links are from
// the same host as the root and not already processed) will send those on the addressChan too.
// Once it has retrieved all the pages in the tree for the specified root, it will return a map
// containing as key the various pages and as value the list of the pages it links to.
func MapSite(root url.URL, addressChan chan url.URL, linksChan chan HtmlPageLinks) PagesMap {
state := initState()
log.Info("Starting crawling from root ", root)
addressChan <- root
state.onRequested(root)
for links := range linksChan {
// update the state (mapper is single threaded, no sync needed)
state.onRetrieved(links.Address, links.LinksTo)
for _, link := range links.LinksTo {
if isSameHost(&root, &link) && state.shouldBeRequested(link) {
log.Debug("Requesting ", link)
addressChan <- link
state.onRequested(link)
} else {
log.Debug("Skipping ", link)
}
}
if !state.hasPending() {
// all that we pushed down the addressChan has come back
// through the linksChan, there is nothing else for us to do
log.Info("Fetching completed")
// we completed our crawling, let's close the channel we write to,
// this will generate a chain reaction, leading to the closure of the
// linksChan, that will allow us to exit
close(addressChan)
}
}
return state.retrieved
}
type PendingMap map[url.URL]bool
type PagesMap map[url.URL][]url.URL
// Stores the current state of the mapper
type State struct {
pending PendingMap
retrieved PagesMap
}
func initState() State {
return State{
make(map[url.URL]bool),
make(map[url.URL][]url.URL),
}
}
func (state *State) onRequested(url url.URL) {
log.Print("Fetching ", url.String())
state.pending[url] = true
}
func (state *State) onRetrieved(url url.URL, links []url.URL) {
log.Print("Fetched ", len(links), " ", url.String())
delete(state.pending, url)
state.retrieved[url] = links
}
func (state *State) shouldBeRequested(url url.URL) bool {
_, isPending := state.pending[url]
_, isRetrieved := state.retrieved[url]
return !isPending && !isRetrieved
}
func (state *State) hasPending() bool {
return len(state.pending) != 0
}
// struct used to simulate the recursion stack
type StackElement struct {
address url.URL
level int
}
// Prints the sitemap
func (pages PagesMap) Print(root url.URL) {
// recursive version might be more concise, but if I understood correctly
// go does not interpret tail recursion so it would risk a stack overflow,
// let's iterate (assuming max slice size > max stack size)
printed := make(map[url.URL]bool, 0)
stack := make([]StackElement, 0)
// start from the root links and go through the map
stack = append(stack, StackElement{root, 0})
for len(stack) > 0 {
// pop the head
toPrint := stack[0]
stack = stack[1:]
// setup some decoration
for i := 0; i < toPrint.level; i++ {
fmt.Print(" ")
}
if toPrint.level > 0 {
fmt.Print("|-")
}
_, alreadyPrinted := printed[toPrint.address]
if !alreadyPrinted {
fmt.Println(toPrint.address.String())
printed[toPrint.address] = true
nextLevel := toPrint.level + 1
for _, child := range pages[toPrint.address] {
// push the stack
stack = append([]StackElement{{child, nextLevel}}, stack...)
}
} else {
// we already printed the tree starting from this page, just add a line
// to reference the previously printed branch
fmt.Println(toPrint.address.String(), "--> see above")
}
}
}