-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path main.go
More file actions
113 lines (87 loc) · 2.43 KB
/
main.go
File metadata and controls
113 lines (87 loc) · 2.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
package main
import (
"errors"
"fmt"
"github.com/spf13/afero"
"net"
"net/url"
"os"
"sort"
)
func appendUnique(links *[]url.URL, link url.URL) []url.URL {
for _, _link := range *links {
if _link.Path == link.Path {
return *links
}
}
return append(*links, link)
}
// retrieve fetches uri, saves its body as a markdown file, and recursively
// spawns one goroutine per link discovered on the page.
//
// Contract: exactly ONE message is emitted per call — either an error on
// control_channels.skipped (already-visited URL or fetch failure) or a
// url_subsequent_visits on control_channels.fetched carrying the fan-out
// count. main's termination counting relies on this one-message-per-call
// invariant.
func retrieve(control_channels controlChannels, config _config, uri url.URL, visited chan visitedUpdateRequest) {
	// Skip URLs that have already been dispatched for retrieval.
	if !(visitedUpdateRequest{url: uri}).dispatch(visited) {
		control_channels.skipped <- errors.New("url already visited")
		return
	}
	links, body, err := getLink(config.rooturl, uri, nil)
	if err != nil {
		// Report the failure and STOP. The original fell through here,
		// which dereferenced a possibly-nil links pointer and sent a
		// second message on fetched, breaking main's bookkeeping.
		control_channels.skipped <- err
		return
	}
	saveMarkdownFile(config, uri, body)
	additional_fetches := 0
	for _, link := range *links {
		additional_fetches++
		go retrieve(control_channels, config, link, visited)
	}
	control_channels.fetched <- url_subsequent_visits{uri, additional_fetches}
}
// url_subsequent_visits pairs a fetched URL with the number of follow-up
// retrieve goroutines spawned for the links found on that page; main uses
// the count to know how many more completion messages to expect.
// NOTE(review): underscore naming is unidiomatic Go (urlSubsequentVisits
// would be conventional), but renaming would touch callers outside this view.
type url_subsequent_visits struct {
	url                 url.URL // the URL that was just fetched
	subsequent_requests int     // number of child fetches started for its links
}
// controlChannels carries the two possible completion signals a retrieve
// goroutine sends back to main: a successful fetch (with its fan-out count)
// on fetched, or a skip/failure on skipped.
type controlChannels struct {
	fetched chan url_subsequent_visits
	skipped chan error
}
// main parses the command-line arguments, kicks off the recursive scrape at
// the configured root URL, then collects exactly one completion message per
// spawned retrieve goroutine before printing a sorted summary of all
// locations processed.
func main() {
	config, err :=
		readFromArgs(os.Args, afero.NewOsFs())
	if err != nil {
		fmt.Printf("\nusage: %v <root_url> <storage_directory>\n\n", os.Args[0])
		fmt.Printf("you didn't provide valid arguments:\n\n %s\n\n", err)
		os.Exit(1)
	}
	fmt.Printf("using the following arguments: \n %#v\n", config)
	track_visited := trackVisitedUrls()
	control_channels := controlChannels{
		make(chan url_subsequent_visits),
		make(chan error),
	}
	// Root fetch; every further goroutine is spawned transitively by retrieve.
	go retrieve(control_channels, config, config.rooturl, track_visited)
	// tocollect is the number of completion messages still expected: it
	// starts at 1 for the root fetch and grows by the fan-out reported on
	// each fetched message. The loop ends once every goroutine has reported
	// exactly once. NOTE(review): this relies on retrieve sending exactly
	// one message per call — confirm that invariant holds on error paths.
	tocollect := 1
	urls_fetched := []url.URL{}
	for n := 0; n < tocollect; n++ {
		select {
		case url_subsequent := <-control_channels.fetched:
			urls_fetched = append(urls_fetched, url_subsequent.url)
			tocollect += url_subsequent.subsequent_requests
		case err := <-control_channels.skipped:
			// Only network-level failures are shown to the user.
			if oe, ok := err.(*net.OpError); ok {
				fmt.Println(oe)
			} else {
				// url has been skipped because it was already requested;
				// we don't treat this as an error, it is only received to
				// keep the channel synchronisation count correct
			}
		}
	}
	fmt.Printf("Finished scraping `%s`\n", config.rooturl.String())
	fmt.Println("locations processed:")
	// Sort by path for deterministic, human-friendly output.
	sort.Slice(urls_fetched, func(i, j int) bool {
		return urls_fetched[i].Path < urls_fetched[j].Path
	})
	for i, link := range urls_fetched {
		fmt.Printf("%3d %s\n", i+1, link.String())
	}
	fmt.Println("bye bye...")
}