-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawl.rb
More file actions
executable file
·42 lines (35 loc) · 1.24 KB
/
crawl.rb
File metadata and controls
executable file
·42 lines (35 loc) · 1.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/opt/local/bin/ruby
require 'rubygems'
require 'hpricot'
require 'open-uri'
require 'eventmachine'
require 'uri'
# Download the Hacker News front page and collect its outbound links:
# anchors whose href starts with "http", minus any that point at
# ycombinator.com or economist.com.
front_page = Hpricot(open('http://news.ycombinator.com/'))
news_links = front_page.search('a[@href^=http]').reject do |anchor|
  href = anchor[:href]
  href.include?('ycombinator.com') || href.include?('economist.com')
end
# Reduce an HTML document to a single line of plain text.
#
# The markup is parsed with Hpricot and only the text content is kept.
# Newlines, tabs and carriage returns are turned into spaces, and every run
# of consecutive spaces is collapsed into one space.
#
# html - the HTML source to strip.
#
# Returns the text content as a String.
def strip_html(html)
  # squeeze collapses runs of spaces; the previous / */ pattern also matched
  # the empty string and therefore put a space between every character.
  Hpricot(html).inner_text.tr("\n\t\r", ' ').squeeze(' ')
end
# Word-count log: opened read/write in append mode, created when missing.
word_counts = File.new('wcs.txt', File::RDWR | File::CREAT | File::APPEND)

# Directory that receives the stripped text of each crawled page; make it
# on first use.
storage_directory = './news-yc-corpus/'
unless File.directory?(storage_directory)
  Dir.mkdir(storage_directory)
end
# Crawl every collected link inside the EventMachine reactor.
#
# For each link an HTTP GET is issued; when the response arrives its markup is
# stripped, the text is saved under storage_directory (file name = the URL
# with / : & ? replaced by _), and one line is appended to word_counts.
#
# NOTE(review): nothing in this script calls EM.stop, so the reactor keeps
# running after the last response — confirm whether that is intended.
EM.run {
  news_links.each { |nl|
    begin
      uri = URI::parse(nl[:href].gsub(' ', ''))
    rescue URI::InvalidURIError
      next # malformed hrefs are skipped on purpose
    end

    # Only http(s) URIs respond to request_uri, and connect needs a host;
    # skip anything else instead of letting it abort the whole crawl.
    next unless uri.is_a?(URI::HTTP) && uri.host

    # The connection is opened without TLS, so https links keep the original
    # plain port-80 attempt; http links honour an explicit port in the URL.
    port = uri.is_a?(URI::HTTPS) ? 80 : uri.port

    req = EM::Protocols::HttpClient2.connect(uri.host, port).get(uri.request_uri)
    req.callback {
      text_from_site = strip_html(req.content)
      file_name = uri.to_s.gsub(/[\/:&?]/, '_')

      # Block form closes the file on every path and, unlike an ensure that
      # calls close on a possibly-nil handle, does not mask a failed open.
      File.open(storage_directory + file_name, File::RDWR | File::TRUNC | File::CREAT) { |html_file|
        html_file.puts text_from_site
      }

      word_count = text_from_site.split(" ").length
      # NOTE(review): the URL and the file name are written back to back with
      # no separator; format kept as-is — confirm what readers of wcs.txt expect.
      word_counts.puts "#{word_count} #{uri}#{file_name}"
      # word_counts is never explicitly closed in this script, so push each
      # line out as soon as it is written.
      word_counts.flush
    }
  }
}