forked from codificat/top500
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.py
More file actions
executable file
·132 lines (118 loc) · 5.09 KB
/
scrape.py
File metadata and controls
executable file
·132 lines (118 loc) · 5.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python3
'''
A web scraper to extract the details of the most powerful commercial
computing systems in the world from the Top500 list.
See https://www.top500.org/
'''
import argparse
import csv
import sys
from datetime import date
from top500.scraper import Scraper, ENTRY_FIELDS
from top500.urlgen import url_for_list, LAST_LIST, editions, VALID_YEARS, VALID_MONTHS
#
# Default values for command line options
#
DEFAULT_YEAR = LAST_LIST.year
DEFAULT_MONTH = LAST_LIST.month
DEFAULT_END_YEAR = DEFAULT_YEAR
DEFAULT_END_MONTH = DEFAULT_MONTH
DEFAULT_COUNT = 500
DEFAULT_OUTPUT_FILE = 'top500.csv'
def parse_options(dest):
'''Parses and validate command line arguments
'''
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument('-y', '--year', help="Collect from year",
default=DEFAULT_YEAR, type=int,
choices=VALID_YEARS)
parser.add_argument('-m', '--month', help="Collect from month",
default=DEFAULT_MONTH, type=int,
choices=VALID_MONTHS)
parser.add_argument('-z', '--endyear', help="Collect until year",
default=DEFAULT_END_YEAR, type=int,
choices=VALID_YEARS)
parser.add_argument('-n', '--endmonth', help="Collect until month",
default=DEFAULT_END_MONTH, type=int,
choices=VALID_MONTHS)
parser.add_argument('-c', '--count', default=DEFAULT_COUNT, type=int,
help="Number of entries to get from each edition")
parser.add_argument('-f', '--force', action='store_true',
help="Force a partial count")
parser.add_argument('outfile', nargs='?', default=DEFAULT_OUTPUT_FILE,
help="Output file",
type=argparse.FileType('w', encoding='utf-8'))
parser.parse_args(namespace=dest)
if dest.count < 1 or dest.count > 500:
parser.error("COUNT must be >1 and <500 (in hundreds unless forced)")
if dest.count % 100 != 0 and not dest.force:
parser.error("COUNT must be in hundreds. Use --force to override")
if dest.endyear < dest.year or \
(dest.endyear == dest.year and dest.endmonth < dest.month):
parser.error("End year/month must be >= start year/month")
class TOP500:
'''Main logic for the TOP500 website scraping'''
# pylint: disable=too-many-instance-attributes
def __init__(self):
self.year = DEFAULT_YEAR
self.month = DEFAULT_MONTH
self.endyear = DEFAULT_END_YEAR
self.endmonth = DEFAULT_END_MONTH
self.count = DEFAULT_COUNT
self.outfile = sys.stdout
self.csvwriter = None
self.scraper = Scraper()
def init_writer(self):
'''Initialize the CSV writer on top of the output file. This is not done
inside __init__ to allow options to set a differnt outfile
'''
self.csvwriter = csv.writer(self.outfile,
delimiter=',',
quotechar='"',
quoting=csv.QUOTE_MINIMAL)
# Write header (column names)
self.csvwriter.writerow(ENTRY_FIELDS)
def write_entry(self, entry):
'Writes an entry to the output file in CSV format'
self.csvwriter.writerow(entry.values())
def write_all(self):
'''Alternative approach to writing: when calling scrape() with
write=False, no entries will be written to the file during the
scraping process. You can then call write_all() to write them
all at once.'''
self.init_writer()
entries = self.scraper.get_list()
if entries:
for entry in entries:
self.csvwriter.writerow(entry.values())
print("Wrote a total of %d entries" % len(entries))
def scrape(self, write=True):
'''Scraping function. It drives the scraping by obtaining each of
the list's pages and calling the scraper for each.'''
if write:
self.init_writer()
self.scraper.set_entry_callback(self.write_entry)
start = date(self.year, self.month, 1)
end = date(self.endyear, self.endmonth, 1)
pages = int(self.count / 100)
limit = self.count % 100
if limit:
# The user requested a partial page
pages += 1
for edition in editions(start, end):
print("* Scraping TOP500 list edition: %d/%d" % (edition.year, edition.month))
for page in range(pages):
pagenum = page + 1
print("** Page %d of %d" % (pagenum, pages))
url = url_for_list(edition, pagenum)
if pagenum == pages and limit:
# Only partially parse the last page as requested
self.scraper.scrape_list_page(url, limit)
else:
self.scraper.scrape_list_page(url)
if __name__ == '__main__':
top500 = TOP500() # pylint: disable=invalid-name
parse_options(top500)
top500.scrape()