1- from argparse import ArgumentParser
1+ from argparse import ArgumentParser , Namespace
22import logging
33import csv
44import sys
55import json
66import os
77
88import cdx_toolkit
9- from cdx_toolkit .commoncrawl import normalize_crawl
9+
10+ from cdx_toolkit .utils import get_version , setup_cdx_fetcher_and_kwargs
11+
1012
1113LOGGER = logging .getLogger (__name__ )
1214
@@ -135,7 +137,7 @@ def main(args=None):
135137 cmd .func (cmd , cmdline )
136138
137139
138- def set_loglevel (cmd ):
140+ def set_loglevel (cmd : Namespace ):
139141 loglevel = os .getenv ('LOGLEVEL' ) or 'WARNING'
140142 if cmd .verbose :
141143 if cmd .verbose > 0 :
@@ -151,58 +153,15 @@ def set_loglevel(cmd):
151153 LOGGER .info ('set loglevel to %s' , str (loglevel ))
152154
153155
154- def get_version ():
155- return cdx_toolkit .__version__
156-
157-
158- def setup (cmd ):
159- kwargs = {}
160- kwargs ['source' ] = 'cc' if cmd .crawl else cmd .cc or cmd .ia or cmd .source or None
161- if kwargs ['source' ] is None :
162- raise ValueError ('must specify --cc, --ia, or a --source' )
163- if cmd .wb :
164- kwargs ['wb' ] = cmd .wb
165- if cmd .cc_mirror :
166- kwargs ['cc_mirror' ] = cmd .cc_mirror
167- if cmd .crawl :
168- kwargs ['crawl' ] = normalize_crawl ([cmd .crawl ]) # currently a string, not a list
169- if getattr (cmd , 'warc_download_prefix' , None ) is not None :
170- kwargs ['warc_download_prefix' ] = cmd .warc_download_prefix
171-
172- cdx = cdx_toolkit .CDXFetcher (** kwargs )
173-
174- kwargs = {}
175- if cmd .limit :
176- kwargs ['limit' ] = cmd .limit
177- if 'from' in vars (cmd ) and vars (cmd )['from' ]: # python, uh, from is a reserved word
178- kwargs ['from_ts' ] = vars (cmd )['from' ]
179- if cmd .to :
180- kwargs ['to' ] = cmd .to
181- if cmd .closest :
182- if not cmd .get : # pragma: no cover
183- LOGGER .info ('note: --closest works best with --get' )
184- kwargs ['closest' ] = cmd .closest
185- if cmd .filter :
186- kwargs ['filter' ] = cmd .filter
187-
188- if cmd .cmd == 'warc' and cmd .size :
189- kwargs ['size' ] = cmd .size
190-
191- if cmd .cmd == 'size' and cmd .details :
192- kwargs ['details' ] = cmd .details
193-
194- return cdx , kwargs
195-
196-
197- def winnow_fields (cmd , fields , obj ):
def winnow_fields(cmd: Namespace, fields, obj):
    """Pick the portion of a record that should be printed.

    Args:
        cmd: Parsed command-line arguments; only ``cmd.all_fields`` is read.
        fields: Iterable of field names to keep when ``cmd.all_fields`` is
            falsy.
        obj: Mapping-like record supporting ``in`` and ``[]`` lookups.

    Returns:
        ``obj`` itself (the same object, not a copy) when ``cmd.all_fields``
        is truthy; otherwise a new dict containing only the requested fields
        that are present in ``obj``, in the iteration order of ``fields``.
    """
    if cmd.all_fields:
        return obj
    # Requested fields that the record lacks are skipped rather than raising.
    return {k: obj[k] for k in fields if k in obj}
203162
204163
205- def print_line (cmd , writer , printme ):
164+ def print_line (cmd : Namespace , writer , printme ):
206165 if cmd .jsonl :
207166 print (json .dumps (printme , sort_keys = True ))
208167 elif writer :
@@ -211,8 +170,8 @@ def print_line(cmd, writer, printme):
211170 print (', ' .join ([' ' .join ((k , printme [k ])) for k in sorted (printme .keys ())]))
212171
213172
214- def iterator (cmd , cmdline ):
215- cdx , kwargs = setup (cmd )
173+ def iterator (cmd : Namespace , cmdline ):
174+ cdx , kwargs = setup_cdx_fetcher_and_kwargs (cmd )
216175 fields = set (cmd .fields .split (',' ))
217176 if cmd .csv :
218177 writer = csv .DictWriter (sys .stdout , fieldnames = sorted (list (fields )))
@@ -232,8 +191,8 @@ def iterator(cmd, cmdline):
232191 print_line (cmd , writer , printme )
233192
234193
235- def warcer (cmd , cmdline ):
236- cdx , kwargs = setup (cmd )
194+ def warcer (cmd : Namespace , cmdline : str ):
195+ cdx , kwargs = setup_cdx_fetcher_and_kwargs (cmd )
237196
238197 ispartof = cmd .prefix
239198 if cmd .subprefix :
@@ -275,9 +234,15 @@ def warcer(cmd, cmdline):
275234 LOGGER .warning ('revisit record being resolved for url %s %s' , url , timestamp )
276235 writer .write_record (record )
277236
237+ writer .close ()
278238
279- def sizer (cmd , cmdline ):
280- cdx , kwargs = setup (cmd )
239+
def sizer(cmd: Namespace, cmdline):
    """Print the size estimate for ``cmd.url``.

    Builds the fetcher and query keyword arguments from ``cmd`` via
    ``setup_cdx_fetcher_and_kwargs``, asks the fetcher for a size estimate
    of ``cmd.url`` and writes the result to stdout.

    Args:
        cmd: Parsed command-line arguments.
        cmdline: Not used by this function.
    """
    fetcher, query_kwargs = setup_cdx_fetcher_and_kwargs(cmd)

    print(fetcher.get_size_estimate(cmd.url, **query_kwargs))
245+
246+
247+ if __name__ == "__main__" :
248+ main ()
0 commit comments