From 1ae4ad0a4044752a90f1adbb401323d23db3ebff Mon Sep 17 00:00:00 2001 From: David Buckley Date: Mon, 23 Apr 2018 13:05:31 -0400 Subject: [PATCH 1/2] Small update.py modularization (#11) initial modularization refactor of the data update process --- data/update.py | 70 +++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/data/update.py b/data/update.py index 00849065..e1711fd7 100644 --- a/data/update.py +++ b/data/update.py @@ -53,15 +53,42 @@ # skip: skip gathering, assume CSVs are locally cached # here: run the default full gather -def run(options): - # If this is just being used to download production data, do that. - if options.get("just-download", False): - download_s3() - return - - # Definitive scan date for the run. - today = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d") +def run(options): + # If this is just being used to download production data, do that. + if options.get("just-download", False): + download_s3() + return + + update(options) + # Sanity check to make sure we have what we need. + if not os.path.exists(os.path.join(PARENTS_RESULTS, "meta.json")): + LOGGER.info("No scan metadata downloaded, aborting.") + exit() + + # Date can be overridden if need be, but defaults to meta.json. + if options.get("date", None) is not None: + the_date = options.get("date") + else: + # depends on YYYY-MM-DD coming first in meta.json time format + scan_meta = ujson.load(open("data/output/parents/results/meta.json")) + the_date = scan_meta['start_time'][0:10] + + # 2. Process and load data into Pulse's database. + LOGGER.info("[%s] Loading data into Pulse." % the_date) + data.processing.run(the_date, options) + LOGGER.info("[%s] Data now loaded into Pulse." % the_date) + + # 3. Upload data to S3 (if requested). + if options.get("upload", False): + LOGGER.info("[%s] Syncing scan data and database to S3." 
% the_date) + upload_s3(the_date) + LOGGER.info("[%s] Scan data and database now in S3." % the_date) + + LOGGER.info("[%s] All done." % the_date) + + +def update(options): # 1. Download scan data, do a new scan, or skip altogether. scan_mode = options.get("scan", "skip") @@ -91,33 +118,6 @@ def run(options): download_s3() LOGGER.info("Download complete.") - # Sanity check to make sure we have what we need. - if not os.path.exists(os.path.join(PARENTS_RESULTS, "meta.json")): - LOGGER.info("No scan metadata downloaded, aborting.") - exit() - - # Date can be overridden if need be, but defaults to meta.json. - if options.get("date", None) is not None: - the_date = options.get("date") - else: - # depends on YYYY-MM-DD coming first in meta.json time format - scan_meta = ujson.load(open("data/output/parents/results/meta.json")) - the_date = scan_meta['start_time'][0:10] - - - # 2. Process and load data into Pulse's database. - LOGGER.info("[%s] Loading data into Pulse." % the_date) - data.processing.run(the_date, options) - LOGGER.info("[%s] Data now loaded into Pulse." % the_date) - - # 3. Upload data to S3 (if requested). - if options.get("upload", False): - LOGGER.info("[%s] Syncing scan data and database to S3." % the_date) - upload_s3(the_date) - LOGGER.info("[%s] Scan data and database now in S3." % the_date) - - LOGGER.info("[%s] All done." % the_date) - # Upload the scan + processed data to /live/ and /archive/ locations by date. 
def upload_s3(date): From 1ec53e285346e14765a18618775c09cca5ccc751 Mon Sep 17 00:00:00 2001 From: David Buckley Date: Wed, 25 Apr 2018 13:48:09 -0400 Subject: [PATCH 2/2] Add new CLI interface (#14) replace existing CLI interface with new consolidated, click-based CLI --- Makefile | 8 +-- README.md | 7 ++- data/cli.py | 138 +++++++++++++++++++++++++++++++++++++++++++++ data/processing.py | 9 +-- data/update.py | 58 ++----------------- setup.py | 6 ++ tests/test_cli.py | 92 ++++++++++++++++++++++++++++++ 7 files changed, 252 insertions(+), 66 deletions(-) create mode 100644 data/cli.py create mode 100644 tests/test_cli.py diff --git a/Makefile b/Makefile index 293183eb..e2cff326 100644 --- a/Makefile +++ b/Makefile @@ -39,19 +39,19 @@ clean: # Run a fresh scan, update the database, and upload data to S3. # Enable Lambda mode, using Lambda AWS profile set up in production. update_production: - python -m data.update --scan=here --upload --lambda --lambda-profile=lambda + pulse run --scan here --upload --lambda --lambda-profile lambda # Staging data update process: # # Download last production scan data, update the database. update_staging: - python -m data.update --scan=download + pulse run --scan download # Development data update process: # # Don't scan or download latest data (rely on local cache), update database. 
update_development: - python -m data.update --scan=skip + pulse run --scan skip # downloads latest snapshot of data locally # Pending cloud.gov production bucket: @@ -59,4 +59,4 @@ update_development: # Pending cloud.gov backup bucket: # cg-72ce4caf-d81b-4771-9b96-3624b5554587 data_init: - python -m data.update --just-download + pulse download diff --git a/README.md b/README.md index 3e5d383d..5717b003 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,11 @@ Pulse is a [Flask](http://flask.pocoo.org/) app written for **Python 3.5 and up* pip install -r requirements.txt ``` +* If developing Pulse, you will also need the development requirements +```bash +pip install .[development] +``` + * If developing the stylesheets, you will also need [Sass](http://sass-lang.com/), [Bourbon](http://bourbon.io/), [Neat](http://neat.bourbon.io/), and [Bitters](http://bitters.bourbon.io/). ```bash @@ -84,7 +89,7 @@ And link it to AWS credentials that allow authorized write access to the `pulse. From the Pulse root directory: ``` -python -m data.update +pulse run ``` This will kick off the `domain-scan` scanning process for HTTP/HTTPS and DAP participation, using the `.gov` domain list as specified in `meta.yml` for the base set of domains to scan. 
diff --git a/data/cli.py b/data/cli.py new file mode 100644 index 00000000..adc5a173 --- /dev/null +++ b/data/cli.py @@ -0,0 +1,138 @@ +import os +import typing +import datetime +import click +import ujson +from data.env import PARENTS_RESULTS +from data import update as data_update +from data import processing +from data import logger + + +LOGGER = logger.get_logger(__name__) + + +class DateType(click.ParamType): + name = 'date' + + def convert(self, value, param, ctx) -> str: + try: + datetime.datetime.strptime(value, '%Y-%m-%d') + return value + except ValueError: + self.fail(f'{value} is not a valid date') +DATE = DateType() + + +def get_cached_date(directory: str) -> str: + meta = os.path.join(directory, 'output/parents/results/meta.json') + with open(meta) as meta_file: + scan_meta = ujson.load(meta_file) + return scan_meta['start_time'][0:10] + + +def get_date( + ctx: click.core.Context, # pylint: disable=unused-argument + param: click.core.Option, # pylint: disable=unused-argument + value: typing.Optional[str] + ) -> str: + + # Date can be overridden if need be, but defaults to meta.json. + directory, _ = os.path.split(__file__) + + return value if value is not None else get_cached_date(directory) + + +# Convert ['--option', 'value', ... 
] to {'option': 'value', ...}
+def transform_args(args: typing.List[str]) -> typing.Dict[str, str]:
+    transformed = {}
+    for option, value in zip(args, args[1:]):
+        if option.startswith('--'):
+            name = option.strip('--')
+            transformed[name] = value if not value.startswith('--') else True
+    return transformed
+
+
+@click.group()
+def main() -> None:
+    pass
+
+
+@main.command(
+    context_settings=dict(
+        ignore_unknown_options=True,
+    ),
+    help='Composition of `update`, `process`, and `upload` commands',
+)
+@click.option('--date', type=DATE)
+@click.option('--scan', type=click.Choice(['skip', 'download', 'here']), default='skip')
+@click.option('--gather', type=click.Choice(['skip', 'here']), default='here')
+@click.option('--upload-results', is_flag=True, default=False)
+@click.argument('scan_args', nargs=-1, type=click.UNPROCESSED)
+def run(
+        date: typing.Optional[str],
+        scan: str,
+        gather: str,
+        upload_results: bool,
+        scan_args: typing.List[str]
+    ) -> None:
+
+    update.callback(scan, gather, scan_args)
+    the_date = get_date(None, 'date', date)
+    process.callback(the_date)
+    if upload_results:
+        upload.callback(the_date)
+
+
+@main.command(
+    context_settings=dict(
+        ignore_unknown_options=True,
+    ),
+    help='Gather and scan domains',
+)
+@click.option('--scan', type=click.Choice(['skip', 'download', 'here']), default='skip')
+@click.option('--gather', type=click.Choice(['skip', 'here']), default='here')
+@click.argument('scan_args', nargs=-1, type=click.UNPROCESSED)
+def update(
+        scan: str,
+        gather: str,
+        scan_args: typing.List[str]
+    ) -> None:
+
+    LOGGER.info('Starting update')
+    data_update.update(scan, gather, transform_args(scan_args))
+    LOGGER.info('Finished update')
+
+
+@main.command(help='Download scan results from s3')
+def download() -> None:
+    LOGGER.info('Downloading production data')
+    data_update.download_s3()
+    LOGGER.info('Finished downloading production data')
+
+
+@main.command(help='Upload scan results to s3')
+@click.option('--date',
type=DATE, callback=get_date) +def upload(date: str) -> None: + # Sanity check to make sure we have what we need. + if not os.path.exists(os.path.join(PARENTS_RESULTS, "meta.json")): + LOGGER.info("No scan metadata downloaded, aborting.") + return + + LOGGER.info(f'[{date}] Syncing scan data and database to S3.') + data_update.upload_s3(date) + LOGGER.info(f"[{date}] Scan data and database now in S3.") + + +@main.command(help='Process scan data') +@click.option('--date', type=DATE, callback=get_date) +def process(date: str) -> None: + + # Sanity check to make sure we have what we need. + if not os.path.exists(os.path.join(PARENTS_RESULTS, "meta.json")): + LOGGER.info("No scan metadata downloaded, aborting.") + return + + LOGGER.info(f"[{date}] Loading data into Pulse.") + processing.run(date) + LOGGER.info(f"[{date}] Data now loaded into Pulse.") diff --git a/data/processing.py b/data/processing.py index 4c8576f0..d58e4d23 100644 --- a/data/processing.py +++ b/data/processing.py @@ -97,9 +97,7 @@ # # This method blows away the database and rebuilds it from the given data. -# options (for debugging) - -def run(date, options): +def run(date): if date is None: date = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d") @@ -1175,8 +1173,3 @@ def branch_for(agency): else: return "executive" - -### Run when executed. - -if __name__ == '__main__': - run(None, options()) diff --git a/data/update.py b/data/update.py index e1711fd7..010cc809 100644 --- a/data/update.py +++ b/data/update.py @@ -41,60 +41,18 @@ # - TODO: Consider moving from aws CLI to Python library. 
- # Options: -# --date: override date, defaults to contents of meta.json -# --scan=[skip,download,here] +# scan_mode=[skip,download,here] # skip: skip all scanning, assume CSVs are locally cached # download: download scan data from S3 # here: run the default full scan -# --upload: upload scan data and resulting db.json anything to S3 -# --gather=[skip,here] +# gather_mode=[skip,here] # skip: skip gathering, assume CSVs are locally cached # here: run the default full gather +# options +# options to pass along to scan and gather operations - -def run(options): - # If this is just being used to download production data, do that. - if options.get("just-download", False): - download_s3() - return - - update(options) - # Sanity check to make sure we have what we need. - if not os.path.exists(os.path.join(PARENTS_RESULTS, "meta.json")): - LOGGER.info("No scan metadata downloaded, aborting.") - exit() - - # Date can be overridden if need be, but defaults to meta.json. - if options.get("date", None) is not None: - the_date = options.get("date") - else: - # depends on YYYY-MM-DD coming first in meta.json time format - scan_meta = ujson.load(open("data/output/parents/results/meta.json")) - the_date = scan_meta['start_time'][0:10] - - # 2. Process and load data into Pulse's database. - LOGGER.info("[%s] Loading data into Pulse." % the_date) - data.processing.run(the_date, options) - LOGGER.info("[%s] Data now loaded into Pulse." % the_date) - - # 3. Upload data to S3 (if requested). - if options.get("upload", False): - LOGGER.info("[%s] Syncing scan data and database to S3." % the_date) - upload_s3(the_date) - LOGGER.info("[%s] Scan data and database now in S3." % the_date) - - LOGGER.info("[%s] All done." % the_date) - - -def update(options): - # 1. Download scan data, do a new scan, or skip altogether. - scan_mode = options.get("scan", "skip") - - # Whether to gather domains (defaults to doing so). 
- gather_mode = options.get("gather", "here") - +def update(scan_mode, gather_mode, options): if scan_mode == "here": # 1a. Gather .gov federal subdomains. if gather_mode == "here": @@ -306,9 +264,3 @@ def shell_out(command, env=None): LOGGER.critical("Error running %s." % (str(command))) exit(1) return None - - -### Run when executed. - -if __name__ == '__main__': - run(options()) diff --git a/setup.py b/setup.py index 130ebe97..48c748df 100644 --- a/setup.py +++ b/setup.py @@ -27,6 +27,7 @@ 'ujson==1.35', 'waitress==1.0.1', 'flask-compress==1.4.0', + 'click==6.7', ], extras_require={ 'development': [ @@ -36,4 +37,9 @@ 'pytest-cov==2.5.1', ], }, + entry_points=''' + [console_scripts] + pulse=data.cli:main + ''' + ) diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 00000000..dd7153a1 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,92 @@ +import typing +from click.testing import CliRunner +import _pytest +import pytest +from data import cli +from data import update +from data import processing + + +@pytest.fixture(params=[('2018-04-24', 0), ('BAD_DATE', 2)]) +def date_result(request: _pytest.fixtures.SubRequest) -> typing.Tuple[str, int]: + return request.param + + +def noop(*args) -> None: # pylint: disable=unused-argument + pass + + +def test_run_all_args( + date_result: typing.Tuple[str, int], + monkeypatch: _pytest.monkeypatch.MonkeyPatch, + ) -> None: + monkeypatch.setattr(update, 'update', noop) + monkeypatch.setattr(processing, 'run', noop) + monkeypatch.setattr(update, 'download_s3', noop) + monkeypatch.setattr(update, 'upload_s3', noop) + + date, exit_code = date_result + + runner = CliRunner() + result = runner.invoke(cli.main, args=[ + 'run', + '--date', date, + '--scan', 'here', + '--gather', 'here', + '--upload' + ]) + assert result.exit_code == exit_code + + +def test_update(monkeypatch: _pytest.monkeypatch.MonkeyPatch) -> None: + monkeypatch.setattr(update, 'update', noop) + + runner = CliRunner() + result = 
runner.invoke(cli.main, args=[ + 'update', + '--scan', 'here', + '--gather', 'here', + ]) + assert result.exit_code == 0 + + +def test_process( + date_result: typing.Tuple[str, int], + monkeypatch: _pytest.monkeypatch.MonkeyPatch, + ) -> None: + + date, exit_code = date_result + + monkeypatch.setattr(processing, 'run', noop) + + runner = CliRunner() + result = runner.invoke(cli.main, args=[ + 'process', + '--date', date, + ]) + assert result.exit_code == exit_code + + +def test_download(monkeypatch: _pytest.monkeypatch.MonkeyPatch) -> None: + monkeypatch.setattr(update, 'download_s3', noop) + + runner = CliRunner() + result = runner.invoke(cli.main, args=[ + 'download', + ]) + assert result.exit_code == 0 + + +def test_upload( + date_result, + monkeypatch: _pytest.monkeypatch.MonkeyPatch) -> None: + + date, exit_code = date_result + monkeypatch.setattr(update, 'upload_s3', noop) + + runner = CliRunner() + result = runner.invoke(cli.main, args=[ + 'upload', + '--date', date + ]) + assert result.exit_code == exit_code