From 1ae4ad0a4044752a90f1adbb401323d23db3ebff Mon Sep 17 00:00:00 2001 From: David Buckley Date: Mon, 23 Apr 2018 13:05:31 -0400 Subject: [PATCH 1/2] Small update.py modularization (#11) initial modularization refactor of the data update process --- data/update.py | 70 +++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/data/update.py b/data/update.py index 00849065..e1711fd7 100644 --- a/data/update.py +++ b/data/update.py @@ -53,15 +53,42 @@ # skip: skip gathering, assume CSVs are locally cached # here: run the default full gather -def run(options): - # If this is just being used to download production data, do that. - if options.get("just-download", False): - download_s3() - return - - # Definitive scan date for the run. - today = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d") +def run(options): + # If this is just being used to download production data, do that. + if options.get("just-download", False): + download_s3() + return + + update(options) + # Sanity check to make sure we have what we need. + if not os.path.exists(os.path.join(PARENTS_RESULTS, "meta.json")): + LOGGER.info("No scan metadata downloaded, aborting.") + exit() + + # Date can be overridden if need be, but defaults to meta.json. + if options.get("date", None) is not None: + the_date = options.get("date") + else: + # depends on YYYY-MM-DD coming first in meta.json time format + scan_meta = ujson.load(open("data/output/parents/results/meta.json")) + the_date = scan_meta['start_time'][0:10] + + # 2. Process and load data into Pulse's database. + LOGGER.info("[%s] Loading data into Pulse." % the_date) + data.processing.run(the_date, options) + LOGGER.info("[%s] Data now loaded into Pulse." % the_date) + + # 3. Upload data to S3 (if requested). + if options.get("upload", False): + LOGGER.info("[%s] Syncing scan data and database to S3." 
% the_date) + upload_s3(the_date) + LOGGER.info("[%s] Scan data and database now in S3." % the_date) + + LOGGER.info("[%s] All done." % the_date) + + +def update(options): # 1. Download scan data, do a new scan, or skip altogether. scan_mode = options.get("scan", "skip") @@ -91,33 +118,6 @@ def run(options): download_s3() LOGGER.info("Download complete.") - # Sanity check to make sure we have what we need. - if not os.path.exists(os.path.join(PARENTS_RESULTS, "meta.json")): - LOGGER.info("No scan metadata downloaded, aborting.") - exit() - - # Date can be overridden if need be, but defaults to meta.json. - if options.get("date", None) is not None: - the_date = options.get("date") - else: - # depends on YYYY-MM-DD coming first in meta.json time format - scan_meta = ujson.load(open("data/output/parents/results/meta.json")) - the_date = scan_meta['start_time'][0:10] - - - # 2. Process and load data into Pulse's database. - LOGGER.info("[%s] Loading data into Pulse." % the_date) - data.processing.run(the_date, options) - LOGGER.info("[%s] Data now loaded into Pulse." % the_date) - - # 3. Upload data to S3 (if requested). - if options.get("upload", False): - LOGGER.info("[%s] Syncing scan data and database to S3." % the_date) - upload_s3(the_date) - LOGGER.info("[%s] Scan data and database now in S3." % the_date) - - LOGGER.info("[%s] All done." % the_date) - # Upload the scan + processed data to /live/ and /archive/ locations by date. 
def upload_s3(date): From 1ec53e285346e14765a18618775c09cca5ccc751 Mon Sep 17 00:00:00 2001 From: David Buckley Date: Wed, 25 Apr 2018 13:48:09 -0400 Subject: [PATCH 2/2] Add new CLI interface (#14) replace existing CLI interface with new consolidated, click-based CLI --- Makefile | 8 +-- README.md | 7 ++- data/cli.py | 138 +++++++++++++++++++++++++++++++++++++++++++++ data/processing.py | 9 +-- data/update.py | 58 ++----------------- setup.py | 6 ++ tests/test_cli.py | 92 ++++++++++++++++++++++++++++++ 7 files changed, 252 insertions(+), 66 deletions(-) create mode 100644 data/cli.py create mode 100644 tests/test_cli.py diff --git a/Makefile b/Makefile index 293183eb..e2cff326 100644 --- a/Makefile +++ b/Makefile @@ -39,19 +39,19 @@ clean: # Run a fresh scan, update the database, and upload data to S3. # Enable Lambda mode, using Lambda AWS profile set up in production. update_production: - python -m data.update --scan=here --upload --lambda --lambda-profile=lambda + pulse run --scan here --upload --lambda --lambda-profile lambda # Staging data update process: # # Download last production scan data, update the database. update_staging: - python -m data.update --scan=download + pulse run --scan download # Development data update process: # # Don't scan or download latest data (rely on local cache), update database. 
update_development: - python -m data.update --scan=skip + pulse run --scan skip # downloads latest snapshot of data locally # Pending cloud.gov production bucket: @@ -59,4 +59,4 @@ update_development: # Pending cloud.gov backup bucket: # cg-72ce4caf-d81b-4771-9b96-3624b5554587 data_init: - python -m data.update --just-download + pulse download diff --git a/README.md b/README.md index 3e5d383d..5717b003 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,11 @@ Pulse is a [Flask](http://flask.pocoo.org/) app written for **Python 3.5 and up* pip install -r requirements.txt ``` +* If developing Pulse, you will also need the development requirements +```bash +pip install .[development] +``` + * If developing the stylesheets, you will also need [Sass](http://sass-lang.com/), [Bourbon](http://bourbon.io/), [Neat](http://neat.bourbon.io/), and [Bitters](http://bitters.bourbon.io/). ```bash @@ -84,7 +89,7 @@ And link it to AWS credentials that allow authorized write access to the `pulse. From the Pulse root directory: ``` -python -m data.update +pulse run ``` This will kick off the `domain-scan` scanning process for HTTP/HTTPS and DAP participation, using the `.gov` domain list as specified in `meta.yml` for the base set of domains to scan. 
diff --git a/data/cli.py b/data/cli.py new file mode 100644 index 00000000..adc5a173 --- /dev/null +++ b/data/cli.py @@ -0,0 +1,138 @@ +import os +import typing +import datetime +import click +import ujson +from data.env import PARENTS_RESULTS +from data import update as data_update +from data import processing +from data import logger + + +LOGGER = logger.get_logger(__name__) + + +class DateType(click.ParamType): + name = 'date' + + def convert(self, value, param, ctx) -> str: + try: + datetime.datetime.strptime(value, '%Y-%m-%d') + return value + except ValueError: + self.fail(f'{value} is not a valid date') +DATE = DateType() + + +def get_cached_date(directory: str) -> str: + meta = os.path.join(directory, 'output/parents/results/meta.json') + with open(meta) as meta_file: + scan_meta = ujson.load(meta_file) + return scan_meta['start_time'][0:10] + + +def get_date( + ctx: click.core.Context, # pylint: disable=unused-argument + param: click.core.Option, # pylint: disable=unused-argument + value: typing.Optional[str] + ) -> str: + + # Date can be overridden if need be, but defaults to meta.json. + directory, _ = os.path.split(__file__) + + return value if value is not None else get_cached_date(directory) + + +# Convert ['--option', 'value', ... 
] to {'option': 'value', ...}
+def transform_args(args: typing.List[str]) -> typing.Dict[str, str]:
+    transformed = {}
+    for option, value in zip(args, args[1:]):
+        if option.startswith('--'):
+            name = option.strip('--')
+            transformed[name] = value if not value.startswith('--') else True
+    return transformed
+
+
+@click.group()
+def main() -> None:
+    pass
+
+
+@main.command(
+    context_settings=dict(
+        ignore_unknown_options=True,
+    ),
+    help='Composition of `update`, `process`, and `upload` commands',
+)
+@click.option('--date', type=DATE)
+@click.option('--scan', type=click.Choice(['skip', 'download', 'here']), default='skip')
+@click.option('--gather', type=click.Choice(['skip', 'here']), default='here')
+@click.option('--upload-results', is_flag=True, default=False)
+@click.argument('scan_args', nargs=-1, type=click.UNPROCESSED)
+def run(
+        date: typing.Optional[str],
+        scan: str,
+        gather: str,
+        upload_results: bool,
+        scan_args: typing.List[str]
+    ) -> None:
+
+    update.callback(scan, gather, scan_args)
+    the_date = get_date(None, 'date', date)
+    process.callback(the_date)
+    if upload_results:
+        upload.callback(the_date)
+
+
+@main.command(
+    context_settings=dict(
+        ignore_unknown_options=True,
+    ),
+    help='Gather and scan domains',
+)
+@click.option('--scan', type=click.Choice(['skip', 'download', 'here']), default='skip')
+@click.option('--gather', type=click.Choice(['skip', 'here']), default='here')
+@click.argument('scan_args', nargs=-1, type=click.UNPROCESSED)
+def update(
+        scan: str,
+        gather: str,
+        scan_args: typing.List[str]
+    ) -> None:
+
+    LOGGER.info('Starting update')
+    data_update.update(scan, gather, transform_args(scan_args))
+    LOGGER.info('Finished update')
+
+
+@main.command(help='Download scan results from s3')
+def download() -> None:
+    LOGGER.info('Downloading production data')
+    data_update.download_s3()
+    LOGGER.info('Finished downloading production data')
+
+
+@main.command(help='Upload scan results to s3')
+@click.option('--date',
type=DATE, callback=get_date) +def upload(date: str) -> None: + # Sanity check to make sure we have what we need. + if not os.path.exists(os.path.join(PARENTS_RESULTS, "meta.json")): + LOGGER.info("No scan metadata downloaded, aborting.") + return + + LOGGER.info(f'[{date}] Syncing scan data and database to S3.') + data_update.upload_s3(date) + LOGGER.info(f"[{date}] Scan data and database now in S3.") + + +@main.command(help='Process scan data') +@click.option('--date', type=DATE, callback=get_date) +def process(date: str) -> None: + + # Sanity check to make sure we have what we need. + if not os.path.exists(os.path.join(PARENTS_RESULTS, "meta.json")): + LOGGER.info("No scan metadata downloaded, aborting.") + return + + LOGGER.info(f"[{date}] Loading data into Pulse.") + processing.run(date) + LOGGER.info(f"[{date}] Data now loaded into Pulse.") diff --git a/data/processing.py b/data/processing.py index 4c8576f0..d58e4d23 100644 --- a/data/processing.py +++ b/data/processing.py @@ -97,9 +97,7 @@ # # This method blows away the database and rebuilds it from the given data. -# options (for debugging) - -def run(date, options): +def run(date): if date is None: date = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d") @@ -1175,8 +1173,3 @@ def branch_for(agency): else: return "executive" - -### Run when executed. - -if __name__ == '__main__': - run(None, options()) diff --git a/data/update.py b/data/update.py index e1711fd7..010cc809 100644 --- a/data/update.py +++ b/data/update.py @@ -41,60 +41,18 @@ # - TODO: Consider moving from aws CLI to Python library. 
- # Options: -# --date: override date, defaults to contents of meta.json -# --scan=[skip,download,here] +# scan_mode=[skip,download,here] # skip: skip all scanning, assume CSVs are locally cached # download: download scan data from S3 # here: run the default full scan -# --upload: upload scan data and resulting db.json anything to S3 -# --gather=[skip,here] +# gather_mode=[skip,here] # skip: skip gathering, assume CSVs are locally cached # here: run the default full gather +# options +# options to pass along to scan and gather operations - -def run(options): - # If this is just being used to download production data, do that. - if options.get("just-download", False): - download_s3() - return - - update(options) - # Sanity check to make sure we have what we need. - if not os.path.exists(os.path.join(PARENTS_RESULTS, "meta.json")): - LOGGER.info("No scan metadata downloaded, aborting.") - exit() - - # Date can be overridden if need be, but defaults to meta.json. - if options.get("date", None) is not None: - the_date = options.get("date") - else: - # depends on YYYY-MM-DD coming first in meta.json time format - scan_meta = ujson.load(open("data/output/parents/results/meta.json")) - the_date = scan_meta['start_time'][0:10] - - # 2. Process and load data into Pulse's database. - LOGGER.info("[%s] Loading data into Pulse." % the_date) - data.processing.run(the_date, options) - LOGGER.info("[%s] Data now loaded into Pulse." % the_date) - - # 3. Upload data to S3 (if requested). - if options.get("upload", False): - LOGGER.info("[%s] Syncing scan data and database to S3." % the_date) - upload_s3(the_date) - LOGGER.info("[%s] Scan data and database now in S3." % the_date) - - LOGGER.info("[%s] All done." % the_date) - - -def update(options): - # 1. Download scan data, do a new scan, or skip altogether. - scan_mode = options.get("scan", "skip") - - # Whether to gather domains (defaults to doing so). 
- gather_mode = options.get("gather", "here") - +def update(scan_mode, gather_mode, options): if scan_mode == "here": # 1a. Gather .gov federal subdomains. if gather_mode == "here": @@ -306,9 +264,3 @@ def shell_out(command, env=None): LOGGER.critical("Error running %s." % (str(command))) exit(1) return None - - -### Run when executed. - -if __name__ == '__main__': - run(options()) diff --git a/setup.py b/setup.py index 130ebe97..48c748df 100644 --- a/setup.py +++ b/setup.py @@ -27,6 +27,7 @@ 'ujson==1.35', 'waitress==1.0.1', 'flask-compress==1.4.0', + 'click==6.7', ], extras_require={ 'development': [ @@ -36,4 +37,9 @@ 'pytest-cov==2.5.1', ], }, + entry_points=''' + [console_scripts] + pulse=data.cli:main + ''' + ) diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 00000000..dd7153a1 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,92 @@ +import typing +from click.testing import CliRunner +import _pytest +import pytest +from data import cli +from data import update +from data import processing + + +@pytest.fixture(params=[('2018-04-24', 0), ('BAD_DATE', 2)]) +def date_result(request: _pytest.fixtures.SubRequest) -> typing.Tuple[str, int]: + return request.param + + +def noop(*args) -> None: # pylint: disable=unused-argument + pass + + +def test_run_all_args( + date_result: typing.Tuple[str, int], + monkeypatch: _pytest.monkeypatch.MonkeyPatch, + ) -> None: + monkeypatch.setattr(update, 'update', noop) + monkeypatch.setattr(processing, 'run', noop) + monkeypatch.setattr(update, 'download_s3', noop) + monkeypatch.setattr(update, 'upload_s3', noop) + + date, exit_code = date_result + + runner = CliRunner() + result = runner.invoke(cli.main, args=[ + 'run', + '--date', date, + '--scan', 'here', + '--gather', 'here', + '--upload' + ]) + assert result.exit_code == exit_code + + +def test_update(monkeypatch: _pytest.monkeypatch.MonkeyPatch) -> None: + monkeypatch.setattr(update, 'update', noop) + + runner = CliRunner() + result = 
runner.invoke(cli.main, args=[ + 'update', + '--scan', 'here', + '--gather', 'here', + ]) + assert result.exit_code == 0 + + +def test_process( + date_result: typing.Tuple[str, int], + monkeypatch: _pytest.monkeypatch.MonkeyPatch, + ) -> None: + + date, exit_code = date_result + + monkeypatch.setattr(processing, 'run', noop) + + runner = CliRunner() + result = runner.invoke(cli.main, args=[ + 'process', + '--date', date, + ]) + assert result.exit_code == exit_code + + +def test_download(monkeypatch: _pytest.monkeypatch.MonkeyPatch) -> None: + monkeypatch.setattr(update, 'download_s3', noop) + + runner = CliRunner() + result = runner.invoke(cli.main, args=[ + 'download', + ]) + assert result.exit_code == 0 + + +def test_upload( + date_result, + monkeypatch: _pytest.monkeypatch.MonkeyPatch) -> None: + + date, exit_code = date_result + monkeypatch.setattr(update, 'upload_s3', noop) + + runner = CliRunner() + result = runner.invoke(cli.main, args=[ + 'upload', + '--date', date + ]) + assert result.exit_code == exit_code