8 changes: 4 additions & 4 deletions Makefile
@@ -39,24 +39,24 @@ clean:
# Run a fresh scan, update the database, and upload data to S3.
# Enable Lambda mode, using Lambda AWS profile set up in production.
update_production:
python -m data.update --scan=here --upload --lambda --lambda-profile=lambda
pulse run --scan here --upload-results --lambda --lambda-profile lambda

# Staging data update process:
#
# Download last production scan data, update the database.
update_staging:
python -m data.update --scan=download
pulse run --scan download

# Development data update process:
#
# Don't scan or download latest data (rely on local cache), update database.
update_development:
python -m data.update --scan=skip
pulse run --scan skip

# downloads latest snapshot of data locally
# Pending cloud.gov production bucket:
# cg-4adefb86-dadb-4ecf-be3e-f1c7b4f6d084
# Pending cloud.gov backup bucket:
# cg-72ce4caf-d81b-4771-9b96-3624b5554587
data_init:
python -m data.update --just-download
pulse download
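For local development, the targets above would typically be chained roughly as follows (a sketch; it assumes the package is installed so the `pulse` entry point defined in `setup.py` is on the PATH):

```bash
# Pull down the latest production snapshot once, then rebuild the
# local database from the cached CSVs without re-scanning.
make data_init
make update_development
```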
7 changes: 6 additions & 1 deletion README.md
@@ -23,6 +23,11 @@ Pulse is a [Flask](http://flask.pocoo.org/) app written for **Python 3.5 and up**
pip install -r requirements.txt
```

* If developing Pulse, you will also need the development requirements:
```bash
pip install .[development]
```

* If developing the stylesheets, you will also need [Sass](http://sass-lang.com/), [Bourbon](http://bourbon.io/), [Neat](http://neat.bourbon.io/), and [Bitters](http://bitters.bourbon.io/).

```bash
@@ -84,7 +89,7 @@ And link it to AWS credentials that allow authorized write access to the `pulse.
From the Pulse root directory:

```
python -m data.update
pulse run
```

This will kick off the `domain-scan` scanning process for HTTP/HTTPS and DAP participation, using the `.gov` domain list as specified in `meta.yml` for the base set of domains to scan.
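
The new CLI exposes roughly the same knobs as the old module invocation. For example (a sketch using the flags defined in `data/cli.py`; the date is an arbitrary illustration):

```
# Scan and gather locally, load the results, then sync data and database to S3:
pulse run --scan here --gather here --upload-results

# Rebuild the database from previously downloaded scan data for a given date:
pulse run --scan download --date 2020-01-01
```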
138 changes: 138 additions & 0 deletions data/cli.py
@@ -0,0 +1,138 @@
import os
import typing
import datetime
import click
import ujson
from data.env import PARENTS_RESULTS
from data import update as data_update
from data import processing
from data import logger


LOGGER = logger.get_logger(__name__)


class DateType(click.ParamType):
name = 'date'

def convert(self, value, param, ctx) -> str:
try:
datetime.datetime.strptime(value, '%Y-%m-%d')
return value
except ValueError:
self.fail(f'{value} is not a valid date')
DATE = DateType()


def get_cached_date(directory: str) -> str:
meta = os.path.join(directory, 'output/parents/results/meta.json')
with open(meta) as meta_file:
scan_meta = ujson.load(meta_file)
return scan_meta['start_time'][0:10]
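# Note: the meta.json written by the scan is assumed to look roughly like
# {"start_time": "2020-01-01T00:00:00", ...}; only the leading YYYY-MM-DD
# portion is used, mirroring the "YYYY-MM-DD comes first" assumption
# documented in the old update.py.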


def get_date(
ctx: click.core.Context, # pylint: disable=unused-argument
param: click.core.Option, # pylint: disable=unused-argument
value: typing.Optional[str]
) -> str:

# Date can be overridden if need be, but defaults to meta.json.
directory, _ = os.path.split(__file__)

return value if value is not None else get_cached_date(directory)


# Convert ['--option', 'value', ... ] to {'option': 'value', ...}
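# For example (illustrative values):
#   transform_args(['--lambda', '--lambda-profile', 'lambda'])
#   => {'lambda': True, 'lambda-profile': 'lambda'}
# A flag that appears last, with nothing after it, is dropped, since the
# pairwise zip below never yields it in the option position.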
def transform_args(args: typing.List[str]) -> typing.Dict[str, str]:
transformed = {}
for option, value in zip(args, args[1:]):
if option.startswith('--'):
name = option.strip('--')
transformed[name] = value if not value.startswith('--') else True
return transformed


@click.group()
def main() -> None:
pass


@main.command(
context_settings=dict(
ignore_unknown_options=True,
),
help='Composition of `update`, `process`, and `upload` commands',
)
@click.option('--date', type=DATE)
@click.option('--scan', type=click.Choice(['skip', 'download', 'here']), default='skip')
@click.option('--gather', type=click.Choice(['skip', 'here']), default='here')
@click.option('--upload-results', is_flag=True, default=False)
@click.argument('scan_args', nargs=-1, type=click.UNPROCESSED)
def run(
date: typing.Optional[str],
scan: str,
gather: str,
upload_results: bool,
scan_args: typing.List[str]
) -> None:

update.callback(scan, gather, scan_args)
the_date = get_date(None, 'date', date)
process.callback(the_date)
if upload_results:
upload.callback(the_date)


@main.command(
context_settings=dict(
ignore_unknown_options=True,
),
help='Gather and scan domains',
)
@click.option('--scan', type=click.Choice(['skip', 'download', 'here']), default='skip')
@click.option('--gather', type=click.Choice(['skip', 'here']), default='here')
@click.argument('scan_args', nargs=-1, type=click.UNPROCESSED)
def update(
scan: str,
gather: str,
scan_args: typing.List[str]
) -> None:

LOGGER.info('Starting update')
data_update.update(scan, gather, transform_args(scan_args))
LOGGER.info('Finished update')


@main.command(help='Download scan results from S3')
def download() -> None:
LOGGER.info('Downloading production data')
data_update.download_s3()
LOGGER.info('Finished downloading production data')


@main.command(help='Upload scan results to S3')
@click.option('--date', type=DATE, callback=get_date)
def upload(date: str) -> None:
# Sanity check to make sure we have what we need.
if not os.path.exists(os.path.join(PARENTS_RESULTS, "meta.json")):
LOGGER.info("No scan metadata downloaded, aborting.")
return

LOGGER.info(f'[{date}] Syncing scan data and database to S3.')
data_update.upload_s3(date)
LOGGER.info(f"[{date}] Scan data and database now in S3.")


@main.command(help='Process scan data')
@click.option('--date', type=DATE, callback=get_date)
def process(date: str) -> None:

# Sanity check to make sure we have what we need.
if not os.path.exists(os.path.join(PARENTS_RESULTS, "meta.json")):
LOGGER.info("No scan metadata downloaded, aborting.")
return

LOGGER.info(f"[{date}] Loading data into Pulse.")
processing.run(date)
LOGGER.info(f"[{date}] Data now loaded into Pulse.")
10 changes: 1 addition & 9 deletions data/processing.py
@@ -97,9 +97,7 @@
#
# This method blows away the database and rebuilds it from the given data.

# options (for debugging)

def run(date, options):
def run(date):
if date is None:
date = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d")

@@ -1172,9 +1170,3 @@ def branch_for(domain_type):
branch = branch.lower().strip()

return branch


### Run when executed.

if __name__ == '__main__':
run(None, options())
58 changes: 5 additions & 53 deletions data/update.py
@@ -41,33 +41,18 @@
# - TODO: Consider moving from aws CLI to Python library.



# Options:
# --date: override date, defaults to contents of meta.json
# --scan=[skip,download,here]
# scan_mode=[skip,download,here]
# skip: skip all scanning, assume CSVs are locally cached
# download: download scan data from S3
# here: run the default full scan
# --upload: upload scan data and resulting db.json to S3
# --gather=[skip,here]
# gather_mode=[skip,here]
# skip: skip gathering, assume CSVs are locally cached
# here: run the default full gather
# options
# options to pass along to scan and gather operations
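# For example (an illustrative call, not from the original module):
#   update("here", "here", {"lambda": True, "lambda-profile": "lambda"})
# would gather and scan locally, passing --lambda and --lambda-profile
# through to domain-scan.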

def run(options):
# If this is just being used to download production data, do that.
if options.get("just-download", False):
download_s3()
return

# Definitive scan date for the run.
today = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d")

# 1. Download scan data, do a new scan, or skip altogether.
scan_mode = options.get("scan", "skip")

# Whether to gather domains (defaults to doing so).
gather_mode = options.get("gather", "here")

def update(scan_mode, gather_mode, options):
if scan_mode == "here":
# 1a. Gather .gov federal subdomains.
if gather_mode == "here":
@@ -91,33 +76,6 @@ def run(options):
download_s3()
LOGGER.info("Download complete.")

# Sanity check to make sure we have what we need.
if not os.path.exists(os.path.join(PARENTS_RESULTS, "meta.json")):
LOGGER.info("No scan metadata downloaded, aborting.")
exit()

# Date can be overridden if need be, but defaults to meta.json.
if options.get("date", None) is not None:
the_date = options.get("date")
else:
# depends on YYYY-MM-DD coming first in meta.json time format
scan_meta = ujson.load(open("data/output/parents/results/meta.json"))
the_date = scan_meta['start_time'][0:10]


# 2. Process and load data into Pulse's database.
LOGGER.info("[%s] Loading data into Pulse." % the_date)
data.processing.run(the_date, options)
LOGGER.info("[%s] Data now loaded into Pulse." % the_date)

# 3. Upload data to S3 (if requested).
if options.get("upload", False):
LOGGER.info("[%s] Syncing scan data and database to S3." % the_date)
upload_s3(the_date)
LOGGER.info("[%s] Scan data and database now in S3." % the_date)

LOGGER.info("[%s] All done." % the_date)


# Upload the scan + processed data to /live/ and /archive/ locations by date.
def upload_s3(date):
@@ -306,9 +264,3 @@ def shell_out(command, env=None):
LOGGER.critical("Error running %s." % (str(command)))
exit(1)
return None


### Run when executed.

if __name__ == '__main__':
run(options())
6 changes: 6 additions & 0 deletions setup.py
@@ -27,6 +27,7 @@
'ujson==1.35',
'waitress==1.0.1',
'flask-compress==1.4.0',
'click==6.7',
],
extras_require={
'development': [
Expand All @@ -36,4 +37,9 @@
'pytest-cov==2.5.1',
],
},
entry_points='''
[console_scripts]
pulse=data.cli:main
'''

)
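
With the `console_scripts` entry point above, installing the package puts a `pulse` executable on the PATH that dispatches to the Click group in `data/cli.py`. A minimal sketch (the editable install is just one convenient option):

```bash
pip install -e .
pulse --help      # lists the run, update, download, upload, and process commands
pulse download    # fetch the latest production snapshot from S3
```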