Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 4 additions & 12 deletions .github/workflows/update-charts.yml
Original file line number Diff line number Diff line change
@@ -1,15 +1,7 @@
name: Update Charts
name: Update Charts Only

on:
# Primary trigger: explicitly dispatched by update-data.yml after its PR merges.
# (GitHub does not fire push events from GITHUB_TOKEN bot merges, so on:push
# alone is not reliable for bot-initiated data updates.)
# Fallback: push to main touching docs/data/** covers human-initiated merges.
push:
branches: [main]
paths:
- 'docs/data/**'
workflow_dispatch: # Primary trigger from update-data.yml; also allows manual runs
workflow_dispatch: # Manual trigger only

concurrency:
group: update-charts
Expand Down Expand Up @@ -71,10 +63,10 @@ jobs:
git push origin "$BRANCH"
gh pr create \
--title "Auto-update dashboard charts $(date -u +%Y-%m-%d)" \
--body "Automated chart regeneration triggered by data update." \
--body "Manual chart regeneration against current GCS database." \
--base main \
--head "$BRANCH"
gh pr merge "$BRANCH" --auto --squash
gh pr merge "$BRANCH" --squash

- name: Open issue on failure
if: failure()
Expand Down
31 changes: 23 additions & 8 deletions .github/workflows/update-data.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
name: Update Data
name: Update Data Only

on:
schedule:
- cron: '0 6 * * 1' # Every Monday at 6am UTC
workflow_dispatch: # Allow manual trigger from GitHub UI
workflow_dispatch: # Manual trigger only

env:
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: 'true'
Expand Down Expand Up @@ -43,7 +41,8 @@ jobs:
- name: Write Google API key
run: echo "${{ secrets.GOOGLE_API_KEY }}" > get_data/SECRET_GOOGLE_API_KEY

- name: Fetch data
- name: Fetch — EPA/budget/staff/EEA/CSO/precipitation/303d/MS4
timeout-minutes: 45
working-directory: get_data
run: |
python get_EPARegion1_NPDES_permits.py
Expand All @@ -55,6 +54,23 @@ jobs:
python get_ATTAINS_303d.py
python get_MS4_annual_reports.py --yes --skip-download

- name: Fetch — MA lobbying disclosures (incremental)
timeout-minutes: 45
working-directory: get_data
run: python get_MA_lobbying.py

- name: Fetch — MA legislature bills (incremental)
timeout-minutes: 20
working-directory: get_data
run: python get_MA_legislature_bills.py

- name: Score and cluster lobbying bills
timeout-minutes: 30
working-directory: get_data
run: |
python score_lobbying_bills.py
python cluster_lobbying_bills.py --incremental

- name: Validate data
working-directory: get_data
run: python validate_data.py
Expand All @@ -80,11 +96,10 @@ jobs:
git push origin "$BRANCH"
gh pr create \
--title "Auto-update data $(date -u +%Y-%m-%d)" \
--body "Automated weekly data refresh. Merging this will trigger the chart-update workflow." \
--body "Manual data-only refresh. Run 'Update Charts Only' separately to regenerate dashboard charts." \
--base main \
--head "$BRANCH"
gh pr merge "$BRANCH" --squash
gh workflow run update-charts.yml --ref main

- name: Open issue on failure
if: failure()
Expand All @@ -96,7 +111,7 @@ jobs:
repo: context.repo.repo,
title: `Data update failed: ${new Date().toISOString().split('T')[0]}`,
body: [
'## Scheduled data update failed',
'## Manual data update failed',
'',
`**Run:** ${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
'',
Expand Down
146 changes: 146 additions & 0 deletions .github/workflows/update-weekly.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# Weekly end-to-end refresh in a single job: fetch the source datasets,
# validate them, assemble the database, regenerate the dashboard charts, then
# commit the results on a dated branch, open a PR against main and squash-merge
# it. If any step fails, an issue labelled `data-update-failure` is opened.
name: Weekly Update (Data + Charts)

on:
  schedule:
    - cron: '0 6 * * 1'  # Every Monday at 6am UTC
  workflow_dispatch:  # Allow manual trigger from GitHub UI

# Runs of this workflow share one group; a run already in progress is not
# cancelled when another one is queued.
concurrency:
  group: update-weekly
  cancel-in-progress: false

env:
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: 'true'

jobs:
  update:
    runs-on: ubuntu-latest
    permissions:
      contents: write  # to push branch
      issues: write  # to open failure issues
      pull-requests: write  # to create PR

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: pip

      - name: Install dependencies
        run: pip install -r requirements-ci.txt

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GCP_SA_KEY }}

      - name: Set up Cloud SDK (provides gsutil)
        uses: google-github-actions/setup-gcloud@v2

      # Secrets are handed to the shell as environment variables rather than
      # being interpolated into the script text: `${{ ... }}` is substituted
      # before the shell parses the script, so a secret containing a quote,
      # `$`, backtick or backslash would otherwise be re-interpreted by the
      # shell and written out mangled.
      - name: Write SODA credentials
        env:
          SODA_APP_TOKEN: ${{ secrets.SODA_APP_TOKEN }}
          SODA_SECRET_TOKEN: ${{ secrets.SODA_SECRET_TOKEN }}
        run: |
          printf '%s\n%s\n' "$SODA_APP_TOKEN" "$SODA_SECRET_TOKEN" \
            > get_data/SECRET_SODA_token

      # Same env-var indirection as above; the file still ends with a single
      # trailing newline, as the previous `echo` produced.
      - name: Write Google API key
        env:
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
        run: printf '%s\n' "$GOOGLE_API_KEY" > get_data/SECRET_GOOGLE_API_KEY

      - name: Fetch — EPA/budget/staff/EEA/CSO/precipitation/303d/MS4
        timeout-minutes: 45
        working-directory: get_data
        run: |
          python get_EPARegion1_NPDES_permits.py
          python get_budget_CTHRU.py
          python get_DEP_staff_SODA.py
          python get_EEA_data_portal.py
          python get_eea_dp_cso.py
          python get_MA_precipitation.py
          python get_ATTAINS_303d.py
          python get_MS4_annual_reports.py --yes --skip-download

      - name: Fetch — MA lobbying disclosures (incremental)
        timeout-minutes: 45
        working-directory: get_data
        run: python get_MA_lobbying.py

      - name: Fetch — MA legislature bills (incremental)
        timeout-minutes: 20
        working-directory: get_data
        run: python get_MA_legislature_bills.py

      - name: Score and cluster lobbying bills
        timeout-minutes: 30
        working-directory: get_data
        run: |
          python score_lobbying_bills.py
          python cluster_lobbying_bills.py --incremental

      - name: Validate data
        working-directory: get_data
        run: python validate_data.py

      - name: Assemble database
        working-directory: get_data
        run: python assemble_db.py

      - name: Generate dashboard charts
        working-directory: analysis
        run: python dashboard_charts.py

      - name: Write timestamps
        run: |
          echo "updated: $(date -u +'%Y-%m-%d %H:%M:%S')" > docs/data/ts_update_dashboard.yml

      # Stages the refreshed data and chart files; when nothing is staged the
      # step exits successfully without creating a branch or PR.
      # NOTE(review): the branch name is date-only, so a second run on the same
      # UTC day reuses it — confirm the earlier branch is gone before that push.
      - name: Commit and open PR
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add docs/data/ \
            docs/_includes/charts/dash_*.html \
            docs/data/facts_dash_*.yml \
            docs/data/facts_DEPstaff.yml \
            docs/data/facts_DEPenforce.yml \
            docs/data/facts_ECOSbudgets.yml \
            docs/data/facts_EPA303d.yml \
            docs/data/ts_update_dashboard.yml
          if git diff --staged --quiet; then
            echo "No changes — skipping PR."
            exit 0
          fi
          BRANCH="auto/weekly-$(date -u +%Y-%m-%d)"
          git checkout -b "$BRANCH"
          git commit -m "Auto-update data + charts $(date -u +%Y-%m-%d)"
          git push origin "$BRANCH"
          gh pr create \
            --title "Auto-update data + charts $(date -u +%Y-%m-%d)" \
            --body "Automated weekly data and chart refresh." \
            --base main \
            --head "$BRANCH"
          gh pr merge "$BRANCH" --squash

      # Runs only when an earlier step in this job failed.
      - name: Open issue on failure
        if: failure()
        uses: actions/github-script@v7
        with:
          # The API call is awaited so that a rejected request fails this step
          # instead of being left as an unhandled promise.
          script: |
            await github.rest.issues.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              title: `Weekly update failed: ${new Date().toISOString().split('T')[0]}`,
              body: [
                '## Scheduled weekly update failed',
                '',
                `**Run:** ${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
                '',
                'Check the run logs above for details. Common causes:',
                '- EPA or EEA website structure changed (scraper broke)',
                '- API returned unexpected data (caught by validate_data.py)',
                '- Row count decreased vs. previous run',
                '- GCS credentials expired',
              ].join('\n'),
              labels: ['data-update-failure'],
            })
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,16 @@ get_data/MS4_annual_reports/

# Large files
*EEADP_drinkingWater.csv
docs/data/MA_bill_embeddings.parquet
docs/data/MA_bill_embeddings.npy
get_data/MA_legislature_cache/

# MA lobbying large CSVs — stored in GCS, only samples committed
docs/data/MA_lobbying_bills.csv
docs/data/MA_lobbying_employers.csv
docs/data/MA_lobbying_summary_links.csv
docs/data/MA_lobbying_bills_scored.csv
docs/data/MA_legislature_bills.csv

get_data/backup_AMEND.db
get_data/EEADP_drinkingWater.csv
Expand All @@ -117,3 +127,5 @@ get_data/backup_AMEND.db
get_data/AMEND.db
docs/assets/debug_screenshot.png

docs/data/MA_legislature_bills.csv.bak_wrong_gc
docs/data/MA_lobbying_static_site_proposal.md
8 changes: 8 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,14 @@ bash set_cors_gsutil.sh
- **EPA NPDES page changes**: EPA changed JSON format and column names around 2025; both handled with `isinstance` checks and fallback column detection.
- **EEA CSOAPI**: Requires `Referer` and `Origin` headers matching the portal URL; plain requests return HTTP 500. Pagination is 1-indexed.
- **303(d) data (biennial)**: `get_ATTAINS_303d.py` fetches from MassGIS S3-hosted shapefiles (not the ATTAINS REST API, which times out on `/assessments`). Data updates only biennially (even years); the script exits early if all known cycles are already in the cached CSV. The 2020 cycle was never published by MassGIS. The 2024/2026 cycle is in draft as of April 2026 — the script will auto-detect it when MassGIS publishes the approved shapefile. `CSO_303d_Mapping` in `assemble_db.py` is a manually curated dict (35 verified matches of 56 CSO waterbodies); update it when a new cycle is added by reviewing new assessment unit names against CSO waterBody values.
- **MA lobbying portal (Incapsula WAF)**: The SoS portal (`sec.state.ma.us/LobbyistPublicSearch/`) is protected by Incapsula WAF. A Chrome User-Agent gets a 302 redirect to a JS challenge page. An **iPad User-Agent** bypasses it entirely with plain `requests` — no Selenium needed. The working UA is `Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148`.
- **MA lobbying portal (search form)**: ASP.NET with ViewState. POST to `Default.aspx` with `drpType=L` (Lobbyist or Lobbying Entity — do NOT use `Z`, which returns Client pages with a different structure), `drpPageSize=20000`, `ddlYear=<year>`. POST timeout must be 120s (response is ~2MB for ~1700 results). Data goes back to 2005 (22 years).
- **MA lobbying incremental fetch**: Disclosures are filed semi-annually (H1 due ~Jul 15 of the year; H2 due ~Jan 15 of the following year), and amendments cluster within ~60 days of those deadlines. The incremental state lives in `MA_lobbying_summary_links.csv` (gitignored, synced to/from GCS by the script itself): each visited summary page is stamped with `last_checked`, pages with no disclosures get a marker row with null `disc_url`. Pages are re-checked only inside a filing window (`deadline − 14d` to `deadline + 60d`) plus one closing sweep after it; a year is skipped wholesale before Jul 1 (H1 period not yet closed). Steady-state weekly runs take ~1–2 min; runs during the Jul and Jan filing windows scan all ~1,700 pages (~40 min). State uploads to GCS happen every 200 pages (data files first, links index last) so a timed-out CI run still makes durable progress.
- **MA lobbying full historical fetch**: `REQUEST_DELAY` is 0.3s; actual page time is ~1.3s (server latency dominates). A full single-year scan (~1,700 registrants) takes ~40 min when most pages are skipped after being visited, and several hours when every disclosure must also be fetched. The full 22-year history was completed in May 2026; it only ever needs re-running if the GCS state files are lost.
- **Running lobbying scripts**: Do NOT use `conda run` with stdout redirect (`> file`) — `conda run` buffers all output through a pipe and the log file stays empty until the process exits. Run Python directly: `/home/nes/miniconda/envs/amend_python/bin/python -u get_MA_lobbying.py` (the `-u` flag ensures unbuffered output).
- **MA lobbying Gemini SDK**: Uses `google-genai` (new SDK, `google.genai`), NOT the old `google-generativeai` package. API: `client = genai.Client(api_key=...)`, then `client.models.embed_content(model='gemini-embedding-2', contents=text, config=types.EmbedContentConfig(output_dimensionality=768))`.
- **MA lobbying Gemini API cost — do not underestimate**: Actual observed cost for `summarize_lobbying_bills.py` is **$0.627/1k bills** (verified June 2026). Output tokens ($2.50/1M) dominate at ~60% of total cost even though output is only ~151 tokens/bill. Prior estimates were off by 6× because they applied the input price ($0.30/1M) to output tokens. Rule of thumb: a one-time backfill of 7k bills ≈ $4.50; 26k bills ≈ $16. Weekly incremental runs (20–50 new bills) ≈ $0.02. Embedding via `gemini-embedding-2` is negligible ($0.20/1M tokens ≈ $0.00015/bill). Always verify estimate against GCP billing console before running a large batch — the API pricing page may show non-GA or pre-discount rates.
- **MA lobbying General Court formula bug (unfixed)**: `get_MA_lobbying.py` uses `FIRST_GC_START_YEAR = 2005` but the 183rd General Court started in 2003, not 2005. The correct constant is `2003`. As a result, every bill's `general_court` assignment is one session too low (year 2024 → GC192 instead of GC193, etc.), and `get_MA_legislature_bills.py` fetches bill text from the wrong legislative session. Fixing this constant and re-running the full pipeline would bring the title match rate from ~2% to ~65%. The remaining ~35% of mismatches are string-normalisation differences or genuinely wrong bill numbers in the SoS portal. See `get_data/NOTES_bill_embeddings.md` for full analysis.

## Running scripts

Expand Down
9 changes: 9 additions & 0 deletions analysis/dashboard_charts.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
import ECOS_budgets_viz
import EPA_303d_viz
import MS4_compliance_viz
try:
import MA_lobbying_viz
_LOBBYING_VIZ_AVAILABLE = True
except ImportError:
_LOBBYING_VIZ_AVAILABLE = False
from EEA_DP_CSO_map import CSOAnalysisEEADP

PREFIX = 'dash_'
Expand Down Expand Up @@ -56,3 +61,7 @@

# --- MS4 stormwater compliance charts (3 charts) ---
MS4_compliance_viz.generate_charts(engine, prefix=PREFIX)

# --- Lobbying charts (4 charts; skipped until MA_lobbying_viz.py is available) ---
if _LOBBYING_VIZ_AVAILABLE:
MA_lobbying_viz.generate_charts(engine, prefix=PREFIX)
23 changes: 14 additions & 9 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,20 @@ All data, code for data gathering and cleaning, code for analysis, and web devel
## Datasets

{{ site.data.site_config.site_abbrev }} includes a [constantly-growing list of datasets]({{ site.url }}{{ site.baseurl }}/data/index.html) such as:

* MA state budget information
* MA Department of Environmental Protection (DEP) staffing records
* MA DEP regulatory enforcement actions
* MA 2011 combined sewer overflow (CSO) discharge data
* US Environmental Protection Agency (EPA) Environmental Justice (EJ) community data
* US Environmental Protection Agency (EPA) Region 1 NPDES permits
* US Census and Social Security Administration population and wage data
* EPA 303(d) Integrated List of MA Impaired Waters — waterbodies failing water quality standards (MassGIS, biennial 2010–2022)

* MA environmental agency budgets (DEP, DCR, EEA; FY2001–present)
* MA DEP staffing records
* MA DEP/EEA enforcement actions (1996–present)
* MA combined sewer overflow (CSO) discharge incidents (2022–present)
* MA municipal stormwater (MS4) annual compliance reports (FY2019–present)
* MA lobbying disclosures — bills, employers, spending (2005–present)
* MA Legislature bill text and passage status (GC183–194)
* EPA 303(d) impaired waters (MassGIS, biennial 2010–2022)
* EPA Region 1 NPDES permits
* EPA EJScreen environmental justice indicators (2017, 2023)
* US Census population and income estimates
* NOAA daily precipitation (Massachusetts)
* ECOS state environmental agency budget survey (FY2009–2023)

## Analysis

Expand Down
2 changes: 1 addition & 1 deletion docs/_includes/charts/dash_MADEP_enforcement_bytype.html
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

data: {
labels: ["2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024", "2025", "2026"],
datasets: [{'data': [87, 63, 40, 46, 90, 110, 70, 67, 61, 44, 72, 49, 81, 60, 40, 48, 22, 21, 18, 9, 11, 5, 8, 8, 9, 3], 'label': 'ACO w/o Penalty', 'backgroundColor': 'rgba(31,120,180,0.8)', 'stack': 'annual', 'yAxisID': 'y'},{'data': [129, 144, 271, 215, 291, 240, 201, 214, 203, 213, 175, 160, 152, 111, 97, 121, 67, 81, 88, 80, 70, 53, 66, 58, 73, 28], 'label': 'ACO w/ Penalty', 'backgroundColor': 'rgba(166,206,227,0.8)', 'stack': 'annual', 'yAxisID': 'y'},{'data': [7, 3, 1, 4, 4, 14, 5, 7, 2, 3, 7, 10, 6, 9, 10, 6, 8, 10, 8, 2, 6, 6, 3, 2, 3, 2], 'label': 'Demand Action', 'backgroundColor': 'rgba(51,160,44,0.8)', 'stack': 'annual', 'yAxisID': 'y'},{'data': [1, 0, 0, 16, 4, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label': 'Federal ACO (PWS)', 'backgroundColor': 'rgba(178,223,138,0.8)', 'stack': 'annual', 'yAxisID': 'y'},{'data': [26, 35, 35, 30, 59, 96, 86, 68, 44, 12, 13, 13, 8, 87, 6, 8, 11, 12, 12, 13, 9, 3, 5, 9, 17, 9], 'label': 'Penalty Notice', 'backgroundColor': 'rgba(227,26,28,0.8)', 'stack': 'annual', 'yAxisID': 'y'},{'data': [0, 0, 60, 67, 101, 91, 65, 128, 138, 58, 116, 77, 126, 84, 51, 57, 206, 164, 217, 106, 246, 226, 193, 240, 211, 85], 'label': 'Reporting Penalty', 'backgroundColor': 'rgba(253,191,111,0.8)', 'stack': 'annual', 'yAxisID': 'y'},{'data': [20, 38, 47, 44, 56, 38, 67, 62, 51, 31, 58, 55, 58, 50, 45, 50, 20, 31, 26, 23, 27, 26, 39, 22, 27, 4], 'label': 'Unilateral Order', 'backgroundColor': 'rgba(202,178,214,0.8)', 'stack': 'annual', 'yAxisID': 'y'}]
datasets: [{'data': [87, 63, 40, 46, 90, 110, 70, 67, 61, 44, 72, 49, 81, 60, 40, 48, 22, 21, 18, 9, 11, 5, 8, 8, 9, 4], 'label': 'ACO w/o Penalty', 'backgroundColor': 'rgba(31,120,180,0.8)', 'stack': 'annual', 'yAxisID': 'y'},{'data': [129, 144, 271, 215, 291, 240, 201, 214, 203, 213, 175, 160, 152, 111, 97, 121, 67, 81, 88, 80, 70, 53, 66, 58, 73, 34], 'label': 'ACO w/ Penalty', 'backgroundColor': 'rgba(166,206,227,0.8)', 'stack': 'annual', 'yAxisID': 'y'},{'data': [7, 3, 1, 4, 4, 14, 5, 7, 2, 3, 7, 10, 6, 9, 10, 6, 8, 10, 8, 2, 6, 6, 3, 2, 3, 2], 'label': 'Demand Action', 'backgroundColor': 'rgba(51,160,44,0.8)', 'stack': 'annual', 'yAxisID': 'y'},{'data': [1, 0, 0, 16, 4, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label': 'Federal ACO (PWS)', 'backgroundColor': 'rgba(178,223,138,0.8)', 'stack': 'annual', 'yAxisID': 'y'},{'data': [26, 35, 35, 30, 59, 96, 86, 68, 44, 12, 13, 13, 8, 87, 6, 8, 11, 12, 12, 13, 9, 3, 5, 9, 17, 11], 'label': 'Penalty Notice', 'backgroundColor': 'rgba(227,26,28,0.8)', 'stack': 'annual', 'yAxisID': 'y'},{'data': [0, 0, 60, 67, 101, 91, 65, 128, 138, 58, 116, 77, 126, 84, 51, 57, 206, 164, 217, 106, 246, 226, 193, 240, 211, 126], 'label': 'Reporting Penalty', 'backgroundColor': 'rgba(253,191,111,0.8)', 'stack': 'annual', 'yAxisID': 'y'},{'data': [20, 38, 47, 44, 56, 38, 67, 62, 51, 31, 58, 55, 58, 50, 45, 50, 20, 31, 26, 23, 27, 26, 39, 22, 27, 5], 'label': 'Unilateral Order', 'backgroundColor': 'rgba(202,178,214,0.8)', 'stack': 'annual', 'yAxisID': 'y'}]
},
type: 'bar',
options: {
Expand Down
Loading