# GitHub Actions workflow: "[manual] - Data Scrape by Population" (#133)
name: "[manual] - Data Scrape by Population"

# Manually-triggered workflow: scrapes data for the top-N jurisdictions
# (ranked by population) in a chosen state.
on:
  workflow_dispatch:
    inputs:
      state:
        description: "State initials (e.g. co, wa)"
        required: true
        type: choice
        options:
          - co
          - nj
          - wa
          - tx
      num_jurisdictions:
        description: "Number of jurisdictions to scrape by population (e.g. 10 for top 10)"
        required: false
        default: 10
        type: number
jobs:
  # Queries the CivicPatch API for the top-N jurisdictions in the selected
  # state and publishes them as a JSON matrix for the scrape job below.
  fetch-jurisdictions:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v5

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"

      # Defines the `date` output consumed by the cache key below.
      # (Previously this step was missing, so the key collapsed to "docker-".)
      - name: Get date
        id: get-date
        run: echo "date=$(date -u +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"

      - name: Cache Docker images
        uses: ScribeMD/docker-cache@0.5.0
        with:
          key: docker-${{ steps.get-date.outputs.date }}

      - name: Pull docker image
        run: |
          docker compose pull
          docker images

      - name: Fetch jurisdictions
        id: find-jurisdictions
        env:
          API_CIVICPATCH_ORG_TOKEN: ${{ secrets.API_CIVICPATCH_ORG_TOKEN }}
          API_CIVICPATCH_ORG_URL: https://api.civicpatch.org
          # Pass workflow inputs through env rather than interpolating
          # ${{ }} expressions directly into the script (script-injection
          # hardening per GitHub's security guidance).
          NUM_JURISDICTIONS: ${{ github.event.inputs.num_jurisdictions }}
          STATE: ${{ github.event.inputs.state }}
        run: |
          # Temporary file for the raw API response
          TMP_FILE="$RUNNER_TEMP/jurisdictions.json"
          curl -s -X GET "${API_CIVICPATCH_ORG_URL}/api/v1/jurisdictions/available?num_jurisdictions=${NUM_JURISDICTIONS}&state=${STATE}" \
            -H "Authorization: ${API_CIVICPATCH_ORG_TOKEN}" > "$TMP_FILE"
          # One {id, name, url} JSON object per line, for the matrix
          jq -c '.jurisdictions[] | {id, name, url}' "$TMP_FILE" > "$RUNNER_TEMP/matrix.json"

      - name: Set matrix
        id: set-matrix
        run: |
          # Slurp (-s) the per-line objects into a single JSON array for fromJson()
          echo "matrix=$(jq -c -s '.' "$RUNNER_TEMP/matrix.json")" >> "$GITHUB_OUTPUT"
run-pipelines:
needs: fetch-jurisdictions
runs-on: ubuntu-latest
strategy:
matrix:
jurisdiction: ${{ fromJson(needs.fetch-jurisdictions.outputs.matrix) }}
fail-fast: false
steps:
- name: Sleep based on hashed jurisdiction ID
run: |
SLEEP_SECONDS=$(( ( $(echo -n "${{ matrix.jurisdiction.id }}" | cksum | awk '{print $1}') % 60 ) ))
echo "Sleeping for $SLEEP_SECONDS seconds to avoid rate limiting from Google Search APIs..."
sleep $SLEEP_SECONDS
- name: Checkout repository
uses: actions/checkout@v5
- name: Pull docker image
run: |
docker compose pull
- name: Run data scrape
env:
BRAVE_SEARCH_TOKEN: ${{ secrets.BRAVE_SEARCH_TOKEN }}
GOOGLE_SEARCH_TOKEN: ${{ secrets.GOOGLE_SEARCH_TOKEN }}
GOOGLE_SEARCH_ENGINE_ID: ${{ secrets.GOOGLE_SEARCH_ENGINE_ID }}
SERP_API_TOKEN: ${{ secrets.SERP_API_TOKEN }}
GOOGLE_GEMINI_TOKEN: ${{ secrets.GOOGLE_GEMINI_TOKEN }}
OPENAI_TOKEN: ${{ secrets.OPENAI_TOKEN }}
TOGETHER_AI_TOKEN: ${{ secrets.TOGETHER_AI_TOKEN }}
API_CIVICPATCH_ORG_TOKEN: ${{ secrets.API_CIVICPATCH_ORG_TOKEN }}
API_CIVICPATCH_ORG_URL: https://api.civicpatch.org
run: |
echo "Running pipeline for jurisdiction: ${{ matrix.jurisdiction.name }}"
docker compose run --rm civicpatch-cicd poetry run \
python src/interfaces/cli/main.py \
run_pipeline \
--jurisdiction-ocdid "${{ matrix.jurisdiction.id }}" \
--name "${{ matrix.jurisdiction.name }}" \
--url "${{ matrix.jurisdiction.url }}"