# [manual] - Data Scrape by Population #133
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
---
# Manually dispatched workflow.
#
# 1. `fetch-jurisdictions` asks the CivicPatch API for the available
#    jurisdictions of one state (limited to `num_jurisdictions`) and publishes
#    them as a JSON array in the `matrix` job output.
# 2. `run-pipelines` fans out over that array and runs the scrape pipeline
#    once per jurisdiction inside the `civicpatch-cicd` compose service.
name: "[manual] - Data Scrape by Population"

on:
  workflow_dispatch:
    inputs:
      state:
        description: "State initials (e.g. co, wa)"
        required: true
        type: choice
        options:
          - co
          - nj
          - wa
          - tx
      num_jurisdictions:
        description: "Number of jurisdictions to scrape by population (e.g. 10 for top 10)"
        required: false
        default: 10
        type: number

jobs:
  fetch-jurisdictions:
    runs-on: ubuntu-latest
    outputs:
      # JSON array of {id, name, url} objects consumed by `run-pipelines`.
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    timeout-minutes: 30
    steps:
      - name: Checkout repository
        uses: actions/checkout@v5

      # NOTE(review): the Python / Docker steps below do not appear to be used
      # by the curl + jq steps of this job; kept as-is, confirm whether they
      # are still needed here.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      # The cache key below references `steps.get-date`; without this step the
      # expression expands to an empty string and the key never changes.
      - name: Get date
        id: get-date
        run: echo "date=$(date -u +%F)" >> "$GITHUB_OUTPUT"

      - name: Cache Docker images.
        uses: ScribeMD/docker-cache@0.5.0
        with:
          key: docker-${{ steps.get-date.outputs.date }}

      - name: Pull docker image
        run: |
          docker compose pull
          docker images

      - name: Fetch jurisdictions
        id: find-jurisdictions
        env:
          API_CIVICPATCH_ORG_TOKEN: ${{ secrets.API_CIVICPATCH_ORG_TOKEN }}
          API_CIVICPATCH_ORG_URL: https://api.civicpatch.org
          # Inputs are passed through the environment instead of being
          # expanded into the script text, so they can never be parsed as
          # shell syntax.
          NUM_JURISDICTIONS: ${{ github.event.inputs.num_jurisdictions }}
          STATE: ${{ github.event.inputs.state }}
        run: |
          set -euo pipefail
          # Temporary file holding the raw API response
          TMP_FILE="$RUNNER_TEMP/jurisdictions.json"
          # Query the API; --fail makes an HTTP error status fail the step
          # instead of saving the error body for jq to choke on.
          curl --silent --show-error --fail --get \
            "${API_CIVICPATCH_ORG_URL}/api/v1/jurisdictions/available" \
            --data-urlencode "num_jurisdictions=${NUM_JURISDICTIONS}" \
            --data-urlencode "state=${STATE}" \
            -H "Authorization: ${API_CIVICPATCH_ORG_TOKEN}" > "$TMP_FILE"
          # One compact {id, name, url} object per line; slurped into an
          # array by the next step.
          jq -c '.jurisdictions[] | {id, name, url}' "$TMP_FILE" > "$RUNNER_TEMP/matrix.json"

      - name: Set matrix
        id: set-matrix
        run: |
          set -euo pipefail
          # Slurp the per-line objects into a single JSON array. Assigning
          # first (rather than inlining in echo) lets a jq failure abort the
          # step instead of silently emitting an empty output.
          matrix="$(jq -c -s '.' "$RUNNER_TEMP/matrix.json")"
          echo "matrix=${matrix}" >> "$GITHUB_OUTPUT"

  run-pipelines:
    needs: fetch-jurisdictions
    # An empty matrix vector is a workflow error; skip the job instead when
    # the API returned no jurisdictions.
    if: ${{ needs.fetch-jurisdictions.outputs.matrix != '[]' }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        jurisdiction: ${{ fromJson(needs.fetch-jurisdictions.outputs.matrix) }}
      fail-fast: false
    steps:
      - name: Sleep based on hashed jurisdiction ID
        env:
          JURISDICTION_ID: ${{ matrix.jurisdiction.id }}
        run: |
          set -euo pipefail
          # Deterministic 0-59 second delay derived from the ID's checksum,
          # so matrix jobs do not all start at the same moment.
          SLEEP_SECONDS=$(( $(printf '%s' "$JURISDICTION_ID" | cksum | awk '{print $1}') % 60 ))
          echo "Sleeping for $SLEEP_SECONDS seconds to avoid rate limiting from Google Search APIs..."
          sleep "$SLEEP_SECONDS"

      - name: Checkout repository
        uses: actions/checkout@v5

      - name: Pull docker image
        run: |
          docker compose pull

      - name: Run data scrape
        env:
          BRAVE_SEARCH_TOKEN: ${{ secrets.BRAVE_SEARCH_TOKEN }}
          GOOGLE_SEARCH_TOKEN: ${{ secrets.GOOGLE_SEARCH_TOKEN }}
          GOOGLE_SEARCH_ENGINE_ID: ${{ secrets.GOOGLE_SEARCH_ENGINE_ID }}
          SERP_API_TOKEN: ${{ secrets.SERP_API_TOKEN }}
          GOOGLE_GEMINI_TOKEN: ${{ secrets.GOOGLE_GEMINI_TOKEN }}
          OPENAI_TOKEN: ${{ secrets.OPENAI_TOKEN }}
          TOGETHER_AI_TOKEN: ${{ secrets.TOGETHER_AI_TOKEN }}
          API_CIVICPATCH_ORG_TOKEN: ${{ secrets.API_CIVICPATCH_ORG_TOKEN }}
          API_CIVICPATCH_ORG_URL: https://api.civicpatch.org
          # Matrix values originate from an API response; hand them to the
          # shell as data (env vars) rather than expanding them into the
          # script, so quotes or `$` in a name/URL cannot alter the command.
          JURISDICTION_ID: ${{ matrix.jurisdiction.id }}
          JURISDICTION_NAME: ${{ matrix.jurisdiction.name }}
          JURISDICTION_URL: ${{ matrix.jurisdiction.url }}
        run: |
          echo "Running pipeline for jurisdiction: ${JURISDICTION_NAME}"
          docker compose run --rm civicpatch-cicd poetry run \
            python src/interfaces/cli/main.py \
            run_pipeline \
            --jurisdiction-ocdid "${JURISDICTION_ID}" \
            --name "${JURISDICTION_NAME}" \
            --url "${JURISDICTION_URL}"