# GitHub Actions workflow: "[manual] - Data Scrape by Population" (#133)
name: "[manual] - Data Scrape by Population"

# Manually-triggered workflow: scrapes data for the top-N jurisdictions
# (ranked by population) in a chosen state.
on:
  workflow_dispatch:
    inputs:
      state:
        description: "State initials (e.g. co, wa)"
        required: true
        type: choice
        options:
          - co
          - nj
          - wa
          - tx
      num_jurisdictions:
        description: "Number of jurisdictions to scrape by population (e.g. 10 for top 10)"
        required: false
        default: 10
        type: number
jobs:
  # Queries the CivicPatch API for the top-N jurisdictions in the selected
  # state and publishes them as a JSON matrix for the scrape job below.
  fetch-jurisdictions:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v5

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"

      # Defines the `date` output consumed by the cache key below.
      # (Previously this step was missing, so the key collapsed to "docker-".)
      - name: Get date
        id: get-date
        run: echo "date=$(date -u +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"

      - name: Cache Docker images
        uses: ScribeMD/docker-cache@0.5.0
        with:
          key: docker-${{ steps.get-date.outputs.date }}

      - name: Pull docker image
        run: |
          docker compose pull
          docker images

      - name: Fetch jurisdictions
        id: find-jurisdictions
        env:
          API_CIVICPATCH_ORG_TOKEN: ${{ secrets.API_CIVICPATCH_ORG_TOKEN }}
          API_CIVICPATCH_ORG_URL: https://api.civicpatch.org
          # Pass workflow inputs through env rather than interpolating
          # ${{ }} expressions directly into the script (script-injection
          # hardening per GitHub's security guidance).
          NUM_JURISDICTIONS: ${{ github.event.inputs.num_jurisdictions }}
          STATE: ${{ github.event.inputs.state }}
        run: |
          # Temporary file for the raw API response
          TMP_FILE="$RUNNER_TEMP/jurisdictions.json"
          curl -s -X GET "${API_CIVICPATCH_ORG_URL}/api/v1/jurisdictions/available?num_jurisdictions=${NUM_JURISDICTIONS}&state=${STATE}" \
            -H "Authorization: ${API_CIVICPATCH_ORG_TOKEN}" > "$TMP_FILE"
          # One {id, name, url} JSON object per line, for the matrix
          jq -c '.jurisdictions[] | {id, name, url}' "$TMP_FILE" > "$RUNNER_TEMP/matrix.json"

      - name: Set matrix
        id: set-matrix
        run: |
          # Slurp (-s) the per-line objects into a single JSON array for fromJson()
          echo "matrix=$(jq -c -s '.' "$RUNNER_TEMP/matrix.json")" >> "$GITHUB_OUTPUT"
run-pipelines:
needs: fetch-jurisdictions
runs-on: ubuntu-latest
strategy:
matrix:
jurisdiction: ${{ fromJson(needs.fetch-jurisdictions.outputs.matrix) }}
fail-fast: false
steps:
- name: Sleep based on hashed jurisdiction ID
run: |
SLEEP_SECONDS=$(( ( $(echo -n "${{ matrix.jurisdiction.id }}" | cksum | awk '{print $1}') % 60 ) ))
echo "Sleeping for $SLEEP_SECONDS seconds to avoid rate limiting from Google Search APIs..."
sleep $SLEEP_SECONDS
- name: Checkout repository
uses: actions/checkout@v5
- name: Pull docker image
run: |
docker compose pull
- name: Run data scrape
env:
BRAVE_SEARCH_TOKEN: ${{ secrets.BRAVE_SEARCH_TOKEN }}
GOOGLE_SEARCH_TOKEN: ${{ secrets.GOOGLE_SEARCH_TOKEN }}
GOOGLE_SEARCH_ENGINE_ID: ${{ secrets.GOOGLE_SEARCH_ENGINE_ID }}
SERP_API_TOKEN: ${{ secrets.SERP_API_TOKEN }}
GOOGLE_GEMINI_TOKEN: ${{ secrets.GOOGLE_GEMINI_TOKEN }}
OPENAI_TOKEN: ${{ secrets.OPENAI_TOKEN }}
TOGETHER_AI_TOKEN: ${{ secrets.TOGETHER_AI_TOKEN }}
API_CIVICPATCH_ORG_TOKEN: ${{ secrets.API_CIVICPATCH_ORG_TOKEN }}
API_CIVICPATCH_ORG_URL: https://api.civicpatch.org
run: |
echo "Running pipeline for jurisdiction: ${{ matrix.jurisdiction.name }}"
docker compose run --rm civicpatch-cicd poetry run \
python src/interfaces/cli/main.py \
run_pipeline \
--jurisdiction-ocdid "${{ matrix.jurisdiction.id }}" \
--name "${{ matrix.jurisdiction.name }}" \
--url "${{ matrix.jurisdiction.url }}"