diff --git a/.github/actions/ansible-setup/action.yml b/.github/actions/ansible-setup/action.yml index db9bf8e309..8ebb487f0d 100644 --- a/.github/actions/ansible-setup/action.yml +++ b/.github/actions/ansible-setup/action.yml @@ -23,12 +23,12 @@ runs: using: composite steps: - name: Setup Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ inputs.python-version }} - name: Cache Ansible toolchain - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: | ~/.cache/pip diff --git a/.github/actions/python-setup/action.yml b/.github/actions/python-setup/action.yml index 25c87fa7c7..626200b3f9 100644 --- a/.github/actions/python-setup/action.yml +++ b/.github/actions/python-setup/action.yml @@ -27,7 +27,7 @@ runs: using: composite steps: - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v6 with: python-version: ${{ inputs.python-version }} @@ -42,7 +42,7 @@ runs: run: poetry config virtualenvs.in-project true - name: Cache Poetry dependencies - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: | ~/.cache/pypoetry diff --git a/.github/workflows/ansible-deploy.yml b/.github/workflows/ansible-deploy.yml index ce0f001192..c7e29f0a17 100644 --- a/.github/workflows/ansible-deploy.yml +++ b/.github/workflows/ansible-deploy.yml @@ -30,6 +30,7 @@ on: permissions: contents: read + actions: read concurrency: group: ansible-deploy-${{ github.ref }} @@ -47,7 +48,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Setup Ansible toolchain uses: ./.github/actions/ansible-setup @@ -58,9 +59,180 @@ jobs: ansible-directory: ${{ env.ANSIBLE_DIRECTORY }} vault-password: ${{ secrets.ANSIBLE_VAULT_PASSWORD }} + wait-for-prerequisites: + name: Wait for Prerequisite Workflows + if: github.event_name != 'workflow_dispatch' + runs-on: ubuntu-latest + timeout-minutes: 20 + steps: + - name: Wait for required workflow runs on current 
commit + env: + GITHUB_TOKEN: ${{ github.token }} + REPOSITORY: ${{ github.repository }} + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + EVENT_NAME: ${{ github.event_name }} + REF_NAME: ${{ github.ref_name }} + BEFORE_SHA: ${{ github.event.before || '' }} + PR_NUMBER: ${{ github.event.pull_request.number || '' }} + run: | + set -euo pipefail + + api() { + curl -fsSL \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "Accept: application/vnd.github+json" \ + "$1" + } + + declare -A required_workflows=() + + add_workflow() { + required_workflows["$1"]=1 + } + + changed_files=() + if [[ "$EVENT_NAME" == "pull_request" && -n "$PR_NUMBER" ]]; then + while IFS= read -r path; do + [[ -n "$path" ]] && changed_files+=("$path") + done < <( + api "https://api.github.com/repos/$REPOSITORY/pulls/$PR_NUMBER/files?per_page=100" \ + | jq -r '.[].filename' + ) + elif [[ -n "$BEFORE_SHA" && "$BEFORE_SHA" != "0000000000000000000000000000000000000000" ]]; then + while IFS= read -r path; do + [[ -n "$path" ]] && changed_files+=("$path") + done < <( + api "https://api.github.com/repos/$REPOSITORY/compare/$BEFORE_SHA...$COMMIT_SHA" \ + | jq -r '.files[]?.filename' + ) + fi + + for path in "${changed_files[@]}"; do + case "$path" in + app_go/*|.github/workflows/go-docker.yml) + if [[ "$REF_NAME" == lab* ]]; then + add_workflow "Go Docker Publish" + fi + ;; + esac + + case "$path" in + app_python/*|.github/actions/python-setup/*|.github/workflows/python-ci.yml) + add_workflow "Python CI" + ;; + esac + + case "$path" in + app_python/*|.github/workflows/python-docker.yml) + if [[ "$REF_NAME" == lab* ]]; then + add_workflow "Python Docker Publish" + fi + ;; + esac + done + + deadline=$(( $(date +%s) + 900 )) + grace_deadline=$(( $(date +%s) + 60 )) + tracked_workflows=( + "Go Docker Publish" + "Python CI" + "Python Docker Publish" + ) + + while (( $(date +%s) < deadline )); do + runs_json="$( + api "https://api.github.com/repos/$REPOSITORY/actions/runs?head_sha=$COMMIT_SHA&per_page=100" + )" + + while
IFS= read -r workflow_name; do + [[ -n "$workflow_name" ]] && add_workflow "$workflow_name" + done < <( + jq -r ' + .workflow_runs[] + | select( + .name == "Go Docker Publish" + or .name == "Python CI" + or .name == "Python Docker Publish" + ) + | .name + ' <<<"$runs_json" + ) + + if (( ${#required_workflows[@]} == 0 )); then + echo "No prerequisite workflows apply to this commit." + exit 0 + fi + + pending=0 + failures=() + + for workflow_name in "${tracked_workflows[@]}"; do + if [[ -z "${required_workflows[$workflow_name]+x}" ]]; then + continue + fi + + run_json="$( + jq -c \ + --arg name "$workflow_name" \ + '.workflow_runs + | map(select(.name == $name)) + | sort_by(.run_started_at // .created_at) + | last // empty' <<<"$runs_json" + )" + + if [[ -z "$run_json" ]]; then + if (( $(date +%s) < grace_deadline )); then + echo "Waiting for workflow record: $workflow_name" + pending=1 + continue + fi + + echo "No run found for $workflow_name on $COMMIT_SHA after grace period; treating it as not triggered." + continue + fi + + status="$(jq -r '.status' <<<"$run_json")" + conclusion="$(jq -r '.conclusion // ""' <<<"$run_json")" + event="$(jq -r '.event' <<<"$run_json")" + html_url="$(jq -r '.html_url' <<<"$run_json")" + + echo "$workflow_name: status=$status conclusion=${conclusion:-n/a} event=$event" + + if [[ "$status" != "completed" ]]; then + pending=1 + continue + fi + + case "$conclusion" in + success|skipped) + ;; + *) + failures+=("$workflow_name ($conclusion) $html_url") + ;; + esac + done + + if (( ${#failures[@]} > 0 )); then + printf 'Prerequisite workflow failed: %s\n' "${failures[@]}" >&2 + exit 1 + fi + + if (( pending == 0 )); then + echo "All prerequisite workflows finished successfully." + exit 0 + fi + + sleep 15 + done + + echo "Timed out while waiting for prerequisite workflows on $COMMIT_SHA." 
>&2 + exit 1 + deploy: name: Deploy Application - needs: lint + needs: + - lint + - wait-for-prerequisites - if: github.event_name != 'pull_request' + if: ${{ !cancelled() && needs.lint.result == 'success' && (needs.wait-for-prerequisites.result == 'success' || needs.wait-for-prerequisites.result == 'skipped') && github.event_name != 'pull_request' }} runs-on: - self-hosted @@ -69,7 +241,7 @@ jobs: timeout-minutes: 20 steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Setup Ansible toolchain uses: ./.github/actions/ansible-setup @@ -131,7 +303,7 @@ jobs: - name: Upload deployment log if: always() && steps.deploy.outputs.log-path != '' - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: ansible-deploy-log path: ${{ steps.deploy.outputs.log-path }} diff --git a/.github/workflows/go-docker.yml b/.github/workflows/go-docker.yml new file mode 100644 index 0000000000..a1fcc9268f --- /dev/null +++ b/.github/workflows/go-docker.yml @@ -0,0 +1,82 @@ +name: Go Docker Publish + +on: + push: + branches: + - "lab*" + paths: + - app_go/** + - .github/workflows/go-docker.yml + pull_request: + branches: + - master + types: + - closed + paths: + - app_go/** + - .github/workflows/go-docker.yml + +jobs: + build-and-push-branch: + if: github.event_name == 'push' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Derive lab+sha tag from branch + id: version + run: | + source_branch="$GITHUB_REF_NAME" + if [[ "$source_branch" =~ ([0-9]+) ]]; then + lab_number="${BASH_REMATCH[1]}" + lab_number=$((10#$lab_number)) + short_sha="${GITHUB_SHA::7}" + echo "branch_tag=1.${lab_number}.${short_sha}" >> "$GITHUB_OUTPUT" + else + echo "Failed to extract lab number from branch: $source_branch" >&2 + exit 1 + fi + - name: Log in to Docker Hub + uses: docker/login-action@v4 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Build and push Docker image (branch) + uses: docker/build-push-action@v7 + with: + context: ./app_go + file: ./app_go/Dockerfile + push: true + tags: | + ${{ secrets.DOCKERHUB_USERNAME }}/devops-app-go:${{
steps.version.outputs.branch_tag }} + + build-and-push: + if: github.event.pull_request.merged == true + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Derive lab version tag from merged branch + id: version + run: | + source_branch="$GITHUB_HEAD_REF" + if [[ "$source_branch" =~ ([0-9]+) ]]; then + lab_number="${BASH_REMATCH[1]}" + lab_number=$((10#$lab_number)) + echo "version_tag=1.${lab_number}" >> "$GITHUB_OUTPUT" + else + echo "Failed to extract lab number from merged branch: $source_branch" >&2 + exit 1 + fi + - name: Log in to Docker Hub + uses: docker/login-action@v4 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Build and push Docker image + uses: docker/build-push-action@v7 + with: + context: ./app_go + file: ./app_go/Dockerfile + push: true + tags: | + ${{ secrets.DOCKERHUB_USERNAME }}/devops-app-go:${{ steps.version.outputs.version_tag }} + ${{ secrets.DOCKERHUB_USERNAME }}/devops-app-go:latest diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 712fe39c42..62c6a465fd 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -27,7 +27,7 @@ jobs: run: working-directory: ./app_python steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v6 - name: Setup Python tooling and dependencies uses: ./.github/actions/python-setup with: @@ -48,7 +48,7 @@ jobs: --cov-report=xml:test-results/coverage.xml - name: Upload pytest and coverage reports if: always() - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: python-test-reports path: | diff --git a/.github/workflows/python-docker.yml b/.github/workflows/python-docker.yml index dde175a645..dc486928df 100644 --- a/.github/workflows/python-docker.yml +++ b/.github/workflows/python-docker.yml @@ -21,7 +21,7 @@ jobs: if: github.event_name == 'push' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + -
uses: actions/checkout@v6 - name: Derive lab+sha tag from branch id: version run: | @@ -36,12 +36,12 @@ jobs: exit 1 fi - name: Log in to Docker Hub - uses: docker/login-action@v3 + uses: docker/login-action@v4 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Build and push Docker image (branch) - uses: docker/build-push-action@v6 + uses: docker/build-push-action@v7 with: context: ./app_python file: ./app_python/Dockerfile @@ -53,7 +53,7 @@ jobs: if: github.event.pull_request.merged == true runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Derive lab version tag from merged branch id: version run: | @@ -67,12 +67,12 @@ jobs: exit 1 fi - name: Log in to Docker Hub - uses: docker/login-action@v3 + uses: docker/login-action@v4 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Build and push Docker image - uses: docker/build-push-action@v6 + uses: docker/build-push-action@v7 with: context: ./app_python file: ./app_python/Dockerfile diff --git a/.github/workflows/python-snyk.yml b/.github/workflows/python-snyk.yml index c3297eccc1..a9967bc9b9 100644 --- a/.github/workflows/python-snyk.yml +++ b/.github/workflows/python-snyk.yml @@ -21,7 +21,7 @@ jobs: run: working-directory: ./app_python steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Setup Python tooling and dependencies uses: ./.github/actions/python-setup - name: Setup Snyk CLI diff --git a/ansible/playbooks/deploy-monitoring.yml b/ansible/playbooks/deploy-monitoring.yml new file mode 100644 index 0000000000..97a00f34e2 --- /dev/null +++ b/ansible/playbooks/deploy-monitoring.yml @@ -0,0 +1,18 @@ +--- +- name: Deploy monitoring stack + hosts: webservers + become: true + vars_files: + - ../group_vars/all.yml + + tasks: + - name: Run monitoring role + ansible.builtin.include_role: + name: monitoring + apply: + tags: + - monitoring + tags: + - 
monitoring + - monitoring_deploy + - compose diff --git a/ansible/roles/monitoring/defaults/main.yml b/ansible/roles/monitoring/defaults/main.yml new file mode 100644 index 0000000000..d9d610fbf5 --- /dev/null +++ b/ansible/roles/monitoring/defaults/main.yml @@ -0,0 +1,83 @@ +--- +monitoring_registry_username: "{{ dockerhub_username | default('') }}" +monitoring_registry_password: "{{ dockerhub_password | default(docker_api_token | default('')) }}" + +monitoring_dir: /opt/devops-monitoring +monitoring_compose_file: docker-compose.yml +monitoring_env_file: .env +monitoring_project_name: "{{ monitoring_dir | basename }}" + +monitoring_loki_version: 3.0.0 +monitoring_promtail_version: 3.0.0 +monitoring_grafana_version: 12.3.1 + +monitoring_loki_port: 3100 +monitoring_loki_grpc_port: 9096 +monitoring_promtail_port: 9080 +monitoring_grafana_port: 3000 + +monitoring_loki_retention_period: 168h +monitoring_loki_schema_version: v13 +monitoring_loki_schema_from: "2024-01-01" + +monitoring_grafana_admin_user: "{{ grafana_admin_user | default('admin') }}" +monitoring_grafana_admin_password: "{{ grafana_admin_password | default('ChangeMe!123') }}" +monitoring_grafana_allow_embedding: true +monitoring_grafana_anonymous_enabled: false + +monitoring_python_image: >- + {{ (monitoring_registry_username ~ '/devops-app-py') + if monitoring_registry_username | length > 0 else 'localt0aster/devops-app-py' }} +monitoring_python_tag: "{{ monitoring_python_image_tag | default(docker_image_tag | default('latest')) }}" +monitoring_python_port: 8000 +monitoring_python_internal_port: 8000 +monitoring_python_app_label: devops-python +monitoring_python_health_path: /health + +monitoring_go_image: >- + {{ (monitoring_registry_username ~ '/devops-app-go') + if monitoring_registry_username | length > 0 else 'localt0aster/devops-app-go' }} +monitoring_go_tag: "{{ monitoring_go_image_tag | default(docker_image_tag | default('latest')) }}" +monitoring_go_port: 8001 +monitoring_go_internal_port: 8001 
+monitoring_go_app_label: devops-go +monitoring_go_health_path: /health +monitoring_go_external_healthcheck_enabled: true +monitoring_go_healthcheck_image: curlimages/curl:8.18.0 + +monitoring_loki_cpu_limit: "1.0" +monitoring_loki_memory_limit: 1G +monitoring_loki_cpu_reservation: "0.25" +monitoring_loki_memory_reservation: 256M + +monitoring_promtail_cpu_limit: "0.5" +monitoring_promtail_memory_limit: 256M +monitoring_promtail_cpu_reservation: "0.10" +monitoring_promtail_memory_reservation: 64M + +monitoring_grafana_cpu_limit: "1.0" +monitoring_grafana_memory_limit: 512M +monitoring_grafana_cpu_reservation: "0.25" +monitoring_grafana_memory_reservation: 128M + +monitoring_app_cpu_limit: "0.5" +monitoring_app_memory_limit: 256M +monitoring_app_cpu_reservation: "0.10" +monitoring_app_memory_reservation: 64M + +monitoring_go_healthcheck_cpu_limit: "0.10" +monitoring_go_healthcheck_memory_limit: 64M +monitoring_go_healthcheck_cpu_reservation: "0.05" +monitoring_go_healthcheck_memory_reservation: 32M + +monitoring_compose_pull_policy: always +monitoring_compose_recreate: auto +monitoring_compose_wait: true +monitoring_compose_wait_timeout: 180 +monitoring_compose_remove_orphans: true + +monitoring_healthcheck_host: "{{ monitoring_public_host | default(ansible_host | default(inventory_hostname)) }}" +monitoring_healthcheck_delegate_to: "{{ monitoring_healthcheck_delegate | default('localhost') }}" +monitoring_healthcheck_timeout: 5 +monitoring_healthcheck_retries: 20 +monitoring_healthcheck_delay: 3 diff --git a/ansible/roles/monitoring/tasks/deploy.yml b/ansible/roles/monitoring/tasks/deploy.yml new file mode 100644 index 0000000000..ccb6b6b4a0 --- /dev/null +++ b/ansible/roles/monitoring/tasks/deploy.yml @@ -0,0 +1,174 @@ +--- +- name: Skip monitoring deployment actions in check mode + ansible.builtin.debug: + msg: Monitoring stack deployment is skipped in check mode. 
+ when: ansible_check_mode + +- name: Deploy monitoring stack with Docker Compose + tags: + - monitoring + - monitoring_deploy + - compose + when: not ansible_check_mode + block: + - name: Log in to Docker Hub when credentials are available + community.docker.docker_login: + registry_url: https://index.docker.io/v1/ + username: "{{ monitoring_registry_username }}" + password: "{{ monitoring_registry_password }}" + no_log: true + when: + - monitoring_registry_username | string | length > 0 + - monitoring_registry_password | string | length > 0 + + - name: Deploy monitoring stack with Docker Compose v2 + community.docker.docker_compose_v2: + project_src: "{{ monitoring_dir }}" + files: + - "{{ monitoring_compose_file }}" + pull: "{{ monitoring_compose_pull_policy }}" + recreate: "{{ monitoring_compose_recreate }}" + remove_orphans: "{{ monitoring_compose_remove_orphans | bool }}" + state: present + wait: "{{ monitoring_compose_wait | bool }}" + wait_timeout: "{{ monitoring_compose_wait_timeout | int }}" + register: monitoring_compose_result + retries: 3 + delay: 10 + until: monitoring_compose_result is succeeded + + - name: Wait for exposed monitoring ports + ansible.builtin.wait_for: + host: "{{ monitoring_healthcheck_host }}" + port: "{{ item | int }}" + timeout: 60 + delay: 1 + loop: + - "{{ monitoring_loki_port }}" + - "{{ monitoring_promtail_port }}" + - "{{ monitoring_grafana_port }}" + - "{{ monitoring_python_port }}" + - "{{ monitoring_go_port }}" + delegate_to: "{{ monitoring_healthcheck_delegate_to }}" + become: false + + - name: Verify Loki readiness endpoint + ansible.builtin.uri: + url: "http://{{ monitoring_healthcheck_host }}:{{ monitoring_loki_port }}/ready" + method: GET + status_code: 200 + return_content: true + timeout: "{{ monitoring_healthcheck_timeout | int }}" + register: monitoring_loki_ready + retries: "{{ monitoring_healthcheck_retries | int }}" + delay: "{{ monitoring_healthcheck_delay | int }}" + until: + - monitoring_loki_ready.status == 
200 + - "'ready' in (monitoring_loki_ready.content | default(''))" + delegate_to: "{{ monitoring_healthcheck_delegate_to }}" + become: false + + - name: Verify Promtail targets endpoint + ansible.builtin.uri: + url: "http://{{ monitoring_healthcheck_host }}:{{ monitoring_promtail_port }}/targets" + method: GET + status_code: 200 + timeout: "{{ monitoring_healthcheck_timeout | int }}" + register: monitoring_promtail_targets + retries: "{{ monitoring_healthcheck_retries | int }}" + delay: "{{ monitoring_healthcheck_delay | int }}" + until: monitoring_promtail_targets.status == 200 + delegate_to: "{{ monitoring_healthcheck_delegate_to }}" + become: false + + - name: Verify Grafana API health + ansible.builtin.uri: + url: "http://{{ monitoring_healthcheck_host }}:{{ monitoring_grafana_port }}/api/health" + method: GET + status_code: 200 + timeout: "{{ monitoring_healthcheck_timeout | int }}" + register: monitoring_grafana_health + retries: "{{ monitoring_healthcheck_retries | int }}" + delay: "{{ monitoring_healthcheck_delay | int }}" + until: monitoring_grafana_health.status == 200 + delegate_to: "{{ monitoring_healthcheck_delegate_to }}" + become: false + + - name: Verify Grafana requires authentication + ansible.builtin.uri: + url: "http://{{ monitoring_healthcheck_host }}:{{ monitoring_grafana_port }}/api/user" + method: GET + status_code: 401 + timeout: "{{ monitoring_healthcheck_timeout | int }}" + register: monitoring_grafana_auth_gate + retries: "{{ monitoring_healthcheck_retries | int }}" + delay: "{{ monitoring_healthcheck_delay | int }}" + until: monitoring_grafana_auth_gate.status == 401 + delegate_to: "{{ monitoring_healthcheck_delegate_to }}" + become: false + + - name: Verify Python application health endpoint + ansible.builtin.uri: + url: "http://{{ monitoring_healthcheck_host }}:{{ monitoring_python_port }}{{ monitoring_python_health_path }}" + method: GET + status_code: 200 + return_content: true + timeout: "{{ monitoring_healthcheck_timeout | int }}" 
+ register: monitoring_python_health + retries: "{{ monitoring_healthcheck_retries | int }}" + delay: "{{ monitoring_healthcheck_delay | int }}" + until: + - monitoring_python_health.status == 200 + - monitoring_python_health.json.status | default('') == 'healthy' + delegate_to: "{{ monitoring_healthcheck_delegate_to }}" + become: false + + - name: Verify Go application health endpoint + ansible.builtin.uri: + url: "http://{{ monitoring_healthcheck_host }}:{{ monitoring_go_port }}{{ monitoring_go_health_path }}" + method: GET + status_code: 200 + return_content: true + timeout: "{{ monitoring_healthcheck_timeout | int }}" + register: monitoring_go_health + retries: "{{ monitoring_healthcheck_retries | int }}" + delay: "{{ monitoring_healthcheck_delay | int }}" + until: + - monitoring_go_health.status == 200 + - monitoring_go_health.json.status | default('') == 'healthy' + delegate_to: "{{ monitoring_healthcheck_delegate_to }}" + become: false + + - name: Read external Go healthcheck container info + community.docker.docker_container_info: + name: "{{ monitoring_project_name }}-app-go-healthcheck-1" + register: monitoring_go_healthcheck_container + when: monitoring_go_external_healthcheck_enabled | bool + + - name: Assert external Go healthcheck is healthy + ansible.builtin.assert: + that: + - monitoring_go_healthcheck_container.exists | bool + - monitoring_go_healthcheck_container.container.State.Health.Status == 'healthy' + fail_msg: External Go healthcheck container is not healthy. + when: monitoring_go_external_healthcheck_enabled | bool + + rescue: + - name: Capture docker compose status after failed monitoring deployment + ansible.builtin.command: + argv: + - docker + - compose + - -f + - "{{ monitoring_dir }}/{{ monitoring_compose_file }}" + - ps + - --all + register: monitoring_compose_ps + changed_when: false + failed_when: false + + - name: Fail deployment with compose status context + ansible.builtin.fail: + msg: >- + Monitoring deployment failed. 
Compose status: + {{ monitoring_compose_ps.stdout | default('no compose status available') }} diff --git a/ansible/roles/monitoring/tasks/main.yml b/ansible/roles/monitoring/tasks/main.yml new file mode 100644 index 0000000000..e0b934409c --- /dev/null +++ b/ansible/roles/monitoring/tasks/main.yml @@ -0,0 +1,8 @@ +--- +- name: Prepare monitoring stack files + ansible.builtin.include_tasks: + file: setup.yml + +- name: Deploy monitoring stack + ansible.builtin.include_tasks: + file: deploy.yml diff --git a/ansible/roles/monitoring/tasks/setup.yml b/ansible/roles/monitoring/tasks/setup.yml new file mode 100644 index 0000000000..45fc1fe486 --- /dev/null +++ b/ansible/roles/monitoring/tasks/setup.yml @@ -0,0 +1,55 @@ +--- +- name: Ensure monitoring directory structure exists + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: root + group: root + mode: "0755" + loop: + - "{{ monitoring_dir }}" + - "{{ monitoring_dir }}/loki" + - "{{ monitoring_dir }}/promtail" + - "{{ monitoring_dir }}/grafana" + - "{{ monitoring_dir }}/grafana/provisioning" + - "{{ monitoring_dir }}/grafana/provisioning/datasources" + +- name: Template monitoring environment file + ansible.builtin.template: + src: env.j2 + dest: "{{ monitoring_dir }}/{{ monitoring_env_file }}" + owner: root + group: root + mode: "0600" + +- name: Template monitoring Docker Compose configuration + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ monitoring_dir }}/{{ monitoring_compose_file }}" + owner: root + group: root + mode: "0644" + +- name: Template Loki configuration + ansible.builtin.template: + src: loki-config.yml.j2 + dest: "{{ monitoring_dir }}/loki/config.yml" + owner: root + group: root + mode: "0644" + +- name: Template Promtail configuration + ansible.builtin.template: + src: promtail-config.yml.j2 + dest: "{{ monitoring_dir }}/promtail/config.yml" + owner: root + group: root + mode: "0644" + +- name: Template Grafana Loki datasource provisioning + 
ansible.builtin.template: + src: grafana-loki-datasource.yml.j2 + dest: "{{ monitoring_dir }}/grafana/provisioning/datasources/loki.yml" + owner: root + group: root + mode: "0644" diff --git a/ansible/roles/monitoring/templates/docker-compose.yml.j2 b/ansible/roles/monitoring/templates/docker-compose.yml.j2 new file mode 100644 index 0000000000..73178674d3 --- /dev/null +++ b/ansible/roles/monitoring/templates/docker-compose.yml.j2 @@ -0,0 +1,183 @@ +services: + loki: + image: grafana/loki:{{ monitoring_loki_version }} + command: + - -config.file=/etc/loki/config.yml + ports: + - "{{ monitoring_loki_port }}:{{ monitoring_loki_port }}" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + healthcheck: + test: + - CMD-SHELL + - wget --no-verbose --tries=1 --spider http://127.0.0.1:{{ monitoring_loki_port }}/ready || exit 1 + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + deploy: + resources: + limits: + cpus: "{{ monitoring_loki_cpu_limit }}" + memory: {{ monitoring_loki_memory_limit }} + reservations: + cpus: "{{ monitoring_loki_cpu_reservation }}" + memory: {{ monitoring_loki_memory_reservation }} + networks: + - monitoring + restart: unless-stopped + + promtail: + image: grafana/promtail:{{ monitoring_promtail_version }} + command: + - -config.file=/etc/promtail/config.yml + user: "0:0" + depends_on: + loki: + condition: service_healthy + ports: + - "{{ monitoring_promtail_port }}:{{ monitoring_promtail_port }}" + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - promtail-data:/run/promtail + - /var/run/docker.sock:/var/run/docker.sock:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + deploy: + resources: + limits: + cpus: "{{ monitoring_promtail_cpu_limit }}" + memory: {{ monitoring_promtail_memory_limit }} + reservations: + cpus: "{{ monitoring_promtail_cpu_reservation }}" + memory: {{ monitoring_promtail_memory_reservation }} + networks: + - monitoring + restart: unless-stopped + + 
grafana: + image: grafana/grafana:{{ monitoring_grafana_version }} + depends_on: + loki: + condition: service_healthy + env_file: + - .env + environment: + GF_AUTH_ANONYMOUS_ENABLED: "{{ monitoring_grafana_anonymous_enabled | ternary('true', 'false') }}" + GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER} + GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD} + GF_SECURITY_ALLOW_EMBEDDING: "{{ monitoring_grafana_allow_embedding | ternary('true', 'false') }}" + ports: + - "{{ monitoring_grafana_port }}:3000" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro + healthcheck: + test: + - CMD-SHELL + - wget --no-verbose --tries=1 --spider http://127.0.0.1:3000/api/health || exit 1 + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + deploy: + resources: + limits: + cpus: "{{ monitoring_grafana_cpu_limit }}" + memory: {{ monitoring_grafana_memory_limit }} + reservations: + cpus: "{{ monitoring_grafana_cpu_reservation }}" + memory: {{ monitoring_grafana_memory_reservation }} + networks: + - monitoring + restart: unless-stopped + + app-python: + image: {{ monitoring_python_image }}:{{ monitoring_python_tag }} + environment: + HOST: "0.0.0.0" + PORT: "{{ monitoring_python_internal_port }}" + ports: + - "{{ monitoring_python_port }}:{{ monitoring_python_internal_port }}" + labels: + logging: "promtail" + app: "{{ monitoring_python_app_label }}" + healthcheck: + test: + - CMD-SHELL + - wget --no-verbose --tries=1 --spider http://127.0.0.1:{{ monitoring_python_internal_port }}{{ monitoring_python_health_path }} || exit 1 + interval: 15s + timeout: 5s + retries: 5 + start_period: 10s + deploy: + resources: + limits: + cpus: "{{ monitoring_app_cpu_limit }}" + memory: {{ monitoring_app_memory_limit }} + reservations: + cpus: "{{ monitoring_app_cpu_reservation }}" + memory: {{ monitoring_app_memory_reservation }} + networks: + - monitoring + restart: unless-stopped + + app-go: + image: {{ 
monitoring_go_image }}:{{ monitoring_go_tag }} + environment: + HOST: "0.0.0.0" + PORT: "{{ monitoring_go_internal_port }}" + ports: + - "{{ monitoring_go_port }}:{{ monitoring_go_internal_port }}" + labels: + logging: "promtail" + app: "{{ monitoring_go_app_label }}" + deploy: + resources: + limits: + cpus: "{{ monitoring_app_cpu_limit }}" + memory: {{ monitoring_app_memory_limit }} + reservations: + cpus: "{{ monitoring_app_cpu_reservation }}" + memory: {{ monitoring_app_memory_reservation }} + networks: + - monitoring + restart: unless-stopped +{% if monitoring_go_external_healthcheck_enabled | bool %} + + app-go-healthcheck: + image: {{ monitoring_go_healthcheck_image }} + command: + - sh + - -c + - sleep infinity + depends_on: + - app-go + healthcheck: + test: + - CMD-SHELL + - curl -fsS http://app-go:{{ monitoring_go_internal_port }}{{ monitoring_go_health_path }} >/dev/null || exit 1 + interval: 15s + timeout: 5s + retries: 5 + start_period: 10s + deploy: + resources: + limits: + cpus: "{{ monitoring_go_healthcheck_cpu_limit }}" + memory: {{ monitoring_go_healthcheck_memory_limit }} + reservations: + cpus: "{{ monitoring_go_healthcheck_cpu_reservation }}" + memory: {{ monitoring_go_healthcheck_memory_reservation }} + networks: + - monitoring + restart: unless-stopped +{% endif %} + +volumes: + loki-data: + promtail-data: + grafana-data: + +networks: + monitoring: diff --git a/ansible/roles/monitoring/templates/env.j2 b/ansible/roles/monitoring/templates/env.j2 new file mode 100644 index 0000000000..c5134d339c --- /dev/null +++ b/ansible/roles/monitoring/templates/env.j2 @@ -0,0 +1,2 @@ +GRAFANA_ADMIN_USER={{ monitoring_grafana_admin_user }} +GRAFANA_ADMIN_PASSWORD={{ monitoring_grafana_admin_password }} diff --git a/ansible/roles/monitoring/templates/grafana-loki-datasource.yml.j2 b/ansible/roles/monitoring/templates/grafana-loki-datasource.yml.j2 new file mode 100644 index 0000000000..f13a7c7c40 --- /dev/null +++ 
b/ansible/roles/monitoring/templates/grafana-loki-datasource.yml.j2 @@ -0,0 +1,10 @@ +apiVersion: 1 + +datasources: + - name: Loki + uid: loki + type: loki + access: proxy + url: http://loki:{{ monitoring_loki_port }} + isDefault: true + editable: true diff --git a/ansible/roles/monitoring/templates/loki-config.yml.j2 b/ansible/roles/monitoring/templates/loki-config.yml.j2 new file mode 100644 index 0000000000..c6b83653fa --- /dev/null +++ b/ansible/roles/monitoring/templates/loki-config.yml.j2 @@ -0,0 +1,42 @@ +auth_enabled: false + +server: + http_listen_port: {{ monitoring_loki_port }} + grpc_listen_port: {{ monitoring_loki_grpc_port }} + +common: + path_prefix: /loki + replication_factor: 1 + ring: + kvstore: + store: inmemory + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + +schema_config: + configs: + - from: {{ monitoring_loki_schema_from }} + store: tsdb + object_store: filesystem + schema: {{ monitoring_loki_schema_version }} + index: + prefix: index_ + period: 24h + +storage_config: + tsdb_shipper: + active_index_directory: /loki/tsdb-index + cache_location: /loki/tsdb-cache + filesystem: + directory: /loki/chunks + +limits_config: + retention_period: {{ monitoring_loki_retention_period }} + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + delete_request_store: filesystem diff --git a/ansible/roles/monitoring/templates/promtail-config.yml.j2 b/ansible/roles/monitoring/templates/promtail-config.yml.j2 new file mode 100644 index 0000000000..2cb509ba23 --- /dev/null +++ b/ansible/roles/monitoring/templates/promtail-config.yml.j2 @@ -0,0 +1,38 @@ +server: + http_listen_port: {{ monitoring_promtail_port }} + grpc_listen_port: 0 + +positions: + filename: /run/promtail/positions.yaml + +clients: + - url: http://loki:{{ monitoring_loki_port }}/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + 
refresh_interval: 5s + filters: + - name: label + values: + - logging=promtail + relabel_configs: + - target_label: job + replacement: docker + - source_labels: + - __meta_docker_container_label_app + action: replace + regex: "(.+)" + replacement: "$1" + target_label: app + - source_labels: + - __meta_docker_container_name + regex: "/(.*)" + target_label: container + - source_labels: + - __meta_docker_container_label_com_docker_compose_service + target_label: compose_service + - source_labels: + - __meta_docker_container_log_stream + target_label: logstream diff --git a/app_go/main.go b/app_go/main.go index 194aa657fc..14aa5441d8 100644 --- a/app_go/main.go +++ b/app_go/main.go @@ -4,20 +4,23 @@ package main import ( "encoding/json" "fmt" - "log" + "io" "net" "net/http" "os" "runtime" "strings" + "sync" "time" ) const ( serviceName = "devops-info-service" - serviceVersion = "1.0.0" + serviceVersion = "1.7.0" serviceDescription = "DevOps course info service" serviceFramework = "Go net/http" + serviceLoggerName = "devops_info_service" + accessLoggerName = "http.access" ) type ServiceInfo struct { @@ -71,6 +74,8 @@ type HealthResponse struct { var ( // startTime is used for uptime calculations. startTime = time.Now().UTC() + logMu sync.Mutex + logOutput io.Writer = os.Stdout // endpoints is a static list used to mirror the Python app output. endpoints = []EndpointInfo{ {Path: "/", Method: http.MethodGet, Description: "Service information."}, @@ -78,6 +83,12 @@ var ( } ) +type responseRecorder struct { + http.ResponseWriter + statusCode int + bytesWritten int +} + // getServiceInfo returns static service metadata. 
func getServiceInfo() ServiceInfo { return ServiceInfo{ @@ -195,6 +206,61 @@ func listEndpoints() []EndpointInfo { return endpoints } +func newResponseRecorder(w http.ResponseWriter) *responseRecorder { + return &responseRecorder{ + ResponseWriter: w, + statusCode: http.StatusOK, + } +} + +func (recorder *responseRecorder) WriteHeader(statusCode int) { + recorder.statusCode = statusCode + recorder.ResponseWriter.WriteHeader(statusCode) +} + +func (recorder *responseRecorder) Write(data []byte) (int, error) { + written, err := recorder.ResponseWriter.Write(data) + recorder.bytesWritten += written + return written, err +} + +func emitLog(level, loggerName, message string, fields map[string]any) { + payload := map[string]any{ + "timestamp": time.Now().UTC().Format(time.RFC3339Nano), + "level": level, + "logger": loggerName, + } + + if message != "" { + payload["message"] = message + } + + for key, value := range fields { + payload[key] = value + } + + encoded, err := json.Marshal(payload) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to marshal log entry: %v\n", err) + return + } + + logMu.Lock() + defer logMu.Unlock() + + if _, err := fmt.Fprintln(logOutput, string(encoded)); err != nil { + fmt.Fprintf(os.Stderr, "failed to write log entry: %v\n", err) + } +} + +func queryString(r *http.Request) string { + if r.URL.RawQuery == "" { + return "" + } + + return "?" + r.URL.RawQuery +} + // mainHandler serves GET /. 
func mainHandler(w http.ResponseWriter, r *http.Request) { payload := RootResponse{ @@ -244,24 +310,53 @@ func recoverMiddleware(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { defer func() { if err := recover(); err != nil { - log.Printf("panic: %v", err) + emitLog("ERROR", serviceLoggerName, "request panic recovered", map[string]any{ + "error": fmt.Sprint(err), + "client_ip": clientIP(r), + "method": r.Method, + "path": r.URL.Path, + "query": queryString(r), + "user_agent": r.Header.Get("User-Agent"), + }) writeJSON(w, http.StatusInternalServerError, map[string]string{ "error": "Internal Server Error", "message": "An unexpected error occurred", }) } }() - log.Printf("Request: %s %s", r.Method, r.URL.Path) next.ServeHTTP(w, r) }) } +func requestLoggingMiddleware(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + startedAt := time.Now() + recorder := newResponseRecorder(w) + + next.ServeHTTP(recorder, r) + + emitLog("INFO", accessLoggerName, "", map[string]any{ + "client_ip": clientIP(r), + "method": r.Method, + "path": r.URL.Path, + "query": queryString(r), + "status_code": recorder.statusCode, + "response_bytes": fmt.Sprintf("%d", recorder.bytesWritten), + "request_time_us": time.Since(startedAt).Microseconds(), + "user_agent": r.Header.Get("User-Agent"), + }) + }) +} + // writeJSON serializes a payload with the given status code. 
func writeJSON(w http.ResponseWriter, status int, payload any) { w.Header().Set("Content-Type", "application/json") w.WriteHeader(status) if err := json.NewEncoder(w).Encode(payload); err != nil { - log.Printf("encode error: %v", err) + emitLog("ERROR", serviceLoggerName, "failed to encode response", map[string]any{ + "status_code": status, + "error": err.Error(), + }) } } @@ -277,10 +372,17 @@ func main() { } addr := net.JoinHostPort(host, port) - log.Printf("Application starting on %s", addr) + emitLog("INFO", serviceLoggerName, "application starting", map[string]any{ + "address": addr, + "service": serviceName, + "version": serviceVersion, + }) - handler := recoverMiddleware(http.HandlerFunc(router)) + handler := requestLoggingMiddleware(recoverMiddleware(http.HandlerFunc(router))) if err := http.ListenAndServe(addr, handler); err != nil { - log.Fatalf("server error: %v", err) + emitLog("ERROR", serviceLoggerName, "server error", map[string]any{ + "error": err.Error(), + }) + os.Exit(1) } } diff --git a/app_go/main_test.go b/app_go/main_test.go new file mode 100644 index 0000000000..2622645ec9 --- /dev/null +++ b/app_go/main_test.go @@ -0,0 +1,140 @@ +package main + +import ( + "bytes" + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "testing" +) + +func captureLogOutput(w io.Writer) func() { + logMu.Lock() + previous := logOutput + logOutput = w + logMu.Unlock() + + return func() { + logMu.Lock() + logOutput = previous + logMu.Unlock() + } +} + +func decodeLogEntry(t *testing.T, buffer *bytes.Buffer) map[string]any { + t.Helper() + + lines := bytes.Split(bytes.TrimSpace(buffer.Bytes()), []byte("\n")) + if len(lines) != 1 { + t.Fatalf("expected exactly one log line, got %d", len(lines)) + } + + var entry map[string]any + if err := json.Unmarshal(lines[0], &entry); err != nil { + t.Fatalf("failed to decode log entry: %v", err) + } + + return entry +} + +func TestRequestLoggingMiddlewareEmitsJSONAccessLog(t *testing.T) { + var buffer bytes.Buffer + 
restore := captureLogOutput(&buffer) + defer restore() + + handler := requestLoggingMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusCreated) + _, _ = w.Write([]byte(`{"ok":true}`)) + })) + + request := httptest.NewRequest(http.MethodGet, "/health?full=1", nil) + request.RemoteAddr = "203.0.113.10:4321" + request.Header.Set("User-Agent", "go-test") + + recorder := httptest.NewRecorder() + handler.ServeHTTP(recorder, request) + + if recorder.Code != http.StatusCreated { + t.Fatalf("expected status %d, got %d", http.StatusCreated, recorder.Code) + } + + entry := decodeLogEntry(t, &buffer) + if entry["level"] != "INFO" { + t.Fatalf("expected INFO level, got %#v", entry["level"]) + } + if entry["logger"] != accessLoggerName { + t.Fatalf("expected logger %q, got %#v", accessLoggerName, entry["logger"]) + } + if entry["client_ip"] != "203.0.113.10" { + t.Fatalf("expected client_ip to be logged, got %#v", entry["client_ip"]) + } + if entry["method"] != http.MethodGet { + t.Fatalf("expected method to be logged, got %#v", entry["method"]) + } + if entry["path"] != "/health" { + t.Fatalf("expected path to be logged, got %#v", entry["path"]) + } + if entry["query"] != "?full=1" { + t.Fatalf("expected query string to be logged, got %#v", entry["query"]) + } + if entry["status_code"] != float64(http.StatusCreated) { + t.Fatalf("expected status_code to be logged, got %#v", entry["status_code"]) + } + if entry["response_bytes"] != "11" { + t.Fatalf("expected response_bytes to be logged, got %#v", entry["response_bytes"]) + } + if _, ok := entry["request_time_us"].(float64); !ok { + t.Fatalf("expected request_time_us to be numeric, got %#v", entry["request_time_us"]) + } + if entry["user_agent"] != "go-test" { + t.Fatalf("expected user_agent to be logged, got %#v", entry["user_agent"]) + } + if _, hasMessage := entry["message"]; hasMessage { + t.Fatalf("access log should not include message, got %#v", entry["message"]) + } +} + 
+func TestRecoverMiddlewareEmitsJSONPanicLog(t *testing.T) { + var buffer bytes.Buffer + restore := captureLogOutput(&buffer) + defer restore() + + handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + panic("boom") + })) + + request := httptest.NewRequest(http.MethodGet, "/explode", nil) + request.RemoteAddr = "203.0.113.20:8080" + request.Header.Set("User-Agent", "go-test") + + recorder := httptest.NewRecorder() + handler.ServeHTTP(recorder, request) + + if recorder.Code != http.StatusInternalServerError { + t.Fatalf("expected status %d, got %d", http.StatusInternalServerError, recorder.Code) + } + + entry := decodeLogEntry(t, &buffer) + if entry["level"] != "ERROR" { + t.Fatalf("expected ERROR level, got %#v", entry["level"]) + } + if entry["logger"] != serviceLoggerName { + t.Fatalf("expected logger %q, got %#v", serviceLoggerName, entry["logger"]) + } + if entry["message"] != "request panic recovered" { + t.Fatalf("expected panic message to be logged, got %#v", entry["message"]) + } + if entry["error"] != "boom" { + t.Fatalf("expected panic error to be logged, got %#v", entry["error"]) + } + if entry["path"] != "/explode" { + t.Fatalf("expected panic path to be logged, got %#v", entry["path"]) + } + if entry["query"] != "" { + t.Fatalf("expected empty query string, got %#v", entry["query"]) + } + if entry["client_ip"] != "203.0.113.20" { + t.Fatalf("expected client_ip to be logged, got %#v", entry["client_ip"]) + } +} diff --git a/app_python/.dockerignore b/app_python/.dockerignore index 511a810855..deb2fd9687 100644 --- a/app_python/.dockerignore +++ b/app_python/.dockerignore @@ -2,3 +2,4 @@ !src/** !pyproject.toml !poetry.lock +!gunicorn.conf.py diff --git a/app_python/Dockerfile b/app_python/Dockerfile index f28f1d8a88..2c59bb0ca7 100644 --- a/app_python/Dockerfile +++ b/app_python/Dockerfile @@ -10,7 +10,7 @@ RUN pip install --no-cache-dir "poetry==$POETRY_VERSION" \ WORKDIR /app -COPY pyproject.toml poetry.lock ./ 
+COPY pyproject.toml poetry.lock gunicorn.conf.py ./ RUN poetry config virtualenvs.create false \ && poetry install --only main --no-interaction --no-ansi --no-root @@ -20,4 +20,4 @@ ENV PORT=5000 ENV HOST="0.0.0.0" USER appuser -CMD ["sh", "-c", "gunicorn --bind ${HOST:-0.0.0.0}:${PORT:-5000} src.main:app"] +CMD ["sh", "-c", "gunicorn --config /app/gunicorn.conf.py src.main:app"] diff --git a/app_python/README.md b/app_python/README.md index ffbc05e2ba..1415a24a2a 100644 --- a/app_python/README.md +++ b/app_python/README.md @@ -37,11 +37,12 @@ poetry install Production-style local run with Gunicorn: ```bash -poetry run gunicorn --bind 0.0.0.0:5000 src.main:app -# Or with custom config -HOST=127.0.0.1 PORT=8080 poetry run gunicorn --bind 127.0.0.1:8080 src.main:app +poetry run gunicorn --config gunicorn.conf.py src.main:app +HOST=127.0.0.1 PORT=8080 poetry run gunicorn --config gunicorn.conf.py src.main:app ``` +Gunicorn access logs are emitted as JSON so Loki can parse request fields cleanly. 
+ ### Docker - Run the container: diff --git a/app_python/gunicorn.conf.py b/app_python/gunicorn.conf.py new file mode 100644 index 0000000000..187118541e --- /dev/null +++ b/app_python/gunicorn.conf.py @@ -0,0 +1,17 @@ +"""Gunicorn configuration for container deployment.""" + +from __future__ import annotations + +import os + +bind = f"{os.getenv('HOST', '0.0.0.0')}:{os.getenv('PORT', '5000')}" +workers = int(os.getenv("GUNICORN_WORKERS", "1")) +accesslog = "-" +errorlog = "-" +loglevel = os.getenv("LOG_LEVEL", "info").lower() +access_log_format = ( + '{"timestamp":"%(t)s","level":"INFO","logger":"gunicorn.access",' + '"client_ip":"%(h)s","method":"%(m)s","path":"%(U)s","query":"%(q)s",' + '"status_code":%(s)s,"response_bytes":"%(B)s","request_time_us":%(D)s,' + '"user_agent":"%(a)s"}' +) diff --git a/app_python/src/flask_instance.py b/app_python/src/flask_instance.py index 7f3267ba03..13950caaff 100644 --- a/app_python/src/flask_instance.py +++ b/app_python/src/flask_instance.py @@ -1,16 +1,29 @@ -""" -Flask app instance and shared process-level state. -""" +"""Flask app instance and shared process-level state.""" from datetime import datetime, timezone -import logging +import os from flask import Flask -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" -) -logger = logging.getLogger(__name__) +try: + from .logging_utils import configure_json_logger +except ImportError: # pragma: no cover - allows `python src/main.py` + from logging_utils import configure_json_logger app = Flask("DevOps Info Service") START_TIME = datetime.now(timezone.utc) # Application start time (UTC). 
+logger = configure_json_logger("devops_info_service") + +app.logger.handlers = list(logger.handlers) +app.logger.setLevel(logger.level) +app.logger.propagate = False + +logger.info( + "application initialized", + extra={ + "event": "startup", + "host": os.getenv("HOST", "0.0.0.0"), + "port": int(os.getenv("PORT", 5000)), + "debug": os.getenv("DEBUG", "False").lower() == "true", + }, +) diff --git a/app_python/src/logging_utils.py b/app_python/src/logging_utils.py new file mode 100644 index 0000000000..1f4017ee6a --- /dev/null +++ b/app_python/src/logging_utils.py @@ -0,0 +1,73 @@ +"""Shared JSON logging helpers for the Python service.""" + +from __future__ import annotations + +from datetime import datetime, timezone +import json +import logging +import os +import sys +from typing import Any + +_RESERVED_RECORD_FIELDS = frozenset( + vars(logging.LogRecord("", logging.INFO, "", 0, "", (), None)).keys() +) | {"message", "asctime"} + + +def _to_jsonable(value: Any) -> Any: + """Convert values into JSON-safe representations.""" + if isinstance(value, (str, int, float, bool)) or value is None: + return value + if isinstance(value, datetime): + return value.astimezone(timezone.utc).isoformat().replace("+00:00", "Z") + if isinstance(value, dict): + return {str(key): _to_jsonable(item) for key, item in value.items()} + if isinstance(value, (list, tuple, set)): + return [_to_jsonable(item) for item in value] + return str(value) + + +class JSONFormatter(logging.Formatter): + """Format log records as a single JSON object per line.""" + + def format(self, record: logging.LogRecord) -> str: + payload: dict[str, Any] = { + "timestamp": datetime.fromtimestamp( + record.created, tz=timezone.utc + ).isoformat().replace("+00:00", "Z"), + "level": record.levelname, + "logger": record.name, + "message": record.getMessage(), + } + + for key, value in record.__dict__.items(): + if key in _RESERVED_RECORD_FIELDS or key.startswith("_"): + continue + payload[key] = _to_jsonable(value) + + 
if record.exc_info: + payload["exc_info"] = self.formatException(record.exc_info) + if record.stack_info: + payload["stack_info"] = self.formatStack(record.stack_info) + + return json.dumps(payload, separators=(",", ":")) + + +def get_log_level() -> int: + """Return the configured application log level.""" + raw_level = os.getenv("LOG_LEVEL", "INFO").upper() + return getattr(logging, raw_level, logging.INFO) + + +def configure_json_logger(name: str) -> logging.Logger: + """Create a stdout logger that emits JSON records.""" + logger = logging.getLogger(name) + logger.handlers.clear() + logger.setLevel(get_log_level()) + logger.propagate = False + + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter(JSONFormatter()) + logger.addHandler(handler) + + return logger diff --git a/app_python/src/router.py b/app_python/src/router.py index 994b1340e4..b2590de495 100644 --- a/app_python/src/router.py +++ b/app_python/src/router.py @@ -15,7 +15,7 @@ except ImportError: # pragma: no cover - allows `python src/main.py` from flask_instance import START_TIME, app, logger -__version__ = "1.0.0" +__version__ = "1.7.0" def get_service_info() -> dict[str, str]: @@ -84,6 +84,13 @@ def get_request_info(req) -> dict[str, str | None]: } +def get_request_log_context(req, status_code: int) -> dict[str, str | int | None]: + """Return request metadata suitable for structured logs.""" + context = get_request_info(req) + context["status_code"] = status_code + return context + + def list_routes() -> list[dict[str, str]]: """Return a flat list of route + method + description.""" out: list[dict[str, str]] = [] @@ -113,7 +120,6 @@ def list_routes() -> list[dict[str, str]]: @app.route("/") def index(): """Service information.""" - logger.debug("Request: %s %s", request.method, request.path) return jsonify( { "service": get_service_info(), @@ -128,7 +134,6 @@ def index(): @app.route("/health") def health(): """Health check.""" - logger.debug("Request: %s %s", request.method, 
request.path) return jsonify( { "status": "healthy", @@ -141,13 +146,25 @@ def health(): @app.errorhandler(404) def not_found(error): # noqa: ARG001 """Return a JSON 404 payload.""" - logger.debug("Request: %s %s", request.method, request.path) + logger.warning( + "request returned not found", + extra=get_request_log_context(request, status_code=404), + ) return jsonify({"error": "Not Found", "message": "Endpoint does not exist"}), 404 @app.errorhandler(500) def internal_error(error): # noqa: ARG001 """Return a JSON 500 payload.""" + original_error = getattr(error, "original_exception", None) or error + logger.error( + "request failed", + extra={ + **get_request_log_context(request, status_code=500), + "error_type": type(original_error).__name__, + "error": str(original_error), + }, + ) return ( jsonify( { diff --git a/app_python/tests/test_logging_utils.py b/app_python/tests/test_logging_utils.py new file mode 100644 index 0000000000..bef1790ef3 --- /dev/null +++ b/app_python/tests/test_logging_utils.py @@ -0,0 +1,34 @@ +"""Unit tests for JSON logging helpers.""" + +import json +import logging + +from src.logging_utils import JSONFormatter + + +def test_json_formatter_serializes_message_and_extra_fields(): + """Formatter should emit a JSON line with standard and custom fields.""" + record = logging.LogRecord( + name="devops_info_service", + level=logging.INFO, + pathname=__file__, + lineno=12, + msg="hello %s", + args=("world",), + exc_info=None, + ) + record.client_ip = "203.0.113.7" + record.method = "GET" + record.path = "/health" + record.status_code = 200 + + payload = json.loads(JSONFormatter().format(record)) + + assert payload["logger"] == "devops_info_service" + assert payload["level"] == "INFO" + assert payload["message"] == "hello world" + assert payload["client_ip"] == "203.0.113.7" + assert payload["method"] == "GET" + assert payload["path"] == "/health" + assert payload["status_code"] == 200 + assert payload["timestamp"].endswith("Z") diff --git 
a/monitoring/.env.example b/monitoring/.env.example new file mode 100644 index 0000000000..5b4ba04148 --- /dev/null +++ b/monitoring/.env.example @@ -0,0 +1,2 @@ +GRAFANA_ADMIN_USER=admin +GRAFANA_ADMIN_PASSWORD=ChangeMe!123 diff --git a/monitoring/.gitignore b/monitoring/.gitignore new file mode 100644 index 0000000000..03bd4129be --- /dev/null +++ b/monitoring/.gitignore @@ -0,0 +1 @@ +*.env diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml new file mode 100644 index 0000000000..ac36c52b2e --- /dev/null +++ b/monitoring/docker-compose.yml @@ -0,0 +1,187 @@ +services: + loki: + image: grafana/loki:3.0.0 + command: + - -config.file=/etc/loki/config.yml + ports: + - "3100:3100" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + healthcheck: + test: + - CMD-SHELL + - wget --no-verbose --tries=1 --spider http://127.0.0.1:3100/ready || exit 1 + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + networks: + - monitoring + restart: unless-stopped + + promtail: + image: grafana/promtail:3.0.0 + command: + - -config.file=/etc/promtail/config.yml + user: "0:0" + depends_on: + loki: + condition: service_healthy + ports: + - "9080:9080" + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - promtail-data:/run/promtail + - /var/run/docker.sock:/var/run/docker.sock:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + deploy: + resources: + limits: + cpus: "0.5" + memory: 256M + reservations: + cpus: "0.10" + memory: 64M + networks: + - monitoring + restart: unless-stopped + + grafana: + image: grafana/grafana:12.3.1 + depends_on: + loki: + condition: service_healthy + environment: + GF_AUTH_ANONYMOUS_ENABLED: "false" + GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin} + GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:?set in monitoring/.env} + GF_SECURITY_ALLOW_EMBEDDING: 
"true" + ports: + - "3000:3000" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro + healthcheck: + test: + - CMD-SHELL + - wget --no-verbose --tries=1 --spider http://127.0.0.1:3000/api/health || exit 1 + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + deploy: + resources: + limits: + cpus: "1.0" + memory: 512M + reservations: + cpus: "0.25" + memory: 128M + networks: + - monitoring + restart: unless-stopped + + app-python: + image: localt0aster/devops-app-py:1.7.9a42ee5 + # Re-enable local builds if Docker networking breaks behind the tun/VPN setup. + # build: + # context: ../app_python + # network: host + environment: + HOST: "0.0.0.0" + PORT: "8000" + ports: + - "8000:8000" + labels: + logging: "promtail" + app: "devops-python" + healthcheck: + test: + - CMD-SHELL + - wget --no-verbose --tries=1 --spider http://127.0.0.1:8000/health || exit 1 + interval: 15s + timeout: 5s + retries: 5 + start_period: 10s + deploy: + resources: + limits: + cpus: "0.5" + memory: 256M + reservations: + cpus: "0.10" + memory: 64M + networks: + - monitoring + restart: unless-stopped + + app-go: + image: localt0aster/devops-app-go:1.7.9a42ee5 + # Re-enable local builds if Docker networking breaks behind the tun/VPN setup. 
+ # build: + # context: ../app_go + # network: host + environment: + HOST: "0.0.0.0" + PORT: "8001" + ports: + - "8001:8001" + labels: + logging: "promtail" + app: "devops-go" + deploy: + resources: + limits: + cpus: "0.5" + memory: 256M + reservations: + cpus: "0.10" + memory: 64M + networks: + - monitoring + restart: unless-stopped + + app-go-healthcheck: + image: curlimages/curl:8.18.0 + command: + - sh + - -c + - sleep infinity + depends_on: + - app-go + healthcheck: + test: + - CMD-SHELL + - curl -fsS http://app-go:8001/health >/dev/null || exit 1 + interval: 15s + timeout: 5s + retries: 5 + start_period: 10s + deploy: + resources: + limits: + cpus: "0.10" + memory: 64M + reservations: + cpus: "0.05" + memory: 32M + networks: + - monitoring + restart: unless-stopped + +volumes: + loki-data: + promtail-data: + grafana-data: + +networks: + monitoring: diff --git a/monitoring/docs/LAB07.md b/monitoring/docs/LAB07.md new file mode 100644 index 0000000000..cd74a47a51 --- /dev/null +++ b/monitoring/docs/LAB07.md @@ -0,0 +1,800 @@ +# LAB07 - Observability & Logging with Loki Stack + +## 1. Architecture + +This lab uses a single Docker Compose stack for log collection, storage, querying, and visualization. + +- `loki` stores logs on local disk with TSDB and schema `v13`. +- `promtail` discovers Docker containers through the Docker socket and ships selected logs to Loki. +- `grafana` uses Loki as the default data source and provides Explore plus a custom dashboard. +- `app-python` and `app-go` write structured JSON logs to container stdout. +- Only containers labeled `logging=promtail` are scraped. + +Current image tags in Compose are branch-style image tags: + +- `localt0aster/devops-app-py:1.7.9a42ee5` +- `localt0aster/devops-app-go:1.7.9a42ee5` + +The application payloads themselves report service version `1.7.0`. 
+ +```text +curl / browser + | + v ++-----------------------------+ +| app-python app-go | +| JSON logs to stdout | ++-----------------------------+ + | + v ++-----------------------------+ +| promtail | +| docker_sd + relabeling | ++-----------------------------+ + | + v ++-----------------------------+ +| loki | +| TSDB + filesystem storage | ++-----------------------------+ + | + v ++-----------------------------+ +| grafana | +| Explore + dashboard | ++-----------------------------+ +``` + +## 2. Setup Guide + +The project structure for the monitoring stack is: + +```text +monitoring/ +├── docker-compose.yml +├── loki/config.yml +├── promtail/config.yml +├── grafana/provisioning/datasources/loki.yml +└── docs/LAB07.md +``` + +Bring the stack up from the repository root: + +```bash +cd monitoring +docker compose up -d +docker compose ps +``` + +Useful local endpoints: + +- Grafana: `http://localhost:3000` +- Loki: `http://localhost:3100` +- Promtail: `http://localhost:9080` +- Python app: `http://localhost:8000` +- Go app: `http://localhost:8001` + +Basic verification commands: + +```bash +curl -fSsL localhost:3100/ready +curl -fSsL localhost:9080/targets +curl -fSsL localhost:3000/api/health +curl -fSsL localhost:8000/health +curl -fSsL localhost:8001/health +``` + +Grafana is configured to provision Loki automatically, so the data source is available immediately after startup. + +## 3. Configuration + +### Docker Compose + +The stack keeps all services on one shared `monitoring` network and persists Loki, Promtail positions, and Grafana state in named volumes. 
+ +Compose excerpt: + +```yaml +services: + loki: + image: grafana/loki:3.0.0 + promtail: + image: grafana/promtail:3.0.0 + grafana: + image: grafana/grafana:12.3.1 + app-python: + image: localt0aster/devops-app-py:1.7.9a42ee5 + labels: + logging: "promtail" + app: "devops-python" + app-go: + image: localt0aster/devops-app-go:1.7.9a42ee5 + labels: + logging: "promtail" + app: "devops-go" +``` + +Two practical decisions matter here: + +- published images are used for the apps instead of local builds; +- the earlier `build.network: host` workaround is preserved as commented YAML for the tun/VPN case, but it is not active in the final stack. + +### Loki + +Loki is configured as a single-node instance with filesystem storage, TSDB, schema `v13`, and 7-day retention. + +Snippet: + +```yaml +common: + path_prefix: /loki + replication_factor: 1 + storage: + filesystem: + chunks_directory: /loki/chunks + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + +limits_config: + retention_period: 168h + +compactor: + retention_enabled: true +``` + +Why this setup: + +- TSDB is the current Loki 3.x recommendation. +- Filesystem storage is enough for a single-node lab environment. +- 7-day retention keeps local disk usage bounded. + +### Promtail + +Promtail uses Docker service discovery and only scrapes labeled containers. + +Snippet: + +```yaml +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + filters: + - name: label + values: + - logging=promtail + relabel_configs: + - target_label: job + replacement: docker + - source_labels: [__meta_docker_container_label_app] + target_label: app + - source_labels: [__meta_docker_container_name] + regex: "/(.*)" + target_label: container +``` + +Why this setup: + +- `logging=promtail` avoids scraping unrelated containers. 
+- the custom `app` label makes LogQL queries stable across container restarts; +- `container`, `compose_service`, and `logstream` are useful for debugging and panel filtering. + +### Grafana + +Loki is provisioned as the default data source. + +Snippet: + +```yaml +datasources: + - name: Loki + uid: loki + type: loki + url: http://loki:3100 + isDefault: true +``` + +This removes a manual setup step and makes the stack reproducible. + +## 4. Application Logging + +### Python app + +The Python service has two JSON logging paths: + +- Gunicorn access logging for every HTTP request. +- Application logging through a custom `JSONFormatter`. + +Gunicorn access format: + +```python +access_log_format = ( + '{"timestamp":"%(t)s","level":"INFO","logger":"gunicorn.access",' + '"client_ip":"%(h)s","method":"%(m)s","path":"%(U)s","query":"%(q)s",' + '"status_code":%(s)s,"response_bytes":"%(B)s","request_time_us":%(D)s,' + '"user_agent":"%(a)s"}' +) +``` + +Application logger behavior: + +- startup is logged as `application initialized`; +- `404` responses are logged as `WARNING`; +- `500` responses are logged as `ERROR` with `error_type` and `error`. + +### Go app + +The Go service was updated for parity with the Python service and now emits JSON for: + +- startup; +- access logs after each request; +- panic recovery; +- response encoding failures. + +Its access logger writes fields compatible with the Python app: + +- `timestamp` +- `level` +- `logger` +- `client_ip` +- `method` +- `path` +- `query` +- `status_code` +- `response_bytes` +- `request_time_us` +- `user_agent` + +### Example queries and evidence + +Logs from both applications: + +```logql +{job="docker", app=~"devops-python|devops-go"} +``` + +![](img/task2_apps.png) + +Only JSON request logs: + +```logql +{app=~"devops-python|devops-go"} | json | method="GET" +``` + +![](img/task2_get.png) + +Warnings: + +```logql +{app=~"devops-python|devops-go"} |= "WARN" +``` + +![](img/task2_warn.png) + +## 5. 
Dashboard + +The Grafana dashboard is named `DevOps Service`. It contains four panels and uses Loki as the only data source. + +![](img/task3_panel.png) + +### Panel overview + +#### Logs Table + +- Type: table +- Purpose: show recent raw logs from both applications +- Query: + +```logql +{app=~"devops-.*"} +``` + +#### Request Rate + +- Type: time series +- Purpose: show request throughput grouped by `app` +- Query: + +```logql +sum by (app) (rate({app=~"devops-.*"} [1m])) +``` + +#### Error Logs + +- Type: table +- Purpose: show only error-level log lines +- Query: + +```logql +{app=~"devops-.*"} | json | level=~"ERROR|error" +``` + +Practical note: + +- the dashboard currently includes one synthetic Python error record used to keep the panel non-empty during normal demo traffic; +- an easy public error endpoint was intentionally not added, because it would let any user spam error logs on demand; +- ordinary health and index requests only generate `INFO`, and missing endpoints generate `WARNING`, so a forced error was needed for visible evidence. + +#### Log Level Distribution + +- Type: stat +- Purpose: count logs by parsed JSON `level` +- Query: + +```logql +sum by (level) (count_over_time({app=~"devops-.*"} | json [5m])) +``` + +## 6. Production Hardening + +Task 4 is implemented in the Compose stack and verified locally. + +### Implemented changes + +- Resource limits and reservations were added to all services in `docker-compose.yml`. +- Anonymous Grafana access is disabled with `GF_AUTH_ANONYMOUS_ENABLED=false`. +- Grafana admin credentials are read from a local `.env` file. +- `.env` is ignored by git, and `.env.example` documents the required variables. +- Healthchecks were added for Loki, Grafana, and the Python app. +- The Go app is monitored by a small external probe service named `app-go-healthcheck`. +- Grafana and Promtail now wait for Loki to become healthy before starting. 
+ +Implemented snippets: + +```yaml +deploy: + resources: + limits: + cpus: "1.0" + memory: 1G +``` + +```yaml +environment: + GF_AUTH_ANONYMOUS_ENABLED: "false" + GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin} + GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:?set in monitoring/.env} +``` + +```yaml +healthcheck: + test: + [ + "CMD-SHELL", + "wget --no-verbose --tries=1 --spider http://127.0.0.1:3100/ready || exit 1", + ] + interval: 10s + timeout: 5s + retries: 5 +``` + +### Verification + +- `docker compose ps` shows Loki, Grafana, `app-python`, and `app-go-healthcheck` as `healthy`. +- Anonymous access to `http://localhost:3000/api/user` now returns `401`. +- Admin access works with the credentials from the local `.env`. +- The Grafana login page is served at `http://localhost:3000/login`. + +![](img/task4_password_required.png) + +Practical note: + +- the Go app image is based on `scratch`, so it does not contain shell or probe tools for a simple in-container HTTP healthcheck; +- for that reason, `app-go-healthcheck` performs the HTTP probe externally with `curl` against `http://app-go:8001/health`. + +## 7. 
Testing + +Commands used to generate traffic and verify ingestion: + +```bash +cd monitoring +docker compose up -d + +for i in $(seq 1 10); do + curl -fsSL localhost:8000/ >/dev/null + curl -fsSL localhost:8000/health >/dev/null + curl -fsSL localhost:8001/ >/dev/null + curl -fsSL localhost:8001/health >/dev/null +done + +curl -fsSL localhost:8000/do404 >/dev/null +curl -fsSL localhost:8001/do404 >/dev/null +``` + +Useful checks: + +```bash +curl -fSsL localhost:3100/ready +curl -fSsL localhost:9080/targets +curl -fSsL localhost:3000/api/health +curl -s -o /dev/null -w '%{http_code}\n' localhost:3000/api/user +docker compose ps +docker compose logs app-python --tail=20 +docker compose logs app-go --tail=20 +``` + +Useful LogQL checks: + +```logql +{app=~"devops-python|devops-go"} +{app=~"devops-python|devops-go"} | json | method="GET" +{app=~"devops-python|devops-go"} |= "WARN" +{app=~"devops-python|devops-go"} | json | level=~"ERROR|error" +``` + +## 8. Bonus Task + +### Automated deployment with Ansible + +The bonus task was implemented as a dedicated monitoring deployment playbook plus a reusable role: + +- `ansible/playbooks/deploy-monitoring.yml` +- `ansible/roles/monitoring/defaults/main.yml` +- `ansible/roles/monitoring/tasks/setup.yml` +- `ansible/roles/monitoring/tasks/deploy.yml` +- `ansible/roles/monitoring/templates/*.j2` + +The role creates `/opt/devops-monitoring`, templates the Compose stack plus Loki, Promtail, Grafana, and `.env` files, starts the stack with `community.docker.docker_compose_v2`, and verifies: + +- published ports for Loki, Promtail, Grafana, Python app, and Go app; +- Loki `/ready`; +- Promtail `/targets`; +- Grafana `/api/health`; +- Grafana auth gate returning `401` anonymously; +- Python `/health`; +- Go `/health`; +- the external `app-go-healthcheck` container status. 
+ +The first manual runs exposed a real bug in the role: the healthcheck assertion was hard-coded to `monitoring-app-go-healthcheck-1`, but the VM uses the Compose project name `devops-monitoring`, so the real container name is `devops-monitoring-app-go-healthcheck-1`. I fixed that by deriving the container name from `monitoring_dir | basename`. + +### CI dependency gate + +`.github/workflows/ansible-deploy.yml` now contains a `wait-for-prerequisites` job, which is skipped for manual `workflow_dispatch` runs. It polls workflow runs for the current commit and waits for: + +- `Go Docker Publish` +- `Python CI` +- `Python Docker Publish` + +Practical behavior: + +- if one of these workflows exists for the same commit and is still running, the Ansible deployment waits; +- if one exists and fails, the Ansible workflow fails before deployment; +- if a workflow never started for that commit because its path filters did not match, it is treated as not applicable after a short grace period instead of deadlocking the pipeline. + +### Playbook evidence + +Because the VM images were already pulled and Docker Hub reachability on my host is inconsistent, the successful validation reruns used: + +```bash +cd ansible +.venv/bin/ansible-playbook playbooks/deploy-monitoring.yml \ + -e monitoring_compose_pull_policy=missing \ + -e monitoring_compose_wait=false +``` + +
+Initial failed run before the container-name fix + +```text +PLAY [Deploy monitoring stack] ************************************************* + +TASK [Gathering Facts] ********************************************************* +ok: [vagrant] + +TASK [Run monitoring role] ***************************************************** +included: monitoring for vagrant + +TASK [monitoring : Prepare monitoring stack files] ***************************** +included: /home/t0ast/Repos/DevOps-Core-S26/ansible/roles/monitoring/tasks/setup.yml for vagrant + +TASK [monitoring : Ensure monitoring directory structure exists] *************** +ok: [vagrant] => (item=/opt/devops-monitoring) +ok: [vagrant] => (item=/opt/devops-monitoring/loki) +ok: [vagrant] => (item=/opt/devops-monitoring/promtail) +ok: [vagrant] => (item=/opt/devops-monitoring/grafana) +ok: [vagrant] => (item=/opt/devops-monitoring/grafana/provisioning) +ok: [vagrant] => (item=/opt/devops-monitoring/grafana/provisioning/datasources) + +TASK [monitoring : Template monitoring environment file] *********************** +ok: [vagrant] + +TASK [monitoring : Template monitoring Docker Compose configuration] *********** +ok: [vagrant] + +TASK [monitoring : Template Loki configuration] ******************************** +ok: [vagrant] + +TASK [monitoring : Template Promtail configuration] **************************** +ok: [vagrant] + +TASK [monitoring : Template Grafana Loki datasource provisioning] ************** +ok: [vagrant] + +TASK [monitoring : Deploy monitoring stack] ************************************ +included: /home/t0ast/Repos/DevOps-Core-S26/ansible/roles/monitoring/tasks/deploy.yml for vagrant + +TASK [monitoring : Skip monitoring deployment actions in check mode] *********** +skipping: [vagrant] + +TASK [monitoring : Log in to Docker Hub when credentials are available] ******** +ok: [vagrant] + +TASK [monitoring : Deploy monitoring stack with Docker Compose v2] ************* +changed: [vagrant] + +TASK [monitoring : 
Wait for exposed monitoring ports] ************************** +ok: [vagrant -> localhost] => (item=3100) +ok: [vagrant -> localhost] => (item=9080) +ok: [vagrant -> localhost] => (item=3000) +ok: [vagrant -> localhost] => (item=8000) +ok: [vagrant -> localhost] => (item=8001) + +TASK [monitoring : Verify Loki readiness endpoint] ***************************** +ok: [vagrant -> localhost] + +TASK [monitoring : Verify Promtail targets endpoint] *************************** +ok: [vagrant -> localhost] + +TASK [monitoring : Verify Grafana API health] ********************************** +ok: [vagrant -> localhost] + +TASK [monitoring : Verify Grafana requires authentication] ********************* +ok: [vagrant -> localhost] + +TASK [monitoring : Verify Python application health endpoint] ****************** +ok: [vagrant -> localhost] + +TASK [monitoring : Verify Go application health endpoint] ********************** +ok: [vagrant -> localhost] + +TASK [monitoring : Read external Go healthcheck container info] **************** +ok: [vagrant] + +TASK [monitoring : Assert external Go healthcheck is healthy] ****************** +[ERROR]: Task failed: Action failed: External Go healthcheck container is not healthy. +Origin: /home/t0ast/Repos/DevOps-Core-S26/ansible/roles/monitoring/tasks/deploy.yml:148:7 + +146 when: monitoring_go_external_healthcheck_enabled | bool +147 +148 - name: Assert external Go healthcheck is healthy + ^ column 7 + +fatal: [vagrant]: FAILED! => { + "assertion": "monitoring_go_healthcheck_container.exists | bool", + "changed": false, + "evaluated_to": false, + "msg": "External Go healthcheck container is not healthy." +} + +TASK [monitoring : Capture docker compose status after failed monitoring deployment] *** +ok: [vagrant] + +TASK [monitoring : Fail deployment with compose status context] **************** +[ERROR]: Task failed: Action failed: Monitoring deployment failed. 
Compose status: NAME IMAGE COMMAND SERVICE CREATED STATUS PORTS +devops-monitoring-app-go-1 localt0aster/devops-app-go:latest "/devops-info-servic…" app-go 41 seconds ago Up 40 seconds 0.0.0.0:8001->8001/tcp, [::]:8001->8001/tcp +devops-monitoring-app-go-healthcheck-1 curlimages/curl:8.18.0 "/entrypoint.sh sh -…" app-go-healthcheck 41 seconds ago Up 40 seconds (healthy) +devops-monitoring-app-python-1 localt0aster/devops-app-py:latest "sh -c 'gunicorn --c…" app-python 41 seconds ago Up 40 seconds (healthy) 0.0.0.0:8000->8000/tcp, [::]:8000->8000/tcp +devops-monitoring-grafana-1 grafana/grafana:12.3.1 "/run.sh" grafana 41 seconds ago Up 20 seconds (healthy) 0.0.0.0:3000->3000/tcp, [::]:3000->3000/tcp +devops-monitoring-loki-1 grafana/loki:3.0.0 "/usr/bin/loki -conf…" loki 41 seconds ago Up 40 seconds (healthy) 0.0.0.0:3100->3100/tcp, [::]:3100->3100/tcp +devops-monitoring-promtail-1 grafana/promtail:3.0.0 "/usr/bin/promtail -…" promtail 41 seconds ago Up 20 seconds 0.0.0.0:9080->9080/tcp, [::]:9080->9080/tcp +Origin: /home/t0ast/Repos/DevOps-Core-S26/ansible/roles/monitoring/tasks/deploy.yml:170:7 + +168 failed_when: false +169 +170 - name: Fail deployment with compose status context + ^ column 7 + +fatal: [vagrant]: FAILED! => {"changed": false, "msg": "Monitoring deployment failed. 
Compose status: NAME IMAGE COMMAND SERVICE CREATED STATUS PORTS\ndevops-monitoring-app-go-1 localt0aster/devops-app-go:latest \"/devops-info-servic…\" app-go 41 seconds ago Up 40 seconds 0.0.0.0:8001->8001/tcp, [::]:8001->8001/tcp\ndevops-monitoring-app-go-healthcheck-1 curlimages/curl:8.18.0 \"/entrypoint.sh sh -…\" app-go-healthcheck 41 seconds ago Up 40 seconds (healthy) \ndevops-monitoring-app-python-1 localt0aster/devops-app-py:latest \"sh -c 'gunicorn --c…\" app-python 41 seconds ago Up 40 seconds (healthy) 0.0.0.0:8000->8000/tcp, [::]:8000->8000/tcp\ndevops-monitoring-grafana-1 grafana/grafana:12.3.1 \"/run.sh\" grafana 41 seconds ago Up 20 seconds (healthy) 0.0.0.0:3000->3000/tcp, [::]:3000->3000/tcp\ndevops-monitoring-loki-1 grafana/loki:3.0.0 \"/usr/bin/loki -conf…\" loki 41 seconds ago Up 40 seconds (healthy) 0.0.0.0:3100->3100/tcp, [::]:3100->3100/tcp\ndevops-monitoring-promtail-1 grafana/promtail:3.0.0 \"/usr/bin/promtail -…\" promtail 41 seconds ago Up 20 seconds 0.0.0.0:9080->9080/tcp, [::]:9080->9080/tcp"} + +PLAY RECAP ********************************************************************* +vagrant : ok=21 changed=1 unreachable=0 failed=1 skipped=1 rescued=1 ignored=0 +``` + +
+ +
+Successful rerun after the fix + +```text +PLAY [Deploy monitoring stack] ************************************************* + +TASK [Gathering Facts] ********************************************************* +ok: [vagrant] + +TASK [Run monitoring role] ***************************************************** +included: monitoring for vagrant + +TASK [monitoring : Prepare monitoring stack files] ***************************** +included: /home/t0ast/Repos/DevOps-Core-S26/ansible/roles/monitoring/tasks/setup.yml for vagrant + +TASK [monitoring : Ensure monitoring directory structure exists] *************** +ok: [vagrant] => (item=/opt/devops-monitoring) +ok: [vagrant] => (item=/opt/devops-monitoring/loki) +ok: [vagrant] => (item=/opt/devops-monitoring/promtail) +ok: [vagrant] => (item=/opt/devops-monitoring/grafana) +ok: [vagrant] => (item=/opt/devops-monitoring/grafana/provisioning) +ok: [vagrant] => (item=/opt/devops-monitoring/grafana/provisioning/datasources) + +TASK [monitoring : Template monitoring environment file] *********************** +ok: [vagrant] + +TASK [monitoring : Template monitoring Docker Compose configuration] *********** +ok: [vagrant] + +TASK [monitoring : Template Loki configuration] ******************************** +ok: [vagrant] + +TASK [monitoring : Template Promtail configuration] **************************** +ok: [vagrant] + +TASK [monitoring : Template Grafana Loki datasource provisioning] ************** +ok: [vagrant] + +TASK [monitoring : Deploy monitoring stack] ************************************ +included: /home/t0ast/Repos/DevOps-Core-S26/ansible/roles/monitoring/tasks/deploy.yml for vagrant + +TASK [monitoring : Skip monitoring deployment actions in check mode] *********** +skipping: [vagrant] + +TASK [monitoring : Log in to Docker Hub when credentials are available] ******** +ok: [vagrant] + +TASK [monitoring : Deploy monitoring stack with Docker Compose v2] ************* +ok: [vagrant] + +TASK [monitoring : Wait for exposed 
monitoring ports] ************************** +ok: [vagrant -> localhost] => (item=3100) +ok: [vagrant -> localhost] => (item=9080) +ok: [vagrant -> localhost] => (item=3000) +ok: [vagrant -> localhost] => (item=8000) +ok: [vagrant -> localhost] => (item=8001) + +TASK [monitoring : Verify Loki readiness endpoint] ***************************** +ok: [vagrant -> localhost] + +TASK [monitoring : Verify Promtail targets endpoint] *************************** +ok: [vagrant -> localhost] + +TASK [monitoring : Verify Grafana API health] ********************************** +ok: [vagrant -> localhost] + +TASK [monitoring : Verify Grafana requires authentication] ********************* +ok: [vagrant -> localhost] + +TASK [monitoring : Verify Python application health endpoint] ****************** +ok: [vagrant -> localhost] + +TASK [monitoring : Verify Go application health endpoint] ********************** +ok: [vagrant -> localhost] + +TASK [monitoring : Read external Go healthcheck container info] **************** +ok: [vagrant] + +TASK [monitoring : Assert external Go healthcheck is healthy] ****************** +ok: [vagrant] => { + "changed": false, + "msg": "All assertions passed" +} + +PLAY RECAP ********************************************************************* +vagrant : ok=21 changed=0 unreachable=0 failed=0 skipped=1 rescued=0 ignored=0 +``` + +
+ +
+Idempotent rerun after the fix + +```text +PLAY [Deploy monitoring stack] ************************************************* + +TASK [Gathering Facts] ********************************************************* +ok: [vagrant] + +TASK [Run monitoring role] ***************************************************** +included: monitoring for vagrant + +TASK [monitoring : Prepare monitoring stack files] ***************************** +included: /home/t0ast/Repos/DevOps-Core-S26/ansible/roles/monitoring/tasks/setup.yml for vagrant + +TASK [monitoring : Ensure monitoring directory structure exists] *************** +ok: [vagrant] => (item=/opt/devops-monitoring) +ok: [vagrant] => (item=/opt/devops-monitoring/loki) +ok: [vagrant] => (item=/opt/devops-monitoring/promtail) +ok: [vagrant] => (item=/opt/devops-monitoring/grafana) +ok: [vagrant] => (item=/opt/devops-monitoring/grafana/provisioning) +ok: [vagrant] => (item=/opt/devops-monitoring/grafana/provisioning/datasources) + +TASK [monitoring : Template monitoring environment file] *********************** +ok: [vagrant] + +TASK [monitoring : Template monitoring Docker Compose configuration] *********** +ok: [vagrant] + +TASK [monitoring : Template Loki configuration] ******************************** +ok: [vagrant] + +TASK [monitoring : Template Promtail configuration] **************************** +ok: [vagrant] + +TASK [monitoring : Template Grafana Loki datasource provisioning] ************** +ok: [vagrant] + +TASK [monitoring : Deploy monitoring stack] ************************************ +included: /home/t0ast/Repos/DevOps-Core-S26/ansible/roles/monitoring/tasks/deploy.yml for vagrant + +TASK [monitoring : Skip monitoring deployment actions in check mode] *********** +skipping: [vagrant] + +TASK [monitoring : Log in to Docker Hub when credentials are available] ******** +ok: [vagrant] + +TASK [monitoring : Deploy monitoring stack with Docker Compose v2] ************* +ok: [vagrant] + +TASK [monitoring : Wait for exposed 
monitoring ports] ************************** +ok: [vagrant -> localhost] => (item=3100) +ok: [vagrant -> localhost] => (item=9080) +ok: [vagrant -> localhost] => (item=3000) +ok: [vagrant -> localhost] => (item=8000) +ok: [vagrant -> localhost] => (item=8001) + +TASK [monitoring : Verify Loki readiness endpoint] ***************************** +ok: [vagrant -> localhost] + +TASK [monitoring : Verify Promtail targets endpoint] *************************** +ok: [vagrant -> localhost] + +TASK [monitoring : Verify Grafana API health] ********************************** +ok: [vagrant -> localhost] + +TASK [monitoring : Verify Grafana requires authentication] ********************* +ok: [vagrant -> localhost] + +TASK [monitoring : Verify Python application health endpoint] ****************** +ok: [vagrant -> localhost] + +TASK [monitoring : Verify Go application health endpoint] ********************** +ok: [vagrant -> localhost] + +TASK [monitoring : Read external Go healthcheck container info] **************** +ok: [vagrant] + +TASK [monitoring : Assert external Go healthcheck is healthy] ****************** +ok: [vagrant] => { + "changed": false, + "msg": "All assertions passed" +} + +PLAY RECAP ********************************************************************* +vagrant : ok=21 changed=0 unreachable=0 failed=0 skipped=1 rescued=0 ignored=0 +``` + +
+ +## 9. Challenges + +### 1. Docker build networking under VPN + +Local Docker builds initially failed because build containers could not reach package indexes over the default bridge network. The practical workaround was `build.network: host`. In the final Compose file, I switched to published images and kept that workaround commented out for future use. + +### 2. Go logging parity + +The Go service originally used plain `log.Printf`, which was enough for console output but poor for LogQL field filtering. I replaced it with structured JSON access and error logging so both apps can be queried the same way in Loki. + +### 3. Empty error panel + +Normal demo traffic produced `INFO` and `WARNING` records but not `ERROR`. I intentionally did not add a trivial user-triggered error route, because that would make log spamming easy from the client side. For dashboard evidence, I seeded one synthetic `ERROR` entry into the Python app log stream instead. That is not ideal in production, but it is a practical way to prove the panel and query work in the lab environment. + +### 4. Container crash behavior + +Crashing a Gunicorn worker with `SIGSEGV` produced only a Gunicorn `WARNING`, not an application `ERROR`. Killing the whole container process later was useful for demonstrating stop behavior, but it still did not produce the desired application-level error log for the dashboard. Docker then treated the service as a stopped container until it was started again. + +### 5. Grafana password persistence + +Disabling anonymous auth took effect immediately, but the admin password from Compose environment variables did not replace the existing password stored in Grafana's persistent SQLite database. Because the Grafana volume was already initialized, I had to reset the admin password once against the running instance to bring it in sync with `.env`. 
diff --git a/monitoring/docs/img/task2_apps.png b/monitoring/docs/img/task2_apps.png new file mode 100644 index 0000000000..7e12728522 Binary files /dev/null and b/monitoring/docs/img/task2_apps.png differ diff --git a/monitoring/docs/img/task2_get.png b/monitoring/docs/img/task2_get.png new file mode 100644 index 0000000000..b37d536989 Binary files /dev/null and b/monitoring/docs/img/task2_get.png differ diff --git a/monitoring/docs/img/task2_warn.png b/monitoring/docs/img/task2_warn.png new file mode 100644 index 0000000000..ad51dca168 Binary files /dev/null and b/monitoring/docs/img/task2_warn.png differ diff --git a/monitoring/docs/img/task3_panel.png b/monitoring/docs/img/task3_panel.png new file mode 100644 index 0000000000..e46e3e2667 Binary files /dev/null and b/monitoring/docs/img/task3_panel.png differ diff --git a/monitoring/docs/img/task4_password_required.png b/monitoring/docs/img/task4_password_required.png new file mode 100644 index 0000000000..0856c75782 Binary files /dev/null and b/monitoring/docs/img/task4_password_required.png differ diff --git a/monitoring/grafana/provisioning/datasources/loki.yml b/monitoring/grafana/provisioning/datasources/loki.yml new file mode 100644 index 0000000000..fba0b1b8e0 --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/loki.yml @@ -0,0 +1,10 @@ +apiVersion: 1 + +datasources: + - name: Loki + uid: loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: true + editable: true diff --git a/monitoring/loki/config.yml b/monitoring/loki/config.yml new file mode 100644 index 0000000000..bf81bc7602 --- /dev/null +++ b/monitoring/loki/config.yml @@ -0,0 +1,42 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + path_prefix: /loki + replication_factor: 1 + ring: + kvstore: + store: inmemory + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + 
object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + tsdb_shipper: + active_index_directory: /loki/tsdb-index + cache_location: /loki/tsdb-cache + filesystem: + directory: /loki/chunks + +limits_config: + retention_period: 168h + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + delete_request_store: filesystem diff --git a/monitoring/promtail/config.yml b/monitoring/promtail/config.yml new file mode 100644 index 0000000000..0b58e710d6 --- /dev/null +++ b/monitoring/promtail/config.yml @@ -0,0 +1,38 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /run/promtail/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: label + values: + - logging=promtail + relabel_configs: + - target_label: job + replacement: docker + - source_labels: + - __meta_docker_container_label_app + action: replace + regex: "(.+)" + replacement: "$1" + target_label: app + - source_labels: + - __meta_docker_container_name + regex: "/(.*)" + target_label: container + - source_labels: + - __meta_docker_container_label_com_docker_compose_service + target_label: compose_service + - source_labels: + - __meta_docker_container_log_stream + target_label: logstream