diff --git a/.github/workflows/build-test-push-workflow.yml b/.github/workflows/build-test-push-workflow.yml
index f392cd98a..18aa51c46 100644
--- a/.github/workflows/build-test-push-workflow.yml
+++ b/.github/workflows/build-test-push-workflow.yml
@@ -47,15 +47,27 @@ jobs:
         make setup/ginkgo
         go mod tidy
     - name: Run Unit Tests
+      id: run-unit-tests
       run: |
-        make test
+        mkdir -p /tmp/ci-diagnostics/unit-tests
+        set +e
+        make test 2>&1 | tee /tmp/ci-diagnostics/unit-tests/unit-test-console.log
+        test_exit=${PIPESTATUS[0]}
+        echo "test_exit_code=${test_exit}" >> "$GITHUB_OUTPUT"
+        exit ${test_exit}
     - name: Run Code Coverage
+      if: ${{ success() }}
       run: goveralls -coverprofile=coverage.out -service=circle-ci -repotoken ${{ secrets.COVERALLS_TOKEN }}
-    - name: Upload Coverage artifacts
+    - name: Upload Unit Test artifacts
+      if: ${{ always() }}
       uses: actions/upload-artifact@v4.4.0
       with:
-        name: coverage.out
-        path: coverage.out
+        name: unit-test-artifacts
+        path: |
+          /tmp/ci-diagnostics/unit-tests/**
+          unit_test.xml
+          coverage.out
+        if-no-files-found: warn
   build-operator-image:
     runs-on: ubuntu-latest
     needs: unit-tests
@@ -300,18 +312,120 @@ jobs:
         TEST_S3_ACCESS_KEY_ID: ${{ vars.TEST_S3_ACCESS_KEY_ID }}
         TEST_S3_SECRET_ACCESS_KEY: ${{ secrets.TEST_S3_SECRET_ACCESS_KEY }}
       run: |
-        make int-test
-    - name: Collect Test Logs
-      if: ${{ always() }}
+        mkdir -p /tmp/ci-diagnostics/${{ matrix.test }}
+        set +e
+        make int-test 2>&1 | tee /tmp/ci-diagnostics/${{ matrix.test }}/int-test-console.log
+        test_exit=${PIPESTATUS[0]}
+        echo "test_exit_code=${test_exit}" >> "$GITHUB_OUTPUT"
+        if [[ ${test_exit} -ne 0 ]]; then
+          echo "::group::Quick failure diagnostics for Explain Error"
+          kubectl get nodes -o wide || true
+          kubectl get pods -A -o wide || true
+          kubectl get events -A --sort-by=.lastTimestamp | tail -n 200 || true
+          while read -r ns pod; do
+            [[ -z "${ns}" || -z "${pod}" ]] && continue
+            echo "---- ${ns}/${pod} ----"
+            kubectl logs -n "${ns}" "${pod}" --all-containers=true --tail=200 || true
+          done < <(kubectl get pods -A --no-headers 2>/dev/null | awk '/splunk|operator/ {print $1 " " $2}' | head -n 20)
+          echo "::endgroup::"
+        fi
+        exit ${test_exit}
+      id: smoketest
+    - name: Collect Smoke Test Diagnostics
+      if: ${{ failure() || cancelled() }}
       run: |
-        mkdir -p /tmp/pod_logs
-        find ./test -name "*.log" -exec cp {} /tmp/pod_logs \;
-    - name: Archive Pod Logs
-      if: ${{ always() }}
+        DIAG_DIR="/tmp/ci-diagnostics/${{ matrix.test }}"
+        mkdir -p "${DIAG_DIR}/cluster" "${DIAG_DIR}/pod-logs" "${DIAG_DIR}/test-logs" "${DIAG_DIR}/splunkd-logs"
+        {
+          echo "run_id=${{ github.run_id }}"
+          echo "run_attempt=${{ github.run_attempt }}"
+          echo "job=${{ github.job }}"
+          echo "matrix_test=${{ matrix.test }}"
+          echo "cluster_name=${TEST_CLUSTER_NAME}"
+          echo "smoke_test_outcome=${{ steps.smoketest.outcome }}"
+          echo "smoke_test_exit_code=${{ steps.smoketest.outputs.test_exit_code }}"
+          date -u +"collected_at_utc=%Y-%m-%dT%H:%M:%SZ"
+        } > "${DIAG_DIR}/metadata.txt"
+
+        find . -type f \( -name "*.log" -o -name "inttest-*.xml" -o -name "*junit*.xml" \) -print0 | while IFS= read -r -d '' file; do
+          target="${DIAG_DIR}/test-logs/${file#./}"
+          mkdir -p "$(dirname "${target}")"
+          cp "${file}" "${target}"
+        done
+
+        kubectl version > "${DIAG_DIR}/cluster/kubectl-version.txt" 2>&1 || true
+        kubectl config current-context > "${DIAG_DIR}/cluster/current-context.txt" 2>&1 || true
+        kubectl get nodes -o wide > "${DIAG_DIR}/cluster/nodes.txt" 2>&1 || true
+        kubectl get namespaces -o wide > "${DIAG_DIR}/cluster/namespaces.txt" 2>&1 || true
+        kubectl get pods -A -o wide > "${DIAG_DIR}/cluster/pods-wide.txt" 2>&1 || true
+        kubectl get pvc -A > "${DIAG_DIR}/cluster/pvc.txt" 2>&1 || true
+        kubectl get statefulsets -A -o wide > "${DIAG_DIR}/cluster/statefulsets.txt" 2>&1 || true
+        kubectl get events -A --sort-by=.lastTimestamp > "${DIAG_DIR}/cluster/events.txt" 2>&1 || true
+        kubectl describe nodes > "${DIAG_DIR}/cluster/nodes-describe.txt" 2>&1 || true
+
+        mapfile -t target_namespaces < <(
+          {
+            echo "splunk-operator"
+            if [[ -n "${TEST_CLUSTER_NAME}" ]]; then
+              echo "${TEST_CLUSTER_NAME}"
+            fi
+            kubectl get namespaces -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null | grep -E 'splunk|test' || true
+            kubectl get pods -A --no-headers 2>/dev/null | awk '/splunk|operator/ {print $1}' || true
+          } | awk 'NF' | sort -u
+        )
+
+        for ns in "${target_namespaces[@]}"; do
+          mkdir -p "${DIAG_DIR}/cluster/${ns}" "${DIAG_DIR}/pod-logs/${ns}"
+          kubectl get all -n "${ns}" -o wide > "${DIAG_DIR}/cluster/${ns}/all.txt" 2>&1 || true
+          kubectl describe all -n "${ns}" > "${DIAG_DIR}/cluster/${ns}/describe-all.txt" 2>&1 || true
+          kubectl get events -n "${ns}" --sort-by=.lastTimestamp > "${DIAG_DIR}/cluster/${ns}/events.txt" 2>&1 || true
+          kubectl get events -n "${ns}" -o yaml > "${DIAG_DIR}/cluster/${ns}/events.yaml" 2>&1 || true
+          while read -r pod; do
+            [[ -z "${pod}" ]] && continue
+            kubectl describe pod -n "${ns}" "${pod}" > "${DIAG_DIR}/cluster/${ns}/${pod}-describe.txt" 2>&1 || true
+            mapfile -t containers < <(kubectl get pod -n "${ns}" "${pod}" -o jsonpath='{range .spec.containers[*]}{.name}{"\n"}{end}' 2>/dev/null || true)
+            for container in "${containers[@]}"; do
+              [[ -z "${container}" ]] && continue
+              kubectl logs -n "${ns}" "${pod}" -c "${container}" --timestamps=true --since=24h > "${DIAG_DIR}/pod-logs/${ns}/${pod}-${container}.log" 2>&1 || true
+              kubectl logs -n "${ns}" "${pod}" -c "${container}" --timestamps=true --previous > "${DIAG_DIR}/pod-logs/${ns}/${pod}-${container}-previous.log" 2>&1 || true
+              # Collect Splunk internal splunkd logs from inside Splunk containers.
+              if [[ "${pod}" == *splunk* || "${container}" == *splunk* ]]; then
+                SPLUNKD_DIR="${DIAG_DIR}/splunkd-logs/${ns}/${pod}-${container}"
+                mkdir -p "${SPLUNKD_DIR}"
+                if kubectl exec -n "${ns}" "${pod}" -c "${container}" -- test -d /opt/splunk/var/log/splunk >/dev/null 2>&1; then
+                  kubectl exec -n "${ns}" "${pod}" -c "${container}" -- ls -1 /opt/splunk/var/log/splunk > "${SPLUNKD_DIR}/directory-list.txt" 2>&1 || true
+                  # Try archiving all splunkd logs first (includes rotated files when present).
+                  kubectl exec -n "${ns}" "${pod}" -c "${container}" -- sh -c 'ls -1 /opt/splunk/var/log/splunk/splunkd*.log >/dev/null 2>&1 && tar -C /opt/splunk/var/log/splunk -czf - splunkd*.log' > "${SPLUNKD_DIR}/splunkd-logs.tar.gz" 2>/dev/null || true
+                  # Always keep direct text copies of current files for quick AI/readability.
+                  while read -r splunk_log; do
+                    [[ -z "${splunk_log}" ]] && continue
+                    log_name="$(basename "${splunk_log}")"
+                    kubectl exec -n "${ns}" "${pod}" -c "${container}" -- sh -c "cat '${splunk_log}'" > "${SPLUNKD_DIR}/${log_name}" 2>&1 || true
+                  done < <(kubectl exec -n "${ns}" "${pod}" -c "${container}" -- sh -c 'ls -1 /opt/splunk/var/log/splunk/splunkd*.log 2>/dev/null || true')
+                fi
+              fi
+            done
+          done < <(kubectl get pods -n "${ns}" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)
+        done
+    - name: Add Smoke Test Summary
+      if: ${{ failure() || cancelled() }}
+      run: |
+        {
+          echo "### Smoke Test Diagnostics"
+          echo "- Test: \`${{ matrix.test }}\`"
+          echo "- Smoke test outcome: \`${{ steps.smoketest.outcome }}\`"
+          echo "- Exit code: \`${{ steps.smoketest.outputs.test_exit_code }}\`"
+          echo "- Cluster name: \`${TEST_CLUSTER_NAME}\`"
+          echo "- Artifact: \`smoke-test-diagnostics-${{ matrix.test }}\`"
+        } >> "$GITHUB_STEP_SUMMARY"
+    - name: Archive Smoke Test Diagnostics
+      if: ${{ failure() || cancelled() }}
       uses: actions/upload-artifact@v4.4.0
       with:
-        name: "splunk-pods-logs--artifacts-${{ matrix.test }}"
-        path: "/tmp/pod_logs/**"
+        name: "smoke-test-diagnostics-${{ matrix.test }}"
+        path: "/tmp/ci-diagnostics/${{ matrix.test }}/**"
+        if-no-files-found: warn
+        retention-days: 14
     - name: Cleanup Test Case artifacts
       if: ${{ always() }}
       run: |