From c2d71e94329825d62d56473aa950ae4a8f005dca Mon Sep 17 00:00:00 2001
From: Patryk Matuszak
Date: Fri, 3 Jul 2026 17:12:36 +0200
Subject: [PATCH 1/2] microshift-ci-doctor: run deterministic phases outside Claude sessions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The doctor step previously ran artifact downloads, PCP graph generation,
and evidence extraction inside the doctor Claude session — burning its
45-minute timeout and turn budget while the model waited — and booted an
entire 10-minute Claude session (doctor-refresh) to run one deterministic
script plus a JSON check.

Now the bash step owns the deterministic pipeline:

- prepare/graphs/evidence/fetch-previous run before the doctor session;
  the session is invoked with --prepared and spends its (reduced, 40m)
  budget purely on root cause analysis
- finalize (aggregation, cross-run history, HTML generation) runs right
  after the session, so the report no longer depends on the model ending
  its session gracefully
- the doctor-refresh session is replaced by a direct doctor.sh refresh
  call with the --ignore keys derived from closed-bugs.json

Step timeout goes to 1h45m: the preparation time that was previously
hidden inside the doctor session budget is now additive, partially
offset by the removed refresh session.
--- ...e-tooling-microshift-ci-doctor-commands.sh | 59 +++++++++++++------ ...edge-tooling-microshift-ci-doctor-ref.yaml | 6 +- 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/ci-operator/step-registry/openshift/edge-tooling/microshift-ci/doctor/openshift-edge-tooling-microshift-ci-doctor-commands.sh b/ci-operator/step-registry/openshift/edge-tooling/microshift-ci/doctor/openshift-edge-tooling-microshift-ci-doctor-commands.sh index 215a93dcb61d3..db7e806d18d57 100644 --- a/ci-operator/step-registry/openshift/edge-tooling/microshift-ci/doctor/openshift-edge-tooling-microshift-ci-doctor-commands.sh +++ b/ci-operator/step-registry/openshift/edge-tooling/microshift-ci/doctor/openshift-edge-tooling-microshift-ci-doctor-commands.sh @@ -13,7 +13,6 @@ CLAUDE_DOCTOR_LOG="${WORKDIR}/claude-doctor.log" CLAUDE_CREATE_BUGS_LOG="${WORKDIR}/claude-create-bugs.log" CLAUDE_CLOSE_STALE_BUGS_LOG="${WORKDIR}/claude-close-stale-bugs.log" CLAUDE_FIX_TEST_BUGS_LOG="${WORKDIR}/claude-fix-test-bugs.log" -CLAUDE_DOCTOR_REFRESH_LOG="${WORKDIR}/claude-doctor-refresh.log" MCP_JIRA_LOG="${WORKDIR}/mcp-jira.log" # The procedure to copy reports and session logs to artifacts, executed at exit @@ -49,7 +48,7 @@ atexit_handler() { fi # Check if the Claude sessions were completed successfully - for log_file in "${CLAUDE_DOCTOR_LOG}" "${CLAUDE_CREATE_BUGS_LOG}" "${CLAUDE_CLOSE_STALE_BUGS_LOG}" "${CLAUDE_FIX_TEST_BUGS_LOG}" "${CLAUDE_DOCTOR_REFRESH_LOG}"; do + for log_file in "${CLAUDE_DOCTOR_LOG}" "${CLAUDE_CREATE_BUGS_LOG}" "${CLAUDE_CLOSE_STALE_BUGS_LOG}" "${CLAUDE_FIX_TEST_BUGS_LOG}"; do # If a session was terminated due to a timeout, report lack of # subsequent session log files as a warning and continue not # to mask the actual error @@ -261,17 +260,42 @@ echo "Running automatic closing of duplicate rebase PRs..." --filter 'NO-ISSUE: rebase-release' echo "Automatic closing of duplicate rebase PRs completed" -# Run analysis on all releases and open rebase PRs (45m and 100 turns). 
+# Run the deterministic preparation phases outside Claude so the analysis +# session spends its entire time budget on actual root cause analysis: +# collect and download artifacts, generate PCP graphs, extract evidence, +# fetch the previous run's outputs for carry-forward. +echo "Running deterministic data collection and analysis preparation..." +bash "${PLUGIN_DIR}/scripts/doctor.sh" prepare \ + --component microshift --workdir "${WORKDIR}" \ + "${RELEASE_VERSIONS}" --rebase --repo openshift/microshift +bash "${PLUGIN_DIR}/scripts/doctor.sh" graphs \ + --component microshift --workdir "${WORKDIR}" \ + || echo "WARNING: PCP graph generation failed, continuing without graphs" +bash "${PLUGIN_DIR}/scripts/doctor.sh" evidence \ + --component microshift --workdir "${WORKDIR}" \ + || echo "WARNING: evidence extraction failed, agents will use raw artifacts" +bash "${PLUGIN_DIR}/scripts/doctor.sh" fetch-previous \ + --component microshift --workdir "${WORKDIR}" --job "${JOB_NAME}" + +# Run analysis on all releases and open rebase PRs (40m and 100 turns). +# --prepared: the deterministic phases above already ran; finalize runs +# in this script after the session. echo "Running Claude to analyze MicroShift CI jobs and pull requests..." CLAUDE_RC=0 -timeout 2700 claude \ +timeout 2400 claude \ --model "${CLAUDE_MODEL}" \ --max-turns 100 \ --output-format stream-json \ --plugin-dir "${PLUGIN_DIR}" \ - -p "/microshift-ci:doctor ${RELEASE_VERSIONS}" \ + -p "/microshift-ci:doctor ${RELEASE_VERSIONS} --prepared" \ --verbose &> "${CLAUDE_DOCTOR_LOG}" || CLAUDE_RC=$? -check_claude_rc "${CLAUDE_RC}" "doctor" 45 +check_claude_rc "${CLAUDE_RC}" "doctor" 40 + +# Aggregate results, update cross-run issue history, and generate the +# HTML report — deterministic, no Claude session needed. +echo "Running deterministic aggregation and HTML report generation..." 
+bash "${PLUGIN_DIR}/scripts/doctor.sh" finalize \ + --component microshift --workdir "${WORKDIR}" "${RELEASE_VERSIONS}" # Run bug creation for failed jobs (15m and 50 turns). echo "Running Claude to create bugs for failed jobs..." @@ -310,17 +334,18 @@ timeout 900 claude \ --verbose &> "${CLAUDE_FIX_TEST_BUGS_LOG}" || CLAUDE_RC=$? check_claude_rc "${CLAUDE_RC}" "fix-test-bugs" 15 -# Run HTML report refresh to include the new bugs (10m and 20 turns). -echo "Running Claude to refresh the HTML report..." -CLAUDE_RC=0 -timeout 600 claude \ - --model "${CLAUDE_MODEL}" \ - --max-turns 20 \ - --output-format stream-json \ - --plugin-dir "${PLUGIN_DIR}" \ - -p "/microshift-ci:doctor-refresh ${RELEASE_VERSIONS}" \ - --verbose &> "${CLAUDE_DOCTOR_REFRESH_LOG}" || CLAUDE_RC=$? -check_claude_rc "${CLAUDE_RC}" "doctor-refresh" 10 +# Refresh the HTML report to include the newly created bugs and exclude +# bugs closed by close-stale-bugs — deterministic, no Claude session needed. +echo "Refreshing the HTML report..." +REFRESH_ARGS=(--component microshift --workdir "${WORKDIR}") +CLOSED_BUGS_FILE="${WORKDIR}/close-stale-bugs/closed-bugs.json" +if [ -f "${CLOSED_BUGS_FILE}" ]; then + IGNORE_KEYS="$(jq -r '(.closed // []) | join(",")' "${CLOSED_BUGS_FILE}" 2>/dev/null || true)" + if [ -n "${IGNORE_KEYS}" ]; then + REFRESH_ARGS+=(--ignore "${IGNORE_KEYS}") + fi +fi +bash "${PLUGIN_DIR}/scripts/doctor.sh" refresh "${REFRESH_ARGS[@]}" "${RELEASE_VERSIONS}" # Now attempt to restart failed rebase PRs tests. If the restarted tests # complete successfully, the PR will be automatically merged. 
diff --git a/ci-operator/step-registry/openshift/edge-tooling/microshift-ci/doctor/openshift-edge-tooling-microshift-ci-doctor-ref.yaml b/ci-operator/step-registry/openshift/edge-tooling/microshift-ci/doctor/openshift-edge-tooling-microshift-ci-doctor-ref.yaml index 2764a288b6301..450829a959ee5 100644 --- a/ci-operator/step-registry/openshift/edge-tooling/microshift-ci/doctor/openshift-edge-tooling-microshift-ci-doctor-ref.yaml +++ b/ci-operator/step-registry/openshift/edge-tooling/microshift-ci/doctor/openshift-edge-tooling-microshift-ci-doctor-ref.yaml @@ -42,9 +42,11 @@ ref: requests: cpu: 2000m memory: 4Gi - timeout: 1h30m0s + timeout: 1h45m0s grace_period: 10m0s documentation: |- Analyzes MicroShift periodic jobs and pull requests using Claude AI. - Runs the microshift-ci:doctor command to analyze the releases and pull requests. + Deterministic phases (artifact download, PCP graph generation, + evidence extraction, aggregation, HTML generation) run as scripts; + Claude sessions cover only root cause analysis and Jira bug handling. Generates a HTML report consolidating all results. 
From 575380b9f4f4976410a91354b941f80e0dd7ca7b Mon Sep 17 00:00:00 2001 From: Patryk Matuszak Date: Fri, 3 Jul 2026 17:14:33 +0200 Subject: [PATCH 2/2] Remove fetch-previous --- .../openshift-edge-tooling-microshift-ci-doctor-commands.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ci-operator/step-registry/openshift/edge-tooling/microshift-ci/doctor/openshift-edge-tooling-microshift-ci-doctor-commands.sh b/ci-operator/step-registry/openshift/edge-tooling/microshift-ci/doctor/openshift-edge-tooling-microshift-ci-doctor-commands.sh index db7e806d18d57..2b05679b54457 100644 --- a/ci-operator/step-registry/openshift/edge-tooling/microshift-ci/doctor/openshift-edge-tooling-microshift-ci-doctor-commands.sh +++ b/ci-operator/step-registry/openshift/edge-tooling/microshift-ci/doctor/openshift-edge-tooling-microshift-ci-doctor-commands.sh @@ -262,8 +262,7 @@ echo "Automatic closing of duplicate rebase PRs completed" # Run the deterministic preparation phases outside Claude so the analysis # session spends its entire time budget on actual root cause analysis: -# collect and download artifacts, generate PCP graphs, extract evidence, -# fetch the previous run's outputs for carry-forward. +# collect and download artifacts, generate PCP graphs, extract evidence. echo "Running deterministic data collection and analysis preparation..." bash "${PLUGIN_DIR}/scripts/doctor.sh" prepare \ --component microshift --workdir "${WORKDIR}" \ @@ -274,8 +273,6 @@ bash "${PLUGIN_DIR}/scripts/doctor.sh" graphs \ bash "${PLUGIN_DIR}/scripts/doctor.sh" evidence \ --component microshift --workdir "${WORKDIR}" \ || echo "WARNING: evidence extraction failed, agents will use raw artifacts" -bash "${PLUGIN_DIR}/scripts/doctor.sh" fetch-previous \ - --component microshift --workdir "${WORKDIR}" --job "${JOB_NAME}" # Run analysis on all releases and open rebase PRs (40m and 100 turns). # --prepared: the deterministic phases above already ran; finalize runs