Skip to content

Commit 22b4375

Browse files
committed
feat: collect downstream failure logs recursively
When a build fails due to a downstream job triggered via the build step or Cause.UpstreamCause, error logs from the failing sub-job are now collected recursively and included in the AI analysis.

Changes:
- Add optional pipeline-build-step dependency to read DownstreamBuildAction when installed
- PipelineLogExtractor: add downstream recursion with MAX_DOWNSTREAM_DEPTH=5 and visitedRunIds de-dup
- Discover sub-jobs via DownstreamBuildAction, with Cause.UpstreamCause scan as a universal fallback
- Wrap downstream sections with labeled header + Result; detect fail-fast aborts via InterruptedBuildAction and mark them non-root-cause
- Reuse ErrorExplanationAction output under an [AI explanation from sub-job] marker; otherwise extract raw logs via a child PipelineLogExtractor
- Redirect the failure URL to the first genuine downstream failure
- BaseAIProvider prompt updated to interpret downstream sections and AI explanation blocks correctly

Tests:
- PipelineLogExtractorTest: cover downstream failure inclusion, success exclusion, visitedRunIds de-dup, and explanation reuse vs raw-log fallback
1 parent 6639edc commit 22b4375

4 files changed

Lines changed: 589 additions & 9 deletions

File tree

pom.xml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,13 @@
111111
<optional>true</optional>
112112
</dependency>
113113

114+
<!-- Pipeline build step for downstream build tracking (optional) -->
115+
<dependency>
116+
<groupId>org.jenkins-ci.plugins</groupId>
117+
<artifactId>pipeline-build-step</artifactId>
118+
<optional>true</optional>
119+
</dependency>
120+
114121
<dependency>
115122
<groupId>io.jenkins.plugins</groupId>
116123
<artifactId>ionicons-api</artifactId>

src/main/java/io/jenkins/plugins/explain_error/PipelineLogExtractor.java

Lines changed: 241 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@
1212

1313
import hudson.console.AnnotatedLargeText;
1414
import hudson.console.ConsoleNote;
15+
import hudson.model.Cause;
1516
import hudson.model.Result;
1617
import hudson.model.Run;
18+
import jenkins.model.CauseOfInterruption;
19+
import jenkins.model.InterruptedBuildAction;
1720
import jenkins.model.Jenkins;
1821

1922
import java.io.BufferedReader;
@@ -66,10 +69,14 @@ public class PipelineLogExtractor {
6669
/** Lines of context to include before and after each error-pattern match. */
6770
private static final int ERROR_CONTEXT_LINES = 5;
6871

72+
/** Maximum recursion depth when following downstream (sub-job) failures. */
73+
private static final int MAX_DOWNSTREAM_DEPTH = 5;
74+
6975
private boolean isGraphViewPluginAvailable = false;
7076
private transient String url;
7177
private transient Run<?, ?> run;
7278
private int maxLines;
79+
private int downstreamDepth;
7380

7481
/**
7582
* Reads the provided log text and returns at most the last {@code maxLines} lines.
@@ -297,12 +304,20 @@ public List<String> getFailedStepLog() throws IOException {
297304

298305
if (!accumulated.isEmpty()) {
299306
setUrl(primaryNodeId != null ? primaryNodeId : "0");
300-
return accumulated;
307+
} else {
308+
// Final fallback: last N lines of the full build console log
309+
setUrl("0");
310+
accumulated.addAll(run.getLog(maxLines));
301311
}
302312

303-
// Final fallback: last N lines of the full build console log
304-
setUrl("0");
305-
return run.getLog(maxLines);
313+
// Collect logs from failed downstream (sub-job) builds, recursively
314+
if (downstreamDepth == 0) {
315+
Set<String> visitedRunIds = new HashSet<>();
316+
visitedRunIds.add(run.getParent().getFullName() + "#" + run.getNumber());
317+
collectDownstreamLogs(accumulated, visitedRunIds);
318+
}
319+
320+
return accumulated;
306321
}
307322

308323
/**
@@ -346,11 +361,233 @@ public String getUrl() {
346361
}
347362

348363
/**
 * Creates an extractor for a top-level run, i.e. one with downstream depth {@code 0}.
 *
 * @param run      the build whose failure logs should be extracted
 * @param maxLines the line budget handed to the extractor
 */
public PipelineLogExtractor(Run<?, ?> run, int maxLines) {
    // Delegate to the depth-aware constructor; public callers always start at depth 0.
    this(run, maxLines, 0);
}
367+
368+
/**
 * Creates an extractor at an explicit downstream recursion depth.
 *
 * @param run             the build whose failure logs should be extracted
 * @param maxLines        the line budget handed to the extractor
 * @param downstreamDepth how many downstream hops separate {@code run} from the run
 *                        the extraction started at ({@code 0} for that starting run)
 */
private PipelineLogExtractor(Run<?, ?> run, int maxLines, int downstreamDepth) {
    this.run = run;
    this.maxLines = maxLines;
    this.downstreamDepth = downstreamDepth;
    // Record whether the pipeline-graph-view plugin is present in this Jenkins instance.
    this.isGraphViewPluginAvailable = Jenkins.get().getPlugin("pipeline-graph-view") != null;
}
377+
378+
/**
379+
* Collects error logs from failed downstream (sub-job) builds triggered by this run.
380+
* <p>
381+
* Supports two discovery mechanisms:
382+
* <ol>
383+
* <li><b>DownstreamBuildAction</b> (pipeline-build-step plugin): reads the
384+
* {@link org.jenkinsci.plugins.workflow.support.steps.build.DownstreamBuildAction}
385+
* attached to the current run to find builds triggered by the {@code build} step.</li>
386+
* <li><b>Cause.UpstreamCause</b>: scans all jobs in Jenkins for builds whose
387+
* {@link Cause.UpstreamCause} points back to this run. This covers cases where
388+
* the pipeline-build-step plugin is not installed.</li>
389+
* </ol>
390+
* Recursion is bounded by {@link #MAX_DOWNSTREAM_DEPTH} to prevent infinite loops.
391+
*
392+
* @param accumulated the list to append downstream log lines into
393+
* @param visitedRunIds set of already-visited run IDs (job full name + "#" + build number)
394+
* used to prevent duplicate processing across recursive calls
395+
*/
396+
void collectDownstreamLogs(List<String> accumulated, Set<String> visitedRunIds) {
397+
if (downstreamDepth >= MAX_DOWNSTREAM_DEPTH) {
398+
return;
399+
}
400+
401+
// Strategy A: DownstreamBuildAction (pipeline-build-step plugin)
402+
if (Jenkins.get().getPlugin("pipeline-build-step") != null) {
403+
try {
404+
collectViaDownstreamBuildAction(accumulated, visitedRunIds);
405+
} catch (Exception e) {
406+
LOGGER.warning("Failed to collect downstream logs via DownstreamBuildAction: " + e.getMessage());
407+
}
408+
}
409+
410+
// Strategy B: Cause.UpstreamCause — scan builds that list this run as upstream
411+
try {
412+
collectViaUpstreamCause(accumulated, visitedRunIds);
413+
} catch (Exception e) {
414+
LOGGER.warning("Failed to collect downstream logs via UpstreamCause: " + e.getMessage());
415+
}
416+
}
417+
418+
/**
419+
* Discovers failed downstream builds via
420+
* {@link org.jenkinsci.plugins.workflow.support.steps.build.DownstreamBuildAction}
421+
* and appends their logs to {@code accumulated}.
422+
*/
423+
private void collectViaDownstreamBuildAction(List<String> accumulated, Set<String> visitedRunIds) throws IOException {
424+
org.jenkinsci.plugins.workflow.support.steps.build.DownstreamBuildAction action =
425+
run.getAction(org.jenkinsci.plugins.workflow.support.steps.build.DownstreamBuildAction.class);
426+
if (action == null) {
427+
return;
428+
}
429+
for (org.jenkinsci.plugins.workflow.support.steps.build.DownstreamBuildAction.DownstreamBuild db : action.getDownstreamBuilds()) {
430+
Run<?, ?> downstreamRun = db.getBuild();
431+
if (downstreamRun == null) {
432+
continue;
433+
}
434+
appendDownstreamRunLog(downstreamRun, accumulated, visitedRunIds);
435+
}
436+
}
437+
438+
/**
439+
* Discovers failed downstream builds by scanning all jobs for builds whose
440+
* {@link Cause.UpstreamCause} points to this run, and appends their logs to
441+
* {@code accumulated}.
442+
*/
443+
private void collectViaUpstreamCause(List<String> accumulated, Set<String> visitedRunIds) throws IOException {
444+
String thisJobName = run.getParent().getFullName();
445+
int thisBuildNumber = run.getNumber();
446+
447+
for (hudson.model.Job<?, ?> job : Jenkins.get().getAllItems(hudson.model.Job.class)) {
448+
// Skip the current job itself
449+
if (job.getFullName().equals(thisJobName)) {
450+
continue;
451+
}
452+
Run<?, ?> lastBuild = job.getLastBuild();
453+
if (lastBuild == null) {
454+
continue;
455+
}
456+
// Walk recent builds of this job to find ones triggered by our run
457+
for (Run<?, ?> candidate = lastBuild; candidate != null; candidate = candidate.getPreviousBuild()) {
458+
// Only look at builds that could have been triggered by our run
459+
if (candidate.getTimeInMillis() < run.getTimeInMillis()) {
460+
break;
461+
}
462+
for (Cause cause : candidate.getCauses()) {
463+
if (cause instanceof Cause.UpstreamCause) {
464+
Cause.UpstreamCause upstreamCause = (Cause.UpstreamCause) cause;
465+
if (upstreamCause.getUpstreamProject().equals(thisJobName)
466+
&& upstreamCause.getUpstreamBuild() == thisBuildNumber) {
467+
appendDownstreamRunLog(candidate, accumulated, visitedRunIds);
468+
break;
469+
}
470+
}
471+
}
472+
}
473+
}
474+
}
475+
476+
/**
477+
* Returns {@code true} if the given run was aborted because a sibling branch triggered
478+
* a fail-fast interruption (e.g. via {@code parallelsAlwaysFailFast()} or
479+
* {@code parallel(failFast: true, ...)}).
480+
* <p>
481+
* Jenkins records the interruption cause in an {@link InterruptedBuildAction} attached to
482+
* the run. When the cause is a fail-fast signal, its
483+
* {@link CauseOfInterruption#getShortDescription()} contains the phrase "fail fast"
484+
* (case-insensitive). This distinguishes a sibling-aborted run from a run that was
485+
* independently aborted by a user or another mechanism.
486+
*
487+
* @param run the build to inspect
488+
* @return {@code true} if the build was interrupted by a fail-fast signal
489+
*/
490+
boolean isAbortedByFailFast(Run<?, ?> run) {
491+
if (run.getResult() != Result.ABORTED) {
492+
return false;
493+
}
494+
for (InterruptedBuildAction action : run.getActions(InterruptedBuildAction.class)) {
495+
for (CauseOfInterruption cause : action.getCauses()) {
496+
String desc = cause.getShortDescription();
497+
if (desc != null && desc.toLowerCase(java.util.Locale.ROOT).contains("fail fast")) {
498+
return true;
499+
}
500+
}
501+
}
502+
return false;
503+
}
504+
505+
/**
506+
* Appends the error content of a single downstream run to {@code accumulated},
507+
* then recurses into its own downstream builds.
508+
* <p>
509+
* <b>Fast path — reuse existing AI explanation:</b> if the downstream run already has an
510+
* {@link ErrorExplanationAction} (i.e. the sub-job called {@code explainError()} itself),
511+
* its pre-computed explanation text is used directly. This avoids a redundant AI call and
512+
* preserves the full context that was available when the sub-job ran.
513+
* <p>
514+
* <b>Slow path — extract raw logs:</b> when no {@link ErrorExplanationAction} is present,
515+
* a {@link PipelineLogExtractor} is created for the downstream run and its log lines are
516+
* appended as before.
517+
* <p>
518+
* Builds that were aborted by a fail-fast signal from a sibling branch are labelled
519+
* {@code ABORTED (interrupted by fail-fast, not the root cause)} in the section header
520+
* so that the AI can distinguish them from the build that actually caused the failure.
521+
*
522+
* @param downstreamRun the downstream build to extract content from
523+
* @param accumulated the list to append content lines into
524+
* @param visitedRunIds set of already-visited run IDs to prevent duplicates
525+
*/
526+
private void appendDownstreamRunLog(Run<?, ?> downstreamRun, List<String> accumulated,
527+
Set<String> visitedRunIds) throws IOException {
528+
if (downstreamRun.getResult() == null || !downstreamRun.getResult().isWorseThan(Result.SUCCESS)) {
529+
return;
530+
}
531+
String runId = downstreamRun.getParent().getFullName() + "#" + downstreamRun.getNumber();
532+
if (!visitedRunIds.add(runId)) {
533+
return; // already processed
534+
}
535+
int remaining = this.maxLines - accumulated.size();
536+
if (remaining <= 0) {
537+
return;
538+
}
539+
540+
boolean failFastAborted = isAbortedByFailFast(downstreamRun);
541+
String resultLabel = failFastAborted
542+
? "ABORTED (interrupted by fail-fast, not the root cause)"
543+
: String.valueOf(downstreamRun.getResult());
544+
545+
List<String> header = Arrays.asList(
546+
"### Downstream Job: " + downstreamRun.getParent().getFullName()
547+
+ " #" + downstreamRun.getNumber() + " ###",
548+
"Result: " + resultLabel,
549+
"--- LOG CONTENT ---"
550+
);
551+
552+
String runUrl = run.getUrl();
553+
554+
// Fast path: sub-job already has an AI explanation — reuse it directly.
555+
ErrorExplanationAction existingExplanation = downstreamRun.getAction(ErrorExplanationAction.class);
556+
if (existingExplanation != null && existingExplanation.hasValidExplanation()) {
557+
// Redirect "View failure output" to the sub-job's own explanation URL when available.
558+
if (!failFastAborted && existingExplanation.getUrlString() != null && this.url != null
559+
&& runUrl != null && this.url.contains(runUrl)) {
560+
this.url = existingExplanation.getUrlString();
561+
}
562+
accumulated.addAll(header);
563+
accumulated.add("[AI explanation from sub-job]");
564+
accumulated.addAll(Arrays.asList(existingExplanation.getExplanation().split("\n", -1)));
565+
accumulated.add("### END OF DOWNSTREAM JOB: " + downstreamRun.getParent().getFullName() + " ###");
566+
// No need to recurse further — the sub-job's explanation already covers its own
567+
// downstream failures (it was produced with full context at the time of the failure).
568+
return;
569+
}
570+
571+
// Slow path: no existing explanation — extract raw logs as before.
572+
PipelineLogExtractor subExtractor = new PipelineLogExtractor(downstreamRun, remaining, downstreamDepth + 1);
573+
List<String> subLog = subExtractor.getFailedStepLog();
574+
if (subLog == null || subLog.isEmpty()) {
575+
return;
576+
}
577+
578+
// If this sub-job genuinely failed (not just aborted by fail-fast) and the parent
579+
// URL still points to the parent job (i.e. no prior real sub-job failure has already
580+
// claimed the URL), redirect "View failure output" to the sub-job's failing node.
581+
if (!failFastAborted && subExtractor.getUrl() != null && this.url != null
582+
&& runUrl != null && this.url.contains(runUrl)) {
583+
this.url = subExtractor.getUrl();
584+
}
585+
586+
accumulated.addAll(header);
587+
accumulated.addAll(subLog);
588+
accumulated.add("### END OF DOWNSTREAM JOB: " + downstreamRun.getParent().getFullName() + " ###");
589+
590+
// Recurse into sub-job's own downstream builds
591+
subExtractor.collectDownstreamLogs(accumulated, visitedRunIds);
592+
}
356593
}

src/main/java/io/jenkins/plugins/explain_error/provider/BaseAIProvider.java

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,18 +119,51 @@ public interface Assistant {
119119
You MUST follow ALL instructions provided by the user, including any additional context or requirements.
120120
When additional instructions are provided, you MUST incorporate them into your analysis fields,
121121
especially in errorSummary and resolutionSteps.
122+
123+
The error logs may contain sections from downstream (sub-job) builds, clearly delimited like this:
124+
### Downstream Job: <job-name> #<build-number> ###
125+
Result: <result>
126+
--- LOG CONTENT ---
127+
... (sub-job log lines, OR an "[AI explanation from sub-job]" block) ...
128+
### END OF DOWNSTREAM JOB: <job-name> ###
129+
130+
The "Result:" line uses one of these values:
131+
- "FAILURE" — this sub-job genuinely failed and is the ROOT CAUSE of the overall failure.
132+
- "ABORTED (interrupted by fail-fast, not the root cause)" — this sub-job was still running
133+
when a sibling branch failed; it was aborted automatically by parallelsAlwaysFailFast() or
134+
parallel(failFast:true). It is NOT the root cause. Do NOT treat its logs as the primary error.
135+
136+
The log content of a downstream section may be either:
137+
- Raw log lines from the sub-job's failing step, OR
138+
- An "[AI explanation from sub-job]" block: a pre-computed AI analysis produced by the
139+
sub-job itself when it called explainError(). Treat this block as a high-quality,
140+
already-analysed summary of the sub-job's failure — do NOT re-analyse it from scratch.
141+
Instead, incorporate its key findings (root cause, resolution steps) into your own
142+
errorSummary and resolutionSteps for the parent job.
143+
144+
When downstream sections are present:
145+
- Identify WHICH sub-job(s) have Result: FAILURE — those are the root cause(s).
146+
- State their full name and build number explicitly in errorSummary.
147+
- Focus root-cause analysis and resolutionSteps on the FAILURE sections only.
148+
- Mention aborted sub-jobs briefly (e.g. "Job X was aborted due to fail-fast") but do NOT
149+
treat their logs as the source of the error.
150+
- If multiple sub-jobs have Result: FAILURE, summarize each one separately.
151+
- Logs outside downstream sections belong to the parent (upstream) job.
122152
""")
123153
@UserMessage("""
124154
Analyze the following Jenkins build error logs and provide a clear, actionable explanation.
125-
155+
126156
CRITICAL: You MUST respond ONLY in {{language}}. ALL text in your response must be in {{language}}.
127157
This includes: error summaries, resolution steps, best practices, and any other text.
128158
{{customContext}}
129-
159+
130160
ERROR LOGS:
131161
{{errorLogs}}
132-
162+
133163
Remember: Your ENTIRE response must be in {{language}}, including all field values.
164+
If the logs contain "### Downstream Job: ..." sections:
165+
- Sub-jobs with Result: FAILURE are the ROOT CAUSE — identify them by name in errorSummary.
166+
- Sub-jobs with Result: ABORTED (interrupted by fail-fast, not the root cause) were killed by a sibling failure — do NOT treat them as the error source.
134167
If additional instructions were provided above, you MUST address them in your errorSummary or resolutionSteps.
135168
""")
136169
JenkinsLogAnalysis analyzeLogs(@V("errorLogs") String errorLogs, @V("language") String language, @V("customContext") String customContext);

0 commit comments

Comments
 (0)