Skip to content

Commit 8a88f52

Browse files
committed
Report job success immediately after main graph completes, before post steps
- Report success to users as soon as the main graph completes successfully.
- Post steps (promotion/cleanup) now run as best-effort and don't affect the job result.
- Add metrics tracking: main_graph_duration_seconds, post_steps_duration_seconds, time_saved_seconds.
- Metrics are extractable from ci-operator-metrics.json in test_platform_insights events.
1 parent b751022 commit 8a88f52

1 file changed

Lines changed: 33 additions & 4 deletions

File tree

cmd/ci-operator/main.go

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1057,26 +1057,55 @@ func (o *options) Run() []error {
10571057
return wrapped
10581058
}
10591059

1060-
// Run each of the promotion steps concurrently
1060+
// Main graph completed successfully - report success immediately before post steps
1061+
mainGraphCompletedAt := time.Now()
1062+
mainGraphDuration := mainGraphCompletedAt.Sub(start)
1063+
eventRecorder.Event(runtimeObject, coreapi.EventTypeNormal, "CiJobSucceeded", eventJobDescription(o.jobSpec, o.namespace))
1064+
1065+
// Report success to users immediately (post steps are best-effort cleanup)
1066+
reporter, loadErr := o.resultsOptions.Reporter(o.jobSpec, o.consoleHost)
1067+
if loadErr != nil {
1068+
logrus.WithError(loadErr).Warn("Could not load result reporting options, skipping early success report.")
1069+
} else {
1070+
reporter.Report(nil)
1071+
}
1072+
1073+
// Run each of the promotion steps concurrently (best-effort cleanup)
1074+
postStepsStart := time.Now()
10611075
lenOfPromotionSteps := len(promotionSteps)
10621076
detailsChan := make(chan api.CIOperatorStepDetails, lenOfPromotionSteps)
10631077
errChan := make(chan error, lenOfPromotionSteps)
10641078
for _, step := range promotionSteps {
10651079
go runPromotionStep(ctx, step, detailsChan, errChan, o.metricsAgent)
10661080
}
1081+
postStepsFailed := false
10671082
for i := 0; i < lenOfPromotionSteps; i++ {
10681083
select {
10691084
case details := <-detailsChan:
10701085
graph.MergeFrom(details)
10711086
case err := <-errChan:
10721087
errorDesc := fmt.Sprintf("post step failed while %s. with error: %v", eventJobDescription(o.jobSpec, o.namespace), err)
10731088
eventRecorder.Event(runtimeObject, coreapi.EventTypeWarning, "PostStepFailed", errorDesc)
1074-
return []error{results.ForReason("executing_post").WithError(err).Unwrap()} // If any of the promotion steps fail, it is considered a failure
1089+
logrus.WithError(err).Warn("Post step failed, but job success was already reported. Continuing with cleanup.")
1090+
postStepsFailed = true
1091+
// Post step failures don't affect job success (already reported), but we still record them
10751092
}
10761093
}
10771094

1078-
eventRecorder.Event(runtimeObject, coreapi.EventTypeNormal, "CiJobSucceeded", eventJobDescription(o.jobSpec, o.namespace))
1079-
o.metricsAgent.Record(metrics.NewInsightsEvent(metrics.InsightExecutionCompleted, metrics.Context{"duration_seconds": time.Since(start).Seconds(), "success": true}))
1095+
// Record final metrics including post steps duration
1096+
postStepsDuration := time.Since(postStepsStart)
1097+
totalDuration := time.Since(start)
1098+
metricsContext := metrics.Context{
1099+
"duration_seconds": totalDuration.Seconds(),
1100+
"main_graph_duration_seconds": mainGraphDuration.Seconds(),
1101+
"post_steps_duration_seconds": postStepsDuration.Seconds(),
1102+
"time_saved_seconds": postStepsDuration.Seconds(),
1103+
"success": true,
1104+
}
1105+
if postStepsFailed {
1106+
metricsContext["post_steps_failed"] = true
1107+
}
1108+
o.metricsAgent.Record(metrics.NewInsightsEvent(metrics.InsightExecutionCompleted, metricsContext))
10801109

10811110
return nil
10821111
})

0 commit comments

Comments
 (0)