Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ public class ReservationRegistryActor extends AbstractActorWithTimers {
private static final String TIMER_KEY_AUTO_MARK_READY = "reservation-registry-auto-mark-ready";
private static final Duration DEFAULT_PROCESS_INTERVAL = Duration.ofMillis(1000);
private static final Duration DEFAULT_AUTO_MARK_READY_TIMEOUT = Duration.ofSeconds(5);
// Extra delay added on top of inFlightReservationTimeout (5s) when retrying after a
// NoResourceAvailableException. The 55s extra delay plus the 5s timeout yields an effective 60s backoff between retries during scaling.
private static final Duration DEFAULT_NO_RESOURCE_EXTRA_DELAY = Duration.ofSeconds(55);

private final ClusterID clusterID;
private final Clock clock;
Expand Down Expand Up @@ -465,17 +468,20 @@ private void onStatusFailure(Status.Failure failure) {
NoResourceAvailableException exception = (NoResourceAvailableException) cause;
String exceptionConstraintKey = exception.getConstraintKey();

log.info("{}: Received NoResourceAvailableException: {} (constraintKey={})",
log.error("{}: Received NoResourceAvailableException: {} (constraintKey={})",
this.clusterID, exception.getMessage(), exceptionConstraintKey);

if (exceptionConstraintKey != null) {
// Match the exact reservation by constraint key
Reservation matchingReservation = inFlightReservations.get(exceptionConstraintKey);
if (matchingReservation != null) {
// Update the timestamp for the matching in-flight reservation
// This tracks when the reservation last received a NoResourceAvailableException
// The reservation will be retried if this timestamp is older than the timeout
inFlightReservationRequestTimestamps.put(exceptionConstraintKey, clock.instant());
// Push the in-flight timestamp into the future so that shouldSkipConstraintGroup's
// elapsed-time check (now - timestamp < inFlightReservationTimeout) keeps this
// group skipped for an additional DEFAULT_NO_RESOURCE_EXTRA_DELAY, i.e. the next
// retry happens roughly 60s after this failure.
inFlightReservationRequestTimestamps.put(
exceptionConstraintKey,
clock.instant().plus(DEFAULT_NO_RESOURCE_EXTRA_DELAY));
Comment on lines +482 to +484
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is inFlightReservationRequestTimestamps only used so that shouldSkipConstraintGroup can make its determination? If we later add metrics or logs based on the values in inFlightReservationRequestTimestamps, having future-dated timestamps stored there could get confusing. I wonder if we should add this handling within shouldSkipConstraintGroup instead?

log.info("{}: Updated request timestamp for in-flight reservation {} (constraintKey={}) due to NoResourceAvailableException",
this.clusterID, matchingReservation.getKey(), exceptionConstraintKey);
} else {
Expand Down
Loading