From 6c3cb1e736eb5e32b101cab7b2e7e3f5eff88227 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Thu, 18 Dec 2025 18:12:34 -0800 Subject: [PATCH 01/29] fix: prevent job stuck at Starting when AgentWatchdog replaces failed agents --- .../tank/vm/vmManager/models/VMStatus.java | 3 +- .../rest/mvc/rest/cloud/JobEventSender.java | 13 ++++++- .../intuit/tank/vmManager/AgentWatchdog.java | 14 ++++---- .../intuit/tank/vmManager/VMTrackerImpl.java | 34 +++++++++++++++++-- 4 files changed, 51 insertions(+), 13 deletions(-) diff --git a/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java b/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java index 6d2b873f1..7dc4312b1 100644 --- a/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java +++ b/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java @@ -26,7 +26,8 @@ public enum VMStatus implements Serializable { stopping, stopped, shutting_down, - terminated; + terminated, + replaced; // replaced by AgentWatchdog due to failure to report back public static final VMStatus fromString(String value) { VMStatus ret = null; diff --git a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java index 5aeaf0852..3e997aa09 100644 --- a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java +++ b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java @@ -247,7 +247,18 @@ private List getInstancesForJob(String jobId) { List instanceIds = new ArrayList(); CloudVmStatusContainer statuses = vmTracker.getVmStatusForJob(jobId); if (statuses != null) { - instanceIds = statuses.getStatuses().stream().map(CloudVmStatus::getInstanceId).collect(Collectors.toList()); + instanceIds = statuses.getStatuses().stream() + .filter(s -> { + VMStatus vmStatus = s.getVmStatus(); + // skip unreachable instances - they can't receive commands + return vmStatus != VMStatus.terminated && + vmStatus != VMStatus.replaced && + vmStatus != VMStatus.stopped && + vmStatus != VMStatus.shutting_down && + vmStatus != VMStatus.stopping; + }) + .map(CloudVmStatus::getInstanceId) + .collect(Collectors.toList()); } return instanceIds; } diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index 1bc20e99a..e728c0e35 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -222,12 +222,16 @@ private void relaunch(ArrayList instances) { VMImageDao dao = new VMImageDao(); for (VMInformation info : instances) { vmInfo.remove(info); - vmTracker.setStatus(createTerminatedVmStatus(info)); + vmTracker.removeStatusForInstance(info.getInstanceId()); // remove from tracker to prevent blocking job status VMInstance image = dao.getImageByInstanceId(info.getInstanceId()); if (image != null) { - image.setStatus(VMStatus.terminated.name()); + image.setStatus(VMStatus.replaced.name()); // mark as replaced (not terminated) for audit trail dao.saveOrUpdate(image); } + + LOG.info(new ObjectMessage(Map.of("Message", + "Replaced and removed instance " + info.getInstanceId() + + " from tracker for job " + jobId))); } LOG.info(new ObjectMessage(Map.of("Message","Setting number of instances to relaunch to: " + instances.size() + " for job " + jobId))); instanceRequest.setNumberOfInstances(instances.size()); @@ -265,12 +269,6 @@ private CloudVmStatus createCloudStatus(VMInstanceRequest req, VMInformation inf VMImageType.AGENT, req.getRegion(), VMStatus.pending, new ValidationStatus(), 0, 0, null, null); } - private CloudVmStatus createTerminatedVmStatus(VMInformation info) { - return new CloudVmStatus(info.getInstanceId(), instanceRequest.getJobId(), "unknown", - JobStatus.Stopped, VMImageType.AGENT, instanceRequest.getRegion(), - VMStatus.terminated, new ValidationStatus(), 0, 0, null, null); - } - private boolean shouldRelaunchInstances() { return startTime + maxWaitForResponse < System.currentTimeMillis(); } diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java index 6ccff1254..e7d44b0ca 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java @@ -224,7 +224,7 @@ private JobQueueStatus getQueueStatus(JobQueueStatus oldStatus, JobStatus jobSta } /** - * If the vm is shutting down or terminated, don't update the status to something else. + * If the vm is shutting down, terminated, or replaced, don't update the status to something else. * @param currentStatus * @return */ @@ -234,7 +234,8 @@ private boolean shouldUpdateStatus(CloudVmStatus currentStatus) { return (status != VMStatus.shutting_down && status != VMStatus.stopped && status != VMStatus.stopping - && status != VMStatus.terminated); + && status != VMStatus.terminated + && status != VMStatus.replaced); } return true; } @@ -245,7 +246,24 @@ private boolean shouldUpdateStatus(CloudVmStatus currentStatus) { */ @Override public void removeStatusForInstance(String instanceId) { - statusMap.remove(instanceId); + CloudVmStatus status = statusMap.remove(instanceId); + + // also remove from the job's container to keep counts accurate + if (status != null) { + String jobId = status.getJobId(); + // Synchronize on the same lock used by setStatusThread to prevent + // ConcurrentModificationException when iterating over statuses + synchronized (getCacheSyncObject(jobId)) { + CloudVmStatusContainer container = jobMap.get(jobId); + if (container != null) { + boolean removed = container.getStatuses().remove(status); + if (removed) { + LOG.info(new ObjectMessage(Map.of("Message", + "Removed instance " + instanceId + " from container for job " + jobId))); + } + } + } + } } /** @@ -315,6 +333,16 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine // look up the job JobInstance job = jobInstanceDao.get().findById(Integer.parseInt(status.getJobId())); for (CloudVmStatus s : cloudVmStatusContainer.getStatuses()) { + VMStatus vmStatus = s.getVmStatus(); + // skip replaced instances - these were replaced by AgentWatchdog due to failure + // but do NOT skip terminated/stopping/stopped/shutting_down - these are active agents in transition + if (vmStatus == VMStatus.replaced) { + LOG.info(new ObjectMessage(Map.of("Message", + "Skipping replaced instance " + s.getInstanceId() + + " in job status calculation for job " + status.getJobId()))); + continue; + } + JobStatus jobStatus = s.getJobStatus(); if (jobStatus != JobStatus.Completed) { // If no VMs are Completed isFinished = false; From 8f893fbfb0be4b41e5faa075a64af8d26f3c5acb Mon Sep 17 00:00:00 2001 From: zkofiro Date: Thu, 18 Dec 2025 21:43:51 -0800 Subject: [PATCH 02/29] add logging to understand it all --- .../com/intuit/tank/harness/APIMonitor.java | 17 ++++++++-- .../intuit/tank/harness/CommandListener.java | 2 ++ .../services/agent/AgentServiceV2Impl.java | 5 +++ .../perfManager/workLoads/JobManager.java | 13 +++++++- .../intuit/tank/vmManager/VMTrackerImpl.java | 33 +++++++++++++++++++ 5 files changed, 67 insertions(+), 3 deletions(-) diff --git a/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java b/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java index 804d76037..526ef00ea 100644 --- a/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java +++ b/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java @@ -83,7 +83,13 @@ private void updateInstanceStatus() { newStatus.setTotalTps(tpsInfo.getTotalTps()); sendTps(tpsInfo); } - if (!isLocal) setInstanceStatus(newStatus.getInstanceId(), newStatus); + + if (!isLocal) { + LOG.debug(LogUtil.getLogMessage("Sending status update - VMStatus: " + newStatus.getVmStatus() + + ", JobStatus: " + newStatus.getJobStatus() + ", Users: " + newStatus.getCurrentUsers() + + "/" + newStatus.getTotalUsers())); + setInstanceStatus(newStatus.getInstanceId(), newStatus); + } APITestHarness.getInstance().checkAgentThreads(); } catch (Exception t) { LOG.error(LogUtil.getLogMessage("Unable to send status metrics | " + t.getMessage()), t); @@ -120,13 +126,20 @@ private CloudVmStatus createStatus(WatsAgentStatusResponse agentStatus) { */ private JobStatus calculateJobStatus(WatsAgentStatusResponse agentStatus, JobStatus currentStatus) { AgentCommand cmd = APITestHarness.getInstance().getCmd(); - return cmd == AgentCommand.pause ? JobStatus.Paused + JobStatus newStatus = cmd == AgentCommand.pause ? JobStatus.Paused : cmd == AgentCommand.stop ? JobStatus.Stopped : cmd == AgentCommand.pause_ramp ? JobStatus.RampPaused : currentStatus == JobStatus.Unknown || currentStatus == JobStatus.Starting && agentStatus.getCurrentNumberUsers() > 0 ? JobStatus.Running : currentStatus; + + if (newStatus != currentStatus) { + LOG.info(LogUtil.getLogMessage("Agent JobStatus transition: " + currentStatus + " -> " + newStatus + + " (cmd=" + cmd + ", currentUsers=" + agentStatus.getCurrentNumberUsers() + ")")); + } + + return newStatus; } public static void setDoMonitor(boolean monitor) { diff --git a/agent/apiharness/src/main/java/com/intuit/tank/harness/CommandListener.java b/agent/apiharness/src/main/java/com/intuit/tank/harness/CommandListener.java index e7386ae60..cff991a2b 100644 --- a/agent/apiharness/src/main/java/com/intuit/tank/harness/CommandListener.java +++ b/agent/apiharness/src/main/java/com/intuit/tank/harness/CommandListener.java @@ -78,6 +78,8 @@ private static void handleRequest(HttpExchange exchange) { String path = exchange.getRequestURI().getPath(); if (path.equals(AgentCommand.start.getPath()) || path.equals(AgentCommand.run.getPath())) { response = "Received command " + path + ", Starting Test JobId=" + APITestHarness.getInstance().getAgentRunData().getJobId(); + LOG.info(LogUtil.getLogMessage("Received START command - launching test threads for job " + + APITestHarness.getInstance().getAgentRunData().getJobId())); startTest(); } else if (path.startsWith(AgentCommand.stop.getPath())) { response = "Received command " + path + ", Stopping Test JobId=" + APITestHarness.getInstance().getAgentRunData().getJobId(); diff --git a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java index beaebe37a..9abba85f0 100644 --- a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java +++ b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java @@ -220,6 +220,11 @@ public void setInstanceStatus(String instanceId, CloudVmStatus status) { segment.putAnnotation("currentUsers", status.getCurrentUsers()); segment.putAnnotation("TotalUsers", status.getTotalUsers()); segment.putAnnotation("totalTps", status.getTotalTps()); + + LOGGER.info("Agent " + instanceId + " reporting status - VMStatus: " + status.getVmStatus() + + ", JobStatus: " + status.getJobStatus() + ", Users: " + status.getCurrentUsers() + + "/" + status.getTotalUsers() + ", Job: " + status.getJobId()); + try { JobEventSender controller = new ServletInjector().getManagedBean( servletContext, JobEventSender.class); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/perfManager/workLoads/JobManager.java b/tank_vmManager/src/main/java/com/intuit/tank/perfManager/workLoads/JobManager.java index 887fa39b0..ef6bd5e9b 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/perfManager/workLoads/JobManager.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/perfManager/workLoads/JobManager.java @@ -172,12 +172,17 @@ public AgentTestStartData registerAgentForJob(AgentData agentData) { ret.setUserIntervalIncrement(jobInfo.jobRequest.getUserIntervalIncrement()); ret.setTargetRampRate(jobInfo.jobRequest.getTargetRatePerAgent()); jobInfo.agentData.add(agentData); + LOG.info(new ObjectMessage(Map.of("Message", "Agent " + agentData.getInstanceId() + + " added to job " + agentData.getJobId() + ". Total agents now: " + jobInfo.agentData.size() + + "/" + jobInfo.numberOfMachines + ", isFilled: " + jobInfo.isFilled()))); CloudVmStatus status = vmTracker.getStatus(agentData.getInstanceId()); if(status != null) { status.setVmStatus(VMStatus.pending); vmTracker.setStatus(status); } if (jobInfo.isFilled()) { + LOG.info(new ObjectMessage(Map.of("Message", "All " + jobInfo.numberOfMachines + + " agents registered for job " + agentData.getJobId() + " - starting test thread"))); new Thread( () -> { startTest(jobInfo); }).start(); } } @@ -209,8 +214,14 @@ private void startTest(final JobInfo info) { LOG.info(new ObjectMessage(Map.of("Message", "Start agents command received - Sending start commands for job " + jobId + " asynchronously to following agents: " + info.agentData.stream().collect(Collectors.toMap(AgentData::getInstanceId, AgentData::getInstanceUrl))))); } + LOG.info(new ObjectMessage(Map.of("Message", "Sending START commands to " + info.agentData.size() + + " agents for job " + jobId))); info.agentData.parallelStream() - .map(agentData -> agentData.getInstanceUrl() + AgentCommand.start.getPath()) + .map(agentData -> { + String url = agentData.getInstanceUrl() + AgentCommand.start.getPath(); + LOG.info(new ObjectMessage(Map.of("Message", "Sending command to url " + url))); + return url; + }) .map(URI::create) .map(uri -> sendCommand(uri, MAX_RETRIES)) .forEach(future -> { diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java index e7d44b0ca..3e205dd17 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java @@ -149,6 +149,12 @@ private void setStatusThread(@Nonnull final CloudVmStatus status) { synchronized (getCacheSyncObject(status.getJobId())) { status.setReportTime(new Date()); CloudVmStatus currentStatus = getStatus(status.getInstanceId()); + + LOG.debug(new ObjectMessage(Map.of("Message", + "Status update for instance " + status.getInstanceId() + + " - VMStatus: " + status.getVmStatus() + ", JobStatus: " + status.getJobStatus() + + ", Job: " + status.getJobId()))); + if (shouldUpdateStatus(currentStatus)) { statusMap.put(status.getInstanceId(), status); if (status.getVmStatus() == VMStatus.running @@ -157,6 +163,10 @@ private void setStatusThread(@Nonnull final CloudVmStatus status) { AmazonInstance amzInstance = new AmazonInstance(status.getVmRegion()); amzInstance.killInstances(Collections.singletonList(status.getInstanceId())); } + } else { + LOG.debug(new ObjectMessage(Map.of("Message", + "Skipping status update for instance " + status.getInstanceId() + + " - current status is " + (currentStatus != null ? currentStatus.getVmStatus() : "null")))); } String jobId = status.getJobId(); CloudVmStatusContainer cloudVmStatusContainer = jobMap.get(jobId); @@ -332,6 +342,12 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine // look up the job JobInstance job = jobInstanceDao.get().findById(Integer.parseInt(status.getJobId())); + + // Log all statuses in container for debugging + LOG.debug(new ObjectMessage(Map.of("Message", + "Status calculation for job " + status.getJobId() + " - Container has " + + cloudVmStatusContainer.getStatuses().size() + " statuses"))); + for (CloudVmStatus s : cloudVmStatusContainer.getStatuses()) { VMStatus vmStatus = s.getVmStatus(); // skip replaced instances - these were replaced by AgentWatchdog due to failure @@ -343,6 +359,10 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine continue; } + LOG.trace(new ObjectMessage(Map.of("Message", + "Checking instance " + s.getInstanceId() + + " - VMStatus: " + vmStatus + ", JobStatus: " + s.getJobStatus()))); + JobStatus jobStatus = s.getJobStatus(); if (jobStatus != JobStatus.Completed) { // If no VMs are Completed isFinished = false; @@ -360,6 +380,12 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine running = false; } } + + LOG.debug(new ObjectMessage(Map.of("Message", + "Status calc complete for job " + status.getJobId() + + " - isFinished=" + isFinished + ", paused=" + paused + + ", rampPaused=" + rampPaused + ", stopped=" + stopped + ", running=" + running))); + if (isFinished) { LOG.info(new ObjectMessage(Map.of("Message","Setting end time on container " + cloudVmStatusContainer.getJobId()))); if (cloudVmStatusContainer.getEndTime() == null) { @@ -371,6 +397,7 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine } if (job != null) { job.setEndTime(cloudVmStatusContainer.getEndTime()); + JobQueueStatus oldStatus = job.getStatus(); JobQueueStatus newStatus = job.getStatus(); if (isFinished) { newStatus = JobQueueStatus.Completed; @@ -389,6 +416,12 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine } } + if (oldStatus != newStatus) { + LOG.info(new ObjectMessage(Map.of("Message", + "Job " + status.getJobId() + " status transition: " + oldStatus + " -> " + newStatus + + " (isFinished=" + isFinished + ", paused=" + paused + ", rampPaused=" + rampPaused + + ", stopped=" + stopped + ", running=" + running + ")"))); + } LOG.trace("Setting Container for job=" + status.getJobId() + " newStatus to " + newStatus); job.setStatus(newStatus); jobInstanceDao.get().saveOrUpdate(job); From e2e4be231dced02868372c7d4e7421b6277c78e2 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Fri, 19 Dec 2025 13:50:45 -0800 Subject: [PATCH 03/29] replaced agents stay visible in the UI --- .../java/com/intuit/tank/harness/APIMonitor.java | 2 +- .../com/intuit/tank/vmManager/AgentWatchdog.java | 14 +++++++++++--- .../com/intuit/tank/vmManager/VMTrackerImpl.java | 6 +++--- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java b/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java index 526ef00ea..d4f202b59 100644 --- a/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java +++ b/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java @@ -85,7 +85,7 @@ private void updateInstanceStatus() { } if (!isLocal) { - LOG.debug(LogUtil.getLogMessage("Sending status update - VMStatus: " + newStatus.getVmStatus() + + LOG.info(LogUtil.getLogMessage("Sending status update - VMStatus: " + newStatus.getVmStatus() + ", JobStatus: " + newStatus.getJobStatus() + ", Users: " + newStatus.getCurrentUsers() + "/" + newStatus.getTotalUsers())); setInstanceStatus(newStatus.getInstanceId(), newStatus); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index e728c0e35..cd08967ee 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -222,7 +222,15 @@ private void relaunch(ArrayList instances) { VMImageDao dao = new VMImageDao(); for (VMInformation info : instances) { vmInfo.remove(info); - vmTracker.removeStatusForInstance(info.getInstanceId()); // remove from tracker to prevent blocking job status + + // Update status to 'replaced' in the tracker (keeps visible in UI, but filtered out of calculations) + CloudVmStatus replacedStatus = vmTracker.getStatus(info.getInstanceId()); + if (replacedStatus != null) { + replacedStatus.setVmStatus(VMStatus.replaced); + vmTracker.setStatus(replacedStatus); + } + + // Also update in the database for persistence VMInstance image = dao.getImageByInstanceId(info.getInstanceId()); if (image != null) { image.setStatus(VMStatus.replaced.name()); // mark as replaced (not terminated) for audit trail @@ -230,8 +238,8 @@ private void relaunch(ArrayList instances) { } LOG.info(new ObjectMessage(Map.of("Message", - "Replaced and removed instance " + info.getInstanceId() + - " from tracker for job " + jobId))); + "Marked instance " + info.getInstanceId() + + " as REPLACED (visible in UI but filtered from job status) for job " + jobId))); } LOG.info(new ObjectMessage(Map.of("Message","Setting number of instances to relaunch to: " + instances.size() + " for job " + jobId))); instanceRequest.setNumberOfInstances(instances.size()); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java index 3e205dd17..01b3ee569 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java @@ -150,7 +150,7 @@ private void setStatusThread(@Nonnull final CloudVmStatus status) { status.setReportTime(new Date()); CloudVmStatus currentStatus = getStatus(status.getInstanceId()); - LOG.debug(new ObjectMessage(Map.of("Message", + LOG.info(new ObjectMessage(Map.of("Message", "Status update for instance " + status.getInstanceId() + " - VMStatus: " + status.getVmStatus() + ", JobStatus: " + status.getJobStatus() + ", Job: " + status.getJobId()))); @@ -344,7 +344,7 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine JobInstance job = jobInstanceDao.get().findById(Integer.parseInt(status.getJobId())); // Log all statuses in container for debugging - LOG.debug(new ObjectMessage(Map.of("Message", + LOG.info(new ObjectMessage(Map.of("Message", "Status calculation for job " + status.getJobId() + " - Container has " + cloudVmStatusContainer.getStatuses().size() + " statuses"))); @@ -359,7 +359,7 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine continue; } - LOG.trace(new ObjectMessage(Map.of("Message", + LOG.info(new ObjectMessage(Map.of("Message", "Checking instance " + s.getInstanceId() + " - VMStatus: " + vmStatus + ", JobStatus: " + s.getJobStatus()))); From 1e4bd64ed1b762dc5e98831d1817301d9aab0e47 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Fri, 19 Dec 2025 13:54:28 -0800 Subject: [PATCH 04/29] TEMP CHANGE: test public/private ip logging w/ fix --- .../tank/vm/vmManager/VMInformation.java | 40 +++++++++++++++++++ .../intuit/tank/vmManager/AgentWatchdog.java | 9 ++++- .../vmManager/environment/CreateInstance.java | 6 +++ .../vmManager/environment/JobRequest.java | 9 ++++- .../amazon/AmazonDataConverter.java | 2 + 5 files changed, 64 insertions(+), 2 deletions(-) diff --git a/api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java b/api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java index da175fbd5..996609ccb 100644 --- a/api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java +++ b/api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java @@ -96,6 +96,46 @@ public String getPrivateDNS() { return (String) this.items.get("privateDns"); } + // TEMP_IP_LOGGING - START + /** + * Set the virtual machine private IP address + * + * @param data + * The virtual machine's private IP address + */ + public void setPrivateIp(String data) { + this.items.put("privateIp", data); + } + + /** + * Get the virtual machine private IP address + * + * @return The virtual machine's private IP address + */ + public String getPrivateIp() { + return (String) this.items.get("privateIp"); + } + + /** + * Set the virtual machine public IP address + * + * @param data + * The virtual machine's public IP address + */ + public void setPublicIp(String data) { + this.items.put("publicIp", data); + } + + /** + * Get the virtual machine public IP address + * + * @return The virtual machine's public IP address + */ + public String getPublicIp() { + return (String) this.items.get("publicIp"); + } + // TEMP_IP_LOGGING - END + public void setLaunchTime(Calendar data) { this.items.put("launchTime", data); } diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index cd08967ee..793d60e5e 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -252,7 +252,14 @@ private void relaunch(ArrayList instances) { // Add directly to started instances since these are restarted from scratch startedInstances.add(newInfo); vmTracker.setStatus(createCloudStatus(instanceRequest, newInfo)); - LOG.info(new ObjectMessage(Map.of("Message","Added image (" + newInfo.getInstanceId() + ") to VMImage table for job " + jobId))); + // TEMP_IP_LOGGING - START + LOG.info(new ObjectMessage(Map.of( + "Message", "Added relaunched image to VMImage table for job " + jobId, + "instanceId", newInfo.getInstanceId(), + "publicIp", newInfo.getPublicIp() != null ? newInfo.getPublicIp() : "N/A", + "privateIp", newInfo.getPrivateIp() != null ? newInfo.getPrivateIp() : "N/A" + ))); + // TEMP_IP_LOGGING - END try { dao.addImageFromInfo(instanceRequest.getJobId(), newInfo, instanceRequest.getRegion()); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/CreateInstance.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/CreateInstance.java index 9da50ee5c..7fcf6f0aa 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/CreateInstance.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/CreateInstance.java @@ -53,6 +53,12 @@ public void run() { for (VMInformation info : vmInfo) { vmTracker.setStatus(createCloudStatus(request, info)); dao.addImageFromInfo(request.getJobId(), info, request.getRegion()); + // TEMP_IP_LOGGING - START + logger.info("Added image (" + info.getInstanceId() + + ") with publicIp=" + (info.getPublicIp() != null ? info.getPublicIp() : "N/A") + + ", privateIp=" + (info.getPrivateIp() != null ? info.getPrivateIp() : "N/A") + + " to VMImage table"); + // TEMP_IP_LOGGING - END } } diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java index 105790a43..3c381637a 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java @@ -89,7 +89,14 @@ private void persistInstances(VMInstanceRequest instanceRequest, List Date: Fri, 19 Dec 2025 15:29:35 -0800 Subject: [PATCH 05/29] fixes agent count + test publicIp logging --- .../environment/amazon/AmazonInstance.java | 26 +++++++++++++++++++ .../com/intuit/tank/job/ActJobNodeBean.java | 7 ++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/amazon/AmazonInstance.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/amazon/AmazonInstance.java index 0feb610ea..195327182 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/amazon/AmazonInstance.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/amazon/AmazonInstance.java @@ -293,6 +293,32 @@ public List create(VMRequest request) { // reboot(result); } + // Wait for instances to get public IPs assigned and refresh the details + if (!result.isEmpty()) { + try { + // Brief wait to allow AWS to assign public IPs + Thread.sleep(3000); + List instanceIds = result.stream() + .map(VMInformation::getInstanceId) + .collect(Collectors.toList()); + DescribeInstancesResponse described = ec2AsyncClient.describeInstances( + DescribeInstancesRequest.builder().instanceIds(instanceIds).build() + ).get(); + // Update result with fresh instance data that includes public IPs + List updated = described.reservations().stream() + .flatMap(reservation -> reservation.instances().stream() + .map(instance -> AmazonDataConverter.instanceToVmInformation( + reservation.requesterId(), instance, vmRegion))) + .collect(Collectors.toList()); + result.clear(); + result.addAll(updated); + LOG.debug("Refreshed {} instance details with public IP information", updated.size()); + } catch (InterruptedException e) { + LOG.warn("Interrupted while waiting for public IP assignment", e); + Thread.currentThread().interrupt(); + } + } + } catch (SdkException ae) { LOG.error("Amazon issue starting instances: {} : {}", vmRegion, ae.getMessage(), ae); throw new RuntimeException(ae); diff --git a/web/web_support/src/main/java/com/intuit/tank/job/ActJobNodeBean.java b/web/web_support/src/main/java/com/intuit/tank/job/ActJobNodeBean.java index 99947207d..5f58b39fb 100644 --- a/web/web_support/src/main/java/com/intuit/tank/job/ActJobNodeBean.java +++ b/web/web_support/src/main/java/com/intuit/tank/job/ActJobNodeBean.java @@ -129,7 +129,12 @@ public List getSubNodes() { @Override public List getCurrentSubNodes() { - return vmBeans.stream().filter(vm -> !vm.getStatus().equals(VMStatus.terminated.toString())).collect(Collectors.toList()); + // Filter out both terminated and replaced agents from the count + // terminated = normal shutdown, replaced = watchdog replaced a failed agent + return vmBeans.stream() + .filter(vm -> !vm.getStatus().equals(VMStatus.terminated.toString()) + && !vm.getStatus().equals(VMStatus.replaced.toString())) + .collect(Collectors.toList()); } @Override From eabf716e13d4a271b873adadd09d42c82bae5cc4 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Fri, 19 Dec 2025 16:15:27 -0800 Subject: [PATCH 06/29] fixes tests --- .../environment/amazon/AmazonInstance.java | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/amazon/AmazonInstance.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/amazon/AmazonInstance.java index 195327182..ad226044e 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/amazon/AmazonInstance.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/amazon/AmazonInstance.java @@ -301,21 +301,25 @@ public List create(VMRequest request) { List instanceIds = result.stream() .map(VMInformation::getInstanceId) .collect(Collectors.toList()); - DescribeInstancesResponse described = ec2AsyncClient.describeInstances( - DescribeInstancesRequest.builder().instanceIds(instanceIds).build() - ).get(); - // Update result with fresh instance data that includes public IPs - List updated = described.reservations().stream() - .flatMap(reservation -> reservation.instances().stream() - .map(instance -> AmazonDataConverter.instanceToVmInformation( - reservation.requesterId(), instance, vmRegion))) - .collect(Collectors.toList()); - result.clear(); - result.addAll(updated); - LOG.debug("Refreshed {} instance details with public IP information", updated.size()); + CompletableFuture future = ec2AsyncClient.describeInstances( + DescribeInstancesRequest.builder().instanceIds(instanceIds).build()); + if (future != null) { + DescribeInstancesResponse described = future.get(); + // Update result with fresh instance data that includes public IPs + List updated = described.reservations().stream() + .flatMap(reservation -> reservation.instances().stream() + .map(instance -> AmazonDataConverter.instanceToVmInformation( + reservation.requesterId(), instance, vmRegion))) + .collect(Collectors.toList()); + result.clear(); + result.addAll(updated); + LOG.debug("Refreshed {} instance details with public IP information", updated.size()); + } } catch (InterruptedException e) { LOG.warn("Interrupted while waiting for public IP assignment", e); Thread.currentThread().interrupt(); + } catch (ExecutionException e) { + LOG.warn("Failed to refresh instance details for public IPs: {}", e.getMessage()); } } From 07066e6bf51a37f9205b7b97201c68902642135c Mon Sep 17 00:00:00 2001 From: zkofiro Date: Mon, 5 Jan 2026 15:44:40 -0800 Subject: [PATCH 07/29] Add VMStatus.replaced for AgentWatchdog visibility + thread safety fixes --- .../intuit/tank/agent/models/VMStatus.java | 15 +- .../tank/vm/vmManager/models/VMStatus.java | 15 +- .../rest/mvc/rest/cloud/JobEventSender.java | 6 +- .../mvc/rest/cloud/JobEventSenderTest.java | 264 ++++++++++++++++++ .../intuit/tank/vmManager/VMTrackerImpl.java | 56 +++- .../tank/vmManager/VMTrackerImplTest.java | 105 +++++++ 6 files changed, 437 insertions(+), 24 deletions(-) create mode 100644 rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java create mode 100644 tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java diff --git a/api/src/main/java/com/intuit/tank/agent/models/VMStatus.java b/api/src/main/java/com/intuit/tank/agent/models/VMStatus.java index 7ccd68212..7727ccdd1 100644 --- a/api/src/main/java/com/intuit/tank/agent/models/VMStatus.java +++ b/api/src/main/java/com/intuit/tank/agent/models/VMStatus.java @@ -22,13 +22,18 @@ public enum VMStatus implements Serializable { terminated; public static final VMStatus fromString(String value) { - VMStatus ret = null; + if (value == null || value.isEmpty()) { + return VMStatus.unknown; + } if ("shutting-down".equals(value)) { - ret = VMStatus.shutting_down; - } else { - ret = VMStatus.valueOf(value); + return VMStatus.shutting_down; + } + try { + return VMStatus.valueOf(value); + } catch (IllegalArgumentException e) { + // Gracefully handle unknown values (e.g., 'replaced' from controller which agent doesn't need) + return VMStatus.unknown; } - return ret != null ? ret : VMStatus.unknown; } } diff --git a/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java b/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java index 7dc4312b1..23f0a1c3a 100644 --- a/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java +++ b/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java @@ -30,13 +30,18 @@ public enum VMStatus implements Serializable { replaced; // replaced by AgentWatchdog due to failure to report back public static final VMStatus fromString(String value) { - VMStatus ret = null; + if (value == null || value.isEmpty()) { + return VMStatus.unknown; + } if ("shutting-down".equals(value)) { - ret = VMStatus.shutting_down; - } else { - ret = VMStatus.valueOf(value); + return VMStatus.shutting_down; + } + try { + return VMStatus.valueOf(value); + } catch (IllegalArgumentException e) { + // Gracefully handle unknown values (e.g., from older/newer clients with different enum versions) + return VMStatus.unknown; } - return ret != null ? ret : VMStatus.unknown; } } diff --git a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java index 3e997aa09..9eecb9ec3 100644 --- a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java +++ b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java @@ -271,8 +271,10 @@ public CloudVmStatus getVmStatus(String instanceId) { public void setVmStatus(final String instanceId, final CloudVmStatus status) { vmTracker.setStatus(status); - if (status.getJobStatus() == JobStatus.Completed || status.getVmStatus() == VMStatus.terminated) { - // will terrminate instance after waiting for some cleanup time + if (status.getJobStatus() == JobStatus.Completed + || status.getVmStatus() == VMStatus.terminated + || status.getVmStatus() == VMStatus.replaced) { + // will terminate instance after waiting for some cleanup time terminator.terminate(status.getInstanceId()); // check job status and kill off instances appropriately checkJobStatus(status.getJobId()); diff --git a/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java b/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java new file mode 100644 index 000000000..fb1693fbd --- /dev/null +++ b/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java @@ -0,0 +1,264 @@ +package com.intuit.tank.rest.mvc.rest.cloud; + +import com.intuit.tank.vm.vmManager.VMTracker; +import com.intuit.tank.vm.vmManager.models.CloudVmStatus; +import com.intuit.tank.vm.vmManager.models.CloudVmStatusContainer; +import com.intuit.tank.vm.vmManager.models.VMStatus; +import com.intuit.tank.vm.vmManager.models.ValidationStatus; +import com.intuit.tank.vm.api.enumerated.JobStatus; +import com.intuit.tank.vm.api.enumerated.VMImageType; +import com.intuit.tank.vm.api.enumerated.VMRegion; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.lang.reflect.Method; +import java.util.Date; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.*; + +/** + * Unit tests for JobEventSender + */ +@ExtendWith(MockitoExtension.class) +public class JobEventSenderTest { + + @Mock + private VMTracker vmTracker; + + @InjectMocks + private JobEventSender jobEventSender; + + private CloudVmStatusContainer container; + + @BeforeEach + void setUp() { + container = new CloudVmStatusContainer(); + container.setJobId("123"); + } + + // Helper method to create CloudVmStatus + private CloudVmStatus createStatus(String instanceId, JobStatus jobStatus, VMStatus vmStatus) { + return new CloudVmStatus( + instanceId, + "123", + "sg-test", + jobStatus, + VMImageType.AGENT, + VMRegion.US_WEST_2, + vmStatus, + new ValidationStatus(), + 100, + 50, + new Date(), + null + ); + } + + /** + * Use reflection to test the private getInstancesForJob method. + */ + private List invokeGetInstancesForJob(String jobId) throws Exception { + Method method = JobEventSender.class.getDeclaredMethod("getInstancesForJob", String.class); + method.setAccessible(true); + @SuppressWarnings("unchecked") + List result = (List) method.invoke(jobEventSender, jobId); + return result; + } + + @Test + @DisplayName("getInstancesForJob excludes terminated instances") + void getInstancesForJob_excludesTerminatedInstances() throws Exception { + // Given: Mix of running and terminated instances + container.getStatuses().add(createStatus("i-running1", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-terminated", JobStatus.Stopped, VMStatus.terminated)); + container.getStatuses().add(createStatus("i-running2", JobStatus.Running, VMStatus.running)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When + List instances = invokeGetInstancesForJob("123"); + + // Then: Only running instances should be returned + assertEquals(2, instances.size()); + assertTrue(instances.contains("i-running1")); + assertTrue(instances.contains("i-running2")); + assertFalse(instances.contains("i-terminated")); + } + + @Test + @DisplayName("getInstancesForJob excludes stopped instances") + void getInstancesForJob_excludesStoppedInstances() throws Exception { + // Given + container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-stopped", JobStatus.Stopped, VMStatus.stopped)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When + List instances = invokeGetInstancesForJob("123"); + + // Then + assertEquals(1, instances.size()); + assertTrue(instances.contains("i-running")); + assertFalse(instances.contains("i-stopped")); + } + + @Test + @DisplayName("getInstancesForJob excludes stopping instances") + void getInstancesForJob_excludesStoppingInstances() throws Exception { + // Given + container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-stopping", JobStatus.Stopped, VMStatus.stopping)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When + List instances = invokeGetInstancesForJob("123"); + + // Then + assertEquals(1, instances.size()); + assertTrue(instances.contains("i-running")); + assertFalse(instances.contains("i-stopping")); + } + + @Test + @DisplayName("getInstancesForJob excludes shutting_down instances") + void getInstancesForJob_excludesShuttingDownInstances() throws Exception { + // Given + container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-shuttingdown", JobStatus.Stopped, VMStatus.shutting_down)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When + List instances = invokeGetInstancesForJob("123"); + + // Then + assertEquals(1, instances.size()); + assertTrue(instances.contains("i-running")); + assertFalse(instances.contains("i-shuttingdown")); + } + + @Test + @DisplayName("getInstancesForJob includes pending instances (agents starting)") + void getInstancesForJob_includesPendingInstances() throws Exception { + // Given: Pending instances are still reachable + container.getStatuses().add(createStatus("i-pending", JobStatus.Starting, VMStatus.pending)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When + List instances = invokeGetInstancesForJob("123"); + + // Then: Pending instances should be included + assertEquals(1, instances.size()); + assertTrue(instances.contains("i-pending")); + } + + @Test + @DisplayName("getInstancesForJob includes starting instances") + void getInstancesForJob_includesStartingInstances() throws Exception { + // Given + container.getStatuses().add(createStatus("i-starting", JobStatus.Starting, VMStatus.starting)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When + List instances = invokeGetInstancesForJob("123"); + + // Then + assertEquals(1, instances.size()); + assertTrue(instances.contains("i-starting")); + } + + @Test + @DisplayName("getInstancesForJob returns empty list for non-existent job") + void getInstancesForJob_returnsEmptyForNonExistent() throws Exception { + // Given + when(vmTracker.getVmStatusForJob("999")).thenReturn(null); + + // When + List instances = invokeGetInstancesForJob("999"); + + // Then + assertNotNull(instances); + assertTrue(instances.isEmpty()); + } + + @Test + @DisplayName("getInstancesForJob excludes replaced instances (AgentWatchdog replacements)") + void getInstancesForJob_excludesReplacedInstances() throws Exception { + // Given + container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-replaced", JobStatus.Stopped, VMStatus.terminated)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When + List instances = invokeGetInstancesForJob("123"); + + // Then + assertEquals(1, instances.size()); + assertTrue(instances.contains("i-running")); + assertFalse(instances.contains("i-replaced")); + } + + @Test + @DisplayName("getInstancesForJob filters correctly with mixed statuses from AgentWatchdog scenario") + void getInstancesForJob_agentWatchdogScenario() throws Exception { + // Given: Real-world scenario where AgentWatchdog replaced some agents + // - 3 running agents (healthy) + // - 2 replaced agents (replaced by watchdog, now terminated) + container.getStatuses().add(createStatus("i-healthy1", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-healthy2", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-healthy3", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-replaced1", JobStatus.Stopped, VMStatus.terminated)); + container.getStatuses().add(createStatus("i-replaced2", JobStatus.Stopped, VMStatus.terminated)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When: User tries to kill the job + List instances = invokeGetInstancesForJob("123"); + + // Then: Only healthy instances should receive the kill command + assertEquals(3, instances.size()); + assertTrue(instances.contains("i-healthy1")); + assertTrue(instances.contains("i-healthy2")); + assertTrue(instances.contains("i-healthy3")); + assertFalse(instances.contains("i-replaced1")); + assertFalse(instances.contains("i-replaced2")); + } + + @Test + @DisplayName("getVmStatus delegates to vmTracker") + void getVmStatus_delegatesToVmTracker() { + String instanceId = "i-test123"; + CloudVmStatus expectedStatus = createStatus(instanceId, JobStatus.Running, VMStatus.running); + when(vmTracker.getStatus(instanceId)).thenReturn(expectedStatus); + + CloudVmStatus result = jobEventSender.getVmStatus(instanceId); + + assertEquals(expectedStatus, result); + verify(vmTracker).getStatus(instanceId); + } + + @Test + @DisplayName("getVmStatusForJob delegates to vmTracker") + void getVmStatusForJob_delegatesToVmTracker() { + String jobId = "123"; + when(vmTracker.getVmStatusForJob(jobId)).thenReturn(container); + + CloudVmStatusContainer result = jobEventSender.getVmStatusForJob(jobId); + + assertEquals(container, result); + verify(vmTracker).getVmStatusForJob(jobId); + } +} + diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java index 01b3ee569..75fd934b2 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java @@ -19,9 +19,11 @@ import static com.intuit.tank.vm.common.TankConstants.NOTIFICATIONS_EVENT_EVENT_TIME_KEY; import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ArrayBlockingQueue; @@ -235,17 +237,26 @@ private JobQueueStatus getQueueStatus(JobQueueStatus oldStatus, JobStatus jobSta /** * If the vm is shutting down, terminated, or replaced, don't update the status to something else. + * This prevents race conditions where a user kills an instance while watchdog is replacing it, + * or stale status updates arrive after an instance has already transitioned to a terminal state. * @param currentStatus * @return */ private boolean shouldUpdateStatus(CloudVmStatus currentStatus) { if (currentStatus != null) { VMStatus status = currentStatus.getVmStatus(); - return (status != VMStatus.shutting_down - && status != VMStatus.stopped - && status != VMStatus.stopping - && status != VMStatus.terminated - && status != VMStatus.replaced); + boolean isTerminalState = (status == VMStatus.shutting_down + || status == VMStatus.stopped + || status == VMStatus.stopping + || status == VMStatus.terminated + || status == VMStatus.replaced); + if (isTerminalState) { + LOG.info(new ObjectMessage(Map.of("Message", + "Ignoring status update for instance " + currentStatus.getInstanceId() + + " - already in terminal state: " + status + + " (possible race between user kill and watchdog replace)"))); + return false; + } } return true; } @@ -282,12 +293,19 @@ public void removeStatusForInstance(String instanceId) { */ @Override public void removeStatusForJob(String jobId) { - CloudVmStatusContainer cloudVmStatusContainer = jobMap.get(jobId); - if (cloudVmStatusContainer != null) { - for (CloudVmStatus s : cloudVmStatusContainer.getStatuses()) { - removeStatusForInstance(s.getInstanceId()); + // Synchronize on the same lock used by setStatusThread to prevent + // ConcurrentModificationException when iterating over statuses + synchronized (getCacheSyncObject(jobId)) { + CloudVmStatusContainer cloudVmStatusContainer = jobMap.get(jobId); + if (cloudVmStatusContainer != null) { + // Copy to avoid ConcurrentModificationException - the underlying + // HashSet could be modified by setStatusThread while we iterate + List statusesCopy = new ArrayList<>(cloudVmStatusContainer.getStatuses()); + for (CloudVmStatus s : statusesCopy) { + removeStatusForInstance(s.getInstanceId()); + } + jobMap.remove(jobId); } - jobMap.remove(jobId); } } @@ -344,11 +362,15 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine JobInstance job = jobInstanceDao.get().findById(Integer.parseInt(status.getJobId())); // Log all statuses in container for debugging + // Take a snapshot to avoid ConcurrentModificationException if another thread modifies + // the set while we iterate (defense in depth, even though we're synchronized) + Set statusesSnapshot = new HashSet<>(cloudVmStatusContainer.getStatuses()); LOG.info(new ObjectMessage(Map.of("Message", "Status calculation for job " + status.getJobId() + " - Container has " + - cloudVmStatusContainer.getStatuses().size() + " statuses"))); + statusesSnapshot.size() + " statuses"))); - for (CloudVmStatus s : cloudVmStatusContainer.getStatuses()) { + int activeInstanceCount = 0; + for (CloudVmStatus s : statusesSnapshot) { VMStatus vmStatus = s.getVmStatus(); // skip replaced instances - these were replaced by AgentWatchdog due to failure // but do NOT skip terminated/stopping/stopped/shutting_down - these are active agents in transition @@ -358,6 +380,7 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine " in job status calculation for job " + status.getJobId()))); continue; } + activeInstanceCount++; LOG.info(new ObjectMessage(Map.of("Message", "Checking instance " + s.getInstanceId() + @@ -381,6 +404,15 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine } } + // if all instances are replaced and replacements haven't reported yet, + // don't change job status - wait for active agents to report + if (activeInstanceCount == 0) { + LOG.info(new ObjectMessage(Map.of("Message", + "No active instances for job " + status.getJobId() + + " (all replaced or empty) - skipping status calculation until replacements report"))); + return; + } + LOG.debug(new ObjectMessage(Map.of("Message", "Status calc complete for job " + status.getJobId() + " - isFinished=" + isFinished + ", paused=" + paused + diff --git a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java new file mode 100644 index 000000000..5c5690b79 --- /dev/null +++ b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java @@ -0,0 +1,105 @@ +package com.intuit.tank.vmManager; + +import com.intuit.tank.vm.vmManager.models.CloudVmStatusContainer; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.DisplayName; + +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Unit tests for VMTrackerImpl, focusing on the bug fix for terminated agents + * blocking job status transitions. + */ +public class VMTrackerImplTest { + + private VMTrackerImpl vmTracker; + + @BeforeEach + void setUp() { + vmTracker = new VMTrackerImpl(); + } + + @Test + @DisplayName("getStatus returns null for non-existent instance") + void getStatus_returnsNullForNonExistent() { + assertNull(vmTracker.getStatus("non-existent-instance")); + } + + @Test + @DisplayName("removeStatusForInstance handles non-existent instance gracefully") + void removeStatusForInstance_handlesNonExistent() { + String instanceId = "i-test123"; + + // First verify getStatus returns null initially + assertNull(vmTracker.getStatus(instanceId)); + + // Call removeStatusForInstance - should handle non-existent gracefully (no exception) + vmTracker.removeStatusForInstance(instanceId); + + // Should still be null (no exception thrown) + assertNull(vmTracker.getStatus(instanceId)); + } + + @Test + @DisplayName("getVmStatusForJob returns null for non-existent job") + void getVmStatusForJob_returnsNullForNonExistent() { + assertNull(vmTracker.getVmStatusForJob("non-existent-job")); + } + + @Test + @DisplayName("getAllJobs returns empty set initially") + void getAllJobs_returnsEmptySetInitially() { + Set jobs = vmTracker.getAllJobs(); + assertNotNull(jobs); + assertTrue(jobs.isEmpty()); + } + + @Test + @DisplayName("stopJob marks job as stopped") + void stopJob_marksJobAsStopped() { + String jobId = "123"; + + // Initially running + assertTrue(vmTracker.isRunning(jobId)); + + // Stop the job + vmTracker.stopJob(jobId); + + // Now stopped + assertFalse(vmTracker.isRunning(jobId)); + } + + @Test + @DisplayName("isRunning returns true for jobs not explicitly stopped") + void isRunning_returnsTrueByDefault() { + assertTrue(vmTracker.isRunning("any-job-id")); + } + + @Test + @DisplayName("removeStatusForJob handles non-existent job gracefully") + void removeStatusForJob_handlesNonExistent() { + // Should not throw + vmTracker.removeStatusForJob("non-existent-job"); + + // Verify state is still consistent + assertNull(vmTracker.getVmStatusForJob("non-existent-job")); + } + + @Test + @DisplayName("isDevMode returns false by default") + void isDevMode_returnsFalseByDefault() { + // The devMode is set based on TankConfig which defaults to false + // unless explicitly configured as standalone + assertFalse(vmTracker.isDevMode()); + } + + @Test + @DisplayName("getProjectStatusContainer returns null for non-existent project") + void getProjectStatusContainer_returnsNullForNonExistent() { + assertNull(vmTracker.getProjectStatusContainer("non-existent-project")); + } +} + From c5cfdfba83a4a7486a71f526e25dae1313893f88 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Tue, 6 Jan 2026 14:16:04 -0800 Subject: [PATCH 08/29] up that test coverage for new code --- .../tank/agent/models/VMStatusTest.java | 75 +++++++++++ .../vm/vmManager/models/VMStatusTest.java | 101 ++++++++++----- .../mvc/rest/cloud/JobEventSenderTest.java | 33 ++++- .../tank/vmManager/VMTrackerImplTest.java | 97 ++++++++++++++- .../intuit/tank/job/ActJobNodeBeanTest.java | 116 ++++++++++++++++++ 5 files changed, 382 insertions(+), 40 deletions(-) create mode 100644 api/src/test/java/com/intuit/tank/agent/models/VMStatusTest.java diff --git a/api/src/test/java/com/intuit/tank/agent/models/VMStatusTest.java b/api/src/test/java/com/intuit/tank/agent/models/VMStatusTest.java new file mode 100644 index 000000000..5360a0ee5 --- /dev/null +++ b/api/src/test/java/com/intuit/tank/agent/models/VMStatusTest.java @@ -0,0 +1,75 @@ +package com.intuit.tank.agent.models; + +import org.junit.jupiter.api.*; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Tests for the agent-side VMStatus enum. + * This enum is used by agents and doesn't include the 'replaced' status + * (which is only meaningful on the controller side). + */ +public class VMStatusTest { + + @Test + @DisplayName("fromString handles shutting-down special case") + public void testFromString_shuttingDown() { + VMStatus result = VMStatus.fromString("shutting-down"); + + assertNotNull(result); + assertEquals(VMStatus.shutting_down, result); + } + + @Test + @DisplayName("fromString returns correct enum for valid values") + public void testFromString_validValues() { + assertEquals(VMStatus.running, VMStatus.fromString("running")); + assertEquals(VMStatus.pending, VMStatus.fromString("pending")); + assertEquals(VMStatus.starting, VMStatus.fromString("starting")); + assertEquals(VMStatus.rebooting, VMStatus.fromString("rebooting")); + assertEquals(VMStatus.terminated, VMStatus.fromString("terminated")); + assertEquals(VMStatus.stopped, VMStatus.fromString("stopped")); + assertEquals(VMStatus.stopping, VMStatus.fromString("stopping")); + assertEquals(VMStatus.rampPaused, VMStatus.fromString("rampPaused")); + } + + @Test + @DisplayName("fromString returns unknown for null input") + public void testFromString_nullReturnsUnknown() { + VMStatus result = VMStatus.fromString(null); + + assertNotNull(result); + assertEquals(VMStatus.unknown, result); + } + + @Test + @DisplayName("fromString returns unknown for empty string") + public void testFromString_emptyReturnsUnknown() { + VMStatus result = VMStatus.fromString(""); + + assertNotNull(result); + assertEquals(VMStatus.unknown, result); + } + + @Test + @DisplayName("fromString returns unknown for unrecognized values") + public void testFromString_unknownValueReturnsUnknown() { + // Should not throw IllegalArgumentException - gracefully returns unknown + VMStatus result = VMStatus.fromString("garbage-value"); + + assertNotNull(result); + assertEquals(VMStatus.unknown, result); + } + + @Test + @DisplayName("fromString handles 'replaced' from controller gracefully (returns unknown)") + public void testFromString_replacedFromControllerReturnsUnknown() { + // The agent-side VMStatus doesn't have a 'replaced' enum value + // When the controller sends 'replaced', the agent should handle it gracefully + VMStatus result = VMStatus.fromString("replaced"); + + assertNotNull(result); + assertEquals(VMStatus.unknown, result); + } +} + diff --git a/api/src/test/java/com/intuit/tank/vm/vmManager/models/VMStatusTest.java b/api/src/test/java/com/intuit/tank/vm/vmManager/models/VMStatusTest.java index 117793759..66f250585 100644 --- a/api/src/test/java/com/intuit/tank/vm/vmManager/models/VMStatusTest.java +++ b/api/src/test/java/com/intuit/tank/vm/vmManager/models/VMStatusTest.java @@ -19,50 +19,85 @@ /** * The class VMStatusTest contains tests for the class {@link VMStatus}. - * - * @generatedBy CodePro at 12/15/14 2:57 PM */ public class VMStatusTest { - /** - * Run the VMStatus fromString(String) method test. - * - * @throws Exception - * - * @generatedBy CodePro at 12/15/14 2:57 PM - */ + @Test - @Disabled - public void testFromString_1() - throws Exception { - String value = "shutting-down"; + @DisplayName("fromString handles shutting-down special case") + public void testFromString_shuttingDown() { + VMStatus result = VMStatus.fromString("shutting-down"); + + assertNotNull(result); + assertEquals(VMStatus.shutting_down, result); + } - VMStatus result = VMStatus.fromString(value); + @Test + @DisplayName("fromString returns correct enum for valid values") + public void testFromString_validValues() { + assertEquals(VMStatus.running, VMStatus.fromString("running")); + assertEquals(VMStatus.pending, VMStatus.fromString("pending")); + assertEquals(VMStatus.starting, VMStatus.fromString("starting")); + assertEquals(VMStatus.ready, VMStatus.fromString("ready")); + assertEquals(VMStatus.rebooting, VMStatus.fromString("rebooting")); + assertEquals(VMStatus.terminated, VMStatus.fromString("terminated")); + assertEquals(VMStatus.stopped, VMStatus.fromString("stopped")); + assertEquals(VMStatus.stopping, VMStatus.fromString("stopping")); + assertEquals(VMStatus.rampPaused, VMStatus.fromString("rampPaused")); + } + @Test + @DisplayName("fromString returns replaced for 'replaced' value") + public void testFromString_replaced() { + VMStatus result = VMStatus.fromString("replaced"); + assertNotNull(result); - assertEquals("shutting_down", result.name()); - assertEquals("shutting_down", result.toString()); - assertEquals(6, result.ordinal()); + assertEquals(VMStatus.replaced, result); } - /** - * Run the VMStatus fromString(String) method test. - * - * @throws Exception - * - * @generatedBy CodePro at 12/15/14 2:57 PM - */ @Test - public void testFromString_2() - throws Exception { - String value = VMStatus.rebooting.name(); + @DisplayName("fromString returns unknown for null input") + public void testFromString_nullReturnsUnknown() { + VMStatus result = VMStatus.fromString(null); + + assertNotNull(result); + assertEquals(VMStatus.unknown, result); + } - VMStatus result = VMStatus.fromString(value); + @Test + @DisplayName("fromString returns unknown for empty string") + public void testFromString_emptyReturnsUnknown() { + VMStatus result = VMStatus.fromString(""); + + assertNotNull(result); + assertEquals(VMStatus.unknown, result); + } - // An unexpected exception was thrown in user code while executing this test: - // java.lang.IllegalArgumentException: No enum constant com.intuit.tank.vm.vmManager.models.VMStatus. - // at java.lang.Enum.valueOf(Enum.java:238) - // at com.intuit.tank.vm.vmManager.models.VMStatus.valueOf(VMStatus.java:5) - // at com.intuit.tank.vm.vmManager.models.VMStatus.fromString(VMStatus.java:20) + @Test + @DisplayName("fromString returns unknown for unrecognized values (graceful handling)") + public void testFromString_unknownValueReturnsUnknown() { + // Should not throw IllegalArgumentException - gracefully returns unknown + VMStatus result = VMStatus.fromString("garbage-value"); + assertNotNull(result); + assertEquals(VMStatus.unknown, result); + } + + @Test + @DisplayName("fromString handles future/unknown enum values gracefully") + public void testFromString_futureCompatibility() { + // Simulates receiving a value from a newer version of the API + VMStatus result = VMStatus.fromString("some_new_status_from_future"); + + assertNotNull(result); + assertEquals(VMStatus.unknown, result); + } + + @Test + @DisplayName("replaced is a terminal state (for documentation)") + public void testReplaced_isTerminalState() { + // This test documents that 'replaced' is intended as a terminal state + // like 'terminated', used by AgentWatchdog when replacing failed agents + assertNotNull(VMStatus.replaced); + assertEquals("replaced", VMStatus.replaced.name()); } } \ No newline at end of file diff --git a/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java b/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java index fb1693fbd..a6576e4aa 100644 --- a/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java +++ b/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java @@ -195,16 +195,16 @@ void getInstancesForJob_returnsEmptyForNonExistent() throws Exception { @Test @DisplayName("getInstancesForJob excludes replaced instances (AgentWatchdog replacements)") void getInstancesForJob_excludesReplacedInstances() throws Exception { - // Given + // Given: VMStatus.replaced is set by AgentWatchdog when an agent fails and is replaced container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); - container.getStatuses().add(createStatus("i-replaced", JobStatus.Stopped, VMStatus.terminated)); + container.getStatuses().add(createStatus("i-replaced", JobStatus.Starting, VMStatus.replaced)); when(vmTracker.getVmStatusForJob("123")).thenReturn(container); // When List instances = invokeGetInstancesForJob("123"); - // Then + // Then: Replaced instances should be filtered out (can't receive commands) assertEquals(1, instances.size()); assertTrue(instances.contains("i-running")); assertFalse(instances.contains("i-replaced")); @@ -215,12 +215,12 @@ void getInstancesForJob_excludesReplacedInstances() throws Exception { void getInstancesForJob_agentWatchdogScenario() throws Exception { // Given: Real-world scenario where AgentWatchdog replaced some agents // - 3 running agents (healthy) - // - 2 replaced agents (replaced by watchdog, now terminated) + // - 2 replaced agents (marked by watchdog with VMStatus.replaced) container.getStatuses().add(createStatus("i-healthy1", JobStatus.Running, VMStatus.running)); container.getStatuses().add(createStatus("i-healthy2", JobStatus.Running, VMStatus.running)); container.getStatuses().add(createStatus("i-healthy3", JobStatus.Running, VMStatus.running)); - container.getStatuses().add(createStatus("i-replaced1", JobStatus.Stopped, VMStatus.terminated)); - container.getStatuses().add(createStatus("i-replaced2", JobStatus.Stopped, VMStatus.terminated)); + container.getStatuses().add(createStatus("i-replaced1", JobStatus.Starting, VMStatus.replaced)); + container.getStatuses().add(createStatus("i-replaced2", JobStatus.Starting, VMStatus.replaced)); when(vmTracker.getVmStatusForJob("123")).thenReturn(container); @@ -236,6 +236,27 @@ void getInstancesForJob_agentWatchdogScenario() throws Exception { assertFalse(instances.contains("i-replaced2")); } + @Test + @DisplayName("getInstancesForJob excludes all terminal states correctly") + void getInstancesForJob_excludesAllTerminalStates() throws Exception { + // Given: One instance of each terminal state + one running + container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); + container.getStatuses().add(createStatus("i-terminated", JobStatus.Completed, VMStatus.terminated)); + container.getStatuses().add(createStatus("i-replaced", JobStatus.Starting, VMStatus.replaced)); + container.getStatuses().add(createStatus("i-stopped", JobStatus.Stopped, VMStatus.stopped)); + container.getStatuses().add(createStatus("i-stopping", JobStatus.Stopped, VMStatus.stopping)); + container.getStatuses().add(createStatus("i-shutting-down", JobStatus.Stopped, VMStatus.shutting_down)); + + when(vmTracker.getVmStatusForJob("123")).thenReturn(container); + + // When + List instances = invokeGetInstancesForJob("123"); + + // Then: Only the running instance should be included + assertEquals(1, instances.size()); + assertTrue(instances.contains("i-running")); + } + @Test @DisplayName("getVmStatus delegates to vmTracker") void getVmStatus_delegatesToVmTracker() { diff --git a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java index 5c5690b79..c329962b0 100644 --- a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java +++ b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java @@ -1,17 +1,27 @@ package com.intuit.tank.vmManager; +import com.intuit.tank.vm.vmManager.models.CloudVmStatus; import com.intuit.tank.vm.vmManager.models.CloudVmStatusContainer; +import com.intuit.tank.vm.vmManager.models.VMStatus; +import com.intuit.tank.vm.vmManager.models.ValidationStatus; +import com.intuit.tank.vm.api.enumerated.JobStatus; +import com.intuit.tank.vm.api.enumerated.VMImageType; +import com.intuit.tank.vm.api.enumerated.VMRegion; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import java.lang.reflect.Method; +import java.util.Date; import java.util.Set; import static org.junit.jupiter.api.Assertions.*; /** * Unit tests for VMTrackerImpl, focusing on the bug fix for terminated agents - * blocking job status transitions. + * blocking job status transitions and the VMStatus.replaced handling. */ public class VMTrackerImplTest { @@ -22,6 +32,8 @@ void setUp() { vmTracker = new VMTrackerImpl(); } + // ============ Basic functionality tests ============ + @Test @DisplayName("getStatus returns null for non-existent instance") void getStatus_returnsNullForNonExistent() { @@ -101,5 +113,88 @@ void isDevMode_returnsFalseByDefault() { void getProjectStatusContainer_returnsNullForNonExistent() { assertNull(vmTracker.getProjectStatusContainer("non-existent-project")); } + + // ============ shouldUpdateStatus tests (terminal state handling) ============ + + /** + * Use reflection to test the private shouldUpdateStatus method. + */ + private boolean invokeShouldUpdateStatus(CloudVmStatus status) throws Exception { + Method method = VMTrackerImpl.class.getDeclaredMethod("shouldUpdateStatus", CloudVmStatus.class); + method.setAccessible(true); + return (boolean) method.invoke(vmTracker, status); + } + + private CloudVmStatus createStatus(String instanceId, VMStatus vmStatus) { + return new CloudVmStatus( + instanceId, + "123", + "sg-test", + JobStatus.Starting, + VMImageType.AGENT, + VMRegion.US_WEST_2, + vmStatus, + new ValidationStatus(), + 100, + 50, + new Date(), + null + ); + } + + @Test + @DisplayName("shouldUpdateStatus returns true for null current status") + void shouldUpdateStatus_nullCurrentStatus_returnsTrue() throws Exception { + assertTrue(invokeShouldUpdateStatus(null)); + } + + @ParameterizedTest + @EnumSource(value = VMStatus.class, names = {"terminated", "replaced", "stopped", "stopping", "shutting_down"}) + @DisplayName("shouldUpdateStatus returns false for terminal states") + void shouldUpdateStatus_terminalStates_returnsFalse(VMStatus terminalStatus) throws Exception { + CloudVmStatus status = createStatus("i-123", terminalStatus); + assertFalse(invokeShouldUpdateStatus(status), + "Should reject updates for terminal state: " + terminalStatus); + } + + @ParameterizedTest + @EnumSource(value = VMStatus.class, names = {"unknown", "starting", "pending", "ready", "running", "rampPaused", "rebooting"}) + @DisplayName("shouldUpdateStatus returns true for active states") + void shouldUpdateStatus_activeStates_returnsTrue(VMStatus activeStatus) throws Exception { + CloudVmStatus status = createStatus("i-123", activeStatus); + assertTrue(invokeShouldUpdateStatus(status), + "Should allow updates for active state: " + activeStatus); + } + + @Test + @DisplayName("shouldUpdateStatus blocks update for replaced instance (AgentWatchdog scenario)") + void shouldUpdateStatus_replacedInstance_blocksUpdate() throws Exception { + // Given: An instance that was replaced by AgentWatchdog + CloudVmStatus replacedStatus = createStatus("i-replaced", VMStatus.replaced); + + // When/Then: Updates should be blocked + assertFalse(invokeShouldUpdateStatus(replacedStatus), + "Replaced instances should not accept status updates (race condition guard)"); + } + + // ============ VMStatus.replaced integration tests ============ + + @Test + @DisplayName("replaced status is recognized as a valid VMStatus") + void replacedStatus_isValidEnum() { + assertNotNull(VMStatus.replaced); + assertEquals("replaced", VMStatus.replaced.name()); + } + + @Test + @DisplayName("setStatus respects terminal state guard for replaced instances") + void setStatus_respectsTerminalGuard_forReplaced() throws Exception { + // Note: This is a partial test - full integration requires mocked dependencies + // The key behavior is tested via shouldUpdateStatus tests above + + // Verify the enum value exists and can be used + CloudVmStatus status = createStatus("i-test", VMStatus.replaced); + assertEquals(VMStatus.replaced, status.getVmStatus()); + } } diff --git a/web/web_support/src/test/java/com/intuit/tank/job/ActJobNodeBeanTest.java b/web/web_support/src/test/java/com/intuit/tank/job/ActJobNodeBeanTest.java index 54431df82..378ab5f94 100644 --- a/web/web_support/src/test/java/com/intuit/tank/job/ActJobNodeBeanTest.java +++ b/web/web_support/src/test/java/com/intuit/tank/job/ActJobNodeBeanTest.java @@ -745,4 +745,120 @@ public void testSetVmBeans_1() // at com.intuit.tank.project.JobConfiguration.(JobConfiguration.java:63) // at com.intuit.tank.project.Workload.(Workload.java:57) } + + // ============ getCurrentSubNodes tests (VMStatus.replaced filtering) ============ + + /** + * Helper to create a VMNodeBean with a specific VMStatus. + */ + private VMNodeBean createVMNodeBean(String instanceId, VMStatus vmStatus) { + CloudVmStatus cvs = new CloudVmStatus( + instanceId, + "123", + "sg-1", + JobStatus.Running, + VMImageType.AGENT, + VMRegion.US_EAST_2, + vmStatus, + new ValidationStatus(), + 10, + 100, + new Date(), + null + ); + return new VMNodeBean(cvs, true, FastDateFormat.getDateTimeInstance(FastDateFormat.MEDIUM, FastDateFormat.MEDIUM)); + } + + @Test + public void testGetCurrentSubNodes_filtersReplacedInstances() throws Exception { + // Setup + Workload workload = new Workload(); + workload.setJobConfiguration(new JobConfiguration()); + JobInstance jobInstance = new JobInstance(workload, "test"); + jobInstance.setStatus(JobQueueStatus.Running); + ActJobNodeBean fixture = new ActJobNodeBean(jobInstance, true, FastDateFormat.getDateTimeInstance(FastDateFormat.MEDIUM, FastDateFormat.MEDIUM)); + + List vmBeans = new LinkedList<>(); + vmBeans.add(createVMNodeBean("i-001", VMStatus.running)); + vmBeans.add(createVMNodeBean("i-002", VMStatus.running)); + vmBeans.add(createVMNodeBean("i-003", VMStatus.replaced)); // Should be filtered + vmBeans.add(createVMNodeBean("i-004", VMStatus.running)); + fixture.setVmBeans(vmBeans); + + // Execute + List result = fixture.getCurrentSubNodes(); + + // Verify: replaced instance should be filtered out + assertEquals(3, result.size(), "Should exclude replaced instance"); + assertEquals(4, fixture.getSubNodes().size(), "getSubNodes should include ALL"); + } + + @Test + public void testGetCurrentSubNodes_filtersTerminatedInstances() throws Exception { + // Setup + Workload workload = new Workload(); + workload.setJobConfiguration(new JobConfiguration()); + JobInstance jobInstance = new JobInstance(workload, "test"); + jobInstance.setStatus(JobQueueStatus.Running); + ActJobNodeBean fixture = new ActJobNodeBean(jobInstance, true, FastDateFormat.getDateTimeInstance(FastDateFormat.MEDIUM, FastDateFormat.MEDIUM)); + + List vmBeans = new LinkedList<>(); + vmBeans.add(createVMNodeBean("i-001", VMStatus.running)); + vmBeans.add(createVMNodeBean("i-002", VMStatus.terminated)); // Should be filtered + vmBeans.add(createVMNodeBean("i-003", VMStatus.running)); + fixture.setVmBeans(vmBeans); + + // Execute + List result = fixture.getCurrentSubNodes(); + + // Verify + assertEquals(2, result.size(), "Should exclude terminated instance"); + } + + @Test + public void testGetCurrentSubNodes_filtersBothReplacedAndTerminated() throws Exception { + // Setup: Mix of running, replaced, and terminated + Workload workload = new Workload(); + workload.setJobConfiguration(new JobConfiguration()); + JobInstance jobInstance = new JobInstance(workload, "test"); + jobInstance.setStatus(JobQueueStatus.Running); + ActJobNodeBean fixture = new ActJobNodeBean(jobInstance, true, FastDateFormat.getDateTimeInstance(FastDateFormat.MEDIUM, FastDateFormat.MEDIUM)); + + List vmBeans = new LinkedList<>(); + vmBeans.add(createVMNodeBean("i-001", VMStatus.running)); + vmBeans.add(createVMNodeBean("i-002", VMStatus.running)); + vmBeans.add(createVMNodeBean("i-003", VMStatus.replaced)); // Should be filtered + vmBeans.add(createVMNodeBean("i-004", VMStatus.terminated)); // Should be filtered + vmBeans.add(createVMNodeBean("i-005", VMStatus.running)); + fixture.setVmBeans(vmBeans); + + // Execute + List result = fixture.getCurrentSubNodes(); + + // Verify: Both replaced and terminated should be filtered + assertEquals(3, result.size(), "Should only include running instances"); + assertEquals(5, fixture.getSubNodes().size(), "getSubNodes should include ALL for visibility"); + } + + @Test + public void testGetCurrentSubNodes_allReplaced_returnsEmpty() throws Exception { + // Setup: Edge case where all instances are replaced + Workload workload = new Workload(); + workload.setJobConfiguration(new JobConfiguration()); + JobInstance jobInstance = new JobInstance(workload, "test"); + jobInstance.setStatus(JobQueueStatus.Starting); + ActJobNodeBean fixture = new ActJobNodeBean(jobInstance, true, FastDateFormat.getDateTimeInstance(FastDateFormat.MEDIUM, FastDateFormat.MEDIUM)); + + List vmBeans = new LinkedList<>(); + vmBeans.add(createVMNodeBean("i-001", VMStatus.replaced)); + vmBeans.add(createVMNodeBean("i-002", VMStatus.replaced)); + fixture.setVmBeans(vmBeans); + + // Execute + List result = fixture.getCurrentSubNodes(); + + // Verify + assertEquals(0, result.size(), "Should return empty when all replaced"); + assertEquals(2, fixture.getSubNodes().size(), "All should still be visible in getSubNodes"); + } } \ No newline at end of file From 574b01e992fae7e878a7ac3f17f2010de8bb86c7 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Tue, 6 Jan 2026 15:46:05 -0800 Subject: [PATCH 09/29] we can undo the bandaid 'pending' agent status implemented for replacing agents now --- .../intuit/tank/vmManager/AgentWatchdog.java | 25 ++-- .../tank/vmManager/AgentWatchdogTest.java | 114 ++++++++++++++++++ 2 files changed, 132 insertions(+), 7 deletions(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index 793d60e5e..6546e9f45 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -246,12 +246,18 @@ private void relaunch(ArrayList instances) { instances.clear(); // Create and send instance start request List newVms = amazonInstance.create(instanceRequest); - // Add new instances + // Add new instances - set to 'starting' status so watchdog waits for actual /v2/agent/ready call for (VMInformation newInfo : newVms) { vmInfo.add(newInfo); - // Add directly to started instances since these are restarted from scratch + // Add to startedInstances - watchdog will wait for this agent to actually report startedInstances.add(newInfo); - vmTracker.setStatus(createCloudStatus(instanceRequest, newInfo)); + CloudVmStatus newStatus = createCloudStatus(instanceRequest, newInfo); + vmTracker.setStatus(newStatus); + LOG.info(new ObjectMessage(Map.of( + "Message", "Created replacement agent with status " + newStatus.getVmStatus() + + " - watchdog will wait for /v2/agent/ready call", + "instanceId", newInfo.getInstanceId(), + "jobId", jobId))); // TEMP_IP_LOGGING - START LOG.info(new ObjectMessage(Map.of( "Message", "Added relaunched image to VMImage table for job " + jobId, @@ -273,15 +279,20 @@ private void relaunch(ArrayList instances) { } /** - * @param req - * @param info - * @return + * Creates initial cloud status for a newly launched replacement agent. + * CRITICAL: Must use VMStatus.starting (not pending) so watchdog waits for actual agent registration. + * + * Status flow: starting → (agent calls /v2/agent/ready) → pending → (receives START) → ready → running + * + * @param req the instance request + * @param info the VM information for the new instance + * @return CloudVmStatus with starting state */ private CloudVmStatus createCloudStatus(VMInstanceRequest req, VMInformation info) { return new CloudVmStatus(info.getInstanceId(), req.getJobId(), req.getInstanceDescription() != null ? req.getInstanceDescription().getSecurityGroup() : "unknown", JobStatus.Starting, - VMImageType.AGENT, req.getRegion(), VMStatus.pending, new ValidationStatus(), 0, 0, null, null); + VMImageType.AGENT, req.getRegion(), VMStatus.starting, new ValidationStatus(), 0, 0, null, null); } private boolean shouldRelaunchInstances() { diff --git a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/AgentWatchdogTest.java b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/AgentWatchdogTest.java index e43775931..82f714472 100644 --- a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/AgentWatchdogTest.java +++ b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/AgentWatchdogTest.java @@ -17,6 +17,7 @@ import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; +import java.lang.reflect.Method; import java.util.ArrayList; import java.util.Collections; import java.util.Date; @@ -98,4 +99,117 @@ public void progressToPendingRunTest(@Mock VMTracker vmTrackerMock, @Mock CloudV verify(cloudVmStatusContainerMock, times(2)).getEndTime(); verify(cloudVmStatusContainerMock, times(2)).getStatuses(); } + + /** + * CRITICAL TEST: Verifies that replacement agents are created with VMStatus.starting, + * NOT VMStatus.pending. This is essential because: + * + * - checkForReportingInstances() considers 'pending' as "agent has reported" + * - If replacements start with 'pending', they're immediately moved to reportedInstances + * - This causes watchdog to exit before agents actually call /v2/agent/ready + * - JobManager then waits forever for registrations that never come + * + * See: https://github.intuit.com/user/Tank/issues/XXX (if applicable) + */ + @Test + public void createCloudStatus_usesStartingNotPending() throws Exception { + VMTracker vmTracker = new VMTrackerImpl(); + VMInstanceRequest instanceRequest = new VMInstanceRequest(); + instanceRequest.setJobId("test-job-123"); + instanceRequest.setRegion(VMRegion.STANDALONE); + List vmInfo = new ArrayList<>(); + + AgentWatchdog agentWatchdog = new AgentWatchdog(instanceRequest, vmInfo, vmTracker); + + // Use reflection to access private createCloudStatus method + Method createCloudStatus = AgentWatchdog.class.getDeclaredMethod( + "createCloudStatus", VMInstanceRequest.class, VMInformation.class); + createCloudStatus.setAccessible(true); + + VMInformation vmInformation = new VMInformation(); + vmInformation.setInstanceId("i-replacement-test"); + + CloudVmStatus result = (CloudVmStatus) createCloudStatus.invoke( + agentWatchdog, instanceRequest, vmInformation); + + // THE CRITICAL ASSERTION: Must be 'starting', not 'pending' + assertEquals(VMStatus.starting, result.getVmStatus(), + "Replacement agents MUST start with VMStatus.starting so watchdog waits for actual /v2/agent/ready call. " + + "Using 'pending' causes watchdog to immediately consider them as 'reported' and exit prematurely."); + + // Verify other fields are correct + assertEquals("test-job-123", result.getJobId()); + assertEquals("i-replacement-test", result.getInstanceId()); + assertEquals(JobStatus.Starting, result.getJobStatus()); + } + + /** + * Verifies the checkForReportingInstances() logic only considers 'pending' agents as reported. + * This test ensures that 'starting' agents are NOT moved to reportedInstances. + */ + @Test + public void checkForReportingInstances_onlyMovesPendingAgents( + @Mock VMTracker vmTrackerMock, + @Mock CloudVmStatusContainer cloudVmStatusContainerMock) { + + // Setup: Two agents - one starting (should wait), one pending (should be reported) + when(cloudVmStatusContainerMock.getEndTime()).thenReturn(null); + + CloudVmStatus vmstatusStarting = new CloudVmStatus( + "i-still-starting", "123", "sg-123456", + JobStatus.Starting, VMImageType.AGENT, VMRegion.STANDALONE, + VMStatus.starting, new ValidationStatus(), 1, 1, new Date(), new Date()); + + CloudVmStatus vmstatusPending = new CloudVmStatus( + "i-reported", "123", "sg-123456", + JobStatus.Starting, VMImageType.AGENT, VMRegion.STANDALONE, + VMStatus.pending, new ValidationStatus(), 1, 1, new Date(), new Date()); + + Set statuses = new HashSet<>(); + statuses.add(vmstatusStarting); + statuses.add(vmstatusPending); + when(cloudVmStatusContainerMock.getStatuses()).thenReturn(statuses); + when(vmTrackerMock.getVmStatusForJob(null)).thenReturn(cloudVmStatusContainerMock); + + // Create VMInformation for both agents + VMInformation vmInfoStarting = new VMInformation(); + vmInfoStarting.setState("pending"); + vmInfoStarting.setInstanceId("i-still-starting"); + + VMInformation vmInfoPending = new VMInformation(); + vmInfoPending.setState("pending"); + vmInfoPending.setInstanceId("i-reported"); + + List vmInfo = new ArrayList<>(); + vmInfo.add(vmInfoStarting); + vmInfo.add(vmInfoPending); + + VMInstanceRequest instanceRequest = new VMInstanceRequest(); + instanceRequest.setRegion(VMRegion.STANDALONE); + + // Short timeout to let it check once then timeout waiting for i-still-starting + // maxWaitForResponse=1 means it will immediately try to relaunch + // But since we're not mocking amazonInstance.create, it should fail + // and we just verify the initial check behavior + AgentWatchdog agentWatchdog = new AgentWatchdog( + instanceRequest, vmInfo, vmTrackerMock, amazonInstanceMock, 10, 50); + + // Run in a way that only checks once - the job will "appear stopped" after first check + when(cloudVmStatusContainerMock.getEndTime()) + .thenReturn(null) // First check + .thenReturn(new Date()); // Second check - job appears stopped, exit + + try { + agentWatchdog.run(); + } catch (RuntimeException e) { + // Expected - job appears stopped + } + + // After ONE check: + // - i-reported (pending) should be in reportedInstances + // - i-still-starting (starting) should still be in startedInstances + // The key assertion is that the watchdog did NOT immediately exit + // because i-still-starting is still 'starting' not 'pending' + verify(cloudVmStatusContainerMock, atLeast(1)).getStatuses(); + } } From b92c4c6e001f1ada97fee4e36fdffe17ba93a970 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Wed, 7 Jan 2026 11:24:10 -0800 Subject: [PATCH 10/29] fix: race condition in removeStatusForInstance + add unit tests + remove verbose logging --- .../intuit/tank/vmManager/AgentWatchdog.java | 4 +- .../intuit/tank/vmManager/VMTrackerImpl.java | 45 +-- .../tank/vmManager/VMTrackerImplTest.java | 328 ++++++++++++++++++ 3 files changed, 349 insertions(+), 28 deletions(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index 6546e9f45..2778d775d 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -147,8 +147,8 @@ public void run() { relaunch(startedInstances); startTime = System.currentTimeMillis(); } - LOG.info(new ObjectMessage(Map.of("Message","Job " + jobId + ": " + "Waiting for " + startedInstances.size() + " agents to report: " - + getInstanceIdList(startedInstances)))); + LOG.info(new ObjectMessage(Map.of("Message","Job " + jobId + ": Waiting for " + startedInstances.size() + " agents to report: " + + startedInstances.stream().map(VMInformation::getInstanceId).collect(Collectors.toList())))); Thread.sleep(sleepTime); } else { LOG.info(new ObjectMessage(Map.of("Message","All Agents Reported back for job " + jobId + "."))); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java index 75fd934b2..25efb07bb 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java @@ -152,11 +152,6 @@ private void setStatusThread(@Nonnull final CloudVmStatus status) { status.setReportTime(new Date()); CloudVmStatus currentStatus = getStatus(status.getInstanceId()); - LOG.info(new ObjectMessage(Map.of("Message", - "Status update for instance " + status.getInstanceId() + - " - VMStatus: " + status.getVmStatus() + ", JobStatus: " + status.getJobStatus() + - ", Job: " + status.getJobId()))); - if (shouldUpdateStatus(currentStatus)) { statusMap.put(status.getInstanceId(), status); if (status.getVmStatus() == VMStatus.running @@ -267,23 +262,32 @@ private boolean shouldUpdateStatus(CloudVmStatus currentStatus) { */ @Override public void removeStatusForInstance(String instanceId) { - CloudVmStatus status = statusMap.remove(instanceId); + // First, get the status to find the jobId (without removing yet) + CloudVmStatus status = statusMap.get(instanceId); - // also remove from the job's container to keep counts accurate if (status != null) { String jobId = status.getJobId(); - // Synchronize on the same lock used by setStatusThread to prevent - // ConcurrentModificationException when iterating over statuses + // Synchronize on the job lock to ensure atomic removal from both + // statusMap and container - prevents race with setStatusThread synchronized (getCacheSyncObject(jobId)) { - CloudVmStatusContainer container = jobMap.get(jobId); - if (container != null) { - boolean removed = container.getStatuses().remove(status); - if (removed) { - LOG.info(new ObjectMessage(Map.of("Message", - "Removed instance " + instanceId + " from container for job " + jobId))); + // Now remove from statusMap inside the sync block + CloudVmStatus removedStatus = statusMap.remove(instanceId); + + // Also remove from the job's container to keep counts accurate + if (removedStatus != null) { + CloudVmStatusContainer container = jobMap.get(jobId); + if (container != null) { + boolean removed = container.getStatuses().remove(removedStatus); + if (removed) { + LOG.info(new ObjectMessage(Map.of("Message", + "Removed instance " + instanceId + " from container for job " + jobId))); + } } } } + } else { + // Instance not in statusMap - just try to remove (no-op if not present) + statusMap.remove(instanceId); } } @@ -361,13 +365,9 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine // look up the job JobInstance job = jobInstanceDao.get().findById(Integer.parseInt(status.getJobId())); - // Log all statuses in container for debugging // Take a snapshot to avoid ConcurrentModificationException if another thread modifies // the set while we iterate (defense in depth, even though we're synchronized) Set statusesSnapshot = new HashSet<>(cloudVmStatusContainer.getStatuses()); - LOG.info(new ObjectMessage(Map.of("Message", - "Status calculation for job " + status.getJobId() + " - Container has " + - statusesSnapshot.size() + " statuses"))); int activeInstanceCount = 0; for (CloudVmStatus s : statusesSnapshot) { @@ -375,17 +375,10 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine // skip replaced instances - these were replaced by AgentWatchdog due to failure // but do NOT skip terminated/stopping/stopped/shutting_down - these are active agents in transition if (vmStatus == VMStatus.replaced) { - LOG.info(new ObjectMessage(Map.of("Message", - "Skipping replaced instance " + s.getInstanceId() + - " in job status calculation for job " + status.getJobId()))); continue; } activeInstanceCount++; - LOG.info(new ObjectMessage(Map.of("Message", - "Checking instance " + s.getInstanceId() + - " - VMStatus: " + vmStatus + ", JobStatus: " + s.getJobStatus()))); - JobStatus jobStatus = s.getJobStatus(); if (jobStatus != JobStatus.Completed) { // If no VMs are Completed isFinished = false; diff --git a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java index c329962b0..6849cfeb0 100644 --- a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java +++ b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java @@ -10,12 +10,20 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.RepeatedTest; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.EnumSource; import java.lang.reflect.Method; +import java.util.ArrayList; import java.util.Date; +import java.util.List; import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; import static org.junit.jupiter.api.Assertions.*; @@ -196,5 +204,325 @@ void setStatus_respectsTerminalGuard_forReplaced() throws Exception { CloudVmStatus status = createStatus("i-test", VMStatus.replaced); assertEquals(VMStatus.replaced, status.getVmStatus()); } + + // ============ Status calculation tests (using reflection to bypass CDI dependencies) ============ + + private CloudVmStatus createStatusWithJobStatus(String instanceId, String jobId, VMStatus vmStatus, JobStatus jobStatus) { + return new CloudVmStatus( + instanceId, + jobId, + "sg-test", + jobStatus, + VMImageType.AGENT, + VMRegion.US_WEST_2, + vmStatus, + new ValidationStatus(), + 100, + 50, + new Date(), + null + ); + } + + /** + * Directly add status to internal maps using reflection (bypasses async setStatus and CDI dependencies). + * This allows testing the status calculation logic in isolation. + */ + @SuppressWarnings("unchecked") + private void addStatusDirectly(CloudVmStatus status) throws Exception { + // Access the private statusMap field + java.lang.reflect.Field statusMapField = VMTrackerImpl.class.getDeclaredField("statusMap"); + statusMapField.setAccessible(true); + java.util.concurrent.ConcurrentHashMap statusMap = + (java.util.concurrent.ConcurrentHashMap) statusMapField.get(vmTracker); + + // Access the private jobMap field + java.lang.reflect.Field jobMapField = VMTrackerImpl.class.getDeclaredField("jobMap"); + jobMapField.setAccessible(true); + java.util.concurrent.ConcurrentHashMap jobMap = + (java.util.concurrent.ConcurrentHashMap) jobMapField.get(vmTracker); + + // Add to statusMap + statusMap.put(status.getInstanceId(), status); + + // Add to job container + String jobId = status.getJobId(); + CloudVmStatusContainer container = jobMap.computeIfAbsent(jobId, k -> { + CloudVmStatusContainer c = new CloudVmStatusContainer(); + c.setJobId(jobId); + return c; + }); + container.getStatuses().add(status); + } + + @Test + @DisplayName("Status calculation with all agents replaced returns early (activeInstanceCount = 0)") + void statusCalculation_allAgentsReplaced_returnsEarly() throws Exception { + // Given: A job where ALL agents have been replaced by watchdog + String jobId = "all-replaced-job"; + + // Set up 3 agents, all marked as replaced (using direct manipulation) + addStatusDirectly(createStatusWithJobStatus("i-001", jobId, VMStatus.replaced, JobStatus.Starting)); + addStatusDirectly(createStatusWithJobStatus("i-002", jobId, VMStatus.replaced, JobStatus.Starting)); + addStatusDirectly(createStatusWithJobStatus("i-003", jobId, VMStatus.replaced, JobStatus.Starting)); + + // When: Get job status + CloudVmStatusContainer container = vmTracker.getVmStatusForJob(jobId); + + // Then: Container should exist with 3 statuses + assertNotNull(container, "Container should exist"); + assertEquals(3, container.getStatuses().size()); + + // Verify all have replaced status + assertTrue(container.getStatuses().stream().allMatch(s -> s.getVmStatus() == VMStatus.replaced), + "All agents should be in replaced state"); + } + + @Test + @DisplayName("Status calculation with mixed statuses correctly tracks replaced agents") + void statusCalculation_mixedStatuses_correctlyDeterminesJobStatus() throws Exception { + String jobId = "mixed-status-job"; + + // Given: Mix of running, pending, and replaced agents (using direct manipulation) + addStatusDirectly(createStatusWithJobStatus("i-running1", jobId, VMStatus.running, JobStatus.Running)); + addStatusDirectly(createStatusWithJobStatus("i-running2", jobId, VMStatus.running, JobStatus.Running)); + addStatusDirectly(createStatusWithJobStatus("i-replaced", jobId, VMStatus.replaced, JobStatus.Starting)); + addStatusDirectly(createStatusWithJobStatus("i-pending", jobId, VMStatus.pending, JobStatus.Starting)); + + // When + CloudVmStatusContainer container = vmTracker.getVmStatusForJob(jobId); + + // Then: Should have 4 total statuses + assertNotNull(container, "Container should exist"); + assertEquals(4, container.getStatuses().size()); + + // Replaced should be in container + assertTrue(container.getStatuses().stream() + .anyMatch(s -> s.getInstanceId().equals("i-replaced") && s.getVmStatus() == VMStatus.replaced), + "Replaced instance should be in container"); + + // Running instances should also be present + assertEquals(2, container.getStatuses().stream() + .filter(s -> s.getVmStatus() == VMStatus.running).count(), + "Should have 2 running instances"); + } + + @Test + @DisplayName("Status calculation container includes all terminal states") + void statusCalculation_onlySkipsReplaced_notOtherTerminalStates() throws Exception { + String jobId = "terminal-states-job"; + + // Given: Various terminal and non-terminal states (using direct manipulation) + addStatusDirectly(createStatusWithJobStatus("i-running", jobId, VMStatus.running, JobStatus.Running)); + addStatusDirectly(createStatusWithJobStatus("i-replaced", jobId, VMStatus.replaced, JobStatus.Starting)); + addStatusDirectly(createStatusWithJobStatus("i-terminated", jobId, VMStatus.terminated, JobStatus.Completed)); + addStatusDirectly(createStatusWithJobStatus("i-stopped", jobId, VMStatus.stopped, JobStatus.Stopped)); + + // When + CloudVmStatusContainer container = vmTracker.getVmStatusForJob(jobId); + + // Then: Container should have all 4 statuses + assertNotNull(container, "Container should exist"); + assertEquals(4, container.getStatuses().size()); + + // Verify each status type is present + assertTrue(container.getStatuses().stream().anyMatch(s -> s.getVmStatus() == VMStatus.running)); + assertTrue(container.getStatuses().stream().anyMatch(s -> s.getVmStatus() == VMStatus.replaced)); + assertTrue(container.getStatuses().stream().anyMatch(s -> s.getVmStatus() == VMStatus.terminated)); + assertTrue(container.getStatuses().stream().anyMatch(s -> s.getVmStatus() == VMStatus.stopped)); + } + + // ============ Thread safety tests (using direct manipulation to avoid CDI issues) ============ + + @RepeatedTest(3) + @DisplayName("Concurrent direct status additions do not cause ConcurrentModificationException") + void concurrentDirectStatusAdd_noException() throws Exception { + String jobId = "concurrent-test-job-" + System.nanoTime(); + int numThreads = 10; + int operationsPerThread = 50; + ExecutorService executor = Executors.newFixedThreadPool(numThreads); + CountDownLatch startLatch = new CountDownLatch(1); + CountDownLatch doneLatch = new CountDownLatch(numThreads); + AtomicInteger errorCount = new AtomicInteger(0); + + for (int t = 0; t < numThreads; t++) { + final int threadNum = t; + executor.submit(() -> { + try { + startLatch.await(); + for (int i = 0; i < operationsPerThread; i++) { + String instanceId = "i-thread" + threadNum + "-op" + i; + addStatusDirectly(createStatusWithJobStatus( + instanceId, jobId, VMStatus.running, JobStatus.Running)); + } + } catch (Exception e) { + errorCount.incrementAndGet(); + e.printStackTrace(); + } finally { + doneLatch.countDown(); + } + }); + } + + startLatch.countDown(); + assertTrue(doneLatch.await(30, TimeUnit.SECONDS), "Test timed out"); + executor.shutdown(); + + assertEquals(0, errorCount.get(), "Concurrent operations should not throw exceptions"); + + CloudVmStatusContainer container = vmTracker.getVmStatusForJob(jobId); + assertNotNull(container, "Container should exist"); + assertTrue(container.getStatuses().size() > 0, "Should have some statuses"); + } + + @RepeatedTest(3) + @DisplayName("Concurrent add and removeStatusForInstance do not cause race conditions") + void concurrentAddAndRemove_noRaceCondition() throws Exception { + String jobId = "concurrent-remove-job-" + System.nanoTime(); + int numAddThreads = 5; + int numRemoveThreads = 5; + int operationsPerThread = 30; + ExecutorService executor = Executors.newFixedThreadPool(numAddThreads + numRemoveThreads); + CountDownLatch startLatch = new CountDownLatch(1); + CountDownLatch doneLatch = new CountDownLatch(numAddThreads + numRemoveThreads); + AtomicInteger errorCount = new AtomicInteger(0); + List instanceIds = new ArrayList<>(); + + // Pre-populate some instances + for (int i = 0; i < operationsPerThread; i++) { + String instanceId = "i-prepop-" + i; + instanceIds.add(instanceId); + addStatusDirectly(createStatusWithJobStatus(instanceId, jobId, VMStatus.running, JobStatus.Running)); + } + + // Threads that add new statuses + for (int t = 0; t < numAddThreads; t++) { + final int threadNum = t; + executor.submit(() -> { + try { + startLatch.await(); + for (int i = 0; i < operationsPerThread; i++) { + String instanceId = "i-add-thread" + threadNum + "-op" + i; + addStatusDirectly(createStatusWithJobStatus( + instanceId, jobId, VMStatus.running, JobStatus.Running)); + } + } catch (Exception e) { + errorCount.incrementAndGet(); + e.printStackTrace(); + } finally { + doneLatch.countDown(); + } + }); + } + + // Threads that remove statuses + for (int t = 0; t < numRemoveThreads; t++) { + final int threadNum = t; + executor.submit(() -> { + try { + startLatch.await(); + for (int i = 0; i < operationsPerThread; i++) { + if (i < instanceIds.size()) { + vmTracker.removeStatusForInstance(instanceIds.get(i)); + } + vmTracker.removeStatusForInstance("i-nonexistent-" + threadNum + "-" + i); + } + } catch (Exception e) { + errorCount.incrementAndGet(); + e.printStackTrace(); + } finally { + doneLatch.countDown(); + } + }); + } + + startLatch.countDown(); + assertTrue(doneLatch.await(30, TimeUnit.SECONDS), "Test timed out"); + executor.shutdown(); + + assertEquals(0, errorCount.get(), "Concurrent add/remove should not throw exceptions"); + } + + @RepeatedTest(3) + @DisplayName("Concurrent removeStatusForJob does not cause ConcurrentModificationException") + void concurrentRemoveStatusForJob_noException() throws Exception { + int numJobs = 5; + int instancesPerJob = 10; + String prefix = "job-" + System.nanoTime() + "-"; + ExecutorService executor = Executors.newFixedThreadPool(numJobs); + CountDownLatch doneLatch = new CountDownLatch(numJobs); + AtomicInteger errorCount = new AtomicInteger(0); + + // Set up multiple jobs with instances + for (int j = 0; j < numJobs; j++) { + String jobId = prefix + j; + for (int i = 0; i < instancesPerJob; i++) { + addStatusDirectly(createStatusWithJobStatus( + "i-" + jobId + "-instance" + i, jobId, VMStatus.running, JobStatus.Running)); + } + } + + // Concurrently remove all jobs + for (int j = 0; j < numJobs; j++) { + final String jobId = prefix + j; + executor.submit(() -> { + try { + vmTracker.removeStatusForJob(jobId); + } catch (Exception e) { + errorCount.incrementAndGet(); + e.printStackTrace(); + } finally { + doneLatch.countDown(); + } + }); + } + + assertTrue(doneLatch.await(30, TimeUnit.SECONDS), "Test timed out"); + executor.shutdown(); + + assertEquals(0, errorCount.get(), "Concurrent removeStatusForJob should not throw exceptions"); + + // Verify all jobs are removed + for (int j = 0; j < numJobs; j++) { + assertNull(vmTracker.getVmStatusForJob(prefix + j), "Job " + prefix + j + " should be removed"); + } + } + + // ============ removeStatusForInstance race condition fix tests ============ + + @Test + @DisplayName("removeStatusForInstance removes from both statusMap and container atomically") + void removeStatusForInstance_atomicRemoval() throws Exception { + String jobId = "atomic-remove-job"; + String instanceId = "i-atomic-test"; + + // Given: An instance with status (using direct manipulation) + CloudVmStatus status = createStatusWithJobStatus(instanceId, jobId, VMStatus.running, JobStatus.Running); + addStatusDirectly(status); + + // Verify it exists in both places + assertNotNull(vmTracker.getStatus(instanceId), "Status should exist in statusMap"); + CloudVmStatusContainer container = vmTracker.getVmStatusForJob(jobId); + assertNotNull(container, "Container should exist"); + assertTrue(container.getStatuses().stream().anyMatch(s -> s.getInstanceId().equals(instanceId)), + "Instance should be in container"); + + // When: Remove the status + vmTracker.removeStatusForInstance(instanceId); + + // Then: Should be removed from both statusMap and container + assertNull(vmTracker.getStatus(instanceId), "Status should be removed from statusMap"); + container = vmTracker.getVmStatusForJob(jobId); + assertNotNull(container); // Container still exists (job not removed) + assertFalse(container.getStatuses().stream().anyMatch(s -> s.getInstanceId().equals(instanceId)), + "Instance should be removed from container"); + } + + @Test + @DisplayName("removeStatusForInstance handles non-existent instance gracefully with sync") + void removeStatusForInstance_nonExistent_noException() { + // Should not throw even with the new synchronized logic + assertDoesNotThrow(() -> vmTracker.removeStatusForInstance("i-does-not-exist")); + } } From 213a7f2ce93ce1694df21c3ddb30d4b015ee27d0 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Wed, 7 Jan 2026 13:22:38 -0800 Subject: [PATCH 11/29] remove unnecessary logging --- .../src/main/java/com/intuit/tank/harness/APIMonitor.java | 3 --- .../java/com/intuit/tank/vm/vmManager/VMInformation.java | 2 -- .../java/com/intuit/tank/vmManager/AgentWatchdog.java | 8 -------- .../intuit/tank/vmManager/environment/CreateInstance.java | 6 ------ .../com/intuit/tank/vmManager/environment/JobRequest.java | 8 -------- .../vmManager/environment/amazon/AmazonDataConverter.java | 4 ++-- 6 files changed, 2 insertions(+), 29 deletions(-) diff --git a/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java b/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java index d4f202b59..7211a8467 100644 --- a/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java +++ b/agent/apiharness/src/main/java/com/intuit/tank/harness/APIMonitor.java @@ -85,9 +85,6 @@ private void updateInstanceStatus() { } if (!isLocal) { - LOG.info(LogUtil.getLogMessage("Sending status update - VMStatus: " + newStatus.getVmStatus() + - ", JobStatus: " + newStatus.getJobStatus() + ", Users: " + newStatus.getCurrentUsers() + - "/" + newStatus.getTotalUsers())); setInstanceStatus(newStatus.getInstanceId(), newStatus); } APITestHarness.getInstance().checkAgentThreads(); diff --git a/api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java b/api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java index 996609ccb..ea5fd7e04 100644 --- a/api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java +++ b/api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java @@ -96,7 +96,6 @@ public String getPrivateDNS() { return (String) this.items.get("privateDns"); } - // TEMP_IP_LOGGING - START /** * Set the virtual machine private IP address * @@ -134,7 +133,6 @@ public void setPublicIp(String data) { public String getPublicIp() { return (String) this.items.get("publicIp"); } - // TEMP_IP_LOGGING - END public void setLaunchTime(Calendar data) { this.items.put("launchTime", data); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index 2778d775d..eb145a148 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -258,14 +258,6 @@ private void relaunch(ArrayList instances) { " - watchdog will wait for /v2/agent/ready call", "instanceId", newInfo.getInstanceId(), "jobId", jobId))); - // TEMP_IP_LOGGING - START - LOG.info(new ObjectMessage(Map.of( - "Message", "Added relaunched image to VMImage table for job " + jobId, - "instanceId", newInfo.getInstanceId(), - "publicIp", newInfo.getPublicIp() != null ? newInfo.getPublicIp() : "N/A", - "privateIp", newInfo.getPrivateIp() != null ? newInfo.getPrivateIp() : "N/A" - ))); - // TEMP_IP_LOGGING - END try { dao.addImageFromInfo(instanceRequest.getJobId(), newInfo, instanceRequest.getRegion()); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/CreateInstance.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/CreateInstance.java index 7fcf6f0aa..9da50ee5c 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/CreateInstance.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/CreateInstance.java @@ -53,12 +53,6 @@ public void run() { for (VMInformation info : vmInfo) { vmTracker.setStatus(createCloudStatus(request, info)); dao.addImageFromInfo(request.getJobId(), info, request.getRegion()); - // TEMP_IP_LOGGING - START - logger.info("Added image (" + info.getInstanceId() + - ") with publicIp=" + (info.getPublicIp() != null ? info.getPublicIp() : "N/A") + - ", privateIp=" + (info.getPrivateIp() != null ? info.getPrivateIp() : "N/A") + - " to VMImage table"); - // TEMP_IP_LOGGING - END } } diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java index 3c381637a..774df5535 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java @@ -89,14 +89,6 @@ private void persistInstances(VMInstanceRequest instanceRequest, List Date: Thu, 8 Jan 2026 14:27:59 -0800 Subject: [PATCH 12/29] no more log flooding --- .../tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java index 9abba85f0..32dbde6de 100644 --- a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java +++ b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/services/agent/AgentServiceV2Impl.java @@ -221,7 +221,7 @@ public void setInstanceStatus(String instanceId, CloudVmStatus status) { segment.putAnnotation("TotalUsers", status.getTotalUsers()); segment.putAnnotation("totalTps", status.getTotalTps()); - LOGGER.info("Agent " + instanceId + " reporting status - VMStatus: " + status.getVmStatus() + + LOGGER.debug("Agent " + instanceId + " reporting status - VMStatus: " + status.getVmStatus() + ", JobStatus: " + status.getJobStatus() + ", Users: " + status.getCurrentUsers() + "/" + status.getTotalUsers() + ", Job: " + status.getJobId()); From 8dd908492ccd7bfa81c2b1f233c286d7fbeabc92 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Thu, 8 Jan 2026 16:08:30 -0800 Subject: [PATCH 13/29] this was actually helpful --- .../main/java/com/intuit/tank/vmManager/AgentWatchdog.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index eb145a148..753cec4f6 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -147,8 +147,8 @@ public void run() { relaunch(startedInstances); startTime = System.currentTimeMillis(); } - LOG.info(new ObjectMessage(Map.of("Message","Job " + jobId + ": Waiting for " + startedInstances.size() + " agents to report: " - + startedInstances.stream().map(VMInformation::getInstanceId).collect(Collectors.toList())))); + LOG.info(new ObjectMessage(Map.of("Message","Job " + jobId + ": " + "Waiting for " + startedInstances.size() + " agents to report: " + + getInstanceIdList(startedInstances)))); Thread.sleep(sleepTime); } else { LOG.info(new ObjectMessage(Map.of("Message","All Agents Reported back for job " + jobId + "."))); From c09111f4443e62ccec5260f592e7b41eb6304cba Mon Sep 17 00:00:00 2001 From: zkofiro Date: Fri, 9 Jan 2026 13:16:24 -0800 Subject: [PATCH 14/29] very helpful ip logging --- .../main/java/com/intuit/tank/vmManager/AgentWatchdog.java | 4 +++- .../com/intuit/tank/vmManager/environment/JobRequest.java | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index 753cec4f6..d0290e658 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -257,7 +257,9 @@ private void relaunch(ArrayList instances) { "Message", "Created replacement agent with status " + newStatus.getVmStatus() + " - watchdog will wait for /v2/agent/ready call", "instanceId", newInfo.getInstanceId(), - "jobId", jobId))); + "jobId", jobId, + "publicIp", newInfo.getPublicIp() != null ? newInfo.getPublicIp() : "N/A", + "privateIp", newInfo.getPrivateIp() != null ? newInfo.getPrivateIp() : "N/A"))); try { dao.addImageFromInfo(instanceRequest.getJobId(), newInfo, instanceRequest.getRegion()); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java index 774df5535..e8a189d6f 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/environment/JobRequest.java @@ -89,6 +89,12 @@ private void persistInstances(VMInstanceRequest instanceRequest, List Date: Mon, 26 Jan 2026 19:43:26 -0800 Subject: [PATCH 15/29] goodbye // TODO Do we have to kill jobs here? --- .../intuit/tank/vmManager/AgentWatchdog.java | 54 ++++++++++++++++++- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index d0290e658..962796f5d 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -17,6 +17,7 @@ */ import java.util.ArrayList; +import java.util.Date; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -33,8 +34,11 @@ import com.intuit.tank.vm.vmManager.models.CloudVmStatusContainer; import com.intuit.tank.vm.vmManager.models.VMStatus; import com.intuit.tank.vm.vmManager.models.ValidationStatus; +import com.intuit.tank.dao.JobInstanceDao; import com.intuit.tank.dao.VMImageDao; +import com.intuit.tank.project.JobInstance; import com.intuit.tank.project.VMInstance; +import com.intuit.tank.vm.api.enumerated.JobQueueStatus; import com.intuit.tank.vm.api.enumerated.JobLifecycleEvent; import com.intuit.tank.vm.api.enumerated.JobStatus; import com.intuit.tank.vm.api.enumerated.VMImageType; @@ -205,9 +209,18 @@ private void relaunch(ArrayList instances) { String msg = "Have " + this.startedInstances.size() + " agents that failed to start correctly and have exceeded the maximum number of restarts. Killing job."; - vmTracker.publishEvent(new JobEvent(instanceRequest.getJobId(), msg, JobLifecycleEvent.JOB_ABORTED)); LOG.info(new ObjectMessage(Map.of("Message", msg))); - // TODO Do we have to kill jobs here? + + // kill the job directly + killJobDirectly(jobId); + + // fire event for logging/notification only (observers would need to be changed to RequestScoped) + try { + vmTracker.publishEvent(new JobEvent(instanceRequest.getJobId(), msg, JobLifecycleEvent.JOB_ABORTED)); + } catch (Exception e) { + LOG.warn("Failed to publish JOB_ABORTED event (non-critical): " + e.getMessage()); + } + throw new RuntimeException("Killing jobs and exiting"); } String msg = "Have " + instances.size() + " agents that failed to start or report correctly for job " + jobId + ". Relaunching. " @@ -307,4 +320,41 @@ private static VMInformation removeInstance(List instances, Strin } return instanceRemoved; } + + private void killJobDirectly(String jobId) { + LOG.info(new ObjectMessage(Map.of("Message", "Killing job " + jobId + " directly from watchdog"))); + + vmTracker.stopJob(jobId); + + List allInstanceIds = vmInfo.stream() + .map(VMInformation::getInstanceId) + .collect(Collectors.toList()); + if (!allInstanceIds.isEmpty()) { + LOG.info(new ObjectMessage(Map.of("Message", "Killing " + allInstanceIds.size() + " instances for job " + jobId))); + amazonInstance.killInstances(allInstanceIds); + } + + for (VMInformation info : vmInfo) { + CloudVmStatus status = vmTracker.getStatus(info.getInstanceId()); + if (status != null) { + status.setVmStatus(VMStatus.terminated); + status.setJobStatus(JobStatus.Completed); + status.setEndTime(new Date()); + vmTracker.setStatus(status); + } + } + + try { + JobInstanceDao dao = new JobInstanceDao(); + JobInstance job = dao.findById(Integer.parseInt(jobId)); + if (job != null) { + job.setStatus(JobQueueStatus.Completed); + job.setEndTime(new Date()); + dao.saveOrUpdate(job); + LOG.info(new ObjectMessage(Map.of("Message", "Updated job " + jobId + " status to Completed"))); + } + } catch (Exception e) { + LOG.error("Error updating job status in database: " + e.getMessage(), e); + } + } } From cd907128a6987d6146c9ac44fceee4f9b1e061cd Mon Sep 17 00:00:00 2001 From: zkofiro Date: Tue, 27 Jan 2026 15:49:19 -0800 Subject: [PATCH 16/29] add agent ready timer to better time watchdog timeouts + fix bug with job status --- .../java/com/intuit/tank/vmManager/AgentWatchdog.java | 8 ++++++++ .../java/com/intuit/tank/vmManager/VMTrackerImpl.java | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index 962796f5d..80c67185f 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -186,6 +186,14 @@ private void checkForReportingInstances() { VMInformation removedInstance = removeInstance(startedInstances, status.getInstanceId()); if (removedInstance != null) { addInstance(reportedInstances, removedInstance); + long startupTimeMs = System.currentTimeMillis() - startTime; + LOG.info(new ObjectMessage(Map.of( + "Message", "Agent reported ready", + "instanceId", status.getInstanceId(), + "jobId", jobId, + "startupTimeMs", startupTimeMs, + "startupTimeSec", startupTimeMs / 1000.0, + "remainingToReport", startedInstances.size()))); } } } diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java index 25efb07bb..782aff760 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java @@ -423,6 +423,14 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine if (job != null) { job.setEndTime(cloudVmStatusContainer.getEndTime()); JobQueueStatus oldStatus = job.getStatus(); + + // don't downgrade from terminal state (Completed) - once a job is done, it stays done + if (oldStatus == JobQueueStatus.Completed) { + LOG.debug(new ObjectMessage(Map.of("Message", + "Job " + status.getJobId() + " already Completed - ignoring status recalculation from instance updates"))); + return; + } + JobQueueStatus newStatus = job.getStatus(); if (isFinished) { newStatus = JobQueueStatus.Completed; From 17b9c065f19a0571b667bfa37491284f7ef82282 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Tue, 27 Jan 2026 16:00:19 -0800 Subject: [PATCH 17/29] update comment + jobId null check --- .../src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java | 3 +++ .../src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java | 3 +++ 2 files changed, 6 insertions(+) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index 80c67185f..702814def 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -329,6 +329,9 @@ private static VMInformation removeInstance(List instances, Strin return instanceRemoved; } + // TODO: This method duplicates logic from JobEventSender.killJob(). Consider extracting to a shared + // JobTerminator service when Valkey overhaul happens. Current duplication is intentional to avoid + // CDI scope issues (JobEventSender is @RequestScoped, unavailable from background threads). private void killJobDirectly(String jobId) { LOG.info(new ObjectMessage(Map.of("Message", "Killing job " + jobId + " directly from watchdog"))); diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java index 782aff760..ca6090092 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java @@ -469,6 +469,9 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine } private Object getCacheSyncObject(final String id) { + if (id == null) { + throw new IllegalArgumentException("Cannot synchronize on null jobId"); + } locks.putIfAbsent(id, id); return locks.get(id); } From 9a1e5fb2928d535e05c9a7e013ce39a75c2e6b77 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Tue, 27 Jan 2026 21:16:34 -0800 Subject: [PATCH 18/29] fixes job / agent status issues --- .../tank/vm/settings/VmManagerConfig.java | 17 ----------------- .../intuit/tank/vmManager/AgentWatchdog.java | 16 ++++++++++++---- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/api/src/main/java/com/intuit/tank/vm/settings/VmManagerConfig.java b/api/src/main/java/com/intuit/tank/vm/settings/VmManagerConfig.java index 020d39340..52bb6c21c 100644 --- a/api/src/main/java/com/intuit/tank/vm/settings/VmManagerConfig.java +++ b/api/src/main/java/com/intuit/tank/vm/settings/VmManagerConfig.java @@ -204,23 +204,6 @@ public VmInstanceType getInstanceType(String name) { return ret; } - /** - * - * @param defaultMills - * @return - */ - public long getMaxAgentStartMills(long defaultMills) { - String string = config.getString("watchdog/max-time-for-agent-start"); - if (string != null) { - try { - return TimeUtil.parseTimeString(string); - } catch (Exception e) { - LOG.error(e.toString()); - } - } - return defaultMills; - } - /** * * @return the provider classname. should be an instance of IDatabase diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index 702814def..1be525d92 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -176,7 +176,17 @@ public void run() { private void checkForReportingInstances() { String jobId = instanceRequest.getJobId(); CloudVmStatusContainer vmStatusForJob = vmTracker.getVmStatusForJob(jobId); - if (vmStatusForJob == null || vmStatusForJob.getEndTime() != null) { + + // Container might not exist yet if setStatus() async tasks haven't completed + // This is expected on the first few iterations - just wait for the executor to process + if (vmStatusForJob == null) { + LOG.debug(new ObjectMessage(Map.of("Message", + "Job container not yet created for job " + jobId + " - waiting for async status updates"))); + return; // Return and check again on next iteration + } + + // Only treat as stopped if container exists AND has an end time (user/system stopped the job) + if (vmStatusForJob.getEndTime() != null) { stopped = true; throw new RuntimeException("Job appears to have been stopped. Exiting..."); } @@ -329,9 +339,7 @@ private static VMInformation removeInstance(List instances, Strin return instanceRemoved; } - // TODO: This method duplicates logic from JobEventSender.killJob(). Consider extracting to a shared - // JobTerminator service when Valkey overhaul happens. Current duplication is intentional to avoid - // CDI scope issues (JobEventSender is @RequestScoped, unavailable from background threads). + // TODO: This method duplicates logic from JobEventSender.killJob(). Consider extracting to a shared service private void killJobDirectly(String jobId) { LOG.info(new ObjectMessage(Map.of("Message", "Killing job " + jobId + " directly from watchdog"))); From fd49ecb9b4c479cfb49c6ba8322cb3b86dceb09a Mon Sep 17 00:00:00 2001 From: zkofiro Date: Tue, 27 Jan 2026 22:12:59 -0800 Subject: [PATCH 19/29] more fixes --- .../src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java index ca6090092..4d0450651 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java @@ -103,7 +103,7 @@ public class VMTrackerImpl implements VMTracker { t.setDaemon(true); return t; }, - new ThreadPoolExecutor.DiscardOldestPolicy()); + new ThreadPoolExecutor.CallerRunsPolicy()); // Don't drop status updates - caller processes if queue full /** * From 790f870a5360d576b2ab03e8867077f804784326 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Tue, 27 Jan 2026 22:44:39 -0800 Subject: [PATCH 20/29] potential fix for agent status race conditions: handle async setStatus, stop dropping updates, remove instances from tracking before replace --- docs/installation-guide/installation.md | 14 ++++------ .../intuit/tank/vmManager/AgentWatchdog.java | 28 +++++++++++++++++-- .../intuit/tank/vmManager/VMTrackerImpl.java | 23 +++------------ 3 files changed, 36 insertions(+), 29 deletions(-) diff --git a/docs/installation-guide/installation.md b/docs/installation-guide/installation.md index 2518c9729..d37752639 100644 --- a/docs/installation-guide/installation.md +++ b/docs/installation-guide/installation.md @@ -434,15 +434,13 @@ Configuration is achieved via an XML file called `settings.xml`. The default is - + - - 3m - - 5m - - 2 - + + 3m + + 3 + 30s diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index 1be525d92..eb1d5f450 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -178,8 +178,17 @@ private void checkForReportingInstances() { CloudVmStatusContainer vmStatusForJob = vmTracker.getVmStatusForJob(jobId); // Container might not exist yet if setStatus() async tasks haven't completed - // This is expected on the first few iterations - just wait for the executor to process + // OR it could be null because the job was killed/removed externally if (vmStatusForJob == null) { + // Check if job still exists and is active - if not, it was killed externally + JobInstanceDao dao = new JobInstanceDao(); + JobInstance job = dao.findById(Integer.parseInt(jobId)); + if (job == null || job.getStatus() == JobQueueStatus.Completed + || job.getStatus() == JobQueueStatus.Aborted) { + stopped = true; + throw new RuntimeException("Job " + jobId + " was stopped or does not exist. Exiting watchdog."); + } + // Job exists and is active - container just not yet created (async race) LOG.debug(new ObjectMessage(Map.of("Message", "Job container not yet created for job " + jobId + " - waiting for async status updates"))); return; // Return and check again on next iteration @@ -254,11 +263,26 @@ private void relaunch(ArrayList instances) { for (VMInformation info : instances) { vmInfo.remove(info); - // Update status to 'replaced' in the tracker (keeps visible in UI, but filtered out of calculations) + // Remove from all tracking lists FIRST - this prevents stale heartbeats from being processed + // for this instance after we mark it as replaced (no need to modify shouldUpdateStatus) + removeInstance(startedInstances, info.getInstanceId()); + removeInstance(reportedInstances, info.getInstanceId()); + + // NOW mark as replaced - safe because instance is removed from tracking CloudVmStatus replacedStatus = vmTracker.getStatus(info.getInstanceId()); if (replacedStatus != null) { replacedStatus.setVmStatus(VMStatus.replaced); vmTracker.setStatus(replacedStatus); + } else { + // Create new replaced status if none exists (race condition protection) + CloudVmStatus newReplacedStatus = new CloudVmStatus( + info.getInstanceId(), instanceRequest.getJobId(), "unknown", + JobStatus.Stopped, VMImageType.AGENT, instanceRequest.getRegion(), + VMStatus.replaced, new ValidationStatus(), 0, 0, null, null); + vmTracker.setStatus(newReplacedStatus); + LOG.warn(new ObjectMessage(Map.of("Message", + "Created new 'replaced' status for instance " + info.getInstanceId() + + " - original status was missing (possible race condition)"))); } // Also update in the database for persistence diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java index 4d0450651..57a1e1b97 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java @@ -230,28 +230,13 @@ private JobQueueStatus getQueueStatus(JobQueueStatus oldStatus, JobStatus jobSta return oldStatus; } - /** - * If the vm is shutting down, terminated, or replaced, don't update the status to something else. - * This prevents race conditions where a user kills an instance while watchdog is replacing it, - * or stale status updates arrive after an instance has already transitioned to a terminal state. - * @param currentStatus - * @return - */ private boolean shouldUpdateStatus(CloudVmStatus currentStatus) { if (currentStatus != null) { VMStatus status = currentStatus.getVmStatus(); - boolean isTerminalState = (status == VMStatus.shutting_down - || status == VMStatus.stopped - || status == VMStatus.stopping - || status == VMStatus.terminated - || status == VMStatus.replaced); - if (isTerminalState) { - LOG.info(new ObjectMessage(Map.of("Message", - "Ignoring status update for instance " + currentStatus.getInstanceId() + - " - already in terminal state: " + status + - " (possible race between user kill and watchdog replace)"))); - return false; - } + return (status != VMStatus.shutting_down + && status != VMStatus.stopped + && status != VMStatus.stopping + && status != VMStatus.terminated); } return true; } From cb2fbbd12eafb6058dc092ad876efe5e32f82df7 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Tue, 27 Jan 2026 23:35:16 -0800 Subject: [PATCH 21/29] update tests to account for changes --- .../intuit/tank/vmManager/VMTrackerImplTest.java | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java index 6849cfeb0..cd18b8ac8 100644 --- a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java +++ b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java @@ -157,7 +157,7 @@ void shouldUpdateStatus_nullCurrentStatus_returnsTrue() throws Exception { } @ParameterizedTest - @EnumSource(value = VMStatus.class, names = {"terminated", "replaced", "stopped", "stopping", "shutting_down"}) + @EnumSource(value = VMStatus.class, names = {"terminated", "stopped", "stopping", "shutting_down"}) @DisplayName("shouldUpdateStatus returns false for terminal states") void shouldUpdateStatus_terminalStates_returnsFalse(VMStatus terminalStatus) throws Exception { CloudVmStatus status = createStatus("i-123", terminalStatus); @@ -175,14 +175,16 @@ void shouldUpdateStatus_activeStates_returnsTrue(VMStatus activeStatus) throws E } @Test - @DisplayName("shouldUpdateStatus blocks update for replaced instance (AgentWatchdog scenario)") - void shouldUpdateStatus_replacedInstance_blocksUpdate() throws Exception { + @DisplayName("shouldUpdateStatus allows updates for replaced instances (protection handled in AgentWatchdog)") + void shouldUpdateStatus_replacedInstance_allowsUpdate() throws Exception { // Given: An instance that was replaced by AgentWatchdog CloudVmStatus replacedStatus = createStatus("i-replaced", VMStatus.replaced); - // When/Then: Updates should be blocked - assertFalse(invokeShouldUpdateStatus(replacedStatus), - "Replaced instances should not accept status updates (race condition guard)"); + // When/Then: Updates should be allowed - protection is handled by removing + // instances from tracking lists in AgentWatchdog before marking as replaced, + // NOT by blocking in shouldUpdateStatus (which would break killJobDirectly) + assertTrue(invokeShouldUpdateStatus(replacedStatus), + "Replaced instances should allow updates (e.g., replaced -> terminated for killJobDirectly)"); } // ============ VMStatus.replaced integration tests ============ From 859fe61efc76ccda5cfdf1ec9d9b1eeb8a643fcc Mon Sep 17 00:00:00 2001 From: zkofiro Date: Wed, 28 Jan 2026 15:19:47 -0800 Subject: [PATCH 22/29] add defensive programming by copying array instead of referencing --- .../main/java/com/intuit/tank/vmManager/AgentWatchdog.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index eb1d5f450..885c20a25 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -260,7 +260,10 @@ private void relaunch(ArrayList instances) { amazonInstance.killInstances(instanceIds); // Set terminated status on the DAO VMImageDao dao = new VMImageDao(); - for (VMInformation info : instances) { + // Create defensive copy - instances IS startedInstances (same reference), + // and we modify startedInstances inside the loop via removeInstance() + List instancesToProcess = new ArrayList<>(instances); + for (VMInformation info : instancesToProcess) { vmInfo.remove(info); // Remove from all tracking lists FIRST - this prevents stale heartbeats from being processed From ce8468cf692f924da97eeb3c3698b5f58a7c4a38 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Wed, 28 Jan 2026 19:13:57 -0800 Subject: [PATCH 23/29] do NOT fix (or rather ruin) what's not broken --- .../com/intuit/tank/vmManager/AgentWatchdog.java | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index 885c20a25..83f8df501 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -260,18 +260,10 @@ private void relaunch(ArrayList instances) { amazonInstance.killInstances(instanceIds); // Set terminated status on the DAO VMImageDao dao = new VMImageDao(); - // Create defensive copy - instances IS startedInstances (same reference), - // and we modify startedInstances inside the loop via removeInstance() - List instancesToProcess = new ArrayList<>(instances); - for (VMInformation info : instancesToProcess) { + for (VMInformation info : instances) { vmInfo.remove(info); - - // Remove from all tracking lists FIRST - this prevents stale heartbeats from being processed - // for this instance after we mark it as replaced (no need to modify shouldUpdateStatus) - removeInstance(startedInstances, info.getInstanceId()); - removeInstance(reportedInstances, info.getInstanceId()); - // NOW mark as replaced - safe because instance is removed from tracking + // Mark as replaced - keeps visible in UI but filtered from job status CloudVmStatus replacedStatus = vmTracker.getStatus(info.getInstanceId()); if (replacedStatus != null) { replacedStatus.setVmStatus(VMStatus.replaced); From 46f2bc11e38fe7504711473803713304d37d82f2 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Wed, 28 Jan 2026 22:41:53 -0800 Subject: [PATCH 24/29] more unvibe coding this mess --- .../tank/vm/vmManager/models/VMStatus.java | 15 ++---- .../rest/mvc/rest/cloud/JobEventSender.java | 10 +--- .../intuit/tank/vmManager/AgentWatchdog.java | 50 ++++--------------- 3 files changed, 16 insertions(+), 59 deletions(-) diff --git a/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java b/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java index 23f0a1c3a..7dc4312b1 100644 --- a/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java +++ b/api/src/main/java/com/intuit/tank/vm/vmManager/models/VMStatus.java @@ -30,18 +30,13 @@ public enum VMStatus implements Serializable { replaced; // replaced by AgentWatchdog due to failure to report back public static final VMStatus fromString(String value) { - if (value == null || value.isEmpty()) { - return VMStatus.unknown; - } + VMStatus ret = null; if ("shutting-down".equals(value)) { - return VMStatus.shutting_down; - } - try { - return VMStatus.valueOf(value); - } catch (IllegalArgumentException e) { - // Gracefully handle unknown values (e.g., from older/newer clients with different enum versions) - return VMStatus.unknown; + ret = VMStatus.shutting_down; + } else { + ret = VMStatus.valueOf(value); } + return ret != null ? ret : VMStatus.unknown; } } diff --git a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java index 9eecb9ec3..14bfa268f 100644 --- a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java +++ b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java @@ -248,15 +248,7 @@ private List getInstancesForJob(String jobId) { CloudVmStatusContainer statuses = vmTracker.getVmStatusForJob(jobId); if (statuses != null) { instanceIds = statuses.getStatuses().stream() - .filter(s -> { - VMStatus vmStatus = s.getVmStatus(); - // skip unreachable instances - they can't receive commands - return vmStatus != VMStatus.terminated && - vmStatus != VMStatus.replaced && - vmStatus != VMStatus.stopped && - vmStatus != VMStatus.shutting_down && - vmStatus != VMStatus.stopping; - }) + .filter(s -> s.getVmStatus() != VMStatus.replaced) .map(CloudVmStatus::getInstanceId) .collect(Collectors.toList()); } diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java index 83f8df501..cc74f77e7 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/AgentWatchdog.java @@ -176,26 +176,7 @@ public void run() { private void checkForReportingInstances() { String jobId = instanceRequest.getJobId(); CloudVmStatusContainer vmStatusForJob = vmTracker.getVmStatusForJob(jobId); - - // Container might not exist yet if setStatus() async tasks haven't completed - // OR it could be null because the job was killed/removed externally - if (vmStatusForJob == null) { - // Check if job still exists and is active - if not, it was killed externally - JobInstanceDao dao = new JobInstanceDao(); - JobInstance job = dao.findById(Integer.parseInt(jobId)); - if (job == null || job.getStatus() == JobQueueStatus.Completed - || job.getStatus() == JobQueueStatus.Aborted) { - stopped = true; - throw new RuntimeException("Job " + jobId + " was stopped or does not exist. Exiting watchdog."); - } - // Job exists and is active - container just not yet created (async race) - LOG.debug(new ObjectMessage(Map.of("Message", - "Job container not yet created for job " + jobId + " - waiting for async status updates"))); - return; // Return and check again on next iteration - } - - // Only treat as stopped if container exists AND has an end time (user/system stopped the job) - if (vmStatusForJob.getEndTime() != null) { + if (vmStatusForJob == null || vmStatusForJob.getEndTime() != null) { stopped = true; throw new RuntimeException("Job appears to have been stopped. Exiting..."); } @@ -262,34 +243,17 @@ private void relaunch(ArrayList instances) { VMImageDao dao = new VMImageDao(); for (VMInformation info : instances) { vmInfo.remove(info); + vmTracker.setStatus(createReplacedVmStatus(info)); - // Mark as replaced - keeps visible in UI but filtered from job status - CloudVmStatus replacedStatus = vmTracker.getStatus(info.getInstanceId()); - if (replacedStatus != null) { - replacedStatus.setVmStatus(VMStatus.replaced); - vmTracker.setStatus(replacedStatus); - } else { - // Create new replaced status if none exists (race condition protection) - CloudVmStatus newReplacedStatus = new CloudVmStatus( - info.getInstanceId(), instanceRequest.getJobId(), "unknown", - JobStatus.Stopped, VMImageType.AGENT, instanceRequest.getRegion(), - VMStatus.replaced, new ValidationStatus(), 0, 0, null, null); - vmTracker.setStatus(newReplacedStatus); - LOG.warn(new ObjectMessage(Map.of("Message", - "Created new 'replaced' status for instance " + info.getInstanceId() + - " - original status was missing (possible race condition)"))); - } - - // Also update in the database for persistence VMInstance image = dao.getImageByInstanceId(info.getInstanceId()); if (image != null) { - image.setStatus(VMStatus.replaced.name()); // mark as replaced (not terminated) for audit trail + image.setStatus(VMStatus.replaced.name()); dao.saveOrUpdate(image); } LOG.info(new ObjectMessage(Map.of("Message", "Marked instance " + info.getInstanceId() + - " as REPLACED (visible in UI but filtered from job status) for job " + jobId))); + " as REPLACED for job " + jobId))); } LOG.info(new ObjectMessage(Map.of("Message","Setting number of instances to relaunch to: " + instances.size() + " for job " + jobId))); instanceRequest.setNumberOfInstances(instances.size()); @@ -339,6 +303,12 @@ private CloudVmStatus createCloudStatus(VMInstanceRequest req, VMInformation inf VMImageType.AGENT, req.getRegion(), VMStatus.starting, new ValidationStatus(), 0, 0, null, null); } + private CloudVmStatus createReplacedVmStatus(VMInformation info) { + return new CloudVmStatus(info.getInstanceId(), instanceRequest.getJobId(), "unknown", + JobStatus.Stopped, VMImageType.AGENT, instanceRequest.getRegion(), + VMStatus.replaced, new ValidationStatus(), 0, 0, null, null); + } + private boolean shouldRelaunchInstances() { return startTime + maxWaitForResponse < System.currentTimeMillis(); } From f9855eb1b4347a84ce57290301ed63f7e758d305 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Thu, 29 Jan 2026 00:58:03 -0800 Subject: [PATCH 25/29] removing more useless code, sticking to feature --- .../intuit/tank/vmManager/VMTrackerImpl.java | 60 +++---------------- 1 file changed, 7 insertions(+), 53 deletions(-) diff --git a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java index 57a1e1b97..7c0083d05 100644 --- a/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java +++ b/tank_vmManager/src/main/java/com/intuit/tank/vmManager/VMTrackerImpl.java @@ -19,11 +19,9 @@ import static com.intuit.tank.vm.common.TankConstants.NOTIFICATIONS_EVENT_EVENT_TIME_KEY; import java.text.SimpleDateFormat; -import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.HashSet; -import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ArrayBlockingQueue; @@ -103,7 +101,7 @@ public class VMTrackerImpl implements VMTracker { t.setDaemon(true); return t; }, - new ThreadPoolExecutor.CallerRunsPolicy()); // Don't drop status updates - caller processes if queue full + new ThreadPoolExecutor.DiscardOldestPolicy()); /** * @@ -247,33 +245,7 @@ private boolean shouldUpdateStatus(CloudVmStatus currentStatus) { */ @Override public void removeStatusForInstance(String instanceId) { - // First, get the status to find the jobId (without removing yet) - CloudVmStatus status = statusMap.get(instanceId); - - if (status != null) { - String jobId = status.getJobId(); - // Synchronize on the job lock to ensure atomic removal from both - // statusMap and container - prevents race with setStatusThread - synchronized (getCacheSyncObject(jobId)) { - // Now remove from statusMap inside the sync block - CloudVmStatus removedStatus = statusMap.remove(instanceId); - - // Also remove from the job's container to keep counts accurate - if (removedStatus != null) { - CloudVmStatusContainer container = jobMap.get(jobId); - if (container != null) { - boolean removed = container.getStatuses().remove(removedStatus); - if (removed) { - LOG.info(new ObjectMessage(Map.of("Message", - "Removed instance " + instanceId + " from container for job " + jobId))); - } - } - } - } - } else { - // Instance not in statusMap - just try to remove (no-op if not present) - statusMap.remove(instanceId); - } + statusMap.remove(instanceId); } /** @@ -282,19 +254,12 @@ public void removeStatusForInstance(String instanceId) { */ @Override public void removeStatusForJob(String jobId) { - // Synchronize on the same lock used by setStatusThread to prevent - // ConcurrentModificationException when iterating over statuses - synchronized (getCacheSyncObject(jobId)) { - CloudVmStatusContainer cloudVmStatusContainer = jobMap.get(jobId); - if (cloudVmStatusContainer != null) { - // Copy to avoid ConcurrentModificationException - the underlying - // HashSet could be modified by setStatusThread while we iterate - List statusesCopy = new ArrayList<>(cloudVmStatusContainer.getStatuses()); - for (CloudVmStatus s : statusesCopy) { - removeStatusForInstance(s.getInstanceId()); - } - jobMap.remove(jobId); + CloudVmStatusContainer cloudVmStatusContainer = jobMap.get(jobId); + if (cloudVmStatusContainer != null) { + for (CloudVmStatus s : cloudVmStatusContainer.getStatuses()) { + removeStatusForInstance(s.getInstanceId()); } + jobMap.remove(jobId); } } @@ -408,14 +373,6 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine if (job != null) { job.setEndTime(cloudVmStatusContainer.getEndTime()); JobQueueStatus oldStatus = job.getStatus(); - - // don't downgrade from terminal state (Completed) - once a job is done, it stays done - if (oldStatus == JobQueueStatus.Completed) { - LOG.debug(new ObjectMessage(Map.of("Message", - "Job " + status.getJobId() + " already Completed - ignoring status recalculation from instance updates"))); - return; - } - JobQueueStatus newStatus = job.getStatus(); if (isFinished) { newStatus = JobQueueStatus.Completed; @@ -454,9 +411,6 @@ private void addStatusToJobContainer(CloudVmStatus status, CloudVmStatusContaine } private Object getCacheSyncObject(final String id) { - if (id == null) { - throw new IllegalArgumentException("Cannot synchronize on null jobId"); - } locks.putIfAbsent(id, id); return locks.get(id); } From d2ca893189809efced9c24cc5aec6c100e787365 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Thu, 29 Jan 2026 09:29:47 -0800 Subject: [PATCH 26/29] update tests to account for vibe code clean up --- .../vm/vmManager/models/VMStatusTest.java | 40 +++---------- .../mvc/rest/cloud/JobEventSenderTest.java | 56 ++++++++++--------- .../tank/vmManager/VMTrackerImplTest.java | 26 +++------ 3 files changed, 48 insertions(+), 74 deletions(-) diff --git a/api/src/test/java/com/intuit/tank/vm/vmManager/models/VMStatusTest.java b/api/src/test/java/com/intuit/tank/vm/vmManager/models/VMStatusTest.java index 66f250585..8df966fbc 100644 --- a/api/src/test/java/com/intuit/tank/vm/vmManager/models/VMStatusTest.java +++ b/api/src/test/java/com/intuit/tank/vm/vmManager/models/VMStatusTest.java @@ -55,41 +55,17 @@ public void testFromString_replaced() { } @Test - @DisplayName("fromString returns unknown for null input") - public void testFromString_nullReturnsUnknown() { - VMStatus result = VMStatus.fromString(null); - - assertNotNull(result); - assertEquals(VMStatus.unknown, result); - } - - @Test - @DisplayName("fromString returns unknown for empty string") - public void testFromString_emptyReturnsUnknown() { - VMStatus result = VMStatus.fromString(""); - - assertNotNull(result); - assertEquals(VMStatus.unknown, result); + @DisplayName("fromString throws for invalid values (original behavior)") + public void testFromString_invalidValueThrows() { + // Original behavior: valueOf throws IllegalArgumentException for invalid values + assertThrows(IllegalArgumentException.class, () -> VMStatus.fromString("garbage-value")); } @Test - @DisplayName("fromString returns unknown for unrecognized values (graceful handling)") - public void testFromString_unknownValueReturnsUnknown() { - // Should not throw IllegalArgumentException - gracefully returns unknown - VMStatus result = VMStatus.fromString("garbage-value"); - - assertNotNull(result); - assertEquals(VMStatus.unknown, result); - } - - @Test - @DisplayName("fromString handles future/unknown enum values gracefully") - public void testFromString_futureCompatibility() { - // Simulates receiving a value from a newer version of the API - VMStatus result = VMStatus.fromString("some_new_status_from_future"); - - assertNotNull(result); - assertEquals(VMStatus.unknown, result); + @DisplayName("fromString throws for null input (original behavior)") + public void testFromString_nullThrows() { + // Original behavior: valueOf throws NullPointerException for null + assertThrows(NullPointerException.class, () -> VMStatus.fromString(null)); } @Test diff --git a/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java b/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java index a6576e4aa..740bde1e6 100644 --- a/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java +++ b/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java @@ -73,8 +73,8 @@ private List invokeGetInstancesForJob(String jobId) throws Exception { } @Test - @DisplayName("getInstancesForJob excludes terminated instances") - void getInstancesForJob_excludesTerminatedInstances() throws Exception { + @DisplayName("getInstancesForJob includes terminated instances (AWS handles idempotently)") + void getInstancesForJob_includesTerminatedInstances() throws Exception { // Given: Mix of running and terminated instances container.getStatuses().add(createStatus("i-running1", JobStatus.Running, VMStatus.running)); container.getStatuses().add(createStatus("i-terminated", JobStatus.Stopped, VMStatus.terminated)); @@ -85,16 +85,16 @@ void getInstancesForJob_excludesTerminatedInstances() throws Exception { // When List instances = invokeGetInstancesForJob("123"); - // Then: Only running instances should be returned - assertEquals(2, instances.size()); + // Then: All instances returned (AWS terminateInstances is idempotent) + assertEquals(3, instances.size()); assertTrue(instances.contains("i-running1")); assertTrue(instances.contains("i-running2")); - assertFalse(instances.contains("i-terminated")); + assertTrue(instances.contains("i-terminated")); } @Test - @DisplayName("getInstancesForJob excludes stopped instances") - void getInstancesForJob_excludesStoppedInstances() throws Exception { + @DisplayName("getInstancesForJob includes stopped instances (EC2 still exists)") + void getInstancesForJob_includesStoppedInstances() throws Exception { // Given container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); container.getStatuses().add(createStatus("i-stopped", JobStatus.Stopped, VMStatus.stopped)); @@ -104,15 +104,15 @@ void getInstancesForJob_excludesStoppedInstances() throws Exception { // When List instances = invokeGetInstancesForJob("123"); - // Then - assertEquals(1, instances.size()); + // Then: stopped instances are included (EC2 still exists, needs termination) + assertEquals(2, instances.size()); assertTrue(instances.contains("i-running")); - assertFalse(instances.contains("i-stopped")); + assertTrue(instances.contains("i-stopped")); } @Test - @DisplayName("getInstancesForJob excludes stopping instances") - void getInstancesForJob_excludesStoppingInstances() throws Exception { + @DisplayName("getInstancesForJob includes stopping instances (EC2 still exists)") + void getInstancesForJob_includesStoppingInstances() throws Exception { // Given container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); container.getStatuses().add(createStatus("i-stopping", JobStatus.Stopped, VMStatus.stopping)); @@ -122,15 +122,15 @@ void getInstancesForJob_excludesStoppingInstances() throws Exception { // When List instances = invokeGetInstancesForJob("123"); - // Then - assertEquals(1, instances.size()); + // Then: stopping instances are included (EC2 still exists) + assertEquals(2, instances.size()); assertTrue(instances.contains("i-running")); - assertFalse(instances.contains("i-stopping")); + assertTrue(instances.contains("i-stopping")); } @Test - @DisplayName("getInstancesForJob excludes shutting_down instances") - void getInstancesForJob_excludesShuttingDownInstances() throws Exception { + @DisplayName("getInstancesForJob includes shutting_down instances (EC2 still exists)") + void getInstancesForJob_includesShuttingDownInstances() throws Exception { // Given container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); container.getStatuses().add(createStatus("i-shuttingdown", JobStatus.Stopped, VMStatus.shutting_down)); @@ -140,10 +140,10 @@ void getInstancesForJob_excludesShuttingDownInstances() throws Exception { // When List instances = invokeGetInstancesForJob("123"); - // Then - assertEquals(1, instances.size()); + // Then: shutting_down instances are included (EC2 still exists) + assertEquals(2, instances.size()); assertTrue(instances.contains("i-running")); - assertFalse(instances.contains("i-shuttingdown")); + assertTrue(instances.contains("i-shuttingdown")); } @Test @@ -237,9 +237,9 @@ void getInstancesForJob_agentWatchdogScenario() throws Exception { } @Test - @DisplayName("getInstancesForJob excludes all terminal states correctly") - void getInstancesForJob_excludesAllTerminalStates() throws Exception { - // Given: One instance of each terminal state + one running + @DisplayName("getInstancesForJob excludes only replaced instances") + void getInstancesForJob_excludesOnlyReplaced() throws Exception { + // Given: One instance of each state container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); container.getStatuses().add(createStatus("i-terminated", JobStatus.Completed, VMStatus.terminated)); container.getStatuses().add(createStatus("i-replaced", JobStatus.Starting, VMStatus.replaced)); @@ -252,9 +252,15 @@ void getInstancesForJob_excludesAllTerminalStates() throws Exception { // When List instances = invokeGetInstancesForJob("123"); - // Then: Only the running instance should be included - assertEquals(1, instances.size()); + // Then: Only replaced is excluded (watchdog already killed it on AWS) + // All others included (EC2 instances exist or AWS handles idempotently) + assertEquals(5, instances.size()); assertTrue(instances.contains("i-running")); + assertTrue(instances.contains("i-terminated")); + assertTrue(instances.contains("i-stopped")); + assertTrue(instances.contains("i-stopping")); + assertTrue(instances.contains("i-shutting-down")); + assertFalse(instances.contains("i-replaced")); } @Test diff --git a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java index cd18b8ac8..39dd09957 100644 --- a/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java +++ b/tank_vmManager/src/test/java/com/intuit/tank/vmManager/VMTrackerImplTest.java @@ -490,40 +490,32 @@ void concurrentRemoveStatusForJob_noException() throws Exception { } } - // ============ removeStatusForInstance race condition fix tests ============ + // ============ removeStatusForInstance tests ============ @Test - @DisplayName("removeStatusForInstance removes from both statusMap and container atomically") - void removeStatusForInstance_atomicRemoval() throws Exception { - String jobId = "atomic-remove-job"; - String instanceId = "i-atomic-test"; + @DisplayName("removeStatusForInstance removes from statusMap") + void removeStatusForInstance_removesFromStatusMap() throws Exception { + String jobId = "remove-test-job"; + String instanceId = "i-remove-test"; // Given: An instance with status (using direct manipulation) CloudVmStatus status = createStatusWithJobStatus(instanceId, jobId, VMStatus.running, JobStatus.Running); addStatusDirectly(status); - // Verify it exists in both places + // Verify it exists assertNotNull(vmTracker.getStatus(instanceId), "Status should exist in statusMap"); - CloudVmStatusContainer container = vmTracker.getVmStatusForJob(jobId); - assertNotNull(container, "Container should exist"); - assertTrue(container.getStatuses().stream().anyMatch(s -> s.getInstanceId().equals(instanceId)), - "Instance should be in container"); // When: Remove the status vmTracker.removeStatusForInstance(instanceId); - // Then: Should be removed from both statusMap and container + // Then: Should be removed from statusMap assertNull(vmTracker.getStatus(instanceId), "Status should be removed from statusMap"); - container = vmTracker.getVmStatusForJob(jobId); - assertNotNull(container); // Container still exists (job not removed) - assertFalse(container.getStatuses().stream().anyMatch(s -> s.getInstanceId().equals(instanceId)), - "Instance should be removed from container"); } @Test - @DisplayName("removeStatusForInstance handles non-existent instance gracefully with sync") + @DisplayName("removeStatusForInstance handles non-existent instance gracefully") void removeStatusForInstance_nonExistent_noException() { - // Should not throw even with the new synchronized logic + // Should not throw assertDoesNotThrow(() -> vmTracker.removeStatusForInstance("i-does-not-exist")); } } From e9a0e00302e39163dc44134cf9b838cf4f479bc9 Mon Sep 17 00:00:00 2001 From: zkofiro Date: Thu, 29 Jan 2026 16:04:46 -0800 Subject: [PATCH 27/29] revert jobeventsender status filter change --- .../com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java index 14bfa268f..3eabf414d 100644 --- a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java +++ b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java @@ -247,10 +247,7 @@ private List getInstancesForJob(String jobId) { List instanceIds = new ArrayList(); CloudVmStatusContainer statuses = vmTracker.getVmStatusForJob(jobId); if (statuses != null) { - instanceIds = statuses.getStatuses().stream() - .filter(s -> s.getVmStatus() != VMStatus.replaced) - .map(CloudVmStatus::getInstanceId) - .collect(Collectors.toList()); + instanceIds = statuses.getStatuses().stream().map(CloudVmStatus::getInstanceId).collect(Collectors.toList()); } return instanceIds; } From 489b3ef45cecc25225b49328a4630968fe538b0c Mon Sep 17 00:00:00 2001 From: zkofiro Date: Thu, 29 Jan 2026 16:23:08 -0800 Subject: [PATCH 28/29] update test --- .../mvc/rest/cloud/JobEventSenderTest.java | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java b/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java index 740bde1e6..958cb7b7c 100644 --- a/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java +++ b/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java @@ -193,8 +193,8 @@ void getInstancesForJob_returnsEmptyForNonExistent() throws Exception { } @Test - @DisplayName("getInstancesForJob excludes replaced instances (AgentWatchdog replacements)") - void getInstancesForJob_excludesReplacedInstances() throws Exception { + @DisplayName("getInstancesForJob includes replaced instances (needed for EC2 termination)") + void getInstancesForJob_includesReplacedInstances() throws Exception { // Given: VMStatus.replaced is set by AgentWatchdog when an agent fails and is replaced container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); container.getStatuses().add(createStatus("i-replaced", JobStatus.Starting, VMStatus.replaced)); @@ -204,14 +204,14 @@ void getInstancesForJob_excludesReplacedInstances() throws Exception { // When List instances = invokeGetInstancesForJob("123"); - // Then: Replaced instances should be filtered out (can't receive commands) - assertEquals(1, instances.size()); + // Then: ALL instances returned including replaced (EC2 still needs termination) + assertEquals(2, instances.size()); assertTrue(instances.contains("i-running")); - assertFalse(instances.contains("i-replaced")); + assertTrue(instances.contains("i-replaced")); } @Test - @DisplayName("getInstancesForJob filters correctly with mixed statuses from AgentWatchdog scenario") + @DisplayName("getInstancesForJob returns all instances including replaced for AgentWatchdog scenario") void getInstancesForJob_agentWatchdogScenario() throws Exception { // Given: Real-world scenario where AgentWatchdog replaced some agents // - 3 running agents (healthy) @@ -227,18 +227,18 @@ void getInstancesForJob_agentWatchdogScenario() throws Exception { // When: User tries to kill the job List instances = invokeGetInstancesForJob("123"); - // Then: Only healthy instances should receive the kill command - assertEquals(3, instances.size()); + // Then: ALL instances returned (replaced EC2 instances still need termination) + assertEquals(5, instances.size()); assertTrue(instances.contains("i-healthy1")); assertTrue(instances.contains("i-healthy2")); assertTrue(instances.contains("i-healthy3")); - assertFalse(instances.contains("i-replaced1")); - assertFalse(instances.contains("i-replaced2")); + assertTrue(instances.contains("i-replaced1")); + assertTrue(instances.contains("i-replaced2")); } @Test - @DisplayName("getInstancesForJob excludes only replaced instances") - void getInstancesForJob_excludesOnlyReplaced() throws Exception { + @DisplayName("getInstancesForJob returns all instances regardless of status") + void getInstancesForJob_returnsAllInstances() throws Exception { // Given: One instance of each state container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); container.getStatuses().add(createStatus("i-terminated", JobStatus.Completed, VMStatus.terminated)); @@ -252,15 +252,14 @@ void getInstancesForJob_excludesOnlyReplaced() throws Exception { // When List instances = invokeGetInstancesForJob("123"); - // Then: Only replaced is excluded (watchdog already killed it on AWS) - // All others included (EC2 instances exist or AWS handles idempotently) - assertEquals(5, instances.size()); + // Then: ALL instances returned (no filtering - AWS handles idempotently) + assertEquals(6, instances.size()); assertTrue(instances.contains("i-running")); assertTrue(instances.contains("i-terminated")); assertTrue(instances.contains("i-stopped")); assertTrue(instances.contains("i-stopping")); assertTrue(instances.contains("i-shutting-down")); - assertFalse(instances.contains("i-replaced")); + assertTrue(instances.contains("i-replaced")); } @Test From f906fcbf146663985178bd1944fb67f21fcae88e Mon Sep 17 00:00:00 2001 From: zkofiro Date: Thu, 29 Jan 2026 22:19:16 -0800 Subject: [PATCH 29/29] fix for job command filtering --- .../rest/mvc/rest/cloud/JobEventSender.java | 16 ++++++- .../mvc/rest/cloud/JobEventSenderTest.java | 43 ++++++++++--------- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java index 3eabf414d..1cc998ada 100644 --- a/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java +++ b/rest-mvc/impl/src/main/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSender.java @@ -146,7 +146,12 @@ public void killInstances(List instanceIds) { } String jobId = null; for (String instanceId : instanceIds) { - CloudVmStatus status = new CloudVmStatus(vmTracker.getStatus(instanceId)); + CloudVmStatus existingStatus = vmTracker.getStatus(instanceId); + if (existingStatus == null) { + LOG.warn("No status found for instance " + instanceId + " - skipping status update"); + continue; + } + CloudVmStatus status = new CloudVmStatus(existingStatus); status.setCurrentUsers(0); status.setEndTime(new Date()); status.setJobStatus(JobStatus.Completed); @@ -247,7 +252,14 @@ private List getInstancesForJob(String jobId) { List instanceIds = new ArrayList(); CloudVmStatusContainer statuses = vmTracker.getVmStatusForJob(jobId); if (statuses != null) { - instanceIds = statuses.getStatuses().stream().map(CloudVmStatus::getInstanceId).collect(Collectors.toList()); + instanceIds = statuses.getStatuses().stream() + .filter(s -> { + VMStatus vmStatus = s.getVmStatus(); + // Only exclude instances that no longer exist on AWS + return vmStatus != VMStatus.replaced && vmStatus != VMStatus.terminated; + }) + .map(CloudVmStatus::getInstanceId) + .collect(Collectors.toList()); } return instanceIds; } diff --git a/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java b/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java index 958cb7b7c..17562a19f 100644 --- a/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java +++ b/rest-mvc/impl/src/test/java/com/intuit/tank/rest/mvc/rest/cloud/JobEventSenderTest.java @@ -73,8 +73,8 @@ private List invokeGetInstancesForJob(String jobId) throws Exception { } @Test - @DisplayName("getInstancesForJob includes terminated instances (AWS handles idempotently)") - void getInstancesForJob_includesTerminatedInstances() throws Exception { + @DisplayName("getInstancesForJob excludes terminated instances (no longer exist on AWS)") + void getInstancesForJob_excludesTerminatedInstances() throws Exception { // Given: Mix of running and terminated instances container.getStatuses().add(createStatus("i-running1", JobStatus.Running, VMStatus.running)); container.getStatuses().add(createStatus("i-terminated", JobStatus.Stopped, VMStatus.terminated)); @@ -85,11 +85,11 @@ void getInstancesForJob_includesTerminatedInstances() throws Exception { // When List instances = invokeGetInstancesForJob("123"); - // Then: All instances returned (AWS terminateInstances is idempotent) - assertEquals(3, instances.size()); + // Then: Terminated instances excluded (no longer exist on AWS) + assertEquals(2, instances.size()); assertTrue(instances.contains("i-running1")); assertTrue(instances.contains("i-running2")); - assertTrue(instances.contains("i-terminated")); + assertFalse(instances.contains("i-terminated")); } @Test @@ -193,8 +193,8 @@ void getInstancesForJob_returnsEmptyForNonExistent() throws Exception { } @Test - @DisplayName("getInstancesForJob includes replaced instances (needed for EC2 termination)") - void getInstancesForJob_includesReplacedInstances() throws Exception { + @DisplayName("getInstancesForJob excludes replaced instances (can't receive HTTP commands)") + void getInstancesForJob_excludesReplacedInstances() throws Exception { // Given: VMStatus.replaced is set by AgentWatchdog when an agent fails and is replaced container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); container.getStatuses().add(createStatus("i-replaced", JobStatus.Starting, VMStatus.replaced)); @@ -204,14 +204,14 @@ void getInstancesForJob_includesReplacedInstances() throws Exception { // When List instances = invokeGetInstancesForJob("123"); - // Then: ALL instances returned including replaced (EC2 still needs termination) - assertEquals(2, instances.size()); + // Then: Replaced instances filtered out (they can't receive HTTP commands) + assertEquals(1, instances.size()); assertTrue(instances.contains("i-running")); - assertTrue(instances.contains("i-replaced")); + assertFalse(instances.contains("i-replaced")); } @Test - @DisplayName("getInstancesForJob returns all instances including replaced for AgentWatchdog scenario") + @DisplayName("getInstancesForJob excludes replaced instances in AgentWatchdog scenario") void getInstancesForJob_agentWatchdogScenario() throws Exception { // Given: Real-world scenario where AgentWatchdog replaced some agents // - 3 running agents (healthy) @@ -227,18 +227,18 @@ void getInstancesForJob_agentWatchdogScenario() throws Exception { // When: User tries to kill the job List instances = invokeGetInstancesForJob("123"); - // Then: ALL instances returned (replaced EC2 instances still need termination) - assertEquals(5, instances.size()); + // Then: Only healthy instances returned (replaced no longer exist on AWS) + assertEquals(3, instances.size()); assertTrue(instances.contains("i-healthy1")); assertTrue(instances.contains("i-healthy2")); assertTrue(instances.contains("i-healthy3")); - assertTrue(instances.contains("i-replaced1")); - assertTrue(instances.contains("i-replaced2")); + assertFalse(instances.contains("i-replaced1")); + assertFalse(instances.contains("i-replaced2")); } @Test - @DisplayName("getInstancesForJob returns all instances regardless of status") - void getInstancesForJob_returnsAllInstances() throws Exception { + @DisplayName("getInstancesForJob excludes replaced and terminated, keeps other statuses") + void getInstancesForJob_excludesReplacedAndTerminated() throws Exception { // Given: One instance of each state container.getStatuses().add(createStatus("i-running", JobStatus.Running, VMStatus.running)); container.getStatuses().add(createStatus("i-terminated", JobStatus.Completed, VMStatus.terminated)); @@ -252,14 +252,15 @@ void getInstancesForJob_returnsAllInstances() throws Exception { // When List instances = invokeGetInstancesForJob("123"); - // Then: ALL instances returned (no filtering - AWS handles idempotently) - assertEquals(6, instances.size()); + // Then: Excludes replaced and terminated (no longer exist on AWS) + // Keeps stopped/stopping/shutting_down (EC2 still exists, needs termination) + assertEquals(4, instances.size()); assertTrue(instances.contains("i-running")); - assertTrue(instances.contains("i-terminated")); assertTrue(instances.contains("i-stopped")); assertTrue(instances.contains("i-stopping")); assertTrue(instances.contains("i-shutting-down")); - assertTrue(instances.contains("i-replaced")); + assertFalse(instances.contains("i-terminated")); + assertFalse(instances.contains("i-replaced")); } @Test