Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
6c3cb1e
fix: prevent job stuck at Starting when AgentWatchdog replaces failed…
Dec 19, 2025
8f893fb
add logging to understand it all
Dec 19, 2025
e2e4be2
replaced agents stay visible in the UI
Dec 19, 2025
1e4bd64
TEMP CHANGE: test public/private ip logging w/ fix
Dec 19, 2025
cb2c816
fixes agent count + test publicIp logging
Dec 19, 2025
eabf716
fixes tests
Dec 20, 2025
77316ab
Merge branch 'master' into fix-terminated-job-status
Dec 20, 2025
07066e6
Add VMStatus.replaced for AgentWatchdog visibility + thread safety fixes
Jan 5, 2026
c5cfdfb
up that test coverage for new code
Jan 6, 2026
574b01e
we can undo the bandaid 'pending' agent status implemented for replac…
Jan 6, 2026
b92c4c6
fix: race condition in removeStatusForInstance + add unit tests + rem…
Jan 7, 2026
213a7f2
remove unnecessary logging
Jan 7, 2026
df63e71
no more log flooding
Jan 8, 2026
8dd9084
this was actually helpful
Jan 9, 2026
e04d193
Merge branch 'master' into fix-terminated-job-status
Jan 9, 2026
c09111f
very helpful ip logging
Jan 9, 2026
311492a
goodbye // TODO Do we have to kill jobs here?
Jan 27, 2026
cd90712
add agent ready timer to better time watchdog timeouts + fix bug with…
Jan 27, 2026
17b9c06
update comment + jobId null check
Jan 28, 2026
9a1e5fb
fixes job / agent status issues
Jan 28, 2026
fd49ecb
more fixes
Jan 28, 2026
790f870
potential fix for agent status race conditions: handle async setStatu…
Jan 28, 2026
8b44c30
Merge branch 'master' into fix-terminated-job-status
Jan 28, 2026
cb2fbbd
update tests to account for changes
Jan 28, 2026
859fe61
add defensive programming by copying array instead of referencing
Jan 28, 2026
ce8468c
do NOT fix (or rather ruin) what's not broken
Jan 29, 2026
46f2bc1
more unvibe coding this mess
Jan 29, 2026
f9855eb
removing more useless code, sticking to feature
Jan 29, 2026
d2ca893
update tests to account for vibe code clean up
Jan 29, 2026
e9a0e00
revert jobeventsender status filter change
Jan 30, 2026
489b3ef
update test
Jan 30, 2026
f906fcb
fix for job command filtering
Jan 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,10 @@ private void updateInstanceStatus() {
newStatus.setTotalTps(tpsInfo.getTotalTps());
sendTps(tpsInfo);
}
if (!isLocal) setInstanceStatus(newStatus.getInstanceId(), newStatus);

if (!isLocal) {
setInstanceStatus(newStatus.getInstanceId(), newStatus);
}
APITestHarness.getInstance().checkAgentThreads();
} catch (Exception t) {
LOG.error(LogUtil.getLogMessage("Unable to send status metrics | " + t.getMessage()), t);
Expand Down Expand Up @@ -120,13 +123,20 @@ private CloudVmStatus createStatus(WatsAgentStatusResponse agentStatus) {
*/
private JobStatus calculateJobStatus(WatsAgentStatusResponse agentStatus, JobStatus currentStatus) {
AgentCommand cmd = APITestHarness.getInstance().getCmd();
return cmd == AgentCommand.pause ? JobStatus.Paused
JobStatus newStatus = cmd == AgentCommand.pause ? JobStatus.Paused
: cmd == AgentCommand.stop ? JobStatus.Stopped
: cmd == AgentCommand.pause_ramp ? JobStatus.RampPaused
: currentStatus == JobStatus.Unknown
|| currentStatus == JobStatus.Starting
&& agentStatus.getCurrentNumberUsers() > 0 ? JobStatus.Running
: currentStatus;

if (newStatus != currentStatus) {
LOG.info(LogUtil.getLogMessage("Agent JobStatus transition: " + currentStatus + " -> " + newStatus +
" (cmd=" + cmd + ", currentUsers=" + agentStatus.getCurrentNumberUsers() + ")"));
}

return newStatus;
}

public static void setDoMonitor(boolean monitor) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ private static void handleRequest(HttpExchange exchange) {
String path = exchange.getRequestURI().getPath();
if (path.equals(AgentCommand.start.getPath()) || path.equals(AgentCommand.run.getPath())) {
response = "Received command " + path + ", Starting Test JobId=" + APITestHarness.getInstance().getAgentRunData().getJobId();
LOG.info(LogUtil.getLogMessage("Received START command - launching test threads for job " +
APITestHarness.getInstance().getAgentRunData().getJobId()));
startTest();
} else if (path.startsWith(AgentCommand.stop.getPath())) {
response = "Received command " + path + ", Stopping Test JobId=" + APITestHarness.getInstance().getAgentRunData().getJobId();
Expand Down
15 changes: 10 additions & 5 deletions api/src/main/java/com/intuit/tank/agent/models/VMStatus.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,18 @@ public enum VMStatus implements Serializable {
terminated;

public static final VMStatus fromString(String value) {
VMStatus ret = null;
if (value == null || value.isEmpty()) {
return VMStatus.unknown;
}
if ("shutting-down".equals(value)) {
ret = VMStatus.shutting_down;
} else {
ret = VMStatus.valueOf(value);
return VMStatus.shutting_down;
}
try {
return VMStatus.valueOf(value);
} catch (IllegalArgumentException e) {
// Gracefully handle unknown values (e.g., 'replaced' from controller which agent doesn't need)
return VMStatus.unknown;
}
return ret != null ? ret : VMStatus.unknown;
}

}
17 changes: 0 additions & 17 deletions api/src/main/java/com/intuit/tank/vm/settings/VmManagerConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -204,23 +204,6 @@ public VmInstanceType getInstanceType(String name) {
return ret;
}

/**
 * Returns the configured maximum time to wait for an agent to start.
 * <p>
 * Reads the {@code watchdog/max-time-for-agent-start} setting and converts it with
 * {@code TimeUtil.parseTimeString}. When the setting is absent, or parsing throws, the
 * supplied fallback is returned instead of propagating the failure.
 *
 * @param defaultMills
 *            value returned when the setting is missing or cannot be parsed
 *            (presumably milliseconds, going by the name - confirm against callers)
 * @return the parsed setting, or {@code defaultMills}
 */
public long getMaxAgentStartMills(long defaultMills) {
    String configured = config.getString("watchdog/max-time-for-agent-start");
    if (configured == null) {
        return defaultMills;
    }
    try {
        return TimeUtil.parseTimeString(configured);
    } catch (Exception e) {
        // Deliberately best-effort: fall back to the default, but keep the offending value
        // and the full stack trace in the log so a bad setting can be diagnosed.
        LOG.error("Unable to parse watchdog/max-time-for-agent-start value '" + configured + "': " + e, e);
        return defaultMills;
    }
}

/**
*
* @return the provider classname. should be an instance of IDatabase
Expand Down
38 changes: 38 additions & 0 deletions api/src/main/java/com/intuit/tank/vm/vmManager/VMInformation.java
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,44 @@ public String getPrivateDNS() {
return (String) this.items.get("privateDns");
}

/**
 * Records the private IP address of this virtual machine.
 *
 * @param data
 *            the private IP address, stored in {@code items} under the key {@code "privateIp"}
 */
public void setPrivateIp(String data) {
    final String key = "privateIp";
    this.items.put(key, data);
}

/**
 * Looks up the private IP address of this virtual machine.
 *
 * @return the entry held in {@code items} under the key {@code "privateIp"}, cast to {@code String}
 */
public String getPrivateIp() {
    final Object stored = this.items.get("privateIp");
    return (String) stored;
}

/**
 * Records the public IP address of this virtual machine.
 *
 * @param data
 *            the public IP address, stored in {@code items} under the key {@code "publicIp"}
 */
public void setPublicIp(String data) {
    final String key = "publicIp";
    this.items.put(key, data);
}

/**
 * Looks up the public IP address of this virtual machine.
 *
 * @return the entry held in {@code items} under the key {@code "publicIp"}, cast to {@code String}
 */
public String getPublicIp() {
    final Object stored = this.items.get("publicIp");
    return (String) stored;
}

public void setLaunchTime(Calendar data) {
this.items.put("launchTime", data);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ public enum VMStatus implements Serializable {
stopping,
stopped,
shutting_down,
terminated;
terminated,
replaced; // replaced by AgentWatchdog due to failure to report back

public static final VMStatus fromString(String value) {
VMStatus ret = null;
Expand Down
75 changes: 75 additions & 0 deletions api/src/test/java/com/intuit/tank/agent/models/VMStatusTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package com.intuit.tank.agent.models;

import org.junit.jupiter.api.*;

import static org.junit.jupiter.api.Assertions.*;

/**
 * Unit tests for {@code VMStatus.fromString} on the agent-side enum.
 * <p>
 * NOTE(review): these tests assume the agent-side enum has no {@code replaced} constant
 * (it is expected to be meaningful only on the controller side) - confirm against the enum.
 */
public class VMStatusTest {

    /**
     * Parses {@code raw} exactly once and verifies the result is non-null and equal to
     * {@code expected}.
     */
    private static void assertParsesTo(VMStatus expected, String raw) {
        VMStatus parsed = VMStatus.fromString(raw);

        assertNotNull(parsed);
        assertEquals(expected, parsed);
    }

    @Test
    @DisplayName("fromString handles shutting-down special case")
    public void testFromString_shuttingDown() {
        // The hyphenated input maps onto the underscore-named constant
        assertParsesTo(VMStatus.shutting_down, "shutting-down");
    }

    @Test
    @DisplayName("fromString returns correct enum for valid values")
    public void testFromString_validValues() {
        // Each constant must round-trip through its own name, checked in this order
        VMStatus[] expectedInOrder = {
                VMStatus.running,
                VMStatus.pending,
                VMStatus.starting,
                VMStatus.rebooting,
                VMStatus.terminated,
                VMStatus.stopped,
                VMStatus.stopping,
                VMStatus.rampPaused
        };
        for (VMStatus expected : expectedInOrder) {
            assertEquals(expected, VMStatus.fromString(expected.name()));
        }
    }

    @Test
    @DisplayName("fromString returns unknown for null input")
    public void testFromString_nullReturnsUnknown() {
        assertParsesTo(VMStatus.unknown, null);
    }

    @Test
    @DisplayName("fromString returns unknown for empty string")
    public void testFromString_emptyReturnsUnknown() {
        assertParsesTo(VMStatus.unknown, "");
    }

    @Test
    @DisplayName("fromString returns unknown for unrecognized values")
    public void testFromString_unknownValueReturnsUnknown() {
        // An unrecognized name must map to unknown rather than raise IllegalArgumentException
        assertParsesTo(VMStatus.unknown, "garbage-value");
    }

    @Test
    @DisplayName("fromString handles 'replaced' from controller gracefully (returns unknown)")
    public void testFromString_replacedFromControllerReturnsUnknown() {
        // 'replaced' is expected to be absent from the agent-side enum, so a value sent by
        // the controller should be mapped to unknown
        assertParsesTo(VMStatus.unknown, "replaced");
    }
}

Original file line number Diff line number Diff line change
Expand Up @@ -19,50 +19,61 @@

/**
* The class <code>VMStatusTest</code> contains tests for the class <code>{@link VMStatus}</code>.
*
* @generatedBy CodePro at 12/15/14 2:57 PM
*/
public class VMStatusTest {
/**
* Run the VMStatus fromString(String) method test.
*
* @throws Exception
*
* @generatedBy CodePro at 12/15/14 2:57 PM
*/

@Test
@Disabled
public void testFromString_1()
throws Exception {
String value = "shutting-down";
@DisplayName("fromString handles shutting-down special case")
public void testFromString_shuttingDown() {
VMStatus result = VMStatus.fromString("shutting-down");

assertNotNull(result);
assertEquals(VMStatus.shutting_down, result);
}

VMStatus result = VMStatus.fromString(value);
@Test
@DisplayName("fromString returns correct enum for valid values")
public void testFromString_validValues() {
assertEquals(VMStatus.running, VMStatus.fromString("running"));
assertEquals(VMStatus.pending, VMStatus.fromString("pending"));
assertEquals(VMStatus.starting, VMStatus.fromString("starting"));
assertEquals(VMStatus.ready, VMStatus.fromString("ready"));
assertEquals(VMStatus.rebooting, VMStatus.fromString("rebooting"));
assertEquals(VMStatus.terminated, VMStatus.fromString("terminated"));
assertEquals(VMStatus.stopped, VMStatus.fromString("stopped"));
assertEquals(VMStatus.stopping, VMStatus.fromString("stopping"));
assertEquals(VMStatus.rampPaused, VMStatus.fromString("rampPaused"));
}

@Test
@DisplayName("fromString returns replaced for 'replaced' value")
public void testFromString_replaced() {
VMStatus result = VMStatus.fromString("replaced");

assertNotNull(result);
assertEquals("shutting_down", result.name());
assertEquals("shutting_down", result.toString());
assertEquals(6, result.ordinal());
assertEquals(VMStatus.replaced, result);
}

/**
* Run the VMStatus fromString(String) method test.
*
* @throws Exception
*
* @generatedBy CodePro at 12/15/14 2:57 PM
*/
@Test
public void testFromString_2()
throws Exception {
String value = VMStatus.rebooting.name();
@DisplayName("fromString throws for invalid values (original behavior)")
public void testFromString_invalidValueThrows() {
// Original behavior: valueOf throws IllegalArgumentException for invalid values
assertThrows(IllegalArgumentException.class, () -> VMStatus.fromString("garbage-value"));
}

VMStatus result = VMStatus.fromString(value);
@Test
@DisplayName("fromString throws for null input (original behavior)")
public void testFromString_nullThrows() {
// Original behavior: valueOf throws NullPointerException for null
assertThrows(NullPointerException.class, () -> VMStatus.fromString(null));
}

// An unexpected exception was thrown in user code while executing this test:
// java.lang.IllegalArgumentException: No enum constant com.intuit.tank.vm.vmManager.models.VMStatus.
// at java.lang.Enum.valueOf(Enum.java:238)
// at com.intuit.tank.vm.vmManager.models.VMStatus.valueOf(VMStatus.java:5)
// at com.intuit.tank.vm.vmManager.models.VMStatus.fromString(VMStatus.java:20)
assertNotNull(result);
@Test
@DisplayName("replaced is a terminal state (for documentation)")
public void testReplaced_isTerminalState() {
// This test documents that 'replaced' is intended as a terminal state
// like 'terminated', used by AgentWatchdog when replacing failed agents
assertNotNull(VMStatus.replaced);
assertEquals("replaced", VMStatus.replaced.name());
}
}
14 changes: 6 additions & 8 deletions docs/installation-guide/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -434,15 +434,13 @@ Configuration is achieved via an XML file called `settings.xml`. The default is
<type name="c3.8xlarge" cost="1.68" users="16000" cpus="32" ecus="108" mem="60" jvmArgs="-Xms50g -Xmx50g" />
</instance-types>

<!-- watchdog is used to check if all agents start correctly. -->
<!-- watchdog monitors agents after EC2 launch, waiting for them to call /v2/agent/ready -->
<watchdog>
<!-- The maximum amount of time to wait for the agents to start before restarting -->
<max-time-for-agent-start>3m</max-time-for-agent-start>
<!-- The maximum amount of time to wait for the agents to report to the controller before restarting -->
<max-time-for-agent-report>5m</max-time-for-agent-report>
<!-- the maximum number of restarts before stopping test -->
<max-restarts>2</max-restarts>
<!-- The amount of time to wait between checking for agent start -->
<!-- Time to wait for agents to report ready before relaunching -->
<max-time-for-agent-report>3m</max-time-for-agent-report>
<!-- Max relaunch attempts before aborting the job -->
<max-restarts>3</max-restarts>
<!-- Polling interval for checking agent status -->
<sleep-time-between-check>30s</sleep-time-between-check>
</watchdog>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,12 @@ public void killInstances(List<String> instanceIds) {
}
String jobId = null;
for (String instanceId : instanceIds) {
CloudVmStatus status = new CloudVmStatus(vmTracker.getStatus(instanceId));
CloudVmStatus existingStatus = vmTracker.getStatus(instanceId);
if (existingStatus == null) {
LOG.warn("No status found for instance " + instanceId + " - skipping status update");
continue;
}
CloudVmStatus status = new CloudVmStatus(existingStatus);
status.setCurrentUsers(0);
status.setEndTime(new Date());
status.setJobStatus(JobStatus.Completed);
Expand Down Expand Up @@ -247,7 +252,14 @@ private List<String> getInstancesForJob(String jobId) {
List<String> instanceIds = new ArrayList<String>();
CloudVmStatusContainer statuses = vmTracker.getVmStatusForJob(jobId);
if (statuses != null) {
instanceIds = statuses.getStatuses().stream().map(CloudVmStatus::getInstanceId).collect(Collectors.toList());
instanceIds = statuses.getStatuses().stream()
.filter(s -> {
VMStatus vmStatus = s.getVmStatus();
// Only exclude instances that no longer exist on AWS
return vmStatus != VMStatus.replaced && vmStatus != VMStatus.terminated;
})
.map(CloudVmStatus::getInstanceId)
.collect(Collectors.toList());
}
return instanceIds;
}
Expand All @@ -260,8 +272,10 @@ public CloudVmStatus getVmStatus(String instanceId) {

public void setVmStatus(final String instanceId, final CloudVmStatus status) {
vmTracker.setStatus(status);
if (status.getJobStatus() == JobStatus.Completed || status.getVmStatus() == VMStatus.terminated) {
// will terrminate instance after waiting for some cleanup time
if (status.getJobStatus() == JobStatus.Completed
|| status.getVmStatus() == VMStatus.terminated
|| status.getVmStatus() == VMStatus.replaced) {
// will terminate instance after waiting for some cleanup time
terminator.terminate(status.getInstanceId());
// check job status and kill off instances appropriately
checkJobStatus(status.getJobId());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,11 @@ public void setInstanceStatus(String instanceId, CloudVmStatus status) {
segment.putAnnotation("currentUsers", status.getCurrentUsers());
segment.putAnnotation("TotalUsers", status.getTotalUsers());
segment.putAnnotation("totalTps", status.getTotalTps());

LOGGER.debug("Agent " + instanceId + " reporting status - VMStatus: " + status.getVmStatus() +
", JobStatus: " + status.getJobStatus() + ", Users: " + status.getCurrentUsers() +
"/" + status.getTotalUsers() + ", Job: " + status.getJobId());

try {
JobEventSender controller = new ServletInjector<JobEventSender>().getManagedBean(
servletContext, JobEventSender.class);
Expand Down
Loading