From e3bfcb7de35d389cd52a289d2651096f8536a86c Mon Sep 17 00:00:00 2001 From: Jack Venberg Date: Sun, 29 Mar 2026 23:37:34 -0700 Subject: [PATCH] Fix TOCTOU race condition in scale-down that terminates busy runners The scale-down lambda had a race condition where a job could be assigned to a runner between checking its busy state and terminating the EC2 instance. This caused in-flight jobs (e.g., Helm deploys) to be killed mid-execution when the instance was terminated. The fix adds a post-deregistration busy re-check: 1. Check busy state (fast-path to skip busy runners) 2. Deregister the runner from GitHub (prevents new job assignment) 3. Re-check busy state after deregistration After deregistration, GitHub cannot assign new jobs to the runner, so the re-check is stable (no more race window). If the runner became busy between step 1 and step 2, the in-flight job will complete using its job-scoped OAuth token and the instance is left for orphan cleanup. --- .../src/scale-runners/scale-down.test.ts | 76 +++++++++++++++ .../src/scale-runners/scale-down.ts | 94 ++++++++++++++----- 2 files changed, 145 insertions(+), 25 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index bfa2b5600f..cd9f4a554b 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -293,6 +293,82 @@ describe('Scale down runners', () => { checkNonTerminated(runners); }); + it(`Should not terminate a runner that became busy between deregister and post-deregister check.`, async () => { + // setup: runner appears idle on first check, deregister succeeds, + // but becomes busy on the post-deregister re-check (race condition) + const runners = [ + createRunnerTestData( + 'race-condition-1', + type, + MINIMUM_TIME_RUNNING_IN_MINUTES + 1, + true, + false, + false, + ), + ]; + + mockGitHubRunners(runners); + mockAwsRunners(runners); + + // First call to getSelfHostedRunnerFor* returns not-busy (pre-deregister check), + // second call returns busy (post-deregister re-check) + let callCount = 0; + const busyCheckMock = () => { + callCount++; + if (callCount <= 1) { + return { data: { busy: false } }; + } + return { data: { busy: true } }; + }; + mockOctokit.actions.getSelfHostedRunnerForRepo.mockImplementation(busyCheckMock); + mockOctokit.actions.getSelfHostedRunnerForOrg.mockImplementation(busyCheckMock); + + // act + await expect(scaleDown()).resolves.not.toThrow(); + + // assert: runner should NOT be terminated because it was busy on re-check + checkTerminated(runners); + checkNonTerminated(runners); + }); + + it(`Should terminate a runner when post-deregister busy check returns 404.`, async () => { + // setup: after deregistration, GitHub API returns 404 for the runner + // (runner fully removed). This means it can't be busy, safe to terminate. + const runners = [ + createRunnerTestData( + 'deregistered-404', + type, + MINIMUM_TIME_RUNNING_IN_MINUTES + 1, + true, + false, + true, + ), + ]; + + mockGitHubRunners(runners); + mockAwsRunners(runners); + + // First call returns not-busy (pre-deregister check), + // second call throws 404 (post-deregister, runner is gone) + let callCount = 0; + const busyCheckMock = () => { + callCount++; + if (callCount <= 1) { + return { data: { busy: false } }; + } + throw new Error('Not Found'); + }; + mockOctokit.actions.getSelfHostedRunnerForRepo.mockImplementation(busyCheckMock); + mockOctokit.actions.getSelfHostedRunnerForOrg.mockImplementation(busyCheckMock); + + // act + await expect(scaleDown()).resolves.not.toThrow(); + + // assert: runner should be terminated because 404 means not busy + checkTerminated(runners); + checkNonTerminated(runners); + }); + it(`Should terminate orphan.`, async () => { // setup const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, false, false); diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index be7183f929..9eade8ba70 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -106,39 +106,83 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean { async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise { const githubAppClient = await getOrCreateOctokit(ec2runner); try { + // Step 1: Check busy state first as a fast-path to skip busy runners. const states = await Promise.all( ghRunnerIds.map(async (ghRunnerId) => { - // Get busy state instead of using the output of listGitHubRunners(...) to minimize to race condition. + // Get busy state instead of using the output of listGitHubRunners(...) to minimize the race condition. return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId); }), ); - if (states.every((busy) => busy === false)) { - const statuses = await Promise.all( - ghRunnerIds.map(async (ghRunnerId) => { - return ( - ec2runner.type === 'Org' - ? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({ - runner_id: ghRunnerId, - org: ec2runner.owner, - }) - : await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({ - runner_id: ghRunnerId, - owner: ec2runner.owner.split('/')[0], - repo: ec2runner.owner.split('/')[1], - }) - ).status; - }), - ); + if (!states.every((busy) => busy === false)) { + logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`); + return; + } - if (statuses.every((status) => status == 204)) { - await terminateRunner(ec2runner.instanceId); - logger.debug(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`); - } else { - logger.error(`Failed to de-register GitHub runner: ${statuses}`); - } + // Step 2: De-register the runner from GitHub. This prevents GitHub from assigning + // new jobs to this runner, closing the TOCTOU race window where a job could be + // assigned between checking busy state and terminating the instance. + const statuses = await Promise.all( + ghRunnerIds.map(async (ghRunnerId) => { + return ( + ec2runner.type === 'Org' + ? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({ + runner_id: ghRunnerId, + org: ec2runner.owner, + }) + : await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({ + runner_id: ghRunnerId, + owner: ec2runner.owner.split('/')[0], + repo: ec2runner.owner.split('/')[1], + }) + ).status; + }), + ); + + if (!statuses.every((status) => status == 204)) { + logger.error(`Failed to de-register GitHub runner: ${statuses}`); + return; + } + + // Step 3: Re-check busy state after de-registration to catch the race condition. + // A job may have been assigned between Step 1 and Step 2. After de-registration, + // no NEW jobs can be assigned, so this check is now stable. + // + // If the runner is busy, the in-flight job will continue to completion using its + // job-scoped OAuth token. The runner worker communicates with GitHub using credentials + // from the job message, not the runner registration. + // See: https://github.com/actions/runner/blob/main/src/Runner.Worker/JobRunner.cs + // - Line 80-95: Worker creates its own VssConnection using job-scoped credentials + // - The worker never checks runner registration status during execution + // + // We leave the instance running so the job can finish. It will be cleaned up as + // an orphan on a subsequent scale-down cycle. + const postDeregisterStates = await Promise.all( + ghRunnerIds.map(async (ghRunnerId) => { + try { + return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId); + } catch (e) { + // The runner was just de-registered, so the GitHub API may 404. + // This means GitHub has already fully removed the runner, so it + // cannot be busy. Safe to proceed with termination. + logger.debug( + `Could not re-check busy state for de-registered runner '${ec2runner.instanceId}' ` + + `(GitHub Runner ID '${ghRunnerId}'): ${e}. Treating as not busy.`, + ); + return false; + } + }), + ); + + if (postDeregisterStates.every((busy) => busy === false)) { + await terminateRunner(ec2runner.instanceId); + logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`); } else { - logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`); + logger.warn( + `Runner '${ec2runner.instanceId}' became busy between idle check and de-registration. ` + + `The in-flight job will complete using its job-scoped credentials. ` + + `The instance will be cleaned up as an orphan on a subsequent scale-down cycle.`, + ); } } catch (e) { logger.error(`Runner '${ec2runner.instanceId}' cannot be de-registered. Error: ${e}`, {