diff --git a/lambdas/functions/control-plane/src/aws/runners.d.ts b/lambdas/functions/control-plane/src/aws/runners.d.ts index 3a1b31b1cf..72ff9e3e1a 100644 --- a/lambdas/functions/control-plane/src/aws/runners.d.ts +++ b/lambdas/functions/control-plane/src/aws/runners.d.ts @@ -10,6 +10,7 @@ export interface RunnerList { repo?: string; org?: string; orphan?: boolean; + runnerId?: string; } export interface RunnerInfo { diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index b927f98696..9d218605bb 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -4,6 +4,7 @@ import { CreateFleetInstance, CreateFleetResult, CreateTagsCommand, + DeleteTagsCommand, DefaultTargetCapacityType, DescribeInstancesCommand, DescribeInstancesResult, @@ -17,7 +18,7 @@ import { mockClient } from 'aws-sdk-client-mock'; import 'aws-sdk-client-mock-jest/vitest'; import ScaleError from './../scale-runners/ScaleError'; -import { createRunner, listEC2Runners, tag, terminateRunner } from './runners'; +import { createRunner, listEC2Runners, tag, untag, terminateRunner } from './runners'; import { RunnerInfo, RunnerInputParameters, RunnerType } from './runners.d'; import { describe, it, expect, beforeEach, vi } from 'vitest'; @@ -53,6 +54,26 @@ const mockRunningInstances: DescribeInstancesResult = { }, ], }; +const mockRunningInstancesJit: DescribeInstancesResult = { + Reservations: [ + { + Instances: [ + { + LaunchTime: new Date('2020-10-10T14:48:00.000+09:00'), + InstanceId: 'i-1234', + Tags: [ + { Key: 'ghr:Application', Value: 'github-action-runner' }, + { Key: 'ghr:runner_name_prefix', Value: RUNNER_NAME_PREFIX }, + { Key: 'ghr:created_by', Value: 'scale-up-lambda' }, + { Key: 'ghr:Type', Value: 'Org' }, + { Key: 'ghr:Owner', Value: 'CoderToCat' }, + { Key: 'ghr:github_runner_id', Value: '9876543210' }, + ], + }, + ], + }, + ], +}; describe('list instances', () => { beforeEach(() => { @@ -60,7 +81,7 @@ describe('list instances', () => { vi.clearAllMocks(); }); - it('returns a list of instances', async () => { + it('returns a list of instances (Non JIT)', async () => { mockEC2Client.on(DescribeInstancesCommand).resolves(mockRunningInstances); const resp = await listEC2Runners(); expect(resp.length).toBe(1); @@ -73,6 +94,20 @@ describe('list instances', () => { }); }); + it('returns a list of instances (JIT)', async () => { + mockEC2Client.on(DescribeInstancesCommand).resolves(mockRunningInstancesJit); + const resp = await listEC2Runners(); + expect(resp.length).toBe(1); + expect(resp).toContainEqual({ + instanceId: 'i-1234', + launchTime: new Date('2020-10-10T14:48:00.000+09:00'), + type: 'Org', + owner: 'CoderToCat', + orphan: false, + runnerId: '9876543210', + }); + }); + it('check orphan tag.', async () => { const instances: DescribeInstancesResult = mockRunningInstances; instances.Reservations![0].Instances![0].Tags!.push({ Key: 'ghr:orphan', Value: 'true' }); @@ -229,11 +264,35 @@ describe('tag runner', () => { owner: 'owner-2', type: 'Repo', }; - await tag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'truer' }]); + await tag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); + + expect(mockEC2Client).toHaveReceivedCommandWith(CreateTagsCommand, { + Resources: [runner.instanceId], + Tags: [{ Key: 'ghr:orphan', Value: 'true' }], + }); + }); +}); +describe('untag runner', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + it('removing extra tag', async () => { + mockEC2Client.on(DeleteTagsCommand).resolves({}); + const runner: RunnerInfo = { + instanceId: 'instance-2', + owner: 'owner-2', + type: 'Repo', + }; + //await tag(runner.instanceId, [{ Key: 'ghr:orphan', Value: '' }]); expect(mockEC2Client).toHaveReceivedCommandWith(CreateTagsCommand, { Resources: [runner.instanceId], - Tags: [{ Key: 'ghr:orphan', Value: 'truer' }], + Tags: [{ Key: 'ghr:orphan', Value: 'true' }], + }); + await untag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); + expect(mockEC2Client).toHaveReceivedCommandWith(DeleteTagsCommand, { + Resources: [runner.instanceId], + Tags: [{ Key: 'ghr:orphan', Value: 'true' }], }); }); }); diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index dfe4d99fcf..6779dd39d2 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -2,6 +2,7 @@ import { CreateFleetCommand, CreateFleetResult, CreateTagsCommand, + DeleteTagsCommand, DescribeInstancesCommand, DescribeInstancesResult, EC2Client, @@ -91,6 +92,7 @@ function getRunnerInfo(runningInstances: DescribeInstancesResult) { repo: i.Tags?.find((e) => e.Key === 'ghr:Repo')?.Value as string, org: i.Tags?.find((e) => e.Key === 'ghr:Org')?.Value as string, orphan: i.Tags?.find((e) => e.Key === 'ghr:orphan')?.Value === 'true', + runnerId: i.Tags?.find((e) => e.Key === 'ghr:github_runner_id')?.Value as string, }); } } @@ -112,6 +114,12 @@ export async function tag(instanceId: string, tags: Tag[]): Promise { await ec2.send(new CreateTagsCommand({ Resources: [instanceId], Tags: tags })); } +export async function untag(instanceId: string, tags: Tag[]): Promise { + logger.debug(`Untagging '${instanceId}'`, { tags }); + const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION })); + await ec2.send(new DeleteTagsCommand({ Resources: [instanceId], Tags: tags })); +} + function generateFleetOverrides( subnetIds: string[], instancesTypes: string[], diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index 87bab093cb..8dd25323a6 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -4,7 +4,7 @@ import nock from 'nock'; import { RunnerInfo, RunnerList } from '../aws/runners.d'; import * as ghAuth from '../github/auth'; -import { listEC2Runners, terminateRunner, tag } from './../aws/runners'; +import { listEC2Runners, terminateRunner, tag, untag } from './../aws/runners'; import { githubCache } from './cache'; import { newestFirstStrategy, oldestFirstStrategy, scaleDown } from './scale-down'; import { describe, it, expect, beforeEach, vi } from 'vitest'; @@ -33,6 +33,7 @@ vi.mock('./../aws/runners', async (importOriginal) => { return { ...actual, tag: vi.fn(), + untag: vi.fn(), terminateRunner: vi.fn(), listEC2Runners: vi.fn(), }; @@ -62,6 +63,7 @@ const mockedInstallationAuth = vi.mocked(ghAuth.createGithubInstallationAuth); const mockCreateClient = vi.mocked(ghAuth.createOctokitClient); const mockListRunners = vi.mocked(listEC2Runners); const mockTagRunners = vi.mocked(tag); +const mockUntagRunners = vi.mocked(untag); const mockTerminateRunners = vi.mocked(terminateRunner); export interface TestData { @@ -312,7 +314,7 @@ describe('Scale down runners', () => { checkNonTerminated(runners); }); - it(`Should terminate orphan.`, async () => { + it(`Should terminate orphan (Non JIT)`, async () => { // setup const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, false, false); const idleRunner = createRunnerTestData('idle-1', type, MINIMUM_BOOT_TIME + 1, true, false, false); @@ -334,6 +336,7 @@ describe('Scale down runners', () => { Value: 'true', }, ]); + expect(mockTagRunners).not.toHaveBeenCalledWith(idleRunner.instanceId, expect.anything()); // next cycle, update test data set orphan to true and terminate should be true @@ -348,6 +351,58 @@ describe('Scale down runners', () => { checkNonTerminated(runners); }); + it('Should test if orphaned runner, untag if online and busy, else terminate (JIT)', async () => { + // arrange + const orphanRunner = createRunnerTestData( + 'orphan-jit', + type, + MINIMUM_BOOT_TIME + 1, + false, + true, + false, + undefined, + 1234567890, + ); + const runners = [orphanRunner]; + + mockGitHubRunners([]); + mockAwsRunners(runners); + + if (type === 'Repo') { + mockOctokit.actions.getSelfHostedRunnerForRepo.mockResolvedValueOnce({ + data: { id: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'online' }, + }); + } else { + mockOctokit.actions.getSelfHostedRunnerForOrg.mockResolvedValueOnce({ + data: { id: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'online' }, + }); + } + + // act + await scaleDown(); + + // assert + expect(mockUntagRunners).toHaveBeenCalledWith(orphanRunner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); + expect(mockTerminateRunners).not.toHaveBeenCalledWith(orphanRunner.instanceId); + + // arrange + if (type === 'Repo') { + mockOctokit.actions.getSelfHostedRunnerForRepo.mockResolvedValueOnce({ + data: { runnerId: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'offline' }, + }); + } else { + mockOctokit.actions.getSelfHostedRunnerForOrg.mockResolvedValueOnce({ + data: { runnerId: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'offline' }, + }); + } + + // act + await scaleDown(); + + // assert + expect(mockTerminateRunners).toHaveBeenCalledWith(orphanRunner.instanceId); + }); + it(`Should ignore errors when termination orphan fails.`, async () => { // setup const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true); @@ -625,6 +680,7 @@ function createRunnerTestData( orphan: boolean, shouldBeTerminated: boolean, owner?: string, + runnerId?: number, ): RunnerTestItem { return { instanceId: `i-${name}-${type.toLowerCase()}`, @@ -638,5 +694,6 @@ function createRunnerTestData( registered, orphan, shouldBeTerminated, + runnerId: runnerId !== undefined ? String(runnerId) : undefined, }; } diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 10d5615b43..1c8955d268 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -1,9 +1,10 @@ import { Octokit } from '@octokit/rest'; +import { Endpoints } from '@octokit/types'; import { createChildLogger } from '@aws-github-runner/aws-powertools-util'; import moment from 'moment'; import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth'; -import { bootTimeExceeded, listEC2Runners, tag, terminateRunner } from './../aws/runners'; +import { bootTimeExceeded, listEC2Runners, tag, untag, terminateRunner } from './../aws/runners'; import { RunnerInfo, RunnerList } from './../aws/runners.d'; import { GhRunners, githubCache } from './cache'; import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config'; @@ -12,6 +13,10 @@ import { getGitHubEnterpriseApiUrl } from './scale-up'; const logger = createChildLogger('scale-down'); +type OrgRunnerList = Endpoints['GET /orgs/{org}/actions/runners']['response']['data']['runners']; +type RepoRunnerList = Endpoints['GET /repos/{owner}/{repo}/actions/runners']['response']['data']['runners']; +type RunnerState = OrgRunnerList[number] | RepoRunnerList[number]; + async function getOrCreateOctokit(runner: RunnerInfo): Promise { const key = runner.owner; const cachedOctokit = githubCache.clients.get(key); @@ -46,7 +51,11 @@ async function getOrCreateOctokit(runner: RunnerInfo): Promise { return octokit; } -async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, runnerId: number): Promise { +async function getGitHubSelfHostedRunnerState( + client: Octokit, + ec2runner: RunnerInfo, + runnerId: number, +): Promise { const state = ec2runner.type === 'Org' ? await client.actions.getSelfHostedRunnerForOrg({ @@ -58,12 +67,24 @@ async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, owner: ec2runner.owner.split('/')[0], repo: ec2runner.owner.split('/')[1], }); - - logger.info(`Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Busy: ${state.data.busy}`); - metricGitHubAppRateLimit(state.headers); - return state.data.busy; + return { + id: state.data.id, + name: state.data.name, + busy: state.data.busy, + status: state.data.status, + os: state.data.os, + labels: state.data.labels, + runner_group_id: state.data.runner_group_id, + ephemeral: state.data.ephemeral, + }; +} + +async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, runnerId: number): Promise { + const state = await getGitHubSelfHostedRunnerState(client, ec2runner, runnerId); + logger.info(`Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Busy: ${state.busy}`); + return state.busy; } async function listGitHubRunners(runner: RunnerInfo): Promise { @@ -200,18 +221,42 @@ async function markOrphan(instanceId: string): Promise { } } +async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise { + const client = await getOrCreateOctokit(runner as RunnerInfo); + const runnerId = parseInt(runner.runnerId || '0'); + const ec2Instance = runner as RunnerInfo; + const state = await getGitHubSelfHostedRunnerState(client, ec2Instance, runnerId); + logger.debug(`Runner is currently '${runner.instanceId}' state: ${JSON.stringify(state)}`); + if (state.status === 'online' && state.busy) { + logger.info(`Runner '${runner.instanceId}' is orphan, but is online and busy.`); + await untag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); + } else if (state.status === 'offline') { + logger.warn(`Runner '${runner.instanceId}' is orphan.`); + logger.info(`Terminating orphan runner '${runner.instanceId}'`); + await terminateRunner(runner.instanceId).catch((e) => { + logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); + }); + } +} + async function terminateOrphan(environment: string): Promise { try { const orphanRunners = await listEC2Runners({ environment, orphan: true }); for (const runner of orphanRunners) { - logger.info(`Terminating orphan runner '${runner.instanceId}'`); - await terminateRunner(runner.instanceId).catch((e) => { - logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); - }); + // do we have a valid runnerId? then we are in a Jit Runner scenario else, use old method + if (runner.runnerId) { + logger.debug(`Runner '${runner.instanceId}' is orphan, but has a runnerId.`); + await lastChanceCheckOrphanRunner(runner); + } else { + logger.info(`Terminating orphan runner '${runner.instanceId}'`); + await terminateRunner(runner.instanceId).catch((e) => { + logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); + }); + } } } catch (e) { - logger.warn(`Failure during orphan runner termination.`, { error: e }); + logger.warn(`Failure during orphan termination processing.`, { error: e }); } } diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts index 0611a6e697..14c0a0422e 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts @@ -41,6 +41,7 @@ vi.mock('@octokit/rest', () => ({ vi.mock('./../aws/runners', async () => ({ createRunner: vi.fn(), listEC2Runners: vi.fn(), + tag: vi.fn(), })); vi.mock('./../github/auth', async () => ({ @@ -645,7 +646,7 @@ describe('scaleUp with public GH', () => { }); }); - it('JIT config is ingored for non-ephemeral runners.', async () => { + it('JIT config is ignored for non-ephemeral runners.', async () => { process.env.ENABLE_EPHEMERAL_RUNNERS = 'false'; process.env.ENABLE_JIT_CONFIG = 'true'; process.env.ENABLE_JOB_QUEUED_CHECK = 'false'; @@ -1008,11 +1009,13 @@ function defaultOctokitMockImpl() { ]); mockOctokit.actions.generateRunnerJitconfigForOrg.mockImplementation(() => ({ data: { + runner: { id: 9876543210 }, encoded_jit_config: 'TEST_JIT_CONFIG_ORG', }, })); mockOctokit.actions.generateRunnerJitconfigForRepo.mockImplementation(() => ({ data: { + runner: { id: 9876543210 }, encoded_jit_config: 'TEST_JIT_CONFIG_REPO', }, })); diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index 08d16d682a..62da8b79b8 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -4,7 +4,7 @@ import { getParameter, putParameter } from '@aws-github-runner/aws-ssm-util'; import yn from 'yn'; import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth'; -import { createRunner, listEC2Runners } from './../aws/runners'; +import { createRunner, listEC2Runners, tag } from './../aws/runners'; import { RunnerInputParameters } from './../aws/runners.d'; import ScaleError from './ScaleError'; import { publishRetryMessage } from './job-retry'; @@ -416,6 +416,14 @@ async function createRegistrationTokenConfig( } } +async function addGhRunnerIdToEC2InstanceTag(instanceId: string, runnerId: string): Promise { + try { + await tag(instanceId, [{ Key: 'ghr:github_runner_id', Value: runnerId }]); + } catch (e) { + logger.error(`Failed to mark runner '${instanceId}' with ${runnerId}.`, { error: e }); + } +} + async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, instances: string[], ghClient: Octokit) { const runnerGroupId = await getRunnerGroupId(githubRunnerConfig, ghClient); const { isDelay, delay } = addDelay(instances); @@ -449,6 +457,9 @@ async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, ins metricGitHubAppRateLimit(runnerConfig.headers); + // tag the EC2 instance with the Github runner id + await addGhRunnerIdToEC2InstanceTag(instance, runnerConfig.data.runner.id.toString()); + // store jit config in ssm parameter store logger.debug('Runner JIT config for ephemeral runner generated.', { instance: instance,