diff --git a/docs/index.md b/docs/index.md index 38d7d8cb95..77a248f737 100644 --- a/docs/index.md +++ b/docs/index.md @@ -46,7 +46,7 @@ The "Scale Up Runner" Lambda actively monitors the SQS queue, processing incomin The Lambda first requests a JIT configuration or registration token from GitHub, which is needed later by the runner to register itself. This avoids the case that the EC2 instance, which later in the process will install the agent, needs administration permissions to register the runner. Next, the EC2 spot instance is created via the launch template. The launch template defines the specifications of the required instance and contains a [`user_data`](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html) script. This script will install the required software and configure it. The configuration for the runner is shared via EC2 tags and the parameter store (SSM), from which the user data script will fetch it and delete it once it has been retrieved. Once the user data script is finished, the action runner should be online, and the workflow will start in seconds. -The current method for scaling down runners employs a straightforward approach: at predefined intervals, the Lambda conducts a thorough examination of each runner (instance) to assess its activity. If a runner is found to be idle, it is deregistered from GitHub, and the associated AWS instance is terminated. For ephemeral runners the the instance is terminated immediately after the workflow is finished. To avoid orphaned runners the scale down lambda is active in this cae as well. +The current method for scaling down runners employs a straightforward approach: at predefined intervals, the Lambda conducts a thorough examination of each runner (instance) to assess its activity. If a runner is found to be idle, it is deregistered from GitHub, and the associated AWS instance is terminated. For ephemeral runners the the instance is terminated immediately after the workflow is finished. Instances not registered in GitHub as a runner after a minimal boot time will be marked orphan and removed in a next cycle. To avoid orphaned runners the scale down lambda is active in this cae as well. ### Pool @@ -68,7 +68,7 @@ The AMI cleaner is a lambda that will clean up AMIs that are older than a config > This feature is Beta, changes will not trigger a major release as long in beta. -The Instance Termination Watcher is creating log and optional metrics for termination of instances. Currently only spot termination warnings are watched. See [configuration](configuration/) for more details. +The Instance Termination Watcher is creating log and optional metrics for termination of instances. Currently only spot termination warnings are watched. See [configuration](configuration/) for more details. ### Security diff --git a/lambdas/functions/control-plane/jest.config.ts b/lambdas/functions/control-plane/jest.config.ts index e96f2cbc3d..dd8ac50414 100644 --- a/lambdas/functions/control-plane/jest.config.ts +++ b/lambdas/functions/control-plane/jest.config.ts @@ -6,10 +6,10 @@ const config: Config = { ...defaultConfig, coverageThreshold: { global: { - statements: 97.79, - branches: 96.13, - functions: 95.4, - lines: 98.06, + statements: 98.01, + branches: 97.28, + functions: 95.6, + lines: 97.94, }, }, }; diff --git a/lambdas/functions/control-plane/package.json b/lambdas/functions/control-plane/package.json index 3d5017a6e8..b32075c98d 100644 --- a/lambdas/functions/control-plane/package.json +++ b/lambdas/functions/control-plane/package.json @@ -8,7 +8,7 @@ "test": "NODE_ENV=test nx test", "test:watch": "NODE_ENV=test nx test --watch", "lint": "yarn eslint src", - "watch": "ts-node-dev --respawn --exit-child --files src/local.ts", + "watch": "ts-node-dev --respawn --exit-child --files src/local-down.ts", "build": "ncc build src/lambda.ts -o dist", "dist": "yarn build && cd dist && zip ../runners.zip index.js", "format": "prettier --write \"**/*.ts\"", diff --git a/lambdas/functions/control-plane/src/aws/runners.d.ts b/lambdas/functions/control-plane/src/aws/runners.d.ts index ea1f86b735..3a1b31b1cf 100644 --- a/lambdas/functions/control-plane/src/aws/runners.d.ts +++ b/lambdas/functions/control-plane/src/aws/runners.d.ts @@ -9,6 +9,7 @@ export interface RunnerList { type?: string; repo?: string; org?: string; + orphan?: boolean; } export interface RunnerInfo { @@ -22,6 +23,7 @@ export interface ListRunnerFilters { runnerType?: RunnerType; runnerOwner?: string; environment?: string; + orphan?: boolean; statuses?: string[]; } diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index d233b16a69..09a0e8b710 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -3,6 +3,7 @@ import { CreateFleetCommandInput, CreateFleetInstance, CreateFleetResult, + CreateTagsCommand, DefaultTargetCapacityType, DescribeInstancesCommand, DescribeInstancesResult, @@ -16,7 +17,7 @@ import { mockClient } from 'aws-sdk-client-mock'; import 'aws-sdk-client-mock-jest'; import ScaleError from './../scale-runners/ScaleError'; -import { createRunner, listEC2Runners, terminateRunner } from './runners'; +import { createRunner, listEC2Runners, tag, terminateRunner } from './runners'; import { RunnerInfo, RunnerInputParameters, RunnerType } from './runners.d'; process.env.AWS_REGION = 'eu-east-1'; @@ -67,6 +68,23 @@ describe('list instances', () => { launchTime: new Date('2020-10-10T14:48:00.000+09:00'), type: 'Org', owner: 'CoderToCat', + orphan: false, + }); + }); + + it('check orphan tag.', async () => { + const instances: DescribeInstancesResult = mockRunningInstances; + instances.Reservations![0].Instances![0].Tags!.push({ Key: 'ghr:orphan', Value: 'true' }); + mockEC2Client.on(DescribeInstancesCommand).resolves(instances); + + const resp = await listEC2Runners(); + expect(resp.length).toBe(1); + expect(resp).toContainEqual({ + instanceId: instances.Reservations![0].Instances![0].InstanceId!, + launchTime: instances.Reservations![0].Instances![0].LaunchTime!, + type: 'Org', + owner: 'CoderToCat', + orphan: true, }); }); @@ -114,6 +132,23 @@ describe('list instances', () => { }); }); + it('filters instances on environment and orphan', async () => { + mockRunningInstances.Reservations![0].Instances![0].Tags!.push({ + Key: 'ghr:orphan', + Value: 'true', + }); + mockEC2Client.on(DescribeInstancesCommand).resolves(mockRunningInstances); + await listEC2Runners({ environment: ENVIRONMENT, orphan: true }); + expect(mockEC2Client).toHaveReceivedCommandWith(DescribeInstancesCommand, { + Filters: [ + { Name: 'instance-state-name', Values: ['running', 'pending'] }, + { Name: 'tag:ghr:environment', Values: [ENVIRONMENT] }, + { Name: 'tag:ghr:orphan', Values: ['true'] }, + { Name: 'tag:ghr:Application', Values: ['github-action-runner'] }, + ], + }); + }); + it('No instances, undefined reservations list.', async () => { const noInstances: DescribeInstancesResult = { Reservations: undefined, @@ -182,6 +217,26 @@ describe('terminate runner', () => { }); }); +describe('tag runner', () => { + beforeEach(() => { + jest.clearAllMocks(); + }); + it('adding extra tag', async () => { + mockEC2Client.on(CreateTagsCommand).resolves({}); + const runner: RunnerInfo = { + instanceId: 'instance-2', + owner: 'owner-2', + type: 'Repo', + }; + await tag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'truer' }]); + + expect(mockEC2Client).toHaveReceivedCommandWith(CreateTagsCommand, { + Resources: [runner.instanceId], + Tags: [{ Key: 'ghr:orphan', Value: 'truer' }], + }); + }); +}); + describe('create runner', () => { const defaultRunnerConfig: RunnerConfig = { allocationStrategy: SpotAllocationStrategy.CAPACITY_OPTIMIZED, diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index 889d3c5f8a..346013b0d1 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -1,10 +1,12 @@ import { CreateFleetCommand, CreateFleetResult, + CreateTagsCommand, DescribeInstancesCommand, DescribeInstancesResult, EC2Client, FleetLaunchTemplateOverridesRequest, + Tag, TerminateInstancesCommand, _InstanceType, } from '@aws-sdk/client-ec2'; @@ -46,6 +48,9 @@ function constructFilters(filters?: Runners.ListRunnerFilters): Ec2Filter[][] { ec2FiltersBase.push({ Name: `tag:ghr:Type`, Values: [filters.runnerType] }); ec2FiltersBase.push({ Name: `tag:ghr:Owner`, Values: [filters.runnerOwner] }); } + if (filters.orphan) { + ec2FiltersBase.push({ Name: 'tag:ghr:orphan', Values: ['true'] }); + } } for (const key of ['tag:ghr:Application']) { @@ -85,6 +90,7 @@ function getRunnerInfo(runningInstances: DescribeInstancesResult) { type: i.Tags?.find((e) => e.Key === 'ghr:Type')?.Value as string, repo: i.Tags?.find((e) => e.Key === 'ghr:Repo')?.Value as string, org: i.Tags?.find((e) => e.Key === 'ghr:Org')?.Value as string, + orphan: i.Tags?.find((e) => e.Key === 'ghr:orphan')?.Value === 'true', }); } } @@ -94,10 +100,16 @@ function getRunnerInfo(runningInstances: DescribeInstancesResult) { } export async function terminateRunner(instanceId: string): Promise { - logger.info(`Runner '${instanceId}' will be terminated.`); + logger.debug(`Runner '${instanceId}' will be terminated.`); const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION })); await ec2.send(new TerminateInstancesCommand({ InstanceIds: [instanceId] })); - logger.info(`Runner ${instanceId} has been terminated.`); + logger.debug(`Runner ${instanceId} has been terminated.`); +} + +export async function tag(instanceId: string, tags: Tag[]): Promise { + logger.debug(`Tagging '${instanceId}'`, { tags }); + const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION })); + await ec2.send(new CreateTagsCommand({ Resources: [instanceId], Tags: tags })); } function generateFleetOverrides( diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index b3c7de9072..925e4e4b10 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -5,7 +5,7 @@ import nock from 'nock'; import { RunnerInfo, RunnerList } from '../aws/runners.d'; import * as ghAuth from '../gh-auth/gh-auth'; -import { listEC2Runners, terminateRunner } from './../aws/runners'; +import { listEC2Runners, terminateRunner, tag } from './../aws/runners'; import { githubCache } from './cache'; import { newestFirstStrategy, oldestFirstStrategy, scaleDown } from './scale-down'; @@ -42,6 +42,7 @@ const mockedAppAuth = mocked(ghAuth.createGithubAppAuth, { shallow: false }); const mockedInstallationAuth = mocked(ghAuth.createGithubInstallationAuth, { shallow: false }); const mockCreateClient = mocked(ghAuth.createOctoClient, { shallow: false }); const mockListRunners = mocked(listEC2Runners); +const mockTagRunners = mocked(tag); const mockTerminateRunners = mocked(terminateRunner); export interface TestData { @@ -172,7 +173,7 @@ describe('Scale down runners', () => { describe.each(runnerTypes)('For %s runners.', (type) => { it('Should not call terminate when no runners online.', async () => { // setup - mockListRunners.mockResolvedValue([]); + mockAwsRunners([]); // act await scaleDown(); @@ -197,7 +198,8 @@ describe('Scale down runners', () => { mockGitHubRunners(runners); mockListRunners.mockResolvedValue(runners); - // act + mockAwsRunners(runners); + await scaleDown(); // assert @@ -220,7 +222,7 @@ describe('Scale down runners', () => { const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES - 1, true, false, false)]; mockGitHubRunners(runners); - mockListRunners.mockResolvedValue(runners); + mockAwsRunners(runners); // act await scaleDown(); @@ -235,7 +237,7 @@ describe('Scale down runners', () => { const runners = [createRunnerTestData('booting-1', type, MINIMUM_BOOT_TIME - 1, false, false, false)]; mockGitHubRunners(runners); - mockListRunners.mockResolvedValue(runners); + mockAwsRunners(runners); // act await scaleDown(); @@ -250,7 +252,7 @@ describe('Scale down runners', () => { const runners = [createRunnerTestData('busy-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false)]; mockGitHubRunners(runners); - mockListRunners.mockResolvedValue(runners); + mockAwsRunners(runners); // act await scaleDown(); @@ -274,7 +276,7 @@ describe('Scale down runners', () => { ]; mockGitHubRunners(runners); - mockListRunners.mockResolvedValue(runners); + mockAwsRunners(runners); mockOctokit.actions.deleteSelfHostedRunnerFromRepo.mockImplementation(() => { return { status: 500 }; }); @@ -293,10 +295,31 @@ describe('Scale down runners', () => { it(`Should terminate orphan.`, async () => { // setup - const runners = [createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true)]; + const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, false, false); + const idleRunner = createRunnerTestData('idle-1', type, MINIMUM_BOOT_TIME + 1, true, false, false); + const runners = [orphanRunner, idleRunner]; - mockGitHubRunners([]); - mockListRunners.mockResolvedValue(runners); + mockGitHubRunners([idleRunner]); + mockAwsRunners(runners); + + // act + await scaleDown(); + + // assert + checkTerminated(runners); + checkNonTerminated(runners); + + expect(mockTagRunners).toHaveBeenCalledWith(orphanRunner.instanceId, [ + { + Key: 'ghr:orphan', + Value: 'true', + }, + ]); + expect(mockTagRunners).not.toHaveBeenCalledWith(idleRunner.instanceId, expect.anything()); + + // next cycle, update test data set orphan to true and terminate should be true + orphanRunner.orphan = true; + orphanRunner.shouldBeTerminated = true; // act await scaleDown(); @@ -306,21 +329,60 @@ describe('Scale down runners', () => { checkNonTerminated(runners); }); - it(`Should orphan termination failure is not resulting in an exception..`, async () => { + it(`Should ignore errors when termination orphan fails.`, async () => { // setup - const runners = [createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true)]; + const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true); + const runners = [orphanRunner]; mockGitHubRunners([]); - mockListRunners.mockResolvedValue(runners); - mockTerminateRunners.mockRejectedValue(new Error('Termination failed')); + mockAwsRunners(runners); + mockTerminateRunners.mockImplementation(() => { + throw new Error('Failed to terminate'); + }); - // act and ensure no exception is thrown - await expect(scaleDown()).resolves.not.toThrow(); + // act + await scaleDown(); + + // assert + checkTerminated(runners); + checkNonTerminated(runners); + }); + + describe('When orphan termination fails', () => { + it(`Should not throw in case of list runner exception.`, async () => { + // setup + const runners = [createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true)]; + + mockGitHubRunners([]); + mockListRunners.mockRejectedValueOnce(new Error('Failed to list runners')); + mockAwsRunners(runners); + + // ac + await scaleDown(); + + // assert + checkNonTerminated(runners); + }); + + it(`Should not throw in case of terminate runner exception.`, async () => { + // setup + const runners = [createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true)]; + + mockGitHubRunners([]); + mockAwsRunners(runners); + mockTerminateRunners.mockRejectedValue(new Error('Failed to terminate')); + + // act and ensure no exception is thrown + await scaleDown(); + + // assert + checkNonTerminated(runners); + }); }); it(`Should not terminate instance in case de-register fails.`, async () => { // setup - const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, true, false)]; + const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false)]; mockOctokit.actions.deleteSelfHostedRunnerFromOrg.mockImplementation(() => { return { status: 500 }; @@ -330,7 +392,7 @@ describe('Scale down runners', () => { }); mockGitHubRunners(runners); - mockListRunners.mockResolvedValue(runners); + mockAwsRunners(runners); // act and should resolve await expect(scaleDown()).resolves.not.toThrow(); @@ -352,7 +414,7 @@ describe('Scale down runners', () => { }); mockGitHubRunners(runners); - mockListRunners.mockResolvedValue(runners); + mockAwsRunners(runners); // act await expect(scaleDown()).resolves.not.toThrow(); @@ -383,7 +445,7 @@ describe('Scale down runners', () => { ]; mockGitHubRunners(runners); - mockListRunners.mockResolvedValue(runners); + mockAwsRunners(runners); // act await scaleDown(); @@ -502,6 +564,12 @@ describe('Scale down runners', () => { }); }); +function mockAwsRunners(runners: RunnerTestItem[]) { + mockListRunners.mockImplementation(async (filter) => { + return runners.filter((r) => !filter?.orphan || filter?.orphan === r.orphan); + }); +} + function checkNonTerminated(runners: RunnerTestItem[]) { const notTerminated = runners.filter((r) => !r.shouldBeTerminated); for (const toTerminate of notTerminated) { diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 96b4dd4c76..eeef81d5d5 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -3,7 +3,7 @@ import { createChildLogger } from '@terraform-aws-github-runner/aws-powertools-u import moment from 'moment'; import { createGithubAppAuth, createGithubInstallationAuth, createOctoClient } from '../gh-auth/gh-auth'; -import { bootTimeExceeded, listEC2Runners, terminateRunner } from './../aws/runners'; +import { bootTimeExceeded, listEC2Runners, tag, terminateRunner } from './../aws/runners'; import { RunnerInfo, RunnerList } from './../aws/runners.d'; import { GhRunners, githubCache } from './cache'; import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config'; @@ -130,7 +130,7 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi if (statuses.every((status) => status == 204)) { await terminateRunner(ec2runner.instanceId); - logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`); + logger.debug(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`); } else { logger.error(`Failed to de-register GitHub runner: ${statuses}`); } @@ -175,30 +175,43 @@ async function evaluateAndRemoveRunners( idleCounter--; logger.info(`Runner '${ec2Runner.instanceId}' will be kept idle.`); } else { - logger.info(`Will try to terminate runners that are not busy`); + logger.info(`Terminating all non busy runners.`); await removeRunner( ec2Runner, ghRunnersFiltered.map((runner: { id: number }) => runner.id), ); } } + } else if (bootTimeExceeded(ec2Runner)) { + await markOrphan(ec2Runner.instanceId); } else { - if (bootTimeExceeded(ec2Runner)) { - logger.info(`Runner '${ec2Runner.instanceId}' is orphaned and will be removed.`); - terminateOrphan(ec2Runner.instanceId); - } else { - logger.debug(`Runner ${ec2Runner.instanceId} has not yet booted.`); - } + logger.debug(`Runner ${ec2Runner.instanceId} has not yet booted.`); } } } } -async function terminateOrphan(instanceId: string): Promise { +async function markOrphan(instanceId: string): Promise { try { - await terminateRunner(instanceId); + await tag(instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); + logger.info(`Runner '${instanceId}' marked as orphan.`); } catch (e) { - logger.debug(`Orphan runner '${instanceId}' cannot be removed.`); + logger.error(`Failed to mark runner '${instanceId}' as orphan.`, { error: e }); + } +} + +async function terminateOrphan(environment: string): Promise { + try { + const orphanRunners = await listEC2Runners({ environment, orphan: true }); + + for (const runner of orphanRunners) { + logger.info(`Terminating orphan runner '${runner.instanceId}'`); + await terminateRunner(runner.instanceId).catch((e) => { + logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); + }); + } + } catch (e) { + logger.warn(`Failure during orphan runner termination.`, { error: e }); } } @@ -221,14 +234,18 @@ async function listRunners(environment: string) { } function filterRunners(ec2runners: RunnerList[]): RunnerInfo[] { - return ec2runners.filter((ec2Runner) => ec2Runner.type) as RunnerInfo[]; + return ec2runners.filter((ec2Runner) => ec2Runner.type && !ec2Runner.orphan) as RunnerInfo[]; } export async function scaleDown(): Promise { githubCache.reset(); - const scaleDownConfigs = JSON.parse(process.env.SCALE_DOWN_CONFIG) as [ScalingDownConfig]; const environment = process.env.ENVIRONMENT; + const scaleDownConfigs = JSON.parse(process.env.SCALE_DOWN_CONFIG) as [ScalingDownConfig]; + + // first runners marked to be orphan. + await terminateOrphan(environment); + // next scale down idle runners with respect to config and mark potential orphans const ec2Runners = await listRunners(environment); const activeEc2RunnersCount = ec2Runners.length; logger.info(`Found: '${activeEc2RunnersCount}' active GitHub EC2 runner instances before clean-up.`); diff --git a/modules/runners/policies/lambda-scale-down.json b/modules/runners/policies/lambda-scale-down.json index f745ba52bf..0f73aeacf1 100644 --- a/modules/runners/policies/lambda-scale-down.json +++ b/modules/runners/policies/lambda-scale-down.json @@ -14,7 +14,8 @@ { "Effect": "Allow", "Action": [ - "ec2:TerminateInstances" + "ec2:TerminateInstances", + "ec2:CreateTags" ], "Resource": [ "*" @@ -28,14 +29,15 @@ { "Effect": "Allow", "Action": [ - "ec2:TerminateInstances" + "ec2:TerminateInstances", + "ec2:CreateTags" ], "Resource": [ "*" ], "Condition": { "StringEquals": { - "ec2:ResourceTag/Application": "github-action-runner" + "ec2:ResourceTag/gh:environment": "${environment}" } } }, diff --git a/modules/runners/scale-down.tf b/modules/runners/scale-down.tf index c74f88e387..5fc9c02ee0 100644 --- a/modules/runners/scale-down.tf +++ b/modules/runners/scale-down.tf @@ -93,6 +93,7 @@ resource "aws_iam_role_policy" "scale_down" { name = "${var.prefix}-lambda-scale-down-policy" role = aws_iam_role.scale_down.name policy = templatefile("${path.module}/policies/lambda-scale-down.json", { + environment = var.prefix github_app_id_arn = var.github_app_parameters.id.arn github_app_key_base64_arn = var.github_app_parameters.key_base64.arn kms_key_arn = local.kms_key_arn