Skip to content

Commit

Permalink
runner robustness in edge cases
Browse files Browse the repository at this point in the history
  • Loading branch information
ludamad committed Apr 19, 2024
1 parent 3b2f75b commit a576467
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 61 deletions.
76 changes: 49 additions & 27 deletions dist/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -997,27 +997,30 @@ const ec2_1 = __nccwpck_require__(2886);
const core = __importStar(__nccwpck_require__(42186));
const github_1 = __nccwpck_require__(51294);
const utils_1 = __nccwpck_require__(91316);
function shouldEarlyExit(config, ec2Client, ghClient) {
function pollSpotStatus(config, ec2Client, ghClient) {
return __awaiter(this, void 0, void 0, function* () {
for (let iter = 0; iter < 60 * 2; iter++) {
// 12 iters x 10000 ms = 2 minutes
for (let iter = 0; iter < 12; iter++) {
const instances = yield ec2Client.getInstancesForTags();
const hasInstance = instances.filter((i) => { var _a; return ((_a = i.State) === null || _a === void 0 ? void 0 : _a.Name) === "running"; }).length > 0;
if (!hasInstance) {
// we need to start an instance
return false;
return 'none';
}
try {
core.info("Found ec2 instance, looking for runners.");
if (yield ghClient.hasRunner([config.githubJobId])) {
// we have runners
return true;
return 'usable';
}
}
catch (err) { }
yield new Promise((r) => setTimeout(r, 1000));
// wait 10 seconds
yield new Promise((r) => setTimeout(r, 10000));
}
// we have a bad state for a while, error
throw new Error("Looped for 2 minutes and could only find spot with no runners!");
core.warning("Looped for 2 minutes and could only find spot with no runners!");
return 'unusable';
});
}
function start() {
Expand All @@ -1034,9 +1037,21 @@ function start() {
else if (config.subaction !== "start") {
throw new Error("Unexpected subaction: " + config.subaction);
}
// assume subaction is 'start'
// subaction is 'start' or 'restart'
const ec2Client = new ec2_1.Ec2Instance(config);
const ghClient = new github_1.GithubClient(config);
const spotStatus = yield pollSpotStatus(config, ec2Client, ghClient);
if (spotStatus === "usable") {
core.info(`Runner already running. Continuing as we can target it with jobs.`);
return;
}
if (spotStatus === "unusable") {
core.warning("Taking down spot as it has no runners! If we were mistaken, this could impact existing jobs.");
if (config.subaction === "restart") {
throw new Error("Taking down spot we just started. This seems wrong, erroring out.");
}
yield stop();
}
var ec2SpotStrategies;
switch (config.ec2SpotInstanceStrategy) {
case "maxperformance": {
Expand All @@ -1054,32 +1069,39 @@ function start() {
core.info(`Ec2 spot instance strategy is set to ${config.ec2SpotInstanceStrategy}`);
}
}
const canEarlyExit = yield shouldEarlyExit(config, ec2Client, ghClient);
if (canEarlyExit) {
core.info(`Runner already running. Continuing as we can target it with jobs.`);
return;
}
var instanceId = "";
for (const ec2Strategy of ec2SpotStrategies) {
core.info(`Starting instance with ${ec2Strategy} strategy`);
// Get instance config
const instanceConfig = yield ec2Client.getInstanceConfiguration(ec2Strategy);
try {
// Start instance
const response = yield ec2Client.runInstances(instanceConfig);
if ((response === null || response === void 0 ? void 0 : response.length) && response.length > 0 && response[0].InstanceId) {
instanceId = response[0].InstanceId;
// 6 * 10000ms = 1 minute per strategy
// TODO make longer lived spot request?
for (let i = 0; i < 6; i++) {
// Get instance config
const instanceConfig = yield ec2Client.getInstanceConfiguration(ec2Strategy);
try {
// Start instance
const response = yield ec2Client.runInstances(instanceConfig);
if ((response === null || response === void 0 ? void 0 : response.length) && response.length > 0 && response[0].InstanceId) {
instanceId = response[0].InstanceId;
}
break;
}
catch (error) {
if ((error === null || error === void 0 ? void 0 : error.code) &&
error.code === "InsufficientInstanceCapacity" &&
ec2SpotStrategies.length > 0 &&
ec2Strategy.toLocaleUpperCase() != "none") {
core.info("Failed to create instance due to 'InsufficientInstanceCapacity', waiting 10 seconds and trying again.");
}
else {
throw error;
}
}
// wait 10 seconds
yield new Promise((r) => setTimeout(r, 10000));
}
catch (error) {
if ((error === null || error === void 0 ? void 0 : error.code) &&
error.code === "InsufficientInstanceCapacity" &&
ec2SpotStrategies.length > 0 &&
ec2Strategy.toLocaleUpperCase() != "none")
core.info("Failed to create instance due to 'InsufficientInstanceCapacity', trying fallback strategy next");
else
throw error;
if (instanceId) {
core.info("Successfully requested instance with ID " + instanceId);
break;
}
}
if (instanceId)
Expand Down
94 changes: 60 additions & 34 deletions src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,29 @@ import * as core from "@actions/core";
import { GithubClient } from "./github/github";
import { assertIsError } from "./utils/utils";

async function shouldEarlyExit(config: ActionConfig, ec2Client: Ec2Instance, ghClient: GithubClient): Promise<boolean> {
for (let iter = 0; iter < 60 * 2; iter++) {
async function pollSpotStatus(config: ActionConfig, ec2Client: Ec2Instance, ghClient: GithubClient): Promise<'usable' | 'unusable' | 'none'> {
// 12 iters x 10000 ms = 2 minutes
for (let iter = 0; iter < 12; iter++) {
const instances = await ec2Client.getInstancesForTags();
const hasInstance =
instances.filter((i) => i.State?.Name === "running").length > 0;
if (!hasInstance) {
// we need to start an instance
return false;
return 'none';
}
try {
core.info("Found ec2 instance, looking for runners.")
if (await ghClient.hasRunner([config.githubJobId])) {
// we have runners
return true;
return 'usable';
}
} catch (err) { }
await new Promise((r) => setTimeout(r, 1000));
// wait 10 seconds
await new Promise((r) => setTimeout(r, 10000));
}
// we have a bad state for a while, error
throw new Error("Looped for 2 minutes and could only find spot with no runners!");
core.warning("Looped for 2 minutes and could only find spot with no runners!");
return 'unusable';
}

async function start() {
Expand All @@ -37,9 +40,27 @@ async function start() {
} else if (config.subaction !== "start") {
throw new Error("Unexpected subaction: " + config.subaction);
}
// assume subaction is 'start'
// subaction is 'start' or 'restart'
const ec2Client = new Ec2Instance(config);
const ghClient = new GithubClient(config);
const spotStatus = await pollSpotStatus(config, ec2Client, ghClient);
if (spotStatus === "usable") {
core.info(
`Runner already running. Continuing as we can target it with jobs.`
);
return;
}
if (spotStatus === "unusable") {
core.warning(
"Taking down spot as it has no runners! If we were mistaken, this could impact existing jobs."
);
if (config.subaction === "restart") {
throw new Error(
"Taking down spot we just started. This seems wrong, erroring out."
);
}
await stop();
}

var ec2SpotStrategies: string[];
switch (config.ec2SpotInstanceStrategy) {
Expand All @@ -65,38 +86,43 @@ async function start() {
}
}

const canEarlyExit = await shouldEarlyExit(config, ec2Client, ghClient);
if (canEarlyExit) {
core.info(
`Runner already running. Continuing as we can target it with jobs.`
);
return;
}
var instanceId = "";
for (const ec2Strategy of ec2SpotStrategies) {
core.info(`Starting instance with ${ec2Strategy} strategy`);
// Get instance config
const instanceConfig = await ec2Client.getInstanceConfiguration(
ec2Strategy
);
try {
// Start instance
const response = await ec2Client.runInstances(instanceConfig);
if (response?.length && response.length > 0 && response[0].InstanceId) {
instanceId = response[0].InstanceId;
// 6 * 10000ms = 1 minute per strategy
// TODO make longer lived spot request?
for (let i = 0; i < 6 ; i++) {
// Get instance config
const instanceConfig = await ec2Client.getInstanceConfiguration(
ec2Strategy
);
try {
// Start instance
const response = await ec2Client.runInstances(instanceConfig);
if (response?.length && response.length > 0 && response[0].InstanceId) {
instanceId = response[0].InstanceId;
}
break;
} catch (error) {
if (
error?.code &&
error.code === "InsufficientInstanceCapacity" &&
ec2SpotStrategies.length > 0 &&
ec2Strategy.toLocaleUpperCase() != "none"
) {
core.info(
"Failed to create instance due to 'InsufficientInstanceCapacity', waiting 10 seconds and trying again."
);
} else { throw error; }
}
} catch (error) {
if (
error?.code &&
error.code === "InsufficientInstanceCapacity" &&
ec2SpotStrategies.length > 0 &&
ec2Strategy.toLocaleUpperCase() != "none"
)
core.info(
"Failed to create instance due to 'InsufficientInstanceCapacity', trying fallback strategy next"
);
else throw error;
// wait 10 seconds
await new Promise((r) => setTimeout(r, 10000));
}
if (instanceId) {
core.info(
"Successfully requested instance with ID " + instanceId
);
break;
}
}
if (instanceId) await ec2Client.waitForInstanceRunningStatus(instanceId);
Expand Down

0 comments on commit a576467

Please sign in to comment.