Skip to content

Commit

Permalink
Fix bug attaching multiple CapacityProviders in multiple EC2 services. (
Browse files Browse the repository at this point in the history
#359)

* Fix bug attaching multiple CapacityProviders in multiple EC2 services.

When launching multiple EC2 services the current tenant-onboarding-app
CloudFormation stack tries to create a ClusterCapacityAssociations
CloudFormation resource for each service. However because attaching
CapacityProviders to ECS Clusters is non-atomic only the first
ClusterCapacityAssociations will actually succeed.

This commit adds a CustomResource which on CapacityProvider create and
delete will attach or detach that CapacityProvider to the existing
ECSCluster. Because this attachment is a non-atomic operation (i.e. it
involves first getting the existing capacityProviders for the ECS
cluster before then overwriting with a new set of capacityProviders)
this commit also includes using the existing Onboarding DynamoDB table
as a locking mechanism to prevent against illogical overwrites between
multiple stacks operating on the same ECS Cluster.

Fixes #356.

* Fix null pointer in cloudformation-utils

Co-authored-by: PoeppingT <[email protected]>
Co-authored-by: brtrvn <[email protected]>
  • Loading branch information
3 people authored Oct 7, 2022
1 parent f0276f3 commit a1609e3
Show file tree
Hide file tree
Showing 21 changed files with 1,158 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.function.Function;

public class CloudFormationResponse {
Expand Down Expand Up @@ -123,7 +124,7 @@ protected static String buildResponseBody(Map<String, Object> event, Context con
responseBody.put("Data", responseData != null ? responseData : Collections.EMPTY_MAP);
} else {
// CloudFormation will blow up if the failure response string is longer than 256 chars
String error = (String) responseData.getOrDefault("Reason", "");
String error = Objects.toString(responseData.getOrDefault("Reason", ""), "");
if (error.length() > 256) {
error = error.substring(0, 256);
}
Expand Down
134 changes: 134 additions & 0 deletions resources/custom-resources/attach-ecs-capacity-provider/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License").
You may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.amazon.aws.partners.saasfactory.saasboost</groupId>
<artifactId>saasboost-custom-resources</artifactId>
<version>1.0.0</version>
</parent>
<artifactId>AttachEcsCapacityProvider</artifactId>
<version>1.0.0</version>
<packaging>jar</packaging>
<licenses>
<license>
<name>Apache-2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0</url>
</license>
</licenses>
<properties>
<checkstyle.maxAllowedViolations>0</checkstyle.maxAllowedViolations>
</properties>

<build>
<finalName>${project.artifactId}</finalName>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
</plugin>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>4.0.0</version>
<executions>
<execution>
<id>get-the-git-infos</id>
<goals>
<goal>revision</goal>
</goals>
<phase>initialize</phase>
</execution>
</executions>
<configuration>
<generateGitPropertiesFile>true</generateGitPropertiesFile>
<generateGitPropertiesFilename>${project.build.outputDirectory}/git.properties</generateGitPropertiesFilename>
<includeOnlyProperties>
<includeOnlyProperty>^git.commit.id.describe</includeOnlyProperty>
<includeOnlyProperty>^git.commit.id.describe-short</includeOnlyProperty>
<includeOnlyProperty>^git.commit.time</includeOnlyProperty>
<includeOnlyProperty>^git.closest.tag.name</includeOnlyProperty>
</includeOnlyProperties>
<commitIdGenerationMode>full</commitIdGenerationMode>
<dotGitDirectory>../../.git</dotGitDirectory>
<failOnNoGitDirectory>false</failOnNoGitDirectory>
</configuration>
</plugin>
</plugins>
</build>

<dependencies>
<dependency>
<groupId>com.amazon.aws.partners.saasfactory.saasboost</groupId>
<artifactId>Utils</artifactId>
<version>1.0.0</version>
<!-- Don't bundle our layer so we get the shared one at runtime -->
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.amazon.aws.partners.saasfactory.saasboost</groupId>
<artifactId>CloudFormationUtils</artifactId>
<version>1.0.0</version>
<!-- Don't bundle our layer so we get the shared one at runtime -->
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
</dependency>
<dependency>
<groupId>software.amazon.awssdk</groupId>
<artifactId>ecs</artifactId>
<version>${aws.java.sdk.version}</version>
<exclusions>
<exclusion>
<groupId>software.amazon.awssdk</groupId>
<artifactId>netty-nio-client</artifactId>
</exclusion>
<exclusion>
<groupId>software.amazon.awssdk</groupId>
<artifactId>apache-client</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>software.amazon.awssdk</groupId>
<artifactId>dynamodb</artifactId>
<version>${aws.java.sdk.version}</version>
<exclusions>
<exclusion>
<groupId>software.amazon.awssdk</groupId>
<artifactId>netty-nio-client</artifactId>
</exclusion>
<exclusion>
<groupId>software.amazon.awssdk</groupId>
<artifactId>apache-client</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
/*
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.amazon.aws.partners.saasfactory.saasboost;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import software.amazon.awssdk.services.ecs.EcsClient;
import software.amazon.awssdk.services.ecs.model.Cluster;
import software.amazon.awssdk.services.ecs.model.ClusterNotFoundException;
import software.amazon.awssdk.services.ecs.model.DescribeClustersRequest;
import software.amazon.awssdk.services.ecs.model.PutClusterCapacityProvidersRequest;
import software.amazon.awssdk.services.ecs.model.UpdateInProgressException;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.function.Function;
import java.util.stream.Collectors;

public final class AttachCapacityProviderRequestHandler implements Callable<HandleResult> {

private static final Logger LOGGER = LoggerFactory.getLogger(AttachCapacityProviderRequestHandler.class);

private final RequestContext requestContext;
private final CapacityProviderLock lock;
private final EcsClient ecs;

public AttachCapacityProviderRequestHandler(
RequestContext requestContext,
CapacityProviderLock lock,
EcsClient ecs) {
this.ecs = ecs;
this.requestContext = requestContext;
this.lock = lock;
}

@Override
public HandleResult call() {
HandleResult result = new HandleResult();
LOGGER.info(requestContext.requestType.toUpperCase());
if ("Create".equalsIgnoreCase(requestContext.requestType)
|| "Update".equalsIgnoreCase(requestContext.requestType)) {
LOGGER.info("Attaching capacity provider {} to ecs cluster {} for tenant {}",
requestContext.capacityProvider, requestContext.ecsCluster, requestContext.tenantId);
result = atomicallyUpdateCapacityProviders((capacityProviders) -> {
if (!capacityProviders.contains(requestContext.capacityProvider)) {
List<String> modifiedCapacityProviders = new ArrayList<String>(capacityProviders);
modifiedCapacityProviders.add(requestContext.capacityProvider);
return modifiedCapacityProviders;
}
return capacityProviders;
});
} else if ("Delete".equalsIgnoreCase(requestContext.requestType)) {
// unclear whether we need this.. commenting it out for testing.
LOGGER.info("Detaching capacity provider {} from ecs cluster {} for tenant {}",
requestContext.capacityProvider, requestContext.ecsCluster, requestContext.tenantId);
result = atomicallyUpdateCapacityProviders((capacityProviders) -> {
return capacityProviders.stream()
.filter((capacityProvider) -> !capacityProvider.equals(requestContext.capacityProvider))
.collect(Collectors.toList());
});
result.setSucceeded();
} else {
LOGGER.error("FAILED unknown requestType {}", requestContext.requestType);
result.putFailureReason("Unknown RequestType " + requestContext.requestType);
result.setFailed();
}

return result;
}

private HandleResult atomicallyUpdateCapacityProviders(
Function<List<String>, List<String>> capacityProvidersMutationFunction) {
HandleResult result = new HandleResult();
// lock ddb
lock.lock(requestContext);
try {
// read capacity providers into list
List<String> existingCapacityProviders = getExistingCapacityProviders();

List<String> mutatedCapacityProviders = capacityProvidersMutationFunction.apply(existingCapacityProviders);
LOGGER.debug("existingCapacityProviders {} mutated to {}",
existingCapacityProviders, mutatedCapacityProviders);

boolean successful = false;
// if the mutate did nothing, no point in slowing us down to make an ECS call
if (existingCapacityProviders.equals(mutatedCapacityProviders)) {
successful = true;
result.setSucceeded();
}
while (!successful) {
try {
// set capacity providers. response doesn't really give us anything but a
// description of the new cluster. exceptions are thrown on failure
ecs.putClusterCapacityProviders(PutClusterCapacityProvidersRequest.builder()
.cluster(requestContext.ecsCluster)
.capacityProviders(mutatedCapacityProviders)
.build());
successful = true;
result.setSucceeded();
} catch (UpdateInProgressException uipe) {
// There's a Amazon ECS container agent update in progress on this container instance.
// ECS errors indicate this can be retried. Wait 10 seconds and try again.
LOGGER.error("Received error calling putClusterCapacityProviders", uipe);
LOGGER.error(Utils.getFullStackTrace(uipe));
LOGGER.error("Waiting 10 seconds before retrying..");
Thread.sleep(10 * 1000); // 10 seconds
}
}
} catch (ClusterNotFoundException cnfe) {
LOGGER.error("Could not find ecs cluster: {}", requestContext.ecsCluster);
LOGGER.error(Utils.getFullStackTrace(cnfe));
result.putFailureReason(cnfe.getMessage());
result.setFailed();
} catch (InterruptedException ie) {
LOGGER.error("Error while waiting between putClusterCapacityProvider calls", ie.getMessage());
LOGGER.error(Utils.getFullStackTrace(ie));
result.putFailureReason(ie.getMessage());
result.setFailed();
} finally {
// unlock ddb
lock.unlock(requestContext);
}
return result;
}

private List<String> getExistingCapacityProviders() {
List<Cluster> returnedClusters = ecs.describeClusters(
DescribeClustersRequest.builder().clusters(requestContext.ecsCluster).build()).clusters();
if (returnedClusters.size() != 1) {
// we only passed one cluster ARN but we received 0 or 2
LOGGER.error("Expected 1 cluster with name {} but found {}",
requestContext.ecsCluster, returnedClusters.size());
}
List<String> existingCapacityProviders = returnedClusters.get(0).capacityProviders();
return existingCapacityProviders == null ? List.of() : existingCapacityProviders;
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/*
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.amazon.aws.partners.saasfactory.saasboost;

import com.amazonaws.services.lambda.runtime.Context;
import com.amazonaws.services.lambda.runtime.RequestHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import software.amazon.awssdk.services.dynamodb.DynamoDbClient;
import software.amazon.awssdk.services.ecs.EcsClient;

import java.util.*;
import java.util.concurrent.*;

public class AttachEcsCapacityProvider implements RequestHandler<Map<String, Object>, Object> {

private static final Logger LOGGER = LoggerFactory.getLogger(AttachEcsCapacityProvider.class);

private final EcsClient ecs;
private final CapacityProviderLock lock;

public AttachEcsCapacityProvider() {
LOGGER.info("Version Info: {}", Utils.version(this.getClass()));
ecs = Utils.sdkClient(EcsClient.builder(), EcsClient.SERVICE_NAME);
lock = new CapacityProviderLock(Utils.sdkClient(DynamoDbClient.builder(), DynamoDbClient.SERVICE_NAME));
}

@Override
public Object handleRequest(Map<String, Object> event, Context context) {
Utils.logRequestEvent(event);

Map<String, Object> resourceProperties = (Map<String, Object>) event.get("ResourceProperties");
RequestContext requestContext = RequestContext.builder()
.requestType((String) event.get("RequestType"))
.capacityProvider((String) resourceProperties.get("CapacityProvider"))
.ecsCluster((String) resourceProperties.get("ECSCluster"))
.onboardingDdbTable((String) resourceProperties.get("OnboardingDdbTable"))
.tenantId((String) resourceProperties.get("TenantId"))
.build();
HandleResult handleRequestResult = new HandleResult();
ExecutorService service = Executors.newSingleThreadExecutor();
try {
Callable<HandleResult> c = new AttachCapacityProviderRequestHandler(requestContext, lock, ecs);
Future<?> f = service.submit(c);
handleRequestResult = (HandleResult) f.get(context.getRemainingTimeInMillis() - 1000,
TimeUnit.MILLISECONDS);
} catch (final TimeoutException | InterruptedException | ExecutionException e) {
// Timed out
LOGGER.error("FAILED unexpected error or request timed out " + e.getMessage());
String stackTrace = Utils.getFullStackTrace(e);
LOGGER.error(stackTrace);
handleRequestResult.setFailed();
handleRequestResult.putResponseData("Reason", stackTrace);
} finally {
service.shutdown();
}
CloudFormationResponse.send(event, context,
handleRequestResult.succeeded() ? "SUCCESS" : "FAILED",
handleRequestResult.getResponseData());
return null;
}
}
Loading

0 comments on commit a1609e3

Please sign in to comment.