-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
change rds cpu critical to warning and increase evaluation period #1783
Conversation
staging: rds✅ Terraform Init: Plan: 3 to add, 3 to change, 3 to destroy Show summary
Show plan
Resource actions are indicated with the following symbols:
+ create
~ update in-place
- destroy
Terraform will perform the following actions:
# aws_cloudwatch_metric_alarm.high-db-cpu-critical[0] will be destroyed
# (because aws_cloudwatch_metric_alarm.high-db-cpu-critical is not in configuration)
- resource "aws_cloudwatch_metric_alarm" "high-db-cpu-critical" {
- actions_enabled = true -> null
- alarm_actions = [
- "arn:aws:sns:ca-central-1:239043911459:alert-critical",
] -> null
- alarm_description = "CPU usage of the RDS instance > 95%" -> null
- alarm_name = "high-db-cpu-critical-instance-0" -> null
- arn = "arn:aws:cloudwatch:ca-central-1:239043911459:alarm:high-db-cpu-critical-instance-0" -> null
- comparison_operator = "GreaterThanThreshold" -> null
- datapoints_to_alarm = 0 -> null
- dimensions = {
- "DBInstanceIdentifier" = "notification-canada-ca-staging-instance-0"
} -> null
- evaluation_periods = 1 -> null
- id = "high-db-cpu-critical-instance-0" -> null
- insufficient_data_actions = [] -> null
- metric_name = "CPUUtilization" -> null
- namespace = "AWS/RDS" -> null
- ok_actions = [] -> null
- period = 60 -> null
- statistic = "Average" -> null
- tags = {} -> null
- tags_all = {} -> null
- threshold = 95 -> null
- treat_missing_data = "notBreaching" -> null
# (4 unchanged attributes hidden)
}
# aws_cloudwatch_metric_alarm.high-db-cpu-critical[1] will be destroyed
# (because aws_cloudwatch_metric_alarm.high-db-cpu-critical is not in configuration)
- resource "aws_cloudwatch_metric_alarm" "high-db-cpu-critical" {
- actions_enabled = true -> null
- alarm_actions = [
- "arn:aws:sns:ca-central-1:239043911459:alert-critical",
] -> null
- alarm_description = "CPU usage of the RDS instance > 95%" -> null
- alarm_name = "high-db-cpu-critical-instance-1" -> null
- arn = "arn:aws:cloudwatch:ca-central-1:239043911459:alarm:high-db-cpu-critical-instance-1" -> null
- comparison_operator = "GreaterThanThreshold" -> null
- datapoints_to_alarm = 0 -> null
- dimensions = {
- "DBInstanceIdentifier" = "notification-canada-ca-staging-instance-1"
} -> null
- evaluation_periods = 1 -> null
- id = "high-db-cpu-critical-instance-1" -> null
- insufficient_data_actions = [] -> null
- metric_name = "CPUUtilization" -> null
- namespace = "AWS/RDS" -> null
- ok_actions = [] -> null
- period = 60 -> null
- statistic = "Average" -> null
- tags = {} -> null
- tags_all = {} -> null
- threshold = 95 -> null
- treat_missing_data = "notBreaching" -> null
# (4 unchanged attributes hidden)
}
# aws_cloudwatch_metric_alarm.high-db-cpu-critical[2] will be destroyed
# (because aws_cloudwatch_metric_alarm.high-db-cpu-critical is not in configuration)
- resource "aws_cloudwatch_metric_alarm" "high-db-cpu-critical" {
- actions_enabled = true -> null
- alarm_actions = [
- "arn:aws:sns:ca-central-1:239043911459:alert-critical",
] -> null
- alarm_description = "CPU usage of the RDS instance > 95%" -> null
- alarm_name = "high-db-cpu-critical-instance-2" -> null
- arn = "arn:aws:cloudwatch:ca-central-1:239043911459:alarm:high-db-cpu-critical-instance-2" -> null
- comparison_operator = "GreaterThanThreshold" -> null
- datapoints_to_alarm = 0 -> null
- dimensions = {
- "DBInstanceIdentifier" = "notification-canada-ca-staging-instance-2"
} -> null
- evaluation_periods = 1 -> null
- id = "high-db-cpu-critical-instance-2" -> null
- insufficient_data_actions = [] -> null
- metric_name = "CPUUtilization" -> null
- namespace = "AWS/RDS" -> null
- ok_actions = [] -> null
- period = 60 -> null
- statistic = "Average" -> null
- tags = {} -> null
- tags_all = {} -> null
- threshold = 95 -> null
- treat_missing_data = "notBreaching" -> null
# (4 unchanged attributes hidden)
}
# aws_cloudwatch_metric_alarm.high-db-cpu-warning[0] will be updated in-place
~ resource "aws_cloudwatch_metric_alarm" "high-db-cpu-warning" {
~ alarm_description = "CPU usage of the RDS instance > 80%" -> "CPU usage of the RDS instance > 80% for 5 minutes"
~ datapoints_to_alarm = 0 -> 5
~ evaluation_periods = 1 -> 5
id = "high-db-cpu-warning-instance-0"
tags = {}
# (19 unchanged attributes hidden)
}
# aws_cloudwatch_metric_alarm.high-db-cpu-warning[1] will be updated in-place
~ resource "aws_cloudwatch_metric_alarm" "high-db-cpu-warning" {
~ alarm_description = "CPU usage of the RDS instance > 80%" -> "CPU usage of the RDS instance > 80% for 5 minutes"
~ datapoints_to_alarm = 0 -> 5
~ evaluation_periods = 1 -> 5
id = "high-db-cpu-warning-instance-1"
tags = {}
# (19 unchanged attributes hidden)
}
# aws_cloudwatch_metric_alarm.high-db-cpu-warning[2] will be updated in-place
~ resource "aws_cloudwatch_metric_alarm" "high-db-cpu-warning" {
~ alarm_description = "CPU usage of the RDS instance > 80%" -> "CPU usage of the RDS instance > 80% for 5 minutes"
~ datapoints_to_alarm = 0 -> 5
~ evaluation_periods = 1 -> 5
id = "high-db-cpu-warning-instance-2"
tags = {}
# (19 unchanged attributes hidden)
}
# aws_cloudwatch_metric_alarm.very-high-db-cpu-warning[0] will be created
+ resource "aws_cloudwatch_metric_alarm" "very-high-db-cpu-warning" {
+ actions_enabled = true
+ alarm_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-warning",
]
+ alarm_description = "CPU usage of the RDS instance > 95% for 5 minutes"
+ alarm_name = "very-high-db-cpu-warning-instance-0"
+ arn = (known after apply)
+ comparison_operator = "GreaterThanThreshold"
+ datapoints_to_alarm = 5
+ dimensions = {
+ "DBInstanceIdentifier" = "notification-canada-ca-staging-instance-0"
}
+ evaluate_low_sample_count_percentiles = (known after apply)
+ evaluation_periods = 5
+ id = (known after apply)
+ metric_name = "CPUUtilization"
+ namespace = "AWS/RDS"
+ period = 60
+ statistic = "Average"
+ tags_all = (known after apply)
+ threshold = 95
+ treat_missing_data = "notBreaching"
}
# aws_cloudwatch_metric_alarm.very-high-db-cpu-warning[1] will be created
+ resource "aws_cloudwatch_metric_alarm" "very-high-db-cpu-warning" {
+ actions_enabled = true
+ alarm_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-warning",
]
+ alarm_description = "CPU usage of the RDS instance > 95% for 5 minutes"
+ alarm_name = "very-high-db-cpu-warning-instance-1"
+ arn = (known after apply)
+ comparison_operator = "GreaterThanThreshold"
+ datapoints_to_alarm = 5
+ dimensions = {
+ "DBInstanceIdentifier" = "notification-canada-ca-staging-instance-1"
}
+ evaluate_low_sample_count_percentiles = (known after apply)
+ evaluation_periods = 5
+ id = (known after apply)
+ metric_name = "CPUUtilization"
+ namespace = "AWS/RDS"
+ period = 60
+ statistic = "Average"
+ tags_all = (known after apply)
+ threshold = 95
+ treat_missing_data = "notBreaching"
}
# aws_cloudwatch_metric_alarm.very-high-db-cpu-warning[2] will be created
+ resource "aws_cloudwatch_metric_alarm" "very-high-db-cpu-warning" {
+ actions_enabled = true
+ alarm_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-warning",
]
+ alarm_description = "CPU usage of the RDS instance > 95% for 5 minutes"
+ alarm_name = "very-high-db-cpu-warning-instance-2"
+ arn = (known after apply)
+ comparison_operator = "GreaterThanThreshold"
+ datapoints_to_alarm = 5
+ dimensions = {
+ "DBInstanceIdentifier" = "notification-canada-ca-staging-instance-2"
}
+ evaluate_low_sample_count_percentiles = (known after apply)
+ evaluation_periods = 5
+ id = (known after apply)
+ metric_name = "CPUUtilization"
+ namespace = "AWS/RDS"
+ period = 60
+ statistic = "Average"
+ tags_all = (known after apply)
+ threshold = 95
+ treat_missing_data = "notBreaching"
}
Plan: 3 to add, 3 to change, 3 to destroy.
─────────────────────────────────────────────────────────────────────────────
Saved the plan to: plan.tfplan
To perform exactly these actions, run the following command to apply:
terraform apply "plan.tfplan"
Show Conftest results
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_log_group.logs_exports"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.db-free-local-storage-critical[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.db-free-local-storage-critical[1]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.db-free-local-storage-critical[2]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.db-free-local-storage-warning[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.db-free-local-storage-warning[1]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.db-free-local-storage-warning[2]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.high-db-cpu-warning[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.high-db-cpu-warning[1]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.high-db-cpu-warning[2]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.high-dbload-critical[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.high-dbload-critical[1]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.high-dbload-critical[2]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.high-dbload-warning[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.high-dbload-warning[1]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.high-dbload-warning[2]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.low-db-memory-critical[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.low-db-memory-critical[1]"]
WARN - plan.json - main - Missing Common Tags:... |
Updating alarms ⏰? Great! Please update the Google Sheet and add a 👍 to this message after 🙏 |
Updating alarms ⏰? Great! Please update the Google Sheet and add a 👍 to this message after 🙏 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sweet :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you -- let's see how that goes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Summary | Résumé
This alarm went off yesterday evening, but the system stayed stable. We can downgrade it from critical to a warning so that, if other problems are occurring, we will still be aware of the RDS CPU load.
We also change this and the other RDS CPU alarm to require the load to stay above the threshold for 5 consecutive minutes (5 out of 5 one-minute datapoints).
Test instructions | Instructions pour tester la modification
View in staging / prod
Release Instructions | Instructions pour le déploiement
None.
Reviewer checklist | Liste de vérification du réviseur