Commit 479be5f

First

calh committed Jul 11, 2022
0 parents commit 479be5f
Showing 9 changed files with 452 additions and 0 deletions.
26 changes: 26 additions & 0 deletions Dockerfile
@@ -0,0 +1,26 @@
FROM lambci/lambda:build-ruby2.7

RUN gem update bundler

WORKDIR /usr/src/app
COPY Gemfile /usr/src/app
RUN bundle config --local silence_root_warning true
RUN bundle config set --local clean 'true'
RUN bundle config set --local path 'vendor/bundle'
RUN bundle install

COPY . /usr/src/app

# Remove the AWS SDK gems, since they're already included in the base Lambda image.
# This saves a lot of space in the deployment package.
RUN zip -r deploy.zip * \
  -x Dockerfile \
  -x aws_runner.rb \
  -x script/\* \
  -x vendor/bundle/ruby/2.7.0/cache/\* \
  -x vendor/bundle/ruby/\*/\*/aws-\* \
  -x vendor/bundle/ruby/\*/cache/\* \
  -x vendor/bundle/ruby/\*/\*/jmespath*

CMD ["/bin/bash"]

7 changes: 7 additions & 0 deletions Gemfile
@@ -0,0 +1,7 @@
source 'https://rubygems.org'

gem 'aws-sdk-rds'
gem 'aws-sdk-cloudwatchlogs'
gem 'aws-sdk-cloudwatch'

gem 'chronic_duration'
37 changes: 37 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,37 @@
GEM
remote: https://rubygems.org/
specs:
aws-eventstream (1.2.0)
aws-partitions (1.602.0)
aws-sdk-cloudwatch (1.64.0)
aws-sdk-core (~> 3, >= 3.127.0)
aws-sigv4 (~> 1.1)
aws-sdk-cloudwatchlogs (1.53.0)
aws-sdk-core (~> 3, >= 3.127.0)
aws-sigv4 (~> 1.1)
aws-sdk-core (3.131.2)
aws-eventstream (~> 1, >= 1.0.2)
aws-partitions (~> 1, >= 1.525.0)
aws-sigv4 (~> 1.1)
jmespath (~> 1, >= 1.6.1)
aws-sdk-rds (1.146.0)
aws-sdk-core (~> 3, >= 3.127.0)
aws-sigv4 (~> 1.1)
aws-sigv4 (1.5.0)
aws-eventstream (~> 1, >= 1.0.2)
chronic_duration (0.10.6)
numerizer (~> 0.1.1)
jmespath (1.6.1)
numerizer (0.1.1)

PLATFORMS
x86_64-linux

DEPENDENCIES
aws-sdk-cloudwatch
aws-sdk-cloudwatchlogs
aws-sdk-rds
chronic_duration

BUNDLED WITH
2.2.25
130 changes: 130 additions & 0 deletions README.md
@@ -0,0 +1,130 @@
# CloudWatch RDS OS Metrics

There are quite a few things going on under the hood of Aurora, some of which might be
consuming extra resources without much explanation.

For each Aurora Postgres instance, there are `RDS processes`, the `Aurora Storage Daemon`,
`rdsadmin` background processes, Aurora runtimes, and `OS processes`. You can see
a glimpse of them in the RDS dashboard, under Monitoring -> OS Process List.

After spending months tracking down unexplained CPU utilization, I discovered
that the `RDS processes` category consumes a majority of the CPU when query logging
is enabled. After several months of uptime, the CPU utilization increases
even more.

This Lambda script pulls metrics from the CloudWatch `RDSOSMetrics` log group,
parses the CPU and memory utilization for each PID, aggregates and categorizes
them, and then publishes custom CloudWatch metrics for a given RDS instance.
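
Each `RDSOSMetrics` log event is a JSON document containing a `processList` array. A heavily abridged example (field values are illustrative) looks roughly like this:

```
{
  "engine": "Aurora",
  "instanceID": "prod-writer",
  "processList": [
    { "name": "RDS processes",         "cpuUsedPc": 21.5, "memoryUsedPc": 3.2 },
    { "name": "Aurora Storage Daemon", "cpuUsedPc": 4.1,  "memoryUsedPc": 6.8 },
    { "name": "postgres: postgres",    "cpuUsedPc": 12.3, "memoryUsedPc": 1.4 }
  ]
}
```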

(Neat screenshot here)

While this was written for Aurora Postgres, it could be tailored for MySQL as well.

### First Local Test

If you have Ruby installed, you can run a quick test without doing all of the deployment work below. Try this out:

```
$ bundle install
# Edit runner.rb and change the `event` hash
$ export AWS_ACCESS_KEY_ID=...
$ export AWS_SECRET_ACCESS_KEY=...
$ export AWS_DEFAULT_REGION=...
$ bundle exec ruby runner.rb
```
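
For reference, the `event` hash at the top of `runner.rb` (shown later in this commit) looks like this; set `instance_id` to your own instance name:

```
event = {
  "instance_id" => "my-prod-writer"
}
```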

Wait a few minutes, and then check out your [CloudWatch custom metrics](https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#metricsV2).

There should be an `RDS_OS_Metrics` custom namespace with everything fun in it.
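
You can also confirm the namespace exists from the command line:

```
$ aws cloudwatch list-metrics --namespace RDS_OS_Metrics
```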

### First Deployment

Install Docker for the CI build process, then run:

```
$ ./script/ci_build
$ ./script/create_function --profile me --region us-east-1 --name rdsosmetrics
```

### Create an EventBridge Rule

[Create a Rule](https://us-east-1.console.aws.amazon.com/events/home?region=us-east-1#/rules/create)

Assuming your database instance is named `prod-writer`:

* Name: `prod-writer_RDS_OS_Metrics`
* Description: `Publish RDS metrics to CloudWatch for prod-writer`
* Rule type: `Schedule` (click Next)
* A schedule that runs at a regular rate: `1 minute` (click Next)
* AWS Service -> Lambda function -> rdsosmetrics
* Additional settings -> Configure target input -> Constant (JSON text)
* Paste in parameters to call the function with:

```
{
  "instance_id": "prod-writer",
  "interval": "1 minute"
}
```

* `instance_id`: The instance name of the RDS Aurora instance to publish metrics for
* `interval`: Human-readable duration to look back over; stats are aggregated across this window before publishing to CloudWatch metrics (see the example after this list). Default: `"1 minute"`

* Set the maximum event age to 1 minute and retry attempts to 0. If the script fails, you don't want re-runs to pile up.
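
The `interval` string is parsed with the `chronic_duration` gem, so any duration it understands should work. A quick sanity check in `irb`:

```
require 'chronic_duration'

ChronicDuration.parse("1 minute")   # => 60 (seconds)
ChronicDuration.parse("5 minutes")  # => 300
```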

Create a new rule for each Aurora instance you want to monitor.

### Create a CloudWatch Widget

Use this as a JSON source. I'm mainly interested in `RDS processes` and regular Postgres user CPU activity;
everything else gets grouped into an "Other" category.

```
{
  "metrics": [
    [ "RDS_OS_Metrics", "CPU", "service", "postgres", "rds_instance", "prod-writer", { "id": "m1" } ],
    [ "...", "rds-processes", ".", ".", { "id": "m2" } ],
    [ "...", "postgres-aurora", ".", ".", { "id": "m3", "visible": false } ],
    [ "...", "aurora-storage", ".", ".", { "id": "m4", "visible": false } ],
    [ "...", "postgres-background", ".", ".", { "id": "m5", "visible": false } ],
    [ "...", "os-processes", ".", ".", { "id": "m6", "visible": false } ],
    [ { "expression": "m3 + m4 + m5 + m6", "label": "Other", "id": "e1" } ]
  ],
  "view": "timeSeries",
  "stacked": false,
  "region": "us-east-1",
  "stat": "Average",
  "period": 60
}
```

And one for memory, although this isn't as interesting:


```
{
  "metrics": [
    [ "RDS_OS_Metrics", "Memory", "service", "postgres", "rds_instance", "prod-writer", { "id": "m1" } ],
    [ "...", "rds-processes", ".", ".", { "id": "m2" } ],
    [ "...", "postgres-aurora", ".", ".", { "id": "m3", "visible": false } ],
    [ "...", "aurora-storage", ".", ".", { "id": "m4", "visible": false } ],
    [ "...", "postgres-background", ".", ".", { "id": "m5", "visible": false } ],
    [ "...", "os-processes", ".", ".", { "id": "m6", "visible": false } ],
    [ { "expression": "m3 + m4 + m5 + m6", "label": "Other", "id": "e1" } ]
  ],
  "view": "timeSeries",
  "stacked": false,
  "region": "us-east-1",
  "stat": "Average",
  "period": 60
}
```

### Updating the Code

```
$ ./script/ci_build
$ ./script/update_function --profile me --region us-east-1 --name rdsosmetrics
```


118 changes: 118 additions & 0 deletions handler.rb
@@ -0,0 +1,118 @@
require 'json'
require 'aws-sdk-rds'
require 'aws-sdk-cloudwatchlogs'
require 'aws-sdk-cloudwatch'
require 'chronic_duration'

# Event input JSON
# * instance_id: Your database instance name
# * interval: Human-readable duration to look back over.
#   Stats are aggregated across this window before
#   publishing to CloudWatch metrics. Default: "1 minute"

# Cache for multiple RDS instances in the same
# Lambda execution environment. This saves
# one API call for looking up the resource ID.
@resource_ids = {}

def handler(event:, context:)
  $stdout.sync = true
  $stderr.sync = true

  @resource_ids ||= {}

  puts "event: #{event.inspect}"
  puts "resource id cache: #{@resource_ids.inspect}"

  interval = event['interval'] || "1 minute"
  instance_id = event['instance_id']

  rds = Aws::RDS::Client.new
  @resource_ids[instance_id] ||= rds.describe_db_instances({
    db_instance_identifier: instance_id
  }).to_h[:db_instances][0][:dbi_resource_id]

  cwl = Aws::CloudWatchLogs::Client.new
  events = cwl.get_log_events({
    log_group_name: "RDSOSMetrics",
    log_stream_name: @resource_ids[instance_id],
    start_time: (Time.now - ChronicDuration.parse(interval)).to_i * 1000
  })

  # Aggregation of all metrics for this time interval
  # [ dimensions ] => value
  sums = {}
  event_count = 0

  events.events.each do |log_event|
    timestamp = Time.at(log_event.timestamp / 1000)
    data = JSON.parse(log_event.message)
    data['processList'].each do |process|
      dimension = parse_process_dimension(instance_id, process['name'])
      # Other interesting metrics are available here, like vss and rss, but I'm more
      # interested in just percentages
      sums[ dimension + [{name:"metric",value:"CPU"}] ] ||= 0
      sums[ dimension + [{name:"metric",value:"CPU"}] ] += process['cpuUsedPc'].to_f
      #if process['cpuUsedPc'].to_f > sums[ dimension + [{name:"metric",value:"CPU"}] ].to_f
      #  sums[ dimension + [{name:"metric",value:"CPU"}] ] = process['cpuUsedPc'].to_f
      #end

      sums[ dimension + [{name:"metric",value:"Memory"}] ] ||= 0
      sums[ dimension + [{name:"metric",value:"Memory"}] ] += process['memoryUsedPc'].to_f
      #if process['memoryUsedPc'].to_f > sums[ dimension + [{name:"metric",value:"Memory"}] ]
      #  sums[ dimension + [{name:"metric",value:"Memory"}] ] = process['memoryUsedPc'].to_f
      #end
    end
    event_count += 1
  end

  # Iterate over the sums and publish average statistics
  # for this time interval
  cw = Aws::CloudWatch::Client.new
  sums.each do |dimension, value|
    metric_name = dimension.pop[:value]
    cw.put_metric_data({
      namespace: "RDS_OS_Metrics",
      metric_data: [{
        metric_name: metric_name,
        timestamp: Time.now,
        unit: "Percent",
        # divide by event count for average
        value: (value.to_f / event_count.to_f),
        # NOTE: Do we want to use the max instead?
        #value: value.to_f,
        dimensions: dimension
      }]
    })
  end

rescue => e
  puts "Exception: #{e.message}"
  raise e
end

# Take a process name, categorize it and return the
# dimensions of a CW metric for this PID
def parse_process_dimension(instance_id, name)
  dimension = [
    { name: "rds_instance", value: instance_id }
  ]
  case name
  when /^postgres: postgres/, "postgres"
    dimension.push({ name: "service", value: "postgres" })
  when /^postgres: rdsadmin/, /^postgres: aurora/
    dimension.push({ name: "service", value: "postgres-aurora" })
  when /^postgres: /, "pg_controldata"
    dimension.push({ name: "service", value: "postgres-background" })
  when "Aurora Storage Daemon"
    dimension.push({ name: "service", value: "aurora-storage" })
  when "RDS processes"
    dimension.push({ name: "service", value: "rds-processes" })
  when "OS processes"
    dimension.push({ name: "service", value: "os-processes" })
  else
    puts "Can't figure out what this process is: #{name}"
  end

  dimension
end
18 changes: 18 additions & 0 deletions runner.rb
@@ -0,0 +1,18 @@
# Simple test script
# Remember to set your AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY!

# Set this to your db instance name
event = {
  "instance_id" => "my-prod-writer"
}

require "./handler.rb"

# Simulate what this will behave like when run with
# a cached execution environment in Lambda
loop do
  start = Time.now
  handler(event: event, context: {})
  puts "execution time: #{(Time.now - start) * 1000} ms"
  sleep 60
end
18 changes: 18 additions & 0 deletions script/ci_build
@@ -0,0 +1,18 @@
#!/bin/bash
# Build a deployment zip file. Contains any special
# gems that are not part of the usual AWS SDK, which
# is already available in the Lambda environment

set -e

ROOT=$(cd -P -- "$(dirname -- "$0")/.." && printf '%s\n' "$(pwd -P)")

cd "$ROOT"

rm -f "$ROOT/deploy.zip"

docker build -t rdsosmetrics .
docker create --name rdsosmetrics_container rdsosmetrics
docker cp rdsosmetrics_container:/usr/src/app/deploy.zip .
docker rm rdsosmetrics_container
