From ad8a9e0c4387ac2ec36418908b3338fd6b02f615 Mon Sep 17 00:00:00 2001 From: Michael Bauer Date: Mon, 16 Dec 2024 10:59:16 -0500 Subject: [PATCH] feat: Implement mvp functionality to pull stats besides averages --- basic/collector.go | 1 + basic/metrics.go | 93 +++++++++++++++++++++++----------------------- basic/scraper.go | 84 ++++++++++++++++++++++++++--------------- 3 files changed, 102 insertions(+), 76 deletions(-) diff --git a/basic/collector.go b/basic/collector.go index adfb6c68..ca84539c 100644 --- a/basic/collector.go +++ b/basic/collector.go @@ -28,6 +28,7 @@ type Metric struct { cwName string prometheusName string prometheusHelp string + statistics []string } type Collector struct { diff --git a/basic/metrics.go b/basic/metrics.go index a35c2c37..89beab8f 100644 --- a/basic/metrics.go +++ b/basic/metrics.go @@ -3,117 +3,118 @@ package basic var Metrics = []Metric{ { cwName: "ActiveTransactions", - prometheusName: "aws_rds_active_transactions_average", + prometheusName: "aws_rds_active_transactions", prometheusHelp: "ActiveTransactions", }, { cwName: "AuroraBinlogReplicaLag", - prometheusName: "aws_rds_aurora_binlog_replica_lag_average", + prometheusName: "aws_rds_aurora_binlog_replica_lag", prometheusHelp: "AuroraBinlogReplicaLag", }, { cwName: "AuroraReplicaLag", - prometheusName: "aws_rds_aurora_replica_lag_average", + prometheusName: "aws_rds_aurora_replica_lag", prometheusHelp: "AuroraReplicaLag", }, { cwName: "AuroraReplicaLagMaximum", - prometheusName: "aws_rds_aurora_replica_lag_maximum_average", + prometheusName: "aws_rds_aurora_replica_lag_maximum", prometheusHelp: "AuroraReplicaLagMaximum", }, { cwName: "AuroraReplicaLagMinimum", - prometheusName: "aws_rds_aurora_replica_lag_minimum_average", + prometheusName: "aws_rds_aurora_replica_lag_minimum", prometheusHelp: "AuroraReplicaLagMinimum", }, { cwName: "BinLogDiskUsage", - prometheusName: "aws_rds_bin_log_disk_usage_average", + prometheusName: "aws_rds_bin_log_disk_usage", prometheusHelp: "The
amount of disk space occupied by binary logs on the master. Applies to MySQL read replicas. Units: Bytes", }, { cwName: "BlockedTransactions", - prometheusName: "aws_rds_blocked_transactions_average", + prometheusName: "aws_rds_blocked_transactions", prometheusHelp: "BlockedTransactions", }, { cwName: "BufferCacheHitRatio", - prometheusName: "aws_rds_buffer_cache_hit_ratio_average", + prometheusName: "aws_rds_buffer_cache_hit_ratio", prometheusHelp: "BufferCacheHitRatio", }, { cwName: "BurstBalance", - prometheusName: "aws_rds_burst_balance_average", + prometheusName: "aws_rds_burst_balance", prometheusHelp: "The percent of General Purpose SSD (gp2) burst-bucket I/O credits available. Units: Percent", }, { cwName: "CPUCreditBalance", - prometheusName: "aws_rds_cpu_credit_balance_average", + prometheusName: "aws_rds_cpu_credit_balance", prometheusHelp: "[T2 instances] The number of CPU credits available for the instance to burst beyond its base CPU utilization. Credits are stored in the credit balance after they are earned and removed from the credit balance after they expire. Credits expire 24 hours after they are earned. CPU credit metrics are available only at a 5 minute frequency. Units: Count", }, { cwName: "CPUCreditUsage", - prometheusName: "aws_rds_cpu_credit_usage_average", + prometheusName: "aws_rds_cpu_credit_usage", prometheusHelp: "[T2 instances] The number of CPU credits consumed by the instance. One CPU credit equals one vCPU running at 100% utilization for one minute or an equivalent combination of vCPUs, utilization, and time (for example, one vCPU running at 50% utilization for two minutes or two vCPUs running at 25% utilization for two minutes). CPU credit metrics are available only at a 5 minute frequency. If you specify a period greater than five minutes, use the Sum statistic instead of the Average statistic. 
Units: Count", }, { cwName: "CPUUtilization", - prometheusName: "node_cpu_average", + prometheusName: "node_cpu", prometheusHelp: "The percentage of CPU utilization. Units: Percent", }, { cwName: "CommitLatency", - prometheusName: "aws_rds_commit_latency_average", + prometheusName: "aws_rds_commit_latency", prometheusHelp: "CommitLatency", }, { cwName: "CommitThroughput", - prometheusName: "aws_rds_commit_throughput_average", + prometheusName: "aws_rds_commit_throughput", prometheusHelp: "CommitThroughput", }, { cwName: "DDLLatency", - prometheusName: "aws_rds_ddl_latency_average", + prometheusName: "aws_rds_ddl_latency", prometheusHelp: "DDLLatency", }, { cwName: "DDLThroughput", - prometheusName: "aws_rds_ddl_throughput_average", + prometheusName: "aws_rds_ddl_throughput", prometheusHelp: "DDLThroughput", }, { cwName: "DMLLatency", - prometheusName: "aws_rds_dml_latency_average", + prometheusName: "aws_rds_dml_latency", prometheusHelp: "DMLLatency", }, { cwName: "DMLThroughput", - prometheusName: "aws_rds_dml_throughput_average", + prometheusName: "aws_rds_dml_throughput", prometheusHelp: "DMLThroughput", }, { cwName: "DatabaseConnections", - prometheusName: "aws_rds_database_connections_average", + prometheusName: "aws_rds_database_connections", prometheusHelp: "The number of database connections in use. 
Units: Count", + statistics: []string{"Sum", "Average"}, }, { cwName: "Deadlocks", - prometheusName: "aws_rds_deadlocks_average", + prometheusName: "aws_rds_deadlocks", prometheusHelp: "Deadlocks", }, { cwName: "DeleteLatency", - prometheusName: "aws_rds_delete_latency_average", + prometheusName: "aws_rds_delete_latency", prometheusHelp: "DeleteLatency", }, { cwName: "DeleteThroughput", - prometheusName: "aws_rds_delete_throughput_average", + prometheusName: "aws_rds_delete_throughput", prometheusHelp: "DeleteThroughput", }, { cwName: "DiskQueueDepth", - prometheusName: "aws_rds_disk_queue_depth_average", + prometheusName: "aws_rds_disk_queue_depth", prometheusHelp: "The number of outstanding IOs (read/write requests) waiting to access the disk. Units: Count", }, { @@ -123,7 +124,7 @@ var Metrics = []Metric{ }, { cwName: "FreeLocalStorage", - prometheusName: "aws_rds_free_local_storage_average", + prometheusName: "aws_rds_free_local_storage", prometheusHelp: "FreeLocalStorage", }, { @@ -138,112 +139,112 @@ var Metrics = []Metric{ }, { cwName: "InsertLatency", - prometheusName: "aws_rds_insert_latency_average", + prometheusName: "aws_rds_insert_latency", prometheusHelp: "InsertLatency", }, { cwName: "InsertThroughput", - prometheusName: "aws_rds_insert_throughput_average", + prometheusName: "aws_rds_insert_throughput", prometheusHelp: "InsertThroughput", }, { cwName: "LoginFailures", - prometheusName: "aws_rds_login_failures_average", + prometheusName: "aws_rds_login_failures", prometheusHelp: "LoginFailures", }, { cwName: "NetworkReceiveThroughput", - prometheusName: "aws_rds_network_receive_throughput_average", + prometheusName: "aws_rds_network_receive_throughput", prometheusHelp: "The incoming (Receive) network traffic on the DB instance, including both customer database traffic and Amazon RDS traffic used for monitoring and replication. 
Units: Bytes/second", }, { cwName: "NetworkThroughput", - prometheusName: "aws_rds_network_throughput_average", + prometheusName: "aws_rds_network_throughput", prometheusHelp: "NetworkThroughput", }, { cwName: "NetworkTransmitThroughput", - prometheusName: "aws_rds_network_transmit_throughput_average", + prometheusName: "aws_rds_network_transmit_throughput", prometheusHelp: "The outgoing (Transmit) network traffic on the DB instance, including both customer database traffic and Amazon RDS traffic used for monitoring and replication. Units: Bytes/second", }, { cwName: "Queries", - prometheusName: "aws_rds_queries_average", + prometheusName: "aws_rds_queries", prometheusHelp: "Queries", }, { cwName: "ReadIOPS", - prometheusName: "aws_rds_read_iops_average", + prometheusName: "aws_rds_read_iops", prometheusHelp: "The average number of disk I/O operations per second. Units: Count/Second", }, { cwName: "ReadLatency", - prometheusName: "aws_rds_read_latency_average", + prometheusName: "aws_rds_read_latency", prometheusHelp: "The average amount of time taken per disk I/O operation. Units: Seconds", }, { cwName: "ReadThroughput", - prometheusName: "aws_rds_read_throughput_average", + prometheusName: "aws_rds_read_throughput", prometheusHelp: "The average number of bytes read from disk per second. 
Units: Bytes/Second", }, { cwName: "ResultSetCacheHitRatio", - prometheusName: "aws_rds_result_set_cache_hit_ratio_average", + prometheusName: "aws_rds_result_set_cache_hit_ratio", prometheusHelp: "ResultSetCacheHitRatio", }, { cwName: "SelectLatency", - prometheusName: "aws_rds_select_latency_average", + prometheusName: "aws_rds_select_latency", prometheusHelp: "SelectLatency", }, { cwName: "SelectThroughput", - prometheusName: "aws_rds_select_throughput_average", + prometheusName: "aws_rds_select_throughput", prometheusHelp: "SelectThroughput", }, { cwName: "SwapUsage", - prometheusName: "aws_rds_swap_usage_average", + prometheusName: "aws_rds_swap_usage", prometheusHelp: "The amount of swap space used on the DB instance. Units: Bytes", }, { cwName: "UpdateLatency", - prometheusName: "aws_rds_update_latency_average", + prometheusName: "aws_rds_update_latency", prometheusHelp: "UpdateLatency", }, { cwName: "UpdateThroughput", - prometheusName: "aws_rds_update_throughput_average", + prometheusName: "aws_rds_update_throughput", prometheusHelp: "UpdateThroughput", }, { cwName: "VolumeBytesUsed", - prometheusName: "aws_rds_volume_bytes_used_average", + prometheusName: "aws_rds_volume_bytes_used", prometheusHelp: "VolumeBytesUsed", }, { cwName: "VolumeReadIOPs", - prometheusName: "aws_rds_volume_read_io_ps_average", + prometheusName: "aws_rds_volume_read_io_ps", prometheusHelp: "VolumeReadIOPs", }, { cwName: "VolumeWriteIOPs", - prometheusName: "aws_rds_volume_write_io_ps_average", + prometheusName: "aws_rds_volume_write_io_ps", prometheusHelp: "VolumeWriteIOPs", }, { cwName: "WriteIOPS", - prometheusName: "aws_rds_write_iops_average", + prometheusName: "aws_rds_write_iops", prometheusHelp: "The average number of disk I/O operations per second. Units: Count/Second", }, { cwName: "WriteLatency", - prometheusName: "aws_rds_write_latency_average", + prometheusName: "aws_rds_write_latency", prometheusHelp: "The average amount of time taken per disk I/O operation. 
Units: Seconds", }, { cwName: "WriteThroughput", - prometheusName: "aws_rds_write_throughput_average", + prometheusName: "aws_rds_write_throughput", prometheusHelp: "The average number of bytes written to disk per second. Units: Bytes/Second", }, { diff --git a/basic/scraper.go b/basic/scraper.go index 978f5d02..0a25219f 100644 --- a/basic/scraper.go +++ b/basic/scraper.go @@ -1,13 +1,13 @@ package basic import ( - "sync" - "time" - "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/service/cloudwatch" "github.com/go-kit/log/level" "github.com/prometheus/client_golang/prometheus" + "strings" + "sync" + "time" "github.com/percona/rds_exporter/config" ) @@ -96,51 +96,75 @@ func (s *Scraper) scrapeMetric(metric Metric) error { now := time.Now() end := now.Add(-Delay) - params := &cloudwatch.GetMetricStatisticsInput{ - EndTime: aws.Time(end), - StartTime: aws.Time(end.Add(-Range)), + // If metric.statistics is empty, default to ["Average"] for backwards compatibility + stats := metric.statistics + if stats == nil || len(stats) == 0 { + stats = []string{"Average"} + } + params := &cloudwatch.GetMetricStatisticsInput{ + EndTime: aws.Time(end), + StartTime: aws.Time(end.Add(-Range)), Period: aws.Int64(int64(Period.Seconds())), MetricName: aws.String(metric.cwName), Namespace: aws.String("AWS/RDS"), - Dimensions: []*cloudwatch.Dimension{}, - Statistics: aws.StringSlice([]string{"Average"}), - Unit: nil, + Dimensions: []*cloudwatch.Dimension{ + { + Name: aws.String("DBInstanceIdentifier"), + Value: aws.String(s.instance.Instance), + }, + }, + Statistics: aws.StringSlice(stats), } - params.Dimensions = append(params.Dimensions, &cloudwatch.Dimension{ - Name: aws.String("DBInstanceIdentifier"), - Value: aws.String(s.instance.Instance), - }) - - // Call CloudWatch to gather the datapoints resp, err := s.svc.GetMetricStatistics(params) if err != nil { return err } - // There's nothing in there, don't publish the metric if len(resp.Datapoints) == 0 { return nil } - // 
Pick the latest datapoint dp := getLatestDatapoint(resp.Datapoints) - - // Get the metric. - v := aws.Float64Value(dp.Average) - switch metric.cwName { - case "EngineUptime": - // "Fake EngineUptime -> node_boot_time with time.Now().Unix() - EngineUptime." - v = float64(time.Now().Unix() - int64(v)) + if dp == nil { + return nil } - // Send metric. - s.ch <- prometheus.MustNewConstMetric( - prometheus.NewDesc(metric.prometheusName, metric.prometheusHelp, nil, s.constLabels), - prometheus.GaugeValue, - v, - ) + // For each requested statistic, build and send the Prometheus metric + for _, stat := range stats { + var value float64 + + switch stat { + case "Average": + value = aws.Float64Value(dp.Average) + case "Sum": + value = aws.Float64Value(dp.Sum) + case "Maximum": + value = aws.Float64Value(dp.Maximum) + case "Minimum": + value = aws.Float64Value(dp.Minimum) + default: + continue + } + + switch metric.cwName { + case "EngineUptime": + value = float64(time.Now().Unix() - int64(value)) + } + + // Append the statistic name to help identify them in Prometheus. + lowerStat := strings.ToLower(stat) + nameWithStat := metric.prometheusName + "_" + lowerStat + helpWithStat := metric.prometheusHelp + " (" + lowerStat + ")" + + // Emit the Prometheus metric + s.ch <- prometheus.MustNewConstMetric( + prometheus.NewDesc(nameWithStat, helpWithStat, nil, s.constLabels), + prometheus.GaugeValue, + value, + ) + } return nil }