GDB-286 Added monitoring for graphdb

* Added the necessary policy for the CloudWatch agent
* Modified the user data script to enable the CloudWatch agent and configure it for the current tenant
* Added a sample dashboard with a widget that monitors the CPU
Ontotext-AD · Oct 13, 2023 · b391f45 · b391f45
1 parent 1e0b374
commit b391f45
Show file tree

Hide file tree

Showing 10 changed files with 44 additions and 6 deletions.
diff --git a/examples/vpc-with-multiple-az/main.tf b/examples/vpc-with-multiple-az/main.tf
@@ -28,3 +28,10 @@ module "graphdb" {
   ami_id          = var.ami_id
   graphdb_version = var.graphdb_version
 }
+
+# Instantiates the monitoring module that lives in the local ./monitoring directory of this example.
+module "monitoring" {
+  source = "./monitoring"
+
+  # Both inputs are passed straight through from this example's own variables.
+  aws_region           = var.aws_region
+  resource_name_prefix = var.resource_name_prefix
+}
diff --git a/examples/vpc-with-multiple-az/monitoring/README.md b/examples/vpc-with-multiple-az/monitoring/README.md
@@ -0,0 +1,15 @@
+# GraphDB AWS Monitoring Module
+
+This module creates the configuration needed to set up a CloudWatch dashboard with the widgets that show the health of the GraphDB cluster. The configuration should be applied per tenant.
+
+## Widgets
+This module creates the widgets required to monitor the load and health of GraphDB. These include:
+* CPU Load
+* Free Memory
+* Node Health
+* and others
+
+The widgets are configured with queries and should be usable for each new SaaS offering.
+
+## Alarms
+Alarms are defined to send notifications about cluster health in case of abnormal behavior, such as a node going down or CPU usage running high.
diff --git a/modules/monitoring/main.tf → ...s/vpc-with-multiple-az/monitoring/main.tf b/modules/monitoring/main.tf → ...s/vpc-with-multiple-az/monitoring/main.tf
@@ -11,7 +11,7 @@ resource "aws_cloudwatch_dashboard" "main" {
         "type": "metric",
         "properties": {
           "metrics": [
-            [ { "expression": "SELECT AVG(graphdb_cpu_load) FROM \"GraphDB-Metrics\" GROUP BY host", "id": "q1", "label": "CPU", "${var.aws_region}": "eu-central-1", "stat": "Average" } ]
+            [ { "expression": "SELECT AVG(graphdb_cpu_load) FROM \"${var.resource_name_prefix}-graphdb\" GROUP BY host", "id": "q1", "label": "CPU", "region": var.aws_region, "stat": "Average" } ]
           ],
           "region": var.aws_region,
           "stacked": false,

diff --git a/modules/monitoring/outputs.tf → ...pc-with-multiple-az/monitoring/outputs.tf b/modules/monitoring/outputs.tf → ...pc-with-multiple-az/monitoring/outputs.tf
diff --git a/modules/monitoring/variables.tf → ...-with-multiple-az/monitoring/variables.tf b/modules/monitoring/variables.tf → ...-with-multiple-az/monitoring/variables.tf
diff --git a/examples/vpc-with-multiple-az/monitoring/versions.tf b/examples/vpc-with-multiple-az/monitoring/versions.tf
@@ -0,0 +1,10 @@
+# Terraform CLI and provider version requirements for this module.
+terraform {
+  # Minimum Terraform CLI version.
+  required_version = ">= 1.4.0"
+
+  required_providers {
+    # "~> 5.15" accepts 5.15 and any later 5.x release, but not 6.0.
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.15"
+    }
+  }
+}
diff --git a/modules/iam/main.tf b/modules/iam/main.tf
@@ -3,6 +3,11 @@ resource "aws_iam_instance_profile" "graphdb" {
   role        = var.user_supplied_iam_role_name != null ? var.user_supplied_iam_role_name : aws_iam_role.graphdb[0].name
 }
 
+# Attaches the AWS-managed CloudWatchAgentServerPolicy to the role used by the GraphDB instance profile.
+# The role is resolved the same way as in aws_iam_instance_profile.graphdb: aws_iam_role.graphdb has
+# count = 0 when var.user_supplied_iam_role_name is set, so indexing [0] unconditionally would fail then.
+resource "aws_iam_role_policy_attachment" "cloudwatch-agent-policy" {
+  role       = var.user_supplied_iam_role_name != null ? var.user_supplied_iam_role_name : aws_iam_role.graphdb[0].name
+  policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
+}
+
 resource "aws_iam_role" "graphdb" {
   count                = var.user_supplied_iam_role_name != null ? 0 : 1
   name_prefix          = "${var.resource_name_prefix}-graphdb-"

diff --git a/modules/monitoring/README.md b/modules/monitoring/README.md
diff --git a/modules/user_data/main.tf b/modules/user_data/main.tf
@@ -27,6 +27,7 @@ locals {
       zone_id       = var.zone_id
 
       jvm_max_memory = local.jvm_max_memory
+      resource_name_prefix = var.resource_name_prefix
     }
   )
 }
diff --git a/modules/user_data/templates/start_graphdb.sh.tpl b/modules/user_data/templates/start_graphdb.sh.tpl
@@ -231,11 +231,14 @@ echo 'fs.file-max = 262144' | tee -a /etc/sysctl.conf
 
 sysctl -p
 
-chown -R graphdb:graphdb /etc/graphdb/ /etc/prometheus/
+tmp=$(mktemp)
+jq '.logs.metrics_collected.prometheus.log_group_name = "${resource_name_prefix}-graphdb"' /etc/graphdb/cloudwatch-agent-config.json > "$tmp" && mv "$tmp" /etc/graphdb/cloudwatch-agent-config.json
+jq '.logs.metrics_collected.prometheus.emf_processor.metric_namespace = "${resource_name_prefix}-graphdb"' /etc/graphdb/cloudwatch-agent-config.json > "$tmp" && mv "$tmp" /etc/graphdb/cloudwatch-agent-config.json
+amazon-cloudwatch-agent-ctl -a start
 amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:/etc/graphdb/cloudwatch-agent-config.json
 
 # the proxy service is set up in the AMI but not enabled there, so we enable and start it
 systemctl daemon-reload
 systemctl start graphdb
 systemctl enable graphdb-cluster-proxy.service
-systemctl start graphdb-cluster-proxy.service
+systemctl start graphdb-cluster-proxy.service