From 5f7f35693fe754809831b102fa07bf9bbb66c646 Mon Sep 17 00:00:00 2001 From: Jeremy Fleischman Date: Mon, 10 Mar 2025 09:38:46 -0700 Subject: [PATCH] More mailserver probe fixes A few things here: 1. Increase the `smtp_starttls` timeout. We're seeing probes sporadically fail against ImprovMX with "i/o timeout". (see logs below) 2. Add missing port for probing `umbriel` (oops). 3. Also alert on `umbriel` probe failures. ImprovMX probe logs: ``` ts=2025-03-10T16:27:37.785620787Z caller=main.go:190 module=smtp_starttls target=mx2.improvmx.com:25 level=info msg="Beginning probe" probe=tcp timeout_seconds=5 ts=2025-03-10T16:27:37.785744902Z caller=tcp.go:40 module=smtp_starttls target=mx2.improvmx.com:25 level=info msg="Resolving target address" target=mx2.improvmx.com ip_protocol=ip6 ts=2025-03-10T16:27:37.823984377Z caller=tcp.go:40 module=smtp_starttls target=mx2.improvmx.com:25 level=info msg="Resolved target address" target=mx2.improvmx.com ip=2a05:d012:412:e202:e81e:cc44:3b53:8a3d ts=2025-03-10T16:27:37.824035793Z caller=tcp.go:112 module=smtp_starttls target=mx2.improvmx.com:25 level=info msg="Dialing TCP without TLS" ts=2025-03-10T16:27:37.980888559Z caller=handler.go:119 module=smtp_starttls target=mx2.improvmx.com:25 level=info msg="Successfully dialed" ts=2025-03-10T16:27:37.980965545Z caller=handler.go:119 module=smtp_starttls target=mx2.improvmx.com:25 level=info msg="Processing query response entry" entry_number=0 ts=2025-03-10T16:27:42.293492984Z caller=handler.go:119 module=smtp_starttls target=mx2.improvmx.com:25 level=debug msg="Read line" line="220 mx2.improvmx.com Welcome to ImprovMX mail server v3. - ImprovMX v2024.06.06" ts=2025-03-10T16:27:42.293596408Z caller=handler.go:119 module=smtp_starttls target=mx2.improvmx.com:25 level=info msg="Regexp matched" regexp=^220 line="220 mx2.improvmx.com Welcome to ImprovMX mail server v3. 
- ImprovMX v2024.06.06" ts=2025-03-10T16:27:42.293617839Z caller=handler.go:119 module=smtp_starttls target=mx2.improvmx.com:25 level=info msg="Processing query response entry" entry_number=1 ts=2025-03-10T16:27:42.293631825Z caller=handler.go:119 module=smtp_starttls target=mx2.improvmx.com:25 level=debug msg="Sending line" line="EHLO prober\r" ts=2025-03-10T16:27:42.293683932Z caller=handler.go:119 module=smtp_starttls target=mx2.improvmx.com:25 level=info msg="Processing query response entry" entry_number=2 ts=2025-03-10T16:27:42.786519394Z caller=handler.go:119 module=smtp_starttls target=mx2.improvmx.com:25 level=error msg="Error reading from connection" err="read tcp6 [2a01:4ff:1f0:ad06::]:39860->[2a05:d012:412:e202:e81e:cc44:3b53:8a3d]:25: i/o timeout" ts=2025-03-10T16:27:42.786645963Z caller=main.go:190 module=smtp_starttls target=mx2.improvmx.com:25 level=error msg="Probe failed" duration_seconds=5.000976894 ``` --- build/pluto/prometheus/exporters/blackbox.nix | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/build/pluto/prometheus/exporters/blackbox.nix b/build/pluto/prometheus/exporters/blackbox.nix index 0512d378..443e7752 100644 --- a/build/pluto/prometheus/exporters/blackbox.nix +++ b/build/pluto/prometheus/exporters/blackbox.nix @@ -5,10 +5,10 @@ let { module, targets, - job_name ? "blackbox-${module}", + job_suffix ? 
"", }: { - inherit job_name; + job_name = "blackbox-${module}${job_suffix}"; metrics_path = "/probe"; params = { module = [ module ]; @@ -75,7 +75,7 @@ in # From https://github.com/prometheus/blackbox_exporter/blob/53e78c2b3535ecedfd072327885eeba2e9e51ea2/example.yml#L120-L133 modules.smtp_starttls = { prober = "tcp"; - timeout = "5s"; + timeout = "10s"; tcp = { query_response = [ { expect = "^220"; } @@ -122,8 +122,8 @@ in # https://github.com/NixOS/infra/issues/485 (mkStaticProbe { module = "smtp_starttls"; - job_name = "smtp_starttls_umbriel"; - targets = [ "umbriel.nixos.org" ]; + job_suffix = "_umbriel"; + targets = [ "umbriel.nixos.org:25" ]; }) (mkDnsSdProbe "smtp_starttls" { names = [ @@ -162,7 +162,7 @@ in { alert = "MxUnreachable"; expr = '' - probe_success{job="blackbox-smtp_starttls"} == 0 + probe_success{job=~"blackbox-smtp_starttls.*"} == 0 ''; for = "15m"; labels.severity = "warning";