Skip to content

Commit

Permalink
metrics: add grafana and alertmanager scripts (pingcap#14360)
Browse files Browse the repository at this point in the history
  • Loading branch information
jackysp authored Jan 17, 2020
1 parent a7076a5 commit 511cedf
Show file tree
Hide file tree
Showing 4 changed files with 19,729 additions and 0 deletions.
146 changes: 146 additions & 0 deletions metrics/alertmanager/tidb.rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
groups:
- name: alert.rules
rules:
- alert: TiDB_schema_error
expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: emergency
expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: TiDB schema error

- alert: TiDB_tikvclient_region_err_total
expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000
for: 1m
labels:
env: ENV_LABELS_ENV
level: emergency
expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: TiDB tikvclient_backoff_count error

- alert: TiDB_binlog_error_total
expr: increase( tidb_server_critical_error_total[5m] ) > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: emergency
expr: increase( tidb_server_critical_error_total[5m] ) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: TiDB tidb binlog error total

- alert: TiDB_domain_load_schema_total
expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10
for: 1m
labels:
env: ENV_LABELS_ENV
level: emergency
expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: TiDB domain_load_schema_total error

- alert: TiDB_monitor_keep_alive
expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100
for: 1m
labels:
env: ENV_LABELS_ENV
level: emergency
expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: TiDB monitor_keep_alive error

- alert: TiDB_server_panic_total
expr: increase(tidb_server_panic_total[10m]) > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: critical
expr: increase(tidb_server_panic_total[10m]) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: TiDB server panic total

- alert: TiDB_memory_abnormal
expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: TiDB heap memory usage is over 10 GB

- alert: TiDB_query_duration
expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) BY (le, instance)) > 1
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) BY (le, instance)) > 1
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: TiDB query duration 99th percentile is above 1s

- alert: TiDB_server_event_error
expr: increase(tidb_server_event_total{type=~"server_start|server_hang"}[15m]) > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: increase(tidb_server_event_total{type=~"server_start|server_hang"}[15m]) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: TiDB server event error

- alert: tidb_tikvclient_backoff_seconds_count
expr: increase( tidb_tikvclient_backoff_seconds_count[10m] ) > 10
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: increase( tidb_tikvclient_backoff_seconds_count[10m] ) > 10
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: TiDB tikvclient_backoff_count error

- alert: TiDB_monitor_time_jump_back_error
expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: TiDB monitor time_jump_back error

- alert: TiDB_ddl_waiting_jobs
expr: sum(tidb_ddl_waiting_jobs) > 5
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: sum(tidb_ddl_waiting_jobs) > 5
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: TiDB ddl waiting_jobs too much
Loading

0 comments on commit 511cedf

Please sign in to comment.