forked from customink/nagios-nrpe-check_glusterfs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_glusterfs
430 lines (387 loc) · 12.3 KB
/
check_glusterfs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
#!/bin/bash
## Fork of MarkR's GlusterFS-checks at:
## http://exchange.nagios.org/directory/Plugins/System-Metrics/File-System/GlusterFS-checks/details
### CHANGELOG
## 1.0.4
# * 19/03/2020
# * Modified by Edgar Matzinger <[email protected]>
#
# - outputs performance data (v;w;c;min;max) on free space and bricks (v)
# - introduced long options
# - allow for (almost) any unit to process free space, warning and
# critical thresholds
# - all calculations w.r.t. free space are done in kilobytes (KB)
# critical and warning thresholds are first converted into KB
# and converted back into the desired unit for output
# - warning and critical thresholds can be specified in percentages
# irrespective of the desired unit
# - if the desired output unit is in percentages, warning and critical
# thresholds MUST be in percentages too
# - if utils.sh isn't located, make up some exit values
# - according to the Nagios Plugin Development Guidelines, --timeout and
# verbosity options should be present, but this script implements neither
# - verify if a correct output unit is specified
# - enclosed variables in braces (${var})
# - nice to have: getting rid of awk (integer math maybe?)
## 1.0.2
# * 07/01/2014
# * Modified by Doug Wilson <[email protected]>
# * includes carrillm's fix to support TB sized volumes
# * outputs all errors on a critical alarm, not just free space
# This Nagios script was written against version 3.3 & 3.4 of Gluster. Older
# versions will most likely not work at all with this monitoring script.
#
# Gluster currently requires elevated permissions to do anything. In order to
# accommodate this, you need to allow your Nagios user some additional
# permissions via sudo. The line you want to add will look something like the
# following in /etc/sudoers (or something equivalent):
#
# Defaults:nagios !requiretty
# nagios ALL=(root) NOPASSWD:/usr/sbin/gluster volume status [[\:graph\:]]* detail,/usr/sbin/gluster volume heal [[\:graph\:]]* info
#
# That should give us all the access we need to check the status of any
# currently defined peers and volumes.
# Inspired by a script of Mark Nipper
#
# 2013, Mark Ruys, [email protected]
# Restrict command lookup to the system directories.
PATH=/sbin:/bin:/usr/sbin:/usr/bin
PROGNAME=${0##*/} # strip everything including the last '/', i.e. act as basename
# Act as dirname: strip the last '/' and everything after it. When $0 holds no
# '/' at all (e.g. "bash check_glusterfs"), ${0%/*} would return the script
# name itself, so fall back to the current directory in that case.
if [[ "${0}" == */* ]]; then
  PROGPATH=${0%/*}
else
  PROGPATH=.
fi
REVISION="1.0.4"
if [[ -f "${PROGPATH}/utils.sh" ]]; then
  # located it, source utils.sh (quoted so a path containing spaces works)
  . "${PROGPATH}/utils.sh"
else
  # utils.sh not located, make some exit values up
  STATE_OK=0
  STATE_WARNING=1
  STATE_CRITICAL=2
  STATE_UNKNOWN=3
  STATE_DEPENDENT=4
fi
# Print "<STATE>: <message>" on stdout and terminate the script with the exit
# code stored in the variable STATE_<STATE>.
#   $1 - state name (e.g. OK, WARNING, CRITICAL, UNKNOWN)
#   $2 - message text
# If STATE_<STATE> is unset or empty, the script exits with STATE_UNKNOWN
# (or 3 when that is unset too) instead of inheriting the status of the
# preceding command, which would report the failure as success.
function Exit () {
  local state_var="STATE_${1}"
  printf '%s: %s\n' "${1}" "${2}"
  exit "${!state_var:-${STATE_UNKNOWN:-3}}"
}
# Convert a value expressed in a given unit into whole kilobytes.
#   $1 - unit; only its first character is used, case-insensitively:
#        % (percentage of $3), B, M, G, T, P; anything else is taken as KB
#   $2 - numeric value (may be fractional), default 0
#   $3 - total space in KB, only used for the % unit, default 0
# Prints the result on stdout as an integer, truncated towards zero.
function convertToKB
{
  local unit="${1:0:1}"
  local val="${2:-0}"
  local totalKB="${3:-0}"
  # Values are handed over with -v rather than spliced into the program text,
  # so they can never be parsed as awk code. An if/else chain is used because
  # "switch" is a gawk extension. LC_ALL=C keeps "." as the decimal separator.
  LC_ALL=C awk -v unit="${unit}" -v val="${val}" -v totalspace="${totalKB}" 'BEGIN {
    unit = tolower(unit)
    if (unit == "%")
      kb = val * totalspace / 100
    else if (unit == "b")
      kb = val / 1024
    else if (unit == "m")
      kb = val * 1024
    else if (unit == "g")
      kb = val * (1024 * 1024)
    else if (unit == "t")
      kb = val * (1024 * 1024 * 1024)
    else if (unit == "p")
      kb = val * (1024 * 1024 * 1024 * 1024)
    else
      kb = val
    # int() truncates like %d does; printing through %.0f avoids awk
    # implementations whose %d is limited to 32 bits (TB-sized volumes).
    printf("%.0f\n", int(kb) + 0)
  }'
}
# Convert a value in kilobytes into the requested output unit.
#   $1 - unit; only its first character is used, case-insensitively:
#        % (percentage of $3), B, M, G, T, P; anything else is taken as KB
#   $2 - value in KB, default 0
#   $3 - total space in KB, only used for the % unit, default 0
# Prints the result on stdout: one decimal for %, three decimals for
# M/G/T/P, an integer for B and KB.
function convertToUnit
{
  local unit="${1:0:1}"
  local val="${2:-0}"
  local totalKB="${3:-0}"
  # Values are handed over with -v rather than spliced into the program text,
  # so they can never be parsed as awk code. An if/else chain is used because
  # "switch" is a gawk extension. LC_ALL=C keeps "." as the decimal separator.
  LC_ALL=C awk -v unit="${unit}" -v val="${val}" -v totalspace="${totalKB}" 'BEGIN {
    unit = tolower(unit)
    if (unit == "%") {
      # a total of 0 would make awk abort with a division by zero
      if (totalspace + 0 == 0)
        printf("0.0\n")
      else
        printf("%1.1f\n", 100 * val / totalspace)
    }
    else if (unit == "b")
      printf("%.0f\n", int(val * 1024) + 0)
    else if (unit == "m")
      printf("%1.3f\n", val / 1024)
    else if (unit == "g")
      printf("%1.3f\n", val / (1024 * 1024))
    else if (unit == "t")
      printf("%1.3f\n", val / (1024 * 1024 * 1024))
    else if (unit == "p")
      printf("%1.3f\n", val / (1024 * 1024 * 1024 * 1024))
    else
      printf("%.0f\n", int(val) + 0)
  }'
}
# Report usage information and terminate the script with ${STATE_UNKNOWN}.
#   $1 - "USAGE" for the one-line synopsis, "HELP" for the synopsis plus the
#        option list, or empty for no synopsis
#   $2 - optional message
# When stdout is a terminal, the message and the synopsis/help go to stderr.
# When stdout is not a terminal, a non-empty message is reported through
# Exit UNKNOWN and nothing else is printed.
function usage () {
  local mode="${1}"
  local text="${2}"
  local interactive=""

  # remember once whether stdout is attached to a terminal
  [[ -t 1 ]] && interactive="yes"

  if [[ -n "${interactive}" ]]; then
    if [[ -n "${text}" ]]; then
      echo -e "${PROGNAME}: ${text}" 1>&2
    fi
    if [[ -n "${mode}" ]]; then
      {
        # brief synopsis
        echo -e "${PROGNAME}: usage: ${PROGNAME} -v <VOLUME> -n <BRICKS> [-u <{%|B|K|M|G|T|P}>] [-w <THRESHOLD[%]> -c <THRESHOLD[%]>]"
        if [[ ${mode} == "HELP" ]]; then
          # full option list
          echo -e "where options are:\n"
          echo -e "\t--version\t\t\t- show version and exit"
          echo -e "\t-U,--usage\t\t\t- show brief information"
          echo -e "\t-c,--critical <THRESHOLD>\t- return critical if free disk space is below threshold"
          echo -e "\t-h,--help\t\t\t- show this information"
          echo -e "\t-n,--bricks <BRICKS>\t\t- number of bricks to be expected to be used on volume"
          echo -e "\t-u,--unit <{%|B|K|M|G|T|P}>\t- output unit, default G"
          echo -e "\t-v,--volume <VOLUME>\t\t- volume to inspect"
          echo -e "\t-w,--warning <THRESHOLD>\t- return warning if free disk space is below threshold"
          echo -e "\noptions can be specified in any order"
        fi
      } 1>&2
    fi
  elif [[ -n "${text}" ]]; then
    # not on a terminal: hand the message to nagios/naemon/etc.
    Exit UNKNOWN "incorrect program usage: ${text}"
  fi

  # default exit code
  exit ${STATE_UNKNOWN}
}
# main
# UNIT carries the uppercase attribute: whatever is assigned to it is upcased
typeset -u UNIT='G'

# Parse the command line.
# Short options are handled by getopts directly. Long options arrive through
# the "-:" entry of the option string: getopts then reports option "-" with
# the rest of the word ("name" or "name=value") in OPTARG.
while getopts ":Uc:hn:u:v:w:-:" opt; do
  case ${opt} in
    -)
      case "${OPTARG}" in
        bricks|critical|unit|volume|warning)
          # "--name value": the value is the next positional parameter
          longval="${!OPTIND}"
          OPTIND=$(( OPTIND + 1 ))
          ;;
        bricks=*|critical=*|unit=*|volume=*|warning=*)
          # "--name=value"
          longval="${OPTARG#*=}"
          ;;
        help)
          usage "HELP" ""
          ;;
        usage)
          usage "USAGE" ""
          ;;
        version)
          # printed directly: going through usage() would label the version
          # as "incorrect program usage" when stdout is not a terminal
          echo "${PROGNAME}: version: ${REVISION}"
          exit ${STATE_UNKNOWN}
          ;;
        *)
          usage "USAGE" "Unknown option --${OPTARG}"
          ;;
      esac
      # only value-taking long options reach this point (usage exits)
      longopt="${OPTARG%%=*}"
      [[ -z "${longval}" ]] && usage "USAGE" "option --${longopt} needs an argument"
      case "${longopt}" in
        bricks) BRICKS="${longval}" ;;
        critical) CRIT="${longval}" ;;
        # first character only, the same as the short option -u
        unit) UNIT="${longval:0:1}" ;;
        volume) VOLUME="${longval}" ;;
        warning) WARN="${longval}" ;;
      esac
      ;;
    U) usage "USAGE" ;;
    c) CRIT=${OPTARG} ;;
    h) usage "HELP" ;;
    n) BRICKS=${OPTARG} ;;
    u) UNIT="${OPTARG:0:1}" ;;
    v) VOLUME=${OPTARG} ;;
    w) WARN=${OPTARG} ;;
    :) usage "USAGE" "option -${OPTARG} needs an argument" ;;
    *) usage "USAGE" "unknown option -${OPTARG}" ;;
  esac
done

# validate what was collected
[[ -z "${VOLUME}" || -z "${BRICKS}" ]] && usage "USAGE" "please specify both a volume and the number of expected bricks"
# BRICKS is compared numerically against the bricks found later on
[[ "${BRICKS}" =~ ^[0-9]+$ ]] || usage "USAGE" "number of bricks must be a non-negative integer: ${BRICKS}"
[[ -n "${CRIT}" && -z "${WARN}" ]] && usage "USAGE" "please specify a warning threshold too"
[[ -z "${CRIT}" && -n "${WARN}" ]] && usage "USAGE" "please specify a critical threshold too"
# exactly one of the supported unit characters; a substring match against
# "BKMGTP%" would also accept "", "KM", "GT", ...
[[ "${UNIT}" == [BKMGTP%] ]] || usage "USAGE" "invalid output unit: ${UNIT}"
# with % as output unit a plain number cannot be converted to kilobytes,
# so both thresholds have to be percentages as well
if [[ "${UNIT}" == "%" && -n "${CRIT}" ]]; then
  if [[ "${CRIT}" != *%* || "${WARN}" != *%* ]]; then
    usage "USAGE" "thresholds must be percentages when the output unit is %"
  fi
fi
# every external program used below has to be available
for required in awk gluster pidof sudo; do
  type -p "${required}" >/dev/null || Exit UNKNOWN "${required} not found"
done

# the management daemon (glusterd) has to be running
pidof glusterd >/dev/null 2>&1 || Exit CRITICAL "glusterd management daemon not running"

# the brick daemon (glusterfsd) has to be running
pidof glusterfsd >/dev/null 2>&1 || Exit CRITICAL "glusterfsd brick daemon not running"
# get volume heal status: add up every "Number of entries: N" line that
# "gluster volume heal <volume> info" reports
heal=0
while read -r entries; do
  # Skip anything that is not a plain number instead of letting a numeric
  # test fail on it.
  # NOTE(review): gluster appears to print "-" here for a brick it cannot
  # query - confirm against the gluster version in use.
  [[ "${entries}" =~ ^[0-9]+$ ]] || continue
  # 10# forces base 10 so a leading zero is not read as octal
  heal=$(( heal + 10#${entries} ))
done < <(sudo gluster volume heal "${VOLUME}" info | awk '/^Number of entries: /{print $4}')
if [[ ${heal} -gt 0 ]]; then
  errors+=("${heal} unsynched entries")
fi
# get volume status
# Walks through "gluster volume status <volume> detail" and collects:
#   bricksfound - number of bricks reported as "Online : Y"
#   freeKB      - smallest "Disk Space Free" value seen, in KB
#   totalKB     - smallest "Total Disk Space" value seen, in KB
#   errors      - one "<brick> offline" entry per brick that is not online
bricksfound=0
brick=""
minfreeKB=""
mintotalKB=""

# Convert a size such as "35.2GB" into whole kilobytes on stdout.
# Returns non-zero when the text does not look like <number><unit>.
# The fractional part is converted too; cutting it off before scaling would
# turn 1.9TB into 1TB.
# NOTE(review): the accepted shape (digits, optional fraction, optional
# space, letters) is an assumption about gluster's output - confirm.
function sizeToKB
{
  local size_re='^([0-9]+(\.[0-9]+)?) ?([A-Za-z]+)$'
  [[ "${1}" =~ ${size_re} ]] || return 1
  # stdin is redirected so the helper can never consume the status output
  convertToKB "${BASH_REMATCH[3]}" "${BASH_REMATCH[1]}" </dev/null
}

while read -r line; do
  # split on whitespace; unlike an unquoted expansion, read does not glob
  read -ra field <<< "${line}"
  case "${field[0]}" in
    Brick)
      brick="${field[*]:2}"
      ;;
    Disk)
      if [[ "${field[*]:0:3}" == "Disk Space Free" ]]; then
        if free=$(sizeToKB "${field[*]:4}") && [[ "${free}" =~ ^[0-9]+$ ]]; then
          if [[ -z "${minfreeKB}" || ${free} -lt ${minfreeKB} ]]; then
            minfreeKB=${free}
          fi
        fi
      fi
      ;;
    Total)
      if [[ "${field[*]:0:3}" == "Total Disk Space" ]]; then
        if total=$(sizeToKB "${field[*]:4}") && [[ "${total}" =~ ^[0-9]+$ ]]; then
          if [[ -z "${mintotalKB}" || ${total} -lt ${mintotalKB} ]]; then
            mintotalKB=${total}
          fi
        fi
      fi
      ;;
    Online)
      if [[ "${field[*]:2}" == "Y" ]]; then
        bricksfound=$(( bricksfound + 1 ))
      else
        errors+=("${brick} offline")
      fi
      ;;
  esac
done < <(sudo gluster volume status "${VOLUME}" detail)

# when no size line could be read at all, keep the former placeholder value
freeKB=${minfreeKB:-99999999999}
totalKB=${mintotalKB:-99999999999}
# no online brick at all is fatal straight away
if [ "${bricksfound}" -eq 0 ]; then
  Exit CRITICAL "no bricks found"
fi

# fewer online bricks than expected is recorded as a warning-level fault
if [ "${bricksfound}" -lt "${BRICKS}" ]; then
  errors+=("found ${bricksfound} bricks, expected ${BRICKS}")
  ex_stat="WARNING_stat"
fi
# Thresholds are optional. Without them both values stay 0 and the sanity
# check below is skipped; running it on two zeroes would always end in
# "critical threshold above warning (0 >= 0)".
CRITVALUE=0
WARNVALUE=0
if [[ -n "${CRIT}" && -n "${WARN}" ]]; then
  # convert critical threshold to kilobytes
  if [[ "${CRIT}" == *"%"* ]]; then
    # critical threshold is in percentages
    CRITVALUE=$(convertToKB "%" "${CRIT%[%BKMGTP]*}" "${totalKB}")
  else
    # critical threshold is a regular (floating point) number in ${UNIT}
    CRITVALUE=$(convertToKB "${UNIT}" "${CRIT%[%BKMGTP]*}")
  fi
  # convert warning threshold to kilobytes
  if [[ "${WARN}" == *"%"* ]]; then
    # warning threshold is in percentages
    WARNVALUE=$(convertToKB "%" "${WARN%[%BKMGTP]*}" "${totalKB}")
  else
    # warning threshold is a regular (floating point) number in ${UNIT}
    WARNVALUE=$(convertToKB "${UNIT}" "${WARN%[%BKMGTP]*}")
  fi
  # verify that critical threshold is lower than warning threshold
  # (:-0 keeps the test well-formed should a conversion print nothing)
  if [[ ${CRITVALUE:-0} -ge ${WARNVALUE:-0} ]]; then
    Exit UNKNOWN "critical threshold above warning (${CRITVALUE} >= ${WARNVALUE})"
  fi
fi
# freeKB, totalKB, CRITVALUE and WARNVALUE are in kilobytes;
# convert them into the output unit for display
freespace=$(convertToUnit "${UNIT}" "${freeKB}" "${totalKB}")
totalspace=$(convertToUnit "${UNIT}" "${totalKB}" "${totalKB}")
crit=$(convertToUnit "${UNIT}" "${CRITVALUE}" "${totalKB}")
warn=$(convertToUnit "${UNIT}" "${WARNVALUE}" "${totalKB}")
# Build the performance data suffix and compare free space with the
# thresholds. The warning/critical fields stay empty unless both thresholds
# were supplied.
# NOTE(review): the "label:value,label:value" layout does not look like the
# 'label'=value[UOM];warn;crit;min;max form of the Nagios plugin guidelines -
# confirm what the consumers of this output expect before changing it.
limits=";"
if [[ -n "${CRIT}" && -n "${WARN}" ]]; then
  limits="${warn};${crit}"
fi
perfData=" | bricks:${bricksfound},freespace:${freespace}${UNIT};${limits};0;${totalspace}"

if [[ -n "${CRIT}" && -n "${WARN}" ]]; then
  # the critical check comes first so it takes precedence over the warning
  if [ "${freeKB}" -lt "${CRITVALUE}" ]; then
    errors+=("free space below critical threshold: ${freespace}${UNIT}")
    ex_stat="CRITICAL_stat"
  elif [ "${freeKB}" -lt "${WARNVALUE}" ]; then
    errors+=("free space below warning threshold: ${freespace}${UNIT}")
    ex_stat="WARNING_stat"
  fi
fi
# exit with warnings or errors: all collected faults are reported on one
# line, separated by "; "
if [[ ${#errors[@]} -gt 0 ]]; then
  msg="${errors[0]}"
  for fault in "${errors[@]:1}"; do
    msg+="; ${fault}"
  done
  flt="fault"
  [[ ${#errors[@]} -gt 1 ]] && flt="${flt}s"
  # ex_stat is only set by the brick-count and free-space checks; it is
  # unset when the only faults are unsynched entries or offline bricks, so
  # it has to be quoted. Everything that is not critical is a warning.
  if [[ "${ex_stat:-}" == "CRITICAL_stat" ]]; then
    Exit CRITICAL "${#errors[@]} ${flt}: ${msg}${perfData}"
  else
    Exit WARNING "${#errors[@]} ${flt}: ${msg}${perfData}"
  fi
fi
# exit with no errors
Exit OK "volume: ${VOLUME}, bricks: ${bricksfound}, free space: ${freespace}${UNIT}${perfData}"