Skip to content

Commit a9b4909

Browse files
committed
Add drive I/O metrics for Prometheus
The following metrics are exported: * directpv_stats_drive_ready * directpv_stats_drive_total_read_bytes * directpv_stats_drive_total_write_bytes * directpv_stats_drive_read_latency_seconds * directpv_stats_drive_write_latency_seconds * directpv_stats_drive_wait_time_seconds Fixes #839 Signed-off-by: Bala.FA <[email protected]>
1 parent 557e925 commit a9b4909

File tree

3 files changed

+165
-4
lines changed

3 files changed

+165
-4
lines changed

docs/monitoring.md

+7-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,13 @@
33
DirectPV nodes export Prometheus compatible metrics data via port `10443`. The metrics data includes
44
* directpv_stats_bytes_used
55
* directpv_stats_bytes_total
6-
and categorized by labels `tenant`, `volumeID` and `node`.
6+
* directpv_stats_drive_ready
7+
* directpv_stats_drive_total_read_bytes
8+
* directpv_stats_drive_total_write_bytes
9+
* directpv_stats_drive_read_latency_seconds
10+
* directpv_stats_drive_write_latency_seconds
11+
* directpv_stats_drive_wait_time_seconds
12+
and categorized by labels `tenant`, `volumeID` and `node` for volume metrics, and by label `drive` for drive metrics.
713

814
To scrape data in Prometheus, each node must be accessible by port `10443`. A simple example is below
915

pkg/device/sysfs_linux.go

+18
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,21 @@ func getHolders(name string) ([]string, error) {
106106
func getDMName(name string) (string, error) {
107107
return readFirstLine("/sys/class/block/" + name + "/dm/name")
108108
}
109+
110+
// GetStat returns statistics for a given device name.
111+
func GetStat(name string) (stats []uint64, err error) {
112+
line, err := readFirstLine("/sys/class/block/" + name + "/stat")
113+
if err != nil {
114+
return nil, err
115+
}
116+
117+
for _, token := range strings.Fields(line) {
118+
ui64, err := strconv.ParseUint(token, 10, 64)
119+
if err != nil {
120+
return nil, err
121+
}
122+
stats = append(stats, ui64)
123+
}
124+
125+
return stats, nil
126+
}

pkg/metrics/collector.go

+140-3
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,51 @@ package metrics
1818

1919
import (
2020
"context"
21+
"fmt"
2122

2223
directpvtypes "github.com/minio/directpv/pkg/apis/directpv.min.io/types"
2324
"github.com/minio/directpv/pkg/client"
2425
"github.com/minio/directpv/pkg/consts"
26+
"github.com/minio/directpv/pkg/device"
2527
"github.com/minio/directpv/pkg/sys"
2628
"github.com/minio/directpv/pkg/types"
29+
"github.com/minio/directpv/pkg/utils"
2730
"github.com/minio/directpv/pkg/xfs"
2831
"github.com/prometheus/client_golang/prometheus"
2932
"k8s.io/klog/v2"
3033
)
3134

35+
// defaultSectorSize is the size, in bytes, by which the sector counters read
// from a drive's stat data are multiplied to obtain byte counts.
const defaultSectorSize = 512

// driveStats carries the per-drive I/O counters that are published as
// Prometheus metrics.
type driveStats struct {
	// Bytes read, and the raw read tick counter (divided by 1000 when
	// published as seconds).
	readBytes, readTicks float64
	// Bytes written, and the raw write tick counter (divided by 1000 when
	// published as seconds).
	writeBytes, writeTicks float64
	// Raw time-in-queue counter (divided by 1000 when published as seconds).
	timeInQueue float64
}
44+
45+
func getDriveStats(driveName string) (*driveStats, error) {
46+
stat, err := device.GetStat(driveName)
47+
switch {
48+
case err != nil:
49+
return nil, err
50+
case len(stat) == 0:
51+
return nil, fmt.Errorf("unable to read stat from drive %v", driveName)
52+
case len(stat) < 10:
53+
return nil, fmt.Errorf("invalid stat format from drive %v", driveName)
54+
}
55+
56+
// Refer https://www.kernel.org/doc/Documentation/block/stat.txt for meaning of each field.
57+
return &driveStats{
58+
readBytes: float64(stat[2] * defaultSectorSize),
59+
readTicks: float64(stat[3]),
60+
writeBytes: float64(stat[6] * defaultSectorSize),
61+
writeTicks: float64(stat[7]),
62+
timeInQueue: float64(stat[10]),
63+
}, nil
64+
}
65+
3266
type metricsCollector struct {
3367
nodeID directpvtypes.NodeID
3468
desc *prometheus.Desc
@@ -95,21 +129,124 @@ func (c *metricsCollector) publishVolumeStats(ctx context.Context, volume *types
95129
)
96130
}
97131

132+
func (c *metricsCollector) publishDriveStats(drive *types.Drive, ch chan<- prometheus.Metric) {
133+
deviceID, err := c.getDeviceByFSUUID(drive.Status.FSUUID)
134+
if err != nil {
135+
klog.ErrorS(
136+
err,
137+
"unable to find device by FSUUID; "+
138+
"either device is removed or run command "+
139+
"`sudo udevadm control --reload-rules && sudo udevadm trigger`"+
140+
" on the host to reload",
141+
"FSUUID", drive.Status.FSUUID)
142+
client.Eventf(
143+
drive, client.EventTypeWarning, client.EventReasonMetrics,
144+
"unable to find device by FSUUID %v; "+
145+
"either device is removed or run command "+
146+
"`sudo udevadm control --reload-rules && sudo udevadm trigger`"+
147+
" on the host to reload", drive.Status.FSUUID)
148+
149+
return
150+
}
151+
deviceName := utils.TrimDevPrefix(deviceID)
152+
153+
status := float64(1) // Online
154+
driveStat, err := getDriveStats(deviceName)
155+
if err != nil {
156+
klog.ErrorS(err, "unable to read drive statistics")
157+
status = float64(0) // Offline
158+
}
159+
160+
// Metrics
161+
ch <- prometheus.MustNewConstMetric(
162+
prometheus.NewDesc(
163+
prometheus.BuildFQName(consts.AppName, "stats", "drive_ready"),
164+
"Drive Online/Offline Status",
165+
[]string{"drive"}, nil),
166+
prometheus.GaugeValue,
167+
status, drive.Name,
168+
)
169+
170+
if driveStat == nil {
171+
return
172+
}
173+
174+
ch <- prometheus.MustNewConstMetric(
175+
prometheus.NewDesc(
176+
prometheus.BuildFQName(consts.AppName, "stats", "drive_total_read_bytes"),
177+
"Total number of bytes read from the drive",
178+
[]string{"drive"}, nil),
179+
prometheus.CounterValue,
180+
driveStat.readBytes, drive.Name,
181+
)
182+
183+
ch <- prometheus.MustNewConstMetric(
184+
prometheus.NewDesc(
185+
prometheus.BuildFQName(consts.AppName, "stats", "drive_total_write_bytes"),
186+
"Total number of bytes written to the drive",
187+
[]string{"drive"}, nil),
188+
prometheus.CounterValue,
189+
driveStat.writeBytes, drive.Name,
190+
)
191+
192+
// Drive Read/Write Latency
193+
ch <- prometheus.MustNewConstMetric(
194+
prometheus.NewDesc(
195+
prometheus.BuildFQName(consts.AppName, "stats", "drive_read_latency_seconds"),
196+
"Drive Read Latency",
197+
[]string{"drive"}, nil),
198+
prometheus.GaugeValue,
199+
driveStat.readTicks/1000, drive.Name,
200+
)
201+
202+
ch <- prometheus.MustNewConstMetric(
203+
prometheus.NewDesc(
204+
prometheus.BuildFQName(consts.AppName, "stats", "drive_write_latency_seconds"),
205+
"Drive Write Latency",
206+
[]string{"drive"}, nil),
207+
prometheus.GaugeValue,
208+
driveStat.writeTicks/1000, drive.Name,
209+
)
210+
211+
// Wait Time
212+
ch <- prometheus.MustNewConstMetric(
213+
prometheus.NewDesc(
214+
prometheus.BuildFQName(consts.AppName, "stats", "drive_wait_time_seconds"),
215+
"Drive Wait Time",
216+
[]string{"drive"}, nil),
217+
prometheus.GaugeValue,
218+
driveStat.timeInQueue/1000, drive.Name,
219+
)
220+
}
221+
98222
// Collect is called by Prometheus registry when collecting metrics.
99223
func (c *metricsCollector) Collect(ch chan<- prometheus.Metric) {
100224
ctx, cancelFunc := context.WithCancel(context.Background())
101225
defer cancelFunc()
102226

103-
resultCh := client.NewVolumeLister().
227+
// Collecting volume statistics
228+
volumeResultCh := client.NewVolumeLister().
104229
NodeSelector([]directpvtypes.LabelValue{directpvtypes.ToLabelValue(string(c.nodeID))}).
105230
List(ctx)
106-
for result := range resultCh {
231+
for result := range volumeResultCh {
107232
if result.Err != nil {
108-
return
233+
break
109234
}
110235

111236
if result.Volume.Status.TargetPath != "" {
112237
c.publishVolumeStats(ctx, &result.Volume, ch)
113238
}
114239
}
240+
241+
// Collecting drive statistics
242+
driveResultCh := client.NewDriveLister().
243+
NodeSelector([]directpvtypes.LabelValue{directpvtypes.ToLabelValue(string(c.nodeID))}).
244+
List(ctx)
245+
for result := range driveResultCh {
246+
if result.Err != nil {
247+
break
248+
}
249+
250+
c.publishDriveStats(&result.Drive, ch)
251+
}
115252
}

0 commit comments

Comments
 (0)