@@ -18,17 +18,51 @@ package metrics
18
18
19
19
import (
20
20
"context"
21
+ "fmt"
21
22
22
23
directpvtypes "github.com/minio/directpv/pkg/apis/directpv.min.io/types"
23
24
"github.com/minio/directpv/pkg/client"
24
25
"github.com/minio/directpv/pkg/consts"
26
+ "github.com/minio/directpv/pkg/device"
25
27
"github.com/minio/directpv/pkg/sys"
26
28
"github.com/minio/directpv/pkg/types"
29
+ "github.com/minio/directpv/pkg/utils"
27
30
"github.com/minio/directpv/pkg/xfs"
28
31
"github.com/prometheus/client_golang/prometheus"
29
32
"k8s.io/klog/v2"
30
33
)
31
34
35
// defaultSectorSize is the number of bytes per sector used to convert the
// sector counts reported in a drive's stat into byte counts.
// NOTE(review): the kernel block stat documentation describes these sectors
// as fixed 512-byte units, independent of the device block size — confirm.
const defaultSectorSize = 512

// driveStats holds the subset of a drive's block-layer statistics that the
// collector publishes. Byte counts are already scaled by defaultSectorSize;
// the tick and queue values are kept as read from the stat and are divided
// by 1000 when published as seconds.
type driveStats struct {
	// Read side: bytes read and ticks spent on reads.
	readBytes, readTicks float64
	// Write side: bytes written and ticks spent on writes.
	writeBytes, writeTicks float64
	// Time requests spent in the queue.
	timeInQueue float64
}
44
+
45
+ func getDriveStats (driveName string ) (* driveStats , error ) {
46
+ stat , err := device .GetStat (driveName )
47
+ switch {
48
+ case err != nil :
49
+ return nil , err
50
+ case len (stat ) == 0 :
51
+ return nil , fmt .Errorf ("unable to read stat from drive %v" , driveName )
52
+ case len (stat ) < 10 :
53
+ return nil , fmt .Errorf ("invalid stat format from drive %v" , driveName )
54
+ }
55
+
56
+ // Refer https://www.kernel.org/doc/Documentation/block/stat.txt for meaning of each field.
57
+ return & driveStats {
58
+ readBytes : float64 (stat [2 ] * defaultSectorSize ),
59
+ readTicks : float64 (stat [3 ]),
60
+ writeBytes : float64 (stat [6 ] * defaultSectorSize ),
61
+ writeTicks : float64 (stat [7 ]),
62
+ timeInQueue : float64 (stat [10 ]),
63
+ }, nil
64
+ }
65
+
32
66
type metricsCollector struct {
33
67
nodeID directpvtypes.NodeID
34
68
desc * prometheus.Desc
@@ -95,21 +129,124 @@ func (c *metricsCollector) publishVolumeStats(ctx context.Context, volume *types
95
129
)
96
130
}
97
131
132
+ func (c * metricsCollector ) publishDriveStats (drive * types.Drive , ch chan <- prometheus.Metric ) {
133
+ deviceID , err := c .getDeviceByFSUUID (drive .Status .FSUUID )
134
+ if err != nil {
135
+ klog .ErrorS (
136
+ err ,
137
+ "unable to find device by FSUUID; " +
138
+ "either device is removed or run command " +
139
+ "`sudo udevadm control --reload-rules && sudo udevadm trigger`" +
140
+ " on the host to reload" ,
141
+ "FSUUID" , drive .Status .FSUUID )
142
+ client .Eventf (
143
+ drive , client .EventTypeWarning , client .EventReasonMetrics ,
144
+ "unable to find device by FSUUID %v; " +
145
+ "either device is removed or run command " +
146
+ "`sudo udevadm control --reload-rules && sudo udevadm trigger`" +
147
+ " on the host to reload" , drive .Status .FSUUID )
148
+
149
+ return
150
+ }
151
+ deviceName := utils .TrimDevPrefix (deviceID )
152
+
153
+ status := float64 (1 ) // Online
154
+ driveStat , err := getDriveStats (deviceName )
155
+ if err != nil {
156
+ klog .ErrorS (err , "unable to read drive statistics" )
157
+ status = float64 (0 ) // Offline
158
+ }
159
+
160
+ // Metrics
161
+ ch <- prometheus .MustNewConstMetric (
162
+ prometheus .NewDesc (
163
+ prometheus .BuildFQName (consts .AppName , "stats" , "drive_ready" ),
164
+ "Drive Online/Offline Status" ,
165
+ []string {"drive" }, nil ),
166
+ prometheus .GaugeValue ,
167
+ status , drive .Name ,
168
+ )
169
+
170
+ if driveStat == nil {
171
+ return
172
+ }
173
+
174
+ ch <- prometheus .MustNewConstMetric (
175
+ prometheus .NewDesc (
176
+ prometheus .BuildFQName (consts .AppName , "stats" , "drive_total_read_bytes" ),
177
+ "Total number of bytes read from the drive" ,
178
+ []string {"drive" }, nil ),
179
+ prometheus .CounterValue ,
180
+ driveStat .readBytes , drive .Name ,
181
+ )
182
+
183
+ ch <- prometheus .MustNewConstMetric (
184
+ prometheus .NewDesc (
185
+ prometheus .BuildFQName (consts .AppName , "stats" , "drive_total_write_bytes" ),
186
+ "Total number of bytes written to the drive" ,
187
+ []string {"drive" }, nil ),
188
+ prometheus .CounterValue ,
189
+ driveStat .writeBytes , drive .Name ,
190
+ )
191
+
192
+ // Drive Read/Write Latency
193
+ ch <- prometheus .MustNewConstMetric (
194
+ prometheus .NewDesc (
195
+ prometheus .BuildFQName (consts .AppName , "stats" , "drive_read_latency_seconds" ),
196
+ "Drive Read Latency" ,
197
+ []string {"drive" }, nil ),
198
+ prometheus .GaugeValue ,
199
+ driveStat .readTicks / 1000 , drive .Name ,
200
+ )
201
+
202
+ ch <- prometheus .MustNewConstMetric (
203
+ prometheus .NewDesc (
204
+ prometheus .BuildFQName (consts .AppName , "stats" , "drive_write_latency_seconds" ),
205
+ "Drive Write Latency" ,
206
+ []string {"drive" }, nil ),
207
+ prometheus .GaugeValue ,
208
+ driveStat .writeTicks / 1000 , drive .Name ,
209
+ )
210
+
211
+ // Wait Time
212
+ ch <- prometheus .MustNewConstMetric (
213
+ prometheus .NewDesc (
214
+ prometheus .BuildFQName (consts .AppName , "stats" , "drive_wait_time_seconds" ),
215
+ "Drive Wait Time" ,
216
+ []string {"drive" }, nil ),
217
+ prometheus .GaugeValue ,
218
+ driveStat .timeInQueue / 1000 , drive .Name ,
219
+ )
220
+ }
221
+
98
222
// Collect is called by Prometheus registry when collecting metrics.
99
223
func (c * metricsCollector ) Collect (ch chan <- prometheus.Metric ) {
100
224
ctx , cancelFunc := context .WithCancel (context .Background ())
101
225
defer cancelFunc ()
102
226
103
- resultCh := client .NewVolumeLister ().
227
+ // Collecting volume statistics
228
+ volumeResultCh := client .NewVolumeLister ().
104
229
NodeSelector ([]directpvtypes.LabelValue {directpvtypes .ToLabelValue (string (c .nodeID ))}).
105
230
List (ctx )
106
- for result := range resultCh {
231
+ for result := range volumeResultCh {
107
232
if result .Err != nil {
108
- return
233
+ break
109
234
}
110
235
111
236
if result .Volume .Status .TargetPath != "" {
112
237
c .publishVolumeStats (ctx , & result .Volume , ch )
113
238
}
114
239
}
240
+
241
+ // Collecting drive statistics
242
+ driveResultCh := client .NewDriveLister ().
243
+ NodeSelector ([]directpvtypes.LabelValue {directpvtypes .ToLabelValue (string (c .nodeID ))}).
244
+ List (ctx )
245
+ for result := range driveResultCh {
246
+ if result .Err != nil {
247
+ break
248
+ }
249
+
250
+ c .publishDriveStats (& result .Drive , ch )
251
+ }
115
252
}
0 commit comments