Skip to content

Commit

Permalink
koordlet: supply rdma device
Browse files Browse the repository at this point in the history
Signed-off-by: wangjianyu.wjy <[email protected]>
  • Loading branch information
wangjianyu.wjy committed Dec 4, 2024
1 parent b36d230 commit d878706
Show file tree
Hide file tree
Showing 14 changed files with 456 additions and 25 deletions.
2 changes: 1 addition & 1 deletion docker/koordlet.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ RUN go build -a -o koordlet cmd/koordlet/main.go

FROM --platform=$TARGETPLATFORM nvidia/cuda:11.8.0-base-ubuntu22.04
WORKDIR /
RUN apt-get update && apt-get install -y lvm2 iptables && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y lvm2 iptables pciutils && rm -rf /var/lib/apt/lists/*
COPY --from=builder /go/src/github.com/koordinator-sh/koordinator/koordlet .
COPY --from=builder /usr/local/lib /usr/lib
ENTRYPOINT ["/koordlet"]
8 changes: 8 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/koordinator-sh/koordinator
go 1.20

require (
github.com/Mellanox/rdmamap v1.1.0
github.com/NVIDIA/go-nvml v0.11.6-0.0.20220823120812-7e2082095e82
github.com/cakturk/go-netstat v0.0.0-20200220111822-e5b49efee7a5
github.com/containerd/nri v0.6.1
Expand All @@ -19,6 +20,7 @@ require (
github.com/golang/protobuf v1.5.3
github.com/google/go-cmp v0.5.9
github.com/google/uuid v1.3.0
github.com/jaypipes/ghw v0.12.0
github.com/jedib0t/go-pretty/v6 v6.4.0
github.com/k8stopologyawareschedwg/noderesourcetopology-api v0.1.1
github.com/mohae/deepcopy v0.0.0-20170603005431-491d3605edfb
Expand Down Expand Up @@ -65,23 +67,29 @@ require (

require (
cloud.google.com/go/compute/metadata v0.2.3 // indirect
github.com/StackExchange/wmi v1.2.1 // indirect
github.com/antlr/antlr4/runtime/Go/antlr/v4 v4.0.0-20230305170008-8188dc5388df // indirect
github.com/cenkalti/backoff/v4 v4.2.1 // indirect
github.com/containerd/containerd v1.6.9 // indirect
github.com/containerd/ttrpc v1.2.3 // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/evanphx/json-patch/v5 v5.6.0 // indirect
github.com/ghodss/yaml v1.0.0 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-ole/go-ole v1.2.6 // indirect
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/google/cel-go v0.16.1 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/pprof v0.0.0-20220829040838-70bd9ae97f40 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.11.3 // indirect
github.com/jaypipes/pcidb v1.0.0 // indirect
github.com/mitchellh/go-homedir v1.1.0 // indirect
github.com/stoewer/go-strcase v1.2.0 // indirect
golang.org/x/exp v0.0.0-20220827204233-334a2380cb91 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20230525234035-dd9d682886f9 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20230731190214-cbb8c96f2d6d // indirect
howett.net/plist v1.0.0 // indirect
k8s.io/controller-manager v0.28.7 // indirect
k8s.io/dynamic-resource-allocation v0.28.7 // indirect
k8s.io/gengo v0.0.0-20220902162205-c0856e24416d // indirect
Expand Down
15 changes: 15 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,8 @@ github.com/JeffAshton/win_pdh v0.0.0-20161109143554-76bb4ee9f0ab h1:UKkYhof1njT1
github.com/JeffAshton/win_pdh v0.0.0-20161109143554-76bb4ee9f0ab/go.mod h1:3VYc5hodBMJ5+l/7J4xAyMeuM2PNuepvHlGs8yilUCA=
github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c/go.mod h1:X0CRv0ky0k6m906ixxpzmDRLvX58TFUKS2eePweuyxk=
github.com/MakeNowJust/heredoc v1.0.0/go.mod h1:mG5amYoWBHf8vpLOuehzbGGw0EHxpZZ6lCpQ4fNJ8LE=
github.com/Mellanox/rdmamap v1.1.0 h1:A/W1wAXw+6vm58f3VklrIylgV+eDJlPVIMaIKuxgUT4=
github.com/Mellanox/rdmamap v1.1.0/go.mod h1:fN+/V9lf10ABnDCwTaXRjeeWijLt2iVLETnK+sx/LY8=
github.com/Microsoft/go-winio v0.4.11/go.mod h1:VhR8bwka0BXejwEJY73c50VrPtXAaKcyvVC4A4RozmA=
github.com/Microsoft/go-winio v0.4.14/go.mod h1:qXqCSQ3Xa7+6tgxaGTIe4Kpcdsi+P8jBhyzoq1bpyYA=
github.com/Microsoft/go-winio v0.4.15-0.20190919025122-fc70bd9a86b5/go.mod h1:tTuCMEN+UleMWgg9dVx4Hu52b1bJo+59jBh3ajtinzw=
Expand Down Expand Up @@ -264,6 +266,8 @@ github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cq
github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c=
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d/go.mod h1:HI8ITrYtUY+O+ZhtlqUnD8+KwNPOyugEhfP9fdUIaEQ=
github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA=
github.com/StackExchange/wmi v1.2.1/go.mod h1:rcmrprowKIVzvc+NUiLncP2uuArMWLCbu9SBzvHz7e8=
github.com/a8m/tree v0.0.0-20210115125333-10a5fd5b637d/go.mod h1:FSdwKX97koS5efgm8WevNf7XS3PqtyFkKDDXrz778cg=
github.com/ajstarks/deck v0.0.0-20200831202436-30c9fc6549a9/go.mod h1:JynElWSGnm/4RlzPXRlREEwqTHAN3T56Bv2ITsFT3gY=
github.com/ajstarks/deck/generate v0.0.0-20210309230005-c3f852c02e19/go.mod h1:T13YZdzov6OU0A1+RfKZiZN9ca6VeKdBdyDV+BY97Tk=
Expand Down Expand Up @@ -586,6 +590,7 @@ github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbS
github.com/fullsailor/pkcs7 v0.0.0-20190404230743-d7302db945fa/go.mod h1:KnogPXtdwXqoenmZCw6S+25EAm2MkxbG0deNDu4cbSA=
github.com/fvbommel/sortorder v1.1.0/go.mod h1:uk88iVf1ovNn1iLfgUVU2F9o5eO30ui720w+kxuqRs0=
github.com/garyburd/redigo v0.0.0-20150301180006-535138d7bcd7/go.mod h1:NR3MbYisc3/PwhQ00EMzDiPmrwpPxAn5GI05/YaO1SY=
github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk=
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE=
github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI=
Expand Down Expand Up @@ -621,6 +626,9 @@ github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre
github.com/go-logr/zapr v1.2.3/go.mod h1:eIauM6P8qSvTw5o2ez6UEAfGjQKrxQTl5EoK+Qa2oG4=
github.com/go-logr/zapr v1.2.4 h1:QHVo+6stLbfJmYGkQ7uGHUCu5hnAFAj6mDe6Ea0SeOo=
github.com/go-logr/zapr v1.2.4/go.mod h1:FyHWQIzQORZ0QVE1BtVHv3cKtNLuXsbNLtpuhNapBOA=
github.com/go-ole/go-ole v1.2.5/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE=
github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs=
github.com/go-openapi/jsonreference v0.20.1/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k=
Expand Down Expand Up @@ -852,6 +860,10 @@ github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLf
github.com/ionos-cloud/sdk-go/v6 v6.1.3 h1:vb6yqdpiqaytvreM0bsn2pXw+1YDvEk2RKSmBAQvgDQ=
github.com/ishidawataru/sctp v0.0.0-20190723014705-7c296d48a2b5/go.mod h1:DM4VvS+hD/kDi1U1QsX2fnZowwBhqD0Dk3bRPKF/Oc8=
github.com/j-keck/arping v0.0.0-20160618110441-2cf9dc699c56/go.mod h1:ymszkNOg6tORTn+6F6j+Jc8TOr5osrynvN6ivFWZ2GA=
github.com/jaypipes/ghw v0.12.0 h1:xU2/MDJfWmBhJnujHY9qwXQLs3DBsf0/Xa9vECY0Tho=
github.com/jaypipes/ghw v0.12.0/go.mod h1:jeJGbkRB2lL3/gxYzNYzEDETV1ZJ56OKr+CSeSEym+g=
github.com/jaypipes/pcidb v1.0.0 h1:vtZIfkiCUE42oYbJS0TAq9XSfSmcsgo9IdxSm9qzYU8=
github.com/jaypipes/pcidb v1.0.0/go.mod h1:TnYUvqhPBzCKnH34KrIX22kAeEbDCSRJ9cqLRCuNDfk=
github.com/jedib0t/go-pretty/v6 v6.4.0 h1:YlI/2zYDrweA4MThiYMKtGRfT+2qZOO65ulej8GTcVI=
github.com/jedib0t/go-pretty/v6 v6.4.0/go.mod h1:MgmISkTWDSFu0xOqiZ0mKNntMQ2mDgOcwOkwBEkMDJI=
github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI=
Expand Down Expand Up @@ -1859,6 +1871,7 @@ gopkg.in/square/go-jose.v2 v2.6.0/go.mod h1:M9dMgbHiYLoDGQrXy7OpJDJWiKiU//h+vD76
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME=
gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI=
gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0/go.mod h1:WDnlLJ4WF5VGsH/HVa3CI79GS0ol3YnhVnKP89i0kNg=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
Expand All @@ -1883,6 +1896,8 @@ honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt
honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
honnef.co/go/tools v0.1.3/go.mod h1:NgwopIslSNH47DimFoV78dnkksY2EFtX0ajyb3K/las=
howett.net/plist v1.0.0 h1:7CrbWYbPPO/PyNy38b2EB/+gYbjCe2DXBxgtOOZbSQM=
howett.net/plist v1.0.0/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g=
k8s.io/api v0.28.7 h1:YKIhBxjXKaxuxWJnwohV0aGjRA5l4IU0Eywf/q19AVI=
k8s.io/api v0.28.7/go.mod h1:y4RbcjCCMff1930SG/TcP3AUKNfaJUgIeUp58e/2vyY=
k8s.io/apiextensions-apiserver v0.28.7 h1:NQlzP/vmvIO9Qt7wQTdMe9sGWGkozQZMPk9suehAvR8=
Expand Down
7 changes: 7 additions & 0 deletions pkg/features/koordlet_features.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ const (
// Accelerators enables GPU related feature in koordlet. Only Nvidia GPUs supported.
Accelerators featuregate.Feature = "Accelerators"

// owner: @ZiMengSheng
// alpha: v0.6
//
// RDMADevices enables RDMA related features in koordlet.
RDMADevices featuregate.Feature = "RDMADevices"

// owner: @songtao98 @zwzhang0107
// alpha: v1.0
//
Expand Down Expand Up @@ -164,6 +170,7 @@ var (
CgroupReconcile: {Default: false, PreRelease: featuregate.Alpha},
NodeTopologyReport: {Default: true, PreRelease: featuregate.Beta},
Accelerators: {Default: false, PreRelease: featuregate.Alpha},
RDMADevices: {Default: false, PreRelease: featuregate.Alpha},
CPICollector: {Default: false, PreRelease: featuregate.Alpha},
Libpfm4: {Default: false, PreRelease: featuregate.Alpha},
PSICollector: {Default: false, PreRelease: featuregate.Alpha},
Expand Down
11 changes: 10 additions & 1 deletion pkg/koordlet/metricsadvisor/devices/gpu/collector_gpu_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"errors"
"fmt"
"sort"
"strings"
"sync"
"time"

Expand All @@ -30,6 +31,7 @@ import (

"github.com/koordinator-sh/koordinator/pkg/features"
"github.com/koordinator-sh/koordinator/pkg/koordlet/metriccache"
"github.com/koordinator-sh/koordinator/pkg/koordlet/metricsadvisor/devices/helper"
"github.com/koordinator-sh/koordinator/pkg/koordlet/util"
)

Expand Down Expand Up @@ -121,7 +123,14 @@ func (g *gpuDeviceManager) initGPUData() error {
if ret != nvml.SUCCESS {
return fmt.Errorf("unable to get pci info: %v", nvml.ErrorString(ret))
}
nodeID, pcie, busID, err := parseGPUPCIInfo(pciInfo.BusIdLegacy)
busIDBuilder := &strings.Builder{}
for _, v := range pciInfo.BusIdLegacy {
if v != 0 {
busIDBuilder.WriteByte(byte(v))
}
}
busID := strings.ToLower(busIDBuilder.String())
nodeID, pcie, busID, err := helper.ParsePCIInfo(busID)
if err != nil {
return err
}
Expand Down
102 changes: 102 additions & 0 deletions pkg/koordlet/metricsadvisor/devices/helper/sriov.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
Copyright 2022 The Koordinator Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package helper

import (
"bytes"
"fmt"
"os"
"path/filepath"
"sort"
"strconv"
"strings"

"github.com/koordinator-sh/koordinator/pkg/koordlet/util/system"
)

const (
configuredVfFile = "sriov_numvfs"
)

// SriovConfigured reports whether the PCI device at addr has SR-IOV
// configured, i.e. whether GetVConfigured returns a VF count greater than
// zero for it.
func SriovConfigured(addr string) bool {
	configured := GetVConfigured(addr)
	return configured > 0
}

// extractNumber returns the VF index N encoded in a "virtfnN" directory
// entry path s (for example ".../0000:01:00.0/virtfn12" yields 12).
//
// The index is parsed from the last path element of s, so the result does not
// depend on pfDir and s being spelled identically (trailing slash, redundant
// separators). pfDir is kept for signature compatibility with existing
// callers. If the last element does not carry a decimal index, 0 is returned.
func extractNumber(pfDir string, s string) int {
	entry := filepath.Base(s)
	num, err := strconv.Atoi(strings.TrimPrefix(entry, "virtfn"))
	if err != nil {
		// Not a "virtfnN" entry: fall back to 0, as callers only use the
		// value as a sort key.
		return 0
	}
	return num
}

// GetVFList returns the PCI addresses of all virtual functions (VFs)
// discovered for the physical function pf, ordered by VF index.
//
// VFs are found through the "virtfn*" entries of the PF directory under
// system.GetPCIDeviceDir(). Each returned address is the base name of the
// resolved symlink target; entries that are not symlinks or cannot be
// resolved are skipped.
//
// An error is returned when the PF directory cannot be stat'ed or the glob
// fails; the returned list is then empty. The underlying error is wrapped so
// callers can inspect it with errors.Is / errors.As.
func GetVFList(pf string) (vfList []string, err error) {
	vfList = make([]string, 0)
	pfDir := filepath.Join(system.GetPCIDeviceDir(), pf)
	if _, statErr := os.Lstat(pfDir); statErr != nil {
		return vfList, fmt.Errorf("error. Could not get PF directory information for device: %s, Err: %w", pf, statErr)
	}

	vfDirs, globErr := filepath.Glob(filepath.Join(pfDir, "virtfn*"))
	if globErr != nil {
		return vfList, fmt.Errorf("error reading VF directories %w", globErr)
	}

	// Order by numeric VF index; a plain lexical ordering would place
	// virtfn10 before virtfn2.
	sort.Slice(vfDirs, func(i, j int) bool {
		return extractNumber(pfDir, vfDirs[i]) < extractNumber(pfDir, vfDirs[j])
	})

	// Resolve every virtfn symlink and record the PCI address it points to.
	for _, dir := range vfDirs {
		info, lstatErr := os.Lstat(dir)
		if lstatErr != nil || info.Mode()&os.ModeSymlink == 0 {
			continue
		}
		target, linkErr := filepath.EvalSymlinks(dir)
		if linkErr != nil {
			continue
		}
		vfList = append(vfList, filepath.Base(target))
	}
	return vfList, nil
}

// GetVConfigured returns the number of VFs configured for the physical
// function pf, as read from the "sriov_numvfs" file in the PF's directory
// under system.GetPCIDeviceDir(). It returns 0 when the file cannot be read
// or its trimmed content is not a decimal integer.
func GetVConfigured(pf string) int {
	raw, readErr := os.ReadFile(filepath.Join(system.GetPCIDeviceDir(), pf, configuredVfFile))
	if readErr != nil {
		return 0
	}
	// The file content is surrounded by whitespace (e.g. a trailing newline)
	// that must be stripped before parsing.
	if count, convErr := strconv.Atoi(string(bytes.TrimSpace(raw))); convErr == nil {
		return count
	}
	return 0
}

// IsSriovVF reports whether the PCI device at pciAddr is an SR-IOV VF, which
// is determined by the presence of a "physfn" entry in the device's directory
// under system.GetPCIDeviceDir(). Any os.Stat failure is treated as "not a VF".
func IsSriovVF(pciAddr string) bool {
	physfnPath := filepath.Join(system.GetPCIDeviceDir(), pciAddr, "physfn")
	_, statErr := os.Stat(physfnPath)
	return statErr == nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

package gpu
package helper

import (
"bytes"
Expand All @@ -23,7 +23,6 @@ import (
"path/filepath"
"regexp"
"strconv"
"strings"

"github.com/koordinator-sh/koordinator/pkg/koordlet/util/system"
)
Expand All @@ -32,14 +31,7 @@ var (
pcieRegexp = regexp.MustCompile(`pci\d{4}:[0-9a-fA-F]{2}`)
)

func parseGPUPCIInfo(busIdLegacy [16]int8) (int32, string, string, error) {
busIDBuilder := &strings.Builder{}
for _, v := range busIdLegacy {
if v != 0 {
busIDBuilder.WriteByte(byte(v))
}
}
busID := strings.ToLower(busIDBuilder.String())
func ParsePCIInfo(busID string) (int32, string, string, error) {
nodeID, err := getNUMANodeID(busID)
if err != nil {
return 0, "", "", fmt.Errorf("failed to parse NUMA Node ID, err: %w", err)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

package gpu
package helper

import (
"fmt"
Expand Down Expand Up @@ -74,7 +74,7 @@ func Test_parseGPUPCIInfo(t *testing.T) {
for i, v := range tt.busID {
busIdLegacy[i] = int8(v)
}
nodeID, pcie, busID, err := parseGPUPCIInfo(busIdLegacy)
nodeID, pcie, busID, err := ParsePCIInfo(tt.busID)
if (err != nil) && !tt.wantErr {
t.Errorf("expect wantErr=%v but got err=%v", tt.wantErr, err)
return
Expand Down
Loading

0 comments on commit d878706

Please sign in to comment.