Skip to content

Commit

Permalink
gpu monitor
Browse files Browse the repository at this point in the history
  • Loading branch information
ferris-cx authored and [email protected] committed Dec 10, 2024
1 parent 3df2cd0 commit b8ca007
Showing 1 changed file with 79 additions and 0 deletions.
79 changes: 79 additions & 0 deletions cmd/koordlet/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,12 @@ package main

import (
	"context"
	"flag"
	"net"
	"net/http"
	_ "net/http/pprof"
	"os"
	"time"

	"google.golang.org/grpc"
	pb "k8s.io/kubelet/pkg/apis/podresources/v1alpha1"

"github.com/prometheus/client_golang/prometheus/promhttp"
Expand All @@ -36,6 +40,8 @@ import (
metricsutil "github.com/koordinator-sh/koordinator/pkg/util/metrics"
)

// ServerPodResourcesKubeletSocket is the filesystem path of the unix domain
// socket on which startGrpc listens and serves the pod-resources gRPC API.
const ServerPodResourcesKubeletSocket = "/pod-resources/koordlet.sock"

func main() {
cfg := config.NewConfiguration()
cfg.InitFlags(flag.CommandLine)
Expand Down Expand Up @@ -101,3 +107,76 @@ func installHTTPHandler() {
// http.HandleFunc("/healthz", d.HealthzHandler())
klog.Fatalf("Prometheus monitoring failed: %v", http.ListenAndServe(*options.ServerAddr, mux))
}

// PodResourcesServer is the gRPC service implementation that startGrpc
// registers via pb.RegisterPodResourcesListerServer. It carries no state; its
// List method answers from the package-level nodePodResources slice.
type PodResourcesServer struct{}

// nodePodResources is the fixed list of pod resource entries that
// PodResourcesServer.List returns as-is. Each entry names a pod, its
// namespace, and per container the device IDs assigned under a resource name.
//
// NOTE(review): the values are hard-coded literals and look like placeholder
// data rather than something discovered from the node — confirm before relying
// on them.
var nodePodResources = []*pb.PodResources{
// Pod "default/pod-1": one container holding two "nvidia.com/gpu" devices.
{
Name: "pod-1",
Namespace: "default",
Containers: []*pb.ContainerResources{
{
Name: "container-1",
Devices: []*pb.ContainerDevices{
{
ResourceName: "nvidia.com/gpu",
DeviceIds: []string{"GPU-32e51276-4ddd-5d40-b63a-7bf69ea08b2e", "GPU-6638b0e5-0708-3bef-74d1-5b75b85f6e75"},
},
},
},
},
},
// Pod "kube-system/pod-2": one container holding a single "nvidia.com/gpu" device.
{
Name: "pod-2",
Namespace: "kube-system",
Containers: []*pb.ContainerResources{
{
Name: "container-2",
Devices: []*pb.ContainerDevices{
{
ResourceName: "nvidia.com/gpu",
DeviceIds: []string{"GPU-68d4072a-f4b8-46e5-c76f-66ce3d02cc38"},
},
},
},
},
},
}

func (s *PodResourcesServer) List(ctx context.Context, req *pb.ListPodResourcesRequest) (*pb.ListPodResourcesResponse, error) {

Check failure on line 146 in cmd/koordlet/main.go

View workflow job for this annotation

GitHub Actions / golangci-lint

undefined: context (typecheck)

Check failure on line 146 in cmd/koordlet/main.go

View workflow job for this annotation

GitHub Actions / unit-tests(Run Go build)

undefined: context

Check failure on line 146 in cmd/koordlet/main.go

View workflow job for this annotation

GitHub Actions / unit-tests(Run Go test)

undefined: context
//var pods []*pb.PodResources
klog.V(1).Infof("List(): start to list pod")

return &pb.ListPodResourcesResponse{PodResources: nodePodResources}, nil
}

// startGrpc serves the pod-resources gRPC API on the unix domain socket at
// ServerPodResourcesKubeletSocket and blocks until the gRPC server stops.
//
// Steps, in order: remove any stale socket file, listen on the socket, adjust
// the socket file permissions, register PodResourcesServer, and serve.
//
// It returns the first error hit while removing the stale socket, listening,
// setting permissions, or serving; it returns nil when Serve returns nil.
func startGrpc() error {
	// A socket file left behind by a process that exited without closing its
	// listener makes net.Listen fail with "address already in use", so clear
	// it first. A missing file is the normal case and is not an error.
	if err := os.Remove(ServerPodResourcesKubeletSocket); err != nil && !os.IsNotExist(err) {
		klog.Errorf("failed to remove stale socket %s: %v", ServerPodResourcesKubeletSocket, err)
		return err
	}

	lis, err := net.Listen("unix", ServerPodResourcesKubeletSocket)
	if err != nil {
		klog.Errorf("failed to listen: %v", err)
		return err
	}

	klog.V(1).Infof("setSocketPermissions...")
	if err := setSocketPermissions(ServerPodResourcesKubeletSocket); err != nil {
		klog.Errorf("failed to set socket permissions: %v", err)
		// The gRPC server never took ownership of the listener on this path,
		// so close it here to release the file descriptor and socket file.
		if closeErr := lis.Close(); closeErr != nil {
			klog.Errorf("failed to close listener: %v", closeErr)
		}
		return err
	}

	klog.V(1).Infof("NewServer...")
	server := grpc.NewServer()
	klog.V(1).Infof("RegisterPodResourcesListerServer...")
	pb.RegisterPodResourcesListerServer(server, &PodResourcesServer{})

	klog.V(1).Infof("Starting gRPC server on %s", ServerPodResourcesKubeletSocket)
	// Serve blocks for the lifetime of the server.
	if err := server.Serve(lis); err != nil {
		klog.Errorf("failed to serve: %v", err)
		return err
	}
	return nil
}

// setSocketPermissions changes the mode of the file at socketPath to 0660
// (read/write for owner and group, no access for others) and returns whatever
// error os.Chmod reports, or nil on success.
func setSocketPermissions(socketPath string) error {
	const socketMode os.FileMode = 0660

	err := os.Chmod(socketPath, socketMode)
	return err
}

0 comments on commit b8ca007

Please sign in to comment.