diff --git a/cmd/nvidia-cdi-hook/commands/commands.go b/cmd/nvidia-cdi-hook/commands/commands.go index a222acf22..3f80ba9be 100644 --- a/cmd/nvidia-cdi-hook/commands/commands.go +++ b/cmd/nvidia-cdi-hook/commands/commands.go @@ -21,6 +21,7 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/chmod" symlinks "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/create-symlinks" + "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/cudacompat" ldcache "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/update-ldcache" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" ) @@ -32,5 +33,6 @@ func New(logger logger.Interface) []*cli.Command { ldcache.NewCommand(logger), symlinks.NewCommand(logger), chmod.NewCommand(logger), + cudacompat.NewCommand(logger), } } diff --git a/cmd/nvidia-cdi-hook/cudacompat/container-root.go b/cmd/nvidia-cdi-hook/cudacompat/container-root.go new file mode 100644 index 000000000..8bb3b3c85 --- /dev/null +++ b/cmd/nvidia-cdi-hook/cudacompat/container-root.go @@ -0,0 +1,76 @@ +/** +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package cudacompat + +import ( + "os" + "path/filepath" + + "github.com/moby/sys/symlink" +) + +// A containerRoot represents the root filesystem of a container. +type containerRoot string + +// hasPath checks whether the specified path exists in the root. 
+func (r containerRoot) hasPath(path string) bool { + resolved, err := r.resolve(path) + if err != nil { + return false + } + if _, err := os.Stat(resolved); err != nil && os.IsNotExist(err) { + return false + } + return true +} + +// globFiles matches the specified pattern in the root. +// The files that match must be regular files. +func (r containerRoot) globFiles(pattern string) ([]string, error) { + patternPath, err := r.resolve(pattern) + if err != nil { + return nil, err + } + matches, err := filepath.Glob(patternPath) + if err != nil { + return nil, err + } + var files []string + for _, match := range matches { + info, err := os.Lstat(match) + if err != nil { + return nil, err + } + // Ignore symlinks. + if info.Mode()&os.ModeSymlink != 0 { + continue + } + // Ignore directories. + if info.IsDir() { + continue + } + files = append(files, match) + } + return files, nil +} + +// resolve returns the absolute path including root path. +// Symlinks are resolved, but are guaranteed to resolve in the root. +func (r containerRoot) resolve(path string) (string, error) { + absolute := filepath.Clean(filepath.Join(string(r), path)) + return symlink.FollowSymlinkInScope(absolute, string(r)) +} diff --git a/cmd/nvidia-cdi-hook/cudacompat/cudacompat.go b/cmd/nvidia-cdi-hook/cudacompat/cudacompat.go new file mode 100644 index 000000000..0cecd6c17 --- /dev/null +++ b/cmd/nvidia-cdi-hook/cudacompat/cudacompat.go @@ -0,0 +1,221 @@ +/** +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package cudacompat + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/urfave/cli/v2" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/oci" +) + +const ( + cudaCompatPath = "/usr/local/cuda/compat" + // cudaCompatLdsoconfdFilenamePattern specifies the pattern for the filename + // in ld.so.conf.d that includes a reference to the CUDA compat path. + // The 00-compat prefix is chosen to ensure that these libraries have a + // higher precedence than other libraries on the system. + cudaCompatLdsoconfdFilenamePattern = "00-compat-*.conf" +) + +type command struct { + logger logger.Interface +} + +type options struct { + hostDriverVersion string + containerSpec string +} + +// NewCommand constructs a cuda-compat command with the specified logger +func NewCommand(logger logger.Interface) *cli.Command { + c := command{ + logger: logger, + } + return c.build() +} + +// build the enable-cuda-compat command +func (m command) build() *cli.Command { + cfg := options{} + + // Create the 'enable-cuda-compat' command + c := cli.Command{ + Name: "enable-cuda-compat", + Usage: "This hook ensures that the folder containing the CUDA compat libraries is added to the ldconfig search path if required.", + Before: func(c *cli.Context) error { + return m.validateFlags(c, &cfg) + }, + Action: func(c *cli.Context) error { + return m.run(c, &cfg) + }, + } + + c.Flags = []cli.Flag{ + &cli.StringFlag{ + Name: "host-driver-version", + Usage: "Specify the host driver version. If the CUDA compat libraries detected in the container do not have a higher MAJOR version, the hook is a no-op.", + Destination: &cfg.hostDriverVersion, + }, + &cli.StringFlag{ + Name: "container-spec", + Hidden: true, + Category: "testing-only", + Usage: "Specify the path to the OCI container spec. 
If empty or '-' the spec will be read from STDIN", + Destination: &cfg.containerSpec, + }, + } + + return &c +} + +func (m command) validateFlags(_ *cli.Context, cfg *options) error { + return nil +} + +func (m command) run(_ *cli.Context, cfg *options) error { + if cfg.hostDriverVersion == "" { + return nil + } + + s, err := oci.LoadContainerState(cfg.containerSpec) + if err != nil { + return fmt.Errorf("failed to load container state: %w", err) + } + + containerRootDir, err := s.GetContainerRoot() + if err != nil { + return fmt.Errorf("failed to determine container root: %w", err) + } + + containerForwardCompatDir, err := m.getContainerForwardCompatDir(containerRoot(containerRootDir), cfg.hostDriverVersion) + if err != nil { + return fmt.Errorf("failed to get container forward compat directory: %w", err) + } + if containerForwardCompatDir == "" { + return nil + } + + return m.createLdsoconfdFile(containerRoot(containerRootDir), cudaCompatLdsoconfdFilenamePattern, containerForwardCompatDir) +} + +func (m command) getContainerForwardCompatDir(containerRoot containerRoot, hostDriverVersion string) (string, error) { + if hostDriverVersion == "" { + m.logger.Debugf("Host driver version not specified") + return "", nil + } + if !containerRoot.hasPath(cudaCompatPath) { + m.logger.Debugf("No CUDA forward compatibility libraries directory in container") + return "", nil + } + if !containerRoot.hasPath("/etc/ld.so.cache") { + m.logger.Debugf("The container does not have an LDCache") + return "", nil + } + + libs, err := containerRoot.globFiles(filepath.Join(cudaCompatPath, "libcuda.so.*.*")) + if err != nil { + m.logger.Warningf("Failed to find CUDA compat library: %v", err) + return "", nil + } + + if len(libs) == 0 { + m.logger.Debugf("No CUDA forward compatibility libraries in container") + return "", nil + } + + if len(libs) != 1 { + m.logger.Warningf("Unexpected number of CUDA compat libraries in container: %v", libs) + return "", nil + } + + compatDriverVersion := 
strings.TrimPrefix(filepath.Base(libs[0]), "libcuda.so.") + compatMajor, err := extractMajorVersion(compatDriverVersion) + if err != nil { + return "", fmt.Errorf("failed to extract major version from %q: %v", compatDriverVersion, err) + } + + driverMajor, err := extractMajorVersion(hostDriverVersion) + if err != nil { + return "", fmt.Errorf("failed to extract major version from %q: %v", hostDriverVersion, err) + } + + if driverMajor >= compatMajor { + m.logger.Debugf("Compat major version is not greater than the host driver major version (%v >= %v)", hostDriverVersion, compatDriverVersion) + return "", nil + } + + resolvedCompatDir := strings.TrimPrefix(filepath.Dir(libs[0]), string(containerRoot)) + return resolvedCompatDir, nil +} + +// createLdsoconfdFile creates a file at /etc/ld.so.conf.d/ in the specified root. +// The file is created at /etc/ld.so.conf.d/{{ .pattern }} using `CreateTemp` and +// contains the specified directories on each line. +func (m command) createLdsoconfdFile(in containerRoot, pattern string, dirs ...string) error { + if len(dirs) == 0 { + m.logger.Debugf("No directories to add to /etc/ld.so.conf") + return nil + } + + ldsoconfdDir, err := in.resolve("/etc/ld.so.conf.d") + if err != nil { + return err + } + if err := os.MkdirAll(ldsoconfdDir, 0755); err != nil { + return fmt.Errorf("failed to create ld.so.conf.d: %w", err) + } + + configFile, err := os.CreateTemp(ldsoconfdDir, pattern) + if err != nil { + return fmt.Errorf("failed to create config file: %w", err) + } + defer configFile.Close() + + m.logger.Debugf("Adding directories %v to %v", dirs, configFile.Name()) + + added := make(map[string]bool) + for _, dir := range dirs { + if added[dir] { + continue + } + _, err = configFile.WriteString(fmt.Sprintf("%s\n", dir)) + if err != nil { + return fmt.Errorf("failed to update config file: %w", err) + } + added[dir] = true + } + + // The created file needs to be world readable for the cases where the container is run as a non-root 
user. + if err := configFile.Chmod(0644); err != nil { + return fmt.Errorf("failed to chmod config file: %w", err) + } + + return nil +} + +// extractMajorVersion parses a version string and returns the major version as an int. +func extractMajorVersion(version string) (int, error) { + majorString := strings.SplitN(version, ".", 2)[0] + return strconv.Atoi(majorString) +} diff --git a/cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go b/cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go new file mode 100644 index 000000000..0422fe76c --- /dev/null +++ b/cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go @@ -0,0 +1,182 @@ +/* +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+*/ + +package cudacompat + +import ( + "os" + "path/filepath" + "strings" + "testing" + + testlog "github.com/sirupsen/logrus/hooks/test" + "github.com/stretchr/testify/require" +) + +func TestCompatLibs(t *testing.T) { + logger, _ := testlog.NewNullLogger() + + testCases := []struct { + description string + contents map[string]string + hostDriverVersion string + expectedContainerForwardCompatDir string + }{ + { + description: "empty root", + hostDriverVersion: "222.55.66", + }, + { + description: "compat lib is newer; no ldcache", + contents: map[string]string{ + "/usr/local/cuda/compat/libcuda.so.333.88.99": "", + }, + hostDriverVersion: "222.55.66", + }, + { + description: "compat lib is newer; ldcache", + contents: map[string]string{ + "/etc/ld.so.cache": "", + "/usr/local/cuda/compat/libcuda.so.333.88.99": "", + }, + hostDriverVersion: "222.55.66", + expectedContainerForwardCompatDir: "/usr/local/cuda/compat", + }, + { + description: "compat lib is older; ldcache", + contents: map[string]string{ + "/etc/ld.so.cache": "", + "/usr/local/cuda/compat/libcuda.so.111.88.99": "", + }, + hostDriverVersion: "222.55.66", + expectedContainerForwardCompatDir: "", + }, + { + description: "compat lib has same major version; ldcache", + contents: map[string]string{ + "/etc/ld.so.cache": "", + "/usr/local/cuda/compat/libcuda.so.222.88.99": "", + }, + hostDriverVersion: "222.55.66", + expectedContainerForwardCompatDir: "", + }, + { + description: "numeric comparison is used; ldcache", + contents: map[string]string{ + "/etc/ld.so.cache": "", + "/usr/local/cuda/compat/libcuda.so.222.88.99": "", + }, + hostDriverVersion: "99.55.66", + expectedContainerForwardCompatDir: "/usr/local/cuda/compat", + }, + { + description: "driver version empty; ldcache", + contents: map[string]string{ + "/etc/ld.so.cache": "", + "/usr/local/cuda/compat/libcuda.so.222.88.99": "", + }, + hostDriverVersion: "", + }, + { + description: "symlinks are followed", + contents: map[string]string{ + 
"/etc/ld.so.cache": "", + "/etc/alternatives/cuda/compat/libcuda.so.333.88.99": "", + "/usr/local/cuda": "symlink=/etc/alternatives/cuda", + }, + hostDriverVersion: "222.55.66", + expectedContainerForwardCompatDir: "/etc/alternatives/cuda/compat", + }, + { + description: "symlinks stay in container", + contents: map[string]string{ + "/etc/ld.so.cache": "", + "/compat/libcuda.so.333.88.99": "", + "/usr/local/cuda": "symlink=../../../../../../", + }, + hostDriverVersion: "222.55.66", + expectedContainerForwardCompatDir: "/compat", + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + containerRootDir := t.TempDir() + for name, contents := range tc.contents { + target := filepath.Join(containerRootDir, name) + require.NoError(t, os.MkdirAll(filepath.Dir(target), 0755)) + + if strings.HasPrefix(contents, "symlink=") { + require.NoError(t, os.Symlink(strings.TrimPrefix(contents, "symlink="), target)) + continue + } + + require.NoError(t, os.WriteFile(target, []byte(contents), 0600)) + } + + c := command{ + logger: logger, + } + containerForwardCompatDir, err := c.getContainerForwardCompatDir(containerRoot(containerRootDir), tc.hostDriverVersion) + require.NoError(t, err) + require.EqualValues(t, tc.expectedContainerForwardCompatDir, containerForwardCompatDir) + }) + } +} + +func TestUpdateLdconfig(t *testing.T) { + logger, _ := testlog.NewNullLogger() + testCases := []struct { + description string + folders []string + expectedContents string + }{ + { + description: "no folders; have no contents", + }, + { + description: "single folder is added", + folders: []string{"/usr/local/cuda/compat"}, + expectedContents: "/usr/local/cuda/compat\n", + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + containerRootDir := t.TempDir() + c := command{ + logger: logger, + } + err := c.createLdsoconfdFile(containerRoot(containerRootDir), cudaCompatLdsoconfdFilenamePattern, tc.folders...) 
+ require.NoError(t, err) + + matches, err := filepath.Glob(filepath.Join(containerRootDir, "/etc/ld.so.conf.d/00-compat-*.conf")) + require.NoError(t, err) + + if tc.expectedContents == "" { + require.Empty(t, matches) + return + } + + require.Len(t, matches, 1) + contents, err := os.ReadFile(matches[0]) + require.NoError(t, err) + + require.EqualValues(t, tc.expectedContents, string(contents)) + }) + } + +} diff --git a/cmd/nvidia-ctk-installer/container/toolkit/toolkit_test.go b/cmd/nvidia-ctk-installer/container/toolkit/toolkit_test.go index 855141ffb..dfc331209 100644 --- a/cmd/nvidia-ctk-installer/container/toolkit/toolkit_test.go +++ b/cmd/nvidia-ctk-installer/container/toolkit/toolkit_test.go @@ -80,6 +80,12 @@ containerEdits: - libcuda.so.1::/lib/x86_64-linux-gnu/libcuda.so hookName: createContainer path: {{ .toolkitRoot }}/nvidia-cdi-hook + - args: + - nvidia-cdi-hook + - enable-cuda-compat + - --host-driver-version=999.88.77 + hookName: createContainer + path: {{ .toolkitRoot }}/nvidia-cdi-hook - args: - nvidia-cdi-hook - update-ldcache diff --git a/cmd/nvidia-ctk/cdi/generate/generate.go b/cmd/nvidia-ctk/cdi/generate/generate.go index 598a40c1d..b187335b7 100644 --- a/cmd/nvidia-ctk/cdi/generate/generate.go +++ b/cmd/nvidia-ctk/cdi/generate/generate.go @@ -25,6 +25,8 @@ import ( "github.com/urfave/cli/v2" cdi "tags.cncf.io/container-device-interface/pkg/parser" + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv" @@ -60,6 +62,9 @@ type options struct { files cli.StringSlice ignorePatterns cli.StringSlice } + + // the following are used for dependency injection during spec generation. 
+ nvmllib nvml.Interface } // NewCommand constructs a generate-cdi command with the specified logger @@ -269,6 +274,8 @@ func (m command) generateSpec(opts *options) (spec.Interface, error) { nvcdi.WithLibrarySearchPaths(opts.librarySearchPaths.Value()), nvcdi.WithCSVFiles(opts.csv.files.Value()), nvcdi.WithCSVIgnorePatterns(opts.csv.ignorePatterns.Value()), + // We set the following to allow for dependency injection: + nvcdi.WithNvmlLib(opts.nvmllib), ) if err != nil { return nil, fmt.Errorf("failed to create CDI library: %v", err) diff --git a/cmd/nvidia-ctk/cdi/generate/generate_test.go b/cmd/nvidia-ctk/cdi/generate/generate_test.go new file mode 100644 index 000000000..57bd865a9 --- /dev/null +++ b/cmd/nvidia-ctk/cdi/generate/generate_test.go @@ -0,0 +1,157 @@ +/** +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package generate + +import ( + "bytes" + "path/filepath" + "strings" + "testing" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100" + testlog "github.com/sirupsen/logrus/hooks/test" + "github.com/stretchr/testify/require" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/test" +) + +func TestGenerateSpec(t *testing.T) { + t.Setenv("__NVCT_TESTING_DEVICES_ARE_FILES", "true") + moduleRoot, err := test.GetModuleRoot() + require.NoError(t, err) + + driverRoot := filepath.Join(moduleRoot, "testdata", "lookup", "rootfs-1") + + logger, _ := testlog.NewNullLogger() + testCases := []struct { + description string + options options + expectedValidateError error + expectedOptions options + expectedError error + expectedSpec string + }{ + { + description: "default", + options: options{ + format: "yaml", + mode: "nvml", + vendor: "example.com", + class: "device", + driverRoot: driverRoot, + }, + expectedOptions: options{ + format: "yaml", + mode: "nvml", + vendor: "example.com", + class: "device", + nvidiaCDIHookPath: "/usr/bin/nvidia-cdi-hook", + driverRoot: driverRoot, + }, + expectedSpec: `--- +cdiVersion: 0.5.0 +containerEdits: + deviceNodes: + - hostPath: {{ .driverRoot }}/dev/nvidiactl + path: /dev/nvidiactl + env: + - NVIDIA_VISIBLE_DEVICES=void + hooks: + - args: + - nvidia-cdi-hook + - create-symlinks + - --link + - libcuda.so.1::/lib/x86_64-linux-gnu/libcuda.so + hookName: createContainer + path: /usr/bin/nvidia-cdi-hook + - args: + - nvidia-cdi-hook + - enable-cuda-compat + - --host-driver-version=999.88.77 + hookName: createContainer + path: /usr/bin/nvidia-cdi-hook + - args: + - nvidia-cdi-hook + - update-ldcache + - --folder + - /lib/x86_64-linux-gnu + hookName: createContainer + path: /usr/bin/nvidia-cdi-hook + mounts: + - containerPath: /lib/x86_64-linux-gnu/libcuda.so.999.88.77 + hostPath: {{ .driverRoot }}/lib/x86_64-linux-gnu/libcuda.so.999.88.77 + options: + - ro + - nosuid + - nodev + - bind +devices: 
+- containerEdits: + deviceNodes: + - hostPath: {{ .driverRoot }}/dev/nvidia0 + path: /dev/nvidia0 + name: "0" +- containerEdits: + deviceNodes: + - hostPath: {{ .driverRoot }}/dev/nvidia0 + path: /dev/nvidia0 + name: all +kind: example.com/device +`, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + c := command{ + logger: logger, + } + + err := c.validateFlags(nil, &tc.options) + require.ErrorIs(t, err, tc.expectedValidateError) + require.EqualValues(t, tc.expectedOptions, tc.options) + + // Set up a mock server, reusing the DGX A100 mock. + server := dgxa100.New() + // Override the driver version to match the version in our mock filesystem. + server.SystemGetDriverVersionFunc = func() (string, nvml.Return) { + return "999.88.77", nvml.SUCCESS + } + // Set the device count to 1 explicitly since we only have a single device node. + server.DeviceGetCountFunc = func() (int, nvml.Return) { + return 1, nvml.SUCCESS + } + for _, d := range server.Devices { + // TODO: This is not implemented in the mock. + (d.(*dgxa100.Device)).GetMaxMigDeviceCountFunc = func() (int, nvml.Return) { + return 0, nvml.SUCCESS + } + } + tc.options.nvmllib = server + + spec, err := c.generateSpec(&tc.options) + require.ErrorIs(t, err, tc.expectedError) + + var buf bytes.Buffer + _, err = spec.WriteTo(&buf) + require.NoError(t, err) + + require.Equal(t, strings.ReplaceAll(tc.expectedSpec, "{{ .driverRoot }}", driverRoot), buf.String()) + }) + } +} diff --git a/internal/config/features.go b/internal/config/features.go index 0d5113988..a326e4ee2 100644 --- a/internal/config/features.go +++ b/internal/config/features.go @@ -25,6 +25,12 @@ type features struct { // If this feature flag is not set to 'true' only host-rooted config paths // (i.e. 
paths starting with an '@' are considered valid) AllowLDConfigFromContainer *feature `toml:"allow-ldconfig-from-container,omitempty"` + // DisableCUDACompatLibHook, when enabled skips the injection of a specific + // hook to process CUDA compatibility libraries. + // + // Note: Since this mechanism replaces the logic in the `nvidia-container-cli`, + // toggling this feature has no effect if `allow-cuda-compat-libs-from-container` is enabled. + DisableCUDACompatLibHook *feature `toml:"disable-cuda-compat-lib-hook,omitempty"` // DisableImexChannelCreation ensures that the implicit creation of // requested IMEX channels is skipped when invoking the nvidia-container-cli. DisableImexChannelCreation *feature `toml:"disable-imex-channel-creation,omitempty"` } diff --git a/internal/discover/compat_libs.go b/internal/discover/compat_libs.go new file mode 100644 index 000000000..027ca2ed2 --- /dev/null +++ b/internal/discover/compat_libs.go @@ -0,0 +1,24 @@ +package discover + +import ( + "strings" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root" +) + +// NewCUDACompatHookDiscoverer creates a discoverer for an enable-cuda-compat hook. +// This hook is responsible for setting up CUDA compatibility in the container and depends on the host driver version. 
+func NewCUDACompatHookDiscoverer(logger logger.Interface, nvidiaCDIHookPath string, driver *root.Driver) Discover { + _, cudaVersionPattern := getCUDALibRootAndVersionPattern(logger, driver) + var args []string + if !strings.Contains(cudaVersionPattern, "*") { + args = append(args, "--host-driver-version="+cudaVersionPattern) + } + + return CreateNvidiaCDIHook( + nvidiaCDIHookPath, + "enable-cuda-compat", + args..., + ) +} diff --git a/internal/modifier/gated.go b/internal/modifier/gated.go index 2c39d2074..8320286e9 100644 --- a/internal/modifier/gated.go +++ b/internal/modifier/gated.go @@ -23,6 +23,7 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root" "github.com/NVIDIA/nvidia-container-toolkit/internal/oci" ) @@ -35,7 +36,7 @@ import ( // NVIDIA_GDRCOPY=enabled // // If not devices are selected, no changes are made. 
-func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) { +func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver) (oci.SpecModifier, error) { if devices := image.VisibleDevicesFromEnvVar(); len(devices) == 0 { logger.Infof("No modification required; no devices requested") return nil, nil @@ -78,5 +79,24 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image discoverers = append(discoverers, d) } + if !cfg.Features.AllowCUDACompatLibsFromContainer.IsEnabled() && !cfg.Features.DisableCUDACompatLibHook.IsEnabled() { + compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver) + discoverers = append(discoverers, compatLibHookDiscoverer) + // For legacy mode, we also need to inject a hook to update the LDCache + // after we have modified the configuration. + if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" { + ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook( + logger, + discover.None{}, + cfg.NVIDIACTKConfig.Path, + "", + ) + if err != nil { + return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err) + } + discoverers = append(discoverers, ldcacheUpdateHookDiscoverer) + } + } + return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...)) } diff --git a/internal/runtime/runtime_factory.go b/internal/runtime/runtime_factory.go index 2b5cd9c6f..e88213dc3 100644 --- a/internal/runtime/runtime_factory.go +++ b/internal/runtime/runtime_factory.go @@ -75,6 +75,8 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp } mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode, image) + // We update the mode here so that we can continue passing just the config to other functions. 
+ cfg.NVIDIAContainerRuntimeConfig.Mode = mode modeModifier, err := newModeModifier(logger, mode, cfg, ociSpec, image) if err != nil { return nil, err @@ -94,7 +96,7 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp } modifiers = append(modifiers, graphicsModifier) case "feature-gated": - featureGatedModifier, err := modifier.NewFeatureGatedModifier(logger, cfg, image) + featureGatedModifier, err := modifier.NewFeatureGatedModifier(logger, cfg, image, driver) if err != nil { return nil, err } @@ -126,8 +128,8 @@ func supportedModifierTypes(mode string) []string { return []string{"nvidia-hook-remover", "mode"} case "csv": // For CSV mode we support mode and feature-gated modification. - return []string{"nvidia-hook-remover", "mode", "feature-gated"} + return []string{"nvidia-hook-remover", "feature-gated", "mode"} default: - return []string{"mode", "graphics", "feature-gated"} + return []string{"feature-gated", "graphics", "mode"} } } diff --git a/pkg/nvcdi/driver-nvml.go b/pkg/nvcdi/driver-nvml.go index b0006aebf..782d60fcd 100644 --- a/pkg/nvcdi/driver-nvml.go +++ b/pkg/nvcdi/driver-nvml.go @@ -97,6 +97,8 @@ func NewDriverLibraryDiscoverer(logger logger.Interface, driver *root.Driver, nv libraryPaths, ) + // TODO: The following should use the version directly. 
+ cudaCompatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, nvidiaCDIHookPath, driver) updateLDCache, _ := discover.NewLDCacheUpdateHook(logger, libraries, nvidiaCDIHookPath, ldconfigPath) d := discover.Merge( @@ -105,6 +107,7 @@ func NewDriverLibraryDiscoverer(logger logger.Interface, driver *root.Driver, nv version, nvidiaCDIHookPath, ), + cudaCompatLibHookDiscoverer, updateLDCache, ) diff --git a/tests/e2e/nvidia-container-toolkit_test.go b/tests/e2e/nvidia-container-toolkit_test.go index 1895aff03..5948014b6 100644 --- a/tests/e2e/nvidia-container-toolkit_test.go +++ b/tests/e2e/nvidia-container-toolkit_test.go @@ -18,13 +18,15 @@ package e2e import ( "context" + "path/filepath" + "strings" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) // Integration tests for Docker runtime -var _ = Describe("docker", Ordered, func() { +var _ = Describe("docker", Ordered, ContinueOnFailure, func() { var r Runner // Install the NVIDIA Container Toolkit @@ -166,4 +168,51 @@ var _ = Describe("docker", Ordered, func() { Expect(referenceOutput).To(Equal(out4)) }) }) + + Describe("CUDA Forward compatibility", Ordered, func() { + BeforeAll(func(ctx context.Context) { + _, _, err := r.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8") + Expect(err).ToNot(HaveOccurred()) + }) + + BeforeAll(func(ctx context.Context) { + compatOutput, _, err := r.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"") + Expect(err).ToNot(HaveOccurred()) + Expect(compatOutput).ToNot(BeEmpty()) + compatDriverVersion := strings.TrimPrefix(filepath.Base(compatOutput), "libcuda.so.") + compatMajor := strings.SplitN(compatDriverVersion, ".", 2)[0] + + driverOutput, _, err := r.Run("nvidia-smi -q | grep \"Driver Version\"") + Expect(err).ToNot(HaveOccurred()) + parts := strings.SplitN(driverOutput, ":", 2) + Expect(parts).To(HaveLen(2)) + + hostDriverVersion := strings.TrimSpace(parts[1]) 
+ Expect(hostDriverVersion).ToNot(BeEmpty()) + driverMajor := strings.SplitN(hostDriverVersion, ".", 2)[0] + + if driverMajor >= compatMajor { + GinkgoLogr.Info("CUDA Forward Compatibility tests require an older driver version", "hostDriverVersion", hostDriverVersion, "compatDriverVersion", compatDriverVersion) + Skip("CUDA Forward Compatibility tests require an older driver version") + } + }) + + It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) { + ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") + Expect(err).ToNot(HaveOccurred()) + Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat")) + }) + + It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) { + ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") + Expect(err).ToNot(HaveOccurred()) + Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat")) + }) + + It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) { + ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") + Expect(err).ToNot(HaveOccurred()) + Expect(ldconfigOut).To(ContainSubstring("/usr/lib64")) + }) + }) }) diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go new file mode 100644 index 000000000..7654dc7d0 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dgxa100 + +import ( + "fmt" + "sync" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock" + "github.com/google/uuid" +) + +type Server struct { + mock.Interface + mock.ExtendedInterface + Devices [8]nvml.Device + DriverVersion string + NvmlVersion string + CudaDriverVersion int +} +type Device struct { + mock.Device + sync.RWMutex + UUID string + Name string + Brand nvml.BrandType + Architecture nvml.DeviceArchitecture + PciBusID string + Minor int + Index int + CudaComputeCapability CudaComputeCapability + MigMode int + GpuInstances map[*GpuInstance]struct{} + GpuInstanceCounter uint32 + MemoryInfo nvml.Memory +} + +type GpuInstance struct { + mock.GpuInstance + sync.RWMutex + Info nvml.GpuInstanceInfo + ComputeInstances map[*ComputeInstance]struct{} + ComputeInstanceCounter uint32 +} + +type ComputeInstance struct { + mock.ComputeInstance + Info nvml.ComputeInstanceInfo +} + +type CudaComputeCapability struct { + Major int + Minor int +} + +var _ nvml.Interface = (*Server)(nil) +var _ nvml.Device = (*Device)(nil) +var _ nvml.GpuInstance = (*GpuInstance)(nil) +var _ nvml.ComputeInstance = (*ComputeInstance)(nil) + +func New() *Server { + server := &Server{ + Devices: [8]nvml.Device{ + NewDevice(0), + NewDevice(1), + NewDevice(2), + NewDevice(3), + NewDevice(4), + NewDevice(5), + NewDevice(6), + NewDevice(7), + }, + DriverVersion: "550.54.15", + NvmlVersion: 
"12.550.54.15", + CudaDriverVersion: 12040, + } + server.setMockFuncs() + return server +} + +func NewDevice(index int) *Device { + device := &Device{ + UUID: "GPU-" + uuid.New().String(), + Name: "Mock NVIDIA A100-SXM4-40GB", + Brand: nvml.BRAND_NVIDIA, + Architecture: nvml.DEVICE_ARCH_AMPERE, + PciBusID: fmt.Sprintf("0000:%02x:00.0", index), + Minor: index, + Index: index, + CudaComputeCapability: CudaComputeCapability{ + Major: 8, + Minor: 0, + }, + GpuInstances: make(map[*GpuInstance]struct{}), + GpuInstanceCounter: 0, + MemoryInfo: nvml.Memory{42949672960, 0, 0}, + } + device.setMockFuncs() + return device +} + +func NewGpuInstance(info nvml.GpuInstanceInfo) *GpuInstance { + gi := &GpuInstance{ + Info: info, + ComputeInstances: make(map[*ComputeInstance]struct{}), + ComputeInstanceCounter: 0, + } + gi.setMockFuncs() + return gi +} + +func NewComputeInstance(info nvml.ComputeInstanceInfo) *ComputeInstance { + ci := &ComputeInstance{ + Info: info, + } + ci.setMockFuncs() + return ci +} + +func (s *Server) setMockFuncs() { + s.ExtensionsFunc = func() nvml.ExtendedInterface { + return s + } + + s.LookupSymbolFunc = func(symbol string) error { + return nil + } + + s.InitFunc = func() nvml.Return { + return nvml.SUCCESS + } + + s.ShutdownFunc = func() nvml.Return { + return nvml.SUCCESS + } + + s.SystemGetDriverVersionFunc = func() (string, nvml.Return) { + return s.DriverVersion, nvml.SUCCESS + } + + s.SystemGetNVMLVersionFunc = func() (string, nvml.Return) { + return s.NvmlVersion, nvml.SUCCESS + } + + s.SystemGetCudaDriverVersionFunc = func() (int, nvml.Return) { + return s.CudaDriverVersion, nvml.SUCCESS + } + + s.DeviceGetCountFunc = func() (int, nvml.Return) { + return len(s.Devices), nvml.SUCCESS + } + + s.DeviceGetHandleByIndexFunc = func(index int) (nvml.Device, nvml.Return) { + if index < 0 || index >= len(s.Devices) { + return nil, nvml.ERROR_INVALID_ARGUMENT + } + return s.Devices[index], nvml.SUCCESS + } + + s.DeviceGetHandleByUUIDFunc = func(uuid 
string) (nvml.Device, nvml.Return) { + for _, d := range s.Devices { + if uuid == d.(*Device).UUID { + return d, nvml.SUCCESS + } + } + return nil, nvml.ERROR_INVALID_ARGUMENT + } + + s.DeviceGetHandleByPciBusIdFunc = func(busID string) (nvml.Device, nvml.Return) { + for _, d := range s.Devices { + if busID == d.(*Device).PciBusID { + return d, nvml.SUCCESS + } + } + return nil, nvml.ERROR_INVALID_ARGUMENT + } +} + +func (d *Device) setMockFuncs() { + d.GetMinorNumberFunc = func() (int, nvml.Return) { + return d.Minor, nvml.SUCCESS + } + + d.GetIndexFunc = func() (int, nvml.Return) { + return d.Index, nvml.SUCCESS + } + + d.GetCudaComputeCapabilityFunc = func() (int, int, nvml.Return) { + return d.CudaComputeCapability.Major, d.CudaComputeCapability.Minor, nvml.SUCCESS + } + + d.GetUUIDFunc = func() (string, nvml.Return) { + return d.UUID, nvml.SUCCESS + } + + d.GetNameFunc = func() (string, nvml.Return) { + return d.Name, nvml.SUCCESS + } + + d.GetBrandFunc = func() (nvml.BrandType, nvml.Return) { + return d.Brand, nvml.SUCCESS + } + + d.GetArchitectureFunc = func() (nvml.DeviceArchitecture, nvml.Return) { + return d.Architecture, nvml.SUCCESS + } + + d.GetMemoryInfoFunc = func() (nvml.Memory, nvml.Return) { + return d.MemoryInfo, nvml.SUCCESS + } + + d.GetPciInfoFunc = func() (nvml.PciInfo, nvml.Return) { + p := nvml.PciInfo{ + PciDeviceId: 0x20B010DE, + } + return p, nvml.SUCCESS + } + + d.SetMigModeFunc = func(mode int) (nvml.Return, nvml.Return) { + d.MigMode = mode + return nvml.SUCCESS, nvml.SUCCESS + } + + d.GetMigModeFunc = func() (int, int, nvml.Return) { + return d.MigMode, d.MigMode, nvml.SUCCESS + } + + d.GetGpuInstanceProfileInfoFunc = func(giProfileId int) (nvml.GpuInstanceProfileInfo, nvml.Return) { + if giProfileId < 0 || giProfileId >= nvml.GPU_INSTANCE_PROFILE_COUNT { + return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT + } + + if _, exists := MIGProfiles.GpuInstanceProfiles[giProfileId]; !exists { + return 
nvml.GpuInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED + } + + return MIGProfiles.GpuInstanceProfiles[giProfileId], nvml.SUCCESS + } + + d.GetGpuInstancePossiblePlacementsFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstancePlacement, nvml.Return) { + return MIGPlacements.GpuInstancePossiblePlacements[int(info.Id)], nvml.SUCCESS + } + + d.CreateGpuInstanceFunc = func(info *nvml.GpuInstanceProfileInfo) (nvml.GpuInstance, nvml.Return) { + d.Lock() + defer d.Unlock() + giInfo := nvml.GpuInstanceInfo{ + Device: d, + Id: d.GpuInstanceCounter, + ProfileId: info.Id, + } + d.GpuInstanceCounter++ + gi := NewGpuInstance(giInfo) + d.GpuInstances[gi] = struct{}{} + return gi, nvml.SUCCESS + } + + d.CreateGpuInstanceWithPlacementFunc = func(info *nvml.GpuInstanceProfileInfo, placement *nvml.GpuInstancePlacement) (nvml.GpuInstance, nvml.Return) { + d.Lock() + defer d.Unlock() + giInfo := nvml.GpuInstanceInfo{ + Device: d, + Id: d.GpuInstanceCounter, + ProfileId: info.Id, + Placement: *placement, + } + d.GpuInstanceCounter++ + gi := NewGpuInstance(giInfo) + d.GpuInstances[gi] = struct{}{} + return gi, nvml.SUCCESS + } + + d.GetGpuInstancesFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstance, nvml.Return) { + d.RLock() + defer d.RUnlock() + var gis []nvml.GpuInstance + for gi := range d.GpuInstances { + if gi.Info.ProfileId == info.Id { + gis = append(gis, gi) + } + } + return gis, nvml.SUCCESS + } +} + +func (gi *GpuInstance) setMockFuncs() { + gi.GetInfoFunc = func() (nvml.GpuInstanceInfo, nvml.Return) { + return gi.Info, nvml.SUCCESS + } + + gi.GetComputeInstanceProfileInfoFunc = func(ciProfileId int, ciEngProfileId int) (nvml.ComputeInstanceProfileInfo, nvml.Return) { + if ciProfileId < 0 || ciProfileId >= nvml.COMPUTE_INSTANCE_PROFILE_COUNT { + return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT + } + + if ciEngProfileId != nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED { + return nvml.ComputeInstanceProfileInfo{}, 
nvml.ERROR_NOT_SUPPORTED + } + + giProfileId := int(gi.Info.ProfileId) + + if _, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId]; !exists { + return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED + } + + if _, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId]; !exists { + return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED + } + + return MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId], nvml.SUCCESS + } + + gi.GetComputeInstancePossiblePlacementsFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstancePlacement, nvml.Return) { + return MIGPlacements.ComputeInstancePossiblePlacements[int(gi.Info.Id)][int(info.Id)], nvml.SUCCESS + } + + gi.CreateComputeInstanceFunc = func(info *nvml.ComputeInstanceProfileInfo) (nvml.ComputeInstance, nvml.Return) { + gi.Lock() + defer gi.Unlock() + ciInfo := nvml.ComputeInstanceInfo{ + Device: gi.Info.Device, + GpuInstance: gi, + Id: gi.ComputeInstanceCounter, + ProfileId: info.Id, + } + gi.ComputeInstanceCounter++ + ci := NewComputeInstance(ciInfo) + gi.ComputeInstances[ci] = struct{}{} + return ci, nvml.SUCCESS + } + + gi.GetComputeInstancesFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstance, nvml.Return) { + gi.RLock() + defer gi.RUnlock() + var cis []nvml.ComputeInstance + for ci := range gi.ComputeInstances { + if ci.Info.ProfileId == info.Id { + cis = append(cis, ci) + } + } + return cis, nvml.SUCCESS + } + + gi.DestroyFunc = func() nvml.Return { + d := gi.Info.Device.(*Device) + d.Lock() + defer d.Unlock() + delete(d.GpuInstances, gi) + return nvml.SUCCESS + } +} + +func (ci *ComputeInstance) setMockFuncs() { + ci.GetInfoFunc = func() (nvml.ComputeInstanceInfo, nvml.Return) { + return ci.Info, nvml.SUCCESS + } + + ci.DestroyFunc = func() nvml.Return { + gi := ci.Info.GpuInstance.(*GpuInstance) + gi.Lock() + defer gi.Unlock() + delete(gi.ComputeInstances, ci) + return nvml.SUCCESS + } +} diff --git 
a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/mig-profile.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/mig-profile.go new file mode 100644 index 000000000..c4df4c833 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/mig-profile.go @@ -0,0 +1,471 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dgxa100 + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +// MIGProfiles holds the profile information for GIs and CIs in this mock server. +// We should consider auto-generating this object in the future. 
+var MIGProfiles = struct { + GpuInstanceProfiles map[int]nvml.GpuInstanceProfileInfo + ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo +}{ + GpuInstanceProfiles: map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 0, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 4864, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 4864, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 9856, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 0, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 28, + CopyEngineCount: 2, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 9856, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, + IsP2pSupported: 0, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 42, + CopyEngineCount: 3, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 19968, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, + IsP2pSupported: 0, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 56, + CopyEngineCount: 4, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 19968, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + Id: 
nvml.GPU_INSTANCE_PROFILE_7_SLICE, + IsP2pSupported: 0, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 98, + CopyEngineCount: 7, + DecoderCount: 5, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 40192, + }, + }, + ComputeInstanceProfiles: map[int]map[int]nvml.ComputeInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + SharedCopyEngineCount: 1, + SharedDecoderCount: 0, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + SharedCopyEngineCount: 1, + SharedDecoderCount: 1, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + SharedCopyEngineCount: 1, + SharedDecoderCount: 1, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 14, + SharedCopyEngineCount: 2, + SharedDecoderCount: 1, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 28, + SharedCopyEngineCount: 2, + SharedDecoderCount: 1, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: 
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 3, + MultiprocessorCount: 14, + SharedCopyEngineCount: 3, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 28, + SharedCopyEngineCount: 3, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 1, + MultiprocessorCount: 42, + SharedCopyEngineCount: 3, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 14, + SharedCopyEngineCount: 4, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 2, + MultiprocessorCount: 28, + SharedCopyEngineCount: 4, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 56, + SharedCopyEngineCount: 4, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 14, + SharedCopyEngineCount: 7, + SharedDecoderCount: 5, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + 
nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 28, + SharedCopyEngineCount: 7, + SharedDecoderCount: 5, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 42, + SharedCopyEngineCount: 7, + SharedDecoderCount: 5, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 56, + SharedCopyEngineCount: 7, + SharedDecoderCount: 5, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 98, + SharedCopyEngineCount: 7, + SharedDecoderCount: 5, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + }, + }, +} + +// MIGPlacements holds the placement information for GIs and CIs in this mock server. +// We should consider auto-generating this object in the future. 
+var MIGPlacements = struct { + GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement + ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement +}{ + GpuInstancePossiblePlacements: map[int][]nvml.GpuInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + { + Start: 0, + Size: 1, + }, + { + Start: 1, + Size: 1, + }, + { + Start: 2, + Size: 1, + }, + { + Start: 3, + Size: 1, + }, + { + Start: 4, + Size: 1, + }, + { + Start: 5, + Size: 1, + }, + { + Start: 6, + Size: 1, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + { + Start: 0, + Size: 1, + }, + { + Start: 1, + Size: 1, + }, + { + Start: 2, + Size: 1, + }, + { + Start: 3, + Size: 1, + }, + { + Start: 4, + Size: 1, + }, + { + Start: 5, + Size: 1, + }, + { + Start: 6, + Size: 1, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + { + Start: 0, + Size: 2, + }, + { + Start: 2, + Size: 2, + }, + { + Start: 4, + Size: 2, + }, + { + Start: 6, + Size: 2, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + { + Start: 0, + Size: 2, + }, + { + Start: 2, + Size: 2, + }, + { + Start: 4, + Size: 2, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + { + Start: 0, + Size: 4, + }, + { + Start: 4, + Size: 4, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + { + Start: 0, + Size: 4, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + { + Start: 0, + Size: 8, + }, + }, + }, + // TODO: Fill out ComputeInstancePossiblePlacements + ComputeInstancePossiblePlacements: map[int]map[int][]nvml.ComputeInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + 
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {}, + }, + }, +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 0675a33e5..c824e308b 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -11,6 +11,7 @@ github.com/NVIDIA/go-nvlib/pkg/pciids github.com/NVIDIA/go-nvml/pkg/dl github.com/NVIDIA/go-nvml/pkg/nvml github.com/NVIDIA/go-nvml/pkg/nvml/mock +github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100 # github.com/cpuguy83/go-md2man/v2 v2.0.5 ## explicit; go 1.11 github.com/cpuguy83/go-md2man/v2/md2man