-
Notifications
You must be signed in to change notification settings - Fork 1.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add API for CDI --devices flag in Docker and Podman for mapping GPUs #3290
base: main
Are you sure you want to change the base?
Changes from all commits
0486254
5ff36e4
830a0dd
0d20235
6e4314d
b125982
d2a13ba
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -255,6 +255,20 @@ func runArgsForNode(node *config.Node, clusterIPFamily config.ClusterIPFamily, n | |
args = append(args, "-e", "KUBECONFIG=/etc/kubernetes/admin.conf") | ||
} | ||
|
||
// Append CDI device args (used for GPU support) | ||
if len(node.CDIDevices) > 0 { | ||
// Check for docker > 25 | ||
ver := Version() | ||
if ver != "dev" || strings.Split(ver, ".")[0] < "25" { | ||
return nil, errors.Errorf("using devices api in kind requires Docker >= v25, but found %q", ver) | ||
} | ||
Comment on lines
+262
to
+264
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note: This also requires that the Daemon be configured in experimental mode. What about dropping this check entirely and relying on the error reported by the Docker daemon. Should be something along the lines of "no driver for "cdi"". Or is this UX to confusing? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I prefer this approach, rather than make kind to have awareness of the provider versions |
||
|
||
// Append args for each device | ||
for _, device := range node.CDIDevices { | ||
args = append(args, "--device", strings.TrimSpace(device)) | ||
} | ||
} | ||
|
||
Comment on lines
+258
to
+271
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will also allow passing "standard" devices via the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, validation is called as part of the initial passing for the node configuration. |
||
// finally, specify the image to run | ||
return append(args, node.Image), nil | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,6 +33,16 @@ func IsAvailable() bool { | |
return strings.HasPrefix(lines[0], "Docker version") | ||
} | ||
|
||
// Version gets the version of docker available on the system | ||
func Version() string { | ||
cmd := exec.Command("docker", "version", "--format", "'{{.Server.Version}}'") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we use Does changing this function to |
||
lines, err := exec.OutputLines(cmd) | ||
if err != nil || len(lines) != 1 { | ||
return "" | ||
} | ||
return strings.Trim(lines[0], "'") | ||
} | ||
|
||
// usernsRemap checks if userns-remap is enabled in dockerd | ||
func usernsRemap() bool { | ||
cmd := exec.Command("docker", "info", "--format", "'{{json .SecurityOptions}}'") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -212,6 +212,13 @@ func runArgsForNode(node *config.Node, clusterIPFamily config.ClusterIPFamily, n | |
args..., | ||
) | ||
|
||
// Append CDI device args (used for GPU support) | ||
if len(node.CDIDevices) > 0 { | ||
for _, device := range node.CDIDevices { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is a compatibility check similar to that for docker also required here? CDI support was only really added with Podman |
||
args = append(args, "--device", strings.TrimSpace(device)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of having to remember to call |
||
} | ||
} | ||
|
||
// convert mounts and port mappings to container run args | ||
args = append(args, generateMountBindings(node.ExtraMounts...)...) | ||
mappingArgs, err := generatePortMappings(clusterIPFamily, node.ExtraPortMappings...) | ||
|
@@ -302,7 +309,6 @@ type podmanNetworks []struct { | |
func getSubnets(networkName string) ([]string, error) { | ||
cmd := exec.Command("podman", "network", "inspect", networkName) | ||
out, err := exec.Output(cmd) | ||
|
||
if err != nil { | ||
return nil, errors.Wrap(err, "failed to get subnets") | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ import ( | |
"regexp" | ||
"strings" | ||
|
||
"github.com/container-orchestrated-devices/container-device-interface/pkg/parser" | ||
"sigs.k8s.io/kind/pkg/errors" | ||
"sigs.k8s.io/kind/pkg/internal/sets" | ||
) | ||
|
@@ -114,6 +115,10 @@ func (n *Node) Validate() error { | |
errs = append(errs, errors.New("image is a required field")) | ||
} | ||
|
||
if err := validateDevices(n.CDIDevices); err != nil { | ||
errs = append(errs, errors.Wrapf(err, "invalid devices")) | ||
} | ||
|
||
// validate extra port forwards | ||
for _, mapping := range n.ExtraPortMappings { | ||
if err := validatePort(mapping.HostPort); err != nil { | ||
|
@@ -192,6 +197,23 @@ func validatePortMappings(portMappings []PortMapping) error { | |
return nil | ||
} | ||
|
||
func validateDevices(cdiDevices []string) error { | ||
for _, device := range cdiDevices { | ||
device := strings.TrimSpace(device) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You should be able to use It should also be fine to allow There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll give that a try. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please update to what Evan suggests to reduce the dependencies pulled in. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @BenTheElder What do you think? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @klueska I am using parser.ParseQualifiedName(device) instead of parser.ParseQualifiedName(device). The results are the same except you get the additional error message about why your device string is invalid. I think this is preferable. ParseQualifiedName() is called in IsQualifiedName(). This doesn't change the dependencies at all. There are tests on github.com/container-orchestrated-devices/container-device-interface/pkg/parser pulling in the additional packages. Maybe we should put the tests in a different package? @elezar wdyt? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can update the tests in that package to use a different import for comparison or to be in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't personally have a preference - I haven't found any with fewer dependencies. I was thinking use the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have created cncf-tags/container-device-interface#149 with an update to the test package. This should prevent There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, a few things came up back in the real world and here with #3277 etc., the dependency discussion wound up forked in https://github.com/kubernetes-sigs/kind/pull/3290/files#r1304873501 I'm going to try to get #3335 after which when this PR is rebased and See linked comment for stance on deps. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently for example we have a lot of CRI inspired types with in-tree dependency-free code instead, to avoid creating dep hell for our users. That may not be so reasonable here. |
||
// validate device string is not empty | ||
if len(device) == 0 { | ||
return errors.Errorf("invalid device string: '%v'. Empty Strings not allowed", device) | ||
} | ||
|
||
// validate device string is valid | ||
_, _, _, err := parser.ParseQualifiedName(device) | ||
if err != nil { | ||
return errors.Errorf("invalid device string: '%v'. %v", device, err) | ||
} | ||
} | ||
return nil | ||
} | ||
|
||
func validatePort(port int32) error { | ||
// NOTE: -1 is a special value for auto-selecting the port in the container | ||
// backend where possible as opposed to in kind itself. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
regarding dependency management, I'm working on #3335 to switch the modules to 1.17+, which will enable https://go.dev/ref/mod#graph-pruning and make the transitive dependencies clear.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just glancing at the go.sum changes, there's a concerningly large set of deps, but we can't tell with pre 1.17 go.mod files what is actually in the binary.
It remains important that
kind
the go libraries and CLI are trivially embeddable with minimal dependencies which also must meet kubernetes / CNCF's (allowed licenses) dependency standards. We have a number of users using kind directly in test tools etc and we don't want to bring in more dependencies.https://github.com/cncf-tags/container-device-interface/blob/main/go.mod has a lot of deps for what I would expect to just be a spec/parser.