From 12b1da65582f122e6fd5d9f7e68a596feed3f2bf Mon Sep 17 00:00:00 2001 From: Cory Latschkowski Date: Wed, 18 Oct 2023 10:20:19 -0500 Subject: [PATCH 01/13] add: devspaces, nfd, gpu operator updates --- .../overlays/default/kustomization.yaml | 9 +++ .../overlays/fix-autoscale/kustomization.yaml | 9 +++ .../overlays/low-idle/kustomization.yaml | 15 ++++ devspaces/instance/NOTES.md | 34 ++++++++ devspaces/instance/base/checluster.yaml | 63 +++++++++++++++ .../instance/base/devworkspace-config.yaml | 9 +++ devspaces/instance/base/kustomization.yaml | 12 +++ devspaces/instance/base/namespace.yaml | 7 ++ devspaces/instance/base/network-policy.yaml | 13 +++ devspaces/instance/overlays/default/README.md | 3 + .../overlays/default/kustomization.yaml | 5 ++ .../overlays/timeout-12m/kustomization.yaml | 24 ++++++ gpu-operator-certified/instance/INFO.md | 48 +++++++++++ .../instance/base/cluster-policy.yaml | 80 ++++++++++++++++++- .../instance/base/device-plugin-config.yaml | 5 ++ .../instance/base/kustomization.yaml | 1 + .../overlays/mig-mixed/kustomization.yaml | 14 ++++ .../overlays/mig-single/kustomization.yaml | 14 ++++ .../time-slicing-2/kustomization.yaml | 34 ++++++++ .../time-slicing-4/kustomization.yaml | 21 +++++ .../overlays/default/kustomization.yaml | 1 - .../overlays/only-nvidia/kustomization.yaml | 11 +++ nfd/instance/README.md | 6 +- nfd/instance/base/kustomization.yaml | 1 - nfd/instance/base/node-feature-discovery.yaml | 7 +- .../overlays/default/kustomization.yaml | 1 - .../overlays/only-nvidia/kustomization.yaml | 11 +++ .../patch-node-feature-discovery.yaml | 20 +++++ 28 files changed, 468 insertions(+), 10 deletions(-) create mode 100644 devspaces/aggregate/overlays/default/kustomization.yaml create mode 100644 devspaces/aggregate/overlays/fix-autoscale/kustomization.yaml create mode 100644 devspaces/aggregate/overlays/low-idle/kustomization.yaml create mode 100644 devspaces/instance/NOTES.md create mode 100644 devspaces/instance/base/checluster.yaml 
create mode 100644 devspaces/instance/base/devworkspace-config.yaml create mode 100644 devspaces/instance/base/kustomization.yaml create mode 100644 devspaces/instance/base/namespace.yaml create mode 100644 devspaces/instance/base/network-policy.yaml create mode 100644 devspaces/instance/overlays/default/README.md create mode 100644 devspaces/instance/overlays/default/kustomization.yaml create mode 100644 devspaces/instance/overlays/timeout-12m/kustomization.yaml create mode 100644 gpu-operator-certified/instance/INFO.md create mode 100644 gpu-operator-certified/instance/base/device-plugin-config.yaml create mode 100644 gpu-operator-certified/instance/overlays/mig-mixed/kustomization.yaml create mode 100644 gpu-operator-certified/instance/overlays/mig-single/kustomization.yaml create mode 100644 gpu-operator-certified/instance/overlays/time-slicing-2/kustomization.yaml create mode 100644 gpu-operator-certified/instance/overlays/time-slicing-4/kustomization.yaml create mode 100644 nfd/aggregate/overlays/only-nvidia/kustomization.yaml create mode 100644 nfd/instance/overlays/only-nvidia/kustomization.yaml create mode 100644 nfd/instance/overlays/only-nvidia/patch-node-feature-discovery.yaml diff --git a/devspaces/aggregate/overlays/default/kustomization.yaml b/devspaces/aggregate/overlays/default/kustomization.yaml new file mode 100644 index 00000000..713d4b98 --- /dev/null +++ b/devspaces/aggregate/overlays/default/kustomization.yaml @@ -0,0 +1,9 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +commonAnnotations: + argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true + +resources: + - ../../../instance/overlays/default + - ../../../operator/overlays/stable diff --git a/devspaces/aggregate/overlays/fix-autoscale/kustomization.yaml b/devspaces/aggregate/overlays/fix-autoscale/kustomization.yaml new file mode 100644 index 00000000..1fabfb08 --- /dev/null +++ b/devspaces/aggregate/overlays/fix-autoscale/kustomization.yaml @@ -0,0 +1,9 
@@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +commonAnnotations: + argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true + +resources: + - ../../../instance/overlays/timeout-12m + - ../../../operator/overlays/stable diff --git a/devspaces/aggregate/overlays/low-idle/kustomization.yaml b/devspaces/aggregate/overlays/low-idle/kustomization.yaml new file mode 100644 index 00000000..2c6e8ffb --- /dev/null +++ b/devspaces/aggregate/overlays/low-idle/kustomization.yaml @@ -0,0 +1,15 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../default + +patches: + - target: + group: org.eclipse.che + kind: CheCluster + name: devspaces + patch: |- + - op: replace + path: /spec/devEnvironments/secondsOfRunBeforeIdling + value: 300 diff --git a/devspaces/instance/NOTES.md b/devspaces/instance/NOTES.md new file mode 100644 index 00000000..0ebea8ee --- /dev/null +++ b/devspaces/instance/NOTES.md @@ -0,0 +1,34 @@ +# Notes + +## Key images + +Init containers + +``` +# che / vscode image +registry.redhat.io/devspaces/code-rhel8 +# init container copies bins to `/checode` + +# project clone +registry.redhat.io/devworkspace/devworkspace-project-clone-rhel8 +``` + +Other containers + +``` +# che gateway +registry.redhat.io/devspaces/traefik-rhel8 + +# developer tools +https://github.com/devfile/developer-images + +# che docs +https://eclipse.dev/che/docs/stable/overview/introduction-to-eclipse-che/ +https://github.com/eclipse/che +``` + +Dashboard / devfile registry + +``` +https://github.com/eclipse-che/che-devfile-registry +``` diff --git a/devspaces/instance/base/checluster.yaml b/devspaces/instance/base/checluster.yaml new file mode 100644 index 00000000..e5ef05a9 --- /dev/null +++ b/devspaces/instance/base/checluster.yaml @@ -0,0 +1,63 @@ +apiVersion: org.eclipse.che/v2 +kind: CheCluster +metadata: + annotations: + argocd.argoproj.io/sync-wave: "5" + name: devspaces +spec: + components: + cheServer: + debug: 
false + logLevel: INFO + extraProperties: + CHE_SYSTEM_ADMIN__NAME: 'opentlc-mgr' + dashboard: + headerMessage: + show: false + text: >- + It's time to get your Dev on! + # database: + # credentialsSecretName: postgres-credentials + # externalDb: false + # postgresDb: devspaces + # # BUG: can not change postgresHostName + # postgresHostName: postgres + # postgresPort: '5432' + # pvc: + # claimSize: 1Gi + devfileRegistry: + # deployment: + # containers: + # - name: devfile-registry + # # image: quay.io/eclipse/che-devfile-registry:next + # image: registry.redhat.io/devspaces/devfileregistry-rhel8:latest + externalDevfileRegistries: + - url: https://eclipse-che.github.io/che-devfile-registry/main + metrics: + enable: true + # pluginRegistry: + # openVSXURL: "https://open-vsx.org" + # openVSXURL: "https://marketplace.visualstudio.com" + containerRegistry: {} + devEnvironments: + startTimeoutSeconds: 180 + secondsOfRunBeforeIdling: -1 + maxNumberOfRunningWorkspacesPerUser: 2 + maxNumberOfWorkspacesPerUser: -1 + containerBuildConfiguration: + openShiftSecurityContextConstraint: container-build + disableContainerBuildCapabilities: true + defaultEditor: che-incubator/che-code/latest + # defaultComponents: + # - container: + # sourceMapping: /projects + # image: registry.redhat.io/devspaces/udi-rhel8:latest + # name: universal-developer-image + defaultNamespace: + autoProvision: true + template: workspace- + secondsOfInactivityBeforeIdling: 1800 + storage: + pvcStrategy: per-user + gitServices: {} + networking: {} diff --git a/devspaces/instance/base/devworkspace-config.yaml b/devspaces/instance/base/devworkspace-config.yaml new file mode 100644 index 00000000..d6343121 --- /dev/null +++ b/devspaces/instance/base/devworkspace-config.yaml @@ -0,0 +1,9 @@ +apiVersion: controller.devfile.io/v1alpha1 +kind: DevWorkspaceOperatorConfig +metadata: + name: devworkspace-config +config: + workspace: + # kludge: allow cluster autoscaling + ignoredUnrecoverableEvents: + - 
FailedScheduling diff --git a/devspaces/instance/base/kustomization.yaml b/devspaces/instance/base/kustomization.yaml new file mode 100644 index 00000000..178fdc23 --- /dev/null +++ b/devspaces/instance/base/kustomization.yaml @@ -0,0 +1,12 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +commonLabels: + component: devspaces + +namespace: devspaces + +resources: + - checluster.yaml + - devworkspace-config.yaml + - namespace.yaml diff --git a/devspaces/instance/base/namespace.yaml b/devspaces/instance/base/namespace.yaml new file mode 100644 index 00000000..f8770ebf --- /dev/null +++ b/devspaces/instance/base/namespace.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: devspaces + annotations: + openshift.io/display-name: "DevSpaces Infra" + argocd.argoproj.io/sync-wave: "0" \ No newline at end of file diff --git a/devspaces/instance/base/network-policy.yaml b/devspaces/instance/base/network-policy.yaml new file mode 100644 index 00000000..38c9c782 --- /dev/null +++ b/devspaces/instance/base/network-policy.yaml @@ -0,0 +1,13 @@ +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-from-openshift-devspaces +spec: + ingress: + - from: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: openshift-devspaces + podSelector: {} + policyTypes: + - Ingress diff --git a/devspaces/instance/overlays/default/README.md b/devspaces/instance/overlays/default/README.md new file mode 100644 index 00000000..635e9907 --- /dev/null +++ b/devspaces/instance/overlays/default/README.md @@ -0,0 +1,3 @@ +# Dev Spaces + +[OpenShift Dev Spaces Docs](https://access.redhat.com/documentation/en-us/red_hat_openshift_dev_spaces) diff --git a/devspaces/instance/overlays/default/kustomization.yaml b/devspaces/instance/overlays/default/kustomization.yaml new file mode 100644 index 00000000..774a422d --- /dev/null +++ b/devspaces/instance/overlays/default/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: 
kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base diff --git a/devspaces/instance/overlays/timeout-12m/kustomization.yaml b/devspaces/instance/overlays/timeout-12m/kustomization.yaml new file mode 100644 index 00000000..134e8cb0 --- /dev/null +++ b/devspaces/instance/overlays/timeout-12m/kustomization.yaml @@ -0,0 +1,24 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: devspaces + +resources: + - ../../base + +patches: + - target: + group: org.eclipse.che + kind: CheCluster + name: devspaces + patch: |- + - op: replace + path: /spec/components/dashboard/headerMessage/show + value: true + - op: replace + path: /spec/components/dashboard/headerMessage/text + value: | + Please be patient... GPUs may take up to 12 min to be available! + - op: replace + path: /spec/devEnvironments/startTimeoutSeconds + value: 720 diff --git a/gpu-operator-certified/instance/INFO.md b/gpu-operator-certified/instance/INFO.md new file mode 100644 index 00000000..9123d7da --- /dev/null +++ b/gpu-operator-certified/instance/INFO.md @@ -0,0 +1,48 @@ +# GPU Notes + +## Instance Types + +AWS GPU Types: + +Multi-instance GPU (MIG) can be: + +- `p5.48xlarge` - 8 x H100 Tensor Core +- `p4d.24xlarge` - 8 x A100 Tensor Core + +Time-slicing GPU can be any Nvidia type (as documented by Nvidia): + +- P3 - V100 + - `p3.2xlarge` - 1 x V100 + - `p3.8xlarge` - 4 x V100 + - `p3.16xlarge` - 8 x V100 +- P2 - K80 + - `p2.xlarge` - 1 x K80 + - `p2.8xlarge` - 8 x K80 + - `p2.16xlarge` - 16 x K80 +- G5g - T4G + - `g5g.{,2,4,8}xlarge` - 1 x T4G + - `g5g.16xlarge`, `g5g.metal` - 2 x T4G +- G5 - A10G + - `g5.{,2,4,8,16}xlarge` - 1 x A10G + - `g5.{12,24}xlarge` - 4 x A10G + - `g5.48xlarge` - 8 x A10G +- G4dn - T4 + - `g4dn.{,2,4,8,16}xlarge` - 1 x T4 + - `g4dn.12xlarge` - 4 x T4 + - `g4dn.metal` - 8 x T4 +- G3 - M60 + - `g3s.xlarge` - 1 x M60 + - `g3.4xlarge` - 1 x M60 + - `g3.8xlarge` - 2 x M60 + - `g3.16xlarge` - 4 x M60 + + +## Links + +- [Docs - AWS 
GPU Instances](https://aws.amazon.com/ec2/instance-types/#Accelerated_Computing) +- [Docs - Nvidia GPU Operator on Openshift](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/openshift/contents.html) +- [Docs - Nvidia GPU admin dashboard](https://docs.openshift.com/container-platform/4.11/monitoring/nvidia-gpu-admin-dashboard.html) +- [Docs - MIG support in OCP](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/openshift/mig-ocp.html) +- [Blog - RH Nvidia GPUs on OpenShift](https://cloud.redhat.com/blog/autoscaling-nvidia-gpus-on-red-hat-openshift) +- [Demo - GPU DevSpaces](https://github.com/bkoz/devspaces) +- [GPU Operator default config map](https://gitlab.com/nvidia/kubernetes/gpu-operator/-/blob/v23.6.1/assets/state-mig-manager/0400_configmap.yaml?ref_type=tags) \ No newline at end of file diff --git a/gpu-operator-certified/instance/base/cluster-policy.yaml b/gpu-operator-certified/instance/base/cluster-policy.yaml index 33712f0f..724b2026 100644 --- a/gpu-operator-certified/instance/base/cluster-policy.yaml +++ b/gpu-operator-certified/instance/base/cluster-policy.yaml @@ -1,4 +1,82 @@ -apiVersion: nvidia.com/v1 kind: ClusterPolicy +apiVersion: nvidia.com/v1 metadata: name: gpu-cluster-policy +spec: + operator: + defaultRuntime: crio + use_ocp_driver_toolkit: true + initContainer: {} + sandboxWorkloads: + enabled: false + defaultWorkload: container + driver: + enabled: true + upgradePolicy: + autoUpgrade: true + drain: + deleteEmptyDir: false + enable: false + force: false + timeoutSeconds: 300 + maxParallelUpgrades: 1 + maxUnavailable: 25% + podDeletion: + deleteEmptyDir: false + force: false + timeoutSeconds: 300 + waitForCompletion: + timeoutSeconds: 0 + repoConfig: + configMapName: '' + certConfig: + name: '' + licensingConfig: + nlsEnabled: false + configMapName: '' + virtualTopology: + config: '' + kernelModuleConfig: + name: '' + dcgmExporter: + enabled: true + config: + name: 'console-plugin-nvidia-gpu' + 
serviceMonitor: + enabled: true + dcgm: + enabled: true + daemonsets: + updateStrategy: RollingUpdate + rollingUpdate: + maxUnavailable: '1' + devicePlugin: + enabled: true + config: + name: '' + default: '' + gfd: + enabled: true + migManager: + enabled: true + nodeStatusExporter: + enabled: true + mig: + strategy: single + toolkit: + enabled: true + validator: + plugin: + env: + - name: WITH_WORKLOAD + value: 'true' + vgpuManager: + enabled: false + vgpuDeviceManager: + enabled: true + sandboxDevicePlugin: + enabled: true + vfioManager: + enabled: true + gds: + enabled: false diff --git a/gpu-operator-certified/instance/base/device-plugin-config.yaml b/gpu-operator-certified/instance/base/device-plugin-config.yaml new file mode 100644 index 00000000..47fa37a7 --- /dev/null +++ b/gpu-operator-certified/instance/base/device-plugin-config.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: device-plugin-config +data: {} diff --git a/gpu-operator-certified/instance/base/kustomization.yaml b/gpu-operator-certified/instance/base/kustomization.yaml index 0dfb6137..eead45d1 100644 --- a/gpu-operator-certified/instance/base/kustomization.yaml +++ b/gpu-operator-certified/instance/base/kustomization.yaml @@ -5,3 +5,4 @@ namespace: nvidia-gpu-operator resources: - cluster-policy.yaml + - device-plugin-config.yaml diff --git a/gpu-operator-certified/instance/overlays/mig-mixed/kustomization.yaml b/gpu-operator-certified/instance/overlays/mig-mixed/kustomization.yaml new file mode 100644 index 00000000..24ad3593 --- /dev/null +++ b/gpu-operator-certified/instance/overlays/mig-mixed/kustomization.yaml @@ -0,0 +1,14 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +patches: + - target: + kind: ClusterPolicy + name: gpu-cluster-policy + patch: |- + - op: add + path: /spec/mig/strategy + value: mixed \ No newline at end of file diff --git 
a/gpu-operator-certified/instance/overlays/mig-single/kustomization.yaml b/gpu-operator-certified/instance/overlays/mig-single/kustomization.yaml new file mode 100644 index 00000000..ecc91db7 --- /dev/null +++ b/gpu-operator-certified/instance/overlays/mig-single/kustomization.yaml @@ -0,0 +1,14 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +patches: + - target: + kind: ClusterPolicy + name: gpu-cluster-policy + patch: |- + - op: add + path: /spec/mig/strategy + value: single \ No newline at end of file diff --git a/gpu-operator-certified/instance/overlays/time-slicing-2/kustomization.yaml b/gpu-operator-certified/instance/overlays/time-slicing-2/kustomization.yaml new file mode 100644 index 00000000..94f5b390 --- /dev/null +++ b/gpu-operator-certified/instance/overlays/time-slicing-2/kustomization.yaml @@ -0,0 +1,34 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +patches: + - target: + kind: ClusterPolicy + name: gpu-cluster-policy + patch: |- + - op: add + path: /spec/devicePlugin/config/name + value: device-plugin-config + - op: add + path: /spec/devicePlugin/config/default + value: Tesla-T4-time-sliced + - op: replace + path: /spec/gfd/enabled + value: true + - target: + kind: ConfigMap + name: device-plugin-config + patch: |- + - op: add + path: /data + value: + Tesla-T4-time-sliced: |- + version: v1 + sharing: + timeSlicing: + resources: + - name: nvidia.com/gpu + replicas: 2 \ No newline at end of file diff --git a/gpu-operator-certified/instance/overlays/time-slicing-4/kustomization.yaml b/gpu-operator-certified/instance/overlays/time-slicing-4/kustomization.yaml new file mode 100644 index 00000000..67a61c7a --- /dev/null +++ b/gpu-operator-certified/instance/overlays/time-slicing-4/kustomization.yaml @@ -0,0 +1,21 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../time-slicing-2 + +patches: + - target: + kind: 
ConfigMap + name: device-plugin-config + patch: |- + - op: add + path: /data + value: + Tesla-T4-time-sliced: |- + version: v1 + sharing: + timeSlicing: + resources: + - name: nvidia.com/gpu + replicas: 4 diff --git a/nfd/aggregate/overlays/default/kustomization.yaml b/nfd/aggregate/overlays/default/kustomization.yaml index c666be10..303e9470 100644 --- a/nfd/aggregate/overlays/default/kustomization.yaml +++ b/nfd/aggregate/overlays/default/kustomization.yaml @@ -1,4 +1,3 @@ ---- apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization diff --git a/nfd/aggregate/overlays/only-nvidia/kustomization.yaml b/nfd/aggregate/overlays/only-nvidia/kustomization.yaml new file mode 100644 index 00000000..a4d3d612 --- /dev/null +++ b/nfd/aggregate/overlays/only-nvidia/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +commonAnnotations: + argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true + +namespace: openshift-nfd + +resources: + - ../../../operator/overlays/stable + - ../../../instance/overlays/only-nvidia diff --git a/nfd/instance/README.md b/nfd/instance/README.md index 61c6c968..af250a83 100644 --- a/nfd/instance/README.md +++ b/nfd/instance/README.md @@ -22,13 +22,13 @@ The options for this operator are the following *overlays*: If you have cloned the `gitops-catalog` repository, you can install the Storage System by running from the root `gitops-catalog` directory ``` -oc apply -k nfd/instance/overlays/default +oc apply -k openshift-nfd-operator/instance/overlays/default ``` Or, without cloning: ``` -oc apply -k https://github.com/redhat-cop/gitops-catalog/nfd/instance/overlays/default +oc apply -k https://github.com/redhat-cop/gitops-catalog/openshift-nfd-operator/instance/overlays/default ``` As part of a different overlay in your own GitOps repo: @@ -38,5 +38,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - 
github.com/redhat-cop/gitops-catalog/nfd/instance/overlays/default?ref=main + - github.com/redhat-cop/gitops-catalog/openshift-nfd-operator/instance/overlays/default?ref=main ``` diff --git a/nfd/instance/base/kustomization.yaml b/nfd/instance/base/kustomization.yaml index 133b643b..309c6ea7 100644 --- a/nfd/instance/base/kustomization.yaml +++ b/nfd/instance/base/kustomization.yaml @@ -1,4 +1,3 @@ ---- apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization diff --git a/nfd/instance/base/node-feature-discovery.yaml b/nfd/instance/base/node-feature-discovery.yaml index 9f6c3b84..8672e6cc 100644 --- a/nfd/instance/base/node-feature-discovery.yaml +++ b/nfd/instance/base/node-feature-discovery.yaml @@ -13,8 +13,9 @@ spec: # matchOn: # - nodename: ["special-.*-node-.*"] operand: - image: >- - registry.redhat.io/openshift4/ose-node-feature-discovery@sha256:9c080fc2cd9d9cbca9ec360674e32fe54b3724ec87bedaa513ac3ee71cb14269 + # bug: an image has to be defined otherwise the deployment fails + # bug: this behavior recently changed + image: registry.redhat.io/openshift4/ose-node-feature-discovery:latest servicePort: 12000 workerConfig: configData: | @@ -123,4 +124,4 @@ spec: # - pciId: # vendor: ["15b3"] # device: ["1014", "1017"] - # loadedKMod : ["vendor_kmod1", "vendor_kmod2"] + # loadedKMod : ["vendor_kmod1", "vendor_kmod2"] \ No newline at end of file diff --git a/nfd/instance/overlays/default/kustomization.yaml b/nfd/instance/overlays/default/kustomization.yaml index ef6e263c..774a422d 100644 --- a/nfd/instance/overlays/default/kustomization.yaml +++ b/nfd/instance/overlays/default/kustomization.yaml @@ -1,4 +1,3 @@ ---- apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization diff --git a/nfd/instance/overlays/only-nvidia/kustomization.yaml b/nfd/instance/overlays/only-nvidia/kustomization.yaml new file mode 100644 index 00000000..4c67b875 --- /dev/null +++ b/nfd/instance/overlays/only-nvidia/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: 
kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +patches: + - target: + group: nfd.openshift.io + kind: NodeFeatureDiscovery + path: patch-node-feature-discovery.yaml diff --git a/nfd/instance/overlays/only-nvidia/patch-node-feature-discovery.yaml b/nfd/instance/overlays/only-nvidia/patch-node-feature-discovery.yaml new file mode 100644 index 00000000..d5e81be6 --- /dev/null +++ b/nfd/instance/overlays/only-nvidia/patch-node-feature-discovery.yaml @@ -0,0 +1,20 @@ +- op: add + path: /spec + value: + instance: '' + operand: + image: registry.redhat.io/openshift4/ose-node-feature-discovery:latest + servicePort: 12000 + topologyUpdater: false + workerConfig: + configData: | + core: + sleepInterval: 60s + sources: + pci: + deviceClassWhitelist: + - "0200" + - "03" + - "12" + deviceLabelFields: + - "vendor" From d6a29f950d9a35cf6e94a50a4397d2067a37b528 Mon Sep 17 00:00:00 2001 From: Cory Latschkowski Date: Wed, 18 Oct 2023 10:26:15 -0500 Subject: [PATCH 02/13] cleanup: gpu operator --- nvidia-gpu-operator | 1 + nvidia-gpu-operator/README.md | 42 ------------------- .../operator/base/kustomization.yaml | 7 ---- .../operator/base/namespace.yaml | 8 ---- .../operator/base/operator-group.yaml | 8 ---- .../operator/base/subscription.yaml | 11 ----- .../overlays/default/kustomization.yaml | 11 ----- .../overlays/default/patch-channel.yaml | 3 -- 8 files changed, 1 insertion(+), 90 deletions(-) create mode 120000 nvidia-gpu-operator delete mode 100644 nvidia-gpu-operator/README.md delete mode 100644 nvidia-gpu-operator/operator/base/kustomization.yaml delete mode 100644 nvidia-gpu-operator/operator/base/namespace.yaml delete mode 100644 nvidia-gpu-operator/operator/base/operator-group.yaml delete mode 100644 nvidia-gpu-operator/operator/base/subscription.yaml delete mode 100644 nvidia-gpu-operator/operator/overlays/default/kustomization.yaml delete mode 100644 nvidia-gpu-operator/operator/overlays/default/patch-channel.yaml diff --git 
a/nvidia-gpu-operator b/nvidia-gpu-operator new file mode 120000 index 00000000..b4c63bff --- /dev/null +++ b/nvidia-gpu-operator @@ -0,0 +1 @@ +gpu-operator-certified/ \ No newline at end of file diff --git a/nvidia-gpu-operator/README.md b/nvidia-gpu-operator/README.md deleted file mode 100644 index 0f21b90f..00000000 --- a/nvidia-gpu-operator/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# NVIDIA GPU Operator - -Installs the NVIDIA GPU Operator. - -## Prerequisites - -First, install the [NVIDIA GPU Operator](../operator) in your cluster. - -Do not use the `base` directory directly, as you will need to patch the `channel` based on the version of OpenShift you are using, or the version of the operator you want to use. - -## Overlays - -The options for this operator are the following *overlays*: -* [default](overlays/default) - -### Default - -[default](overlays/default) configures the NVIDIA GPU Operator. - -## Usage - -If you have cloned the `gitops-catalog` repository, you can install the Storage System by running from the root `gitops-catalog` directory - -``` -oc apply -k nvidia-gpu-operator/operator/overlays/default -``` - -Or, without cloning: - -``` -oc apply -k https://github.com/redhat-cop/gitops-catalog/nvidia-gpu-operator/instance/overlays/default -``` - -As part of a different overlay in your own GitOps repo: - -``` -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -resources: - - github.com/redhat-cop/gitops-catalog/nvidia-gpu-operator/instance/overlays/default?ref=main -``` \ No newline at end of file diff --git a/nvidia-gpu-operator/operator/base/kustomization.yaml b/nvidia-gpu-operator/operator/base/kustomization.yaml deleted file mode 100644 index 1e66bd5f..00000000 --- a/nvidia-gpu-operator/operator/base/kustomization.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -resources: - - namespace.yaml - - operator-group.yaml - - subscription.yaml diff --git 
a/nvidia-gpu-operator/operator/base/namespace.yaml b/nvidia-gpu-operator/operator/base/namespace.yaml deleted file mode 100644 index 9f802932..00000000 --- a/nvidia-gpu-operator/operator/base/namespace.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - annotations: - openshift.io/display-name: "NVIDIA GPU Operator" - labels: - openshift.io/cluster-monitoring: "true" - name: nvidia-gpu-operator diff --git a/nvidia-gpu-operator/operator/base/operator-group.yaml b/nvidia-gpu-operator/operator/base/operator-group.yaml deleted file mode 100644 index 53acfaaf..00000000 --- a/nvidia-gpu-operator/operator/base/operator-group.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: operators.coreos.com/v1 -kind: OperatorGroup -metadata: - name: nvidia-gpu-operator-group - namespace: nvidia-gpu-operator -spec: - targetNamespaces: - - nvidia-gpu-operator diff --git a/nvidia-gpu-operator/operator/base/subscription.yaml b/nvidia-gpu-operator/operator/base/subscription.yaml deleted file mode 100644 index 322840eb..00000000 --- a/nvidia-gpu-operator/operator/base/subscription.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: operators.coreos.com/v1alpha1 -kind: Subscription -metadata: - name: gpu-operator-certified - namespace: nvidia-gpu-operator -spec: - channel: patch-me-see-overlays-dir - installPlanApproval: Automatic - name: gpu-operator-certified - source: certified-operators - sourceNamespace: openshift-marketplace diff --git a/nvidia-gpu-operator/operator/overlays/default/kustomization.yaml b/nvidia-gpu-operator/operator/overlays/default/kustomization.yaml deleted file mode 100644 index c771cd2a..00000000 --- a/nvidia-gpu-operator/operator/overlays/default/kustomization.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -resources: - - ../../base - -patches: - - target: - kind: Subscription - name: gpu-operator-certified - path: patch-channel.yaml diff --git 
a/nvidia-gpu-operator/operator/overlays/default/patch-channel.yaml b/nvidia-gpu-operator/operator/overlays/default/patch-channel.yaml deleted file mode 100644 index 6642eb17..00000000 --- a/nvidia-gpu-operator/operator/overlays/default/patch-channel.yaml +++ /dev/null @@ -1,3 +0,0 @@ -- op: replace - path: /spec/channel - value: stable From 3afcd3e1bb83896eb77b2a28d836a53c204ccd95 Mon Sep 17 00:00:00 2001 From: Cory Latschkowski Date: Wed, 18 Oct 2023 10:36:20 -0500 Subject: [PATCH 03/13] update: gpu overlays --- .../time-slicing-2/kustomization.yaml | 6 +-- .../time-slicing-4/kustomization.yaml | 2 +- .../time-slicing-8-a100/kustomization.yaml | 42 +++++++++++++++++++ 3 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 gpu-operator-certified/instance/overlays/time-slicing-8-a100/kustomization.yaml diff --git a/gpu-operator-certified/instance/overlays/time-slicing-2/kustomization.yaml b/gpu-operator-certified/instance/overlays/time-slicing-2/kustomization.yaml index 94f5b390..1ed4b944 100644 --- a/gpu-operator-certified/instance/overlays/time-slicing-2/kustomization.yaml +++ b/gpu-operator-certified/instance/overlays/time-slicing-2/kustomization.yaml @@ -14,7 +14,7 @@ patches: value: device-plugin-config - op: add path: /spec/devicePlugin/config/default - value: Tesla-T4-time-sliced + value: Tesla-T4 - op: replace path: /spec/gfd/enabled value: true @@ -25,10 +25,10 @@ patches: - op: add path: /data value: - Tesla-T4-time-sliced: |- + Tesla-T4: |- version: v1 sharing: timeSlicing: resources: - name: nvidia.com/gpu - replicas: 2 \ No newline at end of file + replicas: 2 diff --git a/gpu-operator-certified/instance/overlays/time-slicing-4/kustomization.yaml b/gpu-operator-certified/instance/overlays/time-slicing-4/kustomization.yaml index 67a61c7a..35fe72ba 100644 --- a/gpu-operator-certified/instance/overlays/time-slicing-4/kustomization.yaml +++ b/gpu-operator-certified/instance/overlays/time-slicing-4/kustomization.yaml @@ -12,7 +12,7 @@ patches: 
- op: add path: /data value: - Tesla-T4-time-sliced: |- + Tesla-T4: |- version: v1 sharing: timeSlicing: diff --git a/gpu-operator-certified/instance/overlays/time-slicing-8-a100/kustomization.yaml b/gpu-operator-certified/instance/overlays/time-slicing-8-a100/kustomization.yaml new file mode 100644 index 00000000..802544b3 --- /dev/null +++ b/gpu-operator-certified/instance/overlays/time-slicing-8-a100/kustomization.yaml @@ -0,0 +1,42 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +patches: + - target: + kind: ClusterPolicy + name: gpu-cluster-policy + patch: |- + - op: add + path: /spec/devicePlugin/config/name + value: device-plugin-config + - op: add + path: /spec/devicePlugin/config/default + value: A100-SXM4-40GB + - op: replace + path: /spec/gfd/enabled + value: true + - target: + kind: ConfigMap + name: device-plugin-config + patch: |- + - op: add + path: /data + value: + A100-SXM4-40GB: |- + version: v1 + sharing: + timeSlicing: + resources: + - name: nvidia.com/gpu + replicas: 8 + - name: nvidia.com/mig-1g.5gb + replicas: 1 + - name: nvidia.com/mig-2g.10gb + replicas: 2 + - name: nvidia.com/mig-3g.20gb + replicas: 3 + - name: nvidia.com/mig-7g.40gb + replicas: 7 \ No newline at end of file From dc6a68aad340fb751fab3a0e3a6487f5e4db1797 Mon Sep 17 00:00:00 2001 From: Cory Latschkowski Date: Wed, 18 Oct 2023 10:41:07 -0500 Subject: [PATCH 04/13] update: spelling --- .wordlist-md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.wordlist-md b/.wordlist-md index fa185fbd..b620d64f 100644 --- a/.wordlist-md +++ b/.wordlist-md @@ -101,6 +101,7 @@ arn aws canada ceph +che checluster cicd cli @@ -112,6 +113,7 @@ configmap datasource deployable dev +devfile devspaces devworkspace devworkspaces @@ -180,6 +182,7 @@ prometheus redhat redistributions repo +rhel rhpds runtime sagemaker @@ -194,8 +197,10 @@ sublicense tekton templating thanos +traefik truly vSphere +vscode vsphere wordlist workspaces From 
bee7d7e2e1e3d158e8a08225bebb42bc55ee8432 Mon Sep 17 00:00:00 2001 From: Cory Latschkowski Date: Wed, 18 Oct 2023 10:44:39 -0500 Subject: [PATCH 05/13] fix: yamlint --- devspaces/instance/base/namespace.yaml | 2 +- .../instance/overlays/mig-mixed/kustomization.yaml | 2 +- .../instance/overlays/mig-single/kustomization.yaml | 2 +- .../instance/overlays/time-slicing-8-a100/kustomization.yaml | 2 +- nfd/instance/base/node-feature-discovery.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/devspaces/instance/base/namespace.yaml b/devspaces/instance/base/namespace.yaml index f8770ebf..2283637e 100644 --- a/devspaces/instance/base/namespace.yaml +++ b/devspaces/instance/base/namespace.yaml @@ -4,4 +4,4 @@ metadata: name: devspaces annotations: openshift.io/display-name: "DevSpaces Infra" - argocd.argoproj.io/sync-wave: "0" \ No newline at end of file + argocd.argoproj.io/sync-wave: "0" diff --git a/gpu-operator-certified/instance/overlays/mig-mixed/kustomization.yaml b/gpu-operator-certified/instance/overlays/mig-mixed/kustomization.yaml index 24ad3593..5abb963b 100644 --- a/gpu-operator-certified/instance/overlays/mig-mixed/kustomization.yaml +++ b/gpu-operator-certified/instance/overlays/mig-mixed/kustomization.yaml @@ -11,4 +11,4 @@ patches: patch: |- - op: add path: /spec/mig/strategy - value: mixed \ No newline at end of file + value: mixed diff --git a/gpu-operator-certified/instance/overlays/mig-single/kustomization.yaml b/gpu-operator-certified/instance/overlays/mig-single/kustomization.yaml index ecc91db7..87472ae9 100644 --- a/gpu-operator-certified/instance/overlays/mig-single/kustomization.yaml +++ b/gpu-operator-certified/instance/overlays/mig-single/kustomization.yaml @@ -11,4 +11,4 @@ patches: patch: |- - op: add path: /spec/mig/strategy - value: single \ No newline at end of file + value: single diff --git a/gpu-operator-certified/instance/overlays/time-slicing-8-a100/kustomization.yaml 
b/gpu-operator-certified/instance/overlays/time-slicing-8-a100/kustomization.yaml index 802544b3..9b9570d1 100644 --- a/gpu-operator-certified/instance/overlays/time-slicing-8-a100/kustomization.yaml +++ b/gpu-operator-certified/instance/overlays/time-slicing-8-a100/kustomization.yaml @@ -39,4 +39,4 @@ patches: - name: nvidia.com/mig-3g.20gb replicas: 3 - name: nvidia.com/mig-7g.40gb - replicas: 7 \ No newline at end of file + replicas: 7 diff --git a/nfd/instance/base/node-feature-discovery.yaml b/nfd/instance/base/node-feature-discovery.yaml index 8672e6cc..fa447eb8 100644 --- a/nfd/instance/base/node-feature-discovery.yaml +++ b/nfd/instance/base/node-feature-discovery.yaml @@ -124,4 +124,4 @@ spec: # - pciId: # vendor: ["15b3"] # device: ["1014", "1017"] - # loadedKMod : ["vendor_kmod1", "vendor_kmod2"] \ No newline at end of file + # loadedKMod : ["vendor_kmod1", "vendor_kmod2"] From 2e8eef1874b64f33fb6ad101d33e0867e5b6bcfe Mon Sep 17 00:00:00 2001 From: Cory Latschkowski Date: Fri, 27 Oct 2023 22:31:33 -0500 Subject: [PATCH 06/13] update: devspaces aggregate --- .../overlays/low-idle/kustomization.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) rename devspaces/{aggregate => instance}/overlays/low-idle/kustomization.yaml (88%) diff --git a/devspaces/aggregate/overlays/low-idle/kustomization.yaml b/devspaces/instance/overlays/low-idle/kustomization.yaml similarity index 88% rename from devspaces/aggregate/overlays/low-idle/kustomization.yaml rename to devspaces/instance/overlays/low-idle/kustomization.yaml index 2c6e8ffb..84a9d4bc 100644 --- a/devspaces/aggregate/overlays/low-idle/kustomization.yaml +++ b/devspaces/instance/overlays/low-idle/kustomization.yaml @@ -1,8 +1,10 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization +namespace: devspaces + resources: - - ../default + - ../../base patches: - target: From 6d99766a3ab6c1e23ef5d1f6221c16b4f2968b97 Mon Sep 17 00:00:00 2001 From: Cory Latschkowski Date: Fri, 27 Oct 2023 22:33:44 
-0500 Subject: [PATCH 07/13] update: path to notes --- devspaces/{instance => }/NOTES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename devspaces/{instance => }/NOTES.md (97%) diff --git a/devspaces/instance/NOTES.md b/devspaces/NOTES.md similarity index 97% rename from devspaces/instance/NOTES.md rename to devspaces/NOTES.md index 0ebea8ee..11a1c830 100644 --- a/devspaces/instance/NOTES.md +++ b/devspaces/NOTES.md @@ -1,4 +1,4 @@ -# Notes +# General Notes ## Key images From 84f4d6c2ed1f3528689e820d1e57f1866bbdcdd6 Mon Sep 17 00:00:00 2001 From: Cory Latschkowski Date: Fri, 27 Oct 2023 22:37:00 -0500 Subject: [PATCH 08/13] rename: to README --- gpu-operator-certified/instance/INFO.md | 48 ----------------------- gpu-operator-certified/instance/README.md | 48 +++++++++++++++++++++++ 2 files changed, 48 insertions(+), 48 deletions(-) delete mode 100644 gpu-operator-certified/instance/INFO.md diff --git a/gpu-operator-certified/instance/INFO.md b/gpu-operator-certified/instance/INFO.md deleted file mode 100644 index 9123d7da..00000000 --- a/gpu-operator-certified/instance/INFO.md +++ /dev/null @@ -1,48 +0,0 @@ -# GPU Notes - -## Instance Types - -AWS GPU Types: - -Multi-instance GPU (MIG) can be: - -- `p5.48xlarge` - 8 x H100 Tensor Core -- `p4d.24xlarge` - 8 x A100 Tensor Core - -Time-slicing GPU can be any Nvidia type (as documented by Nvidia): - -- P3 - V100 - - `p3.2xlarge` - 1 x V100 - - `p3.8xlarge` - 4 x V100 - - `p3.16xlarge` - 8 x V100 -- P2 - K80 - - `P2.xlarge` - 1 x K80 - - `P2.8xlarge` - 8 x K80 - - `P2.16xlarge` - 16 x K80 -- G5g - T4G - - `g5g.{,2,4,8}xlarge` - 1 x T4G - - `g5g.16xlarge`, `g5g.metal` - 2 x T4G -- G5 - A10G - - `g5.{,2,4,8,16}xlarge` - 1 x A10G - - `g5.{12,24}xlarge` - 4 x A10G - - `g5.48xlarge` - 8 x A10G -- G4dn - T4 - - `g4dn.{,2,4,8,16}xlarge` - 1 x T4 - - `g4dn.48xlarge` - 4 x T4 - - `g4dn.metal` - 8 x T4 -- G3 - M60 - - `g3s.xlarge` - 1 x M60 - - `g3.4xlarge` - 1 x M60 - - `g3.8xlarge` - 2 x M60 - - `g3.16xlarge` - 4 x 
M60 - - -## Links - -- [Docs - AWS GPU Instances](https://aws.amazon.com/ec2/instance-types/#Accelerated_Computing) -- [Docs - Nvidia GPU Operator on Openshift](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/openshift/contents.html) -- [Docs - Nvidia GPU admin dashboard](https://docs.openshift.com/container-platform/4.11/monitoring/nvidia-gpu-admin-dashboard.html) -- [Docs - MIG support in OCP](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/openshift/mig-ocp.html) -- [Blog - RH Nvidia GPUs on OpenShift](https://cloud.redhat.com/blog/autoscaling-nvidia-gpus-on-red-hat-openshift) -- [Demo - GPU DevSpaces](https://github.com/bkoz/devspaces) -- [GPU Operator default config map](https://gitlab.com/nvidia/kubernetes/gpu-operator/-/blob/v23.6.1/assets/state-mig-manager/0400_configmap.yaml?ref_type=tags) \ No newline at end of file diff --git a/gpu-operator-certified/instance/README.md b/gpu-operator-certified/instance/README.md index e69de29b..9123d7da 100644 --- a/gpu-operator-certified/instance/README.md +++ b/gpu-operator-certified/instance/README.md @@ -0,0 +1,48 @@ +# GPU Notes + +## Instance Types + +AWS GPU Types: + +Multi-instance GPU (MIG) can be: + +- `p5.48xlarge` - 8 x H100 Tensor Core +- `p4d.24xlarge` - 8 x A100 Tensor Core + +Time-slicing GPU can be any Nvidia type (as documented by Nvidia): + +- P3 - V100 + - `p3.2xlarge` - 1 x V100 + - `p3.8xlarge` - 4 x V100 + - `p3.16xlarge` - 8 x V100 +- P2 - K80 + - `p2.xlarge` - 1 x K80 + - `p2.8xlarge` - 8 x K80 + - `p2.16xlarge` - 16 x K80 +- G5g - T4G + - `g5g.{,2,4,8}xlarge` - 1 x T4G + - `g5g.16xlarge`, `g5g.metal` - 2 x T4G +- G5 - A10G + - `g5.{,2,4,8,16}xlarge` - 1 x A10G + - `g5.{12,24}xlarge` - 4 x A10G + - `g5.48xlarge` - 8 x A10G +- G4dn - T4 + - `g4dn.{,2,4,8,16}xlarge` - 1 x T4 + - `g4dn.12xlarge` - 4 x T4 + - `g4dn.metal` - 8 x T4 +- G3 - M60 + - `g3s.xlarge` - 1 x M60 + - `g3.4xlarge` - 1 x M60 + - `g3.8xlarge` - 2 x M60 + - `g3.16xlarge` - 4 x M60 + + +##
Links + +- [Docs - AWS GPU Instances](https://aws.amazon.com/ec2/instance-types/#Accelerated_Computing) +- [Docs - Nvidia GPU Operator on Openshift](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/openshift/contents.html) +- [Docs - Nvidia GPU admin dashboard](https://docs.openshift.com/container-platform/4.11/monitoring/nvidia-gpu-admin-dashboard.html) +- [Docs - MIG support in OCP](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/openshift/mig-ocp.html) +- [Blog - RH Nvidia GPUs on OpenShift](https://cloud.redhat.com/blog/autoscaling-nvidia-gpus-on-red-hat-openshift) +- [Demo - GPU DevSpaces](https://github.com/bkoz/devspaces) +- [GPU Operator default config map](https://gitlab.com/nvidia/kubernetes/gpu-operator/-/blob/v23.6.1/assets/state-mig-manager/0400_configmap.yaml?ref_type=tags) \ No newline at end of file From 0f9f2b27b062a4ad88c9116979d567a777d5c7e8 Mon Sep 17 00:00:00 2001 From: Cory Latschkowski Date: Fri, 27 Oct 2023 22:41:13 -0500 Subject: [PATCH 09/13] update: README --- gpu-operator-certified/instance/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gpu-operator-certified/instance/README.md b/gpu-operator-certified/instance/README.md index 9123d7da..e238cc20 100644 --- a/gpu-operator-certified/instance/README.md +++ b/gpu-operator-certified/instance/README.md @@ -1,5 +1,9 @@ # GPU Notes +For more info please review the following: + +- [Demo GPUs on OpenShift](https://github.com/redhat-na-ssa/demo-ocp-gpu) + ## Instance Types AWS GPU Types: @@ -36,7 +40,6 @@ Time-slicing GPU can be any Nvidia type (as documented by Nvidia): - `g3.8xlarge` - 2 x M60 - `g3.16xlarge` - 4 x M60 - ## Links - [Docs - AWS GPU Instances](https://aws.amazon.com/ec2/instance-types/#Accelerated_Computing) From 17f47a18fd9ca84e05904529cc1de97863770a14 Mon Sep 17 00:00:00 2001 From: Cory Latschkowski Date: Fri, 27 Oct 2023 22:44:54 -0500 Subject: [PATCH 10/13] fix: lint --- .wordlist-md | 3 +++ 1 file changed, 
3 insertions(+) diff --git a/.wordlist-md b/.wordlist-md index b620d64f..85baa39c 100644 --- a/.wordlist-md +++ b/.wordlist-md @@ -19,12 +19,14 @@ ClusterTask CodeReady DNS Dev +DevSpaces DevWorkspace DevWorkspaces Devfile DotNET Eventing FullAccess +GPUs Gi GitOps HTPasswd @@ -118,6 +120,7 @@ devspaces devworkspace devworkspaces disableNameSuffixHash +dn dns dotnet ec From 94e566aab890d754365b3deffb972406171e77e2 Mon Sep 17 00:00:00 2001 From: Cory Latschkowski Date: Fri, 27 Oct 2023 22:52:09 -0500 Subject: [PATCH 11/13] update: make simple --- .../patch-node-feature-discovery.yaml | 42 ++++++++++--------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/nfd/instance/overlays/only-nvidia/patch-node-feature-discovery.yaml b/nfd/instance/overlays/only-nvidia/patch-node-feature-discovery.yaml index d5e81be6..27d2eac1 100644 --- a/nfd/instance/overlays/only-nvidia/patch-node-feature-discovery.yaml +++ b/nfd/instance/overlays/only-nvidia/patch-node-feature-discovery.yaml @@ -1,20 +1,22 @@ -- op: add - path: /spec - value: - instance: '' - operand: - image: registry.redhat.io/openshift4/ose-node-feature-discovery:latest - servicePort: 12000 - topologyUpdater: false - workerConfig: - configData: | - core: - sleepInterval: 60s - sources: - pci: - deviceClassWhitelist: - - "0200" - - "03" - - "12" - deviceLabelFields: - - "vendor" +kind: NodeFeatureDiscovery +apiVersion: nfd.openshift.io/v1 +metadata: + name: nfd-instance +spec: + instance: '' + operand: + image: registry.redhat.io/openshift4/ose-node-feature-discovery:latest + servicePort: 12000 + topologyUpdater: false + workerConfig: + configData: | + core: + sleepInterval: 60s + sources: + pci: + deviceClassWhitelist: + - "0200" + - "03" + - "12" + deviceLabelFields: + - "vendor" From 0107acfe852998b2dfbc03fd573aec5bb1597a92 Mon Sep 17 00:00:00 2001 From: Cory Latschkowski Date: Fri, 27 Oct 2023 22:57:15 -0500 Subject: [PATCH 12/13] fix: nfd readme --- nfd/instance/README.md | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/nfd/instance/README.md b/nfd/instance/README.md index af250a83..61c6c968 100644 --- a/nfd/instance/README.md +++ b/nfd/instance/README.md @@ -22,13 +22,13 @@ The options for this operator are the following *overlays*: If you have cloned the `gitops-catalog` repository, you can install the Storage System by running from the root `gitops-catalog` directory ``` -oc apply -k openshift-nfd-operator/instance/overlays/default +oc apply -k nfd/instance/overlays/default ``` Or, without cloning: ``` -oc apply -k https://github.com/redhat-cop/gitops-catalog/openshift-nfd-operator/instance/overlays/default +oc apply -k https://github.com/redhat-cop/gitops-catalog/nfd/instance/overlays/default ``` As part of a different overlay in your own GitOps repo: @@ -38,5 +38,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - github.com/redhat-cop/gitops-catalog/openshift-nfd-operator/instance/overlays/default?ref=main + - github.com/redhat-cop/gitops-catalog/nfd/instance/overlays/default?ref=main ``` From 0e4edc6dabc268808f9f10b35509984378c2c9f9 Mon Sep 17 00:00:00 2001 From: Cory Latschkowski Date: Fri, 27 Oct 2023 22:59:14 -0500 Subject: [PATCH 13/13] cleanup --- nfd/instance/README.md | 42 ------------------------------------------ 1 file changed, 42 deletions(-) delete mode 100644 nfd/instance/README.md diff --git a/nfd/instance/README.md b/nfd/instance/README.md deleted file mode 100644 index 61c6c968..00000000 --- a/nfd/instance/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# OpenShift Node Feature Discovery (NFD) - -Installs a basic nodeFeatureDiscovery instance. - -## Prerequisites - -First, install the [OpenShift NFD Operator](../operator) in your cluster. - -Do not use the `base` directory directly, as you will need to patch the `channel` based on the version of OpenShift you are using, or the version of the operator you want to use. 
- -## Overlays - -The options for this operator are the following *overlays*: -* [default](overlays/default) - -### Default - -[default](overlays/default) configures a basic default configuration for a nodeFeatureDiscovery instance. For more details on customizing the NFD workers, refer to the [docs](https://kubernetes-sigs.github.io/node-feature-discovery/v0.10/advanced/worker-configuration-reference.html). - -## Usage - -If you have cloned the `gitops-catalog` repository, you can install the Storage System by running from the root `gitops-catalog` directory - -``` -oc apply -k nfd/instance/overlays/default -``` - -Or, without cloning: - -``` -oc apply -k https://github.com/redhat-cop/gitops-catalog/nfd/instance/overlays/default -``` - -As part of a different overlay in your own GitOps repo: - -``` -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -resources: - - github.com/redhat-cop/gitops-catalog/nfd/instance/overlays/default?ref=main -```