From 252d9ef252c3438d7afe32c7237ff3fd0f0c86c0 Mon Sep 17 00:00:00 2001
From: Xieshen Zhang
Date: Tue, 11 Jun 2024 16:04:07 -0400
Subject: [PATCH] Enable Nvidia NIM as an application

---
 .../overlays/apps/base/kustomization.yaml     |   1 +
 .../apps/base/nvidia-nim/kustomization.yaml   |   7 +
 .../apps/base/nvidia-nim/nvidia-nim-app.yaml  |  41 +++
 manifests/overlays/rhoai/kustomization.yaml   |   1 +
 .../rhoai/nvidia-nim-validator-cron.yaml      | 334 ++++++++++++++++++
 5 files changed, 384 insertions(+)
 create mode 100644 manifests/overlays/apps/base/nvidia-nim/kustomization.yaml
 create mode 100644 manifests/overlays/apps/base/nvidia-nim/nvidia-nim-app.yaml
 create mode 100644 manifests/overlays/rhoai/nvidia-nim-validator-cron.yaml

diff --git a/manifests/overlays/apps/base/kustomization.yaml b/manifests/overlays/apps/base/kustomization.yaml
index f10478327e..900d6dbcec 100644
--- a/manifests/overlays/apps/base/kustomization.yaml
+++ b/manifests/overlays/apps/base/kustomization.yaml
@@ -12,3 +12,4 @@ resources:
   - ./pachyderm
   - ./watson-x
   - ./rhoai
+  - ./nvidia-nim
diff --git a/manifests/overlays/apps/base/nvidia-nim/kustomization.yaml b/manifests/overlays/apps/base/nvidia-nim/kustomization.yaml
new file mode 100644
index 0000000000..3918d12278
--- /dev/null
+++ b/manifests/overlays/apps/base/nvidia-nim/kustomization.yaml
@@ -0,0 +1,7 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+commonLabels:
+  app: odh-dashboard
+  app.kubernetes.io/part-of: odh-dashboard
+resources:
+- nvidia-nim-app.yaml
diff --git a/manifests/overlays/apps/base/nvidia-nim/nvidia-nim-app.yaml b/manifests/overlays/apps/base/nvidia-nim/nvidia-nim-app.yaml
new file mode 100644
index 0000000000..a8c70a834c
--- /dev/null
+++ b/manifests/overlays/apps/base/nvidia-nim/nvidia-nim-app.yaml
@@ -0,0 +1,41 @@
+apiVersion: dashboard.opendatahub.io/v1
+kind: OdhApplication
+metadata:
+  name: nvidia-nim
+  annotations:
+    opendatahub.io/categories: 'Model serving'
+spec:
+  displayName: NVIDIA NIM
+  provider: NVIDIA
+  description: |-
+    NVIDIA NIM, part of NVIDIA AI Enterprise, is a set of accelerated inference microservices that allow organizations to run AI models on NVIDIA GPUs anywhere—in the cloud, data center, workstations, and PCs.
+  kfdefApplications: []
+  route: ''
+  img: >-
+
+  category: Partner managed
+  support: third party support
+  docsLink: https://developer.nvidia.com/nim
+  quickStart: ''
+  getStartedLink: 'https://developer.nvidia.com/nim'
+  enable:
+    title: Enable NVIDIA NIM
+    actionLabel: Enable
+    description: ''
+    variables:
+      api_key: password
+    variableDisplayText:
+      api_key: NGC API Key
+    variableHelpText:
+      api_key: This NGC API key is given to you by NVIDIA
+    validationJob: nvidia-nim-periodic-validator
+    validationSecret: nvidia-nim-access
+    validationConfigMap: nvidia-nim-validation-result
+  getStartedMarkDown: >-
+    # NVIDIA NIM
+
+    1. Create a NVIDIA account
+
+    2. Join NVIDIA AI Enterprise
+
+    3. Generate a NGC API Key
diff --git a/manifests/overlays/rhoai/kustomization.yaml b/manifests/overlays/rhoai/kustomization.yaml
index 494f3549fa..a3207d973c 100644
--- a/manifests/overlays/rhoai/kustomization.yaml
+++ b/manifests/overlays/rhoai/kustomization.yaml
@@ -6,6 +6,7 @@ commonLabels:
 resources:
   - ../../base
   - anaconda-ce-validator-cron.yaml
+  - nvidia-nim-validator-cron.yaml
 patchesJson6902:
   - path: service-account.yaml
     target:
diff --git a/manifests/overlays/rhoai/nvidia-nim-validator-cron.yaml b/manifests/overlays/rhoai/nvidia-nim-validator-cron.yaml
new file mode 100644
index 0000000000..9992d986db
--- /dev/null
+++ b/manifests/overlays/rhoai/nvidia-nim-validator-cron.yaml
@@ -0,0 +1,334 @@
+# Periodic job that validates the NGC API key mounted from the nvidia-nim-access
+# Secret and records the outcome in the nvidia-nim-validation-result ConfigMap.
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: nvidia-nim-periodic-validator
+  labels:
+    opendatahub.io/modified: "false"
+spec:
+  schedule: "0 0 * * *"
+  concurrencyPolicy: "Replace"
+  startingDeadlineSeconds: 200
+  suspend: true
+  successfulJobsHistoryLimit: 3
+  failedJobsHistoryLimit: 1
+  jobTemplate:
+    spec:
+      template:
+        metadata:
+          labels:
+            parent: "nvidia-nim-periodic-validator"
+        spec:
+          serviceAccount: "rhods-dashboard"
+          imagePullSecrets:
+            - name: addon-managed-odh-pullsecret
+          containers:
+            - name: nvidia-nim-validator
+              image: registry.redhat.io/openshift4/ose-cli@sha256:75bf9b911b6481dcf29f7942240d1555adaa607eec7fc61bedb7f624f87c36d4
+              command:
+                - /bin/sh
+                - -c
+                # Literal block scalar so the script reaches the shell exactly as written.
+                - |
+                  #!/bin/sh
+
+                  RESULT_CONFIGMAP_NAME='nvidia-nim-validation-result'
+                  DATA_CONFIGMAP_NAME='nvidia-nim-images-data'
+                  IMAGE_PULL_SECRET_NAME='nvidia-nim-image-pull'
+
+                  # Create the result ConfigMap (initialised to "false") if it is missing.
+                  function verify_result_configmap_exists() {
+                    if ! oc get configmap "${RESULT_CONFIGMAP_NAME}" &>/dev/null; then
+                      echo "Result ConfigMap doesn't exist, creating"
+
+                      oc create configmap "${RESULT_CONFIGMAP_NAME}" --from-literal validation_result="false"
+                    fi
+                  }
+
+                  # Store $1 as validation_result in the result ConfigMap.
+                  function write_result_configmap_value() {
+                    oc patch configmap "${RESULT_CONFIGMAP_NAME}" -p '"data": { "validation_result": "'${1}'" }'
+                  }
+
+                  # Store the current timestamp (date -Is) as last_valid_time.
+                  function write_last_valid_time() {
+                    oc patch configmap "${RESULT_CONFIGMAP_NAME}" -p '"data": { "last_valid_time": "'$(date -Is)'" }'
+                  }
+
+                  # Create the nvcr.io pull Secret from the API key unless it already exists.
+                  function create_image_pull_secret() {
+                    if ! oc get secret "${IMAGE_PULL_SECRET_NAME}" &>/dev/null; then
+                      echo "Image pull Secret doesn't exist, creating"
+
+                      api_key=$(get_api_key)
+
+                      oc create secret docker-registry "${IMAGE_PULL_SECRET_NAME}" \
+                        --docker-server=nvcr.io \
+                        --docker-username='$oauthtoken' \
+                        --docker-password="${api_key}"
+                    fi
+                  }
+
+                  # Remove the pull Secret; a missing Secret is not an error.
+                  function delete_image_pull_secret() {
+                    echo "Deleting image pull Secret"
+
+                    oc delete secret "${IMAGE_PULL_SECRET_NAME}" --ignore-not-found=true
+                  }
+
+                  # Create an empty image data ConfigMap if it is missing.
+                  function verify_image_data_configmap() {
+                    if ! oc get configmap "${DATA_CONFIGMAP_NAME}" &>/dev/null; then
+                      echo "Image data ConfigMap doesn't exist, creating"
+
+                      oc create configmap "${DATA_CONFIGMAP_NAME}"
+                    fi
+                  }
+
+                  # Replace .data of the image data ConfigMap with the JSON object in $1.
+                  function write_image_data_configmap() {
+                    echo "Patching image data ConfigMap"
+
+                    oc get configmap "${DATA_CONFIGMAP_NAME}" -o json | jq --argjson data "$1" '.data = ($data)' | oc apply -f -
+                  }
+
+                  # Remove the image data ConfigMap; a missing ConfigMap is not an error.
+                  function delete_image_data_configmap() {
+                    echo "Deleting image data ConfigMap"
+
+                    oc delete configmap "${DATA_CONFIGMAP_NAME}" --ignore-not-found=true
+                  }
+
+                  # Publish the pull Secret and image data ($1), then record a passing result.
+                  function success() {
+                    echo "Validation succeeded, enabling NIM"
+
+                    create_image_pull_secret
+                    verify_image_data_configmap
+                    write_image_data_configmap "$1"
+                    verify_result_configmap_exists
+                    write_result_configmap_value true
+                    write_last_valid_time
+                  }
+
+                  # Remove the pull Secret and image data, then record a failing result.
+                  function failure() {
+                    echo "Validation failed, disabling NIM"
+
+                    delete_image_pull_secret
+                    delete_image_data_configmap
+                    verify_result_configmap_exists
+                    write_result_configmap_value false
+                  }
+
+                  # Print the API key from the mounted secret volume.
+                  function get_api_key() {
+                    cat "/etc/secret-volume/api_key"
+                  }
+
+                  # Exchange API key $1 for an NGC token; prints nothing unless HTTP 200.
+                  function get_ngc_token() {
+                    tempfile=$(mktemp)
+
+                    http_code=$(curl -s --write-out "%{http_code}" -o $tempfile "https://authn.nvidia.com/token?service=ngc&" \
+                      -H "Authorization: ApiKey $1")
+
+                    if [ "${http_code}" == 200 ]; then
+                      token=$(jq -r '.token' $tempfile)
+                      echo "$token"
+                    fi
+                  }
+
+                  # Print the "resourceId:latestTag" refs found by the catalog search, space separated.
+                  function get_nim_images() {
+                    tempfile=$(mktemp)
+
+                    http_code=$(curl -s --write-out "%{http_code}" -o $tempfile \
+                      "https://api.ngc.nvidia.com/v2/search/catalog/resources/CONTAINER?q=%7B%22query%22%3A+%22orgName%3Anim%22%7D")
+
+                    if [ "${http_code}" == 200 ]; then
+                      nim_images=$(jq -r \
+                        '.results[] | select(.groupValue == "CONTAINER") | .resources[] | (.resourceId + ":" + (.attributes[] | select(.key == "latestTag") | .value))' \
+                        $tempfile)
+                      # Kept unquoted: word splitting joins the one-per-line refs onto one line.
+                      echo $nim_images
+                    fi
+                  }
+
+                  # Fetch repo details for ref $1 (org/team/image:tag) using bearer token $2.
+                  # Prints {"<image>": "<details as a JSON string>"}, or nothing on any failure.
+                  function get_nim_image_details() {
+                    IFS=':' read -r -a refs <<< "$1"
+
+                    if [ ${#refs[@]} -ne 2 ]; then
+                      return
+                    fi
+
+                    name="${refs[0]}"
+                    tag="${refs[1]}"
+
+                    IFS='/' read -r -a parts <<< "$name"
+
+                    if [ ${#parts[@]} -ne 3 ]; then
+                      return
+                    fi
+                    org="${parts[0]}"
+                    team="${parts[1]}"
+                    image="${parts[2]}"
+
+                    tempfile=$(mktemp)
+
+                    http_code=$(curl -s --write-out "%{http_code}" -o $tempfile \
+                      "https://api.ngc.nvidia.com/v2/org/$org/team/$team/repos/$image?resolve-labels=true" \
+                      -H "Authorization: Bearer $2")
+
+                    if [ "${http_code}" == 200 ]; then
+                      raw_data=$(jq -r \
+                        '{name, displayName, shortDescription, namespace, tags, latestTag, updatedDate}' \
+                        $tempfile)
+                      # --arg (not --argjson) keeps the details as a JSON-encoded string value.
+                      image_data=$(jq -n --arg name "$image" --arg data "$raw_data" '{($name): ($data)}')
+                      echo "$image_data"
+                    fi
+                  }
+
+                  # Merge the details of every ref passed as arguments into one JSON object.
+                  # Prints nothing when no NGC token could be obtained.
+                  function get_image_data() {
+                    images=("$@")
+
+                    api_key=$(get_api_key)
+                    token=$(get_ngc_token "$api_key")
+
+                    if [ ! -z "$token" ]; then
+                      images_data=()
+                      i=0
+                      for image in "${images[@]}";
+                      do
+                        images_data[i]=$(get_nim_image_details "$image" "$token")
+                        i=$((i+1))
+                      done
+
+                      data='{}'
+                      for image_data in "${images_data[@]}";
+                      do
+                        data="$(jq --argjson data "$image_data" '. += $data' <<< "$data")"
+                      done
+
+                      echo "$data"
+                    fi
+                  }
+
+                  # Request a pull token for repository $1 from nvcr.io using basic credentials $2.
+                  function get_image_registry_token() {
+                    tempfile=$(mktemp)
+
+                    http_code=$(curl -s --write-out "%{http_code}" -o $tempfile \
+                      "https://nvcr.io/proxy_auth?account=\$oauthtoken&offline_token=true&scope=repository:$1:pull" \
+                      -H "Authorization: Basic $2")
+
+                    if [ "${http_code}" == 200 ]; then
+                      token=$(jq -r '.token' $tempfile)
+                      echo "$token"
+                    fi
+                  }
+
+                  # Print the manifest of image $1 at tag $2 using bearer token $3 (HTTP 200 only).
+                  function get_image_manifest() {
+                    tempfile=$(mktemp)
+
+                    http_code=$(curl -s --write-out "%{http_code}" -o $tempfile \
+                      "https://nvcr.io/v2/$1/manifests/$2" \
+                      -H "Authorization: Bearer $3")
+
+                    if [ "${http_code}" == 200 ]; then
+                      cat $tempfile
+                    fi
+                  }
+
+                  # Check the API key by fetching the manifest of image $1 at tag $2 from nvcr.io.
+                  # Prints the manifest on success and nothing otherwise.
+                  function verify_api_key() {
+                    api_key=$(get_api_key)
+                    # '%s' keeps printf from interpreting '%' or '\' inside the key.
+                    basic=$(printf '%s' "\$oauthtoken:$api_key" | base64 -w 0)
+
+                    token=$(get_image_registry_token "$1" "$basic")
+
+                    if [ ! -z "$token" ]; then
+                      manifest=$(get_image_manifest "$1" "$2" "$token")
+
+                      if [ ! -z "$manifest" ]; then
+                        echo "$manifest"
+                      fi
+                    fi
+                  }
+
+                  echo "Install jq"
+
+                  dnf install -y jq
+
+                  echo "Get NIM images"
+
+                  nim_images=$(get_nim_images)
+
+                  if [ ! -z "$nim_images" ]; then
+                    images=($nim_images)
+
+                    IFS=':' read -r -a refs <<< "${images[0]}"
+
+                    if [ ${#refs[@]} -ne 2 ]; then
+                      echo "Failed to parse NIM image name"
+
+                      failure
+                      # Stop here so the key is not checked against a malformed ref.
+                      exit 0
+                    fi
+
+                    echo "Verify Api Key"
+
+                    verification=$(verify_api_key "${refs[0]}" "${refs[1]}")
+
+                    if [ ! -z "$verification" ]; then
+                      echo "Get images data"
+
+                      nim_data=$(get_image_data "${images[@]}")
+
+                      if [ ! -z "$nim_data" ]; then
+                        echo "Enable NIM app"
+
+                        success "$nim_data"
+                      else
+                        echo "Failed to retrieve NIM image details"
+
+                        failure
+                      fi
+                    else
+                      echo "Api key verification failed"
+
+                      failure
+                    fi
+                  else
+                    echo "Failed to get NIM images"
+
+                    failure
+                  fi
+
+                  exit 0
+              volumeMounts:
+                - name: secret-volume
+                  mountPath: /etc/secret-volume
+                  readOnly: true
+              resources:
+                limits:
+                  cpu: 100m
+                  memory: 256Mi
+                requests:
+                  cpu: 100m
+                  memory: 256Mi
+          volumes:
+            - name: secret-volume
+              secret:
+                secretName: nvidia-nim-access
+          restartPolicy: Never