feat: TorchServe support (#250)
Motivation

The Triton runtime can be used with model-mesh to serve PyTorch TorchScript models, but it does not support arbitrary PyTorch models, i.e. eager mode. KServe "classic" has an integration with TorchServe, and it would be good to have one for model-mesh too so that these kinds of models can be used in distributed multi-model serving contexts.

Modifications

The bulk of the required changes are to the adapter image, covered by PR kserve/modelmesh-runtime-adapter#34.

This PR contains the minimal controller changes needed to enable the support:
- Add a TorchServe ServingRuntime spec
- Add "torchserve" to the list of supported built-in runtime types
- Add an "ID extraction" entry for TorchServe's gRPC Predictions RPC so that model-mesh will automatically extract the model name from the corresponding request messages (see the example call after this list)

Note that the supported model format is advertised as "pytorch-mar", to distinguish it from the existing "pytorch" format, which refers to raw TorchScript .pt files as supported by Triton.
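
For illustration, a minimal Predictor using the new format might look like the sketch below; the predictor name, model path, bucket, and storage secret key are all hypothetical:

```yaml
apiVersion: serving.kserve.io/v1alpha1
kind: Predictor
metadata:
  name: torchserve-mnist
spec:
  modelType:
    name: pytorch-mar        # matches the runtime's supportedModelFormats entry
  path: pytorch/mnist.mar    # hypothetical .mar archive within the bucket
  storage:
    s3:
      secretKey: localMinIO  # hypothetical key in the storage-config secret
      bucket: modelmesh-example-models
```

Because the runtime spec sets `autoSelect: true` for `pytorch-mar`, a Predictor like this should be scheduled onto the TorchServe runtime automatically.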

Result

TorchServe can be used seamlessly with ModelMesh Serving to serve PyTorch models, including eager-mode models.

Resolves #63

Signed-off-by: Nick Hill <[email protected]>
njhill authored Nov 15, 2022
1 parent 2d90aad commit bd16e5b
Showing 11 changed files with 319 additions and 3 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/run-fvt.yml
@@ -36,6 +36,7 @@ jobs:
# Update the image tag and reduce some resource request amounts to allow FVTs to run
# on reduced resource environments. Also the RollingUpdate strategy for Runtime deployments
# is adjusted for these environments.
# Disable the torchserve ServingRuntime for now (insufficient resources to run them all).
run: |
sed -i 's/newTag:.*$/newTag: '"${{ env.IMAGE_TAG }}"'/' config/manager/kustomization.yaml
sed -i '0,/cpu:.*$/s/cpu:.*$/cpu: 100m/' \
@@ -49,6 +50,7 @@ jobs:
config/runtimes/ovms-1.x.yaml
sed -i 's/maxSurge:.*$/maxSurge: 0/' config/internal/base/deployment.yaml.tmpl
sed -i 's/maxUnavailable:.*$/maxUnavailable: 100%/' config/internal/base/deployment.yaml.tmpl
echo -e '\n disabled: true' >> config/runtimes/torchserve-0.x.yaml
- name: Build Controller image
run: |
make build.develop
3 changes: 3 additions & 0 deletions README.md
@@ -1,3 +1,5 @@
[![Build and Push](https://github.com/kserve/modelmesh-serving/actions/workflows/build-and-push.yml/badge.svg)](https://github.com/kserve/modelmesh-serving/actions/workflows/build-and-push.yml)

# ModelMesh Serving

ModelMesh Serving is the Controller for managing ModelMesh, a general-purpose model serving management/routing layer.
@@ -32,6 +34,7 @@ ModelMesh Serving provides out-of-the-box integration with the following model s
- [triton-inference-server](https://github.com/triton-inference-server/server) - Nvidia's Triton Inference Server
- [seldon-mlserver](https://github.com/SeldonIO/MLServer) - Seldon's Python MLServer
- [openVINO-model-server](https://github.com/openvinotoolkit/model_server) - OpenVINO Model Server
- [torchserve](https://github.com/pytorch/serve) - TorchServe

`ServingRuntime` custom resources can be used to add support for other existing or custom-built model servers, see the docs on [implementing a custom Serving Runtime](./docs/runtimes/custom_runtimes.md)

1 change: 1 addition & 0 deletions config/default/config-defaults.yaml
@@ -55,3 +55,4 @@ builtInServerTypes:
- triton
- mlserver
- ovms
- torchserve
5 changes: 5 additions & 0 deletions config/runtimes/kustomization.yaml
@@ -15,6 +15,7 @@ resources:
- triton-2.x.yaml
- mlserver-0.x.yaml
- ovms-1.x.yaml
- torchserve-0.x.yaml

images:
- name: tritonserver-2
@@ -29,5 +30,9 @@ images:
newName: openvino/model_server
newTag: "2022.2"

- name: torchserve-0
newName: pytorch/torchserve
newTag: 0.6.0-cpu

transformers:
- ../default/metadataLabelTransformer.yaml
59 changes: 59 additions & 0 deletions config/runtimes/torchserve-0.x.yaml
@@ -0,0 +1,59 @@
# Copyright 2022 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: serving.kserve.io/v1alpha1
kind: ServingRuntime
metadata:
name: torchserve-0.x
labels:
name: modelmesh-serving-torchserve-0.x-SR
spec:
supportedModelFormats:
- name: pytorch-mar
version: "0"
autoSelect: true

multiModel: true

grpcEndpoint: "port:8085"
grpcDataEndpoint: "port:7070"

containers:
- name: torchserve
image: torchserve-0:replace
args:
# Adapter creates the config file; wait for it to exist before starting
- while [ ! -e "$TS_CONFIG_FILE" ]; do echo "waiting for config file..."; sleep 1; done;
- exec
- torchserve
- --start
- --foreground
env:
- name: TS_CONFIG_FILE
value: /models/_torchserve_models/mmconfig.properties
# TBD, this may give better performance
#- name: TS_PREFER_DIRECT_BUFFER
# value: true
# Additional TS_ prefixed TorchServe config options may be added here
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
cpu: "5"
memory: 1Gi
builtInAdapter:
serverType: torchserve
runtimeManagementPort: 7071
memBufferBytes: 134217728
modelLoadingTimeoutMillis: 90000
4 changes: 4 additions & 0 deletions controllers/modelmesh/cluster_config.go
@@ -46,6 +46,10 @@ var dataPlaneApiJsonConfigBytes = []byte(`{
"tensorflow.serving.PredictionService/Predict": {
"idExtractionPath": [1, 1],
"vModelId": true
},
"org.pytorch.serve.grpc.inference.InferenceAPIsService/Predictions": {
"idExtractionPath": [1],
"vModelId": true
}
},
"allowOtherRpcs": true
1 change: 1 addition & 0 deletions controllers/servingruntime_controller_test.go
@@ -56,6 +56,7 @@ var _ = Describe("Sample Runtime", func() {
"config/runtimes/mlserver-0.x.yaml",
"config/runtimes/triton-2.x.yaml",
"config/runtimes/ovms-1.x.yaml",
"config/runtimes/torchserve-0.x.yaml",
}
for _, f := range samplesToTest {
// capture the value in new variable for each iteration
241 changes: 241 additions & 0 deletions controllers/testdata/servingruntime_controller.golden
@@ -1009,6 +1009,247 @@ spec:
secretName: secret
status: {}
'''
"Sample Runtime config/runtimes/torchserve-0.x.yaml should be a valid runtime specification" = '''
apiVersion: apps/v1
kind: Deployment
metadata:
creationTimestamp: null
spec:
progressDeadlineSeconds: 600
replicas: 2
revisionHistoryLimit: 10
selector:
matchLabels:
modelmesh-service: modelmesh-serving
name: modelmesh-serving-torchserve-0.x
strategy:
rollingUpdate:
maxSurge: 75%
maxUnavailable: 15%
type: RollingUpdate
template:
metadata:
creationTimestamp: null
labels:
app.kubernetes.io/instance: modelmesh-controller
app.kubernetes.io/managed-by: modelmesh-controller
app.kubernetes.io/name: modelmesh-controller
modelmesh-service: modelmesh-serving
name: modelmesh-serving-torchserve-0.x
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values:
- amd64
containers:
- command:
- /opt/app/torchserve-adapter
env:
- name: ADAPTER_PORT
value: "8085"
- name: RUNTIME_PORT
value: "7071"
- name: RUNTIME_DATA_ENDPOINT
value: port:7070
- name: CONTAINER_MEM_REQ_BYTES
valueFrom:
resourceFieldRef:
containerName: torchserve
divisor: "0"
resource: requests.memory
- name: MEM_BUFFER_BYTES
value: "134217728"
- name: LOADTIME_TIMEOUT
value: "90000"
- name: USE_EMBEDDED_PULLER
value: "true"
- name: RUNTIME_VERSION
value: replace
image: image:tag
imagePullPolicy: IfNotPresent
lifecycle:
preStop:
httpGet:
path: /prestop
port: 8090
scheme: HTTP
name: torchserve-adapter
resources:
limits:
cpu: "2"
memory: 512Mi
requests:
cpu: 50m
memory: 96Mi
securityContext:
capabilities:
drop:
- ALL
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /models
name: models-dir
- mountPath: /storage-config
name: storage-config
readOnly: true
- args:
- while [ ! -e "$TS_CONFIG_FILE" ]; do echo "waiting for config file...";
sleep 1; done;
- exec
- torchserve
- --start
- --foreground
env:
- name: TS_CONFIG_FILE
value: /models/_torchserve_models/mmconfig.properties
image: torchserve-0:replace
imagePullPolicy: IfNotPresent
lifecycle:
preStop:
httpGet:
path: /prestop
port: 8090
scheme: HTTP
name: torchserve
resources:
limits:
cpu: "5"
memory: 1Gi
requests:
cpu: 500m
memory: 1Gi
securityContext:
capabilities:
drop:
- ALL
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /models
name: models-dir
- env:
- name: MM_SERVICE_NAME
value: modelmesh-serving
- name: MM_SVC_GRPC_PORT
value: "1234"
- name: WKUBE_POD_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.name
- name: WKUBE_POD_IPADDR
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: status.podIP
- name: MM_LOCATION
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: status.hostIP
- name: KV_STORE
value: etcd:/opt/kserve/mmesh/etcd/etcd_connection
- name: MM_METRICS
value: disabled
- name: SHUTDOWN_TIMEOUT_MS
value: "90000"
- name: INTERNAL_SERVING_GRPC_PORT
value: "7070"
- name: INTERNAL_GRPC_PORT
value: "8085"
- name: MM_SVC_GRPC_MAX_MSG_SIZE
value: "16777216"
- name: MM_KVSTORE_PREFIX
value: mm
- name: MM_DEFAULT_VMODEL_OWNER
value: ksp
- name: MM_LABELS
value: mt:pytorch-mar,mt:pytorch-mar:0,rt:torchserve-0.x
- name: MM_TYPE_CONSTRAINTS_PATH
value: /etc/watson/mmesh/config/type_constraints
- name: MM_DATAPLANE_CONFIG_PATH
value: /etc/watson/mmesh/config/dataplane_api_config
image: image:tag
imagePullPolicy: IfNotPresent
lifecycle:
preStop:
exec:
command:
- /opt/kserve/mmesh/stop.sh
- wait
livenessProbe:
failureThreshold: 2
httpGet:
path: /live
port: 8089
scheme: HTTP
initialDelaySeconds: 90
periodSeconds: 30
successThreshold: 1
timeoutSeconds: 5
name: mm
ports:
- containerPort: 1234
name: grpc
protocol: TCP
readinessProbe:
failureThreshold: 3
httpGet:
path: /ready
port: 8089
scheme: HTTP
initialDelaySeconds: 5
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
resources:
limits:
cpu: "3"
memory: 448Mi
requests:
cpu: 300m
memory: 448Mi
securityContext:
capabilities:
drop:
- ALL
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /etc/watson/mmesh/config
name: tc-config
- mountPath: /opt/kserve/mmesh/etcd
name: etcd-config
readOnly: true
dnsPolicy: ClusterFirst
restartPolicy: Always
schedulerName: default-scheduler
securityContext: {}
terminationGracePeriodSeconds: 90
volumes:
- emptyDir:
sizeLimit: 1536Mi
name: models-dir
- name: storage-config
secret:
defaultMode: 420
secretName: storage-config
- configMap:
defaultMode: 420
name: tc-config
name: tc-config
- name: etcd-config
secret:
defaultMode: 420
secretName: secret
status: {}
'''
"Sample Runtime config/runtimes/triton-2.x.yaml should be a valid runtime specification" = '''
apiVersion: apps/v1
kind: Deployment
2 changes: 1 addition & 1 deletion fvt/predictor/predictor_suite_test.go
@@ -72,7 +72,7 @@ var _ = SynchronizedBeforeSuite(func() []byte {
list, err = FVTClientInstance.ListClusterServingRuntimes(metav1.ListOptions{})
}
Expect(err).ToNot(HaveOccurred())
Expect(list.Items).To(HaveLen(3))
Expect(list.Items).To(HaveLen(4))

FVTClientInstance.SetDefaultUserConfigMap()
// ensure that there are no predictors to start
2 changes: 1 addition & 1 deletion fvt/scaleToZero/scaleToZero_suite_test.go
@@ -71,7 +71,7 @@ var _ = SynchronizedBeforeSuite(func() []byte {
list, err = FVTClientInstance.ListClusterServingRuntimes(metav1.ListOptions{})
}
Expect(err).ToNot(HaveOccurred())
Expect(list.Items).To(HaveLen(3))
Expect(list.Items).To(HaveLen(4))

config := map[string]interface{}{
"scaleToZero": map[string]interface{}{