Allow model scaling, k6 constant throughput tests and Prometheus/Grafana in Docker Compose install (#262)

* small update to docs * Add prometheus to docker compose * add constant rate k6 test and scale replicas for model setting * test on k8s * allow grpc in constant rate tests * review comments
SeldonIO · Jun 9, 2022 · 2d1a27d · 2d1a27d
1 parent c0ca3fe
commit 2d1a27d
Show file tree

Hide file tree

Showing 22 changed files with 1,285 additions and 16 deletions.
diff --git a/docs/source/contents/metrics/index.md b/docs/source/contents/metrics/index.md
@@ -27,7 +27,11 @@ We have a prebuilt grafana dashboard that makes use of many of the metrics that
 
 ![kafka](dashboard.png)
 
-### Installation
+### Local Use
+
+Grafana and Prometheus are available when you run Seldon locally. You will be able to connect to the Grafana dashboard at `http://localhost:3000`. Prometheus will be available at `http://localhost:9090`.
+
+### Kubernetes Installation
 
 Download the dashboard from [SCv2 dashboard](https://github.com/SeldonIO/seldon-core-v2/blob/master/prometheus/dashboards/Seldon%20Core%20Model%20Mesh%20Monitoring.json) and import it into Grafana, making sure that the data source points to the correct Prometheus store. Find more information on how to import the dashboard [here](https://grafana.com/docs/grafana/latest/dashboards/export-import/).
 

diff --git a/k8s/helm-charts/seldon-core-v2-crds/templates/seldon-v2-crds.yaml b/k8s/helm-charts/seldon-core-v2-crds/templates/seldon-v2-crds.yaml
@@ -430,11 +430,19 @@ spec:
                   that was last processed by the controller.
                 format: int64
                 type: integer
+              replicas:
+                description: Total number of replicas targeted by this
+                  model
+                format: int32
+                type: integer
             type: object
         type: object
     served: true
     storage: true
     subresources:
+      scale:
+        specReplicasPath: .spec.replicas
+        statusReplicasPath: .status.replicas
       status: {}
 status:
   acceptedNames:

diff --git a/k8s/yaml/seldon-v2-crds.yaml b/k8s/yaml/seldon-v2-crds.yaml
@@ -430,11 +430,19 @@ spec:
                   that was last processed by the controller.
                 format: int64
                 type: integer
+              replicas:
+                description: Total number of replicas targeted by this
+                  model
+                format: int32
+                type: integer
             type: object
         type: object
     served: true
     storage: true
     subresources:
+      scale:
+        specReplicasPath: .spec.replicas
+        statusReplicasPath: .status.replicas
       status: {}
 status:
   acceptedNames:

diff --git a/operator/apis/mlops/v1alpha1/model_types.go b/operator/apis/mlops/v1alpha1/model_types.go
@@ -89,13 +89,15 @@ type InferenceArtifactSpec struct {
 
 // ModelStatus defines the observed state of Model
 type ModelStatus struct {
-	// Important: Run "make" to regenerate code after modifying this file
+	// Total number of replicas targeted by this model
+	Replicas      int32 `json:"replicas,omitempty"`
 	duckv1.Status `json:",inline"`
 }
 
 //+kubebuilder:object:root=true
 //+kubebuilder:subresource:status
 //+kubebuilder:resource:shortName=mlm
+// +kubebuilder:subresource:scale:specpath=.spec.replicas,statuspath=.status.replicas
 
 // Model is the Schema for the models API
 type Model struct {

diff --git a/operator/config/crd/bases/mlops.seldon.io_models.yaml b/operator/config/crd/bases/mlops.seldon.io_models.yaml
@@ -151,11 +151,19 @@ spec:
                   that was last processed by the controller.
                 format: int64
                 type: integer
+              replicas:
+                description: Total number of replicas targeted by this
+                  model
+                format: int32
+                type: integer
             type: object
         type: object
     served: true
     storage: true
     subresources:
+      scale:
+        specReplicasPath: .spec.replicas
+        statusReplicasPath: .status.replicas
       status: {}
 status:
   acceptedNames:

diff --git a/operator/scheduler/model.go b/operator/scheduler/model.go
@@ -135,6 +135,8 @@ func (s *SchedulerClient) SubscribeModelEvents(ctx context.Context) error {
 				logger.Info("Setting model to not ready", "name", event.ModelName, "state", latestVersionStatus.State.State.String())
 				latestModel.Status.CreateAndSetCondition(v1alpha1.ModelReady, false, latestVersionStatus.State.Reason)
 			}
+			// Set the total number of replicas targeted by this model
+			latestModel.Status.Replicas = int32(latestVersionStatus.State.GetAvailableReplicas() + latestVersionStatus.State.GetUnavailableReplicas())
 			return s.updateModelStatus(latestModel)
 		})
 		if retryErr != nil {

diff --git a/samples/k8s-examples.ipynb b/samples/k8s-examples.ipynb
@@ -17,7 +17,7 @@
     {
      "data": {
       "text/plain": [
-       "'172.22.255.9'"
+       "'172.31.255.9'"
       ]
      },
      "execution_count": 1,
@@ -89,7 +89,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "id": "beneficial-logan",
    "metadata": {},
    "outputs": [
@@ -102,12 +102,12 @@
     }
    ],
    "source": [
-    "build!kubectl wait --for condition=ready --timeout=300s model --all -n seldon-mesh"
+    "!kubectl wait --for condition=ready --timeout=300s model --all -n seldon-mesh"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "id": "prepared-duration",
    "metadata": {},
    "outputs": [
@@ -118,16 +118,17 @@
       "{\r\n",
       "  \"conditions\": [\r\n",
       "    {\r\n",
-      "      \"lastTransitionTime\": \"2022-05-26T10:09:32Z\",\r\n",
+      "      \"lastTransitionTime\": \"2022-06-03T14:35:59Z\",\r\n",
       "      \"status\": \"True\",\r\n",
       "      \"type\": \"ModelReady\"\r\n",
       "    },\r\n",
       "    {\r\n",
-      "      \"lastTransitionTime\": \"2022-05-26T10:09:32Z\",\r\n",
+      "      \"lastTransitionTime\": \"2022-06-03T14:35:59Z\",\r\n",
       "      \"status\": \"True\",\r\n",
       "      \"type\": \"Ready\"\r\n",
       "    }\r\n",
-      "  ]\r\n",
+      "  ],\r\n",
+      "  \"replicas\": 1\r\n",
       "}\r\n"
      ]
     }
@@ -138,7 +139,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "67900afd",
    "metadata": {},
    "outputs": [
@@ -149,7 +150,7 @@
       "{\r\n",
       "\t\"model_name\": \"iris_1\",\r\n",
       "\t\"model_version\": \"1\",\r\n",
-      "\t\"id\": \"5890ac9d-c1ed-4343-b9e6-a460bead5fe8\",\r\n",
+      "\t\"id\": \"3be6542c-5ad2-4ebc-a0d4-842377653b5d\",\r\n",
       "\t\"parameters\": null,\r\n",
       "\t\"outputs\": [\r\n",
       "\t\t{\r\n",

diff --git a/scheduler/Dockerfile.grafana b/scheduler/Dockerfile.grafana
@@ -0,0 +1,15 @@
+FROM grafana/grafana:8.5.4
+
+# Disable Login form or not
+ENV GF_AUTH_DISABLE_LOGIN_FORM "true"
+# Allow anonymous authentication or not
+ENV GF_AUTH_ANONYMOUS_ENABLED "true"
+# Role of anonymous user
+ENV GF_AUTH_ANONYMOUS_ORG_ROLE "Admin"
+
+# Add provisioning
+ADD ./config/grafana/provisioning /etc/grafana/provisioning
+# Add configuration file
+ADD ./config/grafana/grafana.ini /etc/grafana/grafana.ini
+# Add dashboard json files
+ADD ./config/grafana/dashboards /etc/grafana/dashboards
diff --git a/scheduler/Makefile b/scheduler/Makefile
@@ -7,6 +7,8 @@ MODELGATEWAY_IMG ?= ${DOCKERHUB_USERNAME}/seldon-modelgateway:${CUSTOM_IMAGE_TAG
 PIPELINEGATEWAY_IMG ?= ${DOCKERHUB_USERNAME}/seldon-pipelinegateway:${CUSTOM_IMAGE_TAG}
 DATAFLOW_IMG ?= ${DOCKERHUB_USERNAME}/seldon-dataflow-engine:${CUSTOM_IMAGE_TAG}
 ENVOY_IMG ?= ${DOCKERHUB_USERNAME}/seldon-envoy:${CUSTOM_IMAGE_TAG}
+# Grafana image is only used for Docker Compose, not k8s
+GRAFANA_IMG ?= ${DOCKERHUB_USERNAME}/seldon-grafana:${CUSTOM_IMAGE_TAG}
 MLSERVER_IMG ?= seldonio/mlserver:1.1.0.dev3
 TRITON_IMG ?= nvcr.io/nvidia/tritonserver:21.12-py3
 KIND_NAME=ansible
@@ -143,11 +145,19 @@ docker-build-dataflow: copy-apis data-flow/opentelemetry-javaagent.jar
 docker-push-dataflow:
 	docker push ${DATAFLOW_IMG}
 
+.PHONY: docker-build-grafana
+docker-build-grafana:
+	docker build -t ${GRAFANA_IMG} -f Dockerfile.grafana .
+
+.PHONY: docker-push-grafana
+docker-push-grafana:
+	docker push ${GRAFANA_IMG}
+
 .PHONY: docker-build-all
-docker-build-all: docker-build-dataflow docker-build-agent docker-build-envoy docker-build-rclone docker-build-scheduler docker-build-modelgateway docker-build-pipelinegateway
+docker-build-all: docker-build-dataflow docker-build-agent docker-build-envoy docker-build-rclone docker-build-scheduler docker-build-modelgateway docker-build-pipelinegateway docker-build-grafana
 
 .PHONY: docker-push-all
-docker-push-all: docker-push-agent docker-push-envoy docker-push-rclone docker-push-scheduler docker-push-modelgateway docker-push-pipelinegateway docker-push-dataflow
+docker-push-all: docker-push-agent docker-push-envoy docker-push-rclone docker-push-scheduler docker-push-modelgateway docker-push-pipelinegateway docker-push-dataflow docker-push-grafana
 
 
 #####################################
@@ -203,7 +213,8 @@ DOCKER_COMPOSE_COMMON_IMAGES = \
 		RCLONE_IMAGE_AND_TAG=${RCLONE_IMG} \
 		SERVER_MLSERVER_IMAGE_AND_TAG=${MLSERVER_IMG} \
 		TRITON_LOG_LEVEL=${DOCKER_COMPOSE_TRITON_LOG_LEVEL} \
-		SERVER_TRITON_IMAGE_AND_TAG=${TRITON_IMG}
+		SERVER_TRITON_IMAGE_AND_TAG=${TRITON_IMG} \
+		GRAFANA_IMAGE_AND_TAG=${GRAFANA_IMG}
 
 DOCKER_COMPOSE_TRITON_LOG_LEVEL ?= 0
 
@@ -440,6 +451,30 @@ stop-kafka:
 start-kafka-host:
 	${DOCKER_COMPOSE_SERVICE_HOST_COMMAND} up -d kafka
 
+.PHONY: start-prometheus
+start-prometheus:
+	${DOCKER_COMPOSE_SERVICE_COMMAND} up -d prometheus
+
+.PHONY: stop-prometheus
+stop-prometheus:
+	${DOCKER_COMPOSE_SERVICE_COMMAND} rm --stop --force ${DOCKER_COMPOSE_REMOVE_VOLUMES} prometheus
+
+.PHONY: start-prometheus-host
+start-prometheus-host:
+	${DOCKER_COMPOSE_SERVICE_HOST_COMMAND} up -d prometheus
+
+.PHONY: start-grafana
+start-grafana:
+	${DOCKER_COMPOSE_SERVICE_COMMAND} up -d grafana
+
+.PHONY: stop-grafana
+stop-grafana:
+	${DOCKER_COMPOSE_SERVICE_COMMAND} rm --stop --force ${DOCKER_COMPOSE_REMOVE_VOLUMES} grafana
+
+.PHONY: start-grafana-host
+start-grafana-host:
+	${DOCKER_COMPOSE_SERVICE_HOST_COMMAND} up -d grafana
+
 .PHONY: stop-kafka-host
 stop-kafka-host:
 	${DOCKER_COMPOSE_SERVICE_HOST_COMMAND} rm --stop --force ${DOCKER_COMPOSE_REMOVE_VOLUMES} kafka

diff --git a/scheduler/all-base.yaml b/scheduler/all-base.yaml
@@ -165,3 +165,12 @@ services:
     environment:
       - ALLOW_ANONYMOUS_LOGIN=yes
 
+  prometheus:
+    image: prom/prometheus:latest
+    ports:
+      - "9090:9090"
+
+  grafana:
+    image: "${GRAFANA_IMAGE_AND_TAG}"
+    ports:
+      - 3000:3000
diff --git a/scheduler/all-host-network.yaml b/scheduler/all-host-network.yaml
@@ -164,3 +164,12 @@ services:
   zookeeper:
     ports:
       - "2181:2181"
+
+  prometheus:
+    command:
+      - --config.file=/etc/prometheus/prometheus-host.yml
+    volumes:
+      - type: bind
+        source: ./config
+        target: /etc/prometheus
+
diff --git a/scheduler/all-internal.yaml b/scheduler/all-internal.yaml
@@ -185,7 +185,6 @@ services:
 
 
   dataflow:
-    environment:
     environment:
       - SELDON_UPSTREAM_PORT=${SCHEDULER_DATAFLOW_PORT}
       - SELDON_KAFKA_BOOTSTRAP_SERVERS=kafka:${KAFKA_BROKER_INTERNAL_PORT}
@@ -218,3 +217,11 @@ services:
   zookeeper:
     ports:
       - "2181:2181"
+
+  prometheus:
+    command:
+      - --config.file=/etc/prometheus/prometheus-internal.yml
+    volumes:
+      - type: bind
+        source: ./config
+        target: /etc/prometheus