Skip to content

Commit

Permalink
Allow model scaling, k6 constant throughput tests and Prometheus/Graf…
Browse files Browse the repository at this point in the history
…ana in Docker Compose install (#262)

* small update to docs

* Add prometheus to docker compose

* add constant rate k6 test and scale replicas for model setting

* test on k8s

* allow grpc in constant rate tests

* review comments
  • Loading branch information
ukclivecox authored Jun 9, 2022
1 parent c0ca3fe commit 2d1a27d
Show file tree
Hide file tree
Showing 22 changed files with 1,285 additions and 16 deletions.
6 changes: 5 additions & 1 deletion docs/source/contents/metrics/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ We have a prebuilt grafana dashboard that makes use of many of the metrics that

![kafka](dashboard.png)

### Installation
### Local Use

Grafana and Prometheus are available when you run Seldon locally. You will be able to connect to the Grafana dashboard at `http://localhost:3000`. Prometheus will be available at `http://localhost:9090`.

### Kubernetes Installation

Download the [SCv2 dashboard](https://github.com/SeldonIO/seldon-core-v2/blob/master/prometheus/dashboards/Seldon%20Core%20Model%20Mesh%20Monitoring.json) and import it into Grafana, making sure that the data source points to the correct Prometheus store. For more information on how to import a dashboard, see the [Grafana documentation](https://grafana.com/docs/grafana/latest/dashboards/export-import/).

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -430,11 +430,19 @@ spec:
that was last processed by the controller.
format: int64
type: integer
replicas:
description: 'Important: Run "make" to regenerate code after modifying
this file'
format: int32
type: integer
type: object
type: object
served: true
storage: true
subresources:
scale:
specReplicasPath: .spec.replicas
statusReplicasPath: .status.replicas
status: {}
status:
acceptedNames:
Expand Down
8 changes: 8 additions & 0 deletions k8s/yaml/seldon-v2-crds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -430,11 +430,19 @@ spec:
that was last processed by the controller.
format: int64
type: integer
replicas:
description: 'Important: Run "make" to regenerate code after modifying
this file'
format: int32
type: integer
type: object
type: object
served: true
storage: true
subresources:
scale:
specReplicasPath: .spec.replicas
statusReplicasPath: .status.replicas
status: {}
status:
acceptedNames:
Expand Down
4 changes: 3 additions & 1 deletion operator/apis/mlops/v1alpha1/model_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,15 @@ type InferenceArtifactSpec struct {

// ModelStatus defines the observed state of Model.
type ModelStatus struct {
// Important: Run "make" to regenerate code after modifying this file
// NOTE(review): the scaffolding note above is kept in its own comment group
// so it is not read as the doc comment of the field below — confirm by
// regenerating the CRDs and checking the description of "replicas".

// Replicas is the total number of replicas targeted by this model. It is
// serialized as "replicas" and omitted from the JSON when zero.
Replicas int32 `json:"replicas,omitempty"`
// Embedded duckv1.Status, tagged to be inlined into this status object.
// presumably this carries the readiness conditions — verify against duckv1.
duckv1.Status `json:",inline"`
}

//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:resource:shortName=mlm
// +kubebuilder:subresource:scale:specpath=.spec.replicas,statuspath=.status.replicas

// Model is the Schema for the models API
type Model struct {
Expand Down
8 changes: 8 additions & 0 deletions operator/config/crd/bases/mlops.seldon.io_models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -151,11 +151,19 @@ spec:
that was last processed by the controller.
format: int64
type: integer
replicas:
description: 'Important: Run "make" to regenerate code after modifying
this file'
format: int32
type: integer
type: object
type: object
served: true
storage: true
subresources:
scale:
specReplicasPath: .spec.replicas
statusReplicasPath: .status.replicas
status: {}
status:
acceptedNames:
Expand Down
2 changes: 2 additions & 0 deletions operator/scheduler/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ func (s *SchedulerClient) SubscribeModelEvents(ctx context.Context) error {
logger.Info("Setting model to not ready", "name", event.ModelName, "state", latestVersionStatus.State.State.String())
latestModel.Status.CreateAndSetCondition(v1alpha1.ModelReady, false, latestVersionStatus.State.Reason)
}
// Set the total number of replicas targeted by this model
latestModel.Status.Replicas = int32(latestVersionStatus.State.GetAvailableReplicas() + latestVersionStatus.State.GetUnavailableReplicas())
return s.updateModelStatus(latestModel)
})
if retryErr != nil {
Expand Down
19 changes: 10 additions & 9 deletions samples/k8s-examples.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
{
"data": {
"text/plain": [
"'172.22.255.9'"
"'172.31.255.9'"
]
},
"execution_count": 1,
Expand Down Expand Up @@ -89,7 +89,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"id": "beneficial-logan",
"metadata": {},
"outputs": [
Expand All @@ -102,12 +102,12 @@
}
],
"source": [
"build!kubectl wait --for condition=ready --timeout=300s model --all -n seldon-mesh"
"!kubectl wait --for condition=ready --timeout=300s model --all -n seldon-mesh"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"id": "prepared-duration",
"metadata": {},
"outputs": [
Expand All @@ -118,16 +118,17 @@
"{\r\n",
" \"conditions\": [\r\n",
" {\r\n",
" \"lastTransitionTime\": \"2022-05-26T10:09:32Z\",\r\n",
" \"lastTransitionTime\": \"2022-06-03T14:35:59Z\",\r\n",
" \"status\": \"True\",\r\n",
" \"type\": \"ModelReady\"\r\n",
" },\r\n",
" {\r\n",
" \"lastTransitionTime\": \"2022-05-26T10:09:32Z\",\r\n",
" \"lastTransitionTime\": \"2022-06-03T14:35:59Z\",\r\n",
" \"status\": \"True\",\r\n",
" \"type\": \"Ready\"\r\n",
" }\r\n",
" ]\r\n",
" ],\r\n",
" \"replicas\": 1\r\n",
"}\r\n"
]
}
Expand All @@ -138,7 +139,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "67900afd",
"metadata": {},
"outputs": [
Expand All @@ -149,7 +150,7 @@
"{\r\n",
"\t\"model_name\": \"iris_1\",\r\n",
"\t\"model_version\": \"1\",\r\n",
"\t\"id\": \"5890ac9d-c1ed-4343-b9e6-a460bead5fe8\",\r\n",
"\t\"id\": \"3be6542c-5ad2-4ebc-a0d4-842377653b5d\",\r\n",
"\t\"parameters\": null,\r\n",
"\t\"outputs\": [\r\n",
"\t\t{\r\n",
Expand Down
15 changes: 15 additions & 0 deletions scheduler/Dockerfile.grafana
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Grafana image with Seldon provisioning, configuration and dashboards baked in.
# Login is disabled and anonymous users are granted the Admin role, so anyone
# who can reach the port has full control of this Grafana instance.
# NOTE(review): this looks intended for local Docker Compose use only — do not
# expose it on an untrusted network without changing the auth settings below.
FROM grafana/grafana:8.5.4

# Hide the login form ("true" disables it)
ENV GF_AUTH_DISABLE_LOGIN_FORM="true"
# Allow anonymous (unauthenticated) access
ENV GF_AUTH_ANONYMOUS_ENABLED="true"
# Role granted to anonymous users
ENV GF_AUTH_ANONYMOUS_ORG_ROLE="Admin"

# COPY is used instead of ADD: these are plain local files and directories, so
# ADD's archive-extraction and remote-URL behaviour is not wanted.
# Add provisioning
COPY ./config/grafana/provisioning /etc/grafana/provisioning
# Add configuration file
COPY ./config/grafana/grafana.ini /etc/grafana/grafana.ini
# Add dashboard json files
COPY ./config/grafana/dashboards /etc/grafana/dashboards
41 changes: 38 additions & 3 deletions scheduler/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ MODELGATEWAY_IMG ?= ${DOCKERHUB_USERNAME}/seldon-modelgateway:${CUSTOM_IMAGE_TAG
PIPELINEGATEWAY_IMG ?= ${DOCKERHUB_USERNAME}/seldon-pipelinegateway:${CUSTOM_IMAGE_TAG}
DATAFLOW_IMG ?= ${DOCKERHUB_USERNAME}/seldon-dataflow-engine:${CUSTOM_IMAGE_TAG}
ENVOY_IMG ?= ${DOCKERHUB_USERNAME}/seldon-envoy:${CUSTOM_IMAGE_TAG}
# Grafana image only used for Docker compose not k8s
GRAFANA_IMG ?= ${DOCKERHUB_USERNAME}/seldon-grafana:${CUSTOM_IMAGE_TAG}
MLSERVER_IMG ?= seldonio/mlserver:1.1.0.dev3
TRITON_IMG ?= nvcr.io/nvidia/tritonserver:21.12-py3
KIND_NAME=ansible
Expand Down Expand Up @@ -143,11 +145,19 @@ docker-build-dataflow: copy-apis data-flow/opentelemetry-javaagent.jar
docker-push-dataflow:
docker push ${DATAFLOW_IMG}

.PHONY: docker-build-grafana
docker-build-grafana:
docker build -t ${GRAFANA_IMG} -f Dockerfile.grafana .

.PHONY: docker-push-grafana
docker-push-grafana:
docker push ${GRAFANA_IMG}

.PHONY: docker-build-all
docker-build-all: docker-build-dataflow docker-build-agent docker-build-envoy docker-build-rclone docker-build-scheduler docker-build-modelgateway docker-build-pipelinegateway
docker-build-all: docker-build-dataflow docker-build-agent docker-build-envoy docker-build-rclone docker-build-scheduler docker-build-modelgateway docker-build-pipelinegateway docker-build-grafana

.PHONY: docker-push-all
docker-push-all: docker-push-agent docker-push-envoy docker-push-rclone docker-push-scheduler docker-push-modelgateway docker-push-pipelinegateway docker-push-dataflow
docker-push-all: docker-push-agent docker-push-envoy docker-push-rclone docker-push-scheduler docker-push-modelgateway docker-push-pipelinegateway docker-push-dataflow docker-push-grafana


#####################################
Expand Down Expand Up @@ -203,7 +213,8 @@ DOCKER_COMPOSE_COMMON_IMAGES = \
RCLONE_IMAGE_AND_TAG=${RCLONE_IMG} \
SERVER_MLSERVER_IMAGE_AND_TAG=${MLSERVER_IMG} \
TRITON_LOG_LEVEL=${DOCKER_COMPOSE_TRITON_LOG_LEVEL} \
SERVER_TRITON_IMAGE_AND_TAG=${TRITON_IMG}
SERVER_TRITON_IMAGE_AND_TAG=${TRITON_IMG} \
GRAFANA_IMAGE_AND_TAG=${GRAFANA_IMG}

DOCKER_COMPOSE_TRITON_LOG_LEVEL ?= 0

Expand Down Expand Up @@ -440,6 +451,30 @@ stop-kafka:
start-kafka-host:
${DOCKER_COMPOSE_SERVICE_HOST_COMMAND} up -d kafka

.PHONY: start-prometheus
start-prometheus:
${DOCKER_COMPOSE_SERVICE_COMMAND} up -d prometheus

.PHONY: stop-prometheus
stop-prometheus:
${DOCKER_COMPOSE_SERVICE_COMMAND} rm --stop --force ${DOCKER_COMPOSE_REMOVE_VOLUMES} prometheus

.PHONY: start-prometheus-host
start-prometheus-host:
${DOCKER_COMPOSE_SERVICE_HOST_COMMAND} up -d prometheus

.PHONY: start-grafana
start-grafana:
${DOCKER_COMPOSE_SERVICE_COMMAND} up -d grafana

.PHONY: stop-grafana
stop-grafana:
${DOCKER_COMPOSE_SERVICE_COMMAND} rm --stop --force ${DOCKER_COMPOSE_REMOVE_VOLUMES} grafana

.PHONY: start-grafana-host
start-grafana-host:
${DOCKER_COMPOSE_SERVICE_HOST_COMMAND} up -d grafana

.PHONY: stop-kafka-host
stop-kafka-host:
${DOCKER_COMPOSE_SERVICE_HOST_COMMAND} rm --stop --force ${DOCKER_COMPOSE_REMOVE_VOLUMES} kafka
Expand Down
9 changes: 9 additions & 0 deletions scheduler/all-base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,12 @@ services:
environment:
- ALLOW_ANONYMOUS_LOGIN=yes

prometheus:
image: prom/prometheus:latest
ports:
- "9090:9090"

grafana:
image: "${GRAFANA_IMAGE_AND_TAG}"
ports:
- 3000:3000
9 changes: 9 additions & 0 deletions scheduler/all-host-network.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,12 @@ services:
zookeeper:
ports:
- "2181:2181"

prometheus:
command:
- --config.file=/etc/prometheus/prometheus-host.yml
volumes:
- type: bind
source: ./config
target: /etc/prometheus

9 changes: 8 additions & 1 deletion scheduler/all-internal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,6 @@ services:


dataflow:
environment:
environment:
- SELDON_UPSTREAM_PORT=${SCHEDULER_DATAFLOW_PORT}
- SELDON_KAFKA_BOOTSTRAP_SERVERS=kafka:${KAFKA_BROKER_INTERNAL_PORT}
Expand Down Expand Up @@ -218,3 +217,11 @@ services:
zookeeper:
ports:
- "2181:2181"

prometheus:
command:
- --config.file=/etc/prometheus/prometheus-internal.yml
volumes:
- type: bind
source: ./config
target: /etc/prometheus
Loading

0 comments on commit 2d1a27d

Please sign in to comment.