diff --git a/doc/source/graph/annotations.md b/doc/source/graph/annotations.md index 820a8bf3e3..5d4bbfc48d 100644 --- a/doc/source/graph/annotations.md +++ b/doc/source/graph/annotations.md @@ -33,11 +33,15 @@ You can configure aspects of Seldon Core via annotations in the SeldonDeployment ### Service Orchestrator * ```seldon.io/engine-separate-pod``` : Use a separate pod for the service orchestrator - * Locations : SeldonDeployment.spec.annotations + * Locations : SeldonDeployment.metadata.annotations, SeldonDeployment.spec.annotations * [Separate svc-orc pod example](model_svcorch_sep.md) * ```seldon.io/headless-svc``` : Run main endpoint as headless kubernetes service. This is required for gRPC load balancing via Ambassador. - * Locations : SeldonDeployment.spec.annotations + * Locations : SeldonDeployment.metadata.annotations, SeldonDeployment.spec.annotations * [gRPC headless example](grpc_load_balancing_ambassador.md) + * ```seldon.io/executor-logger-queue-size``` : Size of request logging worker queue + * Locations: SeldonDeployment.metadata.annotations, SeldonDeployment.spec.annotations + * ```seldon.io/executor-logger-write-timeout-ms``` : Write timeout for adding to logging work queue + * Locations: SeldonDeployment.metadata.annotations, SeldonDeployment.spec.annotations ### Misc diff --git a/doc/source/reference/upgrading.md b/doc/source/reference/upgrading.md index 9585e9dc49..4df2261506 100644 --- a/doc/source/reference/upgrading.md +++ b/doc/source/reference/upgrading.md @@ -19,6 +19,18 @@ Seldon Core adds support for Kubernetes 1.22 by upgrading all ValidatingWebhookC * Access required to modify files in the local folder are required so the application folder should be writable * The default base image now changes the owner of the /microservice folder to user 8888 +### Updated executor request logger settings + +The request logging from the executor now has a configurable queue size and write timeout. 
This will allow a tradeoff between pending request memory usage and failing requests when sending to various logging endpoints that may be slow. The write timeout will mean logging of requests will fail if waiting for more than the given time to be added to the work queue. The two settings are: + + * `executor.requestLogger.workQueueSize` (default 10000) + * `executor.requestLogger.writeTimeoutMs` (default 2000) + +It is also possible to update these values on a per SeldonDeployment basis with the annotations: + + * `seldon.io/executor-logger-queue-size` + * `seldon.io/executor-logger-write-timeout-ms` + ## Upgrading to 1.11 ### Python S2I Wrapper diff --git a/executor/cmd/executor/main.go b/executor/cmd/executor/main.go index c8c88a8f2c..b9d29cf2f4 100644 --- a/executor/cmd/executor/main.go +++ b/executor/cmd/executor/main.go @@ -71,8 +71,8 @@ var ( filename = flag.String("file", "", "Load graph from file") hostname = flag.String("hostname", "", "The hostname of the running server") logWorkers = flag.Int("logger_workers", 10, "Number of workers handling payload logging") - logWorkBufferSize = flag.Int("log_work_buffer_size", 10000, "Limit of buffered logs in memory while waiting for downstream request ingestion") - logWriteTimeoutMs = flag.Int("log_write_timeout_ms", 2000, "Timeout before giving up writing log if buffer is full. If <= 0 will immediately drop log on full log buffer.") + logWorkBufferSize = flag.Int("log_work_buffer_size", loghandler.DefaultWorkQueueSize, "Limit of buffered logs in memory while waiting for downstream request ingestion") + logWriteTimeoutMs = flag.Int("log_write_timeout_ms", loghandler.DefaultWriteTimeoutMilliseconds, "Timeout before giving up writing log if buffer is full. 
If <= 0 will immediately drop log on full log buffer.") prometheusPath = flag.String("prometheus_path", "/metrics", "The prometheus metrics path") kafkaBroker = flag.String("kafka_broker", "", "The kafka broker as host:port") kafkaTopicIn = flag.String("kafka_input_topic", "", "The kafka input topic") diff --git a/helm-charts/seldon-abtest/README.md b/helm-charts/seldon-abtest/README.md index a1b11f6c8f..064b9f1413 100644 --- a/helm-charts/seldon-abtest/README.md +++ b/helm-charts/seldon-abtest/README.md @@ -38,10 +38,10 @@ helm install $MY_MODEL_NAME seldonio/seldon-abtest --namespace $MODELS_NAMESPACE | Key | Type | Default | Description | |-----|------|---------|-------------| | modela.image.name | string | `"seldonio/mock_classifier"` | | -| modela.image.version | string | `"1.9.0"` | | +| modela.image.version | string | `"1.12.0-dev"` | | | modela.name | string | `"classifier-1"` | | | modelb.image.name | string | `"seldonio/mock_classifier"` | | -| modelb.image.version | string | `"1.9.0"` | | +| modelb.image.version | string | `"1.12.0-dev"` | | | modelb.name | string | `"classifier-2"` | | | predictor.name | string | `"default"` | | | replicas | int | `1` | | diff --git a/helm-charts/seldon-benchmark-workflow/README.md b/helm-charts/seldon-benchmark-workflow/README.md index 47e2829975..a893c49ef7 100644 --- a/helm-charts/seldon-benchmark-workflow/README.md +++ b/helm-charts/seldon-benchmark-workflow/README.md @@ -1,3 +1,67 @@ -# Seldon Batch Workflow +# seldon-benchmark-workflow -This chart creates a batch workflow which leverages the seldon batch processor functionality. 
+![Version: 0.1](https://img.shields.io/static/v1?label=Version&message=0.1&color=informational&style=flat-square) + +Seldon Benchmark Workflow + +## Usage + +To use this chart, you will first need to add the `seldonio` Helm repo: + +```bash +helm repo add seldonio https://storage.googleapis.com/seldon-charts +helm repo update +``` + +Once that's done, you should then be able to use the inference graph template as: + +```bash +helm template $MY_MODEL_NAME seldonio/seldon-benchmark-workflow --namespace $MODELS_NAMESPACE +``` + +Note that you can also deploy the inference graph directly to your cluster +using: + +```bash +helm install $MY_MODEL_NAME seldonio/seldon-benchmark-workflow --namespace $MODELS_NAMESPACE +``` + +## Source Code + +* + +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| benchmark.concurrency | int | `1` | | +| benchmark.cpu | int | `4` | | +| benchmark.data | string | `"{\"data\": {\"ndarray\": [[0,1,2,3]]}}"` | | +| benchmark.duration | string | `"30s"` | | +| benchmark.grpcDataOverride | string | `nil` | | +| benchmark.grpcImage | string | `"seldonio/ghz:v0.95.0"` | | +| benchmark.host | string | `"istio-ingressgateway.istio-system.svc.cluster.local:80"` | | +| benchmark.rate | int | `0` | | +| benchmark.restImage | string | `"peterevans/vegeta:latest-vegeta12.8.4"` | | +| seldonDeployment.apiType | string | `"rest"` | | +| seldonDeployment.disableOrchestrator | bool | `false` | | +| seldonDeployment.enableResources | string | `"false"` | | +| seldonDeployment.image | string | `nil` | | +| seldonDeployment.limits.cpu | string | `"50m"` | | +| seldonDeployment.limits.memory | string | `"1000Mi"` | | +| seldonDeployment.modelName | string | `"classifier"` | | +| seldonDeployment.modelUri | string | `nil` | | +| seldonDeployment.name | string | `"seldon-{{workflow.uid}}"` | | +| seldonDeployment.protocol | string | `"seldon"` | | +| seldonDeployment.replicas | int | `2` | | +| 
seldonDeployment.requests.cpu | string | `"50m"` | | +| seldonDeployment.requests.memory | string | `"100Mi"` | | +| seldonDeployment.server | string | `nil` | | +| seldonDeployment.serverThreads | int | `1` | | +| seldonDeployment.serverWorkers | int | `4` | | +| seldonDeployment.waitTime | int | `5` | | +| workflow.name | string | `"seldon-benchmark-process"` | | +| workflow.namespace | string | `"default"` | | +| workflow.parallelism | int | `1` | | +| workflow.paramDelimiter | string | `"|"` | | +| workflow.useNameAsGenerateName | string | `"false"` | | diff --git a/helm-charts/seldon-core-analytics/README.md b/helm-charts/seldon-core-analytics/README.md index 287888c66a..09e4a61434 100644 --- a/helm-charts/seldon-core-analytics/README.md +++ b/helm-charts/seldon-core-analytics/README.md @@ -1,6 +1,6 @@ # seldon-core-analytics -![Version: 1.9.0](https://img.shields.io/static/v1?label=Version&message=1.9.0&color=informational&style=flat-square) +![Version: 1.12.0-dev](https://img.shields.io/static/v1?label=Version&message=1.12.0--dev&color=informational&style=flat-square) Prometheus and Grafana installation with a basic Grafana dashboard showing the default Prometheus metrics exposed by Seldon for each inference graph diff --git a/helm-charts/seldon-core-operator/README.md b/helm-charts/seldon-core-operator/README.md index 068e2f60d7..7884824f7f 100644 --- a/helm-charts/seldon-core-operator/README.md +++ b/helm-charts/seldon-core-operator/README.md @@ -1,6 +1,6 @@ # seldon-core-operator -![Version: 1.9.0](https://img.shields.io/static/v1?label=Version&message=1.9.0&color=informational&style=flat-square) +![Version: 1.12.0-dev](https://img.shields.io/static/v1?label=Version&message=1.12.0--dev&color=informational&style=flat-square) Seldon Core CRD and controller helm chart for Kubernetes. 
@@ -34,10 +34,10 @@ helm install seldon-core-operator seldonio/seldon-core-operator --namespace seld | ambassador.singleNamespace | bool | `false` | | | certManager.enabled | bool | `false` | | | controllerId | string | `""` | | +| crd.annotations | object | `{}` | | | crd.create | bool | `true` | | | crd.forceV1 | bool | `false` | | | crd.forceV1beta1 | bool | `false` | | -| crd.annotations | map | `{}` | Annotations to add to the CRD | | credentials.gcs.gcsCredentialFileName | string | `"gcloud-application-credentials.json"` | | | credentials.s3.s3AccessKeyIDName | string | `"awsAccessKeyID"` | | | credentials.s3.s3SecretAccessKeyName | string | `"awsSecretAccessKey"` | | @@ -46,7 +46,7 @@ helm install seldon-core-operator seldonio/seldon-core-operator --namespace seld | engine.image.pullPolicy | string | `"IfNotPresent"` | | | engine.image.registry | string | `"docker.io"` | | | engine.image.repository | string | `"seldonio/engine"` | | -| engine.image.tag | string | `"1.9.0"` | | +| engine.image.tag | string | `"1.12.0-dev"` | | | engine.logMessagesExternally | bool | `false` | | | engine.port | int | `8000` | | | engine.prometheus.path | string | `"/prometheus"` | | @@ -59,34 +59,36 @@ helm install seldon-core-operator seldonio/seldon-core-operator --namespace seld | executor.image.pullPolicy | string | `"IfNotPresent"` | | | executor.image.registry | string | `"docker.io"` | | | executor.image.repository | string | `"seldonio/seldon-core-executor"` | | -| executor.image.tag | string | `"1.9.0"` | | +| executor.image.tag | string | `"1.12.0-dev"` | | | executor.metricsPortName | string | `"metrics"` | | | executor.port | int | `8000` | | | executor.prometheus.path | string | `"/prometheus"` | | | executor.requestLogger.defaultEndpoint | string | `"http://default-broker"` | | +| executor.requestLogger.workQueueSize | int | `10000` | | +| executor.requestLogger.writeTimeoutMs | int | `2000` | | | executor.resources.cpuLimit | string | `"500m"` | | | 
executor.resources.cpuRequest | string | `"500m"` | | | executor.resources.memoryLimit | string | `"512Mi"` | | | executor.resources.memoryRequest | string | `"512Mi"` | | | executor.serviceAccount.name | string | `"default"` | | | executor.user | int | `8888` | | -| explainer.image | string | `"seldonio/alibiexplainer:1.9.0"` | | +| explainer.image | string | `"seldonio/alibiexplainer:1.12.0-dev"` | | | image.pullPolicy | string | `"IfNotPresent"` | | | image.registry | string | `"docker.io"` | | | image.repository | string | `"seldonio/seldon-core-operator"` | | -| image.tag | string | `"1.9.0"` | | +| image.tag | string | `"1.12.0-dev"` | | | istio.enabled | bool | `false` | | | istio.gateway | string | `"istio-system/seldon-gateway"` | | | istio.tlsMode | string | `""` | | | keda.enabled | bool | `false` | | | kubeflow | bool | `false` | | +| manager.annotations | object | `{}` | | | manager.cpuLimit | string | `"500m"` | | | manager.cpuRequest | string | `"100m"` | | -| manager.logLevel | string | `"INFO"` | | | manager.leaderElectionID | string | `"a33bd623.machinelearning.seldon.io"` | | +| manager.logLevel | string | `"INFO"` | | | manager.memoryLimit | string | `"300Mi"` | | | manager.memoryRequest | string | `"200Mi"` | | -| manager.annotations | map | `{}` | Annotations to add to the deployment template spec | | managerCreateResources | bool | `false` | | | managerUserID | int | `8888` | | | namespaceOverride | string | `""` | | @@ -94,23 +96,25 @@ helm install seldon-core-operator seldonio/seldon-core-operator --namespace seld | predictiveUnit.grpcPort | int | `9500` | | | predictiveUnit.httpPort | int | `9000` | | | predictiveUnit.metricsPortName | string | `"metrics"` | | -| predictor_servers.MLFLOW_SERVER.protocols.seldon.defaultImageVersion | string | `"1.9.0"` | | +| predictor_servers.MLFLOW_SERVER.protocols.kfserving.defaultImageVersion | string | `"0.5.0"` | | +| predictor_servers.MLFLOW_SERVER.protocols.kfserving.image | string | 
`"seldonio/mlserver"` | | +| predictor_servers.MLFLOW_SERVER.protocols.seldon.defaultImageVersion | string | `"1.12.0-dev"` | | | predictor_servers.MLFLOW_SERVER.protocols.seldon.image | string | `"seldonio/mlflowserver"` | | -| predictor_servers.SKLEARN_SERVER.protocols.kfserving.defaultImageVersion | string | `"0.3.2"` | | +| predictor_servers.SKLEARN_SERVER.protocols.kfserving.defaultImageVersion | string | `"0.5.0"` | | | predictor_servers.SKLEARN_SERVER.protocols.kfserving.image | string | `"seldonio/mlserver"` | | -| predictor_servers.SKLEARN_SERVER.protocols.seldon.defaultImageVersion | string | `"1.9.0"` | | +| predictor_servers.SKLEARN_SERVER.protocols.seldon.defaultImageVersion | string | `"1.12.0-dev"` | | | predictor_servers.SKLEARN_SERVER.protocols.seldon.image | string | `"seldonio/sklearnserver"` | | -| predictor_servers.TEMPO_SERVER.protocols.kfserving.defaultImageVersion | string | `"0.3.2"` | | +| predictor_servers.TEMPO_SERVER.protocols.kfserving.defaultImageVersion | string | `"0.5.0"` | | | predictor_servers.TEMPO_SERVER.protocols.kfserving.image | string | `"seldonio/mlserver"` | | -| predictor_servers.TENSORFLOW_SERVER.protocols.seldon.defaultImageVersion | string | `"1.9.0"` | | +| predictor_servers.TENSORFLOW_SERVER.protocols.seldon.defaultImageVersion | string | `"1.12.0-dev"` | | | predictor_servers.TENSORFLOW_SERVER.protocols.seldon.image | string | `"seldonio/tfserving-proxy"` | | | predictor_servers.TENSORFLOW_SERVER.protocols.tensorflow.defaultImageVersion | string | `"2.1.0"` | | | predictor_servers.TENSORFLOW_SERVER.protocols.tensorflow.image | string | `"tensorflow/serving"` | | | predictor_servers.TRITON_SERVER.protocols.kfserving.defaultImageVersion | string | `"21.08-py3"` | | | predictor_servers.TRITON_SERVER.protocols.kfserving.image | string | `"nvcr.io/nvidia/tritonserver"` | | -| predictor_servers.XGBOOST_SERVER.protocols.kfserving.defaultImageVersion | string | `"0.3.2"` | | +| 
predictor_servers.XGBOOST_SERVER.protocols.kfserving.defaultImageVersion | string | `"0.5.0"` | | | predictor_servers.XGBOOST_SERVER.protocols.kfserving.image | string | `"seldonio/mlserver"` | | -| predictor_servers.XGBOOST_SERVER.protocols.seldon.defaultImageVersion | string | `"1.9.0"` | | +| predictor_servers.XGBOOST_SERVER.protocols.seldon.defaultImageVersion | string | `"1.12.0-dev"` | | | predictor_servers.XGBOOST_SERVER.protocols.seldon.image | string | `"seldonio/xgboostserver"` | | | rbac.configmap.create | bool | `true` | | | rbac.create | bool | `true` | | @@ -119,7 +123,7 @@ helm install seldon-core-operator seldonio/seldon-core-operator --namespace seld | singleNamespace | bool | `false` | | | storageInitializer.cpuLimit | string | `"1"` | | | storageInitializer.cpuRequest | string | `"100m"` | | -| storageInitializer.image | string | `"seldonio/rclone-storage-initializer:1.9.0"` | | +| storageInitializer.image | string | `"seldonio/rclone-storage-initializer:1.12.0-dev"` | | | storageInitializer.memoryLimit | string | `"1Gi"` | | | storageInitializer.memoryRequest | string | `"100Mi"` | | | usageMetrics.enabled | bool | `false` | | diff --git a/helm-charts/seldon-core-operator/templates/deployment_seldon-controller-manager.yaml b/helm-charts/seldon-core-operator/templates/deployment_seldon-controller-manager.yaml index de95e66b7c..4391326e4c 100644 --- a/helm-charts/seldon-core-operator/templates/deployment_seldon-controller-manager.yaml +++ b/helm-charts/seldon-core-operator/templates/deployment_seldon-controller-manager.yaml @@ -135,6 +135,10 @@ spec: value: '{{ .Values.executor.metricsPortName }}' - name: EXECUTOR_REQUEST_LOGGER_DEFAULT_ENDPOINT value: '{{ .Values.executor.requestLogger.defaultEndpoint }}' + - name: EXECUTOR_REQUEST_LOGGER_WORK_QUEUE_SIZE + value: '{{ .Values.executor.requestLogger.workQueueSize }}' + - name: EXECUTOR_REQUEST_LOGGER_WRITE_TIMEOUT_MS + value: '{{ .Values.executor.requestLogger.writeTimeoutMs }}' - name: 
DEFAULT_USER_ID value: '{{ .Values.defaultUserID }}' - name: EXECUTOR_DEFAULT_CPU_REQUEST diff --git a/helm-charts/seldon-core-operator/values.yaml b/helm-charts/seldon-core-operator/values.yaml index 72f7a780e2..24147a4623 100644 --- a/helm-charts/seldon-core-operator/values.yaml +++ b/helm-charts/seldon-core-operator/values.yaml @@ -66,6 +66,8 @@ executor: # For more information see the Production Integration for Payload Request Logging with ELK in the docs requestLogger: defaultEndpoint: 'http://default-broker' + workQueueSize: 10000 + writeTimeoutMs: 2000 # ## Seldon Core Controller Manager Options image: diff --git a/helm-charts/seldon-mab/README.md b/helm-charts/seldon-mab/README.md index 9923fe4f39..ff51e4f10c 100644 --- a/helm-charts/seldon-mab/README.md +++ b/helm-charts/seldon-mab/README.md @@ -47,17 +47,17 @@ helm install $MY_MODEL_NAME seldonio/seldon-mab --namespace $MODELS_NAMESPACE | mab.branches | int | `2` | | | mab.epsilon | float | `0.2` | | | mab.image.name | string | `"seldonio/mab_epsilon_greedy"` | | -| mab.image.version | string | `"1.9.0"` | | +| mab.image.version | string | `"1.12.0-dev"` | | | mab.name | string | `"eg-router"` | | | mab.verbose | int | `1` | | | modela.image.name | string | `"seldonio/mock_classifier"` | | -| modela.image.version | string | `"1.9.0"` | | +| modela.image.version | string | `"1.12.0-dev"` | | | modela.name | string | `"classifier-1"` | | | modelb.image.name | string | `"seldonio/mock_classifier"` | | -| modelb.image.version | string | `"1.9.0"` | | +| modelb.image.version | string | `"1.12.0-dev"` | | | modelb.name | string | `"classifier-2"` | | | predictor.name | string | `"default"` | | | predictorLabels.fluentd | string | `"true"` | | -| predictorLabels.version | string | `"1.9.0"` | | +| predictorLabels.version | string | `"1.12.0-dev"` | | | replicas | int | `1` | | | sdepLabels.app | string | `"seldon"` | | diff --git a/operator/apis/machinelearning.seldon.io/v1/seldondeployment_types.go 
b/operator/apis/machinelearning.seldon.io/v1/seldondeployment_types.go index aeb251f7ed..5aaf9bde4d 100644 --- a/operator/apis/machinelearning.seldon.io/v1/seldondeployment_types.go +++ b/operator/apis/machinelearning.seldon.io/v1/seldondeployment_types.go @@ -66,12 +66,14 @@ const ( ENV_SELDON_DEPLOYMENT_ID = "SELDON_DEPLOYMENT_ID" ENV_SELDON_EXECUTOR_ENABLED = "SELDON_EXECUTOR_ENABLED" - ANNOTATION_JAVA_OPTS = "seldon.io/engine-java-opts" - ANNOTATION_SEPARATE_ENGINE = "seldon.io/engine-separate-pod" - ANNOTATION_HEADLESS_SVC = "seldon.io/headless-svc" - ANNOTATION_NO_ENGINE = "seldon.io/no-engine" - ANNOTATION_CUSTOM_SVC_NAME = "seldon.io/svc-name" - ANNOTATION_EXECUTOR = "seldon.io/executor" + ANNOTATION_JAVA_OPTS = "seldon.io/engine-java-opts" + ANNOTATION_SEPARATE_ENGINE = "seldon.io/engine-separate-pod" + ANNOTATION_HEADLESS_SVC = "seldon.io/headless-svc" + ANNOTATION_NO_ENGINE = "seldon.io/no-engine" + ANNOTATION_CUSTOM_SVC_NAME = "seldon.io/svc-name" + ANNOTATION_EXECUTOR = "seldon.io/executor" + ANNOTATION_LOGGER_WORK_QUEUE_SIZE = "seldon.io/executor-logger-queue-size" + ANNOTATION_LOGGER_WRITE_TIMEOUT_MS = "seldon.io/executor-logger-write-timeout-ms" DeploymentNamePrefix = "seldon" ) diff --git a/operator/config/manager/manager.yaml b/operator/config/manager/manager.yaml index f6a07e0ca3..33be170954 100644 --- a/operator/config/manager/manager.yaml +++ b/operator/config/manager/manager.yaml @@ -125,6 +125,10 @@ spec: value: "metrics" - name: EXECUTOR_REQUEST_LOGGER_DEFAULT_ENDPOINT value: "http://default-broker" + - name: EXECUTOR_REQUEST_LOGGER_WORK_QUEUE_SIZE + value: "10000" + - name: EXECUTOR_REQUEST_LOGGER_WRITE_TIMEOUT_MS + value: "2000" - name: DEFAULT_USER_ID value: '' - name: EXECUTOR_DEFAULT_CPU_REQUEST diff --git a/operator/constants/constants.go b/operator/constants/constants.go index 71599d6837..d9ceab8a46 100644 --- a/operator/constants/constants.go +++ b/operator/constants/constants.go @@ -79,12 +79,14 @@ const ( // Default resources const 
( - DefaultExecutorCpuRequest = "0.5" - DefaultExecutorCpuLimit = "0.5" - DefaultExecutorMemoryRequest = "512Mi" - DefaultExecutorMemoryLimit = "512Mi" - DefaultEngineCpuRequest = "0.5" - DefaultEngineCpuLimit = "0.5" - DefaultEngineMemoryRequest = "512Mi" - DefaultEngineMemoryLimit = "512Mi" + DefaultExecutorCpuRequest = "0.5" + DefaultExecutorCpuLimit = "0.5" + DefaultExecutorMemoryRequest = "512Mi" + DefaultExecutorMemoryLimit = "512Mi" + DefaultEngineCpuRequest = "0.5" + DefaultEngineCpuLimit = "0.5" + DefaultEngineMemoryRequest = "512Mi" + DefaultEngineMemoryLimit = "512Mi" + DefaultExecutorReqLoggerWorkQueueSize = "10000" + DefaultExecutorReqLoggerWriteTimeoutMs = "2000" ) diff --git a/operator/controllers/seldondeployment_engine.go b/operator/controllers/seldondeployment_engine.go index f6706241eb..dec4e7a925 100644 --- a/operator/controllers/seldondeployment_engine.go +++ b/operator/controllers/seldondeployment_engine.go @@ -33,22 +33,24 @@ import ( ) const ( - ENV_DEFAULT_EXECUTOR_SERVER_PORT = "EXECUTOR_SERVER_PORT" - ENV_DEFAULT_EXECUTOR_SERVER_GRPC_PORT = "EXECUTOR_SERVER_GRPC_PORT" - ENV_DEFAULT_EXECUTOR_CPU_REQUEST = "EXECUTOR_DEFAULT_CPU_REQUEST" - ENV_DEFAULT_EXECUTOR_MEMORY_REQUEST = "EXECUTOR_DEFAULT_MEMORY_REQUEST" - ENV_DEFAULT_EXECUTOR_CPU_LIMIT = "EXECUTOR_DEFAULT_CPU_LIMIT" - ENV_DEFAULT_EXECUTOR_MEMORY_LIMIT = "EXECUTOR_DEFAULT_MEMORY_LIMIT" - ENV_DEFAULT_ENGINE_CPU_REQUEST = "ENGINE_DEFAULT_CPU_REQUEST" - ENV_DEFAULT_ENGINE_MEMORY_REQUEST = "ENGINE_DEFAULT_MEMORY_REQUEST" - ENV_DEFAULT_ENGINE_CPU_LIMIT = "ENGINE_DEFAULT_CPU_LIMIT" - ENV_DEFAULT_ENGINE_MEMORY_LIMIT = "ENGINE_DEFAULT_MEMORY_LIMIT" - ENV_EXECUTOR_METRICS_PORT_NAME = "EXECUTOR_SERVER_METRICS_PORT_NAME" - ENV_EXECUTOR_PROMETHEUS_PATH = "EXECUTOR_PROMETHEUS_PATH" - ENV_ENGINE_PROMETHEUS_PATH = "ENGINE_PROMETHEUS_PATH" - ENV_EXECUTOR_USER = "EXECUTOR_CONTAINER_USER" - ENV_ENGINE_USER = "ENGINE_CONTAINER_USER" - ENV_USE_EXECUTOR = "USE_EXECUTOR" + ENV_DEFAULT_EXECUTOR_SERVER_PORT = 
"EXECUTOR_SERVER_PORT" + ENV_DEFAULT_EXECUTOR_SERVER_GRPC_PORT = "EXECUTOR_SERVER_GRPC_PORT" + ENV_DEFAULT_EXECUTOR_CPU_REQUEST = "EXECUTOR_DEFAULT_CPU_REQUEST" + ENV_DEFAULT_EXECUTOR_MEMORY_REQUEST = "EXECUTOR_DEFAULT_MEMORY_REQUEST" + ENV_DEFAULT_EXECUTOR_CPU_LIMIT = "EXECUTOR_DEFAULT_CPU_LIMIT" + ENV_DEFAULT_EXECUTOR_MEMORY_LIMIT = "EXECUTOR_DEFAULT_MEMORY_LIMIT" + ENV_DEFAULT_ENGINE_CPU_REQUEST = "ENGINE_DEFAULT_CPU_REQUEST" + ENV_DEFAULT_ENGINE_MEMORY_REQUEST = "ENGINE_DEFAULT_MEMORY_REQUEST" + ENV_DEFAULT_ENGINE_CPU_LIMIT = "ENGINE_DEFAULT_CPU_LIMIT" + ENV_DEFAULT_ENGINE_MEMORY_LIMIT = "ENGINE_DEFAULT_MEMORY_LIMIT" + ENV_EXECUTOR_METRICS_PORT_NAME = "EXECUTOR_SERVER_METRICS_PORT_NAME" + ENV_EXECUTOR_PROMETHEUS_PATH = "EXECUTOR_PROMETHEUS_PATH" + ENV_EXECUTOR_REQUEST_LOGGER_WORK_QUEUE_SIZE = "EXECUTOR_REQUEST_LOGGER_WORK_QUEUE_SIZE" + ENV_EXECUTOR_REQUEST_LOGGER_WRITE_TIMEOUT_MS = "EXECUTOR_REQUEST_LOGGER_WRITE_TIMEOUT_MS" + ENV_ENGINE_PROMETHEUS_PATH = "ENGINE_PROMETHEUS_PATH" + ENV_EXECUTOR_USER = "EXECUTOR_CONTAINER_USER" + ENV_ENGINE_USER = "ENGINE_CONTAINER_USER" + ENV_USE_EXECUTOR = "USE_EXECUTOR" DEFAULT_EXECUTOR_CONTAINER_PORT = 8000 DEFAULT_EXECUTOR_GRPC_PORT = 5001 @@ -71,10 +73,12 @@ var ( executorMetricsPortName = utils.GetEnv(ENV_EXECUTOR_METRICS_PORT_NAME, constants.DefaultMetricsPortName) - executorDefaultCpuRequest = utils.GetEnv(ENV_DEFAULT_EXECUTOR_CPU_REQUEST, constants.DefaultExecutorCpuRequest) - executorDefaultCpuLimit = utils.GetEnv(ENV_DEFAULT_EXECUTOR_CPU_LIMIT, constants.DefaultExecutorCpuLimit) - executorDefaultMemoryRequest = utils.GetEnv(ENV_DEFAULT_EXECUTOR_MEMORY_REQUEST, constants.DefaultExecutorMemoryRequest) - executorDefaultMemoryLimit = utils.GetEnv(ENV_DEFAULT_EXECUTOR_MEMORY_LIMIT, constants.DefaultExecutorMemoryLimit) + executorDefaultCpuRequest = utils.GetEnv(ENV_DEFAULT_EXECUTOR_CPU_REQUEST, constants.DefaultExecutorCpuRequest) + executorDefaultCpuLimit = utils.GetEnv(ENV_DEFAULT_EXECUTOR_CPU_LIMIT, 
constants.DefaultExecutorCpuLimit) + executorDefaultMemoryRequest = utils.GetEnv(ENV_DEFAULT_EXECUTOR_MEMORY_REQUEST, constants.DefaultExecutorMemoryRequest) + executorDefaultMemoryLimit = utils.GetEnv(ENV_DEFAULT_EXECUTOR_MEMORY_LIMIT, constants.DefaultExecutorMemoryLimit) + executorReqLoggerWorkQueueSize = utils.GetEnv(ENV_EXECUTOR_REQUEST_LOGGER_WORK_QUEUE_SIZE, constants.DefaultExecutorReqLoggerWorkQueueSize) + executorReqLoggerWriteTimeoutMs = utils.GetEnv(ENV_EXECUTOR_REQUEST_LOGGER_WRITE_TIMEOUT_MS, constants.DefaultExecutorReqLoggerWriteTimeoutMs) engineDefaultCpuRequest = utils.GetEnv(ENV_DEFAULT_ENGINE_CPU_REQUEST, constants.DefaultEngineCpuRequest) engineDefaultCpuLimit = utils.GetEnv(ENV_DEFAULT_ENGINE_CPU_LIMIT, constants.DefaultEngineCpuLimit) @@ -249,6 +253,18 @@ func createExecutorContainer(mlDep *machinelearningv1.SeldonDeployment, p *machi probeScheme = corev1.URISchemeHTTPS } + loggerQSize := getAnnotation(mlDep, machinelearningv1.ANNOTATION_LOGGER_WORK_QUEUE_SIZE, executorReqLoggerWorkQueueSize) + _, err := strconv.Atoi(loggerQSize) + if err != nil { + return nil, fmt.Errorf("Failed to parse %s as integer for %s. %w", loggerQSize, ENV_EXECUTOR_REQUEST_LOGGER_WORK_QUEUE_SIZE, err) + } + + loggerWriteTimeout := getAnnotation(mlDep, machinelearningv1.ANNOTATION_LOGGER_WRITE_TIMEOUT_MS, executorReqLoggerWriteTimeoutMs) + _, err = strconv.Atoi(loggerWriteTimeout) + if err != nil { + return nil, fmt.Errorf("Failed to parse %s as integer for %s. 
%w", loggerWriteTimeout, ENV_EXECUTOR_REQUEST_LOGGER_WRITE_TIMEOUT_MS, err) + } + return &corev1.Container{ Name: EngineContainerName, Image: executorImage, @@ -261,6 +277,8 @@ func createExecutorContainer(mlDep *machinelearningv1.SeldonDeployment, p *machi "--protocol", string(protocol), "--prometheus_path", getPrometheusPath(mlDep), "--server_type", string(serverType), + "--log_work_buffer_size", loggerQSize, + "--log_write_timeout_ms", loggerWriteTimeout, }, ImagePullPolicy: corev1.PullPolicy(utils.GetEnv("EXECUTOR_CONTAINER_IMAGE_PULL_POLICY", "IfNotPresent")), TerminationMessagePath: "/dev/termination-log", diff --git a/operator/controllers/seldondeployment_engine_test.go b/operator/controllers/seldondeployment_engine_test.go index 5c7f8df9be..6f9d9f8445 100644 --- a/operator/controllers/seldondeployment_engine_test.go +++ b/operator/controllers/seldondeployment_engine_test.go @@ -5,6 +5,7 @@ import ( . "github.com/onsi/gomega" machinelearningv1 "github.com/seldonio/seldon-core/operator/apis/machinelearning.seldon.io/v1" + "github.com/seldonio/seldon-core/operator/constants" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -18,8 +19,9 @@ func createTestSeldonDeployment() *machinelearningv1.SeldonDeployment { } return &machinelearningv1.SeldonDeployment{ ObjectMeta: metav1.ObjectMeta{ - Name: key.Name, - Namespace: key.Namespace, + Name: key.Name, + Namespace: key.Namespace, + Annotations: make(map[string]string), }, Spec: machinelearningv1.SeldonDeploymentSpec{ Name: "mydep", @@ -178,3 +180,63 @@ func TestExecutorCreateKafka(t *testing.T) { g.Expect(err).ToNot(BeNil()) cleanEnvImages() } + +func TestEngineCreateLoggerParams(t *testing.T) { + g := NewGomegaWithT(t) + cleanEnvImages() + envExecutorImage = "executor" + mlDep := createTestSeldonDeployment() + con, err := createExecutorContainer(mlDep, &mlDep.Spec.Predictors[0], "", 1, 2, &v1.ResourceRequirements{}) + g.Expect(err).To(BeNil()) + 
for idx, arg := range con.Args { + if arg == "--log_work_buffer_size" { + g.Expect(con.Args[idx+1]).To(Equal(constants.DefaultExecutorReqLoggerWorkQueueSize)) + } + if arg == "--log_write_timeout_ms" { + g.Expect(con.Args[idx+1]).To(Equal(constants.DefaultExecutorReqLoggerWriteTimeoutMs)) + } + } + cleanEnvImages() +} + +func TestEngineCreateLoggerParamsEnv(t *testing.T) { + g := NewGomegaWithT(t) + cleanEnvImages() + envExecutorImage = "executor" + executorReqLoggerWorkQueueSize = "1" + executorReqLoggerWriteTimeoutMs = "1" + mlDep := createTestSeldonDeployment() + con, err := createExecutorContainer(mlDep, &mlDep.Spec.Predictors[0], "", 1, 2, &v1.ResourceRequirements{}) + g.Expect(err).To(BeNil()) + for idx, arg := range con.Args { + if arg == "--log_work_buffer_size" { + g.Expect(con.Args[idx+1]).To(Equal("1")) + } + if arg == "--log_write_timeout_ms" { + g.Expect(con.Args[idx+1]).To(Equal("1")) + } + } + cleanEnvImages() + executorReqLoggerWorkQueueSize = constants.DefaultExecutorReqLoggerWorkQueueSize + executorReqLoggerWriteTimeoutMs = constants.DefaultExecutorReqLoggerWriteTimeoutMs +} + +func TestEngineCreateLoggerAnnotation(t *testing.T) { + g := NewGomegaWithT(t) + cleanEnvImages() + envExecutorImage = "executor" + mlDep := createTestSeldonDeployment() + mlDep.Annotations[machinelearningv1.ANNOTATION_LOGGER_WORK_QUEUE_SIZE] = "22" + mlDep.Annotations[machinelearningv1.ANNOTATION_LOGGER_WRITE_TIMEOUT_MS] = "5" + con, err := createExecutorContainer(mlDep, &mlDep.Spec.Predictors[0], "", 1, 2, &v1.ResourceRequirements{}) + g.Expect(err).To(BeNil()) + for idx, arg := range con.Args { + if arg == "--log_work_buffer_size" { + g.Expect(con.Args[idx+1]).To(Equal("22")) + } + if arg == "--log_write_timeout_ms" { + g.Expect(con.Args[idx+1]).To(Equal("5")) + } + } + cleanEnvImages() +} diff --git a/operator/helm/split_resources.py b/operator/helm/split_resources.py index 8e2a4bcb19..13e674564f 100644 --- a/operator/helm/split_resources.py +++ 
b/operator/helm/split_resources.py @@ -74,6 +74,8 @@ "ENGINE_DEFAULT_MEMORY_REQUEST": "engine.resources.memoryRequest", "MANAGER_LOG_LEVEL": "manager.logLevel", "MANAGER_LEADER_ELECTION_ID": "manager.leaderElectionID", + "EXECUTOR_REQUEST_LOGGER_WORK_QUEUE_SIZE": "executor.requestLogger.workQueueSize", + "EXECUTOR_REQUEST_LOGGER_WRITE_TIMEOUT_MS": "executor.requestLogger.writeTimeoutMs", } HELM_VALUES_IMAGE_PULL_POLICY = "{{ .Values.image.pullPolicy }}"