diff --git a/example/integrations/tensorflow/tf-example.yaml b/example/integrations/tensorflow/tf-example.yaml index d2090bbf681..05bc67d781c 100644 --- a/example/integrations/tensorflow/tf-example.yaml +++ b/example/integrations/tensorflow/tf-example.yaml @@ -37,6 +37,7 @@ spec: minAvailable: 2 schedulerName: kube-batch plugins: + env: [] svc: [] policies: - event: PodEvicted diff --git a/example/duplicatedPolicyEvent-webhook-deny.yaml b/example/invalid_jobs/duplicatedPolicyEvent-webhook-deny.yaml similarity index 100% rename from example/duplicatedPolicyEvent-webhook-deny.yaml rename to example/invalid_jobs/duplicatedPolicyEvent-webhook-deny.yaml diff --git a/example/duplicatedTaskName-webhook-deny.yaml b/example/invalid_jobs/duplicatedTaskName-webhook-deny.yaml similarity index 100% rename from example/duplicatedTaskName-webhook-deny.yaml rename to example/invalid_jobs/duplicatedTaskName-webhook-deny.yaml diff --git a/example/minAvailable-webhook-deny.yaml b/example/invalid_jobs/minAvailable-webhook-deny.yaml similarity index 100% rename from example/minAvailable-webhook-deny.yaml rename to example/invalid_jobs/minAvailable-webhook-deny.yaml diff --git a/example/job.yaml b/example/job.yaml index ace1d986f6e..c5247f8e7be 100644 --- a/example/job.yaml +++ b/example/job.yaml @@ -6,30 +6,36 @@ spec: minAvailable: 3 schedulerName: kube-batch policies: - - event: PodEvicted - action: RestartJob + - event: PodEvicted + action: RestartJob + plugins: + ssh: [] + env: [] + svc: [] + maxRetry: 5 + queue: default volumes: - - mountPath: "/myinput" - - mountPath: "/myoutput" - volumeClaimName: "testvolumeclaimname" - volumeClaim: - accessModes: [ "ReadWriteOnce" ] - storageClassName: "my-storage-class" - resources: - requests: - storage: 1Gi + - mountPath: "/myinput" + - mountPath: "/myoutput" + volumeClaimName: "testvolumeclaimname" + volumeClaim: + accessModes: [ "ReadWriteOnce" ] + storageClassName: "my-storage-class" + resources: + requests: + storage: 1Gi tasks: - - 
replicas: 6 - name: "default-nginx" - template: - metadata: - name: web - spec: - containers: - - image: nginx - imagePullPolicy: IfNotPresent - name: nginx - resources: - requests: - cpu: "1" - restartPolicy: OnFailure + - replicas: 6 + name: "default-nginx" + template: + metadata: + name: web + spec: + containers: + - image: nginx + imagePullPolicy: IfNotPresent + name: nginx + resources: + requests: + cpu: "1" + restartPolicy: OnFailure diff --git a/example/kube-batch-conf.yaml b/example/kube-batch-conf.yaml deleted file mode 100644 index add2e0b8907..00000000000 --- a/example/kube-batch-conf.yaml +++ /dev/null @@ -1,11 +0,0 @@ -actions: "enqueue, reclaim, allocate, backfill, preempt" -tiers: - - plugins: - - name: priority - - name: gang - - name: conformance - - plugins: - - name: drf - - name: predicates - - name: proportion - - name: nodeorder diff --git a/example/openmpi-hello.yaml b/example/openmpi-hello.yaml deleted file mode 100644 index 5f136c736af..00000000000 --- a/example/openmpi-hello.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: batch.volcano.sh/v1alpha1 -kind: Job -metadata: - name: openmpi-hello -spec: - minAvailable: 3 - schedulerName: scheduler - plugins: - ssh: [] - env: [] - svc: [] - tasks: - - replicas: 1 - name: mpimaster - policies: - - event: TaskCompleted - action: CompleteJob - template: - spec: - imagePullSecrets: - - name: default-secret - containers: - - command: - - /bin/sh - - -c - - | - MPI_HOST=`cat /etc/volcano/mpiworker.host | tr "\n" ","`; - mkdir -p /var/run/sshd; /usr/sbin/sshd; - mpiexec --allow-run-as-root --host ${MPI_HOST} -np 2 mpi_hello_world > /home/re - image: 100.125.5.235:20202/l00427178/openmpi-hello:3.28 - name: mpimaster - ports: - - containerPort: 22 - name: mpijob-port - workingDir: /home - restartPolicy: OnFailure - - replicas: 2 - name: mpiworker - template: - spec: - imagePullSecrets: - - name: default-secret - containers: - - command: - - /bin/sh - - -c - - | - mkdir -p /var/run/sshd; /usr/sbin/sshd 
-D; - image: 100.125.5.235:20202/l00427178/openmpi-hello:3.28 - name: mpiworker - ports: - - containerPort: 22 - name: mpijob-port - workingDir: /home - restartPolicy: OnFailure - diff --git a/example/role.yaml b/example/role.yaml deleted file mode 100644 index fe6bdfacc0c..00000000000 --- a/example/role.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: ClusterRoleBinding -metadata: - name: default-as-admin -subjects: - - kind: ServiceAccount - name: default - namespace: kube-system -roleRef: - kind: ClusterRole - name: cluster-admin - apiGroup: rbac.authorization.k8s.io diff --git a/example/tensorflow-benchmark.yaml b/example/tensorflow-benchmark.yaml deleted file mode 100644 index 49912a57e39..00000000000 --- a/example/tensorflow-benchmark.yaml +++ /dev/null @@ -1,61 +0,0 @@ -apiVersion: batch.volcano.sh/v1alpha1 -kind: Job -metadata: - name: tensorflow-benchmark -spec: - minAvailable: 5 - schedulerName: scheduler - plugins: - env: [] - svc: [] - policies: - - event: PodEvicted - action: RestartJob - tasks: - - replicas: 2 - name: ps - template: - spec: - imagePullSecrets: - - name: default-secret - containers: - - command: - - sh - - -c - - | - PS_HOST=`cat /etc/volcano/ps.host | sed 's/$/&:2222/g' | tr "\n" ","`; - WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&:2222/g' | tr "\n" ","`; - python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=ps --task_index=${VK_TASK_INDEX} --ps_hosts=${PS_HOST} --worker_hosts=${WORKER_HOST} - image: 100.125.5.235:20202/l00427178/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3 - name: tensorflow - ports: - - containerPort: 2222 - name: tfjob-port - resources: {} - workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks - restartPolicy: OnFailure - - replicas: 3 - name: worker - policies: - - event: TaskCompleted - action: CompleteJob - 
template: - spec: - imagePullSecrets: - - name: default-secret - containers: - - command: - - sh - - -c - - | - PS_HOST=`cat /etc/volcano/ps.host | sed 's/$/&:2222/g' | tr "\n" ","`; - WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&:2222/g' | tr "\n" ","`; - python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=worker --task_index=${VK_TASK_INDEX} --ps_hosts=${PS_HOST} --worker_hosts=${WORKER_HOST} - image: 100.125.5.235:20202/l00427178/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3 - name: tensorflow - ports: - - containerPort: 2222 - name: tfjob-port - resources: {} - workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks - restartPolicy: OnFailure diff --git a/installer/chart/volcano/templates/batch_v1alpha1_job.yaml b/installer/chart/volcano/templates/batch_v1alpha1_job.yaml index ad397c069ee..eb54dca40bc 100644 --- a/installer/chart/volcano/templates/batch_v1alpha1_job.yaml +++ b/installer/chart/volcano/templates/batch_v1alpha1_job.yaml @@ -74,6 +74,11 @@ spec: schedulerName: description: SchedulerName is the default value of `tasks.template.spec.schedulerName`. type: string + plugins: + description: Enabled task plugins when creating job. + type: object + additionalProperties: + type: array tasks: description: Tasks specifies the task specification of Job items: @@ -111,6 +116,13 @@ spec: type: object type: object type: array + queue: + description: The name of the queue on which job should be created + type: string + maxRetry: + description: The limit for retrying job submission, default is 3 + format: int32 + type: integer type: object status: description: Current status of Job @@ -139,6 +151,15 @@ spec: description: Job's current version format: int32 type: integer + retryCount: + description: The number of times Volcano has retried submitting the job.
+ format: int32 + type: integer + ControlledResources: + description: All of the resources that are controlled by this job. + type: object + additionalProperties: + type: string state: description: Current state of Job. properties: