Skip to content
This repository has been archived by the owner on Oct 28, 2020. It is now read-only.

Commit

Permalink
Merge pull request #75 from ystia/feature/GH-196-a4c-sm7-jobs-enhance…
Browse files Browse the repository at this point in the history
…ments

GH-196 a4c-sm7 jobs enhancements
  • Loading branch information
loicalbertin authored Dec 17, 2018
2 parents a481717 + 9592bc9 commit 583501e
Show file tree
Hide file tree
Showing 35 changed files with 499 additions and 121 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## UNRELEASED

### ENHANCEMENTS

* Support Jobs lifecycle enhancements (new operations `submit`, `run`, `cancel`) ([GH-196](https://github.com/ystia/yorc/issues/196))
* Generate Alien 2.1-compatible events ([GH-148](https://github.com/ystia/yorc/issues/148))

## 3.1.0-M7 (December 07, 2018)
Expand Down
2 changes: 1 addition & 1 deletion alien4cloud-yorc-plugin/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
<yorc.hp.types.version>1.0.0</yorc.hp.types.version>
<yorc.k8s.types.version>2.0.0-SNAPSHOT</yorc.k8s.types.version>
<yorc.os.types.version>1.0.0</yorc.os.types.version>
<yorc.slurm.types.version>1.0.0</yorc.slurm.types.version>
<yorc.slurm.types.version>1.1.0-SNAPSHOT</yorc.slurm.types.version>
</properties>

<dependencies>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,10 @@ public void run() {
orchestrator.changeStatus(paasId, DeploymentStatus.DEPLOYED);
done = true;
break;
case "DEPLOYMENT_FAILED":
orchestrator.doChangeStatus(paasId, DeploymentStatus.FAILURE);
error = new Exception("Deployment failed");
break;
default:
log.debug("Deployment Status is currently " + status);
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,6 @@ public void run() {
eMessage += event.getType() + ":" + eState;
log.debug("Received Event from Yorc <<< " + eMessage);
synchronized (jrdi) {
jrdi.setLastEvent(event);
jrdi.notifyAll();
}
switch (event.getStatus()) {
Expand Down Expand Up @@ -190,7 +189,6 @@ public void run() {
eMessage += event.getType() + ":" + eState;
log.debug("Received Event from Yorc <<< " + eMessage);
synchronized (jrdi) {
jrdi.setLastEvent(event);
jrdi.notifyAll();
}
switch (event.getStatus()) {
Expand All @@ -215,7 +213,6 @@ public void run() {
eMessage += event.getType() + ":" + eState;
log.debug("Received Event from Yorc <<< " + eMessage);
synchronized (jrdi) {
jrdi.setLastEvent(event);
jrdi.notifyAll();
}
switch (event.getStatus()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,82 +72,20 @@ public void run() {
Event evt;
while (!done && error == null) {
synchronized (jrdi) {
// Check for timeout
// Check deployment timeout
long timetowait = timeout - System.currentTimeMillis();
if (timetowait <= 0) {
log.warn("Timeout occured");
error = new Throwable("Workflow timeout");
log.warn("Deployment Timeout occured");
error = new Throwable("Deployment timeout");
orchestrator.doChangeStatus(paasId, DeploymentStatus.FAILURE);
break;
}
// Wait Events from Yorc
log.debug(paasId + ": Waiting for workflow events.");
// Wait Deployment Events from Yorc
log.debug(paasId + ": Waiting for deployment events.");
try {
jrdi.wait(timetowait);
} catch (InterruptedException e) {
log.error("Interrupted while waiting for task end");
break;
}
// Check if we received a Workflow Event and process it
evt = jrdi.getLastEvent();
if (evt != null && evt.getAlienExecutionId().equals(taskId)) {
jrdi.setLastEvent(null);
switch (evt.getType()) {
case EventListenerTask.EVT_WORKFLOW:
switch (evt.getStatus()) {
case "failed":
log.warn("Workflow failed: " + paasId);
orchestrator.postWorkflowMonitorEvent(new PaaSWorkflowFailedEvent(), evt);
error = new Exception("Workflow " + workflowName + " failed");
break;
case "canceled":
log.warn("Workflow canceled: " + paasId);
orchestrator.postWorkflowMonitorEvent(new PaaSWorkflowCancelledEvent(), evt);
error = new Exception("Workflow " + workflowName + " canceled");
break;
case "done":
orchestrator.postWorkflowMonitorEvent(new PaaSWorkflowSucceededEvent(), evt);
done = true;
break;
case "initial":
orchestrator.postWorkflowMonitorEvent(new PaaSWorkflowStartedEvent(), evt);
break;
default:
log.warn("An event has been ignored. Unexpected status=" + evt.getStatus());
break;
}
break;
case EventListenerTask.EVT_WORKFLOW_STEP:
switch (evt.getStatus()) {
case "initial":
orchestrator.postWorkflowStepEvent(new WorkflowStepStartedEvent(), evt);
break;
case "done":
case "error":
orchestrator.postWorkflowStepEvent(new WorkflowStepCompletedEvent(), evt);
break;
}
break;
case EventListenerTask.EVT_ALIEN_TASK:
switch (evt.getStatus()) {
case "initial":
orchestrator.postTaskEvent(new TaskSentEvent(), evt);
break;
case "running":
orchestrator.postTaskEvent(new TaskStartedEvent(), evt);
break;
case "done":
orchestrator.postTaskEvent(new TaskSucceededEvent(), evt);
break;
case "error":
orchestrator.postTaskEvent(new TaskFailedEvent(), evt);
break;
case "canceled":
orchestrator.postTaskEvent(new TaskCancelledEvent(), evt);
break;
}
break;
}
continue;
log.warn("Interrupted while waiting for deployment");
}
}
// We were awaken for some bad reason or a timeout
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,5 +75,3 @@ node_types:
endpoint:
type: yorc.capabilities.Endpoint.ProvisioningAdmin

# NOTE: Alien specific

Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,23 @@ node_types:
description: The ID of the job.
interfaces:
tosca.interfaces.node.lifecycle.Runnable:
submit:
implementation:
# This is a hack to force Alien to generate this step in workflows it will be overrided in Yorc
# TODO(loicalbertin) think about use a topology modifier for this to add this step only if a submit operation exists
file: "resources.yaml"
type: yorc.artifacts.Deployment.SlurmJob
run:
implementation:
file:
# This is a hack to force Alien to generate this step in workflows it will be overrided in Yorc
# TODO(loicalbertin) think about use a topology modifier for this to add this step only if a submit operation exists
file: "resources.yaml"
type: yorc.artifacts.Deployment.SlurmJob
cancel:
implementation:
# This is a hack to force Alien to generate this step in workflows it will be overrided in Yorc
# TODO(loicalbertin) think about use a topology modifier for this to add this step only if a submit operation exists
file: "resources.yaml"
type: yorc.artifacts.Deployment.SlurmJob

yorc.nodes.slurm.SingularityJob:
Expand Down
Binary file added documentation/_static/img/JobsRunLifeCycle.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions documentation/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@

# General information about the project.
project = u'Alien4Cloud Yorc Plugin'
copyright = u'2017, Atos BDS R&D'
copyright = u'2017-2018, Atos BDS R&D'
author = u'Atos BDS R&D'

# The version info for the project you're documenting, acts as replacement for
Expand Down Expand Up @@ -121,7 +121,7 @@
#keep_warnings = False

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True
todo_include_todos = False


# -- Options for HTML output ----------------------------------------------
Expand Down
1 change: 1 addition & 0 deletions documentation/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,6 @@ Contents:
location
quickstart
upgrade
jobs


157 changes: 157 additions & 0 deletions documentation/jobs.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
..
Copyright 2018 Bull S.A.S. Atos Technologies - Bull, Rue Jean Jaures, B.P.68, 78340, Les Clayes-sous-Bois, France.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
---

Working with Jobs
=================

What's a Job?
-------------

In contrast to a service, which is a long-running application, a Job is an
application that runs to completion.

TOSCA life-cycle (install -> configure -> start ....... then finally stop
-> delete) was designed to handle services.
There is no concept of Jobs life-cycle within normative TOSCA.

But, as per our experience in HPC and emerging container scheduling
within Container as a Service solutions like Kubernetes, we are convinced
that supporting Job scheduling is fundamental for any orchestration solution.

So we decided in collaboration with the Alien4Cloud team to extend TOSCA to
support Jobs!

Extending TOSCA to support Jobs
-------------------------------

First was the life-cycle! In TOSCA the core concept is the life-cycle. So,
based on our experience we defined a life-cycle for Jobs.

.. image:: _static/img/JobsRunLifeCycle.png
:alt: Jobs Life Cycle
:align: center

Translated into TOSCA, we defined a new interface,
``tosca.interfaces.node.lifecycle.Runnable``. This interface defines three
operations:

* ``submit``: Submit is the operation that *submits* a job to a Job Scheduler;
  at the end of the ``submit`` operation we generally get a **job identifier**
* ``run``: Run is an asynchronous operation that will be called periodically
to check the **job status**.
* ``cancel``: Cancel allows cancelling a **submitted job**.

Supported Jobs Schedulers
-------------------------

Slurm
~~~~~

Slurm is an HPC scheduler. Unsurprisingly, it was our first builtin support for
Jobs scheduling. Our Slurm support allows running single jobs and batches made
of several jobs.

.. todo:: Include a description on how to write SlurmBin/SlurmBatch/Singularity
Jobs

Kubernetes
~~~~~~~~~~

Over the years Kubernetes became the de-facto standard of Containers As A
Service (CaaS).

Kubernetes has a special builtin *Controller* for jobs called *Jobs - Run to
Completion*.

.. todo:: Include a description on how to write Kubernetes Jobs

The one you want!
~~~~~~~~~~~~~~~~~

Yorc also supports Jobs defined in pure TOSCA. That means you can write your
own interaction with any scheduler using YAML and Python, Shell or Ansible
scripts.

All you need to do is provide an implementation for at least the ``submit``
operation of the job life-cycle. If you do not provide an implementation for
the ``run`` operation, your job will run in *fire and forget* mode, and you
will not be able to get information about its completion. Similarly, if you do
not provide an implementation for the ``cancel`` operation then your Job will
simply not be cancellable.

To allow Yorc to manage your job properly, follow these conventions:

* at the end of the ``submit`` operation you should export a fact or
environment variable named ``TOSCA_JOB_ID`` containing the
**submitted job identifier**.

* Yorc automatically injects this ``TOSCA_JOB_ID`` as an input of the ``run``
and ``cancel`` operations.

* The ``run`` operation should be designed to be **non-blocking** and
**called several times**. Its primary role is to check the job status. It
should export a fact or environment variable named ``TOSCA_JOB_STATUS``
containing one of the following values:

* ``COMPLETED``: meaning that the job is done successfully.
* ``FAILED``: meaning that the job is done but in error.
* ``RUNNING``: meaning that the job is still running.
  * ``QUEUED``: meaning that the job is submitted but hasn't started yet.

  Internally, ``RUNNING`` and ``QUEUED`` statuses are handled the same way by
  Yorc, which will re-call the ``run`` operation after a delay to refresh the
  status.

* The ``run`` operation can also be used to retrieve logs or perform some
cleanup after the job completion.


You can find an example of a pure-TOSCA implementation of jobs in the official
*CSARs public library* with an implementation of a
`Spark Job <https://github.com/alien4cloud/csar-public-library/tree/develop/org/alien4cloud/spark/job-linux-sh>`_

Specific workflows for Jobs
---------------------------

When your application contains Jobs (meaning node templates which implement
the ``tosca.interfaces.node.lifecycle.Runnable`` interface) then Alien4Cloud
will automatically generate two workflows:

* ``run``: a workflow that submits and monitors jobs
* ``cancel``: a workflow that cancels jobs

.. warning:: The cancel workflow is a kind of temporary workaround. It allows
   cancelling jobs but does not check whether the job has been submitted or
   not. The recommended way to cancel a ``run`` workflow is to cancel the
   associated task in Yorc using either the CLI or the Rest API.
   This is temporary and we will soon provide a way to cancel workflows
   directly from Alien4Cloud.

The ``run`` workflow allows orchestrating Jobs. That means that if, for
instance, ``jobB`` depends on ``jobA`` through a TOSCA ``dependsOn`` or
``connectsTo`` relationship, then Alien4Cloud will generate a workflow that
first submits and waits for the completion of ``jobA`` before submitting
``jobB``.

Jobs cancellation
-----------------

The proper way to cancel Jobs that were submitted by a TOSCA workflow is
to cancel the associated Yorc Task/Execution of this workflow.
This way Yorc will automatically call ``cancel`` operations for nodes that
implement it and which have successfully executed their ``submit`` operation.
Currently those automatic cancellation steps do not appear in Alien4Cloud.
We will work soon on making them visible.
6 changes: 3 additions & 3 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@
<parent>
<groupId>alien4cloud</groupId>
<artifactId>alien4cloud-parent</artifactId>
<version>2.1.0-SM7</version>
<version>2.1.0-RC1</version>
</parent>

<properties>
<alien4cloud.version>2.1.0-SM7</alien4cloud.version>
<alien4cloud.kubernetes.api.version>2.1.0-SM7</alien4cloud.kubernetes.api.version>
<alien4cloud.version>2.1.0-RC1</alien4cloud.version>
<alien4cloud.kubernetes.api.version>2.1.0-RC1</alien4cloud.kubernetes.api.version>
<tosca.normative.types.version>1.0.0-ALIEN20</tosca.normative.types.version>
<alien4cloud.dsl.version>alien_dsl_2_0_0</alien4cloud.dsl.version>
</properties>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ metadata:

imports:
- tosca-normative-types:1.0.0-ALIEN20
- docker-types:2.1.0-SM7
- docker-types:2.1.0-RC1

description: Contains types for testing Jobs in Kubernetes

Expand Down
Loading

0 comments on commit 583501e

Please sign in to comment.