Skip to content

Commit

Permalink
Cml runner long job (#583)
Browse files Browse the repository at this point in the history
* Restart uncompleted and runner exits

* no log

* remove log

* overcome 72 hours limit

* retry param

* docs

* Update help and set GH_5_MIN_TIMEOUT

* avoid enqueue runs

* fix job vs id

* no-retry

* remove unused error

* docs
  • Loading branch information
DavidGOrtega authored Jul 5, 2021
1 parent 866f81a commit 33db696
Show file tree
Hide file tree
Showing 7 changed files with 279 additions and 107 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test-deploy.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name: Test & Deploy
on:
schedule:
- cron: 0 1 * * 6 # Sat 01:00
- cron: 0 1 * * 6 # Sat 01:00
release:
types: [published]
pull_request_target:
Expand Down
87 changes: 44 additions & 43 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -475,54 +475,55 @@ For example, `docker://dvcorg/cml:0-dvc2-base1-gpu`, or
The `cml-runner` function accepts the following arguments:

```
Usage: cml-runner.js
Usage: cml-runner
Options:
--version Show version number [boolean]
--labels One or more user-defined labels for this runner
(delimited with commas) [default: "cml"]
--idle-timeout Time in seconds for the runner to be waiting for
jobs before shutting down. Setting it to 0
disables automatic shutdown [default: 300]
--name Name displayed in the repository once registered
[default: "cml-4wdd123kha"]
--single Exit after running a single job
--version Show version number [boolean]
--labels One or more user-defined labels for this runner
(delimited with commas) [default: "cml"]
--idle-timeout Time in seconds for the runner to be waiting for
jobs before shutting down. Setting it to 0
disables automatic shutdown [default: 300]
--name Name displayed in the repository once registered
cml-{ID}
--no-retry Do not restart workflow terminated due to instance
disposal or GitHub Actions timeout
[boolean] [default: false]
--reuse Don't launch a new runner if an existing one has
the same name or overlapping labels
--single Exit after running a single job
[boolean] [default: false]
--driver Platform where the repository is hosted. If not
specified, it will be inferred from the
environment [choices: "github", "gitlab"]
--repo Repository to be used for registering the runner.
If not specified, it will be inferred from the
environment
--token Personal access token to register a self-hosted
runner on the repository. If not specified, it
will be inferred from the environment
--cloud Cloud to deploy the runner
[choices: "aws", "azure"]
--cloud-region Region where the instance is deployed. Choices:
[us-east, us-west, eu-west, eu-north]. Also
accepts native cloud regions [default: "us-west"]
--cloud-type Instance type. Choices: [m, l, xl]. Also supports
native types like i.e. t2.micro
--cloud-gpu GPU type. [choices: "nogpu", "k80", "tesla"]
--cloud-hdd-size HDD size in GB.
--cloud-ssh-private Custom private RSA SSH key. If not provided an
automatically generated throwaway key will be
used [default: ""]
--cloud-ssh-private-visible Show the private SSH key in the output with the
rest of the instance properties (not recommended)
[boolean]
--cloud-spot Request a spot instance [boolean]
--cloud-spot-price Maximum spot instance bidding price in USD.
Defaults to the current spot bidding price
[default: "-1"]
--cloud-startup-script Run the provided Base64-encoded Linux shell
script during the instance initialization
--reuse Don't launch a new runner if an existing one has
the same name or overlapping labels
[boolean] [default: false]
--driver Platform where the repository is hosted. If not
specified, it will be inferred from the
environment [choices: "github", "gitlab"]
--repo Repository to be used for registering the runner.
If not specified, it will be inferred from the
environment
--token Personal access token to register a self-hosted
runner on the repository. If not specified, it
will be inferred from the environment
--cloud Cloud to deploy the runner
[choices: "aws", "azure", "kubernetes"]
--cloud-region Region where the instance is deployed. Choices:
[us-east, us-west, eu-west, eu-north]. Also
accepts native cloud regions [default: "us-west"]
--cloud-type Instance type. Choices: [m, l, xl]. Also supports
native types, e.g. t2.micro
--cloud-gpu GPU type.
[choices: "nogpu", "k80", "v100", "tesla"]
--cloud-hdd-size HDD size in GB
--cloud-ssh-private Custom private RSA SSH key. If not provided an
automatically generated throwaway key will be used
[default: ""]
-h Show help [boolean]
--cloud-spot Request a spot instance [boolean]
--cloud-spot-price Maximum spot instance bidding price in USD.
Defaults to the current spot bidding price
[default: "-1"]
--cloud-startup-script Run the provided Base64-encoded Linux shell script
during the instance initialization [default: ""]
--cloud-aws-security-group Specifies the security group in AWS [default: ""]
-h Show help [boolean]
```

#### Environment Variables
Expand Down
131 changes: 81 additions & 50 deletions bin/cml-runner.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,97 +17,97 @@ const {

RUNNER_PATH = `${WORKDIR_BASE}/${NAME}`,
RUNNER_IDLE_TIMEOUT = 5 * 60,
RUNNER_DESTROY_DELAY = 30,
RUNNER_DESTROY_DELAY = 20,
RUNNER_LABELS = 'cml',
RUNNER_NAME = NAME,
RUNNER_SINGLE = false,
RUNNER_REUSE = false,
RUNNER_NO_RETRY = false,
RUNNER_DRIVER,
RUNNER_REPO,
REPO_TOKEN
} = process.env;

let cml;
let RUNNER_LAUNCHED = false;
let RUNNER;
let RUNNER_TIMEOUT_TIMER = 0;
let RUNNER_SHUTTING_DOWN = false;
const RUNNER_JOBS_RUNNING = [];
let RUNNER_JOBS_RUNNING = [];
const GH_5_MIN_TIMEOUT = (72 * 60 - 5) * 60 * 1000;

const shutdown = async (opts) => {
if (RUNNER_SHUTTING_DOWN) return;

RUNNER_SHUTTING_DOWN = true;

let { error, cloud } = opts;
const { name, tfResource, workdir = '' } = opts;
const { error, cloud } = opts;
const { name, workdir = '', tfResource, noRetry } = opts;
const tfPath = workdir;

console.log(
JSON.stringify({ level: error ? 'error' : 'info', status: 'terminated' })
);
if (error) console.error(error);

const unregisterRunner = async () => {
if (!RUNNER) return;

try {
console.log('Unregistering runner...');
console.log(`Unregistering runner ${name}...`);
RUNNER && RUNNER.kill('SIGINT');
await cml.unregisterRunner({ name });
console.log('\tSuccess');
} catch (err) {
console.error('\tFailed');
error = err;
console.error(`\tFailed: ${err.message}`);
}
};

const shutdownDockerMachine = async () => {
console.log('docker-machine destroy...');
console.log(
'Docker machine is deprecated and this will be removed!! Check how to deploy using our tf provider.'
);
const retryWorkflows = async () => {
try {
await exec(`echo y | docker-machine rm ${DOCKER_MACHINE}`);
if (!noRetry && RUNNER_JOBS_RUNNING.length) {
await Promise.all(
RUNNER_JOBS_RUNNING.map(
async (job) => await cml.pipelineRestart({ jobId: job.id })
)
);
}
} catch (err) {
console.error(`\tFailed shutting down docker machine: ${err.message}`);
error = err;
console.error(err);
}
};

const shutdownTf = async () => {
const { tfResource } = opts;

if (!tfResource) {
console.log(`\tNo TF resource found`);
return;
}
const destroyDockerMachine = async () => {
if (!DOCKER_MACHINE) return;

console.log('docker-machine destroy...');
console.log(
'Docker machine is deprecated and will be removed!! Check how to deploy using our tf provider.'
);
try {
await tf.destroy({ dir: tfPath });
await exec(`echo y | docker-machine rm ${DOCKER_MACHINE}`);
} catch (err) {
console.error(`\tFailed Terraform destroy: ${err.message}`);
error = err;
console.error(`\tFailed shutting down docker machine: ${err.message}`);
}
};

const destroyTerraform = async () => {
if (!tfResource) return;

try {
console.log(await tf.destroy({ dir: tfPath }));
} catch (err) {
console.error(`\tFailed destroying terraform: ${err.message}`);
error = err;
}
};

console.log(
JSON.stringify({ level: error ? 'error' : 'info', status: 'terminated' })
);
if (error) console.error(error);
await sleep(RUNNER_DESTROY_DELAY);

if (cloud) {
await destroyTerraform();
} else {
if (RUNNER_LAUNCHED) await unregisterRunner();
await unregisterRunner();
await retryWorkflows();

console.log(
`\tDestroy scheduled: ${RUNNER_DESTROY_DELAY} seconds remaining.`
);
await sleep(RUNNER_DESTROY_DELAY);

if (DOCKER_MACHINE) await shutdownDockerMachine();
if (tfResource) await shutdownTf();
await destroyDockerMachine();
await destroyTerraform();
}

process.exit(error ? 1 : 0);
Expand Down Expand Up @@ -214,7 +214,7 @@ const runCloud = async (opts) => {

const runLocal = async (opts) => {
console.log(`Launching ${cml.driver} runner`);
const { workdir, name, labels, single, idleTimeout } = opts;
const { workdir, name, labels, single, idleTimeout, noRetry } = opts;

const proc = await cml.startRunner({
workdir,
Expand All @@ -224,17 +224,30 @@ const runLocal = async (opts) => {
idleTimeout
});

const dataHandler = (data) => {
const log = cml.parseRunnerLog({ data });
const dataHandler = async (data) => {
const log = await cml.parseRunnerLog({ data });
log && console.log(JSON.stringify(log));

if (log && log.status === 'job_started') {
RUNNER_JOBS_RUNNING.push(1);
RUNNER_JOBS_RUNNING.push({ id: log.job, date: log.date });
RUNNER_TIMEOUT_TIMER = 0;
} else if (log && log.status === 'job_ended') {
RUNNER_JOBS_RUNNING.pop();
const { job } = log;

if (!RUNNER_SHUTTING_DOWN) {
const jobs = job
? [job]
: (await cml.pipelineJobs({ jobs: RUNNER_JOBS_RUNNING }))
.filter((job) => job.status === 'completed')
.map((job) => job.id);

RUNNER_JOBS_RUNNING = RUNNER_JOBS_RUNNING.filter(
(job) => !jobs.includes(job.id)
);
}
}
};

proc.stderr.on('data', dataHandler);
proc.stdout.on('data', dataHandler);
proc.on('uncaughtException', () => shutdown(opts));
Expand All @@ -252,7 +265,19 @@ const runLocal = async (opts) => {
}, 1000);
}

RUNNER_LAUNCHED = true;
if (!noRetry && cml.driver === 'github') {
const watcher = setInterval(() => {
RUNNER_JOBS_RUNNING.forEach((job) => {
if (
new Date().getTime() - new Date(job.date).getTime() >
GH_5_MIN_TIMEOUT
)
shutdown(opts) && clearInterval(watcher);
});
}, 60 * 1000);
}

RUNNER = proc;
};

const run = async (opts) => {
Expand Down Expand Up @@ -337,9 +362,15 @@ const opts = yargs
'idle-timeout',
'Time in seconds for the runner to be waiting for jobs before shutting down. Setting it to 0 disables automatic shutdown'
)
.default('name', RUNNER_NAME)
.describe('name', 'Name displayed in the repository once registered')

.default('name')
.describe('name', 'Name displayed in the repository once registered cml-{ID}')
.coerce('name', (val) => val || RUNNER_NAME)
.boolean('no-retry')
.default('no-retry', RUNNER_NO_RETRY)
.describe(
'no-retry',
'Do not restart workflow terminated due to instance disposal or GitHub Actions timeout'
)
.boolean('single')
.default('single', RUNNER_SINGLE)
.describe('single', 'Exit after running a single job')
Expand Down
Loading

0 comments on commit 33db696

Please sign in to comment.