Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cml runner long job #583

Merged
merged 14 commits into from
Jul 5, 2021
2 changes: 1 addition & 1 deletion .github/workflows/test-deploy.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name: Test & Deploy
on:
schedule:
- cron: 0 1 * * 6 # Sat 01:00
- cron: 0 1 * * 6 # Sat 01:00
release:
types: [published]
pull_request_target:
Expand Down
87 changes: 44 additions & 43 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -475,54 +475,55 @@ For example, `docker://dvcorg/cml:0-dvc2-base1-gpu`, or
The `cml-runner` function accepts the following arguments:

```
Usage: cml-runner.js
Usage: cml-runner

Options:
--version Show version number [boolean]
--labels One or more user-defined labels for this runner
(delimited with commas) [default: "cml"]
--idle-timeout Time in seconds for the runner to be waiting for
jobs before shutting down. Setting it to 0
disables automatic shutdown [default: 300]
--name Name displayed in the repository once registered
[default: "cml-4wdd123kha"]
--single Exit after running a single job
--version Show version number [boolean]
--labels One or more user-defined labels for this runner
(delimited with commas) [default: "cml"]
--idle-timeout Time in seconds for the runner to be waiting for
jobs before shutting down. Setting it to 0
disables automatic shutdown [default: 300]
--name Name displayed in the repository once registered
cml-{ID}
--no-retry Do not restart workflow terminated due to instance
disposal or GitHub Actions timeout
[boolean] [default: false]
--reuse Don't launch a new runner if an existing one has
the same name or overlapping labels
--single Exit after running a single job
[boolean] [default: false]
--driver Platform where the repository is hosted. If not
specified, it will be inferred from the
environment [choices: "github", "gitlab"]
--repo Repository to be used for registering the runner.
If not specified, it will be inferred from the
environment
--token Personal access token to register a self-hosted
runner on the repository. If not specified, it
will be inferred from the environment
--cloud Cloud to deploy the runner
[choices: "aws", "azure"]
--cloud-region Region where the instance is deployed. Choices:
[us-east, us-west, eu-west, eu-north]. Also
accepts native cloud regions [default: "us-west"]
--cloud-type Instance type. Choices: [m, l, xl]. Also supports
native types, e.g. t2.micro
--cloud-gpu GPU type. [choices: "nogpu", "k80", "tesla"]
--cloud-hdd-size HDD size in GB.
--cloud-ssh-private Custom private RSA SSH key. If not provided an
automatically generated throwaway key will be
used [default: ""]
--cloud-ssh-private-visible Show the private SSH key in the output with the
rest of the instance properties (not recommended)
[boolean]
--cloud-spot Request a spot instance [boolean]
--cloud-spot-price Maximum spot instance bidding price in USD.
Defaults to the current spot bidding price
[default: "-1"]
--cloud-startup-script Run the provided Base64-encoded Linux shell
script during the instance initialization
--reuse Don't launch a new runner if an existing one has
the same name or overlapping labels
[boolean] [default: false]
--driver Platform where the repository is hosted. If not
specified, it will be inferred from the
environment [choices: "github", "gitlab"]
--repo Repository to be used for registering the runner.
If not specified, it will be inferred from the
environment
--token Personal access token to register a self-hosted
runner on the repository. If not specified, it
will be inferred from the environment
--cloud Cloud to deploy the runner
[choices: "aws", "azure", "kubernetes"]
--cloud-region Region where the instance is deployed. Choices:
[us-east, us-west, eu-west, eu-north]. Also
accepts native cloud regions [default: "us-west"]
--cloud-type Instance type. Choices: [m, l, xl]. Also supports
native types, e.g. t2.micro
--cloud-gpu GPU type.
[choices: "nogpu", "k80", "v100", "tesla"]
--cloud-hdd-size HDD size in GB
--cloud-ssh-private Custom private RSA SSH key. If not provided an
automatically generated throwaway key will be used
[default: ""]
-h Show help [boolean]
--cloud-spot Request a spot instance [boolean]
--cloud-spot-price Maximum spot instance bidding price in USD.
Defaults to the current spot bidding price
[default: "-1"]
--cloud-startup-script Run the provided Base64-encoded Linux shell script
during the instance initialization [default: ""]
--cloud-aws-security-group Specifies the security group in AWS [default: ""]
-h Show help [boolean]
```

#### Environment Variables
Expand Down
131 changes: 81 additions & 50 deletions bin/cml-runner.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,97 +17,97 @@ const {

RUNNER_PATH = `${WORKDIR_BASE}/${NAME}`,
RUNNER_IDLE_TIMEOUT = 5 * 60,
RUNNER_DESTROY_DELAY = 30,
RUNNER_DESTROY_DELAY = 20,
RUNNER_LABELS = 'cml',
RUNNER_NAME = NAME,
RUNNER_SINGLE = false,
RUNNER_REUSE = false,
RUNNER_NO_RETRY = false,
RUNNER_DRIVER,
RUNNER_REPO,
REPO_TOKEN
} = process.env;

let cml;
let RUNNER_LAUNCHED = false;
let RUNNER;
let RUNNER_TIMEOUT_TIMER = 0;
let RUNNER_SHUTTING_DOWN = false;
const RUNNER_JOBS_RUNNING = [];
let RUNNER_JOBS_RUNNING = [];
const GH_5_MIN_TIMEOUT = (72 * 60 - 5) * 60 * 1000;
DavidGOrtega marked this conversation as resolved.
Show resolved Hide resolved

const shutdown = async (opts) => {
if (RUNNER_SHUTTING_DOWN) return;

RUNNER_SHUTTING_DOWN = true;

let { error, cloud } = opts;
const { name, tfResource, workdir = '' } = opts;
const { error, cloud } = opts;
const { name, workdir = '', tfResource, noRetry } = opts;
const tfPath = workdir;

console.log(
JSON.stringify({ level: error ? 'error' : 'info', status: 'terminated' })
);
if (error) console.error(error);

const unregisterRunner = async () => {
if (!RUNNER) return;

try {
console.log('Unregistering runner...');
console.log(`Unregistering runner ${name}...`);
RUNNER && RUNNER.kill('SIGINT');
await cml.unregisterRunner({ name });
console.log('\tSuccess');
} catch (err) {
console.error('\tFailed');
error = err;
console.error(`\tFailed: ${err.message}`);
}
};

const shutdownDockerMachine = async () => {
console.log('docker-machine destroy...');
console.log(
'Docker machine is deprecated and this will be removed!! Check how to deploy using our tf provider.'
);
const retryWorkflows = async () => {
try {
await exec(`echo y | docker-machine rm ${DOCKER_MACHINE}`);
if (!noRetry && RUNNER_JOBS_RUNNING.length) {
await Promise.all(
RUNNER_JOBS_RUNNING.map(
async (job) => await cml.pipelineRestart({ jobId: job.id })
)
);
}
} catch (err) {
console.error(`\tFailed shutting down docker machine: ${err.message}`);
error = err;
console.error(err);
}
};

const shutdownTf = async () => {
const { tfResource } = opts;

if (!tfResource) {
console.log(`\tNo TF resource found`);
return;
}
const destroyDockerMachine = async () => {
if (!DOCKER_MACHINE) return;

console.log('docker-machine destroy...');
console.log(
'Docker machine is deprecated and will be removed!! Check how to deploy using our tf provider.'
);
try {
await tf.destroy({ dir: tfPath });
await exec(`echo y | docker-machine rm ${DOCKER_MACHINE}`);
} catch (err) {
console.error(`\tFailed Terraform destroy: ${err.message}`);
error = err;
console.error(`\tFailed shutting down docker machine: ${err.message}`);
}
};

const destroyTerraform = async () => {
if (!tfResource) return;

try {
console.log(await tf.destroy({ dir: tfPath }));
} catch (err) {
console.error(`\tFailed destroying terraform: ${err.message}`);
error = err;
}
};

console.log(
JSON.stringify({ level: error ? 'error' : 'info', status: 'terminated' })
);
if (error) console.error(error);
await sleep(RUNNER_DESTROY_DELAY);

if (cloud) {
await destroyTerraform();
} else {
if (RUNNER_LAUNCHED) await unregisterRunner();
await unregisterRunner();
await retryWorkflows();

console.log(
`\tDestroy scheduled: ${RUNNER_DESTROY_DELAY} seconds remaining.`
);
await sleep(RUNNER_DESTROY_DELAY);

if (DOCKER_MACHINE) await shutdownDockerMachine();
if (tfResource) await shutdownTf();
await destroyDockerMachine();
await destroyTerraform();
}

process.exit(error ? 1 : 0);
Expand Down Expand Up @@ -214,7 +214,7 @@ const runCloud = async (opts) => {

const runLocal = async (opts) => {
console.log(`Launching ${cml.driver} runner`);
const { workdir, name, labels, single, idleTimeout } = opts;
const { workdir, name, labels, single, idleTimeout, noRetry } = opts;

const proc = await cml.startRunner({
workdir,
Expand All @@ -224,17 +224,30 @@ const runLocal = async (opts) => {
idleTimeout
});

const dataHandler = (data) => {
const log = cml.parseRunnerLog({ data });
const dataHandler = async (data) => {
const log = await cml.parseRunnerLog({ data });
log && console.log(JSON.stringify(log));

if (log && log.status === 'job_started') {
RUNNER_JOBS_RUNNING.push(1);
RUNNER_JOBS_RUNNING.push({ id: log.job, date: log.date });
RUNNER_TIMEOUT_TIMER = 0;
} else if (log && log.status === 'job_ended') {
RUNNER_JOBS_RUNNING.pop();
const { job } = log;

if (!RUNNER_SHUTTING_DOWN) {
const jobs = job
? [job]
: (await cml.pipelineJobs({ jobs: RUNNER_JOBS_RUNNING }))
.filter((job) => job.status === 'completed')
.map((job) => job.id);

RUNNER_JOBS_RUNNING = RUNNER_JOBS_RUNNING.filter(
(job) => !jobs.includes(job.id)
);
}
}
};

proc.stderr.on('data', dataHandler);
proc.stdout.on('data', dataHandler);
proc.on('uncaughtException', () => shutdown(opts));
Expand All @@ -252,7 +265,19 @@ const runLocal = async (opts) => {
}, 1000);
}

RUNNER_LAUNCHED = true;
if (!noRetry && cml.driver === 'github') {
const watcher = setInterval(() => {
RUNNER_JOBS_RUNNING.forEach((job) => {
if (
new Date().getTime() - new Date(job.date).getTime() >
GH_5_MIN_TIMEOUT
casperdcl marked this conversation as resolved.
Show resolved Hide resolved
)
shutdown(opts) && clearInterval(watcher);
});
}, 60 * 1000);
}

RUNNER = proc;
};

const run = async (opts) => {
Expand Down Expand Up @@ -337,9 +362,15 @@ const opts = yargs
'idle-timeout',
'Time in seconds for the runner to be waiting for jobs before shutting down. Setting it to 0 disables automatic shutdown'
)
.default('name', RUNNER_NAME)
.describe('name', 'Name displayed in the repository once registered')

.default('name')
.describe('name', 'Name displayed in the repository once registered cml-{ID}')
.coerce('name', (val) => val || RUNNER_NAME)
DavidGOrtega marked this conversation as resolved.
Show resolved Hide resolved
.boolean('no-retry')
.default('no-retry', RUNNER_NO_RETRY)
.describe(
'no-retry',
'Do not restart workflow terminated due to instance disposal or GitHub Actions timeout'
)
.boolean('single')
.default('single', RUNNER_SINGLE)
.describe('single', 'Exit after running a single job')
Expand Down
Loading