Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Runner edge cases improvements #1030

Merged
merged 21 commits into master from runner-no-special-cases
Jun 19, 2022
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
62d1c82
Runner edge cases improvements
DavidGOrtega May 29, 2022
6b0e6b8
Merge branch 'master' of https://github.com/iterative/cml into runner…
DavidGOrtega May 29, 2022
7a5c7c8
merge master
DavidGOrtega May 30, 2022
6bbccbb
fix parselog
DavidGOrtega May 30, 2022
99c4222
refactor ugly return
DavidGOrtega May 30, 2022
a3a0e3e
fix test
DavidGOrtega May 30, 2022
8e6b2d6
idle check not needed
DavidGOrtega May 30, 2022
cfe5a31
fix tests
DavidGOrtega May 30, 2022
f9cade3
fix gl
DavidGOrtega May 30, 2022
d381dce
Merge branch 'master' of https://github.com/iterative/cml into runner…
DavidGOrtega Jun 2, 2022
cb1e92d
multi logs parser
DavidGOrtega Jun 5, 2022
fbec7b0
Merge branch 'master' of https://github.com/iterative/cml into runner…
DavidGOrtega Jun 5, 2022
b811adf
feedback fixes
DavidGOrtega Jun 9, 2022
cc6576d
Merge branch 'master' of https://github.com/iterative/cml into runner…
DavidGOrtega Jun 9, 2022
84c2d6e
Merge branch 'master' into runner-no-special-cases
DavidGOrtega Jun 10, 2022
7a29f6a
process lost
DavidGOrtega Jun 10, 2022
6fd8049
job not id
DavidGOrtega Jun 10, 2022
6e01f9f
patterns not entities
DavidGOrtega Jun 10, 2022
45b93f5
Merge branch 'runner-no-special-cases' of https://github.com/iterativ…
DavidGOrtega Jun 10, 2022
9b0a217
Merge branch 'master' of https://github.com/iterative/cml into runner…
DavidGOrtega Jun 10, 2022
2e733e4
Merge branch 'master' into runner-no-special-cases
DavidGOrtega Jun 16, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
225 changes: 97 additions & 128 deletions bin/cml/runner.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ const tf = require('../../src/terraform');

let cml;
let RUNNER;
let RUNNER_ID;
let RUNNER_JOBS_RUNNING = [];
let RUNNER_SHUTTING_DOWN = false;
let RUNNER_TIMER = 0;
Expand Down Expand Up @@ -48,11 +47,6 @@ const shutdown = async (opts) => {
const retryWorkflows = async () => {
try {
if (!noRetry) {
if (cml.driver === 'github') {
const job = await cml.runnerJob({ runnerId: RUNNER_ID });
if (job) RUNNER_JOBS_RUNNING = [job];
}

if (RUNNER_JOBS_RUNNING.length > 0) {
await Promise.all(
RUNNER_JOBS_RUNNING.map(
Expand Down Expand Up @@ -80,7 +74,7 @@ const shutdown = async (opts) => {
};

if (error) {
winston.error(error, { reason, status: 'terminated' });
winston.error(error, { status: 'terminated' });
} else {
winston.info('runner status', { reason, status: 'terminated' });
}
Expand Down Expand Up @@ -128,6 +122,8 @@ const runCloud = async (opts) => {
workdir
} = opts;

await tf.checkMinVersion();
DavidGOrtega marked this conversation as resolved.
Show resolved Hide resolved

if (gpu === 'tesla')
winston.warn(
'GPU model "tesla" has been deprecated; please use "v100" instead.'
Expand Down Expand Up @@ -163,6 +159,7 @@ const runCloud = async (opts) => {
});

await fs.writeFile(tfMainPath, tpl);

await tf.init({ dir: tfPath });
await tf.apply({ dir: tfPath });

Expand Down Expand Up @@ -211,24 +208,54 @@ const runCloud = async (opts) => {

const runLocal = async (opts) => {
winston.info(`Launching ${cml.driver} runner`);
const { workdir, name, labels, single, idleTimeout, noRetry, dockerVolumes } =
opts;
const {
workdir,
name,
labels,
single,
idleTimeout,
noRetry,
dockerVolumes,
tfResource,
tpiVersion
} = opts;

if (tfResource) {
await tf.checkMinVersion();

const tfPath = workdir;
await fs.mkdir(tfPath, { recursive: true });
const tfMainPath = join(tfPath, 'main.tf');
const tpl = tf.iterativeProviderTpl({ tpiVersion });
await fs.writeFile(tfMainPath, tpl);

await tf.init({ dir: tfPath });
await tf.apply({ dir: tfPath });

const path = join(tfPath, 'terraform.tfstate');
const tfstate = await tf.loadTfState({ path });
tfstate.resources = [
JSON.parse(Buffer.from(tfResource, 'base64').toString('utf-8'))
];
await tf.saveTfState({ tfstate, path });
}

const dataHandler = async (data) => {
const log = cml.parseRunnerLog({ data });
log && winston.info('runner status', log);

if (log && log.status === 'job_started') {
RUNNER_JOBS_RUNNING.push({ id: log.job, date: log.date });
} else if (log && log.status === 'job_ended') {
const { job: jobId } = log;
RUNNER_JOBS_RUNNING = RUNNER_JOBS_RUNNING.filter(
(job) => job.id !== jobId
);
RUNNER_TIMER = 0;
const logs = await cml.parseRunnerLog({ data });
for (const log of logs) {
winston.info('runner status', log);

if (log.status === 'job_started') {
RUNNER_JOBS_RUNNING.push({ id: log.job, date: log.date });
}

if (single && cml.driver === 'bitbucket') {
await shutdown({ ...opts, reason: 'single job' });
if (log.status === 'job_ended') {
const { job: jobId } = log;
RUNNER_JOBS_RUNNING = RUNNER_JOBS_RUNNING.filter(
(job) => job.id !== jobId
);

if (single) await shutdown({ ...opts, reason: 'single job' });
}
}
};
Expand All @@ -245,63 +272,25 @@ const runLocal = async (opts) => {
proc.stderr.on('data', dataHandler);
proc.stdout.on('data', dataHandler);
proc.on('disconnect', () =>
shutdown({ ...opts, reason: `runner disconnected` })
);
proc.on('close', (exit) =>
shutdown({ ...opts, reason: `runner closed with exit code ${exit}` })
shutdown({ ...opts, error: new Error('runner proccess lost') })
);
proc.on('close', (exit) => {
const reason = `runner closed with exit code ${exit}`;
if (exit === 0) shutdown({ ...opts, reason });
else shutdown({ ...opts, error: new Error(reason) });
});

RUNNER = proc;
({ id: RUNNER_ID } = await cml.runnerByName({ name }));

if (idleTimeout > 0) {
const watcher = setInterval(async () => {
let idle = RUNNER_JOBS_RUNNING.length === 0;
const idle = RUNNER_JOBS_RUNNING.length === 0;

if (RUNNER_TIMER >= idleTimeout) {
try {
if (cml.driver === 'github') {
const job = await cml.runnerJob({ runnerId: RUNNER_ID });

if (!job && !idle) {
winston.error(
`Runner is idle as per the GitHub API but busy as per CML internal state. Resetting jobs. Retrying in ${idleTimeout} seconds...`
);
winston.warn(
`CML GitHub driver response: ${JSON.stringify(job)}`
);
winston.warn(
`CML internal state: ${JSON.stringify(RUNNER_JOBS_RUNNING)}`
);

RUNNER_JOBS_RUNNING = [];
}

if (job && idle) {
winston.error(
`Runner is busy as per the GitHub API but idle as per CML internal state. Retrying in ${idleTimeout} seconds...`
);

idle = false;
}
}
} catch (err) {
winston.error(
`Error connecting the SCM: ${err.message}. Will try again in ${idleTimeout} secs`
);

idle = false;
}

if (idle) {
shutdown({ ...opts, reason: `timeout:${idleTimeout}` });
clearInterval(watcher);
} else {
RUNNER_TIMER = 0;
}
shutdown({ ...opts, reason: `timeout:${idleTimeout}` });
clearInterval(watcher);
}

RUNNER_TIMER++;
RUNNER_TIMER = idle ? RUNNER_TIMER + 1 : 0;
dacbd marked this conversation as resolved.
Show resolved Hide resolved
}, 1000);
}

Expand Down Expand Up @@ -333,23 +322,26 @@ const runLocal = async (opts) => {
};

const run = async (opts) => {
process.on('unhandledRejection', (reason) =>
DavidGOrtega marked this conversation as resolved.
Show resolved Hide resolved
shutdown({ ...opts, error: new Error(reason) })
DavidGOrtega marked this conversation as resolved.
Show resolved Hide resolved
);
process.on('uncaughtException', (error) => shutdown({ ...opts, error }));

['SIGTERM', 'SIGINT', 'SIGQUIT'].forEach((signal) => {
process.on(signal, () => shutdown({ ...opts, reason: signal }));
});

opts.workdir = opts.workdir || `${homedir()}/.cml/${opts.name}`;
const {
tpiVersion,
driver,
repo,
token,
workdir,
cloud,
labels,
name,
reuse,
dockerVolumes,
tfResource,
workdir
dockerVolumes
} = opts;

cml = new CML({ driver, repo, token });
Expand All @@ -359,27 +351,8 @@ const run = async (opts) => {
if (dockerVolumes.length && cml.driver !== 'gitlab')
winston.warn('Parameters --docker-volumes is only supported in gitlab');

if (cloud || tfResource) await tf.checkMinVersion();

// prepare tf
if (tfResource) {
const tfPath = workdir;

await fs.mkdir(tfPath, { recursive: true });
const tfMainPath = join(tfPath, 'main.tf');
const tpl = tf.iterativeProviderTpl({ tpiVersion });
await fs.writeFile(tfMainPath, tpl);
await tf.init({ dir: tfPath });
await tf.apply({ dir: tfPath });
const path = join(tfPath, 'terraform.tfstate');
const tfstate = await tf.loadTfState({ path });
tfstate.resources = [
JSON.parse(Buffer.from(tfResource, 'base64').toString('utf-8'))
];
await tf.saveTfState({ tfstate, path });
}

const runners = await cml.runners();

const runner = await cml.runnerByName({ name, runners });
if (runner) {
if (!reuse)
Expand All @@ -402,13 +375,9 @@ const run = async (opts) => {
process.exit(0);
}

try {
winston.info(`Preparing workdir ${workdir}...`);
await fs.mkdir(workdir, { recursive: true });
await fs.chmod(workdir, '766');
} catch (err) {
winston.warn(err.message);
}
winston.info(`Preparing workdir ${workdir}...`);
await fs.mkdir(workdir, { recursive: true });
await fs.chmod(workdir, '766');

if (cloud) await runCloud(opts);
else await runLocal(opts);
Expand All @@ -433,17 +402,21 @@ exports.handler = async (opts) => {
exports.builder = (yargs) =>
yargs.env('CML_RUNNER').options(
kebabcaseKeys({
tpiVersion: {
driver: {
type: 'string',
default: '>= 0.9.10',
choices: ['github', 'gitlab', 'bitbucket'],
description:
'Pin the iterative/iterative terraform provider to a specific version. i.e. "= 0.10.4" See: https://www.terraform.io/language/expressions/version-constraints',
hidden: true
'Platform where the repository is hosted. If not specified, it will be inferred from the environment'
},
dockerVolumes: {
type: 'array',
default: [],
description: 'Docker volumes. This feature is only supported in GitLab'
repo: {
type: 'string',
description:
'Repository to be used for registering the runner. If not specified, it will be inferred from the environment'
},
token: {
type: 'string',
description:
'Personal access token to register a self-hosted runner on the repository. If not specified, it will be inferred from the environment'
},
labels: {
type: 'string',
Expand Down Expand Up @@ -479,21 +452,16 @@ exports.builder = (yargs) =>
description:
"Don't launch a new runner if an existing one has the same name or overlapping labels"
},
driver: {
type: 'string',
choices: ['github', 'gitlab', 'bitbucket'],
description:
'Platform where the repository is hosted. If not specified, it will be inferred from the environment'
},
repo: {
workdir: {
type: 'string',
description:
'Repository to be used for registering the runner. If not specified, it will be inferred from the environment'
hidden: true,
alias: 'path',
description: 'Runner working directory'
},
token: {
type: 'string',
description:
'Personal access token to register a self-hosted runner on the repository. If not specified, it will be inferred from the environment'
dockerVolumes: {
type: 'array',
default: [],
description: 'Docker volumes. This feature is only supported in GitLab'
},
cloud: {
type: 'string',
Expand Down Expand Up @@ -573,6 +541,13 @@ exports.builder = (yargs) =>
description: 'Specifies the subnet to use within AWS',
alias: 'cloud-aws-subnet-id'
},
tpiVersion: {
type: 'string',
default: '>= 0.9.10',
description:
'Pin the iterative/iterative terraform provider to a specific version. i.e. "= 0.10.4" See: https://www.terraform.io/language/expressions/version-constraints',
hidden: true
},
cmlVersion: {
type: 'string',
default: require('../../package.json').version,
Expand All @@ -589,12 +564,6 @@ exports.builder = (yargs) =>
hidden: true,
description:
'Seconds to wait for collecting logs on failure (https://github.com/iterative/cml/issues/413)'
},
workdir: {
type: 'string',
hidden: true,
alias: 'path',
description: 'Runner working directory'
}
})
);
32 changes: 16 additions & 16 deletions bin/cml/runner.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,19 @@ describe('CML e2e', () => {
--version Show version number [boolean]
--log Maximum log level
[string] [choices: \\"error\\", \\"warn\\", \\"info\\", \\"debug\\"] [default: \\"info\\"]
--docker-volumes Docker volumes. This feature is only
supported in GitLab
[array] [default: []]
--driver Platform where the repository is
hosted. If not specified, it will be
inferred from the environment
[string] [choices: \\"github\\", \\"gitlab\\", \\"bitbucket\\"]
--repo Repository to be used for
registering the runner. If not
specified, it will be inferred from
the environment [string]
--token Personal access token to register a
self-hosted runner on the
repository. If not specified, it
will be inferred from the
environment [string]
--labels One or more user-defined labels for
this runner (delimited with commas)
[string] [default: \\"cml\\"]
Expand All @@ -84,19 +94,9 @@ describe('CML e2e', () => {
--reuse Don't launch a new runner if an
existing one has the same name or
overlapping labels [boolean]
--driver Platform where the repository is
hosted. If not specified, it will be
inferred from the environment
[string] [choices: \\"github\\", \\"gitlab\\", \\"bitbucket\\"]
--repo Repository to be used for
registering the runner. If not
specified, it will be inferred from
the environment [string]
--token Personal access token to register a
self-hosted runner on the
repository. If not specified, it
will be inferred from the
environment [string]
--docker-volumes Docker volumes. This feature is only
supported in GitLab
[array] [default: []]
--cloud Cloud to deploy the runner
[string] [choices: \\"aws\\", \\"azure\\", \\"gcp\\", \\"kubernetes\\"]
--cloud-region Region where the instance is
Expand Down
Loading