Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
Merge branch 'master' into linter
Browse files: browse the repository at this point in the history
  • Loading branch information
debuggy committed Aug 24, 2020
2 parents 5ebed5d + c0a616d commit e5364e6
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 157 deletions.
2 changes: 1 addition & 1 deletion src/hivedscheduler/build/hivedscheduler.k8s.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

FROM hivedscheduler/hivedscheduler:v0.3.3
FROM hivedscheduler/hivedscheduler:v0.3.4
3 changes: 2 additions & 1 deletion src/rest-server/docs/swagger.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@ info:
Version 2.0.2: update job detail and job attempt schema
Version 2.0.3: update parameters description of get storage list
Version 2.0.4: add default field in get storage list
Version 2.0.5: add more parameters to job list; add submissionTime
license:
name: MIT License
url: "https://github.com/microsoft/pai/blob/master/LICENSE"
version: 2.0.4
version: 2.0.5
externalDocs:
description: Find out more about OpenPAI
url: "https://github.com/microsoft/pai"
Expand Down
264 changes: 118 additions & 146 deletions src/rest-server/src/middlewares/v2/hived.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

// module dependencies
const axios = require('axios');
const { get, pickBy } = require('lodash');
const createError = require('@pai/utils/error');
const logger = require('@pai/config/logger');
const hivedSchema = require('@pai/config/v2/hived');
Expand Down Expand Up @@ -55,7 +56,7 @@ const getCellStatus = async (virtualCluster) => {
}

let cellQuota = 0;
const cellUnits = [...new Set(vcStatus.map((cell) => cell.gpuType))]
const cellUnits = [...new Set(vcStatus.map((cell) => cell.leafCellType))]
.filter((key) => key in resourceUnits)
.reduce((dict, key) => ({ ...dict, [key]: resourceUnits[key] }), {});
const cellQueue = [...vcStatus];
Expand All @@ -81,24 +82,85 @@ const hivedValidate = async (protocolObj, username) => {
hivedSchema.validate.errors,
);
}
let hivedConfig = null;
const affinityGroups = {};
let opportunistic = false;
const gangAllocation =
'extras' in protocolObj && protocolObj.extras.gangAllocation !== false;
const virtualCluster =
'defaults' in protocolObj && protocolObj.defaults.virtualCluster != null
? protocolObj.defaults.virtualCluster
: 'default';

const hivedConfig = get(protocolObj, 'extras.hivedScheduler', null);
const opportunistic = !!(get(hivedConfig, 'jobPriorityClass') === 'oppo');
const gangAllocation = !!(
get(protocolObj, 'extras.gangAllocation', true) === true
);
const virtualCluster = get(protocolObj, 'defaults.virtualCluster', 'default');

const affinityGroups = {};
const { cellQuota, cellUnits } = await getCellStatus(virtualCluster);

if ('extras' in protocolObj && 'hivedScheduler' in protocolObj.extras) {
hivedConfig = protocolObj.extras.hivedScheduler;
if (hivedConfig && hivedConfig.jobPriorityClass === 'oppo') {
opportunistic = true;
// generate podSpec for every taskRole
for (const taskRole of Object.keys(protocolObj.taskRoles)) {
const podSpec = pickBy(
{
virtualCluster,
priority: convertPriority(get(hivedConfig, 'jobPriorityClass')),
pinnedCellId: get(
hivedConfig,
`taskRoles.${taskRole}.pinnedCellId`,
null,
),
leafCellType: get(hivedConfig, `taskRoles.${taskRole}.skuType`, null),
leafCellNumber: get(hivedConfig, `taskRoles.${taskRole}.skuNumber`, 0),
gangReleaseEnable: get(hivedConfig, 'gangReleaseEnable'),
lazyPreemptionEnable: get(hivedConfig, 'lazyPreemptionEnable'),
ignoreK8sSuggestedNodes: get(hivedConfig, 'ignoreK8sSuggestedNodes'),
affinityGroup: null,
},
(v) => v !== undefined,
);

// calculate sku number
const resourcePerCell = {};
for (const t of ['gpu', 'cpu', 'memory']) {
if (podSpec.leafCellType != null) {
resourcePerCell[t] = resourceUnits[podSpec.leafCellType][t];
} else {
resourcePerCell[t] = Math.min(
...Array.from(
Object.values(opportunistic ? resourceUnits : cellUnits),
(v) => v[t],
),
);
}
}
const { gpu = 0, cpu, memoryMB } = protocolObj.taskRoles[
taskRole
].resourcePerInstance;
let requestedResource = '';
let emptyResource = '';
if (resourcePerCell.gpu === 0 && gpu > 0) {
requestedResource = resourcePerCell.gpu;
emptyResource = 'GPU';
} else if (resourcePerCell.cpu === 0 && cpu > 0) {
requestedResource = resourcePerCell.cpu;
emptyResource = 'CPU';
} else if (resourcePerCell.memory === 0 && memoryMB > 0) {
requestedResource = resourcePerCell.memory;
emptyResource = 'memory';
}
if (emptyResource !== '') {
throw createError(
'Bad Request',
'InvalidProtocolError',
`Taskrole ${taskRole} requests ${requestedResource} ${emptyResource}, but SKU does not ` +
`configure ${emptyResource}. Please contact admin if the taskrole needs ${emptyResource} resources.`,
);
}
podSpec.leafCellNumber = Math.max(
gpu === 0 ? 0 : Math.ceil(gpu / resourcePerCell.gpu),
cpu === 0 ? 0 : Math.ceil(cpu / resourcePerCell.cpu),
memoryMB === 0 ? 0 : Math.ceil(memoryMB / resourcePerCell.memory),
);

protocolObj.taskRoles[taskRole].hivedPodSpec = podSpec;
}

if (hivedConfig != null) {
for (const taskRole of Object.keys(hivedConfig.taskRoles || {})) {
// must be a valid taskRole
if (!(taskRole in protocolObj.taskRoles)) {
Expand All @@ -109,55 +171,47 @@ const hivedValidate = async (protocolObj, username) => {
);
}

const taskRoleConfig = hivedConfig.taskRoles[taskRole];
// at most one of [pinnedCellId, skuType] allowed
if (
taskRoleConfig.pinnedCellId !== null &&
taskRoleConfig.skuType !== null
) {
const skuType = protocolObj.taskRoles[taskRole].hivedPodSpec.leafCellType;
const pinnedCellId =
protocolObj.taskRoles[taskRole].hivedPodSpec.pinnedCellId;
// only allow one of {skuType, pinnedCellId}
if (skuType != null && pinnedCellId != null) {
throw createError(
'Bad Request',
'InvalidProtocolError',
`Taskrole ${taskRole} has both pinnedCellId and skuType, only one allowed.`,
`Taskrole ${taskRole} has both skuType and pinnedCellId, only one is allowed.`,
);
}

if (taskRoleConfig.skuType !== null) {
if (!(taskRoleConfig.skuType in resourceUnits)) {
// check whether skuType is valid
if (skuType != null) {
if (!(skuType in resourceUnits)) {
throw createError(
'Bad Request',
'InvalidProtocolError',
`Taskrole ${taskRole} has unknown skuType ${
taskRoleConfig.skuType
}, allow ${Object.keys(resourceUnits)}.`,
`Taskrole ${taskRole} has unknown skuType ${skuType}, allow ${Object.keys(
resourceUnits,
)}.`,
);
}
if (!opportunistic && !(taskRoleConfig.skuType in cellUnits)) {
if (!opportunistic && !(skuType in cellUnits)) {
throw createError(
'Bad Request',
'InvalidProtocolError',
`Taskrole ${taskRole} has skuType ${
taskRoleConfig.skuType
}, VC ${virtualCluster} only allows ${Object.keys(cellUnits)}.`,
`Taskrole ${taskRole} has skuType ${skuType}, VC ${virtualCluster} only allows ${Object.keys(
cellUnits,
)}.`,
);
}
}

const affinityGroupName = taskRoleConfig.affinityGroupName;
// affinityGroup should have uniform pinnedCellId and skuType
if (affinityGroupName !== null) {
const affinityGroupName =
hivedConfig.taskRoles[taskRole].affinityGroupName;
// all task roles in the same affinityGroup must share the same skuType and pinnedCellId
if (affinityGroupName != null) {
if (affinityGroupName in affinityGroups) {
if (taskRoleConfig.pinnedCellId === null) {
taskRoleConfig.pinnedCellId =
affinityGroups[affinityGroupName].pinnedCellId;
}
if (taskRoleConfig.skuType === null) {
taskRoleConfig.skuType = affinityGroups[affinityGroupName].skuType;
}
if (
taskRoleConfig.pinnedCellId !==
affinityGroups[affinityGroupName].pinnedCellId ||
taskRoleConfig.skuType !== affinityGroups[affinityGroupName].skuType
skuType !== affinityGroups[affinityGroupName].skuType ||
pinnedCellId !== affinityGroups[affinityGroupName].pinnedCellId
) {
throw createError(
'Bad Request',
Expand All @@ -167,139 +221,57 @@ const hivedValidate = async (protocolObj, username) => {
}
} else {
affinityGroups[affinityGroupName] = {
pinnedCellId: taskRoleConfig.pinnedCellId,
skuType: taskRoleConfig.skuType,
skuType,
pinnedCellId,
affinityTaskList: [],
};
}
const { gpu = 0, cpu } = protocolObj.taskRoles[
taskRole
].resourcePerInstance;
affinityGroups[affinityGroupName].affinityTaskList.push({
podNumber: protocolObj.taskRoles[taskRole].instances,
gpuNumber: gpu === 0 ? cpu : gpu,
leafCellNumber:
protocolObj.taskRoles[taskRole].hivedPodSpec.leafCellNumber,
});
}
}

for (const affinityGroupName of Object.keys(affinityGroups)) {
if (
affinityGroups[affinityGroupName].skuType !== null &&
affinityGroups[affinityGroupName].pinnedCellId !== null
) {
throw createError(
'Bad Request',
'InvalidProtocolError',
`AffinityGroup ${affinityGroupName} has both pinnedCellId and skuType, only one allowed.`,
);
}
}
}

// generate default affinity group for the gang scheduling jobs
let defaultAffinityGroup = null;
if (!Object.keys(affinityGroups).length && gangAllocation) {
defaultAffinityGroup = {
affinityTaskList: Object.keys(protocolObj.taskRoles).map((taskRole) => {
const { gpu = 0, cpu } = protocolObj.taskRoles[
taskRole
].resourcePerInstance;
return {
podNumber: protocolObj.taskRoles[taskRole].instances,
gpuNumber: gpu === 0 ? cpu : gpu,
leafCellNumber:
protocolObj.taskRoles[taskRole].hivedPodSpec.leafCellNumber,
};
}),
};
}

// generate podSpec for every taskRole
let requestCellNumber = 0;
for (const taskRole of Object.keys(protocolObj.taskRoles)) {
const resourcePerCell = {};
for (const t of ['gpu', 'cpu', 'memory']) {
resourcePerCell[t] = Math.min(
...Array.from(
Object.values(opportunistic ? resourceUnits : cellUnits),
(v) => v[t],
),
);
}

const podSpec = {
virtualCluster,
priority: convertPriority(
hivedConfig ? hivedConfig.jobPriorityClass : undefined,
),
gpuType: null,
pinnedCellId: null,
gpuNumber: 0,
affinityGroup: null,
};
if (
hivedConfig &&
hivedConfig.taskRoles &&
taskRole in hivedConfig.taskRoles
) {
podSpec.gpuType = hivedConfig.taskRoles[taskRole].skuType;
if (podSpec.gpuType !== null) {
for (const t of ['gpu', 'cpu', 'memory']) {
resourcePerCell[t] = resourceUnits[podSpec.gpuType][t];
}
}
podSpec.pinnedCellId = hivedConfig.taskRoles[taskRole].pinnedCellId;

const affinityGroupName =
hivedConfig.taskRoles[taskRole].affinityGroupName;
podSpec.affinityGroup = affinityGroupName
? {
name: `${username}~${protocolObj.name}/${affinityGroupName}`,
members: affinityGroups[affinityGroupName].affinityTaskList,
}
: null;
const affinityGroupName = get(
hivedConfig,
`taskRoles.${taskRole}.affinityGroupName`,
);
if (affinityGroupName != null) {
protocolObj.taskRoles[taskRole].hivedPodSpec.affinityGroup = {
name: `${username}~${protocolObj.name}/${affinityGroupName}`,
members: affinityGroups[affinityGroupName].affinityTaskList,
};
}

if (defaultAffinityGroup != null) {
podSpec.affinityGroup = {
protocolObj.taskRoles[taskRole].hivedPodSpec.affinityGroup = {
name: `${username}~${protocolObj.name}/default`,
members: defaultAffinityGroup.affinityTaskList,
};
}

const { gpu = 0, cpu, memoryMB } = protocolObj.taskRoles[
taskRole
].resourcePerInstance;
let requestedResource = '';
let emptyResource = '';
if (resourcePerCell.gpu === 0 && gpu > 0) {
requestedResource = resourcePerCell.gpu;
emptyResource = 'GPU';
} else if (resourcePerCell.cpu === 0 && cpu > 0) {
requestedResource = resourcePerCell.cpu;
emptyResource = 'CPU';
} else if (resourcePerCell.memory === 0 && memoryMB > 0) {
requestedResource = resourcePerCell.memory;
emptyResource = 'memory';
}
if (emptyResource !== '') {
throw createError(
'Bad Request',
'InvalidProtocolError',
`Taskrole ${taskRole} requests ${requestedResource} ${emptyResource}, but SKU does not ` +
`configure ${emptyResource}. Please contact admin if the taskrole needs ${emptyResource} resources.`,
);
}

const cellNumber = Math.max(
gpu === 0 ? 0 : Math.ceil(gpu / resourcePerCell.gpu),
cpu === 0 ? 0 : Math.ceil(cpu / resourcePerCell.cpu),
memoryMB === 0 ? 0 : Math.ceil(memoryMB / resourcePerCell.memory),
);
podSpec.gpuNumber = cellNumber;
requestCellNumber += protocolObj.taskRoles[taskRole].instances * cellNumber;

protocolObj.taskRoles[taskRole].hivedPodSpec = podSpec;
requestCellNumber +=
protocolObj.taskRoles[taskRole].instances *
protocolObj.taskRoles[taskRole].hivedPodSpec.leafCellNumber;
}

// best effort check cell quota
if (requestCellNumber > cellQuota && gangAllocation && !opportunistic) {
throw createError(
'Bad Request',
Expand Down
Loading

0 comments on commit e5364e6

Please sign in to comment.