Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
New get Pod logs API (#5048)
Browse files Browse the repository at this point in the history
Add new API to get pod logs:
GET /jobs/{job-name}/pods/{podUid}/logs
RESP:

{
   "locations": [
      {"name": "stdout", "uri": "https://nodeIP/podUid/stdout.0?token=string"},
      {"name": "stderr"," uri": "https://nodeIP/podUid/stderr?token=string"}  
   ]
}
  • Loading branch information
Binyang2014 authored Nov 11, 2020
1 parent 2ccdc1b commit 7c180f0
Show file tree
Hide file tree
Showing 9 changed files with 217 additions and 12 deletions.
6 changes: 5 additions & 1 deletion src/rest-server/deploy/rest-server.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ spec:
value: {{ cluster_cfg['hivedscheduler']['webservice'] }}
- name: LOG_MANAGER_PORT
value: "{{ cluster_cfg['log-manager']['port'] }}"
- name: LOG_MANAGER_ADMIN_NAME
value: "{{ cluster_cfg['log-manager']['admin_name'] }}"
- name: LOG_MANAGER_ADMIN_PASSWORD
value: "{{ cluster_cfg['log-manager']['admin_password'] }}"
{%- endif %}
- name: RATE_LIMIT_API_PER_MIN
value: "{{ cluster_cfg['rest-server']['rate-limit-api-per-min'] }}"
Expand All @@ -89,7 +93,7 @@ spec:
value: {{ cluster_cfg['rest-server']['jwt-expire-time'] }}
- name: WEBPORTAL_URL
{%- if "ssl" in cluster_cfg["pylon"] and cluster_cfg["pylon"]["ssl"] %}
value: "{{ cluster_cfg['pylon']['uriHttps']}}"
value: "{{ cluster_cfg['pylon']['uri-https']}}"
{%- else %}
value: "{{ cluster_cfg['pylon']['uri']}}"
{%- endif %}
Expand Down
76 changes: 74 additions & 2 deletions src/rest-server/docs/swagger.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ info:
Version 2.1.1: add get event api
Version 2.2.0: add get task status api; add jobAttempId filter to job status api and extend job detail schema
Version 2.2.1: a user can add/delete tags to/from his/her own jobs
version 2.2.2: add get pod logs api
license:
name: MIT License
url: "https://github.com/microsoft/pai/blob/master/LICENSE"
Expand Down Expand Up @@ -1485,6 +1486,8 @@ paths:
security:
- bearerAuth: []
parameters:
- $ref: "#/components/parameters/user"
- $ref: "#/components/parameters/job"
- name: type
in: query
description: filter events with type. Could be "Warning" or "Normal".
Expand Down Expand Up @@ -1738,7 +1741,7 @@ paths:
ssh: 37508
http: 24661
containerGpus: null
containerLog: http://10.151.40.34:9103/log-manager/tail/admin/9893b6e0f9f5997f4a82f23bef39bf32/taskrole/a9f9f1f2-5e43-4423-88c3-022433b8cd7c/
containerLog: /api/v2/jobs/admin~admin_444da84f/pods/07cdd036-1a7c-11eb-830b-000d3ab25bb6/logs
containerExitCode: -220
containerExitSpec:
code: -220
Expand All @@ -1765,7 +1768,7 @@ paths:
ssh: 37508
http: 24661
containerGpus: null
containerLog: http://10.151.40.34:9103/log-manager/tail/admin/9893b6e0f9f5997f4a82f23bef39bf32/taskrole/a9f9f1f2-5e43-4423-88c3-022433b8cd7c/
containerLog: /api/v2/jobs/admin~admin_444da84f/pods/07cdd036-1a7c-11eb-830b-000d3ab25bb6/logs
containerExitCode: -220
containerExitSpec:
code: -220
Expand All @@ -1784,6 +1787,43 @@ paths:
$ref: "#/components/responses/NoTaskError"
"500":
$ref: "#/components/responses/UnknownError"
"/api/v2/jobs/{user}~{job}/pods/{podUid}/logs":
get:
tags:
- job
summary: Get job pod log list.
description: Get job pod log list.
operationId: getPodLogs
security:
- bearerAuth: []
parameters:
- $ref: "#/components/parameters/user"
- $ref: "#/components/parameters/job"
- $ref: "#/components/parameters/podUid"
- name: tailMode
in: query
description: getting log content via tail mode. Could be "true" or "false"
schema:
type: boolean
responses:
"200":
description: Succeeded
content:
application/json:
schema:
$ref: "#/components/schemas/PodLogInfo"
example:
locations:
- name: stderr
uri: "https://mater_ip/log-manager/node_ip/api/v1/logs/user.pai.stderr?username=user&framework-name=34775529adebae576fbc0bf48d835386&pod-uid=07cdd036-1a7c-11eb-830b-000d3ab25bb6&taskrole=taskrole&token=token"
- name: all
uri: "https://mater_ip/log-manager/node_ip/api/v1/logs/user.pai.all?username=user&framework-name=34775529adebae576fbc0bf48d835386&pod-uid=07cdd036-1a7c-11eb-830b-000d3ab25bb6&taskrole=taskrole&token=token"
- name: stdout
uri: "https://mater_ip/log-manager/node_ip/api/v1/logs/user.pai.stdout?username=user&framework-name=34775529adebae576fbc0bf48d835386&pod-uid=07cdd036-1a7c-11eb-830b-000d3ab25bb6&taskrole=taskrole&token=token"
"404":
$ref: "#/components/responses/NoPodLogsError"
"500":
$ref: "#/components/responses/UnknownError"
/api/v2/kubernetes/nodes:
get:
tags:
Expand Down Expand Up @@ -1917,6 +1957,13 @@ components:
required: true
schema:
type: string
podUid:
name: podUid
in: path
description: job pod uid
required: true
schema:
type: string
schemas:
Response:
type: object
Expand Down Expand Up @@ -3067,6 +3114,20 @@ components:
- storageConfig
- email
- extension
PodLogInfo:
type: object
properties:
locations:
type: array
items:
type: object
properties:
name:
type: string
description: log name.
uri:
type: string
description: log content lnk.
responses:
InvalidParametersError:
description: InvalidParametersError
Expand Down Expand Up @@ -3226,6 +3287,17 @@ components:
value:
code: NoJobSshInfoError
message: "SSH info of job {job} is not found."
NoPodLogsError:
description: NoPodLogsError
content:
application/json:
schema:
$ref: "#/components/schemas/Response"
examples:
NoJobSshInfoError:
value:
code: NoPodLogsError
message: "Logs for pod {podUid} is not found."
ConflictUserError:
description: ConflictUserError
content:
Expand Down
24 changes: 23 additions & 1 deletion src/rest-server/src/controllers/v2/job.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ const status = require('statuses');

const asyncHandler = require('@pai/middlewares/v2/asyncHandler');
const createError = require('@pai/utils/error');
const job = require('@pai/models/v2/job');
const { job, log } = require('@pai/models/v2/job');
const logger = require('@pai/config/logger');
const { Op } = require('sequelize');

const list = asyncHandler(async (req, res) => {
Expand Down Expand Up @@ -288,6 +289,26 @@ const getEvents = asyncHandler(async (req, res) => {
res.json(data);
});

const getLogs = asyncHandler(async (req, res) => {
try {
const data = await log.getLogListFromLogManager(
req.params.frameworkName,
req.params.podUid,
req.query['tail-mode'],
);
res.json(data);
} catch (error) {
logger.error(`Got error when retrieving log list, error: ${error}`);
throw error.code === 'NoPodLogsError'
? error
: createError(
'Internal Server Error',
'UnknownError',
'Failed to get log list',
);
}
});

// module exports
module.exports = {
list,
Expand All @@ -299,4 +320,5 @@ module.exports = {
addTag,
deleteTag,
getEvents,
getLogs,
};
5 changes: 4 additions & 1 deletion src/rest-server/src/models/v2/job/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,7 @@ if (config.env !== 'test') {
}
})();
}
module.exports = require('@pai/models/v2/job/k8s');
module.exports = {
job: require('@pai/models/v2/job/k8s'),
log: require('@pai/models/v2/job/log'),
};
9 changes: 3 additions & 6 deletions src/rest-server/src/models/v2/job/k8s.js
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ const convertFrameworkSummary = (framework) => {
};
};

const convertTaskDetail = async (taskStatus, ports, logPathPrefix) => {
const convertTaskDetail = async (taskStatus, ports, frameworkName) => {
// get containerPorts
const containerPorts = getContainerPorts(
ports,
Expand All @@ -103,7 +103,7 @@ const convertTaskDetail = async (taskStatus, ports, logPathPrefix) => {
containerNodeName: taskStatus.attemptStatus.podNodeName,
containerPorts,
containerGpus,
containerLog: `http://${taskStatus.attemptStatus.podHostIP}:${process.env.LOG_MANAGER_PORT}/log-manager/tail/${logPathPrefix}/${taskStatus.attemptStatus.podUID}/`,
containerLog: `/api/v2/jobs/${frameworkName}/pods/${taskStatus.attemptStatus.podUID}/logs`,
containerExitCode: completionStatus ? completionStatus.code : null,
containerExitSpec: completionStatus
? generateExitSpec(completionStatus.code)
Expand Down Expand Up @@ -158,9 +158,6 @@ const convertFrameworkDetail = async (
const virtualCluster = frameworkWithLatestAttempt.metadata.labels
? frameworkWithLatestAttempt.metadata.labels.virtualCluster
: 'unknown';
const logPathInfix = frameworkWithLatestAttempt.metadata.annotations
? frameworkWithLatestAttempt.metadata.annotations.logPathInfix
: null;

const latestAttemptStatus = frameworkWithLatestAttempt.status.attemptStatus;
const latestAttemptCompletionStatus = latestAttemptStatus.completionStatus;
Expand Down Expand Up @@ -291,7 +288,7 @@ const convertFrameworkDetail = async (
await convertTaskDetail(
status,
ports[taskRoleStatus.name],
`${userName}/${logPathInfix || jobName}/${taskRoleStatus.name}`,
`${userName}~${jobName}`,
),
),
);
Expand Down
101 changes: 101 additions & 0 deletions src/rest-server/src/models/v2/job/log.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Copyright (c) Microsoft Corporation
// All rights reserved.
//
// MIT License
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
// to permit persons to whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
// BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

const axios = require('axios');
const job = require('./k8s');
const logger = require('@pai/config/logger');
const createError = require('@pai/utils/error');
const { encodeName } = require('@pai/models/v2/utils/name');

const LOG_MANAGER_PORT = process.env.LOG_MANAGER_PORT;
const WEBPORTAL_URL = process.env.WEBPORTAL_URL;

const constrcutLogManagerPrefix = (nodeIp) => {
return `http://${nodeIp}:${LOG_MANAGER_PORT}/api/v1`;
};

const loginLogManager = async (nodeIp, username, password) => {
const prefix = constrcutLogManagerPrefix(nodeIp);
return axios.post(`${prefix}/tokens`, {
username: username,
password: password,
});
};

const getLogListFromLogManager = async (frameworkName, podUid, tailMode) => {
const adminName = process.env.LOG_MANAGER_ADMIN_NAME;
const adminPassword = process.env.LOG_MANAGER_ADMIN_PASSWORD;

const jobDetail = await job.get(frameworkName);
const noPodLogsErr = createError(
'Not Found',
'NoPodLogsError',
`Logs for pod ${podUid} is not found.`,
);
let nodeIp;
let taskRoleName;
for (const [key, taskRole] of Object.entries(jobDetail.taskRoles)) {
const status = taskRole.taskStatuses.find(
(status) => status.containerId === podUid,
);
if (!status) {
logger.error(`Failed to find pod which has pod uid ${podUid}`);
throw noPodLogsErr;
}
nodeIp = status.containerIp;
taskRoleName = key;
}

let res = await loginLogManager(nodeIp, adminName, adminPassword);
const token = res.data.token;

const prefix = constrcutLogManagerPrefix(nodeIp);
try {
const params = {
token: token,
username: jobDetail.jobStatus.username,
taskrole: taskRoleName,
};
params['framework-name'] = encodeName(frameworkName);
params['pod-uid'] = podUid;
res = await axios.get(`${prefix}/logs`, {
params: params,
});
} catch (err) {
if (err.response && err.response.status === 404) {
throw noPodLogsErr;
}
throw err;
}
const logList = res.data;

const ret = { locations: [] };
const urlPrefix = `${WEBPORTAL_URL}/log-manager/${nodeIp}:${LOG_MANAGER_PORT}`;
const urlSuffix = tailMode === 'true' ? '&tail-mode=true' : '';
for (const key in logList) {
ret.locations.push({
name: key,
uri: `${urlPrefix}${logList[key]}${urlSuffix}`,
});
}

return ret;
};

module.exports = {
getLogListFromLogManager,
};
5 changes: 5 additions & 0 deletions src/rest-server/src/routes/v2/job.js
Original file line number Diff line number Diff line change
Expand Up @@ -97,5 +97,10 @@ router
/** GET /api/v2/jobs/:frameworkName/events - Get events of a framework */
.get(token.check, controller.getEvents);

router
.route('/:frameworkName/pods/:podUid/logs')
/** GET /api/v2/jobs/:frameworkName/pods/:podUid/logs - Get logs of a pod */
.get(token.check, controller.getLogs);

// module exports
module.exports = router;
1 change: 1 addition & 0 deletions src/rest-server/src/utils/error.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ declare type Code =
'UnauthorizedUserError' |
'NoEnoughQuotaError' |
'NotImplementedError' |
'NoPodLogsError' |
'UnknownError';

declare function createError(status: Status, code: Code, message: string): HttpError;
Expand Down
2 changes: 1 addition & 1 deletion tests/jenkins/test_rest_server_js_sdk.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ EOT
cp ${WORKSPACE}/src/rest-server/docs/swagger.yaml .
node tests/common/apiTestCaseGenerator.js -- "swagger.yaml" ".tests/apiTestCase.json"
sudo npm install -g mocha
mocha tests/api_tests/**/*.spec.js -t 20000
mocha tests/api_tests/**/*.spec.js -t 1800000

cd ../..
;;
Expand Down

0 comments on commit 7c180f0

Please sign in to comment.