diff --git a/src/rest-server/deploy/rest-server.yaml.template b/src/rest-server/deploy/rest-server.yaml.template index f3b268612e..ae2c3f74f4 100644 --- a/src/rest-server/deploy/rest-server.yaml.template +++ b/src/rest-server/deploy/rest-server.yaml.template @@ -72,6 +72,10 @@ spec: value: {{ cluster_cfg['hivedscheduler']['webservice'] }} - name: LOG_MANAGER_PORT value: "{{ cluster_cfg['log-manager']['port'] }}" + - name: LOG_MANAGER_ADMIN_NAME + value: "{{ cluster_cfg['log-manager']['admin_name'] }}" + - name: LOG_MANAGER_ADMIN_PASSWORD + value: "{{ cluster_cfg['log-manager']['admin_password'] }}" {%- endif %} - name: RATE_LIMIT_API_PER_MIN value: "{{ cluster_cfg['rest-server']['rate-limit-api-per-min'] }}" @@ -89,7 +93,7 @@ spec: value: {{ cluster_cfg['rest-server']['jwt-expire-time'] }} - name: WEBPORTAL_URL {%- if "ssl" in cluster_cfg["pylon"] and cluster_cfg["pylon"]["ssl"] %} - value: "{{ cluster_cfg['pylon']['uriHttps']}}" + value: "{{ cluster_cfg['pylon']['uri-https']}}" {%- else %} value: "{{ cluster_cfg['pylon']['uri']}}" {%- endif %} diff --git a/src/rest-server/docs/swagger.yaml b/src/rest-server/docs/swagger.yaml index 504a50915f..4d2aec74e4 100644 --- a/src/rest-server/docs/swagger.yaml +++ b/src/rest-server/docs/swagger.yaml @@ -12,6 +12,7 @@ info: Version 2.1.1: add get event api Version 2.2.0: add get task status api; add jobAttempId filter to job status api and extend job detail schema Version 2.2.1: a user can add/delete tags to/from his/her own jobs + version 2.2.2: add get pod logs api license: name: MIT License url: "https://github.com/microsoft/pai/blob/master/LICENSE" @@ -1485,6 +1486,8 @@ paths: security: - bearerAuth: [] parameters: + - $ref: "#/components/parameters/user" + - $ref: "#/components/parameters/job" - name: type in: query description: filter events with type. Could be "Warning" or "Normal". @@ -1738,7 +1741,7 @@ paths: ssh: 37508 http: 24661 containerGpus: null - containerLog: http://10.151.40.34:9103/log-manager/tail/admin/9893b6e0f9f5997f4a82f23bef39bf32/taskrole/a9f9f1f2-5e43-4423-88c3-022433b8cd7c/ + containerLog: /api/v2/jobs/admin~admin_444da84f/pods/07cdd036-1a7c-11eb-830b-000d3ab25bb6/logs containerExitCode: -220 containerExitSpec: code: -220 @@ -1765,7 +1768,7 @@ paths: ssh: 37508 http: 24661 containerGpus: null - containerLog: http://10.151.40.34:9103/log-manager/tail/admin/9893b6e0f9f5997f4a82f23bef39bf32/taskrole/a9f9f1f2-5e43-4423-88c3-022433b8cd7c/ + containerLog: /api/v2/jobs/admin~admin_444da84f/pods/07cdd036-1a7c-11eb-830b-000d3ab25bb6/logs containerExitCode: -220 containerExitSpec: code: -220 @@ -1784,6 +1787,43 @@ paths: $ref: "#/components/responses/NoTaskError" "500": $ref: "#/components/responses/UnknownError" + "/api/v2/jobs/{user}~{job}/pods/{podUid}/logs": + get: + tags: + - job + summary: Get job pod log list. + description: Get job pod log list. + operationId: getPodLogs + security: + - bearerAuth: [] + parameters: + - $ref: "#/components/parameters/user" + - $ref: "#/components/parameters/job" + - $ref: "#/components/parameters/podUid" + - name: tailMode + in: query + description: getting log content via tail mode. Could be "true" or "false" + schema: + type: boolean + responses: + "200": + description: Succeeded + content: + application/json: + schema: + $ref: "#/components/schemas/PodLogInfo" + example: + locations: + - name: stderr + uri: "https://mater_ip/log-manager/node_ip/api/v1/logs/user.pai.stderr?username=user&framework-name=34775529adebae576fbc0bf48d835386&pod-uid=07cdd036-1a7c-11eb-830b-000d3ab25bb6&taskrole=taskrole&token=token" + - name: all + uri: "https://mater_ip/log-manager/node_ip/api/v1/logs/user.pai.all?username=user&framework-name=34775529adebae576fbc0bf48d835386&pod-uid=07cdd036-1a7c-11eb-830b-000d3ab25bb6&taskrole=taskrole&token=token" + - name: stdout + uri: "https://mater_ip/log-manager/node_ip/api/v1/logs/user.pai.stdout?username=user&framework-name=34775529adebae576fbc0bf48d835386&pod-uid=07cdd036-1a7c-11eb-830b-000d3ab25bb6&taskrole=taskrole&token=token" + "404": + $ref: "#/components/responses/NoPodLogsError" + "500": + $ref: "#/components/responses/UnknownError" /api/v2/kubernetes/nodes: get: tags: @@ -1917,6 +1957,13 @@ components: required: true schema: type: string + podUid: + name: podUid + in: path + description: job pod uid + required: true + schema: + type: string schemas: Response: type: object @@ -3067,6 +3114,20 @@ components: - storageConfig - email - extension + PodLogInfo: + type: object + properties: + locations: + type: array + items: + type: object + properties: + name: + type: string + description: log name. + uri: + type: string + description: log content lnk. responses: InvalidParametersError: description: InvalidParametersError @@ -3226,6 +3287,17 @@ components: value: code: NoJobSshInfoError message: "SSH info of job {job} is not found." + NoPodLogsError: + description: NoPodLogsError + content: + application/json: + schema: + $ref: "#/components/schemas/Response" + examples: + NoJobSshInfoError: + value: + code: NoPodLogsError + message: "Logs for pod {podUid} is not found." ConflictUserError: description: ConflictUserError content: diff --git a/src/rest-server/src/controllers/v2/job.js b/src/rest-server/src/controllers/v2/job.js index 0831bb5e62..7621038f3e 100644 --- a/src/rest-server/src/controllers/v2/job.js +++ b/src/rest-server/src/controllers/v2/job.js @@ -20,7 +20,8 @@ const status = require('statuses'); const asyncHandler = require('@pai/middlewares/v2/asyncHandler'); const createError = require('@pai/utils/error'); -const job = require('@pai/models/v2/job'); +const { job, log } = require('@pai/models/v2/job'); +const logger = require('@pai/config/logger'); const { Op } = require('sequelize'); const list = asyncHandler(async (req, res) => { @@ -288,6 +289,26 @@ const getEvents = asyncHandler(async (req, res) => { res.json(data); }); +const getLogs = asyncHandler(async (req, res) => { + try { + const data = await log.getLogListFromLogManager( + req.params.frameworkName, + req.params.podUid, + req.query['tail-mode'], + ); + res.json(data); + } catch (error) { + logger.error(`Got error when retrieving log list, error: ${error}`); + throw error.code === 'NoPodLogsError' + ? error + : createError( + 'Internal Server Error', + 'UnknownError', + 'Failed to get log list', + ); + } +}); + // module exports module.exports = { list, @@ -299,4 +320,5 @@ module.exports = { addTag, deleteTag, getEvents, + getLogs, }; diff --git a/src/rest-server/src/models/v2/job/index.js b/src/rest-server/src/models/v2/job/index.js index 1e5e3db7e0..09e4d86151 100644 --- a/src/rest-server/src/models/v2/job/index.js +++ b/src/rest-server/src/models/v2/job/index.js @@ -37,4 +37,7 @@ if (config.env !== 'test') { } })(); } -module.exports = require('@pai/models/v2/job/k8s'); +module.exports = { + job: require('@pai/models/v2/job/k8s'), + log: require('@pai/models/v2/job/log'), +}; diff --git a/src/rest-server/src/models/v2/job/k8s.js b/src/rest-server/src/models/v2/job/k8s.js index 8e5e697129..67d64f0d24 100644 --- a/src/rest-server/src/models/v2/job/k8s.js +++ b/src/rest-server/src/models/v2/job/k8s.js @@ -76,7 +76,7 @@ const convertFrameworkSummary = (framework) => { }; }; -const convertTaskDetail = async (taskStatus, ports, logPathPrefix) => { +const convertTaskDetail = async (taskStatus, ports, frameworkName) => { // get containerPorts const containerPorts = getContainerPorts( ports, @@ -103,7 +103,7 @@ const convertTaskDetail = async (taskStatus, ports, logPathPrefix) => { containerNodeName: taskStatus.attemptStatus.podNodeName, containerPorts, containerGpus, - containerLog: `http://${taskStatus.attemptStatus.podHostIP}:${process.env.LOG_MANAGER_PORT}/log-manager/tail/${logPathPrefix}/${taskStatus.attemptStatus.podUID}/`, + containerLog: `/api/v2/jobs/${frameworkName}/pods/${taskStatus.attemptStatus.podUID}/logs`, containerExitCode: completionStatus ? completionStatus.code : null, containerExitSpec: completionStatus ? generateExitSpec(completionStatus.code) @@ -158,9 +158,6 @@ const convertFrameworkDetail = async ( const virtualCluster = frameworkWithLatestAttempt.metadata.labels ? frameworkWithLatestAttempt.metadata.labels.virtualCluster : 'unknown'; - const logPathInfix = frameworkWithLatestAttempt.metadata.annotations - ? frameworkWithLatestAttempt.metadata.annotations.logPathInfix - : null; const latestAttemptStatus = frameworkWithLatestAttempt.status.attemptStatus; const latestAttemptCompletionStatus = latestAttemptStatus.completionStatus; @@ -291,7 +288,7 @@ const convertFrameworkDetail = async ( await convertTaskDetail( status, ports[taskRoleStatus.name], - `${userName}/${logPathInfix || jobName}/${taskRoleStatus.name}`, + `${userName}~${jobName}`, ), ), ); diff --git a/src/rest-server/src/models/v2/job/log.js b/src/rest-server/src/models/v2/job/log.js new file mode 100644 index 0000000000..556bf0a1b2 --- /dev/null +++ b/src/rest-server/src/models/v2/job/log.js @@ -0,0 +1,101 @@ +// Copyright (c) Microsoft Corporation +// All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +// documentation files (the "Software"), to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +// to permit persons to whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +// BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +const axios = require('axios'); +const job = require('./k8s'); +const logger = require('@pai/config/logger'); +const createError = require('@pai/utils/error'); +const { encodeName } = require('@pai/models/v2/utils/name'); + +const LOG_MANAGER_PORT = process.env.LOG_MANAGER_PORT; +const WEBPORTAL_URL = process.env.WEBPORTAL_URL; + +const constrcutLogManagerPrefix = (nodeIp) => { + return `http://${nodeIp}:${LOG_MANAGER_PORT}/api/v1`; +}; + +const loginLogManager = async (nodeIp, username, password) => { + const prefix = constrcutLogManagerPrefix(nodeIp); + return axios.post(`${prefix}/tokens`, { + username: username, + password: password, + }); +}; + +const getLogListFromLogManager = async (frameworkName, podUid, tailMode) => { + const adminName = process.env.LOG_MANAGER_ADMIN_NAME; + const adminPassword = process.env.LOG_MANAGER_ADMIN_PASSWORD; + + const jobDetail = await job.get(frameworkName); + const noPodLogsErr = createError( + 'Not Found', + 'NoPodLogsError', + `Logs for pod ${podUid} is not found.`, + ); + let nodeIp; + let taskRoleName; + for (const [key, taskRole] of Object.entries(jobDetail.taskRoles)) { + const status = taskRole.taskStatuses.find( + (status) => status.containerId === podUid, + ); + if (!status) { + logger.error(`Failed to find pod which has pod uid ${podUid}`); + throw noPodLogsErr; + } + nodeIp = status.containerIp; + taskRoleName = key; + } + + let res = await loginLogManager(nodeIp, adminName, adminPassword); + const token = res.data.token; + + const prefix = constrcutLogManagerPrefix(nodeIp); + try { + const params = { + token: token, + username: jobDetail.jobStatus.username, + taskrole: taskRoleName, + }; + params['framework-name'] = encodeName(frameworkName); + params['pod-uid'] = podUid; + res = await axios.get(`${prefix}/logs`, { + params: params, + }); + } catch (err) { + if (err.response && err.response.status === 404) { + throw noPodLogsErr; + } + throw err; + } + const logList = res.data; + + const ret = { locations: [] }; + const urlPrefix = `${WEBPORTAL_URL}/log-manager/${nodeIp}:${LOG_MANAGER_PORT}`; + const urlSuffix = tailMode === 'true' ? '&tail-mode=true' : ''; + for (const key in logList) { + ret.locations.push({ + name: key, + uri: `${urlPrefix}${logList[key]}${urlSuffix}`, + }); + } + + return ret; +}; + +module.exports = { + getLogListFromLogManager, +}; diff --git a/src/rest-server/src/routes/v2/job.js b/src/rest-server/src/routes/v2/job.js index 9498449ad9..c1ee6cfbcf 100644 --- a/src/rest-server/src/routes/v2/job.js +++ b/src/rest-server/src/routes/v2/job.js @@ -97,5 +97,10 @@ router /** GET /api/v2/jobs/:frameworkName/events - Get events of a framework */ .get(token.check, controller.getEvents); +router + .route('/:frameworkName/pods/:podUid/logs') + /** GET /api/v2/jobs/:frameworkName/pods/:podUid/logs - Get logs of a pod */ + .get(token.check, controller.getLogs); + // module exports module.exports = router; diff --git a/src/rest-server/src/utils/error.d.ts b/src/rest-server/src/utils/error.d.ts index 01d5d5e997..f384dcc281 100644 --- a/src/rest-server/src/utils/error.d.ts +++ b/src/rest-server/src/utils/error.d.ts @@ -49,6 +49,7 @@ declare type Code = 'UnauthorizedUserError' | 'NoEnoughQuotaError' | 'NotImplementedError' | + 'NoPodLogsError' | 'UnknownError'; declare function createError(status: Status, code: Code, message: string): HttpError; diff --git a/tests/jenkins/test_rest_server_js_sdk.sh b/tests/jenkins/test_rest_server_js_sdk.sh index a97b9426f6..a4f3dffeb1 100644 --- a/tests/jenkins/test_rest_server_js_sdk.sh +++ b/tests/jenkins/test_rest_server_js_sdk.sh @@ -49,7 +49,7 @@ EOT cp ${WORKSPACE}/src/rest-server/docs/swagger.yaml . node tests/common/apiTestCaseGenerator.js -- "swagger.yaml" ".tests/apiTestCase.json" sudo npm install -g mocha - mocha tests/api_tests/**/*.spec.js -t 20000 + mocha tests/api_tests/**/*.spec.js -t 1800000 cd ../.. ;;