diff --git a/.changeset/afraid-maps-deliver.md b/.changeset/afraid-maps-deliver.md new file mode 100644 index 0000000000..1f5d527ce1 --- /dev/null +++ b/.changeset/afraid-maps-deliver.md @@ -0,0 +1,19 @@ +--- +"@scow/scheduler-adapter-protos": minor +"@scow/lib-scheduler-adapter": minor +"@scow/portal-server": minor +"@scow/test-adapter": minor +"@scow/protos": minor +"@scow/mis-server": minor +"@scow/portal-web": minor +"@scow/demo-vagrant": minor +"@scow/mis-web": minor +"@scow/config": minor +"@scow/auth": minor +"@scow/cli": minor +"@scow/lib-ssh": minor +"@scow/grpc-api": minor +"@scow/docs": minor +--- + +重构 scow,对接调度器适配器接口 diff --git a/.devcontainer/docker-compose.devcontainer.yml b/.devcontainer/docker-compose.devcontainer.yml index d8657fb781..27ae25448e 100644 --- a/.devcontainer/docker-compose.devcontainer.yml +++ b/.devcontainer/docker-compose.devcontainer.yml @@ -75,6 +75,13 @@ services: PHPLDAPADMIN_LDAP_HOSTS: ldap://scow-dev:389 PHPLDAPADMIN_HTTPS: false + test-adapter: + image: test-adapter + build: + context: .. 
+ dockerfile: dev/test-adapter/Dockerfile + network_mode: service:scow-dev + volumes: db: ldap: diff --git a/apps/auth/src/auth/ssh/index.ts b/apps/auth/src/auth/ssh/index.ts index fbc8d98c4f..8864931eab 100644 --- a/apps/auth/src/auth/ssh/index.ts +++ b/apps/auth/src/auth/ssh/index.ts @@ -31,7 +31,7 @@ function checkLoginNode(sshConfig: SshConfigSchema) { } const clusterConfig = Object.values(clusters)[0]; - loginNode = getLoginNode(clusterConfig.slurm.loginNodes[0]).address; + loginNode = getLoginNode(clusterConfig.loginNodes[0]).address; if (!loginNode) { throw new Error(`Cluster ${clusterConfig.displayName} has no login node.`); diff --git a/apps/cli/assets/config/clusters/hpc01.yaml b/apps/cli/assets/config/clusters/hpc01.yaml index eba1ec8fa6..2f75d1ba72 100644 --- a/apps/cli/assets/config/clusters/hpc01.yaml +++ b/apps/cli/assets/config/clusters/hpc01.yaml @@ -2,79 +2,17 @@ displayName: hpc01Name # 指定slurm配置 -slurm: - loginNodes: - # 登录节点展示名称 - - name: login01 - # 登录节点的IP或者域名 - # 如果设置的是域名,请确认此节点的/etc/hosts中包含了域名到IP的解析信息 - address: login01 - - name: login02 - address: login02 - - # 集群的分区信息 - partitions: - # 分区1的名字 - - name: compute - # 分区内节点数 - nodes: 28 - # 单节点内存数量,单位M - mem: 7500 - # 核心数 - cores: 2 - # GPU卡数 - gpus: 0 - # QOS - qos: - - low - - normal - - high - # 这个分区的备注信息 - comment: "" - - - name: GPU - nodes: 1 - mem: 262144 - cores: 48 - gpus: 8 - qos: - - low - - normal - - high - comment: "" - - # 如果这个集群要部署管理系统,请增加以下配置 - # 如果不需要,将整个mis块注释掉 - mis: - # 部署slurm.sh的机器的地址 - managerUrl: haha - # slurm.sh在机器中的绝对地址 - scriptPath: /test/slurm.sh - - # 部署slurm.sh的机器通过什么地址访问slurm的数据库 - # 不填写为下面的默认值 - # dbHost: localhost - - # 部署slurm.sh的机器通过什么端口访问slurm的数据库 - # 不填写为下面的默认值 - # dbPort: 3306 - - # slurm数据库的用户名 - # 参考slurmdbd.conf的StorageUser配置 - # 不填写为下面的默认值 - # dbUser: root - - # slurmdbd的数据库用户的密码 - # 参考slurmdbd.conf的StoragePass配置 - dbPassword: password - - # slurm accounting数据库的数据库名 - # 参考slurmdbd.conf的StorageLoc配置 - # 不填写为下面的默认值 - # slurmAcctDbName: 
"slurm_acct_db" - - # 这个集群在slurm中的集群名字 - clusterName: pkuhpc +loginNodes: + # 登录节点展示名称 + - name: login01 + # 登录节点的IP或者域名 + # 如果设置的是域名,请确认此节点的/etc/hosts中包含了域名到IP的解析信息 + address: login01 + - name: login02 + address: login02 + +# 适配器地址(ip地址:端口号) +adapterUrl: localhost:8972 # 门户系统代理网关节点配置 # proxyGateway: diff --git a/apps/cli/assets/config/mis.yaml b/apps/cli/assets/config/mis.yaml index 02928872e6..1255397e56 100644 --- a/apps/cli/assets/config/mis.yaml +++ b/apps/cli/assets/config/mis.yaml @@ -8,18 +8,6 @@ db: # 获取作业相关配置 fetchJobs: - # 源作业信息数据库的数据库信息 - db: - host: sourcedb - port: 3307 - user: root - password: jobtablepassword - dbName: jobs - tableName: jobs - # 数据库类型,可选mariadb或者mysql - # 默认为mariadb - # type: mariadb - # 周期性获取数据 periodicFetch: # 是否开启 diff --git a/apps/mis-server/config/clusters/hpc00.yml b/apps/mis-server/config/clusters/hpc00.yml index d089de2e41..5cf55655fa 100644 --- a/apps/mis-server/config/clusters/hpc00.yml +++ b/apps/mis-server/config/clusters/hpc00.yml @@ -1,39 +1,32 @@ displayName: hpc00 -misIgnore: true -slurm: - mis: - managerUrl: localhost:22222 - dbPassword: password - clusterName: pkuhpc - scriptPath: /slurmshTest/slurm.sh +adapterUrl: 0.0.0.0:6000 +loginNodes: + - name: login + address: localhost:22222 - loginNodes: - - name: login - address: localhost:22222 - - partitions: - - name: C032M0128G - mem: 131072 - cores: 32 - nodes: 32 - gpus: 0 - qos: - - low - - normal - - high - - cryoem - - name: GPU - mem: 262144 - cores: 28 - nodes: 32 - gpus: 4 - qos: - - low - - normal - - high - - cryoem - - name: life - mem: 262144 - cores: 28 - gpus: 4 - nodes: 32 +# partitions: +# - name: C032M0128G +# mem: 131072 +# cores: 32 +# nodes: 32 +# gpus: 0 +# qos: +# - low +# - normal +# - high +# - cryoem +# - name: GPU +# mem: 262144 +# cores: 28 +# nodes: 32 +# gpus: 4 +# qos: +# - low +# - normal +# - high +# - cryoem +# - name: life +# mem: 262144 +# cores: 28 +# gpus: 4 +# nodes: 32 diff --git a/apps/mis-server/config/clusters/hpc01.yml 
b/apps/mis-server/config/clusters/hpc01.yml index aa62fb38fe..7576c03d63 100644 --- a/apps/mis-server/config/clusters/hpc01.yml +++ b/apps/mis-server/config/clusters/hpc01.yml @@ -1,30 +1,33 @@ displayName: hpc01 -misIgnore: true -slurm: - mis: - managerUrl: localhost:22222 - dbPassword: password - clusterName: pkuhpc1 - scriptPath: /slurmshTest/slurm.sh - loginNodes: - - name: login - address: localhost:22222 - partitions: - - name: compute - nodes: 198 - mem: 63000 - cores: 28 - gpus: 0 - qos: - - low - - normal - - high - - name: gpu - nodes: 1 - mem: 386000 - cores: 48 - gpus: 8 - qos: - - low - - normal - - high +adapterUrl: 0.0.0.0:6000 +loginNodes: + - name: login + address: localhost:22222 +# misIgnore: true +# slurm: +# mis: +# managerUrl: localhost:22222 +# dbPassword: password +# clusterName: pkuhpc1 +# scriptPath: /slurmshTest/slurm.sh +# loginNodes: +# - localhost:22222 +# partitions: +# - name: compute +# nodes: 198 +# mem: 63000 +# cores: 28 +# gpus: 0 +# qos: +# - low +# - normal +# - high +# - name: gpu +# nodes: 1 +# mem: 386000 +# cores: 48 +# gpus: 8 +# qos: +# - low +# - normal +# - high diff --git a/apps/mis-server/config/clusters/hpc02.yml b/apps/mis-server/config/clusters/hpc02.yml index 184de9db8b..651049f49e 100644 --- a/apps/mis-server/config/clusters/hpc02.yml +++ b/apps/mis-server/config/clusters/hpc02.yml @@ -1,24 +1,27 @@ displayName: hpc01 -slurm: - loginNodes: - - name: login - address: localhost:22222 - partitions: - - name: compute - nodes: 198 - mem: 63000 - cores: 28 - gpus: 0 - qos: - - low - - normal - - high - - name: gpu - nodes: 1 - mem: 386000 - cores: 48 - gpus: 8 - qos: - - low - - normal - - high +adapterUrl: 0.0.0.0:6000 +loginNodes: + - name: login + address: localhost:22222 +# slurm: +# loginNodes: +# - localhost:22222 +# partitions: +# - name: compute +# nodes: 198 +# mem: 63000 +# cores: 28 +# gpus: 0 +# qos: +# - low +# - normal +# - high +# - name: gpu +# nodes: 1 +# mem: 386000 +# cores: 48 +# gpus: 8 +# qos: +# 
- low +# - normal +# - high diff --git a/apps/mis-server/config/mis.yaml b/apps/mis-server/config/mis.yaml index 4f64116dc6..447e40e0af 100644 --- a/apps/mis-server/config/mis.yaml +++ b/apps/mis-server/config/mis.yaml @@ -5,12 +5,4 @@ db: password: mysqlrootpassword dbName: scow_server_${JEST_WORKER_ID} -fetchJobs: - db: - host: 127.0.0.1 - port: 3307 - user: root - password: jobtablepassword - dbName: jobs - tableName: jobs diff --git a/apps/mis-server/package.json b/apps/mis-server/package.json index a4e9398c76..7e63da6cea 100644 --- a/apps/mis-server/package.json +++ b/apps/mis-server/package.json @@ -24,6 +24,7 @@ "dependencies": { "@ddadaal/tsgrpc-common": "0.2.3", "@ddadaal/tsgrpc-server": "0.19.2", + "@ddadaal/tsgrpc-client": "0.17.5", "@grpc/grpc-js": "1.8.15", "@mikro-orm/cli": "5.7.12", "@mikro-orm/core": "5.7.12", @@ -40,6 +41,8 @@ "@scow/lib-slurm": "workspace:*", "@scow/lib-ssh": "workspace:*", "@scow/protos": "workspace:*", + "@scow/scheduler-adapter-protos": "workspace:*", + "@scow/lib-scheduler-adapter": "workspace:*", "@scow/utils": "workspace:*", "@sinclair/typebox": "0.28.15", "dotenv": "16.3.1", @@ -50,7 +53,6 @@ "wait-on": "7.0.1" }, "devDependencies": { - "@ddadaal/tsgrpc-client": "0.17.5", "@types/google-protobuf": "3.15.6", "@types/node-cron": "3.0.7", "@types/wait-on": "5.3.1" diff --git a/apps/mis-server/src/app.ts b/apps/mis-server/src/app.ts index 34f6cb0940..8e8494641b 100644 --- a/apps/mis-server/src/app.ts +++ b/apps/mis-server/src/app.ts @@ -19,6 +19,7 @@ import { plugins } from "src/plugins"; import { accountServiceServer } from "src/services/account"; import { adminServiceServer } from "src/services/admin"; import { chargingServiceServer } from "src/services/charging"; +import { configServiceServer } from "src/services/config"; import { initServiceServer } from "src/services/init"; import { jobServiceServer } from "src/services/job"; import { jobChargeLimitServer } from "src/services/jobChargeLimit"; @@ -49,6 +50,7 @@ export async 
function createServer() { await server.register(jobServiceServer); await server.register(chargingServiceServer); await server.register(tenantServiceServer); + await server.register(configServiceServer); const em = server.ext.orm.em.fork(); await updateBlockStatusInSlurm(em, server.ext.clusters, server.logger); diff --git a/apps/mis-server/src/bl/PriceMap.ts b/apps/mis-server/src/bl/PriceMap.ts index 7a965cd2fb..4f6faaea7d 100644 --- a/apps/mis-server/src/bl/PriceMap.ts +++ b/apps/mis-server/src/bl/PriceMap.ts @@ -10,15 +10,19 @@ * See the Mulan PSL v2 for more details. */ +import { asyncClientCall } from "@ddadaal/tsgrpc-client"; import { Logger } from "@ddadaal/tsgrpc-server"; import { MySqlDriver, SqlEntityManager } from "@mikro-orm/mysql"; +import { Partition } from "@scow/scheduler-adapter-protos/build/protos/config"; import { calculateJobPrice } from "src/bl/jobPrice"; import { clusters } from "src/config/clusters"; import { JobPriceInfo } from "src/entities/JobInfo"; import { JobPriceItem } from "src/entities/JobPriceItem"; +import { ClusterPlugin } from "src/plugins/clusters"; export interface JobInfo { - biJobIndex: number; + // cluster job id + jobId: number; // scow cluster id cluster: string; partition: string; @@ -43,7 +47,11 @@ export interface PriceMap { } -export async function createPriceMap(em: SqlEntityManager, logger: Logger): Promise { +export async function createPriceMap( + em: SqlEntityManager, + clusterPlugin: ClusterPlugin["clusters"], + logger: Logger, +): Promise { // get all billing items // order by ASC so that items added later overrides items added before. 
const billingItems = await em.find(JobPriceItem, {}, { @@ -77,16 +85,46 @@ export async function createPriceMap(em: SqlEntityManager, logger: return price; }; + // partitions info for all clusters + const partitionsForClusters: Record = {}; + if (!process.env.SCOW_CONFIG_PATH && process.env.NODE_ENV !== "production") { + // data for test + partitionsForClusters["hpc00"] = [ + { name: "C032M0128G", memMb: 131072, cores: 32, nodes: 32, gpus: 0, qos: ["low", "normal", "high", "cryoem"]}, + { name: "GPU", memMb: 262144, cores: 28, nodes: 32, gpus: 4, qos: ["low", "normal", "high", "cryoem"]}, + { name: "life", memMb: 262144, cores: 28, nodes: 32, gpus: 4, qos: []}, + ]; + partitionsForClusters["hpc01"] = [ + { name: "compute", nodes: 198, memMb: 63000, cores: 28, gpus: 0, qos: ["low", "normal", "high"]}, + { name: "gpu", nodes: 1, memMb: 386000, cores: 48, gpus: 8, qos: ["low", "normal", "high"]}, + ]; + partitionsForClusters["hpc02"] = [ + { name: "compute", nodes: 198, memMb: 63000, cores: 28, gpus: 0, qos: ["low", "normal", "high"]}, + { name: "gpu", nodes: 1, memMb: 386000, cores: 48, gpus: 8, qos: ["low", "normal", "high"]}, + ]; + + } else { + const reply = await clusterPlugin.callOnAll( + logger, + async (client) => await asyncClientCall(client.config, "getClusterConfig", {}), + ); + reply.forEach((x) => { + if (x.success) { + partitionsForClusters[x.cluster] = x.result.partitions; + } + }); + } + return { - calculatePrice: (info) => calculateJobPrice(info, getPriceItem, logger), + calculatePrice: (info) => calculateJobPrice(partitionsForClusters, info, getPriceItem, logger), getMissingDefaultPriceItems: () => { const missingPaths = [] as string[]; for (const cluster in clusters) { - for (const partition of clusters[cluster].slurm.partitions) { + for (const partition of partitionsForClusters[cluster]) { const path = [cluster, partition.name]; const { qos } = partition; diff --git a/apps/mis-server/src/bl/block.ts b/apps/mis-server/src/bl/block.ts index 
faf4b5539f..38608cabe8 100644 --- a/apps/mis-server/src/bl/block.ts +++ b/apps/mis-server/src/bl/block.ts @@ -10,6 +10,7 @@ * See the Mulan PSL v2 for more details. */ +import { asyncClientCall } from "@ddadaal/tsgrpc-client"; import { Logger } from "@ddadaal/tsgrpc-server"; import { Loaded } from "@mikro-orm/core"; import { MySqlDriver, SqlEntityManager } from "@mikro-orm/mysql"; @@ -34,23 +35,23 @@ export async function updateBlockStatusInSlurm( if (account.whitelist) { continue; } - await clusterPlugin.callOnAll(logger, async (ops) => await ops.account.blockAccount({ - request: { accountName: account.accountName }, - logger, - })); + await clusterPlugin.callOnAll(logger, async (client) => + await asyncClientCall(client.account, "blockAccount", { + accountName: account.accountName, + }), + ); } const userAccounts = await em.find(UserAccount, { status: UserStatus.BLOCKED, }, { populate: ["user", "account"]}); for (const ua of userAccounts) { - await clusterPlugin.callOnAll(logger, async (ops) => ops.user.blockUserInAccount({ - request: { + await clusterPlugin.callOnAll(logger, async (client) => + await asyncClientCall(client.user, "blockUserInAccount", { accountName: ua.account.getProperty("accountName"), userId: ua.user.getProperty("userId"), - }, - logger, - })); + }), + ); } const updateBlockTime = await em.upsert(SystemState, { key: SystemState.KEYS.UPDATE_SLURM_BLOCK_STATUS, @@ -86,14 +87,10 @@ export async function blockAccount( return "Whitelisted"; } - await clusterPlugin.callOnAll(logger, async (ops) => { - const resp = await ops.account.blockAccount({ - request: { accountName: account.accountName }, - logger, + await clusterPlugin.callOnAll(logger, async (client) => { + await asyncClientCall(client.account, "blockAccount", { + accountName: account.accountName, }); - if (resp.code === "NOT_FOUND") { - throw new Error(`Account ${account.accountName} not found`); - } }); account.blocked = true; @@ -116,15 +113,10 @@ export async function unblockAccount( if 
(!account.blocked) { return "ALREADY_UNBLOCKED"; } - await clusterPlugin.callOnAll(logger, async (ops) => { - const resp = await ops.account.unblockAccount({ - request: { accountName: account.accountName }, - logger, + await clusterPlugin.callOnAll(logger, async (client) => { + await asyncClientCall(client.account, "unblockAccount", { + accountName: account.accountName, }); - - if (resp.code === "NOT_FOUND") { - throw new Error(`Account ${account.accountName} not found`); - } }); account.blocked = false; @@ -148,13 +140,12 @@ export async function blockUserInAccount( const accountName = ua.account.$.accountName; const userId = ua.user.$.userId; - await clusterPlugin.clusters.callOnAll(logger, async (ops) => ops.user.blockUserInAccount({ - request: { + await clusterPlugin.clusters.callOnAll(logger, async (client) => + await asyncClientCall(client.user, "blockUserInAccount", { accountName, userId, - }, - logger, - })); + }), + ); ua.status = UserStatus.BLOCKED; @@ -178,10 +169,12 @@ export async function unblockUserInAccount( const accountName = ua.account.getProperty("accountName"); const userId = ua.user.getProperty("userId"); - await clusterPlugin.clusters.callOnAll(logger, async (ops) => ops.user.unblockUserInAccount({ - request: { accountName, userId }, - logger, - })); + await clusterPlugin.clusters.callOnAll(logger, async (client) => + await asyncClientCall(client.user, "unblockUserInAccount", { + accountName, + userId, + }), + ); ua.status = UserStatus.UNBLOCKED; diff --git a/apps/mis-server/src/bl/importUsers.ts b/apps/mis-server/src/bl/importUsers.ts index 7ebaad02b9..b1ec168882 100644 --- a/apps/mis-server/src/bl/importUsers.ts +++ b/apps/mis-server/src/bl/importUsers.ts @@ -27,7 +27,7 @@ import { toRef } from "src/utils/orm"; export interface ImportUsersData { accounts: { accountName: string; - users: {userId: string; userName: string; state: string}[]; + users: {userId: string; userName: string; blocked: boolean}[]; owner: string; blocked: boolean; }[]; 
@@ -88,14 +88,13 @@ export async function importUsers(data: ImportUsersData, em: SqlEntityManager, accounts.push(account); a.users.forEach((u) => { - const state = u.state; const user = usersMap[u.userId]; userAccounts.push(new UserAccount({ account, user, role: a.owner === u.userId ? UserRole.OWNER : UserRole.USER, - status: state === "allowed!" ? UserStatus.UNBLOCKED : UserStatus.BLOCKED, + status: u.blocked ? UserStatus.BLOCKED : UserStatus.UNBLOCKED, })); }); }); diff --git a/apps/mis-server/src/bl/jobPrice.ts b/apps/mis-server/src/bl/jobPrice.ts index ad25b967b3..0eb1af9df6 100644 --- a/apps/mis-server/src/bl/jobPrice.ts +++ b/apps/mis-server/src/bl/jobPrice.ts @@ -11,14 +11,15 @@ */ import { Logger } from "@ddadaal/tsgrpc-server"; -import { ClusterConfigSchema } from "@scow/config/build/cluster"; +// import { ClusterConfigSchema } from "@scow/config/build/cluster"; import { Decimal } from "@scow/lib-decimal"; +import { Partition } from "@scow/scheduler-adapter-protos/build/protos/config"; import { JobInfo, PriceMap } from "src/bl/PriceMap"; import { clusters } from "src/config/clusters"; import { JobPriceInfo } from "src/entities/JobInfo"; import { AmountStrategy, JobPriceItem } from "src/entities/JobPriceItem"; -type Partition = ClusterConfigSchema["slurm"]["partitions"][number]; +// type Partition = ClusterConfigSchema["slurm"]["partitions"][number]; type AmountStrategyFunc = (info: JobInfo, partition: Partition) => Decimal; @@ -37,14 +38,14 @@ const amountStrategyFuncs: Record = { }, [AmountStrategy.MAX_CPUSALLOC_MEM]: (info, partition) => { - const { mem, cores } = partition; + const { memMb, cores } = partition; return Decimal.max( // 核心数 info.cpusAlloc, // 申请内存总数/(分区内容/分区核心数) new Decimal(info.memReq).div( - new Decimal(mem).div(cores), + new Decimal(memMb).div(cores), ).integerValue(Decimal.ROUND_CEIL), ); }, @@ -53,10 +54,11 @@ const amountStrategyFuncs: Record = { export function calculateJobPrice( + partitionsForClusters: Record, info: JobInfo, 
getPriceItem: PriceMap["getPriceItem"], logger: Logger): JobPriceInfo { - logger.trace(`Calculating price for job ${info.biJobIndex}`); + logger.trace(`Calculating price for job ${info.jobId} in cluster ${info.cluster}`); const clusterInfo = clusters[info.cluster]; @@ -65,7 +67,7 @@ export function calculateJobPrice( return emptyJobPriceInfo(); } - const partitionInfo = clusterInfo.slurm.partitions.find((x) => x.name === info.partition); + const partitionInfo = partitionsForClusters[info.cluster].find((x) => x.name === info.partition); if (!partitionInfo) { logger.warn(`Unknown partition ${info.partition} of cluster ${info.cluster}`); return emptyJobPriceInfo(); diff --git a/apps/mis-server/src/clusterops/api/account.ts b/apps/mis-server/src/clusterops/api/account.ts deleted file mode 100644 index 2e89a2c0c2..0000000000 --- a/apps/mis-server/src/clusterops/api/account.ts +++ /dev/null @@ -1,65 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. - * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. 
- */ - -import { ClusterAccountInfo } from "@scow/protos/build/server/admin"; -import { Request } from "src/clusterops/api"; - -export interface CreateAccountRequest { - accountName: string; - ownerId: string; -} - -export type CreateAccountReply = - | { code: "ALREADY_EXISTS"} - | { code: "OK" }; - -export interface DeleteAccountRequest { - accountName: string; -} - -export type DeleteAccountReply = - | { code: "NOT_FOUND"} - | { code: "OK" }; - - -export interface BlockAccountRequest { - accountName: string; -} - -/** NOT_FOUND: account is not found. */ -export type BlockAccountReply = { - code: "OK" | "NOT_FOUND" | "ALREADY_BLOCKED"; -}; - -export interface UnblockAccountRequest { - accountName: string; -} - -/** NOT_FOUND: account is not found. */ -export type UnblockAccountReply = { - code: "OK" | "NOT_FOUND" | "ALREADY_UNBLOCKED" -}; - -export interface GetAllAccountsWithUsersRequest {} - -export interface GetAllAccountsWithUsersReply { - accounts: ClusterAccountInfo[] -} - -export interface AccountOps { - deleteAccount(req: Request): Promise; - createAccount(req: Request): Promise; - blockAccount(req: Request): Promise; - unblockAccount(req: Request): Promise; - getAllAccountsWithUsers(req: Request): Promise; -} - diff --git a/apps/mis-server/src/clusterops/api/job.ts b/apps/mis-server/src/clusterops/api/job.ts deleted file mode 100644 index e478aee9df..0000000000 --- a/apps/mis-server/src/clusterops/api/job.ts +++ /dev/null @@ -1,54 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. 
- * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. - */ - -import { RunningJob } from "@scow/protos/build/common/job"; -import { Request } from "src/clusterops/api"; - -export interface GetRunningJobsRequest { - userId?: string | undefined; - accountNames: string[]; - jobIdList: string[]; -} - -export interface GetRunningJobsReply { - jobs: RunningJob[]; -} - -export interface ChangeJobTimeLimitRequest { - jobId: string; - /** 单位:分钟 */ - delta: number; -} - -/** NOT_FOUND: if job_id is not found. */ -export type ChangeJobTimeLimitReply = - | { code: "NOT_FOUND"} - | { code: "OK" }; - -export interface QueryJobTimeLimitRequest { - jobId: string; -} - -/** NOT_FOUND: if job_id is not found */ -export type QueryJobTimeLimitReply = - | { code: "NOT_FOUND"} - | { - code: "OK", - // 单位秒 - limit: number; -}; - -export interface JobOps { - getRunningJobs(req: Request): Promise; - changeJobTimeLimit(req: Request): Promise; - queryJobTimeLimit(req: Request): Promise; -} diff --git a/apps/mis-server/src/clusterops/api/storage.ts b/apps/mis-server/src/clusterops/api/storage.ts deleted file mode 100644 index 8a79124d61..0000000000 --- a/apps/mis-server/src/clusterops/api/storage.ts +++ /dev/null @@ -1,43 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. 
- * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. - */ - -import { Request } from "src/clusterops/api"; - -export enum ChangeStorageQuotaMode { - INCREASE = 0, - DECREASE = 1, - SET = 2, -} - -export interface ChangeStorageQuotaRequest { - userId: string; - mode: ChangeStorageQuotaMode; - value: number; -} - -export type ChangeStorageQuotaReply = - | { code: "NOT_FOUND"} // the user is not found - | { code: "INVALID_VALUE" } // the value is not valid - | { code: "OK", currentQuota: number; } - -export interface QueryUsedStorageQuotaRequest { - userId: string; -} - -export type QueryUsedStorageQuotaReply = - | { code: "NOT_FOUND"} // the user is not found - | { code: "OK", used: number }; // unit: byte - -export interface StorageOps { - changeStorageQuota(req: Request): Promise; - queryUsedStorageQuota(req: Request): Promise; -} diff --git a/apps/mis-server/src/clusterops/api/user.ts b/apps/mis-server/src/clusterops/api/user.ts deleted file mode 100644 index 85c3f03f3a..0000000000 --- a/apps/mis-server/src/clusterops/api/user.ts +++ /dev/null @@ -1,56 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. - * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. 
- */ - -import { Request } from "src/clusterops/api"; - -export interface BlockUserInAccountRequest { - userId: string; - accountName: string; -} - -/** NOT_FOUND: user is not found. */ -export interface BlockUserInAccountReply {} - -export interface UnblockUserInAccountRequest { - userId: string; - accountName: string; -} - -/** NOT_FOUND: user is not found. */ -export interface UnblockUserInAccountReply {} - -export interface RemoveUserRequest { - userId: string; - accountName: string; -} - -/** NOT_FOUND: user is not found. */ -export interface RemoveUserReply { - code: "OK" | "NOT_FOUND"; -} - -export interface AddUserToAccountRequest { - userId: string; - accountName: string; -} - -/** ALREADY_EXISTS: User already exists. */ -export interface AddUserToAccountReply { - code: "OK" | "ALREADY_EXISTS" -} - -export interface UserOps { - addUserToAccount(req: Request): Promise; - removeUser(req: Request): Promise; - blockUserInAccount(req: Request): Promise; - unblockUserInAccount(req: Request): Promise; -} diff --git a/apps/mis-server/src/clusterops/slurm/account.ts b/apps/mis-server/src/clusterops/slurm/account.ts deleted file mode 100644 index c24c492d44..0000000000 --- a/apps/mis-server/src/clusterops/slurm/account.ts +++ /dev/null @@ -1,71 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. - * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. 
- */ - -import { AccountOps } from "src/clusterops/api/account"; -import { SlurmClusterInfo } from "src/clusterops/slurm"; -import { parseBlockStatus, parseClusterAccounts } from "src/clusterops/slurm/utils/parse"; -import { handleSimpleResponse, throwIfNotReturn0 } from "src/clusterops/slurm/utils/slurm"; - -export const slurmAccountOps = ({ executeSlurmScript }: SlurmClusterInfo): AccountOps => { - - return { - createAccount: async ({ request, logger }) => { - const { accountName, ownerId } = request; - const result = await executeSlurmScript(["-c", accountName, "0", ownerId ], logger); - - return handleSimpleResponse(result, { 6: "ALREADY_EXISTS" }); - }, - - deleteAccount: async ({ request, logger }) => { - const { accountName } = request; - const result = await executeSlurmScript(["-a", accountName], logger); - - return handleSimpleResponse(result, { 7: "NOT_FOUND" }); - - }, - - blockAccount: async ({ request, logger }) => { - const { accountName } = request; - - const result = await executeSlurmScript(["-b", accountName], logger); - - return handleSimpleResponse(result, { 8: "ALREADY_BLOCKED", 7: "NOT_FOUND" }); - }, - - unblockAccount: async ({ request, logger }) => { - const { accountName } = request; - - const result = await executeSlurmScript(["-d", accountName], logger); - - return handleSimpleResponse(result, { 9: "ALREADY_UNBLOCKED", 7: "NOT_FOUND" }); - }, - - getAllAccountsWithUsers: async ({ logger }) => { - const result = await executeSlurmScript(["-l", "all"], logger); - - throwIfNotReturn0(result); - - const accounts = parseClusterAccounts(result.stdout); - - const blockStatusReply = await executeSlurmScript(["-m", accounts.map((x) => x.accountName).join(",")], logger); - const accountsBlockStatus = parseBlockStatus(blockStatusReply.stdout); - - for (const account of accounts) { - const status = accountsBlockStatus[account.accountName]; - account.blocked = status === undefined ? 
true : status; - } - - return { accounts }; - }, - - }; -}; diff --git a/apps/mis-server/src/clusterops/slurm/index.ts b/apps/mis-server/src/clusterops/slurm/index.ts deleted file mode 100644 index 52e9ffcbcc..0000000000 --- a/apps/mis-server/src/clusterops/slurm/index.ts +++ /dev/null @@ -1,74 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. - * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. - */ - -import { Logger } from "@ddadaal/tsgrpc-server"; -import { SlurmMisConfigSchema } from "@scow/config/build/mis"; -import { sftpChmod, sshConnect } from "@scow/lib-ssh"; -import { ClusterOps } from "src/clusterops/api"; -import { slurmAccountOps } from "src/clusterops/slurm/account"; -import { slurmJobOps } from "src/clusterops/slurm/job"; -import { slurmStorageOps } from "src/clusterops/slurm/storage"; -import { slurmUserOps } from "src/clusterops/slurm/user"; -import { executeSlurmScript } from "src/clusterops/slurm/utils/slurm"; -import { clusters } from "src/config/clusters"; -import { rootKeyPair } from "src/config/env"; - -export interface SlurmClusterInfo { - slurmConfig: SlurmMisConfigSchema; - partitions: string[]; - - executeSlurmScript: (params: string[], logger: Logger) => ReturnType; -} - -export const createSlurmOps = (cluster: string, logger: Logger): ClusterOps | undefined => { - - const slurmConfig = clusters[cluster].slurm; - - if (!slurmConfig) { - throw new Error(`the slurm property of cluster ${cluster} in clusters/${cluster}.yaml is not set.`); - } - - const 
slurmMisConfig = slurmConfig.mis; - - if (!slurmMisConfig) { - logger.warn("the slurm.mis property of cluster %s is not set. Ignore the cluster.", cluster); - return undefined; - } - - const partitionNames = slurmConfig.partitions.map((x) => x.name); - - const clusterInfo: SlurmClusterInfo = { - partitions: partitionNames, - slurmConfig: slurmMisConfig, - executeSlurmScript: (params, logger) => executeSlurmScript(slurmMisConfig, partitionNames, params, logger), - }; - - return { - account: slurmAccountOps(clusterInfo), - storage: slurmStorageOps(clusterInfo), - job: slurmJobOps(clusterInfo), - user: slurmUserOps(clusterInfo), - onStartup: async () => { - return await sshConnect(slurmMisConfig.managerUrl, "root", rootKeyPair, logger, async (ssh) => { - logger.info("start to copy slurm.sh"); - // 请求sftp对象 - const sftp = await ssh.requestSFTP(); - // 将slurm.sh复制入指定路径 - await ssh.putFile("scripts/slurm.sh", slurmMisConfig.scriptPath); - // 修改文件权限 - await sftpChmod(sftp)(slurmMisConfig.scriptPath, "555"); - logger.info("copy slurm.sh sucessfully"); - }); - }, - }; - -}; diff --git a/apps/mis-server/src/clusterops/slurm/job.ts b/apps/mis-server/src/clusterops/slurm/job.ts deleted file mode 100644 index 290a3bd984..0000000000 --- a/apps/mis-server/src/clusterops/slurm/job.ts +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. - * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. 
- */ - -import { getRunningJobs } from "@scow/lib-slurm"; -import { sshConnect } from "@scow/lib-ssh"; -import { JobOps } from "src/clusterops/api/job"; -import { SlurmClusterInfo } from "src/clusterops/slurm"; -import { handleSimpleResponse, throwIfNotReturn0 } from "src/clusterops/slurm/utils/slurm"; -import { rootKeyPair } from "src/config/env"; - -export const slurmJobOps = ({ slurmConfig, executeSlurmScript }: SlurmClusterInfo): JobOps => { - - return { - getRunningJobs: async ({ request, logger }) => { - const { userId, accountNames, jobIdList } = request; - - const jobs = await sshConnect(slurmConfig.managerUrl, "root", rootKeyPair, logger, async (ssh) => { - return await getRunningJobs(ssh, "root", { userId, accountNames, jobIdList }, logger); - }); - - return { jobs }; - }, - - queryJobTimeLimit: async ({ request, logger }) => { - const { jobId } = request; - const result = await executeSlurmScript(["-t", jobId], logger); - - if (result.code === 7) { - return { code: "NOT_FOUND" }; - } - - throwIfNotReturn0(result); - - // format is [d-]hh:mm:ss, 5-00:00:00 or 00:03:00 - // convert to second - - const results = result.stdout.trim().split(/-|:/).map((x) => +x); - - const [d, h, m, s] = results[3] === undefined - ? 
[0, ...results] - : results; - - return { code: "OK", limit: s + m * 60 + h * 60 * 60 + d * 60 * 60 * 24 }; - }, - - changeJobTimeLimit: async ({ request, logger }) => { - const { delta, jobId } = request; - - const result = await executeSlurmScript(["-n", jobId, delta + ""], logger); - - return handleSimpleResponse(result, { 7: "NOT_FOUND" }); - }, - }; -}; diff --git a/apps/mis-server/src/clusterops/slurm/storage.ts b/apps/mis-server/src/clusterops/slurm/storage.ts deleted file mode 100644 index 7f9ac1ff07..0000000000 --- a/apps/mis-server/src/clusterops/slurm/storage.ts +++ /dev/null @@ -1,97 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. - * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. 
- */ - -import { ChangeStorageQuotaMode, StorageOps } from "src/clusterops/api/storage"; -import { SlurmClusterInfo } from "src/clusterops/slurm"; -import { throwIfNotReturn0 } from "src/clusterops/slurm/utils/slurm"; - -export const slurmStorageOps = ({ executeSlurmScript }: SlurmClusterInfo): StorageOps => { - return { - queryUsedStorageQuota: async ({ request, logger }) => { - const { userId } = request; - const result = await executeSlurmScript(["-y", userId], logger); - - if (result.code === 2) { - return { code: "NOT_FOUND" }; - } - - throwIfNotReturn0(result); - - /** - * format is - * - * used: 512K - * quota: 2T - * - * used显示为整数位大于等于1的最大单位,比如如果是1025K的话,会显示1.001M - */ - - const usedLine = result.stdout.split("\n") - .find((x) => x.startsWith("used:")); - - function throwError(): never { - logger.error(`Unexpected -y output. stdout: ${result.stdout}`); - - throw new Error("Unexpected cmdline output"); - } - - if (usedLine) { - const val = usedLine.substring("used: ".length); - - // parseFloat parses starting num, ignoring what's following a now - const numVal = parseFloat(val); - if (isNaN(numVal)) { - throwError(); - } - - // parse unit to bytes - const units = { - "B": 0, - "K": 1, - "M": 2, - "G": 3, - "T": 4, - "P": 5, - }; - - const unit = val[val.length - 1]; - - const pow = units[unit] ?? 
0; - - const bytesVal = numVal * Math.pow(1024, pow); - - return { code: "OK", used: bytesVal }; - } - - throwError(); - }, - changeStorageQuota: async ({ request, logger }) => { - const { userId, mode, value } = request; - const command = { - [ChangeStorageQuotaMode.DECREASE]: "-w", - [ChangeStorageQuotaMode.INCREASE]: "-z", - [ChangeStorageQuotaMode.SET]: "-x", - }; - - const result = await executeSlurmScript([command[mode], userId, value + ""], logger); - - if (result.code === 4) { - return { code: "NOT_FOUND" }; - } - - throwIfNotReturn0(result); - - // TODO handle output format - return { code: "OK", currentQuota: 10 }; - }, - }; -}; diff --git a/apps/mis-server/src/clusterops/slurm/user.ts b/apps/mis-server/src/clusterops/slurm/user.ts deleted file mode 100644 index 2c5d218010..0000000000 --- a/apps/mis-server/src/clusterops/slurm/user.ts +++ /dev/null @@ -1,48 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. - * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. 
- */ - -import { UserOps } from "src/clusterops/api/user"; -import { SlurmClusterInfo } from "src/clusterops/slurm"; -import { handleSimpleResponse } from "src/clusterops/slurm/utils/slurm"; - -export const slurmUserOps = ({ executeSlurmScript }: SlurmClusterInfo): UserOps => { - - return { - addUserToAccount: async ({ request, logger }) => { - const { accountName, userId } = request; - const result = await executeSlurmScript(["-g", accountName, "0", userId], logger); - - return handleSimpleResponse(result, { 3: "ALREADY_EXISTS" }); - }, - removeUser: async ({ request, logger }) => { - const { accountName, userId } = request; - const result = await executeSlurmScript(["-k", accountName, userId], logger); - - return handleSimpleResponse(result, { 4: "NOT_FOUND" }); - }, - - blockUserInAccount: async ({ request, logger }) => { - const { accountName, userId } = request; - const result = await executeSlurmScript(["-o", accountName, userId], logger); - - return handleSimpleResponse(result, { 4: "NOT_FOUND" }); - }, - - unblockUserInAccount: async ({ request, logger }) => { - const { accountName, userId } = request; - const result = await executeSlurmScript(["-r", accountName, userId], logger); - - return handleSimpleResponse(result, { 4: "NOT_FOUND" }); - }, - - }; -}; diff --git a/apps/mis-server/src/clusterops/slurm/utils/parse.ts b/apps/mis-server/src/clusterops/slurm/utils/parse.ts deleted file mode 100644 index 2347bc6d28..0000000000 --- a/apps/mis-server/src/clusterops/slurm/utils/parse.ts +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. 
- * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. - */ - -import { ClusterAccountInfo, ClusterAccountInfo_ImportStatus, - UserInAccount } from "@scow/protos/build/server/admin"; - -// Parses slurm.sh output -// Accounts with no user are not included -export function parseClusterAccounts(dataStr: string): ClusterAccountInfo[] { - - const accounts: ClusterAccountInfo[] = []; - - if (dataStr.trim() === "") { return accounts; } - - const lines = dataStr.trim().split("\n"); - lines.push(""); - - let i = 0; - while (i < lines.length - 1) { - const account = lines[i].trim(); - const accountIndex = accounts.push({ - accountName: account, - users: [] as UserInAccount[], - importStatus: ClusterAccountInfo_ImportStatus.NOT_EXISTING, - blocked: true, - }); - i++; - while (i < lines.length && lines[i].trim() !== "") { - if (lines[i].trim().startsWith("There is no user in account")) { - accounts.pop(); - break; - } - const [user, status] = lines[i].split(":").map((x) => x.trim()); - if (account === "a_" + user && accounts[accountIndex - 1].owner === undefined) { - accounts[accountIndex - 1].owner = user; - } - accounts[accountIndex - 1].users.push({ userId: user, userName: user, state: status }); - i++; - } - i++; - } - - return accounts; -} - -export function parseBlockStatus(dataStr: string): Record { - const lines = dataStr.split("\n"); - const result: Record = {}; - - for (const line of lines) { - const match = line.match(/^Account (\S+) is (allowed|blocked)!$/); - if (match) { - const accountName = match[1]; - const isBlocked = match[2] === "blocked"; - result[accountName] = isBlocked; - } - } - - return result; -} diff --git a/apps/mis-server/src/clusterops/slurm/utils/slurm.ts 
b/apps/mis-server/src/clusterops/slurm/utils/slurm.ts deleted file mode 100644 index 6b521454e2..0000000000 --- a/apps/mis-server/src/clusterops/slurm/utils/slurm.ts +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. - * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. - */ - -import { Logger } from "@ddadaal/tsgrpc-server"; -import { SlurmMisConfigSchema } from "@scow/config/build/mis"; -import { loggedExec, sshConnect, SSHExecError } from "@scow/lib-ssh"; -import { rootKeyPair } from "src/config/env"; - -// Won't throw if return code is not zero -export const executeScript = async ( - slurmMisConfig: SlurmMisConfigSchema, cmd: string, parameters: string[], env: NodeJS.ProcessEnv, logger: Logger, -) => { - - const host = slurmMisConfig.managerUrl; - - return await sshConnect(host, "root", rootKeyPair, logger, async (ssh) => { - return await loggedExec(ssh, logger, false, cmd, parameters, { execOptions: { env } }); - }); -}; - - -export const executeSlurmScript = async ( - slurmMisConfig: SlurmMisConfigSchema, partitions: string[], params: string[], logger: Logger, -) => { - - const partitionsParam = partitions.join(" "); - - const result = await executeScript(slurmMisConfig, slurmMisConfig.scriptPath, params, { - BASE_PARTITIONS: partitionsParam, - CLUSTER_NAME: slurmMisConfig.clusterName, - DB_HOST: slurmMisConfig.dbHost, - DB_PORT: String(slurmMisConfig.dbPort), - DB_USER: slurmMisConfig.dbUser, - DB_PASSWORD: slurmMisConfig.dbPassword, - SLURM_ACCT_DB_NAME: 
slurmMisConfig.slurmAcctDbName, - }, logger); - - return result; -}; - -export const throwIfNotReturn0 = (result: Awaited>) => { - if (result.code !== 0) { - throw new SSHExecError(result); - } -}; - -/** - * If result is zero, return "OK". If result is in map, return the corresponding value. Otherwise throw. - * @param result the SSH exec response - * @param map the map from exit code to error code - * @returns the error code - */ -export const handleSimpleResponse = ( - result: Awaited>, map: Record, -) => { - if (result.code === null) { throw new Error("Slurm script exited with null code"); } - - if (result.code === 0) { return { code: "OK" as const }; } - - const code = map[result.code]; - if (code) { return { code }; } - throw new SSHExecError(result); -}; - diff --git a/apps/mis-server/src/config/clusters.ts b/apps/mis-server/src/config/clusters.ts index 60ce0e650a..7bae580abd 100644 --- a/apps/mis-server/src/config/clusters.ts +++ b/apps/mis-server/src/config/clusters.ts @@ -14,15 +14,3 @@ import { getClusterConfigs } from "@scow/config/build/cluster"; import { logger } from "src/utils/logger"; export const clusters = getClusterConfigs(undefined, logger); - -// map slurm cluster id to scow cluster id -export const clusterIdMap = Object.entries(clusters).reduce((prev, [key, value]) => { - if (value.scheduler === "slurm" && value.slurm && value.slurm.mis) { - prev[value.slurm.mis.clusterName] = key; - } - return prev; -}, { } as Record); - -export function clusterNameToScowClusterId(clusterName: string) { - return clusterIdMap[clusterName]; -} diff --git a/apps/mis-server/src/entities/JobInfo.ts b/apps/mis-server/src/entities/JobInfo.ts index a47afcc0ee..6cf9084ec7 100644 --- a/apps/mis-server/src/entities/JobInfo.ts +++ b/apps/mis-server/src/entities/JobInfo.ts @@ -12,8 +12,7 @@ import { Entity, Index, PrimaryKey, Property } from "@mikro-orm/core"; import { Decimal } from "@scow/lib-decimal"; -import { clusterNameToScowClusterId } from "src/config/clusters"; 
-import type { OriginalJob } from "src/entities/OriginalJob"; +import { JobInfo as ClusterJobInfo } from "@scow/scheduler-adapter-protos/build/protos/job"; import { DECIMAL_DEFAULT_RAW, DecimalType } from "src/utils/decimal"; const UNKNOWN_PRICE_ITEM = "UNKNOWN"; @@ -118,31 +117,30 @@ export class JobInfo { constructor( - job: OriginalJob, + job: {cluster: string} & ClusterJobInfo, tenant: string | undefined, jobPriceInfo: JobPriceInfo, ) { - this.biJobIndex = job.biJobIndex; - this.idJob = job.idJob; + this.idJob = job.jobId; this.account = job.account; this.tenant = tenant ?? ""; this.user = job.user; this.partition = job.partition; - this.nodelist = job.nodelist; - this.jobName = job.jobName; - this.cluster = clusterNameToScowClusterId(job.cluster); - this.gpu = job.gpu; + this.nodelist = job.nodeList!; + this.jobName = job.name; + this.cluster = job.cluster; + this.gpu = job.gpusAlloc!; this.cpusReq = job.cpusReq; - this.memReq = job.memReq; + this.memReq = job.memReqMb; this.nodesReq = job.nodesReq; - this.cpusAlloc = job.cpusAlloc; - this.memAlloc = job.memAlloc; - this.nodesAlloc = job.nodesAlloc; - this.timelimit = job.timelimit; - this.timeUsed = job.timeUsed; - this.timeWait = job.timeWait; + this.cpusAlloc = job.cpusAlloc!; + this.memAlloc = job.memAllocMb!; + this.nodesAlloc = job.nodesAlloc!; + this.timelimit = job.timeLimitMinutes; + this.timeUsed = job.elapsedSeconds!; + this.timeWait = ((new Date(job.startTime!)).getTime() - (new Date(job.submitTime!)).getTime()) / 1000; this.qos = job.qos; this.tenantPrice = jobPriceInfo.tenant?.price ?? new Decimal(0); @@ -150,9 +148,8 @@ export class JobInfo { this.accountPrice = jobPriceInfo.account?.price ?? new Decimal(0); this.accountBillingItemId = jobPriceInfo.tenant?.billingItemId ?? 
UNKNOWN_PRICE_ITEM; - this.recordTime = job.recordTime; - this.timeSubmit = job.timeSubmit; - this.timeStart = job.timeStart; - this.timeEnd = job.timeEnd; + this.timeSubmit = new Date(job.submitTime!); + this.timeStart = new Date(job.startTime!); + this.timeEnd = new Date(job.endTime!); } } diff --git a/apps/mis-server/src/entities/OriginalJob.ts b/apps/mis-server/src/entities/OriginalJob.ts deleted file mode 100644 index d41e74874b..0000000000 --- a/apps/mis-server/src/entities/OriginalJob.ts +++ /dev/null @@ -1,93 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. - * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. 
- */ - -import { Entity, Index, PrimaryKey, Property } from "@mikro-orm/core"; -import { misConfig } from "src/config/mis"; - -@Entity({ tableName: misConfig.fetchJobs.db.tableName }) -export class OriginalJob { - - @PrimaryKey() - biJobIndex!: number; - - @Property() - idJob!: number; - - @Property({ length: 255, columnType: "tinytext", comment: "账户" }) - account!: string; - - @Index({ name: "user" }) - @Property({ length: 127, comment: "用户名" }) - user!: string; - - @Property({ length: 255, columnType: "tinytext", comment: "分区" }) - partition!: string; - - @Property({ columnType: "text", length: 65535, comment: "使用节点列表" }) - nodelist!: string; - - @Property({ length: 255, columnType: "tinytext", comment: "作业名" }) - jobName!: string; - - @Property({ length: 50, comment: "集群名" }) - cluster!: string; - - @Index({ name: "time_submit" }) - @Property({ comment: "提交时间" }) - timeSubmit!: Date; - - @Index({ name: "time_start" }) - @Property({ comment: "开始时间" }) - timeStart!: Date; - - @Index({ name: "time_end" }) - @Property({ comment: "结束时间" }) - timeEnd!: Date; - - @Property({ columnType: "int(10)", comment: "使用GPU数。来自gres_req字段" }) - gpu!: number; - - @Property({ columnType: "int unsigned", comment: "申请CPU数tres_req" }) - cpusReq!: number; - - @Property({ columnType: "int unsigned", comment: "申请的内存,单位MB,来自tres_req" }) - memReq!: number; - - @Property({ columnType: "int unsigned", comment: "申请节点数,tres_req" }) - nodesReq!: number; - - @Property({ columnType: "int unsigned", comment: "分配CPU数tres_alloc" }) - cpusAlloc!: number; - - @Property({ columnType: "int unsigned", comment: "分配的内存,单位MB,来自tres_alloc" }) - memAlloc!: number; - - @Property({ columnType: "int unsigned", comment: "分配节点数tres_alloc" }) - nodesAlloc!: number; - - @Property({ columnType: "int unsigned", comment: "作业时间限制" }) - timelimit!: number; - - @Index({ name: "time_used" }) - @Property({ columnType: "bigint unsigned", comment: "作业执行时间" }) - timeUsed!: number; - - @Index({ name: "time_wait" }) - @Property({ 
columnType: "bigint unsigned", comment: "作业等待时间" }) - timeWait!: number; - - @Property({ length: 255, comment: "QOS" }) - qos!: string; - - @Property({ columnType: "timestamp", defaultRaw: "CURRENT_TIMESTAMP", comment: "记录时间" }) - recordTime!: Date; -} diff --git a/apps/mis-server/src/migrations/Migration20221213020347.ts b/apps/mis-server/src/migrations/Migration20221213020347.ts deleted file mode 100644 index 48931f7fb2..0000000000 --- a/apps/mis-server/src/migrations/Migration20221213020347.ts +++ /dev/null @@ -1,20 +0,0 @@ -import { Migration } from '@mikro-orm/migrations'; -import { clusterIdMap } from "src/config/clusters"; - -export class Migration20221213020347 extends Migration { - - async up(): Promise { - Object.entries(clusterIdMap).forEach(([prev, curr]) => { - if (prev === curr) { return; } - this.addSql(`update \`job_info\` set \`cluster\` = '${curr}' where \`cluster\` = '${prev}';`); - }); - } - - async down(): Promise { - Object.entries(clusterIdMap).forEach(([prev, curr]) => { - if (prev === curr) { return; } - this.addSql(`update \`job_info\` set \`cluster\` = '${prev}' where \`cluster\` = '${curr}';`); - }); - } - -} diff --git a/apps/mis-server/src/plugins/clusters.ts b/apps/mis-server/src/plugins/clusters.ts index a830d6978b..9f99b7620f 100644 --- a/apps/mis-server/src/plugins/clusters.ts +++ b/apps/mis-server/src/plugins/clusters.ts @@ -14,9 +14,8 @@ import { ServiceError } from "@ddadaal/tsgrpc-common"; import { Logger, plugin } from "@ddadaal/tsgrpc-server"; import { status } from "@grpc/grpc-js"; import { getLoginNode } from "@scow/config/build/cluster"; +import { getSchedulerAdapterClient, SchedulerAdapterClient } from "@scow/lib-scheduler-adapter"; import { testRootUserSshLogin } from "@scow/lib-ssh"; -import { ClusterOps } from "src/clusterops/api"; -import { createSlurmOps } from "src/clusterops/slurm"; import { clusters } from "src/config/clusters"; import { rootKeyPair } from "src/config/env"; import { scowErrorMetadata } from 
"src/utils/error"; @@ -29,13 +28,13 @@ type CallOnAllResult = ({ cluster: string; } & ( // Throw ServiceError if failed. type CallOnAll = ( logger: Logger, - call: (ops: ClusterOps) => Promise, + call: (client: SchedulerAdapterClient) => Promise, ) => Promise>; type CallOnOne = ( cluster: string, logger: Logger, - call: (ops: ClusterOps) => Promise, + call: (client: SchedulerAdapterClient) => Promise, ) => Promise; export type ClusterPlugin = { @@ -45,16 +44,12 @@ export type ClusterPlugin = { } } -const clusterOpsMaps = { - "slurm": createSlurmOps, -} as const; - export const CLUSTEROPS_ERROR_CODE = "CLUSTEROPS_ERROR"; export const clustersPlugin = plugin(async (f) => { if (process.env.NODE_ENV === "production") { - await Promise.all(Object.values(clusters).map(async ({ displayName, slurm: { loginNodes } }) => { + await Promise.all(Object.values(clusters).map(async ({ displayName, loginNodes }) => { const loginNode = getLoginNode(loginNodes[0]); const address = loginNode.address; const node = loginNode.name; @@ -69,46 +64,37 @@ export const clustersPlugin = plugin(async (f) => { })); } - const opsForClusters = Object.entries(clusters).reduce((prev, [cluster, c]) => { - const ops = clusterOpsMaps[(c.scheduler as keyof typeof clusterOpsMaps)](cluster, f.logger); + const adapterClientForClusters = Object.entries(clusters).reduce((prev, [cluster, c]) => { + const client = getSchedulerAdapterClient(c.adapterUrl); - if (ops) { - prev[cluster] = { ops, ignore: c.misIgnore }; - } + prev[cluster] = client; return prev; - }, {} as Record); - - for (const ops of Object.values(opsForClusters).filter((x) => !x.ignore).map((x) => x.ops)) { - await ops.onStartup(); - } + }, {} as Record); - const getClusterOps = (cluster: string) => { - return opsForClusters[cluster]; + const getAdapterClient = (cluster: string) => { + return adapterClientForClusters[cluster]; }; const clustersPlugin = { callOnOne: (async (cluster, logger, call) => { - const ops = getClusterOps(cluster); + const 
client = getAdapterClient(cluster); - if (!ops) { + if (!client) { throw new Error("Calling actions on non-existing cluster " + cluster); } - if (ops.ignore) { - throw new Error("Call specific actions on ignored cluster " + cluster); - } - return await call(ops.ops); + logger.info("Calling actions on cluster " + cluster); + return await call(client); }), // throws error if failed. callOnAll: (async (logger, call) => { - const results = await Promise.all(Object.entries(opsForClusters) - .filter(([_, c]) => !c.ignore) - .map(async ([cluster, ops]) => { - return call(ops.ops).then((result) => { + const results = await Promise.all(Object.entries(adapterClientForClusters) + .map(async ([cluster, client]) => { + return call(client).then((result) => { logger.info("Executing on %s success", cluster); return { cluster, success: true, result }; }).catch((e) => { diff --git a/apps/mis-server/src/plugins/fetch.ts b/apps/mis-server/src/plugins/fetch.ts index e5343d6897..69e07293ca 100644 --- a/apps/mis-server/src/plugins/fetch.ts +++ b/apps/mis-server/src/plugins/fetch.ts @@ -29,10 +29,16 @@ export interface FetchPlugin { export const fetchPlugin = plugin(async (f) => { let fetchStarted = !!misConfig.fetchJobs.periodicFetch; + let fetchIsRunning = false; const logger = f.logger.child({ plugin: "fetch" }); - const trigger = () => fetchJobs(f.ext.orm.em.fork(), logger, f.ext, f.ext); + const trigger = () => { + if (fetchIsRunning) return; + + fetchIsRunning = true; + return fetchJobs(f.ext.orm.em.fork(), logger, f.ext, f.ext).finally(() => { fetchIsRunning = false; }); + }; const task = cron.schedule( misConfig.fetchJobs.periodicFetch.cron, @@ -74,4 +80,4 @@ export const fetchPlugin = plugin(async (f) => { lastFetched: () => lastFetched, fetch: trigger, }); -}); \ No newline at end of file +}); diff --git a/apps/mis-server/src/plugins/price.ts b/apps/mis-server/src/plugins/price.ts index f81722678d..41766f690d 100644 --- a/apps/mis-server/src/plugins/price.ts +++ 
b/apps/mis-server/src/plugins/price.ts @@ -25,7 +25,7 @@ export const pricePlugin = plugin(async (s) => { const logger = s.logger.child({ plugin: "price" }); // check price item completeness - const priceMap = await createPriceMap(s.ext.orm.em.fork(), logger); + const priceMap = await createPriceMap(s.ext.orm.em.fork(), s.ext.clusters, logger); const missingItems = priceMap.getMissingDefaultPriceItems(); if (missingItems.length > 0) { logger.warn(` @@ -37,7 +37,7 @@ export const pricePlugin = plugin(async (s) => { } s.addExtension("price", { - createPriceMap: () => createPriceMap(s.ext.orm.em.fork(), logger), + createPriceMap: () => createPriceMap(s.ext.orm.em.fork(), s.ext.clusters, logger), }); }); diff --git a/apps/mis-server/src/services/account.ts b/apps/mis-server/src/services/account.ts index a72894c13f..5c06b6e89d 100644 --- a/apps/mis-server/src/services/account.ts +++ b/apps/mis-server/src/services/account.ts @@ -10,6 +10,7 @@ * See the Mulan PSL v2 for more details. */ +import { asyncClientCall } from "@ddadaal/tsgrpc-client"; import { plugin } from "@ddadaal/tsgrpc-server"; import { ServiceError } from "@grpc/grpc-js"; import { Status } from "@grpc/grpc-js/build/src/constants"; @@ -167,29 +168,23 @@ export const accountServiceServer = plugin((server) => { logger.info("Creating account in cluster."); await server.ext.clusters.callOnAll( logger, - async (ops) => { - const resp = await ops.account.createAccount({ - request: { accountName, ownerId }, - logger, + async (client) => { + await asyncClientCall(client.account, "createAccount", { + accountName, ownerUserId: ownerId, }); - if (resp.code === "ALREADY_EXISTS") { - // the account is already exists. 
add the owner to the account manually - await ops.user.addUserToAccount({ - request: { accountName, userId: user.userId }, - logger, - }); - } - - const blockResp = await ops.account.blockAccount({ - request: { accountName }, - logger, + await asyncClientCall(client.account, "blockAccount", { + accountName, + }).catch((e) => { + if (e.code === Status.NOT_FOUND) { + throw { + code: Status.INTERNAL, message: `Account ${accountName} hasn't been created. block failed`, + }; + } else { + throw e; + } }); - if (blockResp.code === "NOT_FOUND") { - throw { - code: Status.INTERNAL, message: `Account ${accountName} hasn't been created. block failed`, - }; - } + }, ).catch(async (e) => { await rollback(e); diff --git a/apps/mis-server/src/services/admin.ts b/apps/mis-server/src/services/admin.ts index fbc3f800ad..12fdf330fb 100644 --- a/apps/mis-server/src/services/admin.ts +++ b/apps/mis-server/src/services/admin.ts @@ -10,10 +10,12 @@ * See the Mulan PSL v2 for more details. */ +import { asyncClientCall } from "@ddadaal/tsgrpc-client"; import { plugin } from "@ddadaal/tsgrpc-server"; import { ServiceError } from "@grpc/grpc-js"; import { Status } from "@grpc/grpc-js/build/src/constants"; import { AdminServiceServer, AdminServiceService, + ClusterAccountInfo, ClusterAccountInfo_ImportStatus } from "@scow/protos/build/server/admin"; import { updateBlockStatusInSlurm } from "src/bl/block"; import { importUsers, ImportUsersData } from "src/bl/importUsers"; @@ -26,42 +28,43 @@ import { UserAccount, UserRole } from "src/entities/UserAccount"; export const adminServiceServer = plugin((server) => { server.addService(AdminServiceService, { - changeStorageQuota: async ({ request, em, logger }) => { - const { cluster, mode, userId, value } = request; + changeStorageQuota: async ({}) => { + // const { cluster, mode, userId, value } = request; - const quota = await em.findOne(StorageQuota, { - user: { userId }, cluster, - }); + // const quota = await em.findOne(StorageQuota, { + // 
user: { userId }, cluster, + // }); - if (!quota) { - throw { - code: Status.NOT_FOUND, message: `User ${userId} or cluster ${cluster} is not found`, - }; - } + // if (!quota) { + // throw { + // code: Status.NOT_FOUND, message: `User ${userId} or cluster ${cluster} is not found`, + // }; + // } - const reply = await server.ext.clusters.callOnOne( - cluster, - logger, - async (ops) => ops.storage.changeStorageQuota({ request: { mode, userId, value }, logger }), - ); + // const reply = await server.ext.clusters.callOnOne( + // cluster, + // logger, + // async (ops) => ops.storage.changeStorageQuota({ request: { mode, userId, value }, logger }), + // ); - if (reply.code === "NOT_FOUND") { - throw { - code: Status.NOT_FOUND, message: `User ${userId} or cluster ${cluster} is not found`, - }; - } + // if (reply.code === "NOT_FOUND") { + // throw { + // code: Status.NOT_FOUND, message: `User ${userId} or cluster ${cluster} is not found`, + // }; + // } - if (reply.code === "INVALID_VALUE") { - throw { - code: Status.INVALID_ARGUMENT, message: `The changed storage quota value ${value} is not valid`, - }; - } + // if (reply.code === "INVALID_VALUE") { + // throw { + // code: Status.INVALID_ARGUMENT, message: `The changed storage quota value ${value} is not valid`, + // }; + // } - quota.storageQuota = reply.currentQuota; + // quota.storageQuota = reply.currentQuota; - await em.flush(); + // await em.flush(); - return [{ currentQuota: quota.storageQuota }]; + // return [{ currentQuota: quota.storageQuota }]; + return [{ currentQuota: 10 }]; }, @@ -110,11 +113,11 @@ export const adminServiceServer = plugin((server) => { const result = await server.ext.clusters.callOnOne( cluster, logger, - async (ops) => ops.account.getAllAccountsWithUsers({ - request: {}, logger, - }), + async (client) => await asyncClientCall(client.account, "getAllAccountsWithUsers", {}), ); + const accounts: ClusterAccountInfo[] = []; + const includedAccounts = await em.find(Account, { accountName: { 
$in: result.accounts.map((x) => x.accountName) }, }, { populate: ["users", "users.user"]}); @@ -127,8 +130,9 @@ export const adminServiceServer = plugin((server) => { const includedAccount = includedAccounts.find((x) => x.accountName === account.accountName); if (!includedAccount) { // account not existed in scow - account.importStatus = ClusterAccountInfo_ImportStatus.NOT_EXISTING; + accounts.push({ ...account, importStatus: ClusterAccountInfo_ImportStatus.NOT_EXISTING }); } else { + let status: ClusterAccountInfo_ImportStatus; if ( !account.users.every((user) => @@ -139,14 +143,16 @@ export const adminServiceServer = plugin((server) => { ) ) { // some users in account not existed in scow - account.importStatus = ClusterAccountInfo_ImportStatus.HAS_NEW_USERS; + status = ClusterAccountInfo_ImportStatus.HAS_NEW_USERS; } else { // both users and account exist in scow - account.importStatus = ClusterAccountInfo_ImportStatus.EXISTING; + status = ClusterAccountInfo_ImportStatus.EXISTING; } account.owner = includedUserAccounts .find((x) => x.account.$.accountName === account.accountName && x.role === UserRole.OWNER)!.user.$.userId; + + accounts.push({ ...account, importStatus: status }); } }); @@ -155,10 +161,10 @@ export const adminServiceServer = plugin((server) => { [ClusterAccountInfo_ImportStatus.HAS_NEW_USERS]: 1, [ClusterAccountInfo_ImportStatus.EXISTING]: 2, }; - result.accounts.sort((a, b) => { + accounts.sort((a, b) => { return order[a.importStatus] - order[b.importStatus]; }); - return [result]; + return [{ accounts }]; }, getFetchInfo: async () => { @@ -184,7 +190,7 @@ export const adminServiceServer = plugin((server) => { fetchJobs: async () => { const reply = await server.ext.fetch.fetch(); - return [reply]; + return [reply ? 
reply : { newJobsCount: 0 }]; }, updateBlockStatus: async ({ em, logger }) => { diff --git a/apps/mis-server/src/services/config.ts b/apps/mis-server/src/services/config.ts new file mode 100644 index 0000000000..e041508b17 --- /dev/null +++ b/apps/mis-server/src/services/config.ts @@ -0,0 +1,31 @@ +/** + * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy + * SCOW is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + */ + +import { asyncClientCall } from "@ddadaal/tsgrpc-client"; +import { plugin } from "@ddadaal/tsgrpc-server"; +import { ConfigServiceServer, ConfigServiceService } from "@scow/protos/build/common/config"; + +export const configServiceServer = plugin((server) => { + server.addService(ConfigServiceService, { + getClusterConfig: async ({ request, logger }) => { + const { cluster } = request; + + const reply = await server.ext.clusters.callOnOne( + cluster, + logger, + async (client) => await asyncClientCall(client.config, "getClusterConfig", {}), + ); + + return [reply]; + }, + }); +}); diff --git a/apps/mis-server/src/services/job.ts b/apps/mis-server/src/services/job.ts index 70818b52d0..459513f586 100644 --- a/apps/mis-server/src/services/job.ts +++ b/apps/mis-server/src/services/job.ts @@ -10,17 +10,20 @@ * See the Mulan PSL v2 for more details. 
*/ +import { asyncClientCall } from "@ddadaal/tsgrpc-client"; import { ensureNotUndefined, plugin } from "@ddadaal/tsgrpc-server"; import { ServiceError, status } from "@grpc/grpc-js"; import { Status } from "@grpc/grpc-js/build/src/constants"; import { FilterQuery, QueryOrder, UniqueConstraintViolationException } from "@mikro-orm/core"; import { Decimal, decimalToMoney, moneyToNumber } from "@scow/lib-decimal"; +import { jobInfoToRunningjob } from "@scow/lib-scheduler-adapter"; import { GetJobsResponse, JobBillingItem, JobFilter, JobInfo, JobServiceServer, JobServiceService, } from "@scow/protos/build/server/job"; +import { JobInfo as AdapterJobInfo } from "@scow/scheduler-adapter-protos/build/protos/job"; import { charge, pay } from "src/bl/charging"; import { getActiveBillingItems } from "src/bl/PriceMap"; import { misConfig } from "src/config/mis"; @@ -254,31 +257,42 @@ export const jobServiceServer = plugin((server) => { const reply = await server.ext.clusters.callOnOne( cluster, logger, - async (ops) => ops.job.getRunningJobs({ - request: { userId, accountNames, jobIdList }, - logger, - }), + async (client) => { + const fields = [ + "job_id", "partition", "name", "user", "state", "elapsed_seconds", + "nodes_alloc", "node_list", "reason", "account", "cpus_alloc", + "qos", "submit_time", "time_limit_minutes", "working_directory", + ]; + if (jobIdList.length > 0) { + const jobInfoList: AdapterJobInfo[] = []; + for (const jobId of jobIdList) { /* for...of iterates the requested job IDs; for...in would yield array indices instead */ + const jobInfo = await asyncClientCall(client.job, "getJobById", { fields, jobId: Number(jobId) }); + if (jobInfo.job) jobInfoList.push(jobInfo.job); + } + return jobInfoList; + } else { + return await asyncClientCall(client.job, "getJobs", { + fields, + filter: { users: userId ? 
[userId] : [], accounts: accountNames, states: ["RUNNING", "PENDING"]}, + }).then((x) => x.jobs); + } + }, ); - return [{ jobs: reply.jobs }]; + return [{ jobs: reply.map(jobInfoToRunningjob) }]; }, changeJobTimeLimit: async ({ request, logger }) => { const { cluster, delta, jobId } = request; - const reply = await server.ext.clusters.callOnOne( + await server.ext.clusters.callOnOne( cluster, logger, - async (ops) => ops.job.changeJobTimeLimit({ - request: { delta, jobId }, logger, + async (client) => await asyncClientCall(client.job, "changeJobTimeLimit", { + jobId: Number(jobId), deltaMinutes: delta, }), ); - if (reply.code === "NOT_FOUND") { - throw { - code: Status.NOT_FOUND, message: `Cluster ${cluster} or job ${jobId} is not found.`, - }; - } return [{}]; }, @@ -290,19 +304,10 @@ export const jobServiceServer = plugin((server) => { const reply = await server.ext.clusters.callOnOne( cluster, logger, - async (ops) => ops.job.queryJobTimeLimit({ - request: { jobId }, - logger, - }), + async (client) => asyncClientCall(client.job, "queryJobTimeLimit", { jobId: Number(jobId) }), ); - if (reply.code === "NOT_FOUND") { - throw { - code: Status.NOT_FOUND, message: `Cluster ${cluster} or job ${jobId} is not found.`, - }; - } - - return [{ limit: reply.limit }]; + return [{ limit: reply.timeLimitMinutes * 60 }]; }, getBillingItems: async ({ request, em }) => { diff --git a/apps/mis-server/src/services/user.ts b/apps/mis-server/src/services/user.ts index 8391a5aaa2..2f39dc67c9 100644 --- a/apps/mis-server/src/services/user.ts +++ b/apps/mis-server/src/services/user.ts @@ -10,6 +10,7 @@ * See the Mulan PSL v2 for more details. 
*/ +import { asyncClientCall } from "@ddadaal/tsgrpc-client"; import { plugin } from "@ddadaal/tsgrpc-server"; import { ServiceError } from "@grpc/grpc-js"; import { Status } from "@grpc/grpc-js/build/src/constants"; @@ -107,25 +108,25 @@ export const userServiceServer = plugin((server) => { }]; }, - queryUsedStorageQuota: async ({ request, logger }) => { - const { cluster, userId } = request; + queryUsedStorageQuota: async ({}) => { + // const { cluster, userId } = request; - const reply = await server.ext.clusters.callOnOne( - cluster, - logger, - async (ops) => ops.storage.queryUsedStorageQuota({ - request: { userId }, logger, - }), - ); + // const reply = await server.ext.clusters.callOnOne( + // cluster, + // logger, + // async (ops) => ops.storage.queryUsedStorageQuota({ + // request: { userId }, logger, + // }), + // ); - if (reply.code === "NOT_FOUND") { - throw { - code: Status.NOT_FOUND, message: `User ${userId} is not found.`, - }; - } + // if (reply.code === "NOT_FOUND") { + // throw { + // code: Status.NOT_FOUND, message: `User ${userId} is not found.`, + // }; + // } return [{ - used: reply.used, + used: 10, }]; }, @@ -153,8 +154,8 @@ export const userServiceServer = plugin((server) => { }; } - await server.ext.clusters.callOnAll(logger, async (ops) => { - return await ops.user.addUserToAccount({ request: { accountName, userId }, logger }); + await server.ext.clusters.callOnAll(logger, async (client) => { + return await asyncClientCall(client.user, "addUserToAccount", { userId, accountName }); }); const newUserAccount = new UserAccount({ @@ -196,9 +197,9 @@ export const userServiceServer = plugin((server) => { }; } - await server.ext.clusters.callOnAll(logger, - async (ops) => ops.user.removeUser({ request: { accountName, userId }, logger }), - ); + await server.ext.clusters.callOnAll(logger, async (client) => { + return await asyncClientCall(client.user, "removeUserFromAccount", { userId, accountName }); + }); await em.removeAndFlush(userAccount); 
diff --git a/apps/mis-server/src/tasks/fetch.ts b/apps/mis-server/src/tasks/fetch.ts index 205821728b..66f895c68a 100644 --- a/apps/mis-server/src/tasks/fetch.ts +++ b/apps/mis-server/src/tasks/fetch.ts @@ -10,62 +10,56 @@ * See the Mulan PSL v2 for more details. */ +import { asyncClientCall } from "@ddadaal/tsgrpc-client"; import { Logger } from "@ddadaal/tsgrpc-server"; -import { MikroORM, QueryOrder } from "@mikro-orm/core"; -import { MariaDbDriver } from "@mikro-orm/mariadb"; +import { QueryOrder } from "@mikro-orm/core"; import { SqlEntityManager } from "@mikro-orm/mysql"; import { parsePlaceholder } from "@scow/lib-config"; +import { GetJobsResponse, JobInfo as ClusterJobInfo } from "@scow/scheduler-adapter-protos/build/protos/job"; import { addJobCharge, charge } from "src/bl/charging"; import { emptyJobPriceInfo } from "src/bl/jobPrice"; -import { clusterNameToScowClusterId } from "src/config/clusters"; import { misConfig } from "src/config/mis"; import { Account } from "src/entities/Account"; import { JobInfo } from "src/entities/JobInfo"; -import { OriginalJob } from "src/entities/OriginalJob"; import { UserAccount } from "src/entities/UserAccount"; import { ClusterPlugin } from "src/plugins/clusters"; import { PricePlugin } from "src/plugins/price"; +import testData from "src/testData.json"; -export const createSourceDbOrm = async (logger: Logger) => { - logger.info("Connecting to source db."); - - const dbConnection = await MikroORM.init({ - host: misConfig.fetchJobs.db.host, - port: misConfig.fetchJobs.db.port, - user: misConfig.fetchJobs.db.user, - dbName: misConfig.fetchJobs.db.dbName, - password: misConfig.fetchJobs.db.password, - type: misConfig.fetchJobs.db.type, - forceUndefined: true, - logger: (msg) => logger.info(msg), - entities: [OriginalJob], - }); +async function getLatestDate(em: SqlEntityManager, logger: Logger) { - logger.info("Source db is connected."); + const query = em.fork().createQueryBuilder(JobInfo) + .select("timeEnd") + 
.orderBy({ timeEnd: QueryOrder.DESC }); - return { - dbConnection, - getEm: () => dbConnection.em.fork(), - close: async () => { - logger.info("Closing source db connection."); - await dbConnection.close(); - logger.info("Source db connection has been closed."); - }, - }; -}; + const { timeEnd = undefined } = (await query.execute("get")) ?? {}; -async function getLatestIndex(em: SqlEntityManager, logger: Logger) { + logger.info(`Latest fetched job's end_time is ${timeEnd}.`); - const query = em.fork().createQueryBuilder(JobInfo) - .select("biJobIndex") - .orderBy({ biJobIndex: QueryOrder.DESC }); + return timeEnd; +} - const { biJobIndex = 0 } = (await query.execute("get")) ?? {}; +const processGetJobsResult = (result: ({ cluster: string; } & ( + | { success: true; result: GetJobsResponse } + | { success: false; error: any } +))[]) => { + const jobs: ({cluster: string} & ClusterJobInfo)[] = []; + result.forEach((clusterResp) => { + if (clusterResp.success) { + jobs.push(...clusterResp.result.jobs.map((job) => ({ cluster: clusterResp.cluster, ...job }))); + } + }); - logger.info(`Latest biJobIndex from billing db is ${biJobIndex}.`); + // sort by end time + jobs.sort((a, b) => { + const endTimeA = new Date(a.endTime!).getTime(); + const endTimeB = new Date(b.endTime!).getTime(); + return endTimeA - endTimeB; + }); - return biJobIndex; -} + // filter jobs without start time + return jobs.filter((x) => x.startTime); +}; export let lastFetched: Date | null = null; @@ -85,140 +79,200 @@ export async function fetchJobs( const priceMap = await pricePlugin.price.createPriceMap(); - const sourceOrm = await createSourceDbOrm(logger); - - try { + const persistJobAndCharge = async (jobs: ({ cluster: string } & ClusterJobInfo)[]) => { + const result = await em.transactional(async (em) => { + // Calculate prices for new info and persist + const pricedJobs = [] as JobInfo[]; + for (const job of jobs) { + const tenant = accountTenantMap.get(job.account); - const latestIndex = 
await getLatestIndex(em, logger); + if (!tenant) { + logger.warn("Account %s doesn't exist. Doesn't charge the job.", job.account); + } - const startIndex = Math.max(latestIndex + 1, misConfig.fetchJobs.startIndex); - logger.info(`Fetching new info from ${startIndex}`); + try { + const price = tenant ? priceMap.calculatePrice({ + jobId: job.jobId, + cluster: job.cluster, + cpusAlloc: job.cpusAlloc!, + gpu: job.gpusAlloc!, + memAlloc: job.memAllocMb!, + memReq: job.memReqMb, + partition: job.partition, + qos: job.qos, + timeUsed: job.elapsedSeconds!, + account: job.account, + tenant, + }) : emptyJobPriceInfo(); - // Fetch new info + const pricedJob = new JobInfo(job, tenant, price); - // count data - const sourceEm = sourceOrm.getEm(); + em.persist(pricedJob); + await em.flush(); + + pricedJobs.push(pricedJob); + } catch (error) { + logger.warn("invalid job. cluster: %s, jobId: %s, error: %s", job.cluster, job.jobId, error); + } + } + + // add job charge for user account + for (const x of pricedJobs) { + + // add job charge for the user + const ua = await em.findOne(UserAccount, { + account: { accountName: x.account }, + user: { userId: x.user }, + }, { + populate: ["user", "account", "account.tenant"], + }); - const count = await sourceEm.count(OriginalJob, { biJobIndex: { $gte: startIndex } }); - logger.info(`${count} new records to fetch.`); + if (!ua) { + logger.warn({ biJobIndex: x.biJobIndex }, + "User %s in account %s is not found. Don't charge the job.", x.user, x.account); + } - const batchSize = misConfig.fetchJobs.batchSize; - const loopCount = Math.ceil(count / batchSize); + const comment = parsePlaceholder(misConfig.jobChargeComment, x); - logger.info(`Batch size is ${batchSize}. 
${loopCount} rounds to complete.`); + if (ua) { + // charge account + await charge({ + amount: x.accountPrice, + type: misConfig.jobChargeType, + comment, + target: ua.account.$, + }, em, logger, clusterPlugin); - for (let i = 0; i < loopCount; i++) { - logger.info(`Fetching next batch from ${i * batchSize}. Round ${i + 1}/${loopCount}`); + // charge tenant + await charge({ + amount: x.tenantPrice, + type: misConfig.jobChargeType, + comment, + target: ua.account.$.tenant.getEntity(), + }, em, logger, clusterPlugin); - const info = await sourceEm.find(OriginalJob, { - biJobIndex: { $gte: startIndex + i * batchSize }, - }, { - limit: batchSize, - orderBy: { biJobIndex: QueryOrder.ASC }, - }); + await addJobCharge(ua, x.accountPrice, clusterPlugin, logger); + } + } - await em.transactional(async (em) => { + return pricedJobs.length; - sourceEm.clear(); + }); - // Calculate prices for new info and persist - const pricedJobs = info.map((i) => { + em.clear(); - const tenant = accountTenantMap.get(i.account); + return result; + }; - if (!tenant) { - logger.warn("Account %s doesn't exist. Doesn't charge the job.", i.account); - } + if (!process.env.SCOW_CONFIG_PATH && process.env.NODE_ENV !== "production") { + const jobsInfo: ({cluster: string} & ClusterJobInfo)[] = []; + // data for test + jobsInfo.push(...testData.map(({ tenant, accountPrice, tenantPrice, ...rest }) => { + return { + ...rest, + state: "COMPLETED", + workingDirectory: "", + }; + })); + + const savedJobsCount = await persistJobAndCharge(jobsInfo); + logger.info(`Completed. Saved ${savedJobsCount} new info.`); + lastFetched = new Date(); + return { newJobsCount: jobsInfo.length }; + } - const price = tenant ? 
priceMap.calculatePrice({ - biJobIndex: i.biJobIndex, - cluster: clusterNameToScowClusterId(i.cluster), - cpusAlloc: i.cpusAlloc, - gpu: i.gpu, - memAlloc: i.memAlloc, - memReq: i.memReq, - partition: i.partition, - qos: i.qos, - timeUsed: i.timeUsed, - account: i.account, - tenant, - }) : emptyJobPriceInfo(); - // 从job_table读出来的数据实际上是+8时区,但是读出来的时间字符串中不包含时区信息 - // 由于容器本身是+0时区,所以程序将会以为读出来的是+0时区的时间 - // 造成直接存储进数据库的时间比实际时间要多8个小时 - // 这里需要转换一下,减掉8小时 - function convertToUTC(date: Date) { - return new Date(date.getTime() - 8 * 60 * 60 * 1000); + try { + const latestDate = await getLatestDate(em, logger); + const nextDate = latestDate && new Date(latestDate.getTime() + 1000); + const configDate: Date | undefined = + (misConfig.fetchJobs.startDate && new Date(misConfig.fetchJobs.startDate)) as Date | undefined; + + const startFetchDate = (nextDate && configDate) + ? (nextDate > configDate ? nextDate : configDate) + : (nextDate || configDate); + const endFetchDate = new Date(); + logger.info(`Fetching new info which end_time is from ${startFetchDate} to ${endFetchDate}`); + + const fields: string[] = [ + "job_id", "name", "user", "account", "cpus_alloc", "gpus_alloc", "mem_alloc_mb", "mem_req_mb", + "partition", "qos", "elapsed_seconds", "node_list", "nodes_req", "nodes_alloc", "time_limit_minutes", + "submit_time", "start_time", "end_time", + ]; + const fetchWithinTimeRange = async (startDate: Date, endDate: Date, batchSize: number) => { + + // calculate totalCount between startDate and endDate + const totalCount = await clusterPlugin.clusters.callOnAll(logger, async (client) => + await asyncClientCall(client.job, "getJobs", { + fields, + filter: { + users: [], accounts: [], states: [], + endTime: { startTime: startDate?.toISOString(), endTime: endDate.toISOString() }, + }, + pageInfo: { page: 1, pageSize: 1 }, + }), + ).then((result) => { + let totalCount = 0; + result.forEach((clusterResp) => { + if (clusterResp.success) { + totalCount += clusterResp.result.totalCount!; 
} - - (["timeEnd", "timeStart", "timeSubmit", "recordTime"] as const) - .forEach((k) => { - i[k] = convertToUTC(i[k]); - }); - - const pricedJob = new JobInfo(i, tenant, price); - - em.persist(pricedJob); - - return pricedJob; }); - - // add job charge for user account - - await Promise.all(pricedJobs - .map(async (x) => { - // add job charge for the user - const ua = await em.findOne(UserAccount, { - account: { accountName: x.account }, - user: { userId: x.user }, - }, { - populate: ["user", "account", "account.tenant"], - }); - - if (!ua) { - logger.warn({ biJobIndex: x.biJobIndex }, - "User %s in account %s is not found. Don't charge the job.", x.user, x.account); - } - - const comment = parsePlaceholder(misConfig.jobChargeComment, x); - - if (ua) { - // charge account - await charge({ - amount: x.accountPrice, - type: misConfig.jobChargeType, - comment, - target: ua.account.$, - }, em, logger, clusterPlugin); - - // charge tenant - await charge({ - amount: x.tenantPrice, - type: misConfig.jobChargeType, - comment, - target: ua.account.$.tenant.getEntity(), - }, em, logger, clusterPlugin); - - await addJobCharge(ua, x.accountPrice, clusterPlugin, logger); - } - })); - - logger.info(`Round ${i + 1}/${loopCount} completed and persisted. Wait 2 seconds for next round.`); + return totalCount; }); - await new Promise((res) => setTimeout(res, 2000)); - } - - logger.info(`Completed. 
Saved ${count} new info.`); - lastFetched = new Date(); - return { newJobsCount: count }; + if (totalCount <= batchSize) { + const jobsInfo: ({cluster: string} & ClusterJobInfo)[] = []; + jobsInfo.push(...(await clusterPlugin.clusters.callOnAll(logger, async (client) => + await asyncClientCall(client.job, "getJobs", { + fields, + filter: { + users: [], accounts: [], states: [], + endTime: { startTime: startDate?.toISOString(), endTime: endDate.toISOString() }, + }, + }), + ).then(processGetJobsResult))); + + let currentJobsGroup: ({ cluster: string } & ClusterJobInfo)[] = []; + let previousDate: string | null = null; + let savedJobsCount = 0; + + for (const job of jobsInfo) { + if (job.endTime! === previousDate) { + currentJobsGroup.push(job); + } else { + savedJobsCount += await persistJobAndCharge(currentJobsGroup); + currentJobsGroup = [job]; + } + previousDate = job.endTime!; + } + + // process last group + if (currentJobsGroup.length > 0) { + savedJobsCount += await persistJobAndCharge(currentJobsGroup); + } + + logger.info(`Completed. Saved ${savedJobsCount} new info.`); + lastFetched = new Date(); + return savedJobsCount; + + } else { + const midDate = new Date((startDate.getTime() + endDate.getTime()) / 2); + const firstHalfJobsCount = await fetchWithinTimeRange(startDate, midDate, batchSize); + const secondHalfJobsCount = await fetchWithinTimeRange(new Date(midDate.getTime() + 1000), endDate, batchSize); + return firstHalfJobsCount + secondHalfJobsCount; + } + + }; + + const newJobsCount = await fetchWithinTimeRange( + startFetchDate ?? new Date(0), + endFetchDate, + misConfig.fetchJobs.batchSize, + ); + return { newJobsCount }; } catch (e) { logger.error("Error when fetching jobs. 
%o", e); throw e; - } finally { - await sourceOrm.close(); - - } } diff --git a/apps/mis-server/src/testData.json b/apps/mis-server/src/testData.json new file mode 100644 index 0000000000..0339923a1a --- /dev/null +++ b/apps/mis-server/src/testData.json @@ -0,0 +1,938 @@ +[ + { + "gpusAlloc": 0, + "qos": "low", + "memReqMb": 374400, + "cluster": "hpc00", + "memAllocMb": 374400, + "elapsedSeconds": 8465, + "cpusAlloc": 96, + "partition": "C032M0128G", + "account": "hpcc", + "user": "c", + "endTime": "2022-01-13T03:19:51.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 0, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "tenant": "another", + "name": "test", + "accountPrice": 18.059, + "tenantPrice": 9.029, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "low", + "memReqMb": 374400, + "cluster": "hpc00", + "memAllocMb": 374400, + "elapsedSeconds": 9456, + "cpusAlloc": 96, + "partition": "C032M0128G", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:19:52.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 1, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "tenant": "default", + "name": "test", + "accountPrice": 10.086, + "tenantPrice": 10.086, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "cryoem", + "memReqMb": 1872000, + "cluster": "hpc00", + "memAllocMb": 1872000, + "elapsedSeconds": 46884, + "cpusAlloc": 480, + "partition": "C032M0128G", + "account": "hpca", + "user": "a", + "endTime": "2022-01-13T03:19:53.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 2, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "tenant": "default", + "name": "test", + "accountPrice": 250.048, + "tenantPrice": 250.048, + "nodeList": "" + }, + { + "gpusAlloc": 
0, + "qos": "cryoem", + "memReqMb": 1872000, + "cluster": "hpc00", + "memAllocMb": 1872000, + "elapsedSeconds": 51057, + "cpusAlloc": 480, + "partition": "C032M0128G", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:19:54.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 3, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 272.304, + "tenantPrice": 272.304, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "cryoem", + "memReqMb": 1872000, + "cluster": "hpc00", + "memAllocMb": 1872000, + "elapsedSeconds": 56285, + "cpusAlloc": 480, + "partition": "C032M0128G", + "account": "hpca", + "user": "a", + "endTime": "2022-01-13T03:19:55.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 4, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 300.187, + "tenantPrice": 300.187, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "low", + "memReqMb": 124800, + "cluster": "hpc00", + "memAllocMb": 124800, + "elapsedSeconds": 116224, + "cpusAlloc": 32, + "partition": "C032M0128G", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:19:56.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 5, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 41.324, + "tenantPrice": 41.324, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "low", + "memReqMb": 124800, + "cluster": "hpc00", + "memAllocMb": 124800, + "elapsedSeconds": 15413, + "cpusAlloc": 32, + "partition": "C032M0128G", + "account": "hpcc", + "user": "c", + "endTime": "2022-01-13T03:19:57.715Z", + "submitTime": 
"2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 6, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "another", + "accountPrice": 10.96, + "tenantPrice": 5.48, + "nodeList": "" + }, + { + "gpusAlloc": 16, + "qos": "cryoem", + "memReqMb": 1008000, + "cluster": "hpc00", + "memAllocMb": 1008000, + "elapsedSeconds": 64874, + "cpusAlloc": 112, + "partition": "life", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:19:58.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 7, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "cryoem", + "memReqMb": 3744000, + "cluster": "hpc00", + "memAllocMb": 3744000, + "elapsedSeconds": 5679, + "cpusAlloc": 960, + "partition": "C032M0128G", + "account": "hpca", + "user": "a", + "endTime": "2022-01-13T03:19:59.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 8, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 60.576, + "tenantPrice": 60.576, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "low", + "memReqMb": 124800, + "cluster": "hpc00", + "memAllocMb": 124800, + "elapsedSeconds": 1, + "cpusAlloc": 32, + "partition": "C032M0128G", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:20:00.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 9, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + 
"qos": "low", + "memReqMb": 124800, + "cluster": "hpc00", + "memAllocMb": 124800, + "elapsedSeconds": 137, + "cpusAlloc": 32, + "partition": "C032M0128G", + "account": "hpcc", + "user": "c", + "endTime": "2022-01-13T03:20:01.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 10, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "another", + "accountPrice": 0.097, + "tenantPrice": 0.049, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "low", + "memReqMb": 124800, + "cluster": "hpc00", + "memAllocMb": 124800, + "elapsedSeconds": 561, + "cpusAlloc": 32, + "partition": "C032M0128G", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:20:02.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 11, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0.199, + "tenantPrice": 0.199, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "low", + "memReqMb": 124800, + "cluster": "hpc00", + "memAllocMb": 124800, + "elapsedSeconds": 13540, + "cpusAlloc": 32, + "partition": "C032M0128G", + "account": "hpcc", + "user": "c", + "endTime": "2022-01-13T03:20:03.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 12, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "another", + "accountPrice": 9.628, + "tenantPrice": 4.814, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "low", + "memReqMb": 3900, + "cluster": "hpc00", + "memAllocMb": 3900, + "elapsedSeconds": 21, + "cpusAlloc": 1, + "partition": "C032M0128G", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:20:04.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": 
"2022-01-13T03:19:50.715Z", + "jobId": 13, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "low", + "memReqMb": 3900, + "cluster": "hpc00", + "memAllocMb": 3900, + "elapsedSeconds": 21, + "cpusAlloc": 1, + "partition": "C032M0128G", + "account": "hpca", + "user": "a", + "endTime": "2022-01-13T03:20:05.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 14, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "another", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "low", + "memReqMb": 3900, + "cluster": "hpc00", + "memAllocMb": 3900, + "elapsedSeconds": 21, + "cpusAlloc": 1, + "partition": "C032M0128G", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:20:06.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 15, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "low", + "memReqMb": 3900, + "cluster": "hpc00", + "memAllocMb": 3900, + "elapsedSeconds": 21, + "cpusAlloc": 1, + "partition": "C032M0128G", + "account": "hpca", + "user": "a", + "endTime": "2022-01-13T03:20:07.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 16, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "another", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "low", + "memReqMb": 3900, + "cluster": "hpc00", + "memAllocMb": 3900, + 
"elapsedSeconds": 23, + "cpusAlloc": 1, + "partition": "C032M0128G", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:20:08.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 17, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "low", + "memReqMb": 3900, + "cluster": "hpc00", + "memAllocMb": 3900, + "elapsedSeconds": 24, + "cpusAlloc": 1, + "partition": "C032M0128G", + "account": "hpca", + "user": "a", + "endTime": "2022-01-13T03:20:09.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 18, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "another", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "low", + "memReqMb": 3900, + "cluster": "hpc00", + "memAllocMb": 3900, + "elapsedSeconds": 26, + "cpusAlloc": 1, + "partition": "C032M0128G", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:20:10.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 19, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "normal", + "memReqMb": 8041, + "cluster": "hpc01", + "memAllocMb": 8041, + "elapsedSeconds": 212, + "cpusAlloc": 1, + "partition": "gpu", + "account": "hpca", + "user": "a", + "endTime": "2022-01-13T03:20:11.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 20, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": 
"test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "normal", + "memReqMb": 8041, + "cluster": "hpc01", + "memAllocMb": 8041, + "elapsedSeconds": 0, + "cpusAlloc": 1, + "partition": "gpu", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:20:12.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 21, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "normal", + "memReqMb": 8041, + "cluster": "hpc01", + "memAllocMb": 8041, + "elapsedSeconds": 0, + "cpusAlloc": 1, + "partition": "gpu", + "account": "hpca", + "user": "a", + "endTime": "2022-01-13T03:20:13.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 22, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "normal", + "memReqMb": 8041, + "cluster": "hpc01", + "memAllocMb": 8041, + "elapsedSeconds": 177, + "cpusAlloc": 1, + "partition": "gpu", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:20:14.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 23, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "normal", + "memReqMb": 8041, + "cluster": "hpc01", + "memAllocMb": 8041, + "elapsedSeconds": 47, + "cpusAlloc": 1, + "partition": "gpu", + "account": "hpca", + "user": "a", + "endTime": "2022-01-13T03:20:15.715Z", + "submitTime": 
"2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 24, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "normal", + "memReqMb": 2250, + "cluster": "hpc01", + "memAllocMb": 2250, + "elapsedSeconds": 7094, + "cpusAlloc": 1, + "partition": "compute", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:20:16.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 25, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0.118, + "tenantPrice": 0.118, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "normal", + "memReqMb": 2250, + "cluster": "hpc01", + "memAllocMb": 2250, + "elapsedSeconds": 11507, + "cpusAlloc": 1, + "partition": "compute", + "account": "hpca", + "user": "a", + "endTime": "2022-01-13T03:20:17.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 26, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0.192, + "tenantPrice": 0.192, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "normal", + "memReqMb": 2250, + "cluster": "hpc01", + "memAllocMb": 2250, + "elapsedSeconds": 1321, + "cpusAlloc": 1, + "partition": "compute", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:20:18.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 27, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0.022, + "tenantPrice": 0.022, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": 
"normal", + "memReqMb": 504000, + "cluster": "hpc01", + "memAllocMb": 504000, + "elapsedSeconds": 2, + "cpusAlloc": 224, + "partition": "compute", + "account": "hpca", + "user": "a", + "endTime": "2022-01-13T03:20:19.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 28, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0.007, + "tenantPrice": 0.007, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "normal", + "memReqMb": 504000, + "cluster": "hpc01", + "memAllocMb": 504000, + "elapsedSeconds": 2, + "cpusAlloc": 224, + "partition": "compute", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:20:20.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 29, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0.007, + "tenantPrice": 0.007, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "normal", + "memReqMb": 2250, + "cluster": "hpc01", + "memAllocMb": 2250, + "elapsedSeconds": 1, + "cpusAlloc": 1, + "partition": "compute", + "account": "hpca", + "user": "a", + "endTime": "2022-01-13T03:20:21.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 30, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "normal", + "memReqMb": 2250, + "cluster": "hpc01", + "memAllocMb": 2250, + "elapsedSeconds": 0, + "cpusAlloc": 1, + "partition": "compute", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:20:22.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 31, + 
"timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "normal", + "memReqMb": 2250, + "cluster": "hpc01", + "memAllocMb": 2250, + "elapsedSeconds": 0, + "cpusAlloc": 1, + "partition": "compute", + "account": "hpca", + "user": "a", + "endTime": "2022-01-13T03:20:23.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 32, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "normal", + "memReqMb": 2250, + "cluster": "hpc01", + "memAllocMb": 2250, + "elapsedSeconds": 19, + "cpusAlloc": 1, + "partition": "compute", + "account": "hpcb", + "user": "b", + "endTime": "2022-01-13T03:20:24.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 33, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "normal", + "memReqMb": 2250, + "cluster": "hpc01", + "memAllocMb": 2250, + "elapsedSeconds": 0, + "cpusAlloc": 1, + "partition": "compute", + "account": "hpca", + "user": "a", + "endTime": "2022-01-13T03:20:25.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 34, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + }, + { + "gpusAlloc": 0, + "qos": "normal", + "memReqMb": 2250, + "cluster": "hpc01", + "memAllocMb": 2250, + "elapsedSeconds": 0, + "cpusAlloc": 1, + 
"partition": "compute", + "account": "hpca", + "user": "not-exist-user", + "endTime": "2022-01-13T03:20:26.715Z", + "submitTime": "2022-01-13T03:19:50.715Z", + "startTime": "2022-01-13T03:19:50.715Z", + "jobId": 35, + "timeWait": 0, + "timeLimitMinutes": 0, + "nodesAlloc": 0, + "nodesReq": 0, + "cpusReq": 0, + "name": "test", + "tenant": "default", + "accountPrice": 0, + "tenantPrice": 0, + "nodeList": "" + } +] \ No newline at end of file diff --git a/apps/mis-server/src/utils/createUser.ts b/apps/mis-server/src/utils/createUser.ts index 7a769c579c..98105768ab 100644 --- a/apps/mis-server/src/utils/createUser.ts +++ b/apps/mis-server/src/utils/createUser.ts @@ -59,9 +59,8 @@ export async function createUserInDatabase( export async function insertKeyToNewUser(userId: string, password: string, logger: Logger) { // Making an ssh Request to the login node as the user created. if (process.env.NODE_ENV === "production") { - await Promise.all(Object.values(clusters).map(async ({ displayName, slurm, misIgnore }) => { - if (misIgnore) { return; } - const node = getLoginNode(slurm.loginNodes[0]); + await Promise.all(Object.values(clusters).map(async ({ displayName, loginNodes }) => { + const node = getLoginNode(loginNodes[0]); logger.info("Checking if user can login to %s by login node %s", displayName, node.name); const error = await insertKeyAsUser(node.address, userId, password, rootKeyPair, logger).catch((e) => e); diff --git a/apps/mis-server/tests/admin/fetch.test.ts b/apps/mis-server/tests/admin/fetch.test.ts index 5a9b1392ca..b5f4e7484a 100644 --- a/apps/mis-server/tests/admin/fetch.test.ts +++ b/apps/mis-server/tests/admin/fetch.test.ts @@ -19,8 +19,7 @@ import { MySqlDriver } from "@mikro-orm/mysql"; import { AdminServiceClient } from "@scow/protos/build/server/admin"; import { createServer } from "src/app"; import { misConfig } from "src/config/mis"; -import { createSourceDbOrm } from "src/tasks/fetch"; -import { clearAndClose, dropDatabase } from 
"tests/data/helpers"; +import { dropDatabase } from "tests/data/helpers"; let server: Server; let orm: MikroORM; @@ -67,21 +66,11 @@ it("starts and stops fetch", async () => { }); it("triggers fetch and updates last updated", async () => { + let info = await asyncClientCall(client, "getFetchInfo", {}); + expect(info.lastFetchTime).toBeUndefined(); - const jobTableOrm = await createSourceDbOrm(server.logger); - - try { - await jobTableOrm.dbConnection.getSchemaGenerator().ensureDatabase(); - await jobTableOrm.dbConnection.getSchemaGenerator().createSchema(); - - let info = await asyncClientCall(client, "getFetchInfo", {}); - expect(info.lastFetchTime).toBeUndefined(); - - await asyncClientCall(client, "fetchJobs", {}); + await asyncClientCall(client, "fetchJobs", {}); - info = await asyncClientCall(client, "getFetchInfo", {}); - expect(info.lastFetchTime).not.toBeUndefined(); - } finally { - await clearAndClose(jobTableOrm.dbConnection); - } + info = await asyncClientCall(client, "getFetchInfo", {}); + expect(info.lastFetchTime).not.toBeUndefined(); }); diff --git a/apps/mis-server/tests/admin/getClusterUsers.test.ts b/apps/mis-server/tests/admin/getClusterUsers.test.ts deleted file mode 100644 index 8ae2f55337..0000000000 --- a/apps/mis-server/tests/admin/getClusterUsers.test.ts +++ /dev/null @@ -1,56 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. - * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. 
- */ - -import { ClusterAccountInfo_ImportStatus } from "@scow/protos/build/server/admin"; -import { parseClusterAccounts } from "src/clusterops/slurm/utils/parse"; - - -const dataStr = ` -a_user1 -user1 : allowed! -user2 : blocked! - -a_t2 -There is no user in account ! -account2 -user2:allowed! -user3:blocked! - -a_t3 -There is no user in account ! -`; - -it("test whether the string from 'slurm.sh -l all' can be parsed successfully", async () => { - const result = parseClusterAccounts(dataStr); - - expect(result).toStrictEqual([ - { - accountName: "a_user1", - users: [ - { userId: "user1", userName: "user1", state: "allowed!" }, - { userId: "user2", userName: "user2", state: "blocked!" }, - ], - owner: "user1", - importStatus: ClusterAccountInfo_ImportStatus.NOT_EXISTING, - blocked: true, - }, - { - accountName: "account2", - users: [ - { userId: "user2", userName: "user2", state: "allowed!" }, - { userId: "user3", userName: "user3", state: "blocked!" }], - importStatus: ClusterAccountInfo_ImportStatus.NOT_EXISTING, - blocked: true, - }, - ], - ); -}); diff --git a/apps/mis-server/tests/admin/importUsers.test.ts b/apps/mis-server/tests/admin/importUsers.test.ts index ccbbe976a1..67c9099b61 100644 --- a/apps/mis-server/tests/admin/importUsers.test.ts +++ b/apps/mis-server/tests/admin/importUsers.test.ts @@ -47,13 +47,13 @@ const data = { accounts: [ { accountName: "a_user1", - users: [{ userId: "user1", userName: "user1Name", state: "allowed!" }, { userId: "user2", userName: "user2", state: "blocked!" }], + users: [{ userId: "user1", userName: "user1Name", blocked: false }, { userId: "user2", userName: "user2", blocked: true }], owner: "user1", blocked: false, }, { accountName: "account2", - users: [{ userId: "user2", userName: "user2", state: "allowed!" }, { userId: "user3", userName: "user3", state: "blocked!" 
}], + users: [{ userId: "user2", userName: "user2", blocked: false }, { userId: "user3", userName: "user3", blocked: true }], owner: "user2", blocked: false, }, diff --git a/apps/mis-server/tests/init/init.test.ts b/apps/mis-server/tests/init/init.test.ts index a522ba05ed..ac47018810 100644 --- a/apps/mis-server/tests/init/init.test.ts +++ b/apps/mis-server/tests/init/init.test.ts @@ -13,13 +13,9 @@ import { asyncClientCall } from "@ddadaal/tsgrpc-client"; import { Server } from "@ddadaal/tsgrpc-server"; import { ChannelCredentials, status } from "@grpc/grpc-js"; -import { sftpExists, sftpStat, sshConnect } from "@scow/lib-ssh"; import { CreateInitAdminRequest, InitServiceClient, SetAsInitAdminRequest, UnsetInitAdminRequest } from "@scow/protos/build/server/init"; import { createServer } from "src/app"; -import { createSlurmOps } from "src/clusterops/slurm"; -import { clusters } from "src/config/clusters"; -import { rootKeyPair } from "src/config/env"; import { Tenant } from "src/entities/Tenant"; import { PlatformRole, TenantRole, User } from "src/entities/User"; import { DEFAULT_TENANT_NAME } from "src/utils/constants"; @@ -56,28 +52,6 @@ it("Test function userExist", async () => { expect(result.existsInAuth).toBe(true); }); -it("To test whether the slurm.sh is automatically copied successfully", async () => { - const ops = createSlurmOps("hpc00", server.logger); - if (!ops) { - throw new Error("The cluster configuration file does not exist"); - } - await ops.onStartup(); - const testSlurmMisConfig00 = clusters["hpc00"].slurm.mis; - if (!testSlurmMisConfig00) { - throw new Error("The cluster configuration file does not exist"); - } - await sshConnect(testSlurmMisConfig00.managerUrl, "test", rootKeyPair, server.logger, async (ssh) => { - const sftp = await ssh.requestSFTP(); - // 测试指定路径是否存在slurm.sh文件 - const result = await sftpExists(sftp, testSlurmMisConfig00.scriptPath); - expect(result).toEqual(true); - // 解析出文件mode低9位权限字段并测试是否为555 - const stats = await 
sftpStat(sftp)(testSlurmMisConfig00.scriptPath); - const testNumberPermission = (stats.mode & parseInt("777", 8)).toString(8); - expect(testNumberPermission).toEqual("555"); - }); -}); - it("querys init state and updates if complete", async () => { const queryInitialized = async () => { diff --git a/apps/mis-server/tests/job/JobService.test.ts b/apps/mis-server/tests/job/JobService.test.ts index 3a2fc12412..a34686a17a 100644 --- a/apps/mis-server/tests/job/JobService.test.ts +++ b/apps/mis-server/tests/job/JobService.test.ts @@ -46,33 +46,33 @@ afterEach(async () => { }); const mockOriginalJobData = ( - biJobIndex: number, ua: UserAccount, + ua: UserAccount, tenantPrice: Decimal, accountPrice: Decimal, -) => new JobInfo({ - biJobIndex, - "idJob": 5119061, +) => new JobInfo({ cluster: "pkuhpc", ...{ + "jobId": 5119061, "account": ua.account.getProperty("accountName"), user: ua.user.getProperty("userId"), "partition": "C032M0128G", - "nodelist": "a5u15n01", - "jobName": "CoW", - "cluster": "pkuhpc", - "timeSubmit": new Date("2020-04-23T22:23:00.000Z"), - "timeStart": new Date("2020-04-23T22:25:12.000Z"), - "timeEnd": new Date("2020-04-23T23:18:02.000Z"), - "gpu": 0, + "nodeList": "a5u15n01", + "name": "CoW", + "state": "COMPLETED", + "workingDirectory": "", + "submitTime": "2020-04-23T22:23:00.000Z", + "startTime": "2020-04-23T22:25:12.000Z", + "endTime": "2020-04-23T23:18:02.000Z", + "gpusAlloc": 0, "cpusReq": 32, - "memReq": 124000, + "memReqMb": 124000, "nodesReq": 1, "cpusAlloc": 32, - "memAlloc": 124000, + "memAllocMb": 124000, "nodesAlloc": 1, - "timelimit": 7200, - "timeUsed": 3170, + "timeLimitMinutes": 7200, + "elapsedSeconds": 3170, "timeWait": 132, "qos": "normal", "recordTime": new Date("2020-04-23T23:49:50.000Z"), -}, data.tenant.name, { +} }, data.tenant.name, { tenant: { billingItemId: "", price: tenantPrice }, account: { billingItemId: "", price: accountPrice }, }); @@ -84,9 +84,9 @@ function createClient() { it("changes job prices", async () => { // 
insert jobs const jobs = [ - mockOriginalJobData(1, data.uaAB, new Decimal(1), new Decimal(2)), - mockOriginalJobData(2, data.uaBB, new Decimal(2), new Decimal(4)), - mockOriginalJobData(3, data.uaAA, new Decimal(4), new Decimal(8)), + mockOriginalJobData(data.uaAB, new Decimal(1), new Decimal(2)), + mockOriginalJobData(data.uaBB, new Decimal(2), new Decimal(4)), + mockOriginalJobData(data.uaAA, new Decimal(4), new Decimal(8)), ]; data.accountA.balance = new Decimal(100); @@ -143,8 +143,8 @@ it("returns 10 jobs if pageSize is undefined or 0", async () => { const em = server.ext.orm.em.fork(); - await em.persistAndFlush(range(1, 20).map((x) => - mockOriginalJobData(x, data.uaAA, new Decimal(20), new Decimal(10)))); + await em.persistAndFlush(range(1, 20).map((_) => + mockOriginalJobData(data.uaAA, new Decimal(20), new Decimal(10)))); const test = async (pageSize?: number) => { const client = createClient(); @@ -167,14 +167,14 @@ it("returns 10 jobs if pageSize is undefined or 0", async () => { it("returns jobs starting from start_bi_job_index", async () => { const em = server.ext.orm.em.fork(); - await em.persistAndFlush(range(1, 20).map((x) => - mockOriginalJobData(x, data.uaAA, new Decimal(20), new Decimal(10)))); + await em.persistAndFlush(range(1, 20).map((_) => + mockOriginalJobData(data.uaAA, new Decimal(20), new Decimal(10)))); - await em.persistAndFlush(range(20, 40).map((x) => - mockOriginalJobData(x, data.uaCC, new Decimal(20), new Decimal(10)))); + await em.persistAndFlush(range(20, 40).map((_) => + mockOriginalJobData(data.uaCC, new Decimal(20), new Decimal(10)))); - await em.persistAndFlush(range(40, 60).map((x) => - mockOriginalJobData(x, data.uaAB, new Decimal(20), new Decimal(10)))); + await em.persistAndFlush(range(40, 60).map((_) => + mockOriginalJobData(data.uaAB, new Decimal(20), new Decimal(10)))); const client = createClient(); @@ -196,8 +196,8 @@ it("returns jobs starting from start_bi_job_index", async () => { it("returns 0 job if Accout not 
exist or is not in scope of permissions", async () => { const em = server.ext.orm.em.fork(); - await em.persistAndFlush(range(1, 20).map((x) => - mockOriginalJobData(x, data.uaAA, new Decimal(20), new Decimal(10)))); + await em.persistAndFlush(range(1, 20).map((_) => + mockOriginalJobData(data.uaAA, new Decimal(20), new Decimal(10)))); const test = async (filter: JobFilter) => { const client = createClient(); diff --git a/apps/mis-server/tests/job/billingItems.test.ts b/apps/mis-server/tests/job/billingItems.test.ts index fc2c1f35ea..52a1fc9f08 100644 --- a/apps/mis-server/tests/job/billingItems.test.ts +++ b/apps/mis-server/tests/job/billingItems.test.ts @@ -18,9 +18,7 @@ import { MySqlDriver } from "@mikro-orm/mysql"; import { Decimal, decimalToMoney, numberToMoney } from "@scow/lib-decimal"; import { AddBillingItemRequest, JobBillingItem, JobServiceClient } from "@scow/protos/build/server/job"; import { createServer } from "src/app"; -import { calculateJobPrice } from "src/bl/jobPrice"; import { createPriceMap } from "src/bl/PriceMap"; -import { clusterNameToScowClusterId } from "src/config/clusters"; import { AmountStrategy, JobPriceItem } from "src/entities/JobPriceItem"; import { Tenant } from "src/entities/Tenant"; import { createPriceItems } from "src/tasks/createBillingItems"; @@ -240,28 +238,33 @@ it("adds billing item to another tenant", async () => { it("calculates price", async () => { - const priceMap = await createPriceMap(orm.em.fork(), server.logger); + const priceMap = await createPriceMap(orm.em.fork(), server.ext.clusters, server.logger); // obtain test data by running the following data in db - // eslint-disable-next-line - // select json_object('biJobIndex', bi_job_index, 'cluster', cluster, 'partition', `partition`, 'qos', qos, 'timeUsed', time_used, 'cpusAlloc', cpus_alloc, 'gpu', gpu, 'memReq', mem_req, 'memAlloc', mem_alloc, 'price', price) from job_info where cluster="未名生科一号" limit 20; - const testData = (await 
import("./testData.json")).default; + const testData = (await import("src/testData.json")).default; const wrongPrices = [] as { - biJobIndex: number; tenantPrice: { expected: number; actual: number | undefined }; accountPrice: { expected: number; actual: number | undefined } }[]; testData.forEach((t) => { - const price = calculateJobPrice({ - ...t, - cluster: clusterNameToScowClusterId(t.cluster), - }, priceMap.getPriceItem, server.logger); + const price = priceMap.calculatePrice({ + jobId: t.jobId, + cluster: t.cluster, + cpusAlloc: t.cpusAlloc, + gpu: t.gpusAlloc, + memAlloc: t.memAllocMb, + memReq: t.memReqMb, + partition: t.partition, + qos: t.qos, + timeUsed: t.elapsedSeconds, + account: t.account, + tenant: t.tenant, + }); if (price.tenant?.price.toNumber() !== t.tenantPrice || price.account?.price.toNumber() !== t.accountPrice) { wrongPrices.push({ - biJobIndex: t.biJobIndex, tenantPrice: { expected: t.tenantPrice, actual: price.tenant?.price.toNumber() }, accountPrice: { expected: t.accountPrice, actual: price.account?.price.toNumber() }, }); @@ -274,7 +277,7 @@ it("calculates price", async () => { it("gets missing price items in platform scope", async () => { { - const priceMap = await createPriceMap(orm.em.fork(), server.logger); + const priceMap = await createPriceMap(orm.em.fork(), server.ext.clusters, server.logger); expect(priceMap.getMissingDefaultPriceItems()).toBeArrayOfSize(0); } @@ -288,7 +291,7 @@ it("gets missing price items in platform scope", async () => { } { - const priceMap = await createPriceMap(em.fork(), server.logger); + const priceMap = await createPriceMap(em.fork(), server.ext.clusters, server.logger); expect(priceMap.getMissingDefaultPriceItems()) .toIncludeSameMembers(priceItemsToBeDeleted.map((x) => x.path.join("."))); } diff --git a/apps/mis-server/tests/job/fetchJobs.test.ts b/apps/mis-server/tests/job/fetchJobs.test.ts index b09c7c0d7a..231074d97d 100644 --- a/apps/mis-server/tests/job/fetchJobs.test.ts +++ 
b/apps/mis-server/tests/job/fetchJobs.test.ts @@ -16,21 +16,18 @@ import { MySqlDriver, SqlEntityManager } from "@mikro-orm/mysql"; import { Decimal } from "@scow/lib-decimal"; import { createServer } from "src/app"; import { setJobCharge } from "src/bl/charging"; -import { clusterNameToScowClusterId } from "src/config/clusters"; import { JobInfo } from "src/entities/JobInfo"; -import { OriginalJob } from "src/entities/OriginalJob"; import { UserStatus } from "src/entities/UserAccount"; import { createPriceItems } from "src/tasks/createBillingItems"; -import { createSourceDbOrm, fetchJobs } from "src/tasks/fetch"; +import { fetchJobs } from "src/tasks/fetch"; +import testData from "src/testData.json"; import { reloadEntities } from "src/utils/orm"; import { InitialData, insertInitialData } from "tests/data/data"; -import { clearAndClose, dropDatabase } from "tests/data/helpers"; +import { dropDatabase } from "tests/data/helpers"; -import testData from "./testData.json"; let data: InitialData; let server: Server; -let jobTableOrm: Awaited>; let initialEm: SqlEntityManager; @@ -44,22 +41,9 @@ beforeEach(async () => { data = await insertInitialData(initialEm); - // insert raw job table info data - jobTableOrm = await createSourceDbOrm(server.logger); - const jobsData = testData.map(({ tenantPrice, accountPrice, tenant, ...rest }) => { - const job = new OriginalJob(); - Object.assign(job, rest); - return job; - }); - - await jobTableOrm.dbConnection.getSchemaGenerator().ensureDatabase(); - await jobTableOrm.dbConnection.getSchemaGenerator().createSchema(); - - await jobTableOrm.getEm().persistAndFlush(jobsData); }); afterEach(async () => { - await clearAndClose(jobTableOrm.dbConnection); await dropDatabase(server.ext.orm); await server.close(); }); @@ -79,25 +63,12 @@ it("fetches the data", async () => { expect(jobs).toBeArrayOfSize(testData.length); - // check the cluster is mapped to scow cluster id - const testDataJobToCluster = testData.reduce((acc, x) => { - 
acc[x.biJobIndex] = x.cluster; - return acc; - }, {} as Record); - - jobs.forEach((x) => { - expect(x.cluster).toBe(clusterNameToScowClusterId(testDataJobToCluster[x.biJobIndex])); - }); - - jobs.sort((a, b) => a.biJobIndex - b.biJobIndex); - - const wrongPrices = [] as { biJobIndex: number; tenantPrice: { expected: number; actual: number }; accountPrice: { expected: number; actual: number } }[]; + const wrongPrices = [] as { tenantPrice: { expected: number; actual: number }; accountPrice: { expected: number; actual: number } }[]; testData.forEach((t) => { - const job = jobs.find((x) => x.biJobIndex === t.biJobIndex) ?? { biJobIndex: t.biJobIndex, accountPrice: new Decimal(-1), tenantPrice: new Decimal(-1) }; + const job = jobs.find((x) => x.cluster === t.cluster && x.idJob === t.jobId) ?? { accountPrice: new Decimal(-1), tenantPrice: new Decimal(-1) }; if (job.tenantPrice.toNumber() !== t.tenantPrice || job.accountPrice.toNumber() !== t.accountPrice) { wrongPrices.push({ - biJobIndex: t.biJobIndex, tenantPrice: { expected: t.tenantPrice, actual: job.tenantPrice.toNumber() }, accountPrice: { expected: t.accountPrice, actual: job.accountPrice.toNumber() }, }); diff --git a/apps/mis-server/tests/job/testData.json b/apps/mis-server/tests/job/testData.json deleted file mode 100644 index 2c69f0300c..0000000000 --- a/apps/mis-server/tests/job/testData.json +++ /dev/null @@ -1,974 +0,0 @@ -[ - { - "gpu": 0, - "qos": "low", - "memReq": 374400, - "cluster": "pkuhpc", - "memAlloc": 374400, - "timeUsed": 8465, - "cpusAlloc": 96, - "partition": "C032M0128G", - "biJobIndex": 972137, - "account": "hpcc", - "user": "c", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "tenant": "another", - "jobName": "test", - "accountPrice": 18.059, - "tenantPrice": 9.029, - "nodelist": "" - }, - { - "gpu": 0, - 
"qos": "low", - "memReq": 374400, - "cluster": "pkuhpc", - "memAlloc": 374400, - "timeUsed": 9456, - "cpusAlloc": 96, - "partition": "C032M0128G", - "biJobIndex": 972138, - "account": "hpcb", - "user": "b", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "tenant": "default", - "jobName": "test", - "accountPrice": 10.086, - "tenantPrice": 10.086, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "cryoem", - "memReq": 1872000, - "cluster": "pkuhpc", - "memAlloc": 1872000, - "timeUsed": 46884, - "cpusAlloc": 480, - "partition": "C032M0128G", - "biJobIndex": 972181, - "account": "hpca", - "user": "a", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "tenant": "default", - "jobName": "test", - "accountPrice": 250.048, - "tenantPrice": 250.048, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "cryoem", - "memReq": 1872000, - "cluster": "pkuhpc", - "memAlloc": 1872000, - "timeUsed": 51057, - "cpusAlloc": 480, - "partition": "C032M0128G", - "biJobIndex": 972190, - "account": "hpcb", - "user": "b", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 272.304, - "tenantPrice": 272.304, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "cryoem", - "memReq": 1872000, - "cluster": "pkuhpc", - "memAlloc": 1872000, - "timeUsed": 56285, - "cpusAlloc": 480, - "partition": "C032M0128G", - "biJobIndex": 972195, - "account": "hpca", - "user": "a", - "timeEnd": "2022-01-13T03:19:50.715Z", - 
"timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 300.187, - "tenantPrice": 300.187, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "low", - "memReq": 124800, - "cluster": "pkuhpc", - "memAlloc": 124800, - "timeUsed": 116224, - "cpusAlloc": 32, - "partition": "C032M0128G", - "biJobIndex": 972205, - "account": "hpcb", - "user": "b", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 41.324, - "tenantPrice": 41.324, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "low", - "memReq": 124800, - "cluster": "pkuhpc", - "memAlloc": 124800, - "timeUsed": 15413, - "cpusAlloc": 32, - "partition": "C032M0128G", - "biJobIndex": 972306, - "account": "hpcc", - "user": "c", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "another", - "accountPrice": 10.96, - "tenantPrice": 5.48, - "nodelist": "" - }, - { - "gpu": 16, - "qos": "cryoem", - "memReq": 1008000, - "cluster": "pkuhpc", - "memAlloc": 1008000, - "timeUsed": 64874, - "cpusAlloc": 112, - "partition": "life", - "biJobIndex": 972312, - "account": "hpcb", - "user": "b", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - 
"nodelist": "" - }, - { - "gpu": 0, - "qos": "cryoem", - "memReq": 3744000, - "cluster": "pkuhpc", - "memAlloc": 3744000, - "timeUsed": 5679, - "cpusAlloc": 960, - "partition": "C032M0128G", - "biJobIndex": 972324, - "account": "hpca", - "user": "a", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 60.576, - "tenantPrice": 60.576, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "low", - "memReq": 124800, - "cluster": "pkuhpc", - "memAlloc": 124800, - "timeUsed": 1, - "cpusAlloc": 32, - "partition": "C032M0128G", - "biJobIndex": 972339, - "account": "hpcb", - "user": "b", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "low", - "memReq": 124800, - "cluster": "pkuhpc", - "memAlloc": 124800, - "timeUsed": 137, - "cpusAlloc": 32, - "partition": "C032M0128G", - "biJobIndex": 972340, - "account": "hpcc", - "user": "c", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "another", - "accountPrice": 0.097, - "tenantPrice": 0.049, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "low", - "memReq": 124800, - "cluster": "pkuhpc", - "memAlloc": 124800, - "timeUsed": 561, - "cpusAlloc": 32, - "partition": "C032M0128G", - "biJobIndex": 972341, - "account": "hpcb", - "user": "b", - "timeEnd": "2022-01-13T03:19:50.715Z", - 
"timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0.199, - "tenantPrice": 0.199, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "low", - "memReq": 124800, - "cluster": "pkuhpc", - "memAlloc": 124800, - "timeUsed": 13540, - "cpusAlloc": 32, - "partition": "C032M0128G", - "biJobIndex": 972484, - "account": "hpcc", - "user": "c", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "another", - "accountPrice": 9.628, - "tenantPrice": 4.814, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "low", - "memReq": 3900, - "cluster": "pkuhpc", - "memAlloc": 3900, - "timeUsed": 21, - "cpusAlloc": 1, - "partition": "C032M0128G", - "biJobIndex": 972485, - "account": "hpcb", - "user": "b", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "low", - "memReq": 3900, - "cluster": "pkuhpc", - "memAlloc": 3900, - "timeUsed": 21, - "cpusAlloc": 1, - "partition": "C032M0128G", - "biJobIndex": 972486, - "account": "hpca", - "user": "a", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "another", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - 
"qos": "low", - "memReq": 3900, - "cluster": "pkuhpc", - "memAlloc": 3900, - "timeUsed": 21, - "cpusAlloc": 1, - "partition": "C032M0128G", - "biJobIndex": 972487, - "account": "hpcb", - "user": "b", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "low", - "memReq": 3900, - "cluster": "pkuhpc", - "memAlloc": 3900, - "timeUsed": 21, - "cpusAlloc": 1, - "partition": "C032M0128G", - "biJobIndex": 972488, - "account": "hpca", - "user": "a", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "another", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "low", - "memReq": 3900, - "cluster": "pkuhpc", - "memAlloc": 3900, - "timeUsed": 23, - "cpusAlloc": 1, - "partition": "C032M0128G", - "biJobIndex": 972489, - "account": "hpcb", - "user": "b", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "low", - "memReq": 3900, - "cluster": "pkuhpc", - "memAlloc": 3900, - "timeUsed": 24, - "cpusAlloc": 1, - "partition": "C032M0128G", - "biJobIndex": 972490, - "account": "hpca", - "user": "a", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - 
"idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "another", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "low", - "memReq": 3900, - "cluster": "pkuhpc", - "memAlloc": 3900, - "timeUsed": 26, - "cpusAlloc": 1, - "partition": "C032M0128G", - "biJobIndex": 972491, - "account": "hpcb", - "user": "b", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "normal", - "memReq": 8041, - "cluster": "pkuhpc1", - "memAlloc": 8041, - "timeUsed": 212, - "cpusAlloc": 1, - "partition": "gpu", - "biJobIndex": 7411, - "account": "hpca", - "user": "a", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "normal", - "memReq": 8041, - "cluster": "pkuhpc1", - "memAlloc": 8041, - "timeUsed": 0, - "cpusAlloc": 1, - "partition": "gpu", - "biJobIndex": 8776, - "account": "hpcb", - "user": "b", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "normal", - "memReq": 8041, - "cluster": "pkuhpc1", - "memAlloc": 8041, - "timeUsed": 0, - "cpusAlloc": 1, - 
"partition": "gpu", - "biJobIndex": 9212, - "account": "hpca", - "user": "a", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "normal", - "memReq": 8041, - "cluster": "pkuhpc1", - "memAlloc": 8041, - "timeUsed": 177, - "cpusAlloc": 1, - "partition": "gpu", - "biJobIndex": 9214, - "account": "hpcb", - "user": "b", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "normal", - "memReq": 8041, - "cluster": "pkuhpc1", - "memAlloc": 8041, - "timeUsed": 47, - "cpusAlloc": 1, - "partition": "gpu", - "biJobIndex": 9254, - "account": "hpca", - "user": "a", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "normal", - "memReq": 2250, - "cluster": "pkuhpc1", - "memAlloc": 2250, - "timeUsed": 7094, - "cpusAlloc": 1, - "partition": "compute", - "biJobIndex": 6609, - "account": "hpcb", - "user": "b", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": 
"default", - "accountPrice": 0.118, - "tenantPrice": 0.118, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "normal", - "memReq": 2250, - "cluster": "pkuhpc1", - "memAlloc": 2250, - "timeUsed": 11507, - "cpusAlloc": 1, - "partition": "compute", - "biJobIndex": 6610, - "account": "hpca", - "user": "a", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0.192, - "tenantPrice": 0.192, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "normal", - "memReq": 2250, - "cluster": "pkuhpc1", - "memAlloc": 2250, - "timeUsed": 1321, - "cpusAlloc": 1, - "partition": "compute", - "biJobIndex": 6611, - "account": "hpcb", - "user": "b", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0.022, - "tenantPrice": 0.022, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "normal", - "memReq": 504000, - "cluster": "pkuhpc1", - "memAlloc": 504000, - "timeUsed": 2, - "cpusAlloc": 224, - "partition": "compute", - "biJobIndex": 6612, - "account": "hpca", - "user": "a", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0.007, - "tenantPrice": 0.007, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "normal", - "memReq": 504000, - "cluster": "pkuhpc1", - "memAlloc": 504000, - "timeUsed": 2, - "cpusAlloc": 224, - "partition": "compute", - "biJobIndex": 6613, - "account": "hpcb", - "user": "b", - 
"timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0.007, - "tenantPrice": 0.007, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "normal", - "memReq": 2250, - "cluster": "pkuhpc1", - "memAlloc": 2250, - "timeUsed": 1, - "cpusAlloc": 1, - "partition": "compute", - "biJobIndex": 6605, - "account": "hpca", - "user": "a", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "normal", - "memReq": 2250, - "cluster": "pkuhpc1", - "memAlloc": 2250, - "timeUsed": 0, - "cpusAlloc": 1, - "partition": "compute", - "biJobIndex": 6606, - "account": "hpcb", - "user": "b", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "normal", - "memReq": 2250, - "cluster": "pkuhpc1", - "memAlloc": 2250, - "timeUsed": 0, - "cpusAlloc": 1, - "partition": "compute", - "biJobIndex": 6607, - "account": "hpca", - "user": "a", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - 
{ - "gpu": 0, - "qos": "normal", - "memReq": 2250, - "cluster": "pkuhpc1", - "memAlloc": 2250, - "timeUsed": 19, - "cpusAlloc": 1, - "partition": "compute", - "biJobIndex": 6608, - "account": "hpcb", - "user": "b", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "normal", - "memReq": 2250, - "cluster": "pkuhpc1", - "memAlloc": 2250, - "timeUsed": 0, - "cpusAlloc": 1, - "partition": "compute", - "biJobIndex": 6622, - "account": "hpca", - "user": "a", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - }, - { - "gpu": 0, - "qos": "normal", - "memReq": 2250, - "cluster": "pkuhpc1", - "memAlloc": 2250, - "timeUsed": 0, - "cpusAlloc": 1, - "partition": "compute", - "biJobIndex": 6623, - "account": "hpca", - "user": "not-exist-user", - "timeEnd": "2022-01-13T03:19:50.715Z", - "timeSubmit": "2022-01-13T03:19:50.715Z", - "timeStart": "2022-01-13T03:19:50.715Z", - "idJob": 0, - "timeWait": 0, - "timelimit": 0, - "nodesAlloc": 0, - "nodesReq": 0, - "cpusReq": 0, - "jobName": "test", - "tenant": "default", - "accountPrice": 0, - "tenantPrice": 0, - "nodelist": "" - } -] diff --git a/apps/mis-web/config/clusters/hpc01.yml b/apps/mis-web/config/clusters/hpc01.yml index eff9f58c6b..75ece8ad4b 100644 --- a/apps/mis-web/config/clusters/hpc01.yml +++ b/apps/mis-web/config/clusters/hpc01.yml @@ -1,23 +1,24 @@ displayName: "hpc01Name" -slurm: - partitions: - - name: compute - nodes: 3 - mem: 262144 - 
cores: 32 - gpus: 0 - qos: - - low - - normal - - high - unit: "cpusAlloc" - comment: "说明" - - - name: GPU - nodes: 1 - mem: 262144 - cores: 48 - gpus: 8 - qos: ["low", "normal", "high"] - unit: "gpu" - comment: "说明" +adapterUrl: 0.0.0.0:6000 +# slurm: +# partitions: +# - name: compute +# nodes: 3 +# mem: 262144 +# cores: 32 +# gpus: 0 +# qos: +# - low +# - normal +# - high +# unit: "cpusAlloc" +# comment: "说明" + +# - name: GPU +# nodes: 1 +# mem: 262144 +# cores: 48 +# gpus: 8 +# qos: ["low", "normal", "high"] +# unit: "gpu" +# comment: "说明" diff --git a/apps/mis-web/config/mis.yaml b/apps/mis-web/config/mis.yaml index 60bfc681aa..16fd6b7e4c 100644 --- a/apps/mis-web/config/mis.yaml +++ b/apps/mis-web/config/mis.yaml @@ -5,23 +5,6 @@ db: password: mysqlrootpassword dbName: scow_server_${JEST_WORKER_ID} -fetchJobs: - db: - host: 127.0.0.1 - port: 3307 - user: root - password: jobtablepassword - dbName: jobs - tableName: jobs - -clusters: - hpc01: - ignore: true - slurm: - managerUrl: haha - dbPassword: password - associationTableName: user_association - scriptPath: /test/slurm.sh createUser: type: external diff --git a/apps/mis-web/package.json b/apps/mis-web/package.json index 1644b34c09..7b382345d8 100644 --- a/apps/mis-web/package.json +++ b/apps/mis-web/package.json @@ -32,8 +32,8 @@ "@scow/lib-auth": "workspace:*", "@scow/lib-config": "workspace:*", "@scow/lib-decimal": "workspace:*", - "@scow/lib-web": "workspace:*", "@scow/protos": "workspace:*", + "@scow/lib-web": "workspace:*", "@scow/utils": "workspace:*", "@sinclair/typebox": "0.28.15", "antd": "5.6.2", diff --git a/apps/mis-web/src/apis/api.mock.ts b/apps/mis-web/src/apis/api.mock.ts index 782faa6de9..4e1427763c 100644 --- a/apps/mis-web/src/apis/api.mock.ts +++ b/apps/mis-web/src/apis/api.mock.ts @@ -70,7 +70,6 @@ export const runningJob: RunningJob = { name: "123", nodes: "123", nodesOrReason: "!23", - nodesToBeUsed: "123", partition: "123", qos: "123", runningTime: "123", @@ -241,8 +240,8 @@ export 
const mockApi: MockApi = { { accountName: "a_user1", users: [ - { userId: "user1", userName: "user1", state: "allowed!" }, - { userId: "user2", userName: "user2", state: "allowed!" }, + { userId: "user1", userName: "user1", blocked: false }, + { userId: "user2", userName: "user2", blocked: false }, ], owner: "user1", importStatus: ClusterAccountInfo_ImportStatus.NOT_EXISTING, @@ -251,15 +250,15 @@ export const mockApi: MockApi = { { accountName: "account2", users: [ - { userId: "user2", userName: "user2", state: "allowed!" }, - { userId: "user3", userName: "user3", state: "allowed!" }, + { userId: "user2", userName: "user2", blocked: false }, + { userId: "user3", userName: "user3", blocked: false }, ], importStatus: ClusterAccountInfo_ImportStatus.HAS_NEW_USERS, blocked: false, }, { accountName: "a_user4", - users: [{ userId: "user4", userName: "user4", state: "allowed!" }], + users: [{ userId: "user4", userName: "user4", blocked: false }], importStatus: ClusterAccountInfo_ImportStatus.EXISTING, blocked: false, }, diff --git a/apps/mis-web/src/models/UserSchemaModel.ts b/apps/mis-web/src/models/UserSchemaModel.ts index c51e3bf25f..7e1f1a3da4 100644 --- a/apps/mis-web/src/models/UserSchemaModel.ts +++ b/apps/mis-web/src/models/UserSchemaModel.ts @@ -47,7 +47,7 @@ export type PlatformUserInfo = Static; export const UserInAccount = Type.Object({ userId: Type.String(), userName: Type.String(), - state: Type.String(), + blocked: Type.Boolean(), }); export type UserInAccount = Static; diff --git a/apps/mis-web/src/pages/api/job/getBillingItems.ts b/apps/mis-web/src/pages/api/job/getBillingItems.ts index 2bb3f5e5a1..9bac4ac632 100644 --- a/apps/mis-web/src/pages/api/job/getBillingItems.ts +++ b/apps/mis-web/src/pages/api/job/getBillingItems.ts @@ -13,6 +13,7 @@ import { typeboxRoute, typeboxRouteSchema } from "@ddadaal/next-typed-api-routes-runtime"; import { asyncClientCall } from "@ddadaal/tsgrpc-client"; import { numberToMoney } from "@scow/lib-decimal"; +import { 
ConfigServiceClient } from "@scow/protos/build/common/config"; import { GetBillingItemsResponse, JobBillingItem, JobServiceClient } from "@scow/protos/build/server/job"; import { Static, Type } from "@sinclair/typebox"; import { USE_MOCK } from "src/apis/useMock"; @@ -144,7 +145,9 @@ export default /* #__PURE__*/typeboxRoute(GetBillingItemsSchema, async (req, res const result = { activeItems: [] as BillingItemType[], historyItems: [] as BillingItemType[], nextId }; - for (const [cluster, { slurm: { partitions } }] of Object.entries(runtimeConfig.CLUSTERS_CONFIG)) { + for (const [cluster] of Object.entries(runtimeConfig.CLUSTERS_CONFIG)) { + const client = getClient(ConfigServiceClient); + const partitions = await asyncClientCall(client, "getClusterConfig", { cluster }).then((resp) => resp.partitions); for (const partition of partitions) { for (const qos of partition.qos ?? [""]) { const path = [cluster, partition.name, qos].filter((x) => x).join("."); diff --git a/apps/mis-web/src/pages/api/job/getBillingTable.ts b/apps/mis-web/src/pages/api/job/getBillingTable.ts index 13764d19e6..8e3211b53e 100644 --- a/apps/mis-web/src/pages/api/job/getBillingTable.ts +++ b/apps/mis-web/src/pages/api/job/getBillingTable.ts @@ -11,11 +11,14 @@ */ import { typeboxRoute, typeboxRouteSchema } from "@ddadaal/next-typed-api-routes-runtime"; +import { asyncClientCall } from "@ddadaal/tsgrpc-client"; +import { ConfigServiceClient } from "@scow/protos/build/common/config"; import { JobBillingItem } from "@scow/protos/build/server/job"; import { Static, Type } from "@sinclair/typebox"; import { authenticate } from "src/auth/server"; import { PlatformRole } from "src/models/User"; import { getBillingItems } from "src/pages/api/job/getBillingItems"; +import { getClient } from "src/utils/client"; import { publicConfig, runtimeConfig } from "src/utils/config"; import { moneyToString } from "src/utils/money"; @@ -78,7 +81,10 @@ export async function getBillingTableItems(tenantName: string | 
undefined) { const tableItems: JobBillingTableItem[] = []; const clusters = runtimeConfig.CLUSTERS_CONFIG; - for (const [cluster, { slurm: { partitions } }] of Object.entries(clusters)) { + for (const [cluster] of Object.entries(clusters)) { + const client = getClient(ConfigServiceClient); + const partitions = await asyncClientCall(client, "getClusterConfig", { cluster }).then((resp) => resp.partitions); + const partitionCount = partitions.length; let clusterItemIndex = 0; for (const partition of partitions) { @@ -97,7 +103,7 @@ export async function getBillingTableItems(tenantName: string | undefined) { cluster: publicConfig.CLUSTERS[cluster]?.name ?? cluster, cores: partition.cores, gpus: partition.gpus, - mem: partition.mem, + mem: partition.memMb, nodes: partition.nodes, partition: partition.name, partitionCount, diff --git a/apps/mis-web/src/pages/api/job/runningJobs.ts b/apps/mis-web/src/pages/api/job/runningJobs.ts index d4741b5e07..747f40b6a3 100644 --- a/apps/mis-web/src/pages/api/job/runningJobs.ts +++ b/apps/mis-web/src/pages/api/job/runningJobs.ts @@ -32,7 +32,6 @@ export const RunningJob = Type.Object({ cores: Type.String(), qos: Type.String(), submissionTime: Type.String(), - nodesToBeUsed: Type.String(), /** * days-hours:minutes:seconds. 
* The value may be "NOT_SET" if not yet established or "UNLIMITED" for no diff --git a/apps/portal-server/config/clusters/hpc01.yaml b/apps/portal-server/config/clusters/hpc01.yaml index 9a8a042337..49614e0153 100644 --- a/apps/portal-server/config/clusters/hpc01.yaml +++ b/apps/portal-server/config/clusters/hpc01.yaml @@ -1,28 +1,29 @@ displayName: hpc01Name -slurm: - loginNodes: - - name: login - address: localhost:22222 - partitions: - - name: compute - nodes: 3 - mem: 262144 - cores: 32 - gpus: 0 - qos: - - low - - normal - - high - comment: 说明 +adapterUrl: 0.0.0.0:6000 +loginNodes: + - name: login + address: localhost:22222 +# slurm: +# partitions: +# - name: compute +# nodes: 3 +# mem: 262144 +# cores: 32 +# gpus: 0 +# qos: +# - low +# - normal +# - high +# comment: 说明 - - name: GPU - nodes: 1 - mem: 262144 - cores: 48 - gpus: 8 - qos: - - low - - normal - - high - - highest - comment: 说明 +# - name: GPU +# nodes: 1 +# mem: 262144 +# cores: 48 +# gpus: 8 +# qos: +# - low +# - normal +# - high +# - highest +# comment: 说明 diff --git a/apps/portal-server/config/clusters/hpc02.yaml b/apps/portal-server/config/clusters/hpc02.yaml index ca96bda573..0867f62b70 100644 --- a/apps/portal-server/config/clusters/hpc02.yaml +++ b/apps/portal-server/config/clusters/hpc02.yaml @@ -1,24 +1,25 @@ displayName: hpc02Name -slurm: - loginNodes: - - name: login - address: localhost:22 - partitions: - - name: GPU - nodes: 2 - mem: 262144 - cores: 29 - gpus: 8 - qos: - - normal - - high - - highest - comment: 说明 +adapterUrl: 0.0.0.0:6000 +loginNodes: + - name: login + address: localhost:22 +# slurm: +# partitions: +# - name: GPU +# nodes: 2 +# mem: 262144 +# cores: 29 +# gpus: 8 +# qos: +# - normal +# - high +# - highest +# comment: 说明 - - name: another - nodes: 2 - mem: 262144 - cores: 29 - gpus: 8 - comment: 说明 +# - name: another +# nodes: 2 +# mem: 262144 +# cores: 29 +# gpus: 8 +# comment: 说明 diff --git a/apps/portal-server/package.json b/apps/portal-server/package.json index 
7c2b060cbf..260cd274db 100644 --- a/apps/portal-server/package.json +++ b/apps/portal-server/package.json @@ -22,6 +22,7 @@ "dependencies": { "@ddadaal/tsgrpc-common": "0.2.3", "@ddadaal/tsgrpc-server": "0.19.2", + "@ddadaal/tsgrpc-client": "0.17.5", "@grpc/grpc-js": "1.8.15", "@scow/config": "workspace:*", "@scow/lib-config": "workspace:*", @@ -29,7 +30,10 @@ "@scow/lib-slurm": "workspace:*", "@scow/lib-ssh": "workspace:*", "@scow/protos": "workspace:*", + "@scow/scheduler-adapter-protos": "workspace:*", + "@scow/lib-scheduler-adapter": "workspace:*", "@scow/utils": "workspace:*", + "@scow/rich-error-model": "workspace:*", "@sinclair/typebox": "0.28.15", "dayjs": "1.11.8", "dotenv": "16.3.1", @@ -43,7 +47,6 @@ "wait-on": "7.0.1" }, "devDependencies": { - "@ddadaal/tsgrpc-client": "0.17.5", "@types/node-cron": "3.0.7", "@types/shell-quote": "1.7.1", "@types/ssh2": "1.11.11", diff --git a/apps/portal-server/src/app.ts b/apps/portal-server/src/app.ts index 4158f27d69..b849495907 100644 --- a/apps/portal-server/src/app.ts +++ b/apps/portal-server/src/app.ts @@ -17,6 +17,7 @@ import { clusters } from "src/config/clusters"; import { config } from "src/config/env"; import { plugins } from "src/plugins"; import { appServiceServer } from "src/services/app"; +import { configServiceServer } from "src/services/config"; import { desktopServiceServer } from "src/services/desktop"; import { fileServiceServer } from "src/services/file"; import { jobServiceServer } from "src/services/job"; @@ -46,6 +47,7 @@ export async function createServer() { await server.register(jobServiceServer); await server.register(fileServiceServer); await server.register(shellServiceServer); + await server.register(configServiceServer); if (process.env.NODE_ENV === "production") { await checkClustersRootUserLogin(server.logger); diff --git a/apps/portal-server/src/clusterops/api/app.ts b/apps/portal-server/src/clusterops/api/app.ts index da89026791..faca2c9ced 100644 --- 
a/apps/portal-server/src/clusterops/api/app.ts +++ b/apps/portal-server/src/clusterops/api/app.ts @@ -30,14 +30,8 @@ export interface CreateAppRequest { } export type CreateAppReply = { - code: "OK"; sessionId: string; jobId: number; -} | { - code: "SBATCH_FAILED", - message: string; -} | { - code: "APP_NOT_FOUND"; } export interface GetAppSessionsRequest { @@ -68,6 +62,14 @@ export interface ConnectToAppRequest { sessionId: string; } +export type ConnectToAppReply = { + appId: string; + host: string; + port: number; + password: string; + customFormData?: {[key: string]: string}; +}; + export interface SubmissionInfo { userId: string; cluster: string; @@ -93,17 +95,6 @@ export type GetAppLastSubmissionReply = { lastSubmissionInfo?: SubmissionInfo; } -export type ConnectToAppReply = - | { code: "NOT_FOUND" } // sessionId is not found - | { code: "UNAVAILABLE" } // the app is not available to connect yet - | { code: "OK", - appId: string; - host: string; - port: number; - password: string; - customFormData?: {[key: string]: string}; -}; - export interface AppOps { createApp(req: CreateAppRequest, logger: Logger): Promise; listAppSessions(req: GetAppSessionsRequest, logger: Logger): Promise; diff --git a/apps/portal-server/src/clusterops/api/job.ts b/apps/portal-server/src/clusterops/api/job.ts index c716a44660..edea45c5f2 100644 --- a/apps/portal-server/src/clusterops/api/job.ts +++ b/apps/portal-server/src/clusterops/api/job.ts @@ -10,27 +10,8 @@ * See the Mulan PSL v2 for more details. 
*/ -import { RunningJob } from "@scow/protos/build/common/job"; import { Logger } from "ts-log"; -export interface ListRunningJobsRequest { - userId: string; -} - -export interface ListRunningJobsReply { - results: RunningJob[]; -} - -export interface SubmitJobRequest { - userId: string; - jobInfo: JobTemplate; - script: string; - saveAsTemplate: boolean; -} - -export type SubmitJobReply = - | { code: "OK", jobId: number; } - | { code: "SBATCH_FAILED", message: string }; export interface JobTemplate { jobName: string; @@ -50,22 +31,6 @@ export interface JobTemplate { comment?: string | undefined; } -export interface GenerateJobScriptRequest { - jobInfo: JobTemplate; -} - -export interface GenerateJobScriptReply { - script: string; -} - -export interface ListAccountsRequest { - userId: string; -} - -export interface ListAccountsReply { - accounts: string[]; -} - export interface ListJobTemplatesRequest { userId: string; } @@ -87,52 +52,21 @@ export interface GetJobTemplateRequest { } export type GetJobTemplateReply = { - code: "OK" template: JobTemplate; -} | { - code: "NOT_FOUND" } -export interface CancelJobRequest { +export interface SaveJobTemplateRequest { userId: string; jobId: number; + jobInfo: JobTemplate; } -export type CancelJobReply = { code: "OK" } | { code: "NOT_FOUND" }; - -export interface ListAllJobsInfoRequest { - userId: string; - startTime?: Date; - endTime?: Date; -} - -export interface JobInfo { - jobId: number; - name: string; - account: string; - partition: string; - qos: string; - state: string; - workingDirectory: string; - reason: string; - elapsed: string; - timeLimit: string; - submitTime: string; - startTime: string; - endTime: string; -} +export interface SaveJobTemplateReply { -export interface ListAllJobsInfoReply { - results: JobInfo[]; } export interface JobOps { - listRunningJobs(req: ListRunningJobsRequest, logger: Logger): Promise; - listAccounts(req: ListAccountsRequest, logger: Logger): Promise; - generateJobScript(req: 
GenerateJobScriptRequest, logger: Logger): Promise; - submitJob(req: SubmitJobRequest, logger: Logger): Promise; listJobTemplates(req: ListJobTemplatesRequest, logger: Logger): Promise; - getJobTamplate(req: GetJobTemplateRequest, logger: Logger): Promise; - cancelJob(req: CancelJobRequest, logger: Logger): Promise; - listAllJobsInfo(req: ListAllJobsInfoRequest, logger: Logger): Promise; + getJobTemplate(req: GetJobTemplateRequest, logger: Logger): Promise; + saveJobTemplate(req: SaveJobTemplateRequest, logger: Logger): Promise; } diff --git a/apps/portal-server/src/clusterops/slurm/app.ts b/apps/portal-server/src/clusterops/app.ts similarity index 78% rename from apps/portal-server/src/clusterops/slurm/app.ts rename to apps/portal-server/src/clusterops/app.ts index 058e54b6d8..b8652d8c6d 100644 --- a/apps/portal-server/src/clusterops/slurm/app.ts +++ b/apps/portal-server/src/clusterops/app.ts @@ -10,24 +10,27 @@ * See the Mulan PSL v2 for more details. */ +import { asyncClientCall } from "@ddadaal/tsgrpc-client"; +import { ServiceError } from "@grpc/grpc-js"; +import { Status } from "@grpc/grpc-js/build/src/constants"; import { getPlaceholderKeys } from "@scow/lib-config/build/parse"; +import { formatTime } from "@scow/lib-scheduler-adapter"; import { getUserHomedir, - loggedExec, sftpChmod, sftpExists, sftpReaddir, sftpReadFile, sftpRealPath, sftpWriteFile } from "@scow/lib-ssh"; -import { RunningJob } from "@scow/protos/build/common/job"; + sftpChmod, sftpExists, sftpReaddir, sftpReadFile, sftpRealPath, sftpWriteFile } from "@scow/lib-ssh"; +import { parseErrorDetails } from "@scow/rich-error-model"; +import { JobInfo, SubmitJobRequest } from "@scow/scheduler-adapter-protos/build/protos/job"; import fs from "fs"; import { join } from "path"; import { quote } from "shell-quote"; import { AppOps, AppSession, SubmissionInfo } from "src/clusterops/api/app"; -import { displayIdToPort } from "src/clusterops/slurm/bl/port"; import { clusters } from 
"src/config/clusters"; import { portalConfig } from "src/config/portal"; import { getClusterAppConfigs, splitSbatchArgs } from "src/utils/app"; +import { getAdapterClient } from "src/utils/clusters"; import { getIpFromProxyGateway } from "src/utils/proxy"; import { getClusterLoginNode, sshConnect } from "src/utils/ssh"; -import { parseDisplayId, refreshPassword, refreshPasswordByProxyGateway, VNCSERVER_BIN_PATH } from "src/utils/turbovnc"; - -import { querySqueue } from "./bl/queryJobInfo"; -import { generateJobScript, parseSbatchOutput } from "./bl/submitJob"; +import { displayIdToPort, parseDisplayId, + refreshPassword, refreshPasswordByProxyGateway, VNCSERVER_BIN_PATH } from "src/utils/turbovnc"; interface SessionMetadata { sessionId: string; @@ -57,7 +60,7 @@ const VNC_SESSION_INFO = "VNC_SESSION_INFO"; const APP_LAST_SUBMISSION_INFO = "last_submission.json"; const BIN_BASH_SCRIPT_HEADER = "#!/bin/bash -l\n"; -export const slurmAppOps = (cluster: string): AppOps => { +export const appOps = (cluster: string): AppOps => { const host = getClusterLoginNode(cluster); @@ -70,6 +73,8 @@ export const slurmAppOps = (cluster: string): AppOps => { const { appId, userId, account, coreCount, nodeCount, gpuCount, memory, maxTime, proxyBasePath, partition, qos, customAttributes, appJobName } = request; + const memoryMb = memory ? Number(memory.slice(0, -2)) : undefined; + const userSbatchOptions = customAttributes["sbatchOptions"] ? 
splitSbatchArgs(customAttributes["sbatchOptions"]) @@ -78,7 +83,9 @@ export const slurmAppOps = (cluster: string): AppOps => { // prepare script file const appConfig = apps[appId]; - if (!appConfig) { return { code: "APP_NOT_FOUND" }; } + if (!appConfig) { + throw { code: Status.NOT_FOUND, message: `app id ${appId} is not found` }; + } const jobName = appJobName; @@ -95,24 +102,30 @@ export const slurmAppOps = (cluster: string): AppOps => { const sftp = await ssh.requestSFTP(); - const submitAndWriteMetadata = async (script: string, env?: Record) => { - const remoteEntryPath = join(workingDirectory, "entry.sh"); + const getEnvVariables = (env: Record) => + Object.keys(env).map((x) => `export ${x}=${quote([env[x] ?? ""])}\n`).join(""); - await sftpWriteFile(sftp)(remoteEntryPath, script); + const submitAndWriteMetadata = async (request: SubmitJobRequest) => { + const remoteEntryPath = join(workingDirectory, "entry.sh"); // submit entry.sh - // createApp is slow already - // use executeAsUser increases code complexity greatly - const { code, stderr, stdout } = await loggedExec(ssh, logger, false, - "sbatch", [remoteEntryPath], { execOptions: { env: env as NodeJS.ProcessEnv } }, - ); - - if (code !== 0) { - return { code: "SBATCH_FAILED", message: stderr } as const; - } + const client = getAdapterClient(cluster); + const reply = await asyncClientCall(client.job, "submitJob", request).catch((e) => { + const ex = e as ServiceError; + const errors = parseErrorDetails(ex.metadata); + if (errors[0] && errors[0].$type === "google.rpc.ErrorInfo" && errors[0].reason === "SBATCH_FAILED") { + throw { + code: Status.INTERNAL, + message: "sbatch failed", + details: e.details, + }; + } + else { + throw e; + } + }); - // parse stdout output to get the job id - const jobId = parseSbatchOutput(stdout); + const jobId = reply.jobId; // write session metadata const metadata: SessionMetadata = { @@ -122,6 +135,9 @@ export const slurmAppOps = (cluster: string): AppOps => { appId, }; + // 
entry.sh save the generated script + await sftpWriteFile(sftp)(remoteEntryPath, reply.generatedScript); + await sftpWriteFile(sftp)(join(workingDirectory, SESSION_METADATA_NAME), JSON.stringify(metadata)); // write a last_submission session @@ -135,16 +151,16 @@ export const slurmAppOps = (cluster: string): AppOps => { qos: request.qos, nodeCount: request.nodeCount, coreCount: request.coreCount, + maxTime: request.timeLimitMinutes!, gpuCount: request.gpuCount, - maxTime: request.maxTime, submitTime: new Date().toISOString(), - customAttributes: request.customAttributes, + customAttributes: customAttributes, }; await sftpWriteFile(sftp)(join(lastSubmissionDirectory, APP_LAST_SUBMISSION_INFO), JSON.stringify(lastSubmissionInfo)); - return { code: "OK", jobId, sessionId: metadata.sessionId } as const; + return { jobId, sessionId: metadata.sessionId } as const; }; let customAttributesExport: string = ""; @@ -176,22 +192,15 @@ export const slurmAppOps = (cluster: string): AppOps => { const configSlurmOptions: string[] = appConfig.slurm?.options ?? []; - const script = generateJobScript({ - jobName, - command: SERVER_ENTRY_COMMAND, - account: account, - nodeCount: nodeCount, - coreCount: coreCount, - gpuCount: gpuCount, - memory: memory, - maxTime: maxTime, - partition: partition, - workingDirectory, - qos: qos, - otherOptions: configSlurmOptions.concat(userSbatchOptions), - }); + const extraOptions = configSlurmOptions.concat(userSbatchOptions); - return await submitAndWriteMetadata(script, { SERVER_SESSION_INFO }); + const envVariables = getEnvVariables({ SERVER_SESSION_INFO }); + + return await submitAndWriteMetadata({ + userId, jobName, account, partition: partition!, qos, nodeCount, gpuCount: gpuCount ?? 0, memoryMb, + coreCount, timeLimitMinutes: maxTime, script: envVariables + SERVER_ENTRY_COMMAND, + workingDirectory, extraOptions, + }); } else { // vnc app const beforeScript = customAttributesExport + (appConfig.vnc!.beforeScript ?? 
""); @@ -204,23 +213,16 @@ export const slurmAppOps = (cluster: string): AppOps => { const configSlurmOptions: string[] = appConfig.slurm?.options ?? []; - const script = generateJobScript({ - jobName, - command: VNC_ENTRY_COMMAND, - account: account, - nodeCount: nodeCount, - coreCount: coreCount, - gpuCount: gpuCount, - memory: memory, - maxTime: maxTime, - partition: partition, - workingDirectory, - qos: qos, - output: VNC_OUTPUT_FILE, - otherOptions: configSlurmOptions.concat(userSbatchOptions), + const extraOptions = configSlurmOptions.concat(userSbatchOptions); + + const envVariables = getEnvVariables({ VNC_SESSION_INFO, VNCSERVER_BIN_PATH }); + + return await submitAndWriteMetadata({ + userId, jobName, account, partition: partition!, qos, nodeCount, gpuCount: gpuCount ?? 0, memoryMb, + coreCount, timeLimitMinutes: maxTime, script: envVariables + VNC_ENTRY_COMMAND, + workingDirectory, stdout: VNC_OUTPUT_FILE, extraOptions, }); - return await submitAndWriteMetadata(script, { VNC_SESSION_INFO, VNCSERVER_BIN_PATH }); } }); @@ -251,12 +253,20 @@ export const slurmAppOps = (cluster: string): AppOps => { return await sshConnect(host, "root", logger, async (ssh) => { // If a job is not running, it cannot be ready - const runningJobsInfo = await querySqueue(ssh, userId, logger, ["-u", userId]); + const client = getAdapterClient(cluster); + const runningJobsInfo = await asyncClientCall(client.job, "getJobs", { + fields: ["job_id", "state", "elapsed_seconds", "time_limit_minutes", "reason"], + filter: { + users: [userId], accounts: [], + states: ["RUNNING", "PENDING"], + }, + }).then((resp) => resp.jobs); + const runningJobInfoMap = runningJobsInfo.reduce((prev, curr) => { prev[curr.jobId] = curr; return prev; - }, {} as Record); + }, {} as Record); const sftp = await ssh.requestSFTP(); @@ -281,7 +291,7 @@ export const slurmAppOps = (cluster: string): AppOps => { const content = await sftpReadFile(sftp)(metadataPath); const sessionMetadata = 
JSON.parse(content.toString()) as SessionMetadata; - const runningJobInfo: RunningJob | undefined = runningJobInfoMap[sessionMetadata.jobId]; + const runningJobInfo: JobInfo | undefined = runningJobInfoMap[sessionMetadata.jobId]; const app = apps[sessionMetadata.appId]; @@ -335,9 +345,9 @@ export const slurmAppOps = (cluster: string): AppOps => { submitTime: new Date(sessionMetadata.submitTime), state: runningJobInfo?.state ?? "ENDED", dataPath: await sftpRealPath(sftp)(jobDir), - runningTime: runningJobInfo?.runningTime ?? "", - timeLimit: runningJobInfo?.timeLimit ?? "", - reason: isPendingOrTerminated ? (runningJobInfo?.nodesOrReason ?? "") : undefined, + runningTime: runningJobInfo?.elapsedSeconds ? formatTime(runningJobInfo.elapsedSeconds * 1000) : "", + timeLimit: runningJobInfo?.timeLimitMinutes ? formatTime(runningJobInfo.timeLimitMinutes * 60 * 1000) : "", + reason: isPendingOrTerminated ? (runningJobInfo?.reason ?? "") : undefined, host, port, }); @@ -360,7 +370,7 @@ export const slurmAppOps = (cluster: string): AppOps => { const jobDir = join(userHomeDir, portalConfig.appJobsDir, sessionId); if (!await sftpExists(sftp, jobDir)) { - return { code: "NOT_FOUND" }; + throw { code: Status.NOT_FOUND, message: `session id ${sessionId} is not found` }; } const metadataPath = join(jobDir, SESSION_METADATA_NAME); @@ -379,7 +389,6 @@ export const slurmAppOps = (cluster: string): AppOps => { const customFormData = rest as {[key: string]: string}; const ip = await getIpFromProxyGateway(cluster, HOST, logger); return { - code: "OK", appId: sessionMetadata.appId, host: ip || HOST, port: +PORT, @@ -436,7 +445,6 @@ export const slurmAppOps = (cluster: string): AppOps => { return await sshConnect(host, userId, logger, async (computeNodeSsh) => { const password = await refreshPassword(computeNodeSsh, null, logger, displayId!); return { - code: "OK", appId: sessionMetadata.appId, host, port: displayIdToPort(displayId!), @@ -447,7 +455,8 @@ export const slurmAppOps = 
(cluster: string): AppOps => { } } } - return { code: "UNAVAILABLE" }; + + throw { code: Status.UNAVAILABLE, message: `session id ${sessionId} cannot be connected` }; }); }, diff --git a/apps/portal-server/src/clusterops/index.ts b/apps/portal-server/src/clusterops/index.ts index 0a7156a632..f15398d998 100644 --- a/apps/portal-server/src/clusterops/index.ts +++ b/apps/portal-server/src/clusterops/index.ts @@ -11,15 +11,15 @@ */ import { ClusterOps } from "src/clusterops/api"; -import { createSlurmClusterOps } from "src/clusterops/slurm"; +import { appOps } from "src/clusterops/app"; +import { jobOps } from "src/clusterops/job"; import { clusters } from "src/config/clusters"; -const clusterOpsMaps = { - "slurm": createSlurmClusterOps, -} as const; - -const opsForClusters = Object.entries(clusters).reduce((prev, [cluster, c]) => { - prev[cluster] = clusterOpsMaps[c.scheduler](cluster); +const opsForClusters = Object.entries(clusters).reduce((prev, [cluster]) => { + prev[cluster] = { + app: appOps(cluster), + job: jobOps(cluster), + } as ClusterOps; return prev; }, {} as Record); diff --git a/apps/portal-server/src/clusterops/job.ts b/apps/portal-server/src/clusterops/job.ts new file mode 100644 index 0000000000..dbb7ed7e6e --- /dev/null +++ b/apps/portal-server/src/clusterops/job.ts @@ -0,0 +1,114 @@ +/** + * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy + * SCOW is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. 
+ */ + +import { ServiceError } from "@ddadaal/tsgrpc-common"; +import { Status } from "@grpc/grpc-js/build/src/constants"; +import { sftpExists, sftpReaddir, sftpReadFile, sftpWriteFile } from "@scow/lib-ssh"; +import { join } from "path"; +import { JobOps, JobTemplateInfo } from "src/clusterops/api/job"; +import { portalConfig } from "src/config/portal"; +import { getClusterLoginNode, sshConnect } from "src/utils/ssh"; + +export interface JobMetadata { + jobName: string; + account: string; + partition?: string; + qos?: string; + nodeCount: number; + coreCount: number; + gpuCount?: number; + maxTime: number; + command: string; + comment?: string; + submitTime: string; + workingDirectory: string; + memory?: string; +} + +export const jobOps = (cluster: string): JobOps => { + + const host = getClusterLoginNode(cluster); + + if (!host) { throw new Error(`Cluster ${cluster} has no login node`); } + + return { + + getJobTemplate: async (request, logger) => { + const { id, userId } = request; + + return await sshConnect(host, userId, logger, async (ssh) => { + const sftp = await ssh.requestSFTP(); + + const file = join(portalConfig.savedJobsDir, id); + + if (!await sftpExists(sftp, file)) { + throw { code: Status.NOT_FOUND, message: `Job template id ${id} is not found.` }; + } + + const content = await sftpReadFile(sftp)(file); + logger.info("getJobTamplate to %s", content); + const data = JSON.parse(content.toString()) as JobMetadata; + + return { template: data }; + }); + }, + + listJobTemplates: async (request, logger) => { + const { userId } = request; + + return await sshConnect(host, userId, logger, async (ssh) => { + const sftp = await ssh.requestSFTP(); + + if (!await sftpExists(sftp, portalConfig.savedJobsDir)) { return { results: []}; } + + const list = await sftpReaddir(sftp)(portalConfig.savedJobsDir); + + const results = await Promise.all(list.map(async ({ filename }) => { + const content = await sftpReadFile(sftp)(join(portalConfig.savedJobsDir, 
filename)); + const data = JSON.parse(content.toString()) as JobMetadata; + + return { + id: filename, + submitTime: new Date(data.submitTime), + comment: data.comment, + jobName: data.jobName, + } as JobTemplateInfo; + })); + + return { results }; + }); + }, + + saveJobTemplate: async (request, logger) => { + const { userId, jobId, jobInfo } = request; + + return await sshConnect(host, userId, logger, async (ssh) => { + const sftp = await ssh.requestSFTP(); + + const id = `${jobInfo.jobName}-${jobId}`; + logger.info("Save job to %s", id); + + await ssh.mkdir(portalConfig.savedJobsDir); + + const filePath = join(portalConfig.savedJobsDir, id); + const metadata: JobMetadata = { ...jobInfo, submitTime: new Date().toISOString() }; + await sftpWriteFile(sftp)(filePath, JSON.stringify(metadata)); + + logger.info("Saved job as template to %s", filePath); + + return {}; + }); + + }, + + }; +}; diff --git a/apps/portal-server/src/clusterops/slurm/bl/port.ts b/apps/portal-server/src/clusterops/slurm/bl/port.ts deleted file mode 100644 index ffb9d1af91..0000000000 --- a/apps/portal-server/src/clusterops/slurm/bl/port.ts +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. - * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. 
- */ - -import net from "net"; - -export async function getFreePort(): Promise { - return new Promise((res) => { - const server = net.createServer((s) => { - s.end("Hello world\n"); - }); - server.listen(0, () => { - const port = (server.address() as net.AddressInfo).port; - res(port); - }).close(); - }); - -} - -const DISPLAY_ID_PORT_DELTA = 5900; - -export function displayIdToPort(displayId: number): number { - return DISPLAY_ID_PORT_DELTA + displayId; -} - -export function portToDisplayId(port: number): number { - return port - DISPLAY_ID_PORT_DELTA; -} diff --git a/apps/portal-server/src/clusterops/slurm/bl/queryJobInfo.ts b/apps/portal-server/src/clusterops/slurm/bl/queryJobInfo.ts deleted file mode 100644 index 5e4239a282..0000000000 --- a/apps/portal-server/src/clusterops/slurm/bl/queryJobInfo.ts +++ /dev/null @@ -1,114 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. - * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. 
- */ - -import { executeAsUser } from "@scow/lib-ssh"; -import { RunningJob } from "@scow/protos/build/common/job"; -import { NodeSSH } from "node-ssh"; -import { JobInfo } from "src/clusterops/api/job"; -import dayjs, { Dayjs } from "src/utils/dayjs"; -import { Logger } from "ts-log"; - -const SEPARATOR = "__x__x__"; - -export async function querySqueue(ssh: NodeSSH, userId: string, logger: Logger, params: string[]) { - const result = await executeAsUser(ssh, userId, logger, true, - "squeue", - [ - "-o", - ["%A", "%P", "%j", "%u", "%T", "%M", "%D", "%R", "%a", "%C", "%q", "%V", "%Y", "%l", "%Z"].join(SEPARATOR), - "--noheader", - ...params, - ], - ); - - const jobs = result.stdout.split("\n").filter((x) => x).map((x) => { - const [ - jobId, - partition, name, user, state, runningTime, - nodes, nodesOrReason, account, cores, - qos, submissionTime, nodesToBeUsed, timeLimit, workingDir, - ] = x.split(SEPARATOR); - - return { - jobId, - partition, name, user, state, runningTime, - nodes, nodesOrReason, account, cores, - qos, submissionTime, nodesToBeUsed, timeLimit, - workingDir, - } as RunningJob; - }); - - return jobs; -} - -function applyOffset(time: Dayjs, tz: string): Dayjs { - // tz is of format +08:00 - - const [h, m] = tz.substring(1).split(":"); - - const sign = tz[0]; - if (sign === "+") { - return time.add(+h, "hours").add(+m, "minutes"); - } else { - return time.subtract(+h, "hours").subtract(+m, "minutes"); - } - -} - -function formatTime(time: Date, tz: string) { - return applyOffset(dayjs(time), tz).format("YYYY-MM-DD[T]HH:mm:ss"); -} - -/** - * Query sacct for running jobs - * @param ssh ssh object connected as root - * @param userId the user - * @param logger logger - * @param startTime start time - * @param endTime end time - * @returns - */ -export async function querySacct(ssh: NodeSSH, userId: string, logger: Logger, startTime?: Date, endTime?: Date) { - - // get the timezone of target machine - const { stdout: tz } = await executeAsUser(ssh, 
userId, logger, true, "date", ["+%:z"]); - - const result = await executeAsUser(ssh, userId, logger, true, - "sacct", - [ - "-X", - "--noheader", - "--format", "JobID,JobName,Account,Partition,QOS,State,WorkDir,Reason,Elapsed,TimeLimit,Submit,Start,End", - ...startTime ? ["--starttime", formatTime(startTime, tz)] : [], - ...endTime ? ["--endtime", formatTime(endTime, tz)] : [], - "--parsable2", - ], - ); - - if (result.stdout.length === 0) { - return []; - } - - const jobs = result.stdout.split("\n").map((x) => { - const [ - jobId, name, account, partition, qos, state, - workingDirectory, reason, elapsed, timeLimit, submitTime, startTime, endTime, - ] = x.split("|"); - - return { - jobId: +jobId, name, account, partition, qos, state, - workingDirectory, reason, elapsed, timeLimit, submitTime, startTime, endTime, - } as JobInfo; - }); - - return jobs; -} diff --git a/apps/portal-server/src/clusterops/slurm/bl/submitJob.ts b/apps/portal-server/src/clusterops/slurm/bl/submitJob.ts deleted file mode 100644 index 292129bc34..0000000000 --- a/apps/portal-server/src/clusterops/slurm/bl/submitJob.ts +++ /dev/null @@ -1,88 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. - * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. 
- */ - -import { JobTemplate } from "src/clusterops/api/job"; - -export function parseSbatchOutput(output: string): number { - // Submitted batch job 34987 - const splitted = output.split(" "); - return +splitted[splitted.length - 1]; -} - -export interface JobMetadata { - jobName: string; - account: string; - partition?: string; - qos?: string; - nodeCount: number; - coreCount: number; - gpuCount?: number; - maxTime: number; - command: string; - comment?: string; - submitTime: string; - workingDirectory: string; - memory?: string; -} - -export function generateJobScript(jobInfo: JobTemplate & { - output?: string; - otherOptions?: string[]; -}) { - const { - jobName, account, coreCount, gpuCount, maxTime, nodeCount, - partition, qos, command, workingDirectory, - output, errorOutput, otherOptions, memory, - } = jobInfo; - let script = "#!/bin/bash\n"; - - function append(param: string) { - script += "#SBATCH " + param + "\n"; - } - - append("-A " + account); - append("--partition=" + partition); - append("--qos=" + qos); - append("-J " + jobName); - append("--nodes=" + nodeCount); - append("-c " + coreCount); - append("--time=" + maxTime); - append("--chdir=" + workingDirectory); - if (gpuCount) { - append("--gres=gpu:" + gpuCount); - } - if (memory) { - append("--mem=" + memory); - } - if (output) { - append("--output=" + output); - } - if (errorOutput) { - append("--error=" + errorOutput); - } - - if (otherOptions) { - otherOptions.forEach((opt) => { - append(opt); - }); - } - - - script += "\n"; - script += command; - - return script; -} - -export const JOB_METADATA_NAME = "metadata.json"; - - diff --git a/apps/portal-server/src/clusterops/slurm/job.ts b/apps/portal-server/src/clusterops/slurm/job.ts deleted file mode 100644 index 1f4a669d99..0000000000 --- a/apps/portal-server/src/clusterops/slurm/job.ts +++ /dev/null @@ -1,177 +0,0 @@ -/** - * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy - * SCOW is 
licensed under Mulan PSL v2. - * You can use this software according to the terms and conditions of the Mulan PSL v2. - * You may obtain a copy of Mulan PSL v2 at: - * http://license.coscl.org.cn/MulanPSL2 - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - * See the Mulan PSL v2 for more details. - */ - -import { getRunningJobs } from "@scow/lib-slurm"; -import { executeAsUser, loggedExec, sftpExists, sftpReaddir, sftpReadFile, sftpWriteFile } from "@scow/lib-ssh"; -import { join } from "path"; -import { JobOps, JobTemplateInfo } from "src/clusterops/api/job"; -import { querySacct } from "src/clusterops/slurm/bl/queryJobInfo"; -import { generateJobScript, JobMetadata, parseSbatchOutput } from "src/clusterops/slurm/bl/submitJob"; -import { portalConfig } from "src/config/portal"; -import { getClusterLoginNode, sshConnect } from "src/utils/ssh"; - -export const slurmJobOps = (cluster: string): JobOps => { - - const host = getClusterLoginNode(cluster); - - if (!host) { throw new Error(`Cluster ${cluster} has no login node`); } - - return { - listAccounts: async (request, logger) => { - const { userId } = request; - - const accounts = await sshConnect(host, "root", logger, async (ssh) => { - const { stdout } = await executeAsUser(ssh, userId, logger, true, - "sacctmgr", ["show", "ass", `user=${userId}`, "format=account%20"]); - - /** - Account - -------------------- - {account1} - {account2} - */ - - const accounts = stdout.split("\n").slice(2).map((x) => x.trim()); - - return [...new Set(accounts)]; - }); - - return { accounts }; - }, - - generateJobScript: async (request) => { - const script = generateJobScript(request.jobInfo); - - return { script }; - }, - - submitJob: async (request, logger) => { - const { jobInfo, userId, saveAsTemplate } = request; - - return await sshConnect(host, userId, logger, 
async (ssh) => { - - const dir = jobInfo.workingDirectory; - - const script = generateJobScript(jobInfo); - - const sftp = await ssh.requestSFTP(); - - // make sure workingDirectory exists. - await ssh.mkdir(dir, undefined, sftp); - - // use sbatch to allocate the script. pass the script into sbatch in stdin - const { code, stderr, stdout } = await loggedExec(ssh, logger, false, - "sbatch", [], - { stdin: script }, - ); - - if (code !== 0) { - return { code: "SBATCH_FAILED", message: stderr }; - } - - // parse stdout output to get the job id - const jobId = parseSbatchOutput(stdout); - - if (saveAsTemplate) { - const id = `${jobInfo.jobName}-${jobId}`; - logger.info("Save job to %s", id); - - await ssh.mkdir(portalConfig.savedJobsDir); - - const filePath = join(portalConfig.savedJobsDir, id); - const metadata: JobMetadata = { ...jobInfo, submitTime: new Date().toISOString() }; - await sftpWriteFile(sftp)(filePath, JSON.stringify(metadata)); - - logger.info("Saved job as template to %s", filePath); - } - - return { code: "OK", jobId }; - }); - }, - - getJobTamplate: async (request, logger) => { - const { id, userId } = request; - - return await sshConnect(host, userId, logger, async (ssh) => { - const sftp = await ssh.requestSFTP(); - - const file = join(portalConfig.savedJobsDir, id); - - if (!await sftpExists(sftp, file)) { return { code: "NOT_FOUND" }; } - - const content = await sftpReadFile(sftp)(file); - logger.info("getJobTamplate to %s", content); - const data = JSON.parse(content.toString()) as JobMetadata; - - return { code: "OK", template: data }; - }); - }, - - listJobTemplates: async (request, logger) => { - const { userId } = request; - - return await sshConnect(host, userId, logger, async (ssh) => { - const sftp = await ssh.requestSFTP(); - - if (!await sftpExists(sftp, portalConfig.savedJobsDir)) { return { results: []}; } - - const list = await sftpReaddir(sftp)(portalConfig.savedJobsDir); - - const results = await Promise.all(list.map(async ({ 
filename }) => { - const content = await sftpReadFile(sftp)(join(portalConfig.savedJobsDir, filename)); - const data = JSON.parse(content.toString()) as JobMetadata; - - return { - id: filename, - submitTime: new Date(data.submitTime), - comment: data.comment, - jobName: data.jobName, - } as JobTemplateInfo; - })); - - return { results }; - }); - }, - - listRunningJobs: async (request, logger) => { - const { userId } = request; - - return await sshConnect(host, "root", logger, async (ssh) => { - const results = await getRunningJobs(ssh, userId, { userId }, logger); - - return { results }; - }); - - }, - - cancelJob: async (request, logger) => { - const { jobId, userId } = request; - - return await sshConnect(host, "root", logger, async (ssh) => { - await executeAsUser(ssh, userId, logger, true, "scancel", [jobId + ""]); - return { code: "OK" }; - }); - }, - - listAllJobsInfo: async (request, logger) => { - const { userId, startTime, endTime } = request; - - return await sshConnect(host, "root", logger, async (ssh) => { - const results = await querySacct(ssh, userId, logger, startTime, endTime); - - return { results }; - }); - }, - - - }; -}; diff --git a/apps/portal-server/src/services/app.ts b/apps/portal-server/src/services/app.ts index 195e588d25..3d16d154d3 100644 --- a/apps/portal-server/src/services/app.ts +++ b/apps/portal-server/src/services/app.ts @@ -43,14 +43,6 @@ export const appServiceServer = plugin((server) => { sessionId, userId, }, logger); - if (reply.code === "NOT_FOUND") { - throw { code: Status.NOT_FOUND, message: `session id ${sessionId} is not found` }; - } - - if (reply.code === "UNAVAILABLE") { - throw { code: Status.UNAVAILABLE, message: `session id ${sessionId} cannot be connected` }; - } - const app = apps[reply.appId]; if (!app) { @@ -169,14 +161,6 @@ export const appServiceServer = plugin((server) => { proxyBasePath, }, logger); - if (reply.code === "SBATCH_FAILED") { - throw { code: Status.INTERNAL, message: "sbatch failed", details: 
reply.message }; - } - - if (reply.code === "APP_NOT_FOUND") { - throw { code: Status.NOT_FOUND, message: `app id ${appId} is not found` }; - } - return [{ jobId: reply.jobId, sessionId: reply.sessionId }]; }, diff --git a/apps/portal-server/src/services/config.ts b/apps/portal-server/src/services/config.ts new file mode 100644 index 0000000000..e49331b880 --- /dev/null +++ b/apps/portal-server/src/services/config.ts @@ -0,0 +1,32 @@ +/** + * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy + * SCOW is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. 
+ */ + +import { asyncClientCall } from "@ddadaal/tsgrpc-client"; +import { plugin } from "@ddadaal/tsgrpc-server"; +import { ConfigServiceServer, ConfigServiceService } from "@scow/protos/build/common/config"; +import { getAdapterClient } from "src/utils/clusters"; +import { clusterNotFound } from "src/utils/errors"; + +export const configServiceServer = plugin((server) => { + server.addService(ConfigServiceService, { + getClusterConfig: async ({ request }) => { + const { cluster } = request; + + const client = getAdapterClient(cluster); + if (!client) { throw clusterNotFound(cluster); } + + const reply = await asyncClientCall(client.config, "getClusterConfig", {}); + + return [reply]; + }, + }); +}); diff --git a/apps/portal-server/src/services/desktop.ts b/apps/portal-server/src/services/desktop.ts index 69a2a9272b..a51fa94833 100644 --- a/apps/portal-server/src/services/desktop.ts +++ b/apps/portal-server/src/services/desktop.ts @@ -15,11 +15,11 @@ import { ServiceError } from "@grpc/grpc-js"; import { Status } from "@grpc/grpc-js/build/src/constants"; import { executeAsUser } from "@scow/lib-ssh"; import { DesktopServiceServer, DesktopServiceService } from "@scow/protos/build/portal/desktop"; -import { displayIdToPort } from "src/clusterops/slurm/bl/port"; import { portalConfig } from "src/config/portal"; import { clusterNotFound } from "src/utils/errors"; import { getClusterLoginNode, sshConnect } from "src/utils/ssh"; -import { parseDisplayId, parseListOutput, parseOtp, refreshPassword, VNCSERVER_BIN_PATH } from "src/utils/turbovnc"; +import { displayIdToPort, + parseDisplayId, parseListOutput, parseOtp, refreshPassword, VNCSERVER_BIN_PATH } from "src/utils/turbovnc"; function ensureEnabled() { if (!portalConfig.loginDesktop.enabled) { diff --git a/apps/portal-server/src/services/job.ts b/apps/portal-server/src/services/job.ts index 3313d3cf77..1ad4b89ff3 100644 --- a/apps/portal-server/src/services/job.ts +++ b/apps/portal-server/src/services/job.ts @@ 
-10,49 +10,48 @@ * See the Mulan PSL v2 for more details. */ +import { asyncClientCall } from "@ddadaal/tsgrpc-client"; import { ServiceError } from "@ddadaal/tsgrpc-common"; import { plugin } from "@ddadaal/tsgrpc-server"; import { Status } from "@grpc/grpc-js/build/src/constants"; +import { jobInfoToPortalJobInfo, jobInfoToRunningjob } from "@scow/lib-scheduler-adapter"; +import { createDirectoriesRecursively } from "@scow/lib-ssh"; import { JobServiceServer, JobServiceService } from "@scow/protos/build/portal/job"; +import { parseErrorDetails } from "@scow/rich-error-model"; import { getClusterOps } from "src/clusterops"; import { JobTemplate } from "src/clusterops/api/job"; -import { clusterNotFound, jobNotFound } from "src/utils/errors"; +import { getAdapterClient } from "src/utils/clusters"; +import { clusterNotFound } from "src/utils/errors"; +import { getClusterLoginNode, sshConnect } from "src/utils/ssh"; export const jobServiceServer = plugin((server) => { server.addService(JobServiceService, { - cancelJob: async ({ request, logger }) => { + cancelJob: async ({ request }) => { const { cluster, jobId, userId } = request; - const clusterops = getClusterOps(cluster); - - if (!clusterops) { throw clusterNotFound(cluster); } - - const reply = await clusterops.job.cancelJob({ - jobId, - userId, - }, logger); + const client = getAdapterClient(cluster); + if (!client) { throw clusterNotFound(cluster); } - if (reply.code === "NOT_FOUND") { - throw jobNotFound(jobId); - } + await asyncClientCall(client.job, "cancelJob", { + userId, jobId, + }); return [{}]; }, - listAccounts: async ({ request, logger }) => { + listAccounts: async ({ request }) => { const { cluster, userId } = request; - const clusterops = getClusterOps(cluster); - - if (!clusterops) { throw clusterNotFound(cluster); } + const client = getAdapterClient(cluster); + if (!client) { throw clusterNotFound(cluster); } - const reply = await clusterops.job.listAccounts({ + const reply = await 
asyncClientCall(client.account, "listAccounts", { userId, - }, logger); + }); return [{ accounts: reply.accounts }]; }, @@ -64,14 +63,10 @@ export const jobServiceServer = plugin((server) => { if (!clusterops) { throw clusterNotFound(cluster); } - const reply = await clusterops.job.getJobTamplate({ + const reply = await clusterops.job.getJobTemplate({ id: templateId, userId, }, logger); - if (reply.code === "NOT_FOUND") { - throw { code: Status.NOT_FOUND, message: `Job template id ${templateId} is not found.` }; - } - return [{ template: reply.template }]; }, @@ -92,35 +87,44 @@ export const jobServiceServer = plugin((server) => { }, - listRunningJobs: async ({ request, logger }) => { + listRunningJobs: async ({ request }) => { const { cluster, userId } = request; - const clusterops = getClusterOps(cluster); + const client = getAdapterClient(cluster); + if (!client) { throw clusterNotFound(cluster); } - if (!clusterops) { throw clusterNotFound(cluster); } + const reply = await asyncClientCall(client.job, "getJobs", { + fields: [ + "job_id", "partition", "name", "user", "state", "elapsed_seconds", + "nodes_alloc", "node_list", "reason", "account", "cpus_alloc", + "qos", "submit_time", "time_limit_minutes", "working_directory", + ], + filter: { users: [userId], accounts: [], states: ["PENDING", "RUNNING"]}, + }); - const reply = await clusterops.job.listRunningJobs({ - userId, - }, logger); - - return [{ results: reply.results }]; + return [{ results: reply.jobs.map(jobInfoToRunningjob) }]; }, - listAllJobs: async ({ request, logger }) => { + listAllJobs: async ({ request }) => { const { cluster, userId, endTime, startTime } = request; - const clusterops = getClusterOps(cluster); - - if (!clusterops) { throw clusterNotFound(cluster); } + const client = getAdapterClient(cluster); + if (!client) { throw clusterNotFound(cluster); } - const reply = await clusterops.job.listAllJobsInfo({ - userId, - endTime: endTime ? 
new Date(endTime) : undefined, - startTime: startTime ? new Date(startTime) : undefined, - }, logger); + const reply = await asyncClientCall(client.job, "getJobs", { + fields: [ + "job_id", "name", "account", "partition", "qos", "state", "working_directory", + "reason", "elapsed_seconds", "time_limit_minutes", "submit_time", + "start_time", "end_time", + ], + filter: { + users: [userId], accounts: [], states: [], + submitTime: { startTime, endTime }, + }, + }); - return [{ results: reply.results }]; + return [{ results: reply.jobs.map(jobInfoToPortalJobInfo) }]; }, @@ -128,47 +132,65 @@ export const jobServiceServer = plugin((server) => { const { cluster, command, jobName, coreCount, gpuCount, maxTime, saveAsTemplate, userId, nodeCount, partition, qos, account, comment, workingDirectory, output, errorOutput, memory } = request; - const jobInfo: JobTemplate = { - jobName, - coreCount, - maxTime, - nodeCount, - gpuCount, - partition, - qos, - account, - command, - comment, - workingDirectory, - output, - errorOutput, - memory, - }; - const clusterops = getClusterOps(cluster); - - const scriptReply = await clusterops.job.generateJobScript({ - jobInfo, - }, logger); - - const reply = await clusterops.job.submitJob({ - userId, - jobInfo, - script: scriptReply.script, - saveAsTemplate, - }, logger); - - if (reply.code === "SBATCH_FAILED") { - throw new ServiceError({ - code: Status.INTERNAL, - details: reply.message, - }); + const client = getAdapterClient(cluster); + if (!client) { throw clusterNotFound(cluster); } + + // make sure working directory exists + const host = getClusterLoginNode(cluster); + if (!host) { throw clusterNotFound(cluster); } + await sshConnect(host, userId, logger, async (ssh) => { + const sftp = await ssh.requestSFTP(); + await createDirectoriesRecursively(sftp, workingDirectory); + }); + + const reply = await asyncClientCall(client.job, "submitJob", { + userId, jobName, account, partition: partition!, qos, nodeCount, gpuCount: gpuCount || 0, + 
memoryMb: Number(memory?.split("M")[0]), coreCount, timeLimitMinutes: maxTime, + script: command, workingDirectory, stdout: output, stderr: errorOutput, extraOptions: [], + }).catch((e) => { + const ex = e as ServiceError; + const errors = parseErrorDetails(ex.metadata); + if (errors[0] && errors[0].$type === "google.rpc.ErrorInfo" && errors[0].reason === "SBATCH_FAILED") { + throw { + code: Status.INTERNAL, + message: "sbatch failed", + details: e.details, + }; + } else { + throw e; + } + }); + + if (saveAsTemplate) { + const jobInfo: JobTemplate = { + jobName, + coreCount, + maxTime, + nodeCount, + gpuCount, + partition, + qos, + account, + command, + comment, + workingDirectory, + output, + errorOutput, + memory, + }; + + const clusterOps = getClusterOps(cluster); + if (!clusterOps) { throw clusterNotFound(cluster); } + + await clusterOps.job.saveJobTemplate({ + userId, jobId: reply.jobId, jobInfo, + }, logger); } return [{ jobId: reply.jobId }]; }, - }); }); diff --git a/apps/mis-server/src/clusterops/api/index.ts b/apps/portal-server/src/utils/clusters.ts similarity index 54% rename from apps/mis-server/src/clusterops/api/index.ts rename to apps/portal-server/src/utils/clusters.ts index 4bc8482016..4ba6cdc40d 100644 --- a/apps/mis-server/src/clusterops/api/index.ts +++ b/apps/portal-server/src/utils/clusters.ts @@ -10,21 +10,16 @@ * See the Mulan PSL v2 for more details. 
*/ -import { Logger } from "@ddadaal/tsgrpc-server"; -import { AccountOps } from "src/clusterops/api/account"; -import { JobOps } from "src/clusterops/api/job"; -import { StorageOps } from "src/clusterops/api/storage"; -import { UserOps } from "src/clusterops/api/user"; +import { getSchedulerAdapterClient, SchedulerAdapterClient } from "@scow/lib-scheduler-adapter"; +import { clusters } from "src/config/clusters"; -export interface Request { - request: T; - logger: Logger; -} -export interface ClusterOps { - account: AccountOps; - job: JobOps; - storage: StorageOps; - user: UserOps; - onStartup: () => Promise; -} \ No newline at end of file +const adapterClientForClusters = Object.entries(clusters).reduce((prev, [cluster, c]) => { + const client = getSchedulerAdapterClient(c.adapterUrl); + prev[cluster] = client; + return prev; +}, {} as Record); + +export const getAdapterClient = (cluster: string) => { + return adapterClientForClusters[cluster]; +}; diff --git a/apps/portal-server/src/utils/ssh.ts b/apps/portal-server/src/utils/ssh.ts index 52436dd359..f4485fec64 100644 --- a/apps/portal-server/src/utils/ssh.ts +++ b/apps/portal-server/src/utils/ssh.ts @@ -22,7 +22,7 @@ import { Logger } from "ts-log"; export function getClusterLoginNode(cluster: string): string | undefined { - const loginNode = getLoginNode(clusters[cluster]?.slurm?.loginNodes?.[0]); + const loginNode = getLoginNode(clusters[cluster]?.loginNodes?.[0]); return loginNode?.address; } @@ -60,7 +60,7 @@ export async function sshConnect( * Check whether all clusters can be logged in as root user */ export async function checkClustersRootUserLogin(logger: Logger) { - await Promise.all(Object.values(clusters).map(async ({ displayName, slurm: { loginNodes } }) => { + await Promise.all(Object.values(clusters).map(async ({ displayName, loginNodes }) => { const node = getLoginNode(loginNodes[0]); logger.info("Checking if root can login to %s by login node %s", displayName, node.name); const error = await 
testRootUserSshLogin(node.address, rootKeyPair, console); diff --git a/apps/portal-server/src/utils/turbovnc.ts b/apps/portal-server/src/utils/turbovnc.ts index ed2172eaf2..282ecddf49 100644 --- a/apps/portal-server/src/utils/turbovnc.ts +++ b/apps/portal-server/src/utils/turbovnc.ts @@ -18,6 +18,7 @@ import { parseIp } from "src/utils/proxy"; import { Logger } from "ts-log"; export const VNCSERVER_BIN_PATH = join(portalConfig.turboVNCPath, "bin", "vncserver"); +const DISPLAY_ID_PORT_DELTA = 5900; export function parseListOutput(output: string): number[] { const ids = [] as number[]; @@ -61,6 +62,15 @@ export function parseDisplayId(stdout: string): number { throw new Error("Error parsing display id"); } + +export function displayIdToPort(displayId: number): number { + return DISPLAY_ID_PORT_DELTA + displayId; +} + +export function portToDisplayId(port: number): number { + return port - DISPLAY_ID_PORT_DELTA; +} + const vncPasswdPath = join(portalConfig.turboVNCPath, "bin", "vncpasswd"); /** diff --git a/apps/portal-web/config/clusters/hpc01.yaml b/apps/portal-web/config/clusters/hpc01.yaml index 62cbfccd6a..ff3d6c0f69 100644 --- a/apps/portal-web/config/clusters/hpc01.yaml +++ b/apps/portal-web/config/clusters/hpc01.yaml @@ -1,28 +1,31 @@ displayName: hpc01Name -slurm: - loginNodes: - - name: loginNode01 - address: localhost:22222 - partitions: - - name: compute - nodes: 3 - mem: 262144 - cores: 32 - gpus: 0 - qos: - - low - - normal - - high - comment: 说明 +adapterUrl: 0.0.0.0:6000 +loginNodes: + - name: loginNode01 + address: localhost:22222 +# slurm: +# loginNodes: +# - localhost:22222 +# partitions: +# - name: compute +# nodes: 3 +# mem: 262144 +# cores: 32 +# gpus: 0 +# qos: +# - low +# - normal +# - high +# comment: 说明 - - name: GPU - nodes: 1 - mem: 262144 - cores: 48 - gpus: 8 - qos: - - low - - normal - - high - - highest - comment: 说明 +# - name: GPU +# nodes: 1 +# mem: 262144 +# cores: 48 +# gpus: 8 +# qos: +# - low +# - normal +# - high +# - highest +# 
comment: 说明 diff --git a/apps/portal-web/config/clusters/hpc02.yaml b/apps/portal-web/config/clusters/hpc02.yaml index 493e7ba203..ba2319bc58 100644 --- a/apps/portal-web/config/clusters/hpc02.yaml +++ b/apps/portal-web/config/clusters/hpc02.yaml @@ -1,24 +1,27 @@ displayName: hpc02Name -slurm: - loginNodes: - - name: loginNode02 - address: localhost:22 - partitions: - - name: GPU - nodes: 2 - mem: 262144 - cores: 29 - gpus: 8 - qos: - - normal - - high - - highest - comment: 说明 +adapterUrl: 0.0.0.0:6000 +loginNodes: + - name: loginNode02 + address: localhost:22 +# slurm: +# loginNodes: +# - localhost:22 +# partitions: +# - name: GPU +# nodes: 2 +# mem: 262144 +# cores: 29 +# gpus: 8 +# qos: +# - normal +# - high +# - highest +# comment: 说明 - - name: another - nodes: 2 - mem: 262144 - cores: 29 - gpus: 8 - comment: 说明 +# - name: another +# nodes: 2 +# mem: 262144 +# cores: 29 +# gpus: 8 +# comment: 说明 diff --git a/apps/portal-web/src/apis/api.mock.ts b/apps/portal-web/src/apis/api.mock.ts index 9751c68a16..aaf0d72746 100644 --- a/apps/portal-web/src/apis/api.mock.ts +++ b/apps/portal-web/src/apis/api.mock.ts @@ -34,7 +34,6 @@ export const runningJob: RunningJob = { name: "123", nodes: "123", nodesOrReason: "!23", - nodesToBeUsed: "123", partition: "123", qos: "123", runningTime: "123", @@ -65,12 +64,13 @@ export const mockApi: MockApi = { getClusterInfo: async ({ query: { cluster } }) => ({ clusterInfo: { submitJobDirTemplate: "/home/ddadaal/Code/{{ name }}", - slurm: { + scheduler: { + name: "slurm", partitions: [ - { cores: 123, name: "123", nodes: 123, qos: ["123"], gpus: 10, mem: 1000 }, - { cores: 1234, name: cluster, nodes: 1234, qos: ["1234"], gpus: 10, mem: 1000 }, - { name : "compute", mem: 2048, cores:2, gpus:0, nodes: 1, qos: ["normal"], comment: "两个计算节点分区" }, - { name : "GPU", mem: 2048, cores:2, gpus:2, nodes: 1, qos: ["normal"], comment: "GPU" }, + { cores: 123, name: "123", nodes: 123, qos: ["123"], gpus: 10, memMb: 1000 }, + { cores: 1234, name: 
cluster, nodes: 1234, qos: ["1234"], gpus: 10, memMb: 1000 }, + { name : "compute", memMb: 2048, cores:2, gpus:0, nodes: 1, qos: ["normal"], comment: "两个计算节点分区" }, + { name : "GPU", memMb: 2048, cores:2, gpus:2, nodes: 1, qos: ["normal"], comment: "GPU" }, ], }, } }), diff --git a/apps/portal-web/src/pageComponents/app/LaunchAppForm.tsx b/apps/portal-web/src/pageComponents/app/LaunchAppForm.tsx index 52503b1e10..b4cde2c500 100644 --- a/apps/portal-web/src/pageComponents/app/LaunchAppForm.tsx +++ b/apps/portal-web/src/pageComponents/app/LaunchAppForm.tsx @@ -110,7 +110,7 @@ export const LaunchAppForm: React.FC = ({ clusterId, appId, attributes, a setLoading(true); form.setFieldValue("appJobName", genAppJobName(appName)); - setCurrentPartitionInfo(data.clusterInfo.slurm.partitions[0]); + setCurrentPartitionInfo(data.clusterInfo.scheduler.partitions[0]); await api.getAppLastSubmission({ query: { cluster: clusterId, appId } }) .then((lastSubmitData) => { @@ -122,11 +122,11 @@ export const LaunchAppForm: React.FC = ({ clusterId, appId, attributes, a // 如果存在上一次提交信息,且上一次提交信息中的分区,qos,cpu核心数满足当前集群配置,则填入上一次提交信息中的相应值 const setSubmitPartition = lastSubmitPartition && - data.clusterInfo.slurm.partitions.some((item) => { return item.name === lastSubmitPartition; }); + data.clusterInfo.scheduler.partitions.some((item) => { return item.name === lastSubmitPartition; }); const clusterPartition = setSubmitPartition - ? data.clusterInfo.slurm.partitions.filter((item) => { return item.name === lastSubmitPartition; })[0] - : data.clusterInfo.slurm.partitions[0]; + ? 
data.clusterInfo.scheduler.partitions.filter((item) => { return item.name === lastSubmitPartition; })[0] + : data.clusterInfo.scheduler.partitions[0]; setCurrentPartitionInfo(clusterPartition); const clusterPartitionCoreCount = clusterPartition.cores; @@ -195,7 +195,7 @@ export const LaunchAppForm: React.FC = ({ clusterId, appId, attributes, a const handlePartitionChange = (partition: string) => { const partitionInfo = clusterInfoQuery.data - ? clusterInfoQuery.data.clusterInfo.slurm.partitions.find((x) => x.name === partition) + ? clusterInfoQuery.data.clusterInfo.scheduler.partitions.find((x) => x.name === partition) : undefined; form.setFieldValue("qos", partitionInfo?.qos?.[0]); if (!!partitionInfo?.gpus) { @@ -245,8 +245,8 @@ export const LaunchAppForm: React.FC = ({ clusterId, appId, attributes, a const memorySize = (currentPartitionInfo ? currentPartitionInfo.gpus ? nodeCount * gpuCount * Math.floor(currentPartitionInfo.cores / currentPartitionInfo.gpus) - * Math.floor(currentPartitionInfo.mem / currentPartitionInfo.cores) : - nodeCount * coreCount * Math.floor(currentPartitionInfo.mem / currentPartitionInfo.cores) : 0); + * Math.floor(currentPartitionInfo.memMb / currentPartitionInfo.cores) : + nodeCount * coreCount * Math.floor(currentPartitionInfo.memMb / currentPartitionInfo.cores) : 0); const memory = memorySize + "MB"; const memoryDisplay = formatSize(memorySize, ["MB", "GB", "TB"]); @@ -282,7 +282,7 @@ export const LaunchAppForm: React.FC = ({ clusterId, appId, attributes, a