Skip to content

Commit

Permalink
refactor(mis,portal): 重构scow后端, 对接调度器适配器接口 (#632)
Browse files Browse the repository at this point in the history
### 1. 部署调度器适配器

首先需要确保您的集群上已经部署了对应的调度器适配器,并获取访问它的地址及端口号。

部署适配器可参考文档:

- [slurm](https://github.com/PKUHPC/scow-slurm-adapter)

### 2. 修改SCOW配置文件

首先确保您使用了最新的SCOW镜像(可查看`install.yaml`中的`imageTag`字段)

在用于部署scow的`scow-deployment`文件夹中,修改配置文件:

- 首先修改集群配置文件

  主要变化为:删除`slurm`配置项,将原先位于其下的`loginNodes`配置项改为独立的顶层配置项;新增`adapterUrl`配置项,用于指定适配器地址

```
# 集群显示名称
displayName: hpc01

# 调度器适配器的地址
adapterUrl: "192.168.88.101:8972"

# 登录节点
loginNodes:
  - "192.168.88.102"
```

- 修改管理系统配置文件

  删除了`fetchJobs`配置项中的`db`项,即不再使用源作业信息数据库,改为通过适配器同步作业信息

### 3. 不再使用源作业信息数据库


部署并使用适配器后,无需再部署[`export-jobs`](https://github.com/PKUHPC/export-jobs)项目,同步作业信息的功能改由适配器完成
  • Loading branch information
qhqhqhq authored Jul 11, 2023
1 parent 6b6f08a commit 5b7f0e8
Show file tree
Hide file tree
Showing 136 changed files with 4,202 additions and 3,941 deletions.
19 changes: 19 additions & 0 deletions .changeset/afraid-maps-deliver.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
"@scow/scheduler-adapter-protos": minor
"@scow/lib-scheduler-adapter": minor
"@scow/portal-server": minor
"@scow/test-adapter": minor
"@scow/protos": minor
"@scow/mis-server": minor
"@scow/portal-web": minor
"@scow/demo-vagrant": minor
"@scow/mis-web": minor
"@scow/config": minor
"@scow/auth": minor
"@scow/cli": minor
"@scow/lib-ssh": minor
"@scow/grpc-api": minor
"@scow/docs": minor
---

重构 scow,对接调度器适配器接口
7 changes: 7 additions & 0 deletions .devcontainer/docker-compose.devcontainer.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,13 @@ services:
PHPLDAPADMIN_LDAP_HOSTS: ldap://scow-dev:389
PHPLDAPADMIN_HTTPS: false

test-adapter:
image: test-adapter
build:
context: ..
dockerfile: dev/test-adapter/Dockerfile
network_mode: service:scow-dev

volumes:
db:
ldap:
Expand Down
2 changes: 1 addition & 1 deletion apps/auth/src/auth/ssh/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ function checkLoginNode(sshConfig: SshConfigSchema) {
}
const clusterConfig = Object.values(clusters)[0];

loginNode = getLoginNode(clusterConfig.slurm.loginNodes[0]).address;
loginNode = getLoginNode(clusterConfig.loginNodes[0]).address;

if (!loginNode) {
throw new Error(`Cluster ${clusterConfig.displayName} has no login node.`);
Expand Down
84 changes: 11 additions & 73 deletions apps/cli/assets/config/clusters/hpc01.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,79 +2,17 @@
displayName: hpc01Name

# 指定slurm配置
slurm:
loginNodes:
# 登录节点展示名称
- name: login01
# 登录节点的IP或者域名
# 如果设置的是域名,请确认此节点的/etc/hosts中包含了域名到IP的解析信息
address: login01
- name: login02
address: login02

# 集群的分区信息
partitions:
# 分区1的名字
- name: compute
# 分区内节点数
nodes: 28
# 单节点内存数量,单位M
mem: 7500
# 核心数
cores: 2
# GPU卡数
gpus: 0
# QOS
qos:
- low
- normal
- high
# 这个分区的备注信息
comment: ""

- name: GPU
nodes: 1
mem: 262144
cores: 48
gpus: 8
qos:
- low
- normal
- high
comment: ""

# 如果这个集群要部署管理系统,请增加以下配置
# 如果不需要,将整个mis块注释掉
mis:
# 部署slurm.sh的机器的地址
managerUrl: haha
# slurm.sh在机器中的绝对地址
scriptPath: /test/slurm.sh

# 部署slurm.sh的机器通过什么地址访问slurm的数据库
# 不填写为下面的默认值
# dbHost: localhost

# 部署slurm.sh的机器通过什么端口访问slurm的数据库
# 不填写为下面的默认值
# dbPort: 3306

# slurm数据库的用户名
# 参考slurmdbd.conf的StorageUser配置
# 不填写为下面的默认值
# dbUser: root

# slurmdbd的数据库用户的密码
# 参考slurmdbd.conf的StoragePass配置
dbPassword: password

# slurm accounting数据库的数据库名
# 参考slurmdbd.conf的StorageLoc配置
# 不填写为下面的默认值
# slurmAcctDbName: "slurm_acct_db"

# 这个集群在slurm中的集群名字
clusterName: pkuhpc
loginNodes:
# 登录节点展示名称
- name: login01
# 登录节点的IP或者域名
# 如果设置的是域名,请确认此节点的/etc/hosts中包含了域名到IP的解析信息
address: login01
- name: login02
address: login02

# 适配器地址(ip地址:端口号)
adapterUrl: localhost:8972

# 门户系统代理网关节点配置
# proxyGateway:
Expand Down
12 changes: 0 additions & 12 deletions apps/cli/assets/config/mis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,6 @@ db:

# 获取作业相关配置
fetchJobs:
# 源作业信息数据库的数据库信息
db:
host: sourcedb
port: 3307
user: root
password: jobtablepassword
dbName: jobs
tableName: jobs
# 数据库类型,可选mariadb或者mysql
# 默认为mariadb
# type: mariadb

# 周期性获取数据
periodicFetch:
# 是否开启
Expand Down
67 changes: 30 additions & 37 deletions apps/mis-server/config/clusters/hpc00.yml
Original file line number Diff line number Diff line change
@@ -1,39 +1,32 @@
displayName: hpc00
misIgnore: true
slurm:
mis:
managerUrl: localhost:22222
dbPassword: password
clusterName: pkuhpc
scriptPath: /slurmshTest/slurm.sh
adapterUrl: 0.0.0.0:6000
loginNodes:
- name: login
address: localhost:22222

loginNodes:
- name: login
address: localhost:22222

partitions:
- name: C032M0128G
mem: 131072
cores: 32
nodes: 32
gpus: 0
qos:
- low
- normal
- high
- cryoem
- name: GPU
mem: 262144
cores: 28
nodes: 32
gpus: 4
qos:
- low
- normal
- high
- cryoem
- name: life
mem: 262144
cores: 28
gpus: 4
nodes: 32
# partitions:
# - name: C032M0128G
# mem: 131072
# cores: 32
# nodes: 32
# gpus: 0
# qos:
# - low
# - normal
# - high
# - cryoem
# - name: GPU
# mem: 262144
# cores: 28
# nodes: 32
# gpus: 4
# qos:
# - low
# - normal
# - high
# - cryoem
# - name: life
# mem: 262144
# cores: 28
# gpus: 4
# nodes: 32
61 changes: 32 additions & 29 deletions apps/mis-server/config/clusters/hpc01.yml
Original file line number Diff line number Diff line change
@@ -1,30 +1,33 @@
displayName: hpc01
misIgnore: true
slurm:
mis:
managerUrl: localhost:22222
dbPassword: password
clusterName: pkuhpc1
scriptPath: /slurmshTest/slurm.sh
loginNodes:
- name: login
address: localhost:22222
partitions:
- name: compute
nodes: 198
mem: 63000
cores: 28
gpus: 0
qos:
- low
- normal
- high
- name: gpu
nodes: 1
mem: 386000
cores: 48
gpus: 8
qos:
- low
- normal
- high
adapterUrl: 0.0.0.0:6000
loginNodes:
- name: login
address: localhost:22222
# misIgnore: true
# slurm:
# mis:
# managerUrl: localhost:22222
# dbPassword: password
# clusterName: pkuhpc1
# scriptPath: /slurmshTest/slurm.sh
# loginNodes:
# - localhost:22222
# partitions:
# - name: compute
# nodes: 198
# mem: 63000
# cores: 28
# gpus: 0
# qos:
# - low
# - normal
# - high
# - name: gpu
# nodes: 1
# mem: 386000
# cores: 48
# gpus: 8
# qos:
# - low
# - normal
# - high
49 changes: 26 additions & 23 deletions apps/mis-server/config/clusters/hpc02.yml
Original file line number Diff line number Diff line change
@@ -1,24 +1,27 @@
displayName: hpc01
slurm:
loginNodes:
- name: login
address: localhost:22222
partitions:
- name: compute
nodes: 198
mem: 63000
cores: 28
gpus: 0
qos:
- low
- normal
- high
- name: gpu
nodes: 1
mem: 386000
cores: 48
gpus: 8
qos:
- low
- normal
- high
adapterUrl: 0.0.0.0:6000
loginNodes:
- name: login
address: localhost:22222
# slurm:
# loginNodes:
# - localhost:22222
# partitions:
# - name: compute
# nodes: 198
# mem: 63000
# cores: 28
# gpus: 0
# qos:
# - low
# - normal
# - high
# - name: gpu
# nodes: 1
# mem: 386000
# cores: 48
# gpus: 8
# qos:
# - low
# - normal
# - high
8 changes: 0 additions & 8 deletions apps/mis-server/config/mis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,4 @@ db:
password: mysqlrootpassword
dbName: scow_server_${JEST_WORKER_ID}

fetchJobs:
db:
host: 127.0.0.1
port: 3307
user: root
password: jobtablepassword
dbName: jobs
tableName: jobs

4 changes: 3 additions & 1 deletion apps/mis-server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"dependencies": {
"@ddadaal/tsgrpc-common": "0.2.3",
"@ddadaal/tsgrpc-server": "0.19.2",
"@ddadaal/tsgrpc-client": "0.17.5",
"@grpc/grpc-js": "1.8.15",
"@mikro-orm/cli": "5.7.12",
"@mikro-orm/core": "5.7.12",
Expand All @@ -40,6 +41,8 @@
"@scow/lib-slurm": "workspace:*",
"@scow/lib-ssh": "workspace:*",
"@scow/protos": "workspace:*",
"@scow/scheduler-adapter-protos": "workspace:*",
"@scow/lib-scheduler-adapter": "workspace:*",
"@scow/utils": "workspace:*",
"@sinclair/typebox": "0.28.15",
"dotenv": "16.3.1",
Expand All @@ -50,7 +53,6 @@
"wait-on": "7.0.1"
},
"devDependencies": {
"@ddadaal/tsgrpc-client": "0.17.5",
"@types/google-protobuf": "3.15.6",
"@types/node-cron": "3.0.7",
"@types/wait-on": "5.3.1"
Expand Down
2 changes: 2 additions & 0 deletions apps/mis-server/src/app.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import { plugins } from "src/plugins";
import { accountServiceServer } from "src/services/account";
import { adminServiceServer } from "src/services/admin";
import { chargingServiceServer } from "src/services/charging";
import { configServiceServer } from "src/services/config";
import { initServiceServer } from "src/services/init";
import { jobServiceServer } from "src/services/job";
import { jobChargeLimitServer } from "src/services/jobChargeLimit";
Expand Down Expand Up @@ -49,6 +50,7 @@ export async function createServer() {
await server.register(jobServiceServer);
await server.register(chargingServiceServer);
await server.register(tenantServiceServer);
await server.register(configServiceServer);

const em = server.ext.orm.em.fork();
await updateBlockStatusInSlurm(em, server.ext.clusters, server.logger);
Expand Down
Loading

0 comments on commit 5b7f0e8

Please sign in to comment.