1. 下载并安装 munge 和 slurm

安装脚本(bash)如下:
# Build everything from source. Leave any active conda environment first so
# that its toolchain and libraries do not interfere with the build.
conda deactivate
# Per-user installation prefix (everything is installed below this directory).
prefix=/work/run/projects/$(id -nu)/.local

# --- Download and install MUNGE ---
munge_version=munge-0.5.18
# Release assets are published as .../download/<tag>/<tag>.tar.xz
wget https://github.com/dun/munge/releases/download/${munge_version}/${munge_version}.tar.xz
tar -xvf ${munge_version}.tar.xz && cd ${munge_version}
./configure --prefix=$prefix --sysconfdir=$prefix/etc
make -j20 && make install # -j sets the number of parallel jobs; use the CPU core count to speed up the build

# Persist the environment for future shells. Note the ':' separating the new
# entries from the existing $PATH.
echo 'export PATH='${prefix}'/bin:'${prefix}'/sbin:$PATH' >> $HOME/.bashrc
echo 'export LD_LIBRARY_PATH='${prefix}'/lib:$LD_LIBRARY_PATH' >> $HOME/.bashrc
# Apply the same settings to the current shell: the commands below
# (mungekey, munged, munge, unmunge) live in $prefix/bin and $prefix/sbin.
export PATH=${prefix}/bin:${prefix}/sbin:$PATH
export LD_LIBRARY_PATH=${prefix}/lib:$LD_LIBRARY_PATH

# Create the shared secret key and make sure only the owner can read it.
mungekey -c -k ${prefix}/etc/munge/munge.key
chmod 600 ${prefix}/etc/munge/munge.key
# Runtime, state and log directories used by munged.
mkdir -p ${prefix}/var/run/munge
mkdir -p ${prefix}/var/lib/munge
mkdir -p ${prefix}/log
# Start the daemon in the background; its output goes to $prefix/log/munged.log.
nohup munged --trusted-group=root > ${prefix}/log/munged.log 2>&1 &
# Smoke test: encode a credential and decode it again.
munge -n | unmunge
# Leave the source tree and remove the temporary build files.
cd .. && rm -rf ${munge_version} ${munge_version}.tar.xz


# --- Slurm installation ---
# Expects $prefix to be set already (the per-user installation prefix).

## Remove executables left over from a previous Slurm installation.
## Anything below $prefix/etc is skipped so existing configuration is kept.
find $prefix -not -path "$prefix/etc/*" \
-type f \
-regextype posix-extended \
-regex ".*\/([^\/]+-)?(sacct|sacctmgr|salloc|sattach|sbatch|sbcast|scancel|scontrol|scrontab|sdiag|sinfo|sprio|squeue|sreport|srun|sshare|sstat|strigger|sh5util|slurm|sackd)[a-zA-Z0-9_-]*" \
-delete

## Export the environment so the build can find the MUNGE binaries/libraries.
## Note the ':' separating the new entries from the existing $PATH.
export PATH=${prefix}/bin:${prefix}/sbin:$PATH
export LD_LIBRARY_PATH=${prefix}/lib:$LD_LIBRARY_PATH

## Download and unpack Slurm; the rest of the build runs inside the source tree.
slurm_version=slurm-25.11.3
wget https://download.schedmd.com/slurm/${slurm_version}.tar.bz2
tar -xvf ${slurm_version}.tar.bz2 && cd ${slurm_version}

## Work around the missing dbus development files by building dbus into $prefix.
## It is downloaded and built inside the Slurm source tree, so it is cleaned up
## together with that tree at the end.
dbus_dev=dbus-1.12.20
wget https://mirrors.aliyun.com/blfs/conglomeration/dbus/${dbus_dev}.tar.gz
tar -xvf ${dbus_dev}.tar.gz && cd ${dbus_dev}
./configure --prefix=$prefix
make -j20 && make install
cd ..   # back to the Slurm source tree

## Patch the cgroup v2 plugin for user-level (non-root) operation: point the
## default slice at the current user's systemd user service and clear
## SLURMD_CGROUP. The uid is taken from `id -u` instead of being hard-coded.
uid=$(id -u)
sed -i -e "s/^#define DEFAULT_SYSTEM_CGSLICE.*/#define DEFAULT_SYSTEM_CGSLICE \"user.slice\/user-${uid}.slice\/user@${uid}.service\"/g" \
-e 's/^#define SLURMD_CGROUP.*/#define SLURMD_CGROUP ""/' \
src/plugins/cgroup/v2/cgroup_v2.c

## Configure and build. Adding $prefix/lib/pkgconfig to PKG_CONFIG_PATH lets
## configure detect the dependencies installed under $prefix.
./configure --prefix=$prefix --with-munge=$prefix --enable-cgroupv2 PKG_CONFIG_PATH="$prefix/lib/pkgconfig:$PKG_CONFIG_PATH"
make -j20 && make install

## Leave the source tree before deleting it together with the tarball.
cd .. && rm -rf ${slurm_version} ${slurm_version}.tar.bz2

2. 配置 slurm

2.1 配置数据库

参考:https://slurm.schedmd.com/accounting.html

  1. 配置 $prefix/etc/slurmdbd.conf
文件内容如下:
# ---- Authentication (MUNGE) ----
AuthType=auth/munge
AuthInfo=/work/run/projects/bio-24/.local/var/run/munge/munge.socket.2

# ---- slurmdbd daemon ----
SlurmUser=bio-24
DbdHost=localhost
DbdAddr=localhost
DbdPort=6819
PidFile=/work/run/projects/bio-24/.local/var/run/slurmdbd.pid
LogFile=/work/run/projects/bio-24/.local/var/log/slurmdbd.log

# ---- Database connection ----
StorageType=accounting_storage/mysql
# Use the IP address here: "localhost" would make the client connect through
# the unix socket instead of TCP.
StorageHost=127.0.0.1
StoragePort=33060
StorageUser=slurm
StoragePass=slurmpassword
# Database name. Note: newer versions use StorageLoc for this, not StorageDatabase.
StorageLoc=slurm_acct_db
  2. 配置 mysql
在 MySQL 客户端中执行以下语句:
-- Recreate the accounting user from scratch: drop it if it already exists,
-- create it again, then grant it full access to the accounting database.
DROP USER IF EXISTS 'slurm'@'%';
CREATE USER 'slurm'@'%' IDENTIFIED BY 'slurmpassword';
GRANT ALL ON slurm_acct_db.* TO 'slurm'@'%';
-- List the storage engines available on this server.
SHOW ENGINES;
-- Recreate the accounting database from scratch: drop it if it already exists.
DROP DATABASE IF EXISTS slurm_acct_db;
CREATE DATABASE slurm_acct_db CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
  3. 启动 slurmdbd
在 shell 中执行:
# Start the Slurm database daemon.
# NOTE(review): presumably it picks up $prefix/etc/slurmdbd.conf because Slurm
# was configured with --prefix=$prefix — confirm.
slurmdbd

2.2 配置 slurmctld 和 slurmd

编辑 $prefix/etc/slurm.conf,内容如下:
# slurm.conf for a single-node, user-level Slurm instance: the controller
# (slurmctld) and the compute daemon (slurmd) both run on shenxin-a as bio-24.

# ---- Cluster / controller ----
ClusterName=local-slurm
SlurmctldHost=shenxin-a
ReturnToService=2
SlurmctldPidFile=/work/run/projects/bio-24/.local/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/work/run/projects/bio-24/.local/var/run/slurmd.pid
SlurmdPort=6818
# slurmd's spool directory must be separate from slurmctld's
# StateSaveLocation below; create both directories before starting the daemons.
SlurmdSpoolDir=/work/run/projects/bio-24/.local/var/spool/slurmd
SlurmUser=bio-24
SlurmdUser=bio-24
StateSaveLocation=/work/run/projects/bio-24/.local/var/spool/slurmctld
TaskPlugin=task/affinity

# ---- Timers ----
InactiveLimit=0
KillWait=30
MinJobAge=300
SlurmctldTimeout=120
SlurmdTimeout=300
Waittime=0

# ---- Scheduling ----
SchedulerType=sched/backfill
SelectType=select/cons_tres

# ---- Accounting (through slurmdbd, DbdPort 6819) ----
AccountingStorageHost=shenxin-a
AccountingStoragePort=6819
AccountingStorageType=accounting_storage/slurmdbd
AccountingStoreFlags=job_comment,job_env,job_extra,job_script
JobCompType=jobcomp/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux

# ---- Logging ----
SlurmctldDebug=info
SlurmctldLogFile=/work/run/projects/bio-24/.local/var/log/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/work/run/projects/bio-24/.local/var/log/slurmd.log

# ---- Nodes and partitions ----
# CPUs = Sockets x CoresPerSocket x ThreadsPerCore = 2 x 128 x 2 = 512
NodeName=shenxin-a CPUs=512 RealMemory=900000 Sockets=2 CoresPerSocket=128 ThreadsPerCore=2 State=UNKNOWN
PartitionName=rnaseq Nodes=ALL Default=YES MaxTime=INFINITE State=UP