1. 下载并安装 munge 和 slurm
# Build MUNGE and Slurm from source into a per-user prefix (no root needed).
# Meant to be pasted into an interactive shell; every step runs in order.

# Leave any active conda environment first so its toolchain and libraries
# do not interfere with the build.
conda deactivate

# Per-user installation prefix.
prefix=/work/run/projects/$(id -nu)/.local

# ---------------------------------------------------------------- MUNGE
munge_version=munge-0.5.18
wget "https://github.com/dun/munge/releases/download/${munge_version}/${munge_version}.tar.xz"
tar -xvf "${munge_version}.tar.xz" && cd "${munge_version}"
./configure --prefix="$prefix" --sysconfdir="$prefix/etc"
make -j20 && make install   # -j takes the number of parallel jobs (CPU cores)

# Persist the environment for future shells ($PATH / $LD_LIBRARY_PATH are
# written literally so they expand when .bashrc is sourced) ...
echo "export PATH=${prefix}/bin:${prefix}/sbin:\$PATH" >> "$HOME/.bashrc"
echo "export LD_LIBRARY_PATH=${prefix}/lib:\$LD_LIBRARY_PATH" >> "$HOME/.bashrc"
# ... and apply it to the current shell too, otherwise mungekey, munged and
# munge below are not found on PATH.
export PATH="${prefix}/bin:${prefix}/sbin:${PATH}"
export LD_LIBRARY_PATH="${prefix}/lib:${LD_LIBRARY_PATH}"

# Create the MUNGE key.
mungekey -c -k "${prefix}/etc/munge/munge.key"
# chmod 600 $HOME/.ssh/key/munge.key

# Runtime directories used by munged, plus the directory for its captured output.
mkdir -p "${prefix}/var/run/munge" "${prefix}/var/lib/munge" "${prefix}/log"
nohup munged --trusted-group=root > "${prefix}/log/munged.log" 2>&1 &

# Check that munged started correctly: encode a credential and decode it again.
munge -n | unmunge

# Remove the MUNGE build tree and tarball.
cd .. && rm -rf "${munge_version}" "${munge_version}.tar.xz"

# ---------------------------------------------------------------- Slurm
# Delete files left under $prefix by an earlier Slurm installation.
# NOTE(review): the `-o -path "$prefix/*"` alternative makes the path filter
# match everything under $prefix, including etc/ -- confirm that is intended.
find "$prefix" \( -not -path "$prefix/etc/*" -o -path "$prefix/*" \) \
    -type f \
    -regextype posix-extended \
    -regex ".*\/([^\/]+-)?(sacct|sacctmgr|salloc|sattach|sbatch|sbcast|scancel|scontrol|scrontab|sdiag|sinfo|sprio|squeue|sreport|srun|sshare|sstat|strigger|sh5util|slurm|sackd)[a-zA-Z0-9_-]*" \
    -delete

slurm_version=slurm-25.11.3
wget "https://download.schedmd.com/slurm/${slurm_version}.tar.bz2"
tar -xvf "${slurm_version}.tar.bz2" && cd "${slurm_version}"

# Build D-Bus into the same prefix (downloaded and built inside the Slurm
# source tree, so `cd ..` returns to the Slurm tree).
dbus_dev=dbus-1.12.20
wget "https://mirrors.aliyun.com/blfs/conglomeration/dbus/${dbus_dev}.tar.gz"
tar -xvf "${dbus_dev}.tar.gz" && cd "${dbus_dev}"
./configure --prefix="$prefix"
make -j20 && make install
cd ..

# Patch the cgroup v2 plugin: point the default system slice at a user slice
# and blank out the slurmd cgroup name.
# NOTE(review): 1024 is presumably the UID of the account that runs slurmd --
# confirm with `id -u` before building.
slurm_uid=1024
sed -i \
    -e "s|^#define DEFAULT_SYSTEM_CGSLICE.*|#define DEFAULT_SYSTEM_CGSLICE \"user.slice/user-${slurm_uid}.slice/user@${slurm_uid}.service\"|" \
    -e 's|^#define SLURMD_CGROUP.*|#define SLURMD_CGROUP ""|' \
    src/plugins/cgroup/v2/cgroup_v2.c

# Adding $prefix/lib/pkgconfig to PKG_CONFIG_PATH lets configure pick up the
# dependencies installed under $prefix automatically.
./configure --prefix="$prefix" --with-munge="$prefix" --enable-cgroupv2 PKG_CONFIG_PATH="$prefix/lib/pkgconfig:$PKG_CONFIG_PATH"
make -j20 && make install

# Log and state directories referenced by slurmdbd.conf / slurm.conf.
mkdir -p "${prefix}/var/log" "${prefix}/var/spool/slurmctld"

# Leave the source tree before deleting it; run from inside it, the rm would
# find nothing to remove.
cd .. && rm -rf "${slurm_version}" "${slurm_version}.tar.bz2"
2. 配置 slurm
2.1 配置数据库
参考:https://slurm.schedmd.com/accounting.html
配置 $prefix/etc/slurmdbd.conf
# slurmdbd.conf -- accounting daemon settings for a per-user install.

# --- Identity and authentication
SlurmUser=bio-24
AuthType=auth/munge
AuthInfo=/work/run/projects/bio-24/.local/var/run/munge/munge.socket.2

# --- Where slurmdbd listens
DbdHost=localhost
DbdAddr=localhost
DbdPort=6819

# --- Runtime files
PidFile=/work/run/projects/bio-24/.local/var/run/slurmdbd.pid
LogFile=/work/run/projects/bio-24/.local/var/log/slurmdbd.log

# --- Database connection
StorageType=accounting_storage/mysql
# Use the IP address: with "localhost" the client would connect through the
# unix socket instead of TCP.
StorageHost=127.0.0.1
# NOTE(review): 33060 is MySQL's default X Protocol port, the classic protocol
# default is 3306 -- confirm which port this server accepts classic
# connections on.
StoragePort=33060
StorageUser=slurm
StoragePass=slurmpassword
# Database name. Note: newer versions take it from StorageLoc, not
# StorageDatabase.
StorageLoc=slurm_acct_db
配置 mysql
-- Prepare the MySQL account and database used by slurmdbd.
-- One statement per line: a trailing comment runs to the end of its line, so
-- statements must never share a line with the comment that precedes them.

-- Drop the account if it already exists, then create it afresh.
DROP USER IF EXISTS 'slurm'@'%';
CREATE USER 'slurm'@'%' IDENTIFIED BY 'slurmpassword';
GRANT ALL ON slurm_acct_db.* TO 'slurm'@'%';

-- List the available storage engines.
-- NOTE(review): presumably a check that InnoDB is enabled -- confirm.
SHOW ENGINES;

-- Drop the database if it already exists (this discards any existing
-- accounting data), then create it afresh.
DROP DATABASE IF EXISTS slurm_acct_db;
CREATE DATABASE slurm_acct_db CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
启动 slurmdbd
2.2 配置 slurmctld 和 slurmd(配置文件:$prefix/etc/slurm.conf)
# slurm.conf -- single-node cluster where slurmctld and slurmd run on the same
# host under the same unprivileged account.

# --- Cluster identity
ClusterName=local-slurm
SlurmctldHost=shenxin-a
SlurmUser=bio-24
SlurmdUser=bio-24

# --- Ports, pid files and state directories
SlurmctldPort=6817
SlurmdPort=6818
SlurmctldPidFile=/work/run/projects/bio-24/.local/var/run/slurmctld.pid
SlurmdPidFile=/work/run/projects/bio-24/.local/var/run/slurmd.pid
StateSaveLocation=/work/run/projects/bio-24/.local/var/spool/slurmctld
# slurmd gets its own spool directory so that it does not share one with the
# slurmctld state files in StateSaveLocation.
SlurmdSpoolDir=/work/run/projects/bio-24/.local/var/spool/slurmd

# --- Node and task management
ReturnToService=2
TaskPlugin=task/affinity

# --- Timers
InactiveLimit=0
KillWait=30
MinJobAge=300
SlurmctldTimeout=120
SlurmdTimeout=300
Waittime=0

# --- Scheduling
SchedulerType=sched/backfill
SelectType=select/cons_tres

# --- Accounting (served by slurmdbd, see slurmdbd.conf)
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=shenxin-a
AccountingStoragePort=6819
AccountingStoreFlags=job_comment,job_env,job_extra,job_script
JobCompType=jobcomp/none
JobAcctGatherType=jobacct_gather/linux
JobAcctGatherFrequency=30

# --- Logging
SlurmctldDebug=info
SlurmctldLogFile=/work/run/projects/bio-24/.local/var/log/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/work/run/projects/bio-24/.local/var/log/slurmd.log

# --- Compute nodes and partitions
# CPUs = Sockets * CoresPerSocket * ThreadsPerCore = 2 * 128 * 2 = 512
NodeName=shenxin-a CPUs=512 RealMemory=900000 Sockets=2 CoresPerSocket=128 ThreadsPerCore=2 State=UNKNOWN
PartitionName=rnaseq Nodes=ALL Default=YES MaxTime=INFINITE State=UP