下载数据
mkdir -p /shared/data && cd /shared/data
#AWS中国区域请下载这个
wget https://hpc-workshop-data.s3.cn-northwest-1.amazonaws.com.cn/conus_2.5km_v4.tar.gz
tar -xzvf conus_2.5km_v4.tar.gz
输出
--2021-08-27 10:22:54-- https://hpc-workshop-data.s3.cn-northwest-1.amazonaws.com.cn/conus_2.5km_v4.tar.gz
Resolving hpc-workshop-data.s3.cn-northwest-1.amazonaws.com.cn (hpc-workshop-data.s3.cn-northwest-1.amazonaws.com.cn)... 52.82.189.21
Connecting to hpc-workshop-data.s3.cn-northwest-1.amazonaws.com.cn (hpc-workshop-data.s3.cn-northwest-1.amazonaws.com.cn)|52.82.189.21|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3700827783 (3.4G) [application/x-tar]
Saving to: ‘conus_2.5km_v4.tar.gz’
[ec2-user@ip-10-60-0-12 data]$ tar -xzvf conus_2.5km_v4.tar.gz
./conus_2.5km_v4/
./conus_2.5km_v4/VEGPARM.TBL
./conus_2.5km_v4/RRTMG_SW_DATA_DBL
./conus_2.5km_v4/ozone.formatted
./conus_2.5km_v4/namelist.output
./conus_2.5km_v4/LANDUSE.TBL
./conus_2.5km_v4/qr_acr_qgV2.dat
./conus_2.5km_v4/namelist.input
./conus_2.5km_v4/RRTMG_LW_DATA
./conus_2.5km_v4/RRTMG_SW_DATA
./conus_2.5km_v4/RRTM_DATA_DBL
./conus_2.5km_v4/MPTABLE.TBL
./conus_2.5km_v4/wrfinput_d01
建立 wrf.exe 软链接
ln -s /shared/wrf-arm/WRF-4.2.2/main/wrf.exe /shared/data/conus_2.5km_v4/wrf.exe
编写wrf.sbatch
#!/bin/bash
# Slurm batch script that runs ./wrf.exe from /shared/data/conus_2.5km_v4
# under mpirun as a hybrid MPI + OpenMP job.
#
# Layout requested below: 2 nodes x 8 MPI tasks per node (16 ranks in total),
# with 8 CPUs reserved per task to match OMP_NUM_THREADS=8.
#
# Outputs:
#   /shared/slurm.out  - output of this script itself (see "#SBATCH -o")
#   wrf.out            - stdout + stderr of mpirun/wrf.exe, appended per run
#   wrf.times          - one UTC timestamp appended when each run finishes
# Exit status: the exit status of mpirun, so a failed launch is reported to
# Slurm as a failed job instead of being masked by the trailing commands.
#SBATCH --wait-all-nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=8
#SBATCH --nodes=2
#SBATCH --ntasks-per-core=1
#SBATCH --export=ALL
#SBATCH --exclusive
#SBATCH -o /shared/slurm.out
#ENV VARIABLES#
#---------------------Run-time env-----------------------------------------
# Lift the stack-size limit for this shell and the processes it launches.
ulimit -s unlimited
export OMP_STACKSIZE=12G
# Keep in sync with "--cpus-per-task" above.
export OMP_NUM_THREADS=8
# NOTE(review): KMP_AFFINITY is an Intel OpenMP runtime setting; confirm it
# has any effect with the GCC toolchain put on PATH below.
export KMP_AFFINITY=scatter,verbose
export PATH=/shared/gcc-10.2.0/bin:$PATH
export LD_LIBRARY_PATH=/shared/gcc-10.2.0/lib64:$LD_LIBRARY_PATH
export PATH=/shared/wrf-arm/bin:$PATH
export LD_LIBRARY_PATH=/shared/wrf-arm/lib:$LD_LIBRARY_PATH
# NOTE(review): wrf_install resolves to /shared/wrf/wrf-arm, whereas the two
# exports above use /shared/wrf-arm. Confirm which prefix actually holds the
# netcdf, pnetcdf and hdf5 libraries.
wrf_root=/shared/wrf
wrf_install=${wrf_root}/wrf-arm
export LD_LIBRARY_PATH=${wrf_install}/netcdf/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=${wrf_install}/pnetcdf/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=${wrf_install}/hdf5/lib:$LD_LIBRARY_PATH
#--------------------------------------------------------------------------
echo "Running WRF on $(date)"
# Abort if the case directory is missing: otherwise mpirun would be started
# from the wrong directory and wrf.out / wrf.times would be written there.
cd /shared/data/conus_2.5km_v4/ || {
  echo "cannot cd to /shared/data/conus_2.5km_v4/" >&2
  exit 1
}
mpirun --report-bindings ./wrf.exe &>> wrf.out
# Remember mpirun's status now; the commands below would overwrite $?.
mpirun_status=$?
echo "nstasks=$SLURM_NTASKS"
date -u +%Y-%m-%d_%H:%M:%S >> wrf.times
exit "$mpirun_status"
提交任务
sbatch wrf.sbatch
也可以使用 qsub 提交任务
qsub wrf.sbatch
输出
3
查看任务
qstat
输出
Job id Name Username Time Use S Queue
------------------- ---------------- --------------- -------- - ---------------
2 wrf.sbatch ec2-user 365:00:0 R compute
3 wrf.sbatch ec2-user 365:00:0 R compute
tail -f /shared/data/conus_2.5km_v4/rsl.out.0000
输出
*************************************
Domain # 1: dx = 2500.000 m
WRF V4.2.2 MODEL
*************************************
Parent domain
ids,ide,jds,jde 1 1901 1 1301
ims,ime,jms,jme -4 1906 -4 1306
ips,ipe,jps,jpe 1 1901 1 1301
*************************************
DYNAMICS OPTION: Eulerian Mass Coordinate
查看测试结果
more wrf.out
输出
--------------------------------------------------------------------------
mpirun was unable to launch the specified application as it could not access
or execute an executable:
Executable: ./wrf.exe
Node: compute-dy-c6g16xlarge-2
while attempting to start process rank 0.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun was unable to launch the specified application as it could not access
or execute an executable:
Executable: ./wrf.exe
Node: compute-dy-c6g16xlarge-1
while attempting to start process rank 0.
--------------------------------------------------------------------------
16 total processes failed to start
--------------------------------------------------------------------------
mpirun was unable to launch the specified application as it could not access
or execute an executable:
Executable: ./wrf.exe
Node: compute-dy-c6g16xlarge-1
while attempting to start process rank 0.
--------------------------------------------------------------------------
16 total processes failed to start
[compute-dy-c6g16xlarge-1:04678] MCW rank 0 bound to socket 0[core 0[hwt 0]]: [B/././././././././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-1:04678] MCW rank 1 bound to socket 0[core 1[hwt 0]]: [./B/./././././././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-1:04678] MCW rank 2 bound to socket 0[core 2[hwt 0]]: [././B/././././././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-1:04678] MCW rank 3 bound to socket 0[core 3[hwt 0]]: [./././B/./././././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-1:04678] MCW rank 4 bound to socket 0[core 4[hwt 0]]: [././././B/././././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-1:04678] MCW rank 5 bound to socket 0[core 5[hwt 0]]: [./././././B/./././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-1:04678] MCW rank 6 bound to socket 0[core 6[hwt 0]]: [././././././B/././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-1:04678] MCW rank 7 bound to socket 0[core 7[hwt 0]]: [./././././././B/./././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-2:04637] MCW rank 8 bound to socket 0[core 0[hwt 0]]: [B/././././././././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-2:04637] MCW rank 9 bound to socket 0[core 1[hwt 0]]: [./B/./././././././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-2:04637] MCW rank 10 bound to socket 0[core 2[hwt 0]]: [././B/./././././././././././././././././././././././././././././.
/././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-2:04637] MCW rank 11 bound to socket 0[core 3[hwt 0]]: [./././B/././././././././././././././././././././././././././././.
/././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-2:04637] MCW rank 12 bound to socket 0[core 4[hwt 0]]: [././././B/./././././././././././././././././././././././././././.
/././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-2:04637] MCW rank 13 bound to socket 0[core 5[hwt 0]]: [./././././B/././././././././././././././././././././././././././.
/././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-2:04637] MCW rank 14 bound to socket 0[core 6[hwt 0]]: [././././././B/./././././././././././././././././././././././././.
/././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-2:04637] MCW rank 15 bound to socket 0[core 7[hwt 0]]: [./././././././B/././././././././././././././././././././././././.
/././././././././././././././././././././././././././././././.]
starting wrf task 6 of 16
starting wrf task 1 of 16
starting wrf task 2 of 16
starting wrf task 5 of 16
starting wrf task 3 of 16
starting wrf task 4 of 16
starting wrf task 7 of 16
starting wrf task 0 of 16
starting wrf task 14 of 16
starting wrf task 10 of 16
starting wrf task 12 of 16
starting wrf task 13 of 16
starting wrf task 15 of 16
starting wrf task 8 of 16
starting wrf task 11 of 16
starting wrf task 9 of 16
查看计算节点
scontrol show nodes
输出
NodeName=compute-dy-c6g16xlarge-1 Arch=aarch64 CoresPerSocket=1
CPUAlloc=64 CPUTot=64 CPULoad=42.22
AvailableFeatures=dynamic,c6g.16xlarge,default
ActiveFeatures=dynamic,c6g.16xlarge,default
Gres=(null)
NodeAddr=10.60.2.153 NodeHostName=compute-dy-c6g16xlarge-1 Version=20.11.7
OS=Linux 4.14.243-185.433.amzn2.aarch64 #1 SMP Mon Aug 9 05:56:00 UTC 2021
RealMemory=1 AllocMem=0 FreeMem=65485 Sockets=64 Boards=1
State=ALLOCATED+CLOUD ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=compute
BootTime=2021-08-27T10:37:28 SlurmdStartTime=2021-08-27T10:39:53
CfgTRES=cpu=64,mem=1M,billing=64
AllocTRES=cpu=64
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
Comment=(null)
NodeName=compute-dy-c6g16xlarge-2 Arch=aarch64 CoresPerSocket=1
CPUAlloc=64 CPUTot=64 CPULoad=44.31
AvailableFeatures=dynamic,c6g.16xlarge,default
ActiveFeatures=dynamic,c6g.16xlarge,default
Gres=(null)
NodeAddr=10.60.2.59 NodeHostName=compute-dy-c6g16xlarge-2 Version=20.11.7
OS=Linux 4.14.243-185.433.amzn2.aarch64 #1 SMP Mon Aug 9 05:56:00 UTC 2021
RealMemory=1 AllocMem=0 FreeMem=82115 Sockets=64 Boards=1
State=ALLOCATED+CLOUD ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=compute
BootTime=2021-08-27T10:38:26 SlurmdStartTime=2021-08-27T10:40:10
CfgTRES=cpu=64,mem=1M,billing=64
AllocTRES=cpu=64
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
Comment=(null)
NodeName=compute-dy-c6g16xlarge-3 Arch=aarch64 CoresPerSocket=1
CPUAlloc=0 CPUTot=64 CPULoad=0.07
AvailableFeatures=dynamic,c6g.16xlarge,default
ActiveFeatures=dynamic,c6g.16xlarge,default
Gres=(null)
NodeAddr=compute-dy-c6g16xlarge-3 NodeHostName=compute-dy-c6g16xlarge-3 Version=20.11.7
OS=Linux 4.14.243-185.433.amzn2.aarch64 #1 SMP Mon Aug 9 05:56:00 UTC 2021
RealMemory=1 AllocMem=0 FreeMem=125889 Sockets=64 Boards=1
State=IDLE+CLOUD+POWER ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=compute
BootTime=None SlurmdStartTime=None
CfgTRES=cpu=64,mem=1M,billing=64
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
Comment=(null)
NodeName=compute-dy-c6g16xlarge-9 CoresPerSocket=1
CPUAlloc=0 CPUTot=64 CPULoad=N/A
AvailableFeatures=dynamic,c6g.16xlarge,default
ActiveFeatures=dynamic,c6g.16xlarge,default
Gres=(null)
NodeAddr=compute-dy-c6g16xlarge-9 NodeHostName=compute-dy-c6g16xlarge-9
RealMemory=1 AllocMem=0 FreeMem=N/A Sockets=64 Boards=1
State=IDLE+CLOUD+POWER ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=compute
BootTime=None SlurmdStartTime=None
CfgTRES=cpu=64,mem=1M,billing=64
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
Comment=(null)
NodeName=compute-st-c6g16xlarge-1 Arch=aarch64 CoresPerSocket=1
CPUAlloc=0 CPUTot=64 CPULoad=0.00
AvailableFeatures=static,c6g.16xlarge,default
ActiveFeatures=static,c6g.16xlarge,default
Gres=(null)
NodeAddr=10.60.2.154 NodeHostName=compute-st-c6g16xlarge-1 Version=20.11.7
OS=Linux 4.14.243-185.433.amzn2.aarch64 #1 SMP Mon Aug 9 05:56:00 UTC 2021
RealMemory=1 AllocMem=0 FreeMem=125648 Sockets=64 Boards=1
State=IDLE+CLOUD ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=compute
BootTime=2021-08-27T08:39:44 SlurmdStartTime=2021-08-27T08:41:52
CfgTRES=cpu=64,mem=1M,billing=64
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
Comment=(null)