下载数据

# Create the shared data directory if it does not exist, then work inside it.
mkdir -p /shared/data && cd /shared/data
# Use this download URL when working in the AWS China regions.
wget https://hpc-workshop-data.s3.cn-northwest-1.amazonaws.com.cn/conus_2.5km_v4.tar.gz
# Unpack the gzip-compressed archive in place; -v prints each extracted path.
tar -xzvf conus_2.5km_v4.tar.gz

输出

--2021-08-27 10:22:54--  https://hpc-workshop-data.s3.cn-northwest-1.amazonaws.com.cn/conus_2.5km_v4.tar.gz
Resolving hpc-workshop-data.s3.cn-northwest-1.amazonaws.com.cn (hpc-workshop-data.s3.cn-northwest-1.amazonaws.com.cn)... 52.82.189.21
Connecting to hpc-workshop-data.s3.cn-northwest-1.amazonaws.com.cn (hpc-workshop-data.s3.cn-northwest-1.amazonaws.com.cn)|52.82.189.21|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3700827783 (3.4G) [application/x-tar]
Saving to: ‘conus_2.5km_v4.tar.gz’
[ec2-user@ip-10-60-0-12 data]$ tar -xzvf conus_2.5km_v4.tar.gz
./conus_2.5km_v4/
./conus_2.5km_v4/VEGPARM.TBL
./conus_2.5km_v4/RRTMG_SW_DATA_DBL
./conus_2.5km_v4/ozone.formatted
./conus_2.5km_v4/namelist.output
./conus_2.5km_v4/LANDUSE.TBL
./conus_2.5km_v4/qr_acr_qgV2.dat
./conus_2.5km_v4/namelist.input
./conus_2.5km_v4/RRTMG_LW_DATA
./conus_2.5km_v4/RRTMG_SW_DATA
./conus_2.5km_v4/RRTM_DATA_DBL
./conus_2.5km_v4/MPTABLE.TBL
./conus_2.5km_v4/wrfinput_d01

建立wrf.exe 软连接

# Symlink the wrf.exe binary from the WRF-4.2.2 build tree into the extracted conus_2.5km_v4 data directory.
ln -s /shared/wrf-arm/WRF-4.2.2/main/wrf.exe /shared/data/conus_2.5km_v4/wrf.exe

编写wrf.sbatch

#!/bin/bash
# Slurm batch script that launches ./wrf.exe through mpirun from
# /shared/data/conus_2.5km_v4.
# Requests 2 nodes, 8 tasks per node and 8 CPUs per task; the job's output
# is written to /shared/slurm.out.
# Exit status: 1 if the case directory cannot be entered, otherwise the
# exit status of mpirun.
#SBATCH --wait-all-nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=8
#SBATCH --nodes=2
#SBATCH --ntasks-per-core=1
#SBATCH --export=ALL
#SBATCH --exclusive
#SBATCH -o /shared/slurm.out

#ENV VARIABLES#

#---------------------Run-time env-----------------------------------------
ulimit -s unlimited

# OpenMP settings; the thread count matches --cpus-per-task above.
export OMP_STACKSIZE=12G
export OMP_NUM_THREADS=8
# NOTE(review): KMP_AFFINITY is an Intel/LLVM OpenMP runtime variable;
# confirm it has any effect with the GCC toolchain configured below.
export KMP_AFFINITY=scatter,verbose

export PATH=/shared/gcc-10.2.0/bin:$PATH
# Append the previous value only when it is non-empty: a trailing ':' would
# leave a zero-length entry, which the dynamic loader treats as the current
# working directory.
export LD_LIBRARY_PATH=/shared/gcc-10.2.0/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

export PATH=/shared/wrf-arm/bin:$PATH
export LD_LIBRARY_PATH=/shared/wrf-arm/lib:$LD_LIBRARY_PATH

# The install prefix has to resolve to /shared/wrf-arm, the same tree that
# the PATH and LD_LIBRARY_PATH entries above point at.
wrf_root=/shared
wrf_install=${wrf_root}/wrf-arm
export LD_LIBRARY_PATH=${wrf_install}/netcdf/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=${wrf_install}/pnetcdf/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=${wrf_install}/hdf5/lib:$LD_LIBRARY_PATH

#--------------------------------------------------------------------------
echo "Running WRF on $(date)"
# Abort instead of launching mpirun from the wrong directory.
cd /shared/data/conus_2.5km_v4/ || exit 1

# stdout and stderr of the run are appended to wrf.out; remember the exit
# status so the bookkeeping below still runs and the job can report it.
mpirun_status=0
mpirun --report-bindings ./wrf.exe &>> wrf.out || mpirun_status=$?

echo "nstasks=${SLURM_NTASKS}"
# Append a UTC timestamp marking the end of this run.
date -u +%Y-%m-%d_%H:%M:%S >> wrf.times

# Propagate the mpirun result so a failed launch is not reported as success.
exit "$mpirun_status"

提交任务

sbatch wrf.sbatch

也可以使用 qsub 提交任务，并使用 qstat 查看任务

qsub wrf.sbatch

输出

3

qstat

输出

Job id              Name             Username        Time Use S Queue
------------------- ---------------- --------------- -------- - ---------------
2                   wrf.sbatch       ec2-user        365:00:0 R compute
3                   wrf.sbatch       ec2-user        365:00:0 R compute

查看运行日志

tail -f /shared/data/conus_2.5km_v4/rsl.out.0000

输出

*************************************
Domain # 1: dx =  2500.000 m
WRF V4.2.2 MODEL
 *************************************
 Parent domain
 ids,ide,jds,jde            1        1901           1        1301
 ims,ime,jms,jme           -4        1906          -4        1306
 ips,ipe,jps,jpe            1        1901           1        1301
 *************************************
DYNAMICS OPTION: Eulerian Mass Coordinate

查看测试结果

more wrf.out

输出（wrf.out 以追加方式（&>>）写入，因此文件开头可能保留之前运行失败时的报错信息）

--------------------------------------------------------------------------
mpirun was unable to launch the specified application as it could not access
or execute an executable:

Executable: ./wrf.exe
Node: compute-dy-c6g16xlarge-2

while attempting to start process rank 0.
--------------------------------------------------------------------------

--------------------------------------------------------------------------
mpirun was unable to launch the specified application as it could not access
or execute an executable:

Executable: ./wrf.exe
Node: compute-dy-c6g16xlarge-1

while attempting to start process rank 0.
--------------------------------------------------------------------------
16 total processes failed to start
--------------------------------------------------------------------------
mpirun was unable to launch the specified application as it could not access
or execute an executable:

Executable: ./wrf.exe
Node: compute-dy-c6g16xlarge-1

while attempting to start process rank 0.
--------------------------------------------------------------------------
16 total processes failed to start
[compute-dy-c6g16xlarge-1:04678] MCW rank 0 bound to socket 0[core 0[hwt 0]]: [B/././././././././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-1:04678] MCW rank 1 bound to socket 0[core 1[hwt 0]]: [./B/./././././././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-1:04678] MCW rank 2 bound to socket 0[core 2[hwt 0]]: [././B/././././././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-1:04678] MCW rank 3 bound to socket 0[core 3[hwt 0]]: [./././B/./././././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-1:04678] MCW rank 4 bound to socket 0[core 4[hwt 0]]: [././././B/././././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-1:04678] MCW rank 5 bound to socket 0[core 5[hwt 0]]: [./././././B/./././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-1:04678] MCW rank 6 bound to socket 0[core 6[hwt 0]]: [././././././B/././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-1:04678] MCW rank 7 bound to socket 0[core 7[hwt 0]]: [./././././././B/./././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-2:04637] MCW rank 8 bound to socket 0[core 0[hwt 0]]: [B/././././././././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-2:04637] MCW rank 9 bound to socket 0[core 1[hwt 0]]: [./B/./././././././././././././././././././././././././././././././
././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-2:04637] MCW rank 10 bound to socket 0[core 2[hwt 0]]: [././B/./././././././././././././././././././././././././././././.
/././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-2:04637] MCW rank 11 bound to socket 0[core 3[hwt 0]]: [./././B/././././././././././././././././././././././././././././.
/././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-2:04637] MCW rank 12 bound to socket 0[core 4[hwt 0]]: [././././B/./././././././././././././././././././././././././././.
/././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-2:04637] MCW rank 13 bound to socket 0[core 5[hwt 0]]: [./././././B/././././././././././././././././././././././././././.
/././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-2:04637] MCW rank 14 bound to socket 0[core 6[hwt 0]]: [././././././B/./././././././././././././././././././././././././.
/././././././././././././././././././././././././././././././.]
[compute-dy-c6g16xlarge-2:04637] MCW rank 15 bound to socket 0[core 7[hwt 0]]: [./././././././B/././././././././././././././././././././././././.
/././././././././././././././././././././././././././././././.]
 starting wrf task            6  of           16
 starting wrf task            1  of           16
 starting wrf task            2  of           16
 starting wrf task            5  of           16
 starting wrf task            3  of           16
 starting wrf task            4  of           16
 starting wrf task            7  of           16
 starting wrf task            0  of           16
 starting wrf task           14  of           16
 starting wrf task           10  of           16
 starting wrf task           12  of           16
 starting wrf task           13  of           16
 starting wrf task           15  of           16
 starting wrf task            8  of           16
 starting wrf task           11  of           16
 starting wrf task            9  of           16

查看计算节点

scontrol show nodes

输出


NodeName=compute-dy-c6g16xlarge-1 Arch=aarch64 CoresPerSocket=1
   CPUAlloc=64 CPUTot=64 CPULoad=42.22
   AvailableFeatures=dynamic,c6g.16xlarge,default
   ActiveFeatures=dynamic,c6g.16xlarge,default
   Gres=(null)
   NodeAddr=10.60.2.153 NodeHostName=compute-dy-c6g16xlarge-1 Version=20.11.7
   OS=Linux 4.14.243-185.433.amzn2.aarch64 #1 SMP Mon Aug 9 05:56:00 UTC 2021
   RealMemory=1 AllocMem=0 FreeMem=65485 Sockets=64 Boards=1
   State=ALLOCATED+CLOUD ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=compute
   BootTime=2021-08-27T10:37:28 SlurmdStartTime=2021-08-27T10:39:53
   CfgTRES=cpu=64,mem=1M,billing=64
   AllocTRES=cpu=64
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   Comment=(null)

NodeName=compute-dy-c6g16xlarge-2 Arch=aarch64 CoresPerSocket=1
   CPUAlloc=64 CPUTot=64 CPULoad=44.31
   AvailableFeatures=dynamic,c6g.16xlarge,default
   ActiveFeatures=dynamic,c6g.16xlarge,default
   Gres=(null)
   NodeAddr=10.60.2.59 NodeHostName=compute-dy-c6g16xlarge-2 Version=20.11.7
   OS=Linux 4.14.243-185.433.amzn2.aarch64 #1 SMP Mon Aug 9 05:56:00 UTC 2021
   RealMemory=1 AllocMem=0 FreeMem=82115 Sockets=64 Boards=1
   State=ALLOCATED+CLOUD ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=compute
   BootTime=2021-08-27T10:38:26 SlurmdStartTime=2021-08-27T10:40:10
   CfgTRES=cpu=64,mem=1M,billing=64
   AllocTRES=cpu=64
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   Comment=(null)

NodeName=compute-dy-c6g16xlarge-3 Arch=aarch64 CoresPerSocket=1
   CPUAlloc=0 CPUTot=64 CPULoad=0.07
   AvailableFeatures=dynamic,c6g.16xlarge,default
   ActiveFeatures=dynamic,c6g.16xlarge,default
   Gres=(null)
   NodeAddr=compute-dy-c6g16xlarge-3 NodeHostName=compute-dy-c6g16xlarge-3 Version=20.11.7
   OS=Linux 4.14.243-185.433.amzn2.aarch64 #1 SMP Mon Aug 9 05:56:00 UTC 2021
   RealMemory=1 AllocMem=0 FreeMem=125889 Sockets=64 Boards=1
   State=IDLE+CLOUD+POWER ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=compute
   BootTime=None SlurmdStartTime=None
   CfgTRES=cpu=64,mem=1M,billing=64
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   Comment=(null)

NodeName=compute-dy-c6g16xlarge-9 CoresPerSocket=1
   CPUAlloc=0 CPUTot=64 CPULoad=N/A
   AvailableFeatures=dynamic,c6g.16xlarge,default
   ActiveFeatures=dynamic,c6g.16xlarge,default
   Gres=(null)
   NodeAddr=compute-dy-c6g16xlarge-9 NodeHostName=compute-dy-c6g16xlarge-9
   RealMemory=1 AllocMem=0 FreeMem=N/A Sockets=64 Boards=1
   State=IDLE+CLOUD+POWER ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=compute
   BootTime=None SlurmdStartTime=None
   CfgTRES=cpu=64,mem=1M,billing=64
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   Comment=(null)

NodeName=compute-st-c6g16xlarge-1 Arch=aarch64 CoresPerSocket=1
   CPUAlloc=0 CPUTot=64 CPULoad=0.00
   AvailableFeatures=static,c6g.16xlarge,default
   ActiveFeatures=static,c6g.16xlarge,default
   Gres=(null)
   NodeAddr=10.60.2.154 NodeHostName=compute-st-c6g16xlarge-1 Version=20.11.7
   OS=Linux 4.14.243-185.433.amzn2.aarch64 #1 SMP Mon Aug 9 05:56:00 UTC 2021
   RealMemory=1 AllocMem=0 FreeMem=125648 Sockets=64 Boards=1
   State=IDLE+CLOUD ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=compute
   BootTime=2021-08-27T08:39:44 SlurmdStartTime=2021-08-27T08:41:52
   CfgTRES=cpu=64,mem=1M,billing=64
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   Comment=(null)