- Each node on Hamsi has a 1.6 TB NVMe scratch disk. Considering how busy the InfiniBand network in TRUBA is, this is an invaluable resource. This script is designed to run the calculation entirely on that local disk. IT IS NOT SUITABLE FOR MULTIPLE NODES.
- The script asks for a single node exclusively. It is a bit redundant, but I have both the `--exclusive` flag and the `--nodes` flag in this script.
- The rest is my usual script with the 10-minute emergency save function. I have compiled Quantum Espresso in the usual manner using the Intel compilers and wrote a module for it.
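
For reference, this is roughly how such a build can be configured with the oneAPI toolchain. The configure flags and compiler names below are an illustrative sketch, not a record of the actual build used for the module:

# Hedged sketch: building Quantum Espresso with the Intel oneAPI compilers, MPI and MKL
module load compiler/latest mkl/latest mpi/latest
./configure MPIF90=mpiifort F90=ifort CC=icc --with-scalapack=intel
make -j 8 pw

The full batch script is below.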
#!/bin/bash -l
################### Quantum Espresso Job Batch Script Example ###################
# Section for defining queue-system variables:
#-------------------------------------
# SLURM-section
#SBATCH --partition=hamsi
#SBATCH --ntasks=56
#SBATCH --exclusive
#SBATCH --nodes 1-1
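## "--nodes 1-1" sets both the minimum and the maximum node count to 1, i.e. exactly one node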
#SBATCH --job-name=hamsi-qe
#SBATCH --time=3-00:00:00
## asks SLURM to send the USR1 signal 10 minutes before the end of the time limit
#SBATCH --signal=B:USR1@600
#SBATCH --output=%x-%j.out
#SBATCH --error=%x-%j.err
###########################################################
# This section is for defining job variables and settings
# that need to be defined before running the job
###########################################################
#name of the input file
base_name="H2TPP-kanoetal-GAUPBE"
scratch_base=/tmp/${USER}/
# For efficient hybrid GPU/CPU use, use k-point parallelization and OpenMP threads
pw_executable="pw.x"
## Gotchas:
# Hybrid functionals are not available with k-point pools
# The GPU version supports only one task group (ntg); always set it to 1
# -ndiag (parallel diagonalization) is not implemented for GPUs yet
pw_parameters="-npool 1 -inp"
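# "-inp" takes the input file name, which is appended on the mpirun line further below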
NPROC=56
export OMP_NUM_THREADS=1
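# With OMP_NUM_THREADS=1 this is a pure-MPI run: NPROC MPI ranks and no OpenMP threading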
# We load all the default program system settings with module load:
module --quiet purge
module load compiler/latest mkl/latest mpi/latest hamsi/Q-E/oneapi/7.1git-oneapi22
# You may check other available versions with "module avail q-e"
# A unique file tag for the created files
file_tag=$( date +"%d%m%y-%H%M" )
# Define and create a unique local scratch directory for this job. Remember, this will
# make your job run much faster!
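# On Hamsi, /tmp sits on the node-local 1.6 TB NVMe disk, so all heavy I/O stays off the shared filesystem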
SCRATCH_DIRECTORY=${scratch_base}/${base_name}/
mkdir -p ${SCRATCH_DIRECTORY}
echo "SCRATCH is at ${SCRATCH_DIRECTORY}"
cd ${SCRATCH_DIRECTORY}
pwd
# You can copy everything you need to the scratch directory
# ${SLURM_SUBMIT_DIR} points to the path where this script was submitted from
echo "copying pseudo dir to ${SCRATCH_DIRECTORY}"
rsync -arp ${SLURM_SUBMIT_DIR}/pseudo .
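# (the input file is assumed to set pseudo_dir = './pseudo' so pw.x finds the copied pseudopotentials)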
ls -l pseudo
echo "copying in.${base_name} to ${SCRATCH_DIRECTORY}"
cp ${SLURM_SUBMIT_DIR}/in.${base_name} .
ls -ltrh
df -h
#############################################################################
# This section is about collecting the results from the local scratch back to
# where the job was run. Make sure that you have enough quota, if you want to
# collect the wave functions as well!
#############################################################################
# define the handler function
# note that this is not executed here, but rather
# when the associated signal is sent
cleanup_function()
{
cd ${SCRATCH_DIRECTORY}
echo "function cleanup_function called at $(date)"
tar cvf results-${SLURM_JOB_ID}-$file_tag.tar *
gzip results-${SLURM_JOB_ID}-$file_tag.tar
du -sh results-${SLURM_JOB_ID}-$file_tag.tar.gz
cp results-${SLURM_JOB_ID}-$file_tag.tar.gz ${SLURM_SUBMIT_DIR}
}
# call cleanup_function once we receive USR1 signal
trap 'cleanup_function' USR1
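# SLURM sends USR1 ten minutes before the time limit (see --signal above), so partial
# results are archived back to the submit directory even if the walltime runs out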
###############################################################################
# This section actually runs the job. It needs to be after the previous two
# sections
#################################################################################
echo "starting calculation at $(date)"
# First, let's go to the local scratch directory
cd ${SCRATCH_DIRECTORY}
# Running the program:
# the "&" after the compute step and "wait" are important for the cleanup process
# the "tee" is used to mirror the output to the slurm output, so that you can follow
# the job progress more easily
run_line="mpirun -np ${NPROC} ${pw_executable} ${pw_parameters} ${SLURM_SUBMIT_DIR}/in.${base_name} > ${SLURM_SUBMIT_DIR}/out.${base_name}-${file_tag}_${SLURM_JOB_ID} &"
echo $run_line
eval $run_line
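# Poll every 3 minutes while the backgrounded pw.x is still in the shell's job table,
# printing some lightweight node statistics on each pass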
while jobs %% &>/dev/null
do
date
echo "----------memory-------------------"
free -h
echo "----------GPU-------------------"
command -v nvidia-smi &> /dev/null && nvidia-smi || echo "no nvidia-smi on this node"
echo "----------CPU-------------------"
mpstat
sleep 180
done
echo "Job finished at"
date
cleanup_function
################### Job Ended ###################
exit 0
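
Usage is the usual sbatch workflow. Assuming the script above is saved as, say, job-hamsi-qe.sh (the file name is up to you), something like:

# submit from the directory that contains in.H2TPP-kanoetal-GAUPBE and the pseudo/ directory
sbatch job-hamsi-qe.sh
# check the queue, then follow the output that pw.x writes back to the submit directory
squeue -u $USER
tail -f out.H2TPP-kanoetal-GAUPBE-*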