I am new to Singularity, and here is my attempt at using NVIDIA's Singularity containers for Quantum Espresso:
NVIDIA NGC has a set of containers for Quantum Espresso here.
First, build one of the images as provided there and create a local image:
singularity build qe_6.6a1.sif docker://nvcr.io/hpc/quantum_espresso:v6.6a1
This will create an image called qe_6.6a1.sif.
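Before running anything heavy, it can be worth a quick sanity check that the image works and can see a GPU. This is a minimal sketch using standard Singularity commands; run it on a machine (or node) that actually has an NVIDIA GPU:
# Show the metadata baked into the image
singularity inspect qe_6.6a1.sif
# Check that the container sees the host GPU driver (the --nv flag is required)
singularity exec --nv qe_6.6a1.sif nvidia-smi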
Note: I am using qe_6.6a1 since it seems to be the one that has been used in the advertised benchmarks (see here and here). This version is not the same as the standard Quantum Espresso: the standard Quantum Espresso is exclusively OpenACC, whereas this one uses "old-style" CUDA kernels.
The created image can be run in an MPI environment following the suggestion found here.
singularity run --nv -B <local_directory>:/host_pwd --pwd /host_pwd <location of images>/qe_6.6a1.sif pw.x -npool 1 -ndiag 1 -ntg 1 -inp in.pw 2>&1 | tee out.pw
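For this to work, <local_directory> has to contain the input file and anything it references through relative paths; in the batch script further down I keep the pseudopotentials in a pseudo subdirectory next to the input. A minimal sketch of that layout (directory and file names are just placeholders):
mkdir -p run_dir/pseudo
cp in.pw run_dir/
cp <pseudopotential files referenced by in.pw> run_dir/pseudo/
# then bind run_dir as /host_pwd in the singularity run command above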
You can check the available MPI transports compiled into Slurm on your HPC system with
srun --mpi=list
On the system I am using, pmi2, cray_shasta, pmix_v3, and pmix are supported.
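If you launch the container through srun rather than mpirun, you can pick one of these transports explicitly. A hedged sketch, assuming pmix matches the MPI stack inside the container and that two tasks are wanted:
srun --mpi=pmix -n 2 singularity run --nv -B <local_directory>:/host_pwd --pwd /host_pwd <location of images>/qe_6.6a1.sif pw.x -npool 1 -ndiag 1 -ntg 1 -inp in.pw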
The way I ask for an interactive node is this:
srun --partition=gpu --job-name <name> --gres=gpu:1 --mem 20G --ntasks-per-node=2 --time 01:00:00 -N 1 --pty bash
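Once the interactive shell comes up, a quick check that both the node and the container can see the GPU (nothing QE-specific, just the same image as above):
nvidia-smi
singularity exec --nv <location of images>/qe_6.6a1.sif nvidia-smi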
The full SLURM script I used is below:
#!/bin/bash
################### Quantum Espresso Job Batch Script Example ###################
# Section for defining queue-system variables:
#-------------------------------------
# SLURM-section
#SBATCH --partition=<part>
#SBATCH --nodes=<no nodes>
#SBATCH --gres=gpu:<no gpus>
#SBATCH --mem-per-cpu=<ram>
#SBATCH --ntasks-per-node=<cpus per node>
#SBATCH --job-name=<name>
#SBATCH --time=10:00:00
## asks SLURM to send the USR1 signal 10 minutes before the end of the time limit
#SBATCH --signal=B:USR1@600
#SBATCH --output=%x-%j.out
#SBATCH --error=%x-%j.err
###########################################################
# This section is for defining job variables and settings
# that need to be defined before running the job
###########################################################
#Number of actual processors to be used and threads
NPROC=2
export OMP_NUM_THREADS=1
#### parameters for the Q-E binary ###
### Gotchas:
# Hybrid functionals are not available with pools
# GPU runs only support a single task group; always set -ntg to 1
# -ndiag is not yet implemented for GPUs, so keep it at 1
pw_parameters="-npool 1 -ndiag 1 -ntg 1 "
# A unique file tag for the created files
file_tag=$( date +"%d%m%y-%H%M" )
# We load all the default program system settings with module load:
module load NVHPC/22.7
# You may check other available versions with "module avail NVHPC"
#name of the input file
base_name=<name of the input file without the "in." prefix>
if [ -z ${WORKDIR+x} ]; then
scratch_base=/scratch/slurm/$SLURM_JOB_ID
echo "WORKDIR was not defined, scratch base is $scratch_base"
else
scratch_base=$WORKDIR
echo "using WORKDIR=$WORKDIR"
fi
clean_scratch=1
singularity_image="<location of images>/qe_6.6a1.sif"
pw_executable="pw.x"
#################Directory handling##########################################################
# Define and create a unique local scratch directory for this job. Remember, this will
# make your job run much faster!
SCRATCH_DIRECTORY=${scratch_base}/${base_name}/
mkdir -p ${SCRATCH_DIRECTORY}
if [ -d ${SCRATCH_DIRECTORY} ]; then
echo "SCRATCH is at ${SCRATCH_DIRECTORY}"
else
SCRATCH_DIRECTORY=$SLURM_SUBMIT_DIR/${base_name}-tmp/
mkdir -p ${SCRATCH_DIRECTORY}
echo "unable to access ${scratch_base}/${base_name}/, using ${SCRATCH_DIRECTORY} instead"
fi
cd ${SCRATCH_DIRECTORY}
pwd
if [ -z ${clean_scratch+x} ]; then
echo "Not cleaning scratch, the current contents are:"
ls -alh
else
echo "Cleaning scratch"
rm -rf ${SCRATCH_DIRECTORY}/*
fi
# You can copy everything you need to the scratch directory
# ${SLURM_SUBMIT_DIR} points to the path where this script was submitted from
echo "copying pseudo dir to ${SCRATCH_DIRECTORY}"
rsync -arp ${SLURM_SUBMIT_DIR}/pseudo .
ls -l pseudo
echo "copying in.${base_name} to ${SCRATCH_DIRECTORY}"
cp ${SLURM_SUBMIT_DIR}/in.${base_name} .
ls -ltrh
df -h
RUNNER="singularity run --nv '-B${SCRATCH_DIRECTORY}:/host_pwd' --pwd /host_pwd ${singularity_image} "
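# Note: RUNNER expands to something like
#   singularity run --nv '-B<scratch dir>:/host_pwd' --pwd /host_pwd <image>
# i.e. the scratch directory is mounted as /host_pwd and used as the working
# directory inside the container; the actual pw.x command line is appended
# to it further down.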
#############################################################################
# This section is about collecting the results from the local scratch back to
# where the job was run. Make sure that you have enough quota, if you want to
# collect the wave functions as well!
#############################################################################
# define the handler function
# note that this is not executed here, but rather
# when the associated signal is sent
cleanup_function()
{
# Attempt to exit MPS server
run_line="srun --ntasks-per-node=1 /bin/bash -c 'echo quit | nvidia-cuda-mps-control || true'"
echo $run_line
eval $run_line
cd ${SCRATCH_DIRECTORY}
echo "function cleanup_function called at $(date)"
tar cvf results-${SLURM_JOB_ID}-$file_tag.tar *
gzip results-${SLURM_JOB_ID}-$file_tag.tar
du -sh results-${SLURM_JOB_ID}-$file_tag.tar.gz
cp results-${SLURM_JOB_ID}-$file_tag.tar.gz ${SLURM_SUBMIT_DIR}
}
# call cleanup_function once we receive USR1 signal
trap 'cleanup_function' USR1
###############################################################################
# This section actually runs the job. It needs to be after the previous two
# sections
#################################################################################
echo "starting calculation at $(date)"
# First, let's go to the local scratch directory
cd ${SCRATCH_DIRECTORY}
# Running the program:
# Start the CUDA MPS server on each node if more than one task will share the GPU
if (( SLURM_CPUS_PER_GPU > 1 )); then
run_line="srun --ntasks-per-node=1 /bin/bash -c 'nvidia-cuda-mps-control -d; sleep infinity' &"
echo $run_line
eval $run_line
fi
run_line="${RUNNER} mpirun ${pw_executable} ${pw_parameters} -input in.${base_name} 2>&1 | tee out.${base_name}-${file_tag}_${SLURM_JOB_ID} &"
echo $run_line
start_time="$(date -u +%s)"
eval $run_line
# Remember the PID of the background pipeline so that the monitoring loop
# below stops when pw.x finishes (jobs %% would also match the MPS helper job)
pw_pid=$!
while kill -0 ${pw_pid} 2>/dev/null
do
date
echo "----------memory-------------------"
free -h
echo "----------GPU-------------------"
nvidia-smi
echo "----------CPU-------------------"
mpstat
sleep 10
done
end_time="$(date -u +%s)"
elapsed="$(($end_time-$start_time))"
echo "Total of $elapsed seconds elapsed for process"
echo "Job finished at"
date
cleanup_function
################### Job Ended ###################
exit 0
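The script is submitted and monitored in the usual way (the script file name here is just an example):
sbatch qe_gpu_singularity.slurm
squeue -u $USER
# progress ends up in the %x-%j.out file defined above, e.g.
tail -f <name>-<jobid>.out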