sit/check_lmod_modules.sh

172 lines
5.3 KiB
Bash
Raw Permalink Normal View History

#!/bin/bash -l
#
# Copyright (c) 2022 Christoph Niethammer <niethammer@hlrs.de>
#
# Script checking an Lmod-based module environment for problems during module loading/unloading
#
# Print a one-line description of the script followed by its command line
# synopsis to stdout.
print_usage() {
  printf '%s\n' \
    "Script checking the module environment for problems during module loading/unloading" \
    "usage: $0 [MODULE_PATTERN] [--exclude PATTERN]"
}
# Command line options:
declare -a modulespecs=()      # check only the modules selected by these specs
declare -a exclude_patterns=() # skip modules containing one of these strings, e.g. nightly builds

#######################################
# Parse the command line into the global option arrays.
# Globals:   modulespecs (appended), exclude_patterns (appended)
# Arguments: the script's command line arguments ("$@")
# Outputs:   usage text on stdout for -h/--help; diagnostics on stderr
# Returns:   0 when all arguments were consumed; exits 0 for -h/--help and
#            exits 2 when --exclude/--exclude-pattern has no value
#######################################
parse_command_line() {
  while (( $# > 0 )); do
    case "$1" in
      --exclude-pattern|--exclude)
        # `shift 2` fails and shifts nothing when only one argument is left,
        # which would make this loop spin forever; reject the call instead.
        if (( $# < 2 )); then
          echo "ERROR: option '$1' requires an argument" >&2
          print_usage >&2
          exit 2
        fi
        # An empty string would be contained in every module name; drop it.
        if [[ -n "$2" ]]; then
          exclude_patterns+=("$2")
        fi
        shift 2
        ;;
      -h|--help)
        print_usage
        exit 0
        ;;
      *)
        # Everything else (including unknown options) is taken as a module spec.
        if [[ -n "$1" ]]; then
          modulespecs+=("$1")
        fi
        shift
        ;;
    esac
  done
}

parse_command_line "$@"
# Escape sequences used to colour the status messages. They are assigned only
# when stdout is a terminal; otherwise the variables are left untouched here.
if [[ -t 1 ]]; then
  IRed='\e[0;91m'     # Intense Red
  IGreen='\e[0;92m'   # Intense Green
  IMagenta='\e[0;95m' # magenta
  Color_Off='\e[0m'   # Text Reset
fi
# intermediate files, logfiles
: "${LOGDIR:=$PWD}" # honour LOGDIR from the environment, default to the current directory
mkdir -p -- "$LOGDIR" || { echo "ERROR: cannot create log directory '$LOGDIR'" >&2; exit 1; }

# Private, unpredictable scratch directory: a fixed name below /tmp is shared
# by concurrent runs of the same user and can be pre-created by others.
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/check_modules-${USER:-unknown}.XXXXXX") \
  || { echo "ERROR: cannot create temporary directory" >&2; exit 1; }
# Remove the scratch directory on every exit path, including early aborts.
trap 'rm -rf -- "$tmpdir"' EXIT

logfile="$LOGDIR/check_modules.log"               # logfile with detailed information
module_load_logfile="$tmpdir/.module_load.log"    # output of 'module load' commands
module_rm_logfile="$tmpdir/.module_rm.log"        # output of 'module rm' commands
module_clean_env_file="$tmpdir/.module_clean_env" # original environment

# save the original environment: dump the shell state to a file and store the
# currently loaded modules as collection "check_modules"
set > "$module_clean_env_file"
module save check_modules

# list of all failed modules
failed_modules=()

# Header: shown on the terminal and written to a freshly truncated logfile.
{
  echo "--------------------"
  echo "Module environment check"
  echo "--------------------"
  echo "Date: $(date)"
  echo "Host: $(/bin/hostname)"
  echo "USER: $USER"
  echo "Logfile: $logfile"
  echo "--------------------"
  echo Modulespecs: "${modulespecs[@]}"
  echo Exclude: "${exclude_patterns[@]}"
  echo "--------------------"
} | tee "$logfile"

# The full environment dump goes to the logfile only.
# NOTE(review): the dump contains every shell variable; make sure the logfile
# location is acceptable if the environment holds sensitive values.
{
  echo "Environment:"
  cat "$module_clean_env_file"
  echo "--------------------"
} >> "$logfile"
#######################################
# Print the first exclude pattern that is contained in a module name.
# Patterns are compared as literal substrings, not as regular expressions.
# Globals:   exclude_patterns (read)
# Arguments: $1 - module name as reported by 'module -t spider'
# Outputs:   the matching pattern on stdout
# Returns:   0 if a pattern matched, 1 otherwise
#######################################
matching_exclude_pattern() {
  local name=$1
  local pattern
  for pattern in "${exclude_patterns[@]}"; do
    if [[ -n "$pattern" && "$name" == *"$pattern"* ]]; then
      printf '%s\n' "$pattern"
      return 0 # first hit is enough; avoids repeated "Skipping" messages
    fi
  done
  return 1
}

#######################################
# Test whether the terse list of loaded modules mentions a module.
# Arguments: $1 - module name, matched as a fixed string (names contain
#                 regex metacharacters such as '.' and '+')
# Returns:   0 if the output of 'module -t li' contains $1, non-zero otherwise
#######################################
is_module_listed() {
  module -t li 2>&1 | grep -F -- "$1" > /dev/null
}

#######################################
# Load one module on top of one dependency line, unload it again and record
# the outcome; afterwards purge and restore the saved module collection.
# Globals:   logfile, module_load_logfile, module_rm_logfile (read),
#            IRed, IGreen, Color_Off (read), failed_modules (appended)
# Arguments: $1 - module name as printed by spider (may carry an alias suffix)
#            $2 - module name with the alias suffix removed
#            $3 - dependency line; may name several modules
# Outputs:   progress and result on stdout, details appended to $logfile
#######################################
check_module_with_dep() {
  local m_original=$1
  local m=$2
  local dep=$3

  echo -n "... with deps $dep ..."
  echo "... with deps $dep ..." >> "$logfile"
  # Deliberately unquoted: the line is split into the individual modules.
  # shellcheck disable=SC2086
  module load $dep >> "$logfile" 2>&1

  echo "module load $m" >> "$logfile"
  module load "$m" > "$module_load_logfile" 2>&1
  cat "$module_load_logfile" >> "$logfile"
  module li >> "$logfile" 2>&1

  # check if module was loaded and did not report errors during loading
  if is_module_listed "$m" && ! grep -q ERROR -- "$module_load_logfile"; then
    echo "module rm $m" >> "$logfile"
    module rm "$m" > "$module_rm_logfile" 2>&1
    cat "$module_rm_logfile" >> "$logfile"
    module li >> "$logfile" 2>&1
    # check if module was unloaded
    if is_module_listed "$m"; then
      echo -e "${IRed}unloading failed${Color_Off}"
      echo "ERROR: unloading module '$m' failed" >> "$logfile"
      # Append as ONE element; re-expanding the array unquoted would split
      # earlier "module [dep]" entries and inflate the failure count.
      failed_modules+=("$m_original [$dep]")
    else
      echo -e "${IGreen}success${Color_Off}"
      echo "SUCCESS" >> "$logfile"
    fi
  else
    echo -e "${IRed}loading failed${Color_Off}"
    echo "ERROR: loading module '$m' failed" >> "$logfile"
    failed_modules+=("$m [$dep]")
  fi

  # shellcheck disable=SC2086
  module unload $dep >> "$logfile" 2>&1

  # clean up module environment
  echo "module purge" >> "$logfile"
  module purge >> "$logfile" 2>&1
  module li >> "$logfile" 2>&1
  echo "Resetting environment ..." >> "$logfile"
  # Go back to the module collection saved at start-up instead of trusting
  # the state left behind by 'module purge'.
  module restore check_modules 2> /dev/null

  # clean up intermediate files
  rm -f -- "$module_load_logfile" "$module_rm_logfile"
  echo >> "$logfile" 2>&1
}

#######################################
# Check every module that 'module -t spider' reports for one module spec.
# Globals:   logfile, IMagenta, Color_Off (read)
# Arguments: $1 - module spec; empty means "no restriction"
# Outputs:   progress on stdout, details appended to $logfile
#######################################
check_modulespec() {
  local modulespec=$1
  local -a candidates=()
  local -a deps=()
  local m_original m dep excluded_by

  # Split the spider output into whitespace separated words without
  # subjecting them to pathname expansion.
  read -r -d '' -a candidates < <(module -t spider ${modulespec:+"$modulespec"} 2>&1)

  for m_original in "${candidates[@]}"; do
    if excluded_by=$(matching_exclude_pattern "$m_original"); then
      printf 'Skipping %s (matches %s) ... %b skipped%b\n' \
        "$m_original" "$excluded_by" "${IMagenta}" "${Color_Off}"
      continue
    fi

    # skip any non module line in output
    [[ "$m_original" =~ ^[A-Za-z] ]] || continue

    m=${m_original/\(*\)/} # Remove aliases e.g. (default)
    echo "Checking $m_original ... "
    echo "Checking $m_original ... " >> "$logfile"

    # Every line of the detailed spider output that starts with a blank is
    # treated as one set of dependencies to test with.
    # NOTE(review): a module for which no such line exists is not tested at
    # all - confirm that the pattern selects the intended lines.
    mapfile deps < <(module spider "$m_original" |& grep '^ ')
    for dep in "${deps[@]}"; do
      dep=${dep//[$'\t\r\n']}
      check_module_with_dep "$m_original" "$m" "$dep"
    done
  done
}

# MODULE_PATTERN is optional: without any spec query spider once without a
# restriction instead of silently checking nothing.
specs_to_check=("${modulespecs[@]}")
if (( ${#specs_to_check[@]} == 0 )); then
  specs_to_check=("")
fi
for modulespec in "${specs_to_check[@]}"; do
  check_modulespec "$modulespec"
done
# clean up file storing the initial environment and the scratch directory;
# the guards keep an unset variable from turning into an `rm` on an empty path
if [[ -n "${module_clean_env_file:-}" ]]; then
  rm -f -- "$module_clean_env_file"
fi
if [[ -n "${tmpdir:-}" ]]; then
  rm -rf -- "$tmpdir"
fi

#######################################
# Print the number of failed checks followed by one line per failure.
# Globals:   failed_modules (read)
# Outputs:   the summary on stdout
#######################################
print_failure_summary() {
  local entry
  echo "----------------------------------------"
  echo "Summary of failed modules (${#failed_modules[@]}):"
  echo "----------------------------------------"
  # Quoted expansion: entries look like "module [deps]"; unquoted they would
  # be split at the blank and "[deps]" would be expanded as a glob pattern.
  for entry in "${failed_modules[@]}"; do
    printf '%s\n' "$entry"
  done
  echo "----------------------------------------"
}

print_failure_summary