#!/bin/bash
#PBS -N ray-job
#PBS -l select=2:node_type=rome
#PBS -l walltime=1:00:00
#
# PBS batch job that brings up a Ray cluster and runs one Python script on it:
#   1. unpacks a packed Python environment onto the node-local RAM disk,
#   2. starts the Ray head on this node (bound to the ib0 IPv4 address),
#   3. starts one Ray worker on every other node of the job via pbsdsh,
#   4. runs $PYTHON_FILE, then stops Ray and removes the unpacked environment.
#
# The job's exit status is the exit status of the Python script.
#
# Strict mode (set -euo pipefail) is deliberately not enabled globally: the
# sourced activate script runs in this shell and may not be written for it.
# Every critical step is checked explicitly instead.

# Print an error message to stderr and abort the job.
die() {
  printf 'ray-job: %s\n' "$*" >&2
  exit 1
}

# ---------------------------------------------------------------------------
# User settings
# ---------------------------------------------------------------------------

# Workspace directory holding the environment archive and the project.
# Set it here, or pass it in from the environment (e.g. `qsub -v WS_DIR=...`).
export WS_DIR="${WS_DIR:-}"
[[ -n "$WS_DIR" ]] || die "WS_DIR is empty; set it to your workspace directory"

# NOTE(review): the project directory name after the slash looks like a
# placeholder that was never filled in -- append it here.
export PROJECT_DIR="$WS_DIR/"
export JOB_SCRIPT=monte-carlo-pi.py
export ENV_ARCHIVE=ray_env.tar.gz
export OBJECT_STORE_MEMORY=128000000000

# ---------------------------------------------------------------------------
# Environment variables after this line should not change
# ---------------------------------------------------------------------------

: "${PBS_JOBID:?PBS_JOBID is not set; this script must run as a PBS job}"
: "${PBS_NODEFILE:?PBS_NODEFILE is not set; this script must run as a PBS job}"

export SRC_DIR="$PROJECT_DIR/src"
export PYTHON_FILE="$SRC_DIR/$JOB_SCRIPT"
export DEPLOYMENT_SCRIPTS="$PROJECT_DIR/deployment_scripts"
# We use the ram disk to extract the environment packages since a large number
# of files decreases the performance of the parallel file system.
export ENV_PATH="/run/user/$PBS_JOBID/ray_env"

# Stop Ray and remove the unpacked environment. Installed as an EXIT trap so
# it also runs when a later step aborts the job; the trap does not alter the
# exit status of the script.
cleanup() {
  if command -v ray >/dev/null 2>&1; then
    ray stop --grace-period 30
  fi
  # It's nice to clean up before you terminate the job.
  rm -rf -- "${ENV_PATH:?}"
}
trap cleanup EXIT

mkdir -p -- "$ENV_PATH" || die "cannot create $ENV_PATH"
# This line extracts the packages to ram disk.
tar -xzf "$WS_DIR/$ENV_ARCHIVE" -C "$ENV_PATH" \
  || die "cannot extract $WS_DIR/$ENV_ARCHIVE into $ENV_PATH"
# shellcheck disable=SC1091 -- the file only exists after extraction
source "$ENV_PATH/bin/activate" || die "cannot activate $ENV_PATH"

# First IPv4 address of the ib0 interface. Assignment and export are separate
# so that a failed lookup is not hidden behind export's exit status.
IP_ADDRESS=$(ip addr show ib0 | grep -oP '(?<=inet\s)\d+(\.\d+){3}' | head -n 1)
[[ -n "$IP_ADDRESS" ]] || die "no IPv4 address found on interface ib0"
export IP_ADDRESS

export RAY_ADDRESS="$IP_ADDRESS:6379"

REDIS_PASSWORD=$(openssl rand -base64 32) \
  || die "cannot generate the Redis password"
export REDIS_PASSWORD

export NCCL_DEBUG=INFO

ray start --disable-usage-stats \
  --head \
  --node-ip-address="$IP_ADDRESS" \
  --port=6379 \
  --dashboard-host=127.0.0.1 \
  --redis-password="$REDIS_PASSWORD" \
  --object-store-memory="$OBJECT_STORE_MEMORY" \
  || die "ray start (head) failed"

# Number of distinct hosts listed in the PBS node file.
NUM_NODES=$(sort -u -- "$PBS_NODEFILE" | wc -l) \
  || die "cannot read $PBS_NODEFILE"
export NUM_NODES

# Command line for the remote login shell. %q quotes every argument so values
# containing spaces or quote characters survive the extra round of parsing.
# NOTE(review): the password is passed on the command line, exactly as before,
# and is therefore visible in the process list on the compute nodes.
printf -v worker_cmd '%q ' \
  "$DEPLOYMENT_SCRIPTS/start-ray-worker.sh" \
  "$WS_DIR" \
  "$ENV_ARCHIVE" \
  "$RAY_ADDRESS" \
  "$REDIS_PASSWORD" \
  "$OBJECT_STORE_MEMORY"

# pbsdsh node index 0 is skipped: the head was started on this node above.
for ((i = 1; i < NUM_NODES; i++)); do
  pbsdsh -n "$i" -- bash -l -c "$worker_cmd" &
done

python3 "$PYTHON_FILE"
job_status=$?

# Ray shutdown and RAM-disk cleanup happen in the EXIT trap.
exit "$job_status"