#!/bin/bash
#PBS -N ray-job
#PBS -l select=2:node_type=rome
#PBS -l walltime=1:00:00
#
# Starts a Ray head on the node running this script, launches
# start-ray-worker.sh on the remaining nodes of the PBS allocation via
# pbsdsh, runs $JOB_SCRIPT with python3, and finally stops Ray.
#
# Fill in WS_DIR, PROJECT_DIR and ENV_PATH below before submitting.

# Print a message to stderr and abort the job.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

export WS_DIR=
export PROJECT_DIR="$WS_DIR/"
export ENV_PATH=
export JOB_SCRIPT=monte-carlo-pi.py
export OBJECT_STORE_MEMORY=128000000000

# Fail fast on an unfilled template: with empty values the script would
# otherwise source /bin/activate and treat / as the project directory.
# Strict mode (set -eu) is deliberately not enabled because the sourced
# activate script is not part of this file and may not tolerate it.
: "${WS_DIR:?WS_DIR must be set at the top of this script}"
: "${ENV_PATH:?ENV_PATH must be set at the top of this script}"
: "${PBS_NODEFILE:?PBS_NODEFILE is not set; submit this script through PBS}"

# Environment variables after this line should not change
export SRC_DIR="$PROJECT_DIR/src"
export PYTHON_FILE="$SRC_DIR/$JOB_SCRIPT"
export DEPLOYMENT_SCRIPTS="$PROJECT_DIR/deployment_scripts"

# shellcheck source=/dev/null
source "$ENV_PATH/bin/activate" \
  || die "cannot activate environment at $ENV_PATH"

# IPv4 address of interface ib0. Only the first match is kept so that an
# interface with several addresses cannot produce a multi-line value.
# Assignment is separate from export so a failure is not masked.
IP_ADDRESS=$(
  ip addr show ib0 \
    | grep -oP '(?<=inet\s)\d+(\.\d+){3}' \
    | awk 'NR == 1 {print $1}'
)
[[ -n "$IP_ADDRESS" ]] || die "no IPv4 address found on interface ib0"
export IP_ADDRESS
export RAY_ADDRESS="$IP_ADDRESS:6379"

REDIS_PASSWORD=$(openssl rand -base64 32) \
  || die "openssl failed to generate the redis password"
export REDIS_PASSWORD
export NCCL_DEBUG=INFO

# Head node; the dashboard is bound to localhost only.
ray start --disable-usage-stats \
  --head \
  --node-ip-address="$IP_ADDRESS" \
  --port=6379 \
  --dashboard-host=127.0.0.1 \
  --redis-password="$REDIS_PASSWORD" \
  --object-store-memory="$OBJECT_STORE_MEMORY" \
  || die "ray head failed to start"

# Number of distinct hosts listed in the PBS node file.
NUM_NODES=$(sort -u "$PBS_NODEFILE" | wc -l)
export NUM_NODES

# Launch one worker per remaining node, in the background. Index 0 is
# skipped -- presumably the node running this script (TODO confirm).
# The values are embedded in single quotes inside the command string, so
# they must not themselves contain single quotes.
for ((i = 1; i < NUM_NODES; i++)); do
  pbsdsh -n "$i" -- bash -l -c "'$DEPLOYMENT_SCRIPTS/start-ray-worker.sh' '$WS_DIR' '$ENV_PATH' '$RAY_ADDRESS' '$REDIS_PASSWORD' '$OBJECT_STORE_MEMORY'" &
done

# Run the job, always stop Ray afterwards, and report the job's failure (if
# any) to PBS instead of letting the status of `ray stop` hide it.
python3 "$PYTHON_FILE"
job_status=$?

ray stop --grace-period 30
stop_status=$?

if ((job_status != 0)); then
  exit "$job_status"
fi
exit "$stop_status"