#!/bin/bash
#PBS -N output-ray-job
#PBS -l select=2:node_type=rome-ai
#PBS -l walltime=1:00:00
#
# Starts a Ray head on the node running this script, asks every other
# node of the PBS allocation to run ray-start-worker.sh, then runs
# $JOB_SCRIPT with python3.
#
# The packed Python environment $ENV_ARCHIVE is unpacked into a per-job
# directory ($ENV_PATH) that is removed again when the script exits.
#
# Requires PBS_JOBID and PBS_NODEFILE in the environment.
# Exit status: that of the Python job, or 1 if the setup fails.
#
# NOTE(review): `set -euo pipefail` is deliberately not enabled, because
# sourced environment activation scripts may read unset variables;
# critical steps are checked explicitly instead -- confirm before
# tightening.

# Print a message to stderr and abort the job.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

: "${PBS_JOBID:?PBS_JOBID must be set (run this script through PBS)}"
: "${PBS_NODEFILE:?PBS_NODEFILE must be set (run this script through PBS)}"

export JOB_SCRIPT=modeling_evaluation.py

export WS_DIR=/lustre/hpe/ws10/ws10.3/ws/hpckkaya-ifu
export ENV_ARCHIVE=ray-environment-v0.3.tar.gz
export SRC_DIR="$WS_DIR/ifu/src/ray-workflow"
# DATA_DIR and RESULTS_DIR are only exported here, not used below;
# presumably the job script reads them -- verify against $JOB_SCRIPT.
export DATA_DIR=/lustre/hpe/ws10/ws10.3/ws/hpckkaya-ifu-data/hpclzhon-ifu_data-1668830707
export RESULTS_DIR="$WS_DIR/ray_results"

export NCCL_DEBUG=INFO

# Environment variables after this line should not change

export PYTHON_FILE="$SRC_DIR/$JOB_SCRIPT"
export ENV_PATH="/run/user/$PBS_JOBID/ray_env"

# Stop Ray (if the CLI is on PATH) and remove the unpacked environment.
# Installed as an EXIT trap so it runs on every exit path, not only
# after a completed Python run.
cleanup() {
  if command -v ray > /dev/null 2>&1; then
    ray stop
  fi
  rm -rf -- "${ENV_PATH:?}"
}
trap cleanup EXIT
# Turn termination signals into a normal exit so the EXIT trap runs.
trap 'exit 130' INT
trap 'exit 143' TERM

mkdir -p -- "$ENV_PATH" || die "cannot create $ENV_PATH"
tar -xzf "$WS_DIR/$ENV_ARCHIVE" -C "$ENV_PATH" \
  || die "cannot unpack $WS_DIR/$ENV_ARCHIVE"
# shellcheck disable=SC1091 -- the file only exists after unpacking
source "$ENV_PATH/bin/activate" || die "cannot activate $ENV_PATH"
conda-unpack || die "conda-unpack failed"

# First IPv4 address configured on ib0; the Ray head binds to it.
IP_ADDRESS=$(
  ip addr show ib0 \
    | grep -oP '(?<=inet\s)\d+(\.\d+){3}' \
    | head -n 1
)
[[ -n "$IP_ADDRESS" ]] || die "no IPv4 address found on interface ib0"
export IP_ADDRESS

export RAY_ADDRESS="$IP_ADDRESS:6379"
REDIS_PASSWORD=$(uuidgen) || die "uuidgen failed"
export REDIS_PASSWORD
# export RAY_scheduler_spread_threshold=0.0
export OBJECT_STORE_MEMORY=128000000000

# NOTE(review): the password is passed on the command line here and to
# the workers below, so it may be visible in process listings.
ray start --disable-usage-stats \
  --head \
  --node-ip-address="$IP_ADDRESS" \
  --port=6379 \
  --dashboard-host=127.0.0.1 \
  --redis-password="$REDIS_PASSWORD" \
  --object-store-memory="$OBJECT_STORE_MEMORY" \
  || die "ray start (head) failed"

# Number of distinct hosts listed in the PBS node file.
NUM_NODES=$(sort -u -- "$PBS_NODEFILE" | wc -l)
export NUM_NODES

# Shell-quote the worker command once, so that the remote `bash -l -c`
# sees each value as exactly one argument.
printf -v worker_cmd '%q ' \
  "$SRC_DIR/ray-start-worker.sh" \
  "$WS_DIR" \
  "$ENV_ARCHIVE" \
  "$RAY_ADDRESS" \
  "$REDIS_PASSWORD" \
  "$OBJECT_STORE_MEMORY"

# Launch the worker script in the background on node indices
# 1 .. NUM_NODES-1. The launches are never waited on.
# NOTE(review): assumes index 0 is the node running this script --
# confirm against the site's pbsdsh behaviour.
for ((i = 1; i < NUM_NODES; i++)); do
  pbsdsh -n "$i" -- bash -l -c "$worker_cmd" &
done

# uncomment if you don't already control inside the code
# if [[ $NUM_NODES -gt 1 ]]
# then
#   sleep 90
# fi

python3 "$PYTHON_FILE"
job_status=$?

# Report the Python job's status as the job status; the EXIT trap
# performs the cleanup.
exit "$job_status"