dask_template/deployment_scripts/submit-ray-job.pbs

50 lines
1.6 KiB
Text
Raw Normal View History

2024-01-05 15:08:04 +00:00
#!/bin/bash
#PBS -N ray-job
2024-01-05 15:11:49 +00:00
#PBS -l select=2:node_type=rome
2024-01-05 15:08:04 +00:00
#PBS -l walltime=1:00:00
# --- User settings: adapt these to your workspace and project ---
# Workspace root. The packed environment is read from $WS_DIR/$ENV_ARCHIVE.
# Replace the <workspace_dir> placeholder before submitting the job.
export WS_DIR=<workspace_dir>
# Project directory inside the workspace; src/ and deployment_scripts/ are
# resolved relative to it. Replace the <project_name> placeholder.
export PROJECT_DIR=$WS_DIR/<project_name>
# File name of the Python program to run; it is looked up in $PROJECT_DIR/src.
export JOB_SCRIPT=monte-carlo-pi.py
# Name of the packed Python environment tarball located directly in $WS_DIR.
export ENV_ARCHIVE=ray_env.tar.gz
# Passed to `ray start --object-store-memory` on the head node and forwarded
# to the worker start script. Presumably a byte count (128 GB) - confirm
# against the Ray CLI documentation and the memory available per node.
export OBJECT_STORE_MEMORY=128000000000
# Environment variables after this line should not change.
export SRC_DIR="$PROJECT_DIR/src"
export PYTHON_FILE="$SRC_DIR/$JOB_SCRIPT"
export DEPLOYMENT_SCRIPTS="$PROJECT_DIR/deployment_scripts"

# The environment is unpacked onto the RAM disk because a large number of
# small files decreases the performance of the parallel file system.
export ENV_PATH="/run/user/$PBS_JOBID/ray_env"

# Every step below is a hard prerequisite for the rest of the job, so stop
# right away instead of continuing without a usable Python environment.
if ! mkdir -p -- "$ENV_PATH"; then
  echo "ERROR: cannot create environment directory $ENV_PATH" >&2
  exit 1
fi

# Extract the packed environment to the RAM disk.
if ! tar -xzf "$WS_DIR/$ENV_ARCHIVE" -C "$ENV_PATH"; then
  echo "ERROR: cannot extract $WS_DIR/$ENV_ARCHIVE to $ENV_PATH" >&2
  rm -rf -- "${ENV_PATH:?}" # do not leave a partial extraction behind
  exit 1
fi

# The activate script only exists once the archive has been extracted.
# shellcheck disable=SC1091
if ! source "$ENV_PATH/bin/activate"; then
  echo "ERROR: cannot activate environment in $ENV_PATH" >&2
  rm -rf -- "${ENV_PATH:?}"
  exit 1
fi
# IPv4 address of this node on the ib0 interface. Only the first match is
# kept, so an additional address on the interface cannot turn the value into
# a multi-line string that would corrupt RAY_ADDRESS and the ray arguments.
IP_ADDRESS=$(ip addr show ib0 | grep -oP '(?<=inet\s)\d+(\.\d+){3}' | head -n 1)
if [[ -z "$IP_ADDRESS" ]]; then
  echo "ERROR: no IPv4 address found on interface ib0" >&2
  rm -rf -- "${ENV_PATH:?}" # same cleanup the script performs on normal exit
  exit 1
fi
export IP_ADDRESS

# Address of the head node; it is handed to every worker start script.
export RAY_ADDRESS="$IP_ADDRESS:6379"

# Random per-job password, used by the head below and passed to the workers.
# Assignment and export are separate so a failing openssl is not masked.
REDIS_PASSWORD=$(openssl rand -base64 32)
export REDIS_PASSWORD

export NCCL_DEBUG=INFO

# Start the Ray head process on this node. The dashboard is bound to the
# loopback address only.
ray start --disable-usage-stats \
  --head \
  --node-ip-address="$IP_ADDRESS" \
  --port=6379 \
  --dashboard-host=127.0.0.1 \
  --redis-password="$REDIS_PASSWORD" \
  --object-store-memory="$OBJECT_STORE_MEMORY"
# Number of distinct hosts allocated to this job (PBS_NODEFILE may list a
# host more than once). Assignment and export are separate so a failure of
# the pipeline is not masked by `export`.
NUM_NODES=$(sort -u -- "$PBS_NODEFILE" | wc -l)
export NUM_NODES

# Build the remote command line once. `printf %q` shell-quotes every argument
# so that the login shell started on the remote node sees exactly these values,
# even if one of them contains spaces or quote characters.
printf -v worker_cmd '%q ' \
  "$DEPLOYMENT_SCRIPTS/start-ray-worker.sh" \
  "$WS_DIR" \
  "$ENV_ARCHIVE" \
  "$RAY_ADDRESS" \
  "$REDIS_PASSWORD" \
  "$OBJECT_STORE_MEMORY"

# Launch one worker per remaining node, in the background so the nodes start
# in parallel. Index 0 is skipped - presumably that is the node running this
# script, where the head was already started; confirm against pbsdsh docs.
for ((i = 1; i < NUM_NODES; i++)); do
  pbsdsh -n "$i" -- bash -l -c "$worker_cmd" &
done
# Run the user program and remember its exit status: the cleanup commands
# below would otherwise overwrite it and PBS would record a failed run as
# successful.
python3 "$PYTHON_FILE"
job_status=$?

# Shut down the Ray processes on this node, allowing up to 30 seconds for a
# graceful stop.
ray stop --grace-period 30

# It's nice to clean up before you terminate the job. `:?` aborts instead of
# running `rm -rf` on an empty path should ENV_PATH ever be unset.
rm -rf -- "${ENV_PATH:?}"

# Report the result of the Python program as the result of the job.
exit "$job_status"