forked from SiVeGCS/dask_template
59 lines
1.6 KiB
Bash
59 lines
1.6 KiB
Bash
|
#!/bin/bash
#PBS -N output-ray-job
#PBS -l select=2:node_type=rome-ai
#PBS -l walltime=1:00:00
#
# PBS batch script that runs a Python job on a Ray cluster.
#
# The #PBS directives above must stay directly below the shebang, before the
# first executable line:
#   -N output-ray-job                  name of the job
#   -l select=2:node_type=rome-ai      two chunks of node type "rome-ai"
#   -l walltime=1:00:00                wall-clock limit of one hour
#
# Workflow of the script:
#   1. unpack and activate the packed Python environment under
#      /run/user/$PBS_JOBID/ray_env,
#   2. start a Ray head on the node executing this script,
#   3. launch ray-start-worker.sh on every other node through pbsdsh,
#   4. run $SRC_DIR/$JOB_SCRIPT with python3,
#   5. stop Ray and delete the unpacked environment.

# ---------------------------------------------------------------------------
# User-configurable settings
# ---------------------------------------------------------------------------

# File name of the Python entry point; it is looked up inside SRC_DIR.
export JOB_SCRIPT="modeling_evaluation.py"

# Workspace root and the name of the packed environment archive, which is
# read from "$WS_DIR/$ENV_ARCHIVE".
export WS_DIR="/lustre/hpe/ws10/ws10.3/ws/hpckkaya-ifu"
export ENV_ARCHIVE="ray-environment-v0.3.tar.gz"

# Directory holding the job script and ray-start-worker.sh.
export SRC_DIR="${WS_DIR}/ifu/src/ray-workflow"
# DATA_DIR and RESULTS_DIR are only exported here, not used by this script;
# presumably the Python job reads them -- verify against the job code.
export DATA_DIR="/lustre/hpe/ws10/ws10.3/ws/hpckkaya-ifu-data/hpclzhon-ifu_data-1668830707"
export RESULTS_DIR="${WS_DIR}/ray_results"

# Verbosity of NCCL's own log output.
export NCCL_DEBUG="INFO"

# Environment variables after this line should not change

# Full path of the Python script that is executed once the cluster is up.
export PYTHON_FILE="${SRC_DIR}/${JOB_SCRIPT}"

# Per-job directory that receives the unpacked environment. The ":?" guard
# aborts the script when PBS_JOBID is unset or empty instead of silently
# producing the path "/run/user//ray_env".
export ENV_PATH="/run/user/${PBS_JOBID:?PBS_JOBID is not set; submit this script through PBS}/ray_env"

# Report a fatal environment-setup error on stderr, remove whatever has been
# unpacked so far and end the job with status 1.
#   $1 - message describing the failed step
abort_env_setup() {
  printf 'error: %s\n' "$1" >&2
  rm -rf -- "${ENV_PATH:?}"
  exit 1
}

# Unpack the archived environment and activate it in the current shell.
# Nothing below can work without these steps, so each failure is fatal.
mkdir -p -- "$ENV_PATH" \
  || abort_env_setup "cannot create ${ENV_PATH}"
tar -xzf "${WS_DIR}/${ENV_ARCHIVE}" -C "$ENV_PATH" \
  || abort_env_setup "cannot unpack ${WS_DIR}/${ENV_ARCHIVE} into ${ENV_PATH}"
# shellcheck disable=SC1091 -- the file only exists after the archive is unpacked
source "${ENV_PATH}/bin/activate" \
  || abort_env_setup "cannot activate ${ENV_PATH}/bin/activate"

# conda-unpack is provided by the unpacked environment. The original script
# continued after a failure here, so it is reported but not treated as fatal.
conda-unpack \
  || printf 'warning: conda-unpack failed; the environment may be unusable\n' >&2

# First IPv4 address configured on interface ib0. "head -n 1" keeps the value
# a single address even if the interface carries several "inet" entries.
ib0_ipv4=$(ip addr show ib0 | grep -oP '(?<=inet\s)\d+(\.\d+){3}' | head -n 1)

# Random password handed to the Ray head and to every worker.
redis_pw=$(uuidgen)

# Both values are mandatory for "ray start"; stop early with a clear message
# (and remove the unpacked environment) rather than start Ray with empty
# arguments.
if [[ -z "$ib0_ipv4" || -z "$redis_pw" ]]; then
  echo "error: could not determine the IPv4 address of ib0 or generate a Redis password" >&2
  rm -rf -- "${ENV_PATH:?}"
  exit 1
fi

export IP_ADDRESS="$ib0_ipv4"

# Address of the Ray head; the port must match --port of "ray start --head".
export RAY_ADDRESS="${IP_ADDRESS}:6379"
export REDIS_PASSWORD="$redis_pw"

# export RAY_scheduler_spread_threshold=0.0
# Value passed to --object-store-memory of "ray start" (bytes, i.e. 128 GB).
export OBJECT_STORE_MEMORY=128000000000

# Start the Ray head process on this node. It listens on IP_ADDRESS:6379
# (the address exported as RAY_ADDRESS); the dashboard is bound to the
# loopback address only. A head that fails to start makes the rest of the
# job pointless, so report it, remove the unpacked environment and stop.
if ! ray start --disable-usage-stats \
  --head \
  --node-ip-address="$IP_ADDRESS" \
  --port=6379 \
  --dashboard-host=127.0.0.1 \
  --redis-password="$REDIS_PASSWORD" \
  --object-store-memory="$OBJECT_STORE_MEMORY"; then
  echo "error: failed to start the Ray head on ${IP_ADDRESS}:6379" >&2
  rm -rf -- "${ENV_PATH:?}"
  exit 1
fi

# Number of distinct host names listed in the PBS node file.
NUM_NODES=$(sort -u -- "$PBS_NODEFILE" | wc -l)
export NUM_NODES

# Command line executed on every worker node. printf %q shell-quotes each
# argument, so the string stays correct even if a value contains quotes or
# whitespace.
printf -v worker_cmd '%q ' \
  "$SRC_DIR/ray-start-worker.sh" \
  "$WS_DIR" \
  "$ENV_ARCHIVE" \
  "$RAY_ADDRESS" \
  "$REDIS_PASSWORD" \
  "$OBJECT_STORE_MEMORY"

# Launch one worker per node index 1..NUM_NODES-1 in the background. Index 0
# is skipped -- presumably that is the node running this script, which
# already hosts the Ray head (confirm against the pbsdsh node numbering).
# The background jobs are deliberately not waited for.
for ((i = 1; i < NUM_NODES; i++)); do
  pbsdsh -n "$i" -- bash -l -c "$worker_cmd" &
done

# Uncomment the block below to pause before the job starts when more than one
# node is used, if the Python code does not already take care of this itself
# (presumably: waiting until the workers have joined -- confirm).
# if [[ $NUM_NODES -gt 1 ]]
# then
#   sleep 90
# fi

# Run the actual job and remember its exit status so that it becomes the exit
# status of the whole PBS job; otherwise a failed run would be reported as
# successful because the last command of the script is the cleanup.
python3 "$PYTHON_FILE"
job_status=$?

# Shut down the Ray processes on this node.
ray stop

# Remove the unpacked environment; never call rm -rf with an empty path.
if [[ -n "${ENV_PATH:-}" ]]; then
  rm -rf -- "$ENV_PATH"
fi

exit "$job_status"