#!/bin/bash

# Workspace directory; the environment archive is later read from
# "$WS_DIR/$ENV_ARCHIVE". `<workspace_dir>` is a placeholder to be replaced
# with the real path. It is kept inside double quotes because bash would
# otherwise parse the bare `<` and `>` as redirection operators, and so that
# a substituted path containing spaces stays a single word.
export WS_DIR="<workspace_dir>"

# Classify the current node from the prefix of its hostname:
#   starts with "r"        -> CPU node, nothing extra is exported
#   starts with "hawk-ai"  -> GPU node, OBJ_STR_MEMORY and TEMP_CHECKPOINT_DIR
#                             are exported and the checkpoint directory is
#                             created
#   anything else          -> report the mismatch and stop with status 1

# Read the hostname once; both prefix checks below are derived from it.
node_hostname=$(hostname)

# Get the first character of the hostname
first_char=${node_hostname:0:1}

# Check if the first character is not "r"
if [[ "$first_char" != "r" ]]; then
  # it's not a cpu node.
  echo "Hostname does not start with 'r'."

  # Get the first seven characters of the hostname
  first_seven_chars=${node_hostname:0:7}

  # Check if it is an ai node
  if [[ "$first_seven_chars" != "hawk-ai" ]]; then
    echo "Hostname does not start with 'hawk-ai' too. Exiting."
    # `return` only works when this file is sourced (or run inside a
    # function); it stops the file without closing the caller's shell.
    return 1
  else
    echo "GPU node detected."
    export OBJ_STR_MEMORY=350000000000
    # The path is built from $PBS_JOBID; if that variable is empty the
    # directory becomes /localscratch//model_checkpoints/.
    export TEMP_CHECKPOINT_DIR="/localscratch/$PBS_JOBID/model_checkpoints/"
    mkdir -p -- "$TEMP_CHECKPOINT_DIR"
  fi
else
  echo "CPU node detected."
fi

# Load the site's conda module, then unpack the environment archive from the
# workspace into a directory keyed by $PBS_JOBID and activate it.
module load bigdata/conda

# Turn off Ray's log deduplication.
export RAY_DEDUP_LOGS=0

# Archive name, target directory and environment name; ENV_PATH is the
# directory the archive is extracted into.
export ENV_ARCHIVE=ray_env.tar.gz
export CONDA_ENVS="/run/user/$PBS_JOBID/envs"
export ENV_NAME=ray_env
export ENV_PATH="$CONDA_ENVS/$ENV_NAME"

# Each step depends on the previous one, so stop at the first failure instead
# of extracting into a missing directory or sourcing a missing file. Failures
# are reported on stderr; the caller's shell is left running.
if ! mkdir -p -- "$ENV_PATH"; then
  echo "Could not create environment directory '$ENV_PATH'." >&2
elif ! tar -xzf "$WS_DIR/$ENV_ARCHIVE" -C "$ENV_PATH"; then
  echo "Could not extract '$WS_DIR/$ENV_ARCHIVE' into '$ENV_PATH'." >&2
else
  source "$ENV_PATH/bin/activate"
fi

# Point conda's environment search path at the directory stored in
# $CONDA_ENVS (its value, not the literal string "CONDA_ENVS").
export CONDA_ENVS_PATH="$CONDA_ENVS"