54 lines
2.1 KiB
Bash
54 lines
2.1 KiB
Bash
#!/bin/bash
#
# Master launcher for a Dask cluster inside a PBS job.
#
# Usage: <this script> WORKSPACE_DIR
#   WORKSPACE_DIR  directory that contains dask-env.tar.gz (a packed conda
#                  environment) and dask-worker.sh (the per-node worker
#                  launcher).
#
# Environment:
#   PBS_NODEFILE   must be set (PBS sets it inside a job); lists the hosts
#                  allocated to the job, one per line.
#
# Exports: CURRENT_WORKSPACE, NUM_NODES, ALL_NODES, SCHEDULER_NODE,
#          WORKER_NODES, DASK_SCHEDULER_PORT, DASK_UI_PORT, DASK_ENV.
#
# Exit status: 1 if not inside a PBS job, if the workspace argument is
# missing/invalid, if fewer than two distinct nodes are allocated, or if the
# Dask environment cannot be prepared.

# Print a timestamped message in the "[<time> - Master] <text>" log format.
log() {
  printf '%s\n' "[$(date '+%Y-%m-%d %H:%M:%S') - Master] $*"
}

# Log an ERROR line and abort the script with status 1.
die() {
  log "ERROR: $*"
  exit 1
}

export CURRENT_WORKSPACE="${1:-}"

# Check if running in a PBS Job environment
if [[ -z "${PBS_NODEFILE:-}" ]]; then
  die "This script is meant to run as a part of PBS Job. Don't start it at login nodes."
fi

# The workspace is needed for both the environment tarball and the worker
# launcher, so refuse to continue without it.
if [[ -z "$CURRENT_WORKSPACE" || ! -d "$CURRENT_WORKSPACE" ]]; then
  die "Usage: ${0##*/} WORKSPACE_DIR (got '${CURRENT_WORKSPACE}', which is not a directory)."
fi

# Count distinct hosts: the node file may repeat a host once per allocated
# slot, and the worker loop below iterates over distinct node indices.
# Assignment is kept separate from 'export' so a failure is not masked.
NUM_NODES=$(sort -u -- "$PBS_NODEFILE" | wc -l) \
  || die "Cannot read PBS node file '$PBS_NODEFILE'."
export NUM_NODES

if (( NUM_NODES < 2 )); then
  log "WARNING: You have a single node job running. Dask cluster requires at least 2 nodes."
  exit 1
fi

ALL_NODES=$(cat -- "$PBS_NODEFILE")
export ALL_NODES
# The scheduler runs on the first listed host; "-ib" is appended to its name
# (presumably the InfiniBand interface alias — confirm for this cluster).
SCHEDULER_NODE="$(head -n1 -- "$PBS_NODEFILE")-ib"
export SCHEDULER_NODE
WORKER_NODES=$(tail -n+2 -- "$PBS_NODEFILE")
export WORKER_NODES

export DASK_SCHEDULER_PORT=8786
# Exported for downstream consumers; it is not passed to the scheduler here.
export DASK_UI_PORT=8787

log "INFO: Starting Dask cluster with $NUM_NODES nodes."

# Directory the packed Dask environment is extracted into (under $HOME).
export DASK_ENV="$HOME/dask"
mkdir -p -- "$DASK_ENV" || die "Cannot create $DASK_ENV"

log "INFO: Extracting Dask environment to $DASK_ENV"
# Extract the packed environment; abort instead of starting a scheduler from
# a missing or half-extracted environment.
tar -xzf "$CURRENT_WORKSPACE/dask-env.tar.gz" -C "$DASK_ENV" \
  || die "Failed to extract $CURRENT_WORKSPACE/dask-env.tar.gz"
chmod -R 700 -- "$DASK_ENV" || die "Failed to set permissions on $DASK_ENV"

log "INFO: Setting up Dask environment"
# Activate the extracted environment, then run conda-unpack inside it.
# shellcheck disable=SC1091 — the activate script only exists after extraction
source "$DASK_ENV/bin/activate" || die "Failed to activate $DASK_ENV"
conda-unpack || die "conda-unpack failed in $DASK_ENV"

log "INFO: Starting Dask Scheduler at $SCHEDULER_NODE on port $DASK_SCHEDULER_PORT"
dask scheduler --host "$SCHEDULER_NODE" --port "$DASK_SCHEDULER_PORT" &

# Build the remote command once; %q shell-quotes each argument so paths with
# spaces or metacharacters survive the extra 'bash -c' parsing step.
printf -v worker_cmd 'source %q %q %q' \
  "$CURRENT_WORKSPACE/dask-worker.sh" "$CURRENT_WORKSPACE" "$SCHEDULER_NODE"

# Launch dask-worker.sh on node indices 1..NUM_NODES-1 (index 0 is skipped).
# NOTE(review): pbsdsh is invoked in the foreground; if dask-worker.sh does
# not return promptly, later iterations are delayed — confirm intended.
for ((i = 1; i < NUM_NODES; i++)); do
  log "INFO: Starting Dask Worker at $i"
  pbsdsh -n "$i" -o -- bash -l -c "$worker_cmd"
done

log "INFO: Dask cluster ready, wait for workers to connect to the scheduler."

# Optionally, you can provide a script for the workers to execute using ssh, similar to Spark.
# Example: ssh $node "source activate your_conda_env && python your_dask_worker_script.py" &