From ad953fd96a1786922c29ec8919cecb9f74f786ca Mon Sep 17 00:00:00 2001
From: Kerem Kayabay
Date: Fri, 5 Jan 2024 16:08:04 +0100
Subject: [PATCH] prepare for multi node cluster

---
 README.md                              | 74 ++++++++++++++++++++------
 deployment_scripts/start-ray-worker.sh |  8 +--
 deployment_scripts/submit-ray-job.pbs  | 50 +++++++++++++++++
 deployment_scripts/submit-ray-job.sh   | 59 --------------------
 src/monte-carlo-pi.py                  | 71 ++++++++++++++----------
 5 files changed, 152 insertions(+), 110 deletions(-)
 create mode 100644 deployment_scripts/submit-ray-job.pbs
 delete mode 100644 deployment_scripts/submit-ray-job.sh

diff --git a/README.md b/README.md
index 6bb972a..fd58623 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ This guide shows you how to launch a Ray cluster on HLRS' Hawk system.
 - [Table of Contents](#table-of-contents)
 - [Prerequisites](#prerequisites)
 - [Getting Started](#getting-started)
-  - [Launch a Ray Cluster in Interactive Mode](#launch-a-ray-cluster-in-interactive-mode)
+  - [Launch a local Ray Cluster in Interactive Mode](#launch-a-local-ray-cluster-in-interactive-mode)
   - [Launch a Ray Cluster in Batch Mode](#launch-a-ray-cluster-in-batch-mode)
 
 ## Prerequisites
@@ -23,13 +23,13 @@ For more information, look at the documentation for [Conda on HLRS HPC systems](
 
 Only the main and r channels are available using the conda module on the clusters. To use custom packages, we need to move the local conda environment to Hawk.
 
-1. Clone this repository to your local machine:
+**Step 1.** Clone this repository to your local machine:
 
 ```bash
 git clone 
 ```
 
-2. Go into the directory and create an environment using Conda and environment.yaml.
+**Step 2.** Go into the directory and create an environment using Conda and environment.yaml.
 
 Note: Be sure to add the necessary packages in `deployment_scripts/environment.yaml`:
 
@@ -38,7 +38,7 @@ cd deployment_scripts
 ./create-env.sh
 ```
 
-3. Package the environment and transfer the archive to the target system:
+**Step 3.** Package the environment and transfer the archive to the target system:
 
 ```bash
 (base) $ conda pack -n -o ray_env.tar.gz # conda-pack must be installed in the base environment
@@ -58,49 +58,76 @@ scp ray_env.tar.gz @hawk.hww.hlrs.de:
 rm ray_env.tar.gz # We don't need the archive locally anymore.
 ```
 
-4. Clone the repository on Hawk to use the deployment scripts and project structure:
+**Step 4.** Clone the repository on Hawk to use the deployment scripts and project structure:
 
 ```bash
 cd 
 git clone 
 ```
 
-## Launch a Ray Cluster in Interactive Mode
+## Launch a local Ray Cluster in Interactive Mode
 
 Using a single node interactively provides opportunities for faster code debugging.
 
-1. On the Hawk login node, start an interactive job using:
+**Step 1.** On the Hawk login node, start an interactive job using:
 
 ```bash
 qsub -I -l select=1:node_type=rome -l walltime=01:00:00
 ```
 
-2. Go into the directory with all code:
+**Step 2.** Go into the project directory:
 
 ```bash
 cd /deployment_scripts
 ```
 
-3. Deploy the conda environment to the ram disk:
+**Step 3.** Deploy the conda environment to the ram disk:
+
+Change the following line by editing `deploy-env.sh`:
+
+```bash
+export WS_DIR=
+```
+
+Then, use the following command to deploy and activate the environment:
 
 ```bash
 source deploy-env.sh
 ```
 
 Note: Make sure all permissions are set using `chmod +x`.
 
-4. Initialize the Ray cluster.
+**Step 4.** Initialize the Ray cluster.
 
-You can use a Python interpreter to start a Ray cluster:
+You can use a Python interpreter to start a local Ray cluster:
 
 ```python
 import ray
-ray.init(dashboard_host='127.0.0.1')
+ray.init()
 ```
 
-1. Connect to the dashboard.
+**Step 5.** Connect to the dashboard.
 
-Warning: Always use `127.0.0.1` as the dashboard host to make the Ray cluster reachable by only you.
+Warning: Do not change the default dashboard host `127.0.0.1`; this keeps the Ray cluster reachable only by you.
+
+Note: We recommend using a dedicated Firefox profile for accessing web-based services on HLRS Compute Platforms. If you haven't created a profile, check out our [guide](https://kb.hlrs.de/platforms/index.php/How_to_use_Web_Based_Services_on_HLRS_Compute_Platforms).
+
+You need the job id and the hostname for your current job. You can obtain this information on the login node using:
+
+```bash
+qstat -anw # get the job id and the hostname
+```
+
+Then, on your local computer,
+
+```bash
+export PBS_JOBID= # e.g., 2316419.hawk-pbs5
+ssh # e.g., r38c3t8n3
+```
+
+If this doesn't work, check your SSH configuration from the first step.
+
+Then, launch the Firefox web browser using the configured profile and open `localhost:8265` to access the Ray dashboard.
 
 ## Launch a Ray Cluster in Batch Mode
 
@@ -108,11 +135,24 @@ Warning: Always use `127.0.0.1` as the dashboard host to make the Ray cluster re
 ```bash
 cd deployment_scripts
-chmod +x ray-start-worker.sh
+chmod +x start-ray-worker.sh
 ```
 
 2. Submit a job to launch the head and worker nodes.
 
-You must modify the following variables in `submit-ray-job.sh`:
+You must modify the following lines in `submit-ray-job.pbs`:
 
 - Line 3 changes the cluster size. The default configuration launches a 3 node cluster.
-- `$PROJECT_DIR`
+- `export WS_DIR=` - set the correct workspace directory.
+- `export PROJECT_DIR=$WS_DIR/` - set the correct project directory.
+
+Note: The job script `src/monte-carlo-pi.py` waits for all nodes in the Ray cluster to become available. Preserve this pattern in your Python code when using a multi-node Ray cluster.
+
+Launch the job and monitor its progress. As the job starts, its status (S) shifts from Q (Queued) to R (Running). Upon completion, the job will no longer appear in the `qstat -a` display.
+
+```bash
+qsub submit-ray-job.pbs
+qstat -anw # Q: Queued, R: Running, E: Ending
+ls -l # list files after the job finishes
+cat ray-job.o... # inspect the output file
+cat ray-job.e... # inspect the error file
+```
\ No newline at end of file
diff --git a/deployment_scripts/start-ray-worker.sh b/deployment_scripts/start-ray-worker.sh
index 04242fd..0726fb0 100644
--- a/deployment_scripts/start-ray-worker.sh
+++ b/deployment_scripts/start-ray-worker.sh
@@ -11,11 +11,7 @@ export RAY_ADDRESS=$3
 export REDIS_PASSWORD=$4
 export OBJECT_STORE_MEMORY=$5
 
-# printenv | grep 'RAY_ADDRESS\|REDIS_PASSWORD'
-
-# module load system/nvidia/ALL.ALL.525.125.06
-
-export ENV_PATH=/run/user/$PBS_JOBID/ray_env
+export ENV_PATH=/run/user/$PBS_JOBID/ray_env # We use the ram disk to extract the environment packages since a large number of files decreases the performance of the parallel file system.
 
 mkdir -p $ENV_PATH
 tar -xzf $WS_DIR/$ENV_ARCHIVE -C $ENV_PATH
@@ -27,4 +23,4 @@ ray start --address=$RAY_ADDRESS \
     --object-store-memory=$OBJECT_STORE_MEMORY \
     --block
 
-rm -rf $ENV_PATH
\ No newline at end of file
+rm -rf $ENV_PATH # It's nice to clean up before you terminate the job
\ No newline at end of file
diff --git a/deployment_scripts/submit-ray-job.pbs b/deployment_scripts/submit-ray-job.pbs
new file mode 100644
index 0000000..3f12671
--- /dev/null
+++ b/deployment_scripts/submit-ray-job.pbs
@@ -0,0 +1,50 @@
+#!/bin/bash
+#PBS -N ray-job
+#PBS -l select=2:node_type=rome-ai
+#PBS -l walltime=1:00:00
+
+export WS_DIR=
+export PROJECT_DIR=$WS_DIR/
+export JOB_SCRIPT=monte-carlo-pi.py
+
+export ENV_ARCHIVE=ray_env.tar.gz
+
+export OBJECT_STORE_MEMORY=128000000000
+
+# Environment variables after this line should not change
+
+export SRC_DIR=$PROJECT_DIR/src
+export PYTHON_FILE=$SRC_DIR/$JOB_SCRIPT
+export DEPLOYMENT_SCRIPTS=$PROJECT_DIR/deployment_scripts
+export ENV_PATH=/run/user/$PBS_JOBID/ray_env # We use the ram disk to extract the environment packages since a large number of files decreases the performance of the parallel file system.
+
+mkdir -p $ENV_PATH
+tar -xzf $WS_DIR/$ENV_ARCHIVE -C $ENV_PATH # This line extracts the packages to ram disk.
+source $ENV_PATH/bin/activate
+
+export IP_ADDRESS=`ip addr show ib0 | grep -oP '(?<=inet\s)\d+(\.\d+){3}' | awk '{print $1}'`
+
+export RAY_ADDRESS=$IP_ADDRESS:6379
+export REDIS_PASSWORD=$(openssl rand -base64 32)
+
+export NCCL_DEBUG=INFO
+
+ray start --disable-usage-stats \
+    --head \
+    --node-ip-address=$IP_ADDRESS \
+    --port=6379 \
+    --dashboard-host=127.0.0.1 \
+    --redis-password=$REDIS_PASSWORD \
+    --object-store-memory=$OBJECT_STORE_MEMORY
+
+export NUM_NODES=$(sort $PBS_NODEFILE |uniq | wc -l)
+
+for ((i=1;i<$NUM_NODES;i++)); do
+    pbsdsh -n $i -- bash -l -c "'$DEPLOYMENT_SCRIPTS/start-ray-worker.sh' '$WS_DIR' '$ENV_ARCHIVE' '$RAY_ADDRESS' '$REDIS_PASSWORD' '$OBJECT_STORE_MEMORY'" &
+done
+
+python3 $PYTHON_FILE
+
+ray stop --grace-period 30
+
+rm -rf $ENV_PATH # It's nice to clean up before you terminate the job.
\ No newline at end of file
diff --git a/deployment_scripts/submit-ray-job.sh b/deployment_scripts/submit-ray-job.sh
deleted file mode 100644
index 97d0487..0000000
--- a/deployment_scripts/submit-ray-job.sh
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/bin/bash
-#PBS -N output-ray-job
-#PBS -l select=2:node_type=rome-ai
-#PBS -l walltime=1:00:00
-
-export JOB_SCRIPT=modeling_evaluation.py
-
-export WS_DIR=/lustre/hpe/ws10/ws10.3/ws/hpckkaya-ifu
-export ENV_ARCHIVE=ray-environment-v0.3.tar.gz
-
-export SRC_DIR=$WS_DIR/ifu/src/ray-workflow
-export DATA_DIR=/lustre/hpe/ws10/ws10.3/ws/hpckkaya-ifu-data/hpclzhon-ifu_data-1668830707
-export RESULTS_DIR=$WS_DIR/ray_results
-
-export NCCL_DEBUG=INFO
-
-# Environment variables after this line should not change
-
-export PYTHON_FILE=$SRC_DIR/$JOB_SCRIPT
-export ENV_PATH=/run/user/$PBS_JOBID/ray_env
-
-mkdir -p $ENV_PATH
-tar -xzf $WS_DIR/$ENV_ARCHIVE -C $ENV_PATH
-source $ENV_PATH/bin/activate
-conda-unpack
-
-export IP_ADDRESS=`ip addr show ib0 | grep -oP '(?<=inet\s)\d+(\.\d+){3}' | awk '{print $1}'`
-
-export RAY_ADDRESS=$IP_ADDRESS:6379
-export REDIS_PASSWORD=$(uuidgen)
-
-# export RAY_scheduler_spread_threshold=0.0
-export OBJECT_STORE_MEMORY=128000000000
-
-ray start --disable-usage-stats \
-    --head \
-    --node-ip-address=$IP_ADDRESS \
-    --port=6379 \
-    --dashboard-host=127.0.0.1 \
-    --redis-password=$REDIS_PASSWORD \
-    --object-store-memory=$OBJECT_STORE_MEMORY
-
-export NUM_NODES=$(sort $PBS_NODEFILE |uniq | wc -l)
-
-for ((i=1;i<$NUM_NODES;i++)); do
-    pbsdsh -n $i -- bash -l -c "'$SRC_DIR/ray-start-worker.sh' '$WS_DIR' '$ENV_ARCHIVE' '$RAY_ADDRESS' '$REDIS_PASSWORD' '$OBJECT_STORE_MEMORY'" &
-done
-
-# uncomment if you don't already control inside the code
-# if [[ $NUM_NODES -gt 1 ]]
-# then
-    # sleep 90
-#fi
-
-python3 $PYTHON_FILE
-
-ray stop
-
-rm -rf $ENV_PATH
\ No newline at end of file
diff --git a/src/monte-carlo-pi.py b/src/monte-carlo-pi.py
index 79e1f19..da124b1 100644
--- a/src/monte-carlo-pi.py
+++ b/src/monte-carlo-pi.py
@@ -6,12 +6,11 @@ import time
 import random
 import os
 
-ray.init(address="auto", _node_ip_address=os.environ["IP_ADDRESS"], _redis_password=os.environ["REDIS_PASSWORD"])
-
-cluster_resources = ray.available_resources()
-available_cpu_cores = cluster_resources.get('CPU', 0)
-print(cluster_resources)
-
+# Change this to match your cluster scale.
+NUM_SAMPLING_TASKS = 100
+NUM_SAMPLES_PER_TASK = 10_000_000
+TOTAL_NUM_SAMPLES = NUM_SAMPLING_TASKS * NUM_SAMPLES_PER_TASK
+
 @ray.remote
 class ProgressActor:
     def __init__(self, total_num_samples: int):
@@ -44,31 +43,47 @@ def sampling_task(num_samples: int, task_id: int,
     progress_actor.report_progress.remote(task_id, num_samples)
     return num_inside
 
-# Change this to match your cluster scale.
-NUM_SAMPLING_TASKS = 100
-NUM_SAMPLES_PER_TASK = 10_000_000
-TOTAL_NUM_SAMPLES = NUM_SAMPLING_TASKS * NUM_SAMPLES_PER_TASK
+def wait_for_nodes(expected_num_nodes: int):
+    while True:
+        num_nodes = len(ray.nodes())
+        if num_nodes >= expected_num_nodes:
+            break
+        print(f'Currently {num_nodes} nodes connected. Waiting for more...')
+        time.sleep(5) # wait for 5 seconds before checking again
+
+if __name__ == "__main__":
 
-# Create the progress actor.
-progress_actor = ProgressActor.remote(TOTAL_NUM_SAMPLES)
+    num_nodes = int(os.environ["NUM_NODES"])
+    assert num_nodes > 1, "If the environment variable NUM_NODES is set, it should be greater than 1."
 
-# Create and execute all sampling tasks in parallel.
-results = [
-    sampling_task.remote(NUM_SAMPLES_PER_TASK, i, progress_actor)
-    for i in range(NUM_SAMPLING_TASKS)
-]
+    redis_password = os.environ["REDIS_PASSWORD"]
+    ray.init(address="auto", _redis_password=redis_password)
 
-# Query progress periodically.
-while True:
-    progress = ray.get(progress_actor.get_progress.remote())
-    print(f"Progress: {int(progress * 100)}%")
+    wait_for_nodes(num_nodes)
 
-    if progress == 1:
-        break
+    cluster_resources = ray.available_resources()
+    print(cluster_resources)
 
-    time.sleep(1)
+    # Create the progress actor.
+    progress_actor = ProgressActor.remote(TOTAL_NUM_SAMPLES)
 
-# Get all the sampling tasks results.
-total_num_inside = sum(ray.get(results))
-pi = (total_num_inside * 4) / TOTAL_NUM_SAMPLES
-print(f"Estimated value of π is: {pi}")
+    # Create and execute all sampling tasks in parallel.
+    results = [
+        sampling_task.remote(NUM_SAMPLES_PER_TASK, i, progress_actor)
+        for i in range(NUM_SAMPLING_TASKS)
+    ]
+
+    # Query progress periodically.
+    while True:
+        progress = ray.get(progress_actor.get_progress.remote())
+        print(f"Progress: {int(progress * 100)}%")
+
+        if progress == 1:
+            break
+
+        time.sleep(1)
+
+    # Get all the sampling tasks results.
+    total_num_inside = sum(ray.get(results))
+    pi = (total_num_inside * 4) / TOTAL_NUM_SAMPLES
+    print(f"Estimated value of π is: {pi}")
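The README note above asks you to preserve the wait-for-nodes pattern in your own job scripts. The sketch below distills that pattern from `src/monte-carlo-pi.py` into a minimal template; the file name `my-job-script.py` is only illustrative, and it assumes the `NUM_NODES` and `REDIS_PASSWORD` environment variables exported by `submit-ray-job.pbs`.

```python
# my-job-script.py -- minimal sketch of the multi-node startup pattern (hypothetical file name).
import os
import time

import ray


def wait_for_nodes(expected_num_nodes: int):
    # Block until every node requested from PBS has joined the Ray cluster.
    while len(ray.nodes()) < expected_num_nodes:
        print(f"Currently {len(ray.nodes())} nodes connected. Waiting for more...")
        time.sleep(5)


if __name__ == "__main__":
    # NUM_NODES and REDIS_PASSWORD are exported by submit-ray-job.pbs.
    num_nodes = int(os.environ["NUM_NODES"])

    # Attach to the cluster started by the job script instead of launching a new one.
    ray.init(address="auto", _redis_password=os.environ["REDIS_PASSWORD"])
    wait_for_nodes(num_nodes)

    # ...schedule your own tasks and actors here, as in src/monte-carlo-pi.py...
```

To run such a script instead of the example, point `JOB_SCRIPT` in `submit-ray-job.pbs` at the new file.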