2016-06-14 15:42:28 -07:00
|
|
|
# This script can be used to start Ray on an existing cluster.
#
# How to use it: Create a file "nodes.txt" that contains a list of the IP
# addresses of the nodes in the cluster, one address per line. Put the head
# node first. This node will host the driver and the scheduler.
|
|
|
|
|
|
|
|
import os
|
|
|
|
import subprocess
|
|
|
|
import socket
|
|
|
|
import argparse
|
|
|
|
import threading
|
|
|
|
import IPython
|
|
|
|
|
|
|
|
# Command-line interface: every option is required.
parser = argparse.ArgumentParser(description="Parse information about the cluster.")
# Path to the node list; the first address in the file is used as the head node.
parser.add_argument(
    "--nodes",
    type=str,
    required=True,
    help="Text file with node IP addresses, one line per address.",
)
# Private key handed to ssh with -i.
parser.add_argument(
    "--key-file",
    type=str,
    required=True,
    help="Path to the file that contains the private key.",
)
# Account used for the ssh logins.
parser.add_argument(
    "--username",
    type=str,
    required=True,
    help="User name for logging in.",
)
# Remote directory under which the "ray" checkout lives.
parser.add_argument(
    "--installation-directory",
    type=str,
    required=True,
    help="The directory in which to install Ray.",
)
|
|
|
|
|
|
|
|
def run_command_over_ssh(node_ip_address, username, key_file, command):
    """Run a shell command on a remote node through ssh and report completion.

    Host key checking is disabled for the connection
    (-o StrictHostKeyChecking=no).

    Args:
        node_ip_address: Address of the node to connect to.
        username: User name to log in with.
        key_file: Path to the private key passed to ssh with -i.
        command: Shell command text to execute on the remote node.

    Returns:
        The exit status of the local ssh process.
    """
    # Hand ssh an argument vector instead of a string parsed by a local shell,
    # so a key path containing spaces or a command containing single quotes
    # reaches ssh intact. The remote shell still interprets `command`.
    ssh_argv = [
        "ssh",
        "-o", "StrictHostKeyChecking=no",
        # The local shell used to expand a leading "~"; keep that behavior.
        "-i", os.path.expanduser(key_file),
        "{}@{}".format(username, node_ip_address),
        command,
    ]
    exit_status = subprocess.call(ssh_argv)
    # Printed whether or not the remote command succeeded; callers that care
    # can inspect the returned exit status.
    print("Finished running command '{}' on {}@{}.".format(command, username, node_ip_address))
    return exit_status
|
|
|
|
|
|
|
|
def install_ray_multi_node(node_ip_addresses, username, key_file, installation_directory):
    """Install and build Ray on every node, one ssh session per node in parallel.

    On each node the remote command updates apt, installs git, creates
    installation_directory, clones the Ray repository into it and then runs
    ./setup.sh and ./build.sh inside the checkout.

    Args:
        node_ip_addresses: Addresses of the nodes to install on.
        username: User name for the ssh logins.
        key_file: Path to the private key used by ssh.
        installation_directory: Remote directory that will contain the
            "ray" checkout.
    """
    # The command depends only on the installation directory, so a single
    # copy is shared by all per-node threads.
    install_ray_command = """
      sudo apt-get update &&
      sudo apt-get -y install git &&
      mkdir -p "{}" &&
      cd "{}" &&
      git clone "https://github.com/amplab/ray";
      cd ray;
      ./setup.sh;
      ./build.sh
    """.format(installation_directory, installation_directory)

    workers = [
        threading.Thread(
            target=run_command_over_ssh,
            args=(address, username, key_file, install_ray_command),
        )
        for address in node_ip_addresses
    ]
    for worker in workers:
        worker.start()
    # Block until every node has finished its install.
    for worker in workers:
        worker.join()
|
|
|
|
|
|
|
|
def start_ray_multi_node(node_ip_addresses, username, key_file, worker_path, installation_directory):
    """Start the scheduler on the head node and workers on every node.

    The first entry of node_ip_addresses is the head node: the scheduler is
    launched there on port 10001, and every node (the head included) then
    runs start_workers.py pointed at that scheduler. Finally the command for
    attaching a shell on the head node is printed.

    Args:
        node_ip_addresses: Node addresses, head node first.
        username: User name for the ssh logins.
        key_file: Path to the private key used by ssh.
        worker_path: Value forwarded to start_workers.py as --worker-path.
        installation_directory: Remote directory containing the "ray"
            checkout.
    """
    head_ip_address = node_ip_addresses[0]

    build_directory = os.path.join(installation_directory, "ray/build")
    start_scheduler_command = """
      cd "{}";
      nohup ./scheduler {}:10001 > scheduler.out 2> scheduler.err < /dev/null &
    """.format(build_directory, head_ip_address)
    run_command_over_ssh(head_ip_address, username, key_file, start_scheduler_command)

    # Same path on every node, so compute it once outside the loop.
    scripts_directory = os.path.join(installation_directory, "ray/scripts")
    for node_ip_address in node_ip_addresses:
        start_workers_command = """
          cd "{}";
          source ../setup-env.sh;
          python start_workers.py --scheduler-address={}:10001 --node-ip={} --worker-path="{}" > start_workers.out 2> start_workers.err < /dev/null &
        """.format(scripts_directory, head_ip_address, node_ip_address, worker_path)
        run_command_over_ssh(node_ip_address, username, key_file, start_workers_command)

    print("cluster started; you can start the shell on the head node with:")
    # Use the installation_directory argument rather than the script-level
    # `args` global, which only exists when this file is run as __main__.
    setup_env_path = os.path.join(installation_directory, "ray/setup-env.sh")
    shell_script_path = os.path.join(installation_directory, "ray/scripts/shell.py")
    print("""
      source "{}";
      python "{}" --scheduler-address={}:10001 --objstore-address={}:20001 --worker-address={}:30001 --attach
    """.format(setup_env_path, shell_script_path, head_ip_address, head_ip_address, head_ip_address))
|
2016-06-14 15:42:28 -07:00
|
|
|
|
2016-06-21 18:17:38 -07:00
|
|
|
def stop_ray_multi_node(node_ip_addresses, username, key_file):
    """Kill the scheduler, objstore and python processes on each node in turn.

    NOTE: the remote command is `killall ... python`, so it targets every
    process named python on the node, not only ones started by this script.

    Args:
        node_ip_addresses: Addresses of the nodes to stop.
        username: User name for the ssh logins.
        key_file: Path to the private key used by ssh.
    """
    # killall's own output is discarded on the remote side.
    kill_cluster_command = "killall scheduler objstore python > /dev/null 2> /dev/null"
    for address in node_ip_addresses:
        run_command_over_ssh(address, username, key_file, kill_cluster_command)
|
|
|
|
|
2016-06-21 18:17:38 -07:00
|
|
|
def update_ray_multi_node(node_ip_addresses, username, key_file, installation_directory):
    """Reset each node's Ray checkout to its upstream branch and rebuild it.

    Nodes are processed one after another. On each node the remote command
    fetches, hard-resets to the tracked upstream, cleans the build directory
    (removing it if `make clean` fails) and runs ./build.sh.

    Args:
        node_ip_addresses: Addresses of the nodes to update.
        username: User name for the ssh logins.
        key_file: Path to the private key used by ssh.
        installation_directory: Remote directory containing the "ray"
            checkout.
    """
    checkout_directory = os.path.join(installation_directory, "ray")
    # The doubled braces are str.format escapes; the remote shell receives
    # the literal git revision "@{upstream}".
    update_cluster_command = """
      cd "{}" &&
      git fetch &&
      git reset --hard "@{{upstream}}" -- &&
      (make -C "./build" clean || rm -rf "./build") &&
      ./build.sh
    """.format(checkout_directory)
    for address in node_ip_addresses:
        run_command_over_ssh(address, username, key_file, update_cluster_command)
|
|
|
|
|
2016-06-14 15:42:28 -07:00
|
|
|
def is_valid_ip(ip_address):
    """Return True if socket.inet_aton accepts ip_address as an IPv4 address.

    Args:
        ip_address: Candidate address string.

    Returns:
        True when the string parses, False when inet_aton raises
        socket.error.
    """
    try:
        socket.inet_aton(ip_address)
    except socket.error:
        return False
    else:
        return True
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Parse the command line, load the node list and drop into an IPython
    # shell in which the helper functions below are available.
    args = parser.parse_args()
    username = args.username
    key_file = args.key_file
    installation_directory = args.installation_directory

    # Read one address per line. The context manager closes the file, and a
    # real list (not a one-shot iterator) lets the addresses be iterated
    # again after the validation loop below.
    with open(args.nodes) as nodes_file:
        node_ip_addresses = [line.strip() for line in nodes_file]

    # Invalid entries only produce a warning; they stay in the list.
    for index, node_ip_address in enumerate(node_ip_addresses):
        if not is_valid_ip(node_ip_address):
            print("\nWARNING: The string '{}' from line {} in the file {} is not a valid IP address.\n".format(node_ip_address, index + 1, args.nodes))

    # Convenience wrappers for the interactive shell: each one fills in the
    # credentials and installation directory taken from the command line.
    def install_ray(node_ip_addresses):
        install_ray_multi_node(node_ip_addresses, username, key_file, installation_directory)

    def start_ray(node_ip_addresses, worker_path):
        start_ray_multi_node(node_ip_addresses, username, key_file, worker_path, installation_directory)

    def stop_ray(node_ip_addresses):
        stop_ray_multi_node(node_ip_addresses, username, key_file)

    def update_ray(node_ip_addresses):
        update_ray_multi_node(node_ip_addresses, username, key_file, installation_directory)

    IPython.embed()
|