from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import psutil
import random
import redis
import signal
import socket
import string
import subprocess
import sys
import threading
import time
from collections import namedtuple, OrderedDict

# Ray modules
import photon
import plasma
import global_scheduler

PROCESS_TYPE_WORKER = "worker"
PROCESS_TYPE_LOCAL_SCHEDULER = "local_scheduler"
PROCESS_TYPE_PLASMA_MANAGER = "plasma_manager"
PROCESS_TYPE_PLASMA_STORE = "plasma_store"
PROCESS_TYPE_GLOBAL_SCHEDULER = "global_scheduler"
PROCESS_TYPE_REDIS_SERVER = "redis_server"

# This is a dictionary tracking all of the processes of different types that
# have been started by this services module. Note that the order of the keys
# is important because it determines the order in which these processes will
# be terminated when Ray exits, and certain orders will cause errors to be
# logged to the screen.
all_processes = OrderedDict([(PROCESS_TYPE_WORKER, []),
                             (PROCESS_TYPE_LOCAL_SCHEDULER, []),
                             (PROCESS_TYPE_PLASMA_MANAGER, []),
                             (PROCESS_TYPE_PLASMA_STORE, []),
                             (PROCESS_TYPE_GLOBAL_SCHEDULER, []),
                             (PROCESS_TYPE_REDIS_SERVER, [])])

# True if processes are run in the valgrind profiler.
RUN_PHOTON_PROFILER = False
RUN_PLASMA_MANAGER_PROFILER = False
RUN_PLASMA_STORE_PROFILER = False

# ObjectStoreAddress tuples contain all information necessary to connect to an
# object store. The fields are:
# - name: The socket name for the object store
# - manager_name: The socket name for the object store manager
# - manager_port: The Internet port that the object store manager listens on
ObjectStoreAddress = namedtuple("ObjectStoreAddress", ["name",
                                                       "manager_name",
                                                       "manager_port"])

def address(ip_address, port):
  return ip_address + ":" + str(port)

def get_ip_address(address):
  try:
    ip_address = address.split(":")[0]
  except Exception:
    raise Exception("Unable to parse IP address from address "
                    "{}".format(address))
  return ip_address

def get_port(address):
  try:
    port = int(address.split(":")[1])
  except Exception:
    raise Exception("Unable to parse port from address {}".format(address))
  return port

def new_port():
  return random.randint(10000, 65535)

def random_name():
  return str(random.randint(0, 99999999))

def kill_process(p):
  """Kill a process.

  Args:
    p: The process to kill.

  Returns:
    True if the process was killed successfully and False otherwise.
  """
  if p.poll() is not None:
    # The process has already terminated.
    return True
  if (RUN_PHOTON_PROFILER or RUN_PLASMA_MANAGER_PROFILER or
      RUN_PLASMA_STORE_PROFILER):
    # Give the process a signal so that it writes out its profiler data.
    os.kill(p.pid, signal.SIGINT)
    # Wait for the profiling data to be written.
    time.sleep(0.1)

  # Allow the process one second to exit gracefully. Note that Popen.wait()
  # takes no timeout argument in Python 2, so we arm a timer that force kills
  # the process if it has not exited by the time the timer fires.
  p.terminate()
  timer = threading.Timer(1, lambda p: p.kill(), [p])
  try:
    timer.start()
    p.wait()
  finally:
    timer.cancel()

  if p.poll() is not None:
    return True

  # If the process did not exit within one second, force kill it.
  p.kill()
  if p.poll() is not None:
    return True

  # The process was not killed for some reason.
  return False

def cleanup():
  """When running in local mode, shutdown the Ray processes.

  This method is used to shutdown processes that were started with
  services.start_ray_head(). It kills all scheduler, object store, and worker
  processes that were started by this services module. Driver processes are
  started and disconnected by worker.py.
  """
  successfully_shut_down = True
  # Terminate the processes in the reverse of the order in which they were
  # started (this is the order of the keys in all_processes, workers first).
  for process_type in all_processes.keys():
    # Kill all of the processes of a certain type.
    for p in all_processes[process_type]:
      success = kill_process(p)
      successfully_shut_down = successfully_shut_down and success
    # Reset the list of processes of this type.
    all_processes[process_type] = []

  if not successfully_shut_down:
    print("Ray did not shut down properly.")

def all_processes_alive(exclude=[]):
  """Check if all of the processes are still alive.

  Args:
    exclude: Don't check the processes whose types are in this list.
  """
  for process_type, processes in all_processes.items():
    # Note that p.poll() returns the exit code that the process exited with,
    # so an exit code of None indicates that the process is still alive.
    if (process_type not in exclude and
        not all([p.poll() is None for p in processes])):
      return False
  return True

def get_node_ip_address(address="8.8.8.8:53"):
  """Determine the IP address of the local node.

  Args:
    address (str): The IP address and port of any known live service on the
      network you care about.

  Returns:
    The IP address of the current node.
  """
  ip_address, port = address.split(":")
  s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
  # Connecting a UDP socket does not send any packets; it just causes the OS
  # to choose the local interface (and hence the IP address) that would be
  # used to reach the given address.
  s.connect((ip_address, int(port)))
  return s.getsockname()[0]
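
# Usage sketch: with the default address, this routes a UDP socket toward
# 8.8.8.8:53 and reports the local interface's address (the value below is
# hypothetical):
#
#   get_node_ip_address()  # -> "192.168.1.42"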

def wait_for_redis_to_start(redis_ip_address, redis_port, num_retries=5):
  """Wait for a Redis server to be available.

  This is accomplished by creating a Redis client and sending a random command
  to the server until the command gets through.

  Args:
    redis_ip_address (str): The IP address of the redis server.
    redis_port (int): The port of the redis server.
    num_retries (int): The number of times to try connecting with redis. The
      client will sleep for one second between attempts.

  Raises:
    Exception: An exception is raised if we could not connect with Redis.
  """
  redis_client = redis.StrictRedis(host=redis_ip_address, port=redis_port)
  # Wait for the Redis server to start.
  counter = 0
  while counter < num_retries:
    try:
      # Run some random command and see if it worked.
      print("Waiting for redis server at {}:{} to respond..."
            .format(redis_ip_address, redis_port))
      redis_client.client_list()
    except redis.ConnectionError:
      # Wait a little bit.
      time.sleep(1)
      print("Failed to connect to the redis server, retrying.")
      counter += 1
    else:
      break
  if counter == num_retries:
    raise Exception("Unable to connect to Redis. If the Redis instance is on "
                    "a different machine, check that your firewall is "
                    "configured properly.")

def start_redis(port=None, num_retries=20, cleanup=True, redirect_output=False):
  """Start a Redis server.

  Args:
    port (int): If provided, start a Redis server with this port.
    num_retries (int): The number of times to attempt to start Redis.
    cleanup (bool): True if using Ray in local mode. If cleanup is True, then
      this process will be killed by services.cleanup() when the Python
      process that imported services exits.
    redirect_output (bool): True if stdout and stderr should be redirected to
      /dev/null.

  Returns:
    The port used by Redis. If a port is passed in, then the same value is
      returned.

  Raises:
    Exception: An exception is raised if Redis could not be started.
  """
  redis_filepath = os.path.join(
      os.path.dirname(os.path.abspath(__file__)),
      "../core/src/common/thirdparty/redis/src/redis-server")
  redis_module = os.path.join(
      os.path.dirname(os.path.abspath(__file__)),
      "../core/src/common/redis_module/libray_redis_module.so")
  assert os.path.isfile(redis_filepath)
  assert os.path.isfile(redis_module)
  counter = 0
  if port is not None:
    if num_retries != 1:
      raise Exception("num_retries must be 1 if port is specified.")
  else:
    port = new_port()
  while counter < num_retries:
    if counter > 0:
      print("Redis failed to start, retrying now.")
    with open(os.devnull, "w") as FNULL:
      stdout = FNULL if redirect_output else None
      stderr = FNULL if redirect_output else None
      p = subprocess.Popen([redis_filepath,
                            "--port", str(port),
                            "--loglevel", "warning",
                            "--loadmodule", redis_module],
                           stdout=stdout, stderr=stderr)
    time.sleep(0.1)
    # Check if Redis successfully started (or at least if the executable did
    # not exit within 0.1 seconds).
    if p.poll() is None:
      if cleanup:
        all_processes[PROCESS_TYPE_REDIS_SERVER].append(p)
      break
    port = new_port()
    counter += 1
  if counter == num_retries:
    raise Exception("Couldn't start Redis.")

  # Create a Redis client just for configuring Redis.
  redis_client = redis.StrictRedis(host="127.0.0.1", port=port)
  # Wait for the Redis server to start.
  wait_for_redis_to_start("127.0.0.1", port)
  # Configure Redis to generate keyspace notifications. TODO(rkn): Change this
  # to only generate notifications for the export keys.
  redis_client.config_set("notify-keyspace-events", "Kl")
  # Configure Redis to not run in protected mode so that processes on other
  # hosts can connect to it. TODO(rkn): Do this in a more secure way.
  redis_client.config_set("protected-mode", "no")
  # Put a timestamp in Redis to indicate when it was started.
  redis_client.set("redis_start_time", time.time())
  return port
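
# A sketch of how start_redis composes with the address helpers above (the
# resulting address string depends on the machine this runs on):
#
#   redis_port = start_redis(cleanup=False)
#   redis_address = address(get_node_ip_address(), redis_port)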

def start_global_scheduler(redis_address, cleanup=True, redirect_output=False):
  """Start a global scheduler process.

  Args:
    redis_address (str): The address of the Redis instance.
    cleanup (bool): True if using Ray in local mode. If cleanup is True, then
      this process will be killed by services.cleanup() when the Python
      process that imported services exits.
    redirect_output (bool): True if stdout and stderr should be redirected to
      /dev/null.
  """
  p = global_scheduler.start_global_scheduler(redis_address,
                                              redirect_output=redirect_output)
  if cleanup:
    all_processes[PROCESS_TYPE_GLOBAL_SCHEDULER].append(p)

def start_local_scheduler(redis_address,
                          node_ip_address,
                          plasma_store_name,
                          plasma_manager_name,
                          worker_path,
                          plasma_address=None,
                          cleanup=True,
                          redirect_output=False,
                          static_resource_list=None,
                          num_workers=0):
  """Start a local scheduler process.

  Args:
    redis_address (str): The address of the Redis instance.
    node_ip_address (str): The IP address of the node that this local
      scheduler is running on.
    plasma_store_name (str): The name of the plasma store socket to connect
      to.
    plasma_manager_name (str): The name of the plasma manager socket to
      connect to.
    worker_path (str): The path of the script to use when the local scheduler
      starts up new workers.
    plasma_address (str): The address of the plasma manager to connect to, of
      the form ip_address:port.
    cleanup (bool): True if using Ray in local mode. If cleanup is True, then
      this process will be killed by services.cleanup() when the Python
      process that imported services exits.
    redirect_output (bool): True if stdout and stderr should be redirected to
      /dev/null.
    static_resource_list (list): An ordered list of the configured resource
      capacities for this local scheduler.
    num_workers (int): The number of workers that the local scheduler should
      start.

  Return:
    The name of the local scheduler socket.
  """
  local_scheduler_name, p = photon.start_local_scheduler(
      plasma_store_name,
      plasma_manager_name,
      worker_path=worker_path,
      node_ip_address=node_ip_address,
      redis_address=redis_address,
      plasma_address=plasma_address,
      use_profiler=RUN_PHOTON_PROFILER,
      redirect_output=redirect_output,
      static_resource_list=static_resource_list,
      num_workers=num_workers)
  if cleanup:
    all_processes[PROCESS_TYPE_LOCAL_SCHEDULER].append(p)
  return local_scheduler_name

def start_objstore(node_ip_address, redis_address, object_manager_port=None,
                   cleanup=True, redirect_output=False, objstore_memory=None):
  """This method starts an object store process.

  Args:
    node_ip_address (str): The IP address of the node running the object
      store.
    redis_address (str): The address of the Redis instance to connect to.
    object_manager_port (int): The port to use for the object manager. If
      this is not provided, one will be generated randomly.
    cleanup (bool): True if using Ray in local mode. If cleanup is True, then
      this process will be killed by services.cleanup() when the Python
      process that imported services exits.
    redirect_output (bool): True if stdout and stderr should be redirected to
      /dev/null.
    objstore_memory (int): The amount of memory (in bytes) to start the
      object store with. If this is not provided, a fraction of the system
      memory will be used.

  Return:
    A tuple of the Plasma store socket name, the Plasma manager socket name,
      and the plasma manager port.
  """
  if objstore_memory is None:
    # Compute a fraction of the system memory for the Plasma store to use.
    system_memory = psutil.virtual_memory().total
    if sys.platform == "linux" or sys.platform == "linux2":
      # On linux we use /dev/shm, its size is half the size of the physical
      # memory. To not overflow it, we set the plasma memory limit to 0.4
      # times the size of the physical memory.
      objstore_memory = int(system_memory * 0.4)
      # Compare the requested memory size to the memory available in
      # /dev/shm.
      shm_fd = os.open("/dev/shm", os.O_RDONLY)
      try:
        shm_fs_stats = os.fstatvfs(shm_fd)
        # The value shm_fs_stats.f_bsize is the block size and the value
        # shm_fs_stats.f_bavail is the number of available blocks.
        shm_avail = shm_fs_stats.f_bsize * shm_fs_stats.f_bavail
        if objstore_memory > shm_avail:
          print("Warning: Reducing object store memory because /dev/shm has "
                "only {} bytes available. You may be able to free up space "
                "by deleting files in /dev/shm. If you are inside a Docker "
                "container, you may need to pass an argument with the flag "
                "'--shm-size' to 'docker run'.".format(shm_avail))
          objstore_memory = int(shm_avail * 0.8)
      finally:
        os.close(shm_fd)
    else:
      objstore_memory = int(system_memory * 0.8)
  # Start the Plasma store.
  plasma_store_name, p1 = plasma.start_plasma_store(
      plasma_store_memory=objstore_memory,
      use_profiler=RUN_PLASMA_STORE_PROFILER,
      redirect_output=redirect_output)
  # Start the plasma manager.
  if object_manager_port is not None:
    plasma_manager_name, p2, plasma_manager_port = plasma.start_plasma_manager(
        plasma_store_name,
        redis_address,
        plasma_manager_port=object_manager_port,
        node_ip_address=node_ip_address,
        num_retries=1,
        run_profiler=RUN_PLASMA_MANAGER_PROFILER,
        redirect_output=redirect_output)
    assert plasma_manager_port == object_manager_port
  else:
    plasma_manager_name, p2, plasma_manager_port = plasma.start_plasma_manager(
        plasma_store_name,
        redis_address,
        node_ip_address=node_ip_address,
        run_profiler=RUN_PLASMA_MANAGER_PROFILER,
        redirect_output=redirect_output)
  if cleanup:
    all_processes[PROCESS_TYPE_PLASMA_STORE].append(p1)
    all_processes[PROCESS_TYPE_PLASMA_MANAGER].append(p2)

  return ObjectStoreAddress(plasma_store_name, plasma_manager_name,
                            plasma_manager_port)
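
# A sketch (not executed here) of how start_objstore composes with
# start_local_scheduler above, mirroring what start_ray_processes below does;
# all of the names come from the calls themselves:
#
#   store_info = start_objstore(node_ip_address, redis_address)
#   plasma_address = "{}:{}".format(node_ip_address, store_info.manager_port)
#   local_scheduler_name = start_local_scheduler(redis_address,
#                                                node_ip_address,
#                                                store_info.name,
#                                                store_info.manager_name,
#                                                worker_path,
#                                                plasma_address=plasma_address)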

def start_worker(node_ip_address, object_store_name, object_store_manager_name,
                 local_scheduler_name, redis_address, worker_path,
                 cleanup=True, redirect_output=False):
  """This method starts a worker process.

  Args:
    node_ip_address (str): The IP address of the node that this worker is
      running on.
    object_store_name (str): The name of the object store.
    object_store_manager_name (str): The name of the object store manager.
    local_scheduler_name (str): The name of the local scheduler.
    redis_address (str): The address that the Redis server is listening on.
    worker_path (str): The path of the source code which the worker process
      will run.
    cleanup (bool): True if using Ray in local mode. If cleanup is True, then
      this process will be killed by services.cleanup() when the Python
      process that imported services exits. This is True by default.
    redirect_output (bool): True if stdout and stderr should be redirected to
      /dev/null.
  """
  command = ["python",
             worker_path,
             "--node-ip-address=" + node_ip_address,
             "--object-store-name=" + object_store_name,
             "--object-store-manager-name=" + object_store_manager_name,
             "--local-scheduler-name=" + local_scheduler_name,
             "--redis-address=" + str(redis_address)]
  with open(os.devnull, "w") as FNULL:
    stdout = FNULL if redirect_output else None
    stderr = FNULL if redirect_output else None
    p = subprocess.Popen(command, stdout=stdout, stderr=stderr)
  if cleanup:
    all_processes[PROCESS_TYPE_WORKER].append(p)

def start_ray_processes(address_info=None,
                        node_ip_address="127.0.0.1",
                        num_workers=0,
                        num_local_schedulers=1,
                        worker_path=None,
                        cleanup=True,
                        redirect_output=False,
                        include_global_scheduler=False,
                        include_redis=False,
                        start_workers_from_local_scheduler=True,
                        num_cpus=None,
                        num_gpus=None):
  """Helper method to start Ray processes.

  Args:
    address_info (dict): A dictionary with address information for processes
      that have already been started. If provided, address_info will be
      modified to include processes that are newly started.
    node_ip_address (str): The IP address of this node.
    num_workers (int): The number of workers to start.
    num_local_schedulers (int): The total number of local schedulers
      required. This is also the total number of object stores required. This
      method will start new instances of local schedulers and object stores
      until there are num_local_schedulers existing instances of each,
      including ones already registered with the given address_info.
    worker_path (str): The path of the source code that will be run by the
      worker.
    cleanup (bool): If cleanup is True, then the processes started here will
      be killed by services.cleanup() when the Python process that called
      this method exits.
    redirect_output (bool): True if stdout and stderr should be redirected to
      /dev/null.
    include_global_scheduler (bool): If include_global_scheduler is True,
      then start a global scheduler process.
    include_redis (bool): If include_redis is True, then start a Redis server
      process.
    start_workers_from_local_scheduler (bool): If this flag is True, then
      start the initial workers from the local scheduler. Else, start them
      from Python.
    num_cpus: A list of length num_local_schedulers containing the number of
      CPUs each local scheduler should be configured with.
    num_gpus: A list of length num_local_schedulers containing the number of
      GPUs each local scheduler should be configured with.

  Returns:
    A dictionary of the address information for the processes that were
      started.
  """
  if not isinstance(num_cpus, list):
    num_cpus = num_local_schedulers * [num_cpus]
  if not isinstance(num_gpus, list):
    num_gpus = num_local_schedulers * [num_gpus]
  assert len(num_cpus) == num_local_schedulers
  assert len(num_gpus) == num_local_schedulers

  if address_info is None:
    address_info = {}
  address_info["node_ip_address"] = node_ip_address

  if worker_path is None:
    worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               "workers/default_worker.py")

  # Start Redis if there isn't already an instance running. TODO(rkn): We are
  # suppressing the output of Redis because on Linux it prints a bunch of
  # warning messages when it starts up. Instead of suppressing the output, we
  # should address the warnings.
  redis_address = address_info.get("redis_address")
  if include_redis:
    if redis_address is None:
      # Start a Redis server. The start_redis method will choose a random
      # port.
      redis_port = start_redis(cleanup=cleanup,
                               redirect_output=redirect_output)
      redis_address = address(node_ip_address, redis_port)
      address_info["redis_address"] = redis_address
      time.sleep(0.1)
    else:
      # A Redis address was provided, so start a Redis server with the given
      # port. TODO(rkn): We should check that the IP address corresponds to
      # the machine that this method is running on.
      redis_ip_address = get_ip_address(redis_address)
      redis_port = get_port(redis_address)
      new_redis_port = start_redis(port=int(redis_port),
                                   num_retries=1,
                                   cleanup=cleanup,
                                   redirect_output=redirect_output)
      assert redis_port == new_redis_port
  else:
    if redis_address is None:
      raise Exception("Redis address expected")

  # Start the global scheduler, if necessary.
  if include_global_scheduler:
    start_global_scheduler(redis_address, cleanup=cleanup,
                           redirect_output=redirect_output)

  # Initialize with existing services.
  if "object_store_addresses" not in address_info:
    address_info["object_store_addresses"] = []
  object_store_addresses = address_info["object_store_addresses"]
  if "local_scheduler_socket_names" not in address_info:
    address_info["local_scheduler_socket_names"] = []
  local_scheduler_socket_names = address_info["local_scheduler_socket_names"]

  # Get the ports to use for the object managers if any are provided.
  object_manager_ports = address_info.get("object_manager_ports")
  if not isinstance(object_manager_ports, list):
    object_manager_ports = num_local_schedulers * [object_manager_ports]
  assert len(object_manager_ports) == num_local_schedulers

  # Start any object stores that do not yet exist.
  for i in range(num_local_schedulers - len(object_store_addresses)):
    # Start Plasma.
    object_store_address = start_objstore(
        node_ip_address, redis_address,
        object_manager_port=object_manager_ports[i],
        cleanup=cleanup,
        redirect_output=redirect_output)
    object_store_addresses.append(object_store_address)
    time.sleep(0.1)

  # Determine how many workers to start for each local scheduler.
  num_workers_per_local_scheduler = [0] * num_local_schedulers
  for i in range(num_workers):
    num_workers_per_local_scheduler[i % num_local_schedulers] += 1
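  # For example, with num_workers=5 and num_local_schedulers=2, the workers
  # are assigned round robin, so the resulting split is [3, 2].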

  # Start any local schedulers that do not yet exist.
  for i in range(len(local_scheduler_socket_names), num_local_schedulers):
    # Connect the local scheduler to the object store at the same index.
    object_store_address = object_store_addresses[i]
    plasma_address = "{}:{}".format(node_ip_address,
                                    object_store_address.manager_port)
    # Determine how many workers this local scheduler should start.
    if start_workers_from_local_scheduler:
      num_local_scheduler_workers = num_workers_per_local_scheduler[i]
      num_workers_per_local_scheduler[i] = 0
    else:
      # If we're starting the workers from Python, the local scheduler should
      # not start any workers.
      num_local_scheduler_workers = 0
    # Start the local scheduler.
    local_scheduler_name = start_local_scheduler(
        redis_address,
        node_ip_address,
        object_store_address.name,
        object_store_address.manager_name,
        worker_path,
        plasma_address=plasma_address,
        cleanup=cleanup,
        redirect_output=redirect_output,
        static_resource_list=[num_cpus[i], num_gpus[i]],
        num_workers=num_local_scheduler_workers)
    local_scheduler_socket_names.append(local_scheduler_name)
    time.sleep(0.1)

  # Make sure that we have exactly num_local_schedulers instances of object
  # stores and local schedulers.
  assert len(object_store_addresses) == num_local_schedulers
  assert len(local_scheduler_socket_names) == num_local_schedulers

  # Start any workers that the local scheduler has not already started.
  for i, num_local_scheduler_workers in enumerate(
      num_workers_per_local_scheduler):
    object_store_address = object_store_addresses[i]
    local_scheduler_name = local_scheduler_socket_names[i]
    for j in range(num_local_scheduler_workers):
      start_worker(node_ip_address,
                   object_store_address.name,
                   object_store_address.manager_name,
                   local_scheduler_name,
                   redis_address,
                   worker_path,
                   cleanup=cleanup,
                   redirect_output=redirect_output)
      num_workers_per_local_scheduler[i] -= 1
  # Make sure that we've started all the workers.
  assert sum(num_workers_per_local_scheduler) == 0

  # Return the addresses of the relevant processes.
  return address_info

def start_ray_node(node_ip_address,
                   redis_address,
                   object_manager_ports=None,
                   num_workers=0,
                   num_local_schedulers=1,
                   worker_path=None,
                   cleanup=True,
                   redirect_output=False,
                   num_cpus=None,
                   num_gpus=None):
  """Start the Ray processes for a single node.

  This assumes that the Ray processes on some master node have already been
  started.

  Args:
    node_ip_address (str): The IP address of this node.
    redis_address (str): The address of the Redis server.
    object_manager_ports (list): A list of the ports to use for the object
      managers. There should be one per object manager being started on this
      node (typically just one).
    num_workers (int): The number of workers to start.
    num_local_schedulers (int): The number of local schedulers to start. This
      is also the number of plasma stores and plasma managers to start.
    worker_path (str): The path of the source code that will be run by the
      worker.
    cleanup (bool): If cleanup is True, then the processes started here will
      be killed by services.cleanup() when the Python process that called
      this method exits.
    redirect_output (bool): True if stdout and stderr should be redirected to
      /dev/null.
    num_cpus (int): The number of CPUs to configure the local scheduler with.
    num_gpus (int): The number of GPUs to configure the local scheduler with.

  Returns:
    A dictionary of the address information for the processes that were
      started.
  """
  address_info = {
      "redis_address": redis_address,
      "object_manager_ports": object_manager_ports,
  }
  return start_ray_processes(address_info=address_info,
                             node_ip_address=node_ip_address,
                             num_workers=num_workers,
                             num_local_schedulers=num_local_schedulers,
                             worker_path=worker_path,
                             cleanup=cleanup,
                             redirect_output=redirect_output,
                             num_cpus=num_cpus,
                             num_gpus=num_gpus)
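
# A sketch of bringing up a second machine in a cluster (the IP addresses are
# hypothetical, and start_ray_head below must already be running on the head
# node so that its Redis address is known):
#
#   start_ray_node(node_ip_address="192.168.1.3",
#                  redis_address="192.168.1.2:6379")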

def start_ray_head(address_info=None,
                   node_ip_address="127.0.0.1",
                   num_workers=0,
                   num_local_schedulers=1,
                   worker_path=None,
                   cleanup=True,
                   redirect_output=False,
                   start_workers_from_local_scheduler=True,
                   num_cpus=None,
                   num_gpus=None):
  """Start Ray in local mode.

  Args:
    address_info (dict): A dictionary with address information for processes
      that have already been started. If provided, address_info will be
      modified to include processes that are newly started.
    node_ip_address (str): The IP address of this node.
    num_workers (int): The number of workers to start.
    num_local_schedulers (int): The total number of local schedulers
      required. This is also the total number of object stores required. This
      method will start new instances of local schedulers and object stores
      until there are at least num_local_schedulers existing instances of
      each, including ones already registered with the given address_info.
    worker_path (str): The path of the source code that will be run by the
      worker.
    cleanup (bool): If cleanup is True, then the processes started here will
      be killed by services.cleanup() when the Python process that called
      this method exits.
    redirect_output (bool): True if stdout and stderr should be redirected to
      /dev/null.
    start_workers_from_local_scheduler (bool): If this flag is True, then
      start the initial workers from the local scheduler. Else, start them
      from Python.
    num_cpus (int): The number of CPUs to configure the local scheduler with.
    num_gpus (int): The number of GPUs to configure the local scheduler with.

  Returns:
    A dictionary of the address information for the processes that were
      started.
  """
  return start_ray_processes(
      address_info=address_info,
      node_ip_address=node_ip_address,
      num_workers=num_workers,
      num_local_schedulers=num_local_schedulers,
      worker_path=worker_path,
      cleanup=cleanup,
      redirect_output=redirect_output,
      include_global_scheduler=True,
      include_redis=True,
      start_workers_from_local_scheduler=start_workers_from_local_scheduler,
      num_cpus=num_cpus,
      num_gpus=num_gpus)
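
# A minimal local-mode sketch: start the head node processes with a few
# workers, then shut everything down (num_workers=4 is an arbitrary choice):
#
#   address_info = start_ray_head(num_workers=4)
#   # ... connect a driver to the addresses in address_info ...
#   cleanup()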