Dynamically grow worker pool to partially solve hanging workloads (#286)

* First pass at a policy to solve deadlock

* Address Robert's comments

* Stress test

* Unit test

* Fix test cases

* Fix test for python3

* Add more logging

* Whitespace.
Stephanie Wang 2017-02-17 17:08:52 -08:00 committed by Robert Nishihara
parent 0bbf08a4ac
commit a0dd3a44c0
11 changed files with 393 additions and 38 deletions
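For context before the file-by-file changes: the hang this commit addresses arises when every worker in a fixed-size pool blocks in ray.get on tasks that then have no worker left to run on. A minimal sketch of such a workload (hypothetical driver script; it mirrors the testBlockingTasks case added below):

import ray

ray.init(num_workers=1)

@ray.remote
def f(i, j):
    return (i, j)

@ray.remote
def g(i):
    # g occupies the only worker and then blocks on f's results. Without the
    # blocked-worker accounting added here, f has no worker to run on and g
    # never finishes; with it, the pool can grow to break the deadlock.
    return ray.get([f.remote(i, j) for j in range(10)])

ray.get([g.remote(i) for i in range(100)])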

View file

@@ -471,6 +471,7 @@ class Worker(object):
# their original index in the object_ids argument.
unready_ids = dict((object_id, i) for (i, (object_id, val)) in
enumerate(final_results) if val is None)
was_blocked = (len(unready_ids) > 0)
# Try reconstructing any objects we haven't gotten yet. Try to get them
# until GET_TIMEOUT_MILLISECONDS milliseconds passes, then repeat.
while len(unready_ids) > 0:
@@ -487,6 +488,11 @@
final_results[index] = (object_id, val)
unready_ids.pop(object_id)
# If there were objects that we weren't able to get locally, let the local
# scheduler know that we're now unblocked.
if was_blocked:
self.photon_client.notify_unblocked()
# Unwrap the object from the list (it was wrapped in put_object).
assert len(final_results) == len(object_ids)
for i in range(len(final_results)):
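Condensed, the control flow of the two hunks above looks like the following sketch (not the actual method: retrieve is a hypothetical stand-in for the plasma fetch-and-get logic elided between the hunks; reconstruct_object and notify_unblocked are from the surrounding file, and the timeout value is assumed):

GET_TIMEOUT_MILLISECONDS = 1000  # value assumed for the sketch

def get_objects(worker, object_ids, retrieve):
    # Non-blocking first attempt; whatever is still missing marks us blocked.
    final_results = retrieve(object_ids, timeout_ms=0)
    unready_ids = {oid for oid, val in final_results.items() if val is None}
    was_blocked = len(unready_ids) > 0
    while len(unready_ids) > 0:
        # Ask the local scheduler to reconstruct anything still missing. The
        # scheduler also treats this message as the signal that we're blocked.
        for oid in unready_ids:
            worker.photon_client.reconstruct_object(oid)
        # Retry with a bounded timeout so reconstruction can be re-requested.
        retried = retrieve(list(unready_ids), timeout_ms=GET_TIMEOUT_MILLISECONDS)
        for oid, val in retried.items():
            if val is not None:
                final_results[oid] = val
                unready_ids.discard(oid)
    if was_blocked:
        # The scheduler returned our resources while we were blocked; tell it
        # we're runnable again so it can lease them back.
        worker.photon_client.notify_unblocked()
    return final_results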

View file

@@ -29,7 +29,10 @@ enum photon_message_type {
EVENT_LOG_MESSAGE,
/** Send an initial connection message to the local scheduler.
* This contains the worker's process ID and actor ID. */
REGISTER_WORKER_INFO
REGISTER_WORKER_INFO,
/** For a worker that was blocked on one or more objects, tell the local
* scheduler that the worker is now unblocked. */
NOTIFY_UNBLOCKED,
};
/* These are needed to define the UT_arrays. */
@@ -112,6 +115,9 @@ typedef struct {
* no task is running on the worker, this will be NULL. This is used to
* update the task table. */
task *task_in_progress;
/** A flag to indicate whether this worker is currently blocked on an
* object (or objects) that isn't available locally yet. */
bool is_blocked;
/** The process ID of the client. If this is set to zero, the client has not
* yet registered a process ID. */
pid_t pid;

View file

@@ -83,9 +83,19 @@ struct scheduling_algorithm_state {
* about a new local scheduler arrives, we will resubmit all of these tasks
* locally. */
UT_array *cached_submitted_actor_tasks;
/** An array of worker indices corresponding to clients that are
* waiting for tasks. */
/** An array of pointers to workers in the worker pool. These are workers
* that have registered a PID with us and that are now waiting to be
* assigned a task to execute. */
UT_array *available_workers;
/** An array of pointers to workers that are currently executing a task and
* are not blocked. These are the workers that are leasing some number of
* resources. */
UT_array *executing_workers;
/** An array of pointers to workers that are currently executing a task but
* are blocked on one or more objects that aren't available locally yet.
* These are the workers that are executing a task, but that have temporarily
* returned the task's required resources. */
UT_array *blocked_workers;
/** A hash map of the objects that are available in the local Plasma store.
* The key is the object ID. This information could be a little stale. */
object_entry *local_objects;
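Taken together, the three arrays move each registered worker through a small state machine. A minimal model of the transitions (illustrative Python; the C code stores raw worker pointers in the UT_arrays above):

# Worker states and the events that move a worker between them.
TRANSITIONS = {
    ("available", "assign_task"): "executing",       # dispatch_tasks
    ("executing", "RECONSTRUCT_OBJECT"): "blocked",  # handle_worker_blocked
    ("blocked", "NOTIFY_UNBLOCKED"): "executing",    # handle_worker_unblocked
    ("executing", "task_done"): "available",         # handle_worker_available
}

def step(state, event):
    return TRANSITIONS[(state, event)]

assert step("executing", "RECONSTRUCT_OBJECT") == "blocked"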
@@ -107,9 +117,13 @@ scheduling_algorithm_state *make_scheduling_algorithm_state(void) {
/* Initialize the local data structures used for queuing tasks and workers. */
algorithm_state->waiting_task_queue = NULL;
algorithm_state->dispatch_task_queue = NULL;
utarray_new(algorithm_state->available_workers, &worker_icd);
utarray_new(algorithm_state->cached_submitted_actor_tasks, &task_spec_icd);
algorithm_state->local_actor_infos = NULL;
utarray_new(algorithm_state->available_workers, &worker_icd);
utarray_new(algorithm_state->executing_workers, &worker_icd);
utarray_new(algorithm_state->blocked_workers, &worker_icd);
return algorithm_state;
}
@@ -146,6 +160,8 @@ void free_scheduling_algorithm_state(
utarray_free(algorithm_state->cached_submitted_actor_tasks);
/* Free the list of available workers. */
utarray_free(algorithm_state->available_workers);
utarray_free(algorithm_state->executing_workers);
utarray_free(algorithm_state->blocked_workers);
/* Free the cached information about which objects are present locally. */
object_entry *obj_entry, *tmp_obj_entry;
HASH_ITER(hh, algorithm_state->local_objects, obj_entry, tmp_obj_entry) {
@@ -556,10 +572,6 @@ void dispatch_tasks(local_scheduler_state *state,
/* Assign as many tasks as we can, while there are workers available. */
DL_FOREACH_SAFE(algorithm_state->dispatch_task_queue, elt, tmp) {
if (utarray_len(algorithm_state->available_workers) <= 0) {
/* There are no more available workers, so we're done. */
break;
}
/* TODO(atumanov): as an optimization, we can also check if all dynamic
* capacity is zero and bail early. */
bool task_satisfied = true;
@@ -574,6 +586,19 @@
if (!task_satisfied) {
continue; /* Proceed to the next task. */
}
/* If there is a task to assign, but there are no more available workers in
* the worker pool, then exit. Before exiting, make sure that a worker will
* become available during a future invocation of dispatch_tasks. */
if (utarray_len(algorithm_state->available_workers) == 0) {
if (utarray_len(state->child_pids) == 0) {
/* If there are no workers, including those pending PID registration,
* then we must start a new one to replenish the worker pool. */
start_worker(state, NIL_ACTOR_ID);
}
break;
}
/* Dispatch this task to an available worker and dequeue the task. */
LOG_DEBUG("Dispatching task");
/* Get the last available worker in the available worker queue. */
@@ -581,10 +606,12 @@
algorithm_state->available_workers);
/* Tell the available worker to execute the task. */
assign_task_to_worker(state, elt->spec, *worker);
/* Remove the available worker from the queue and free the struct. */
/* Remove the worker from the available queue, and add it to the executing
* workers. */
utarray_pop_back(algorithm_state->available_workers);
utarray_push_back(algorithm_state->executing_workers, worker);
/* Dequeue the task and free the struct. */
print_resource_info(state, elt->spec);
/* Dequeue the task. */
DL_DELETE(algorithm_state->dispatch_task_queue, elt);
free_task_spec(elt->spec);
free(elt);
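The check above is the growth policy itself: the pool only grows when there is a dispatchable, resource-satisfied task, no available worker, and no forked worker still waiting to register its PID. As a standalone predicate (sketch; names mirror the C code):

def should_start_worker(available_workers, child_pids):
    # Fork a replacement only when the pool is truly empty: no idle workers
    # and none already forked but not yet registered. This bounds growth to
    # at most one pending worker per dispatch pass.
    return len(available_workers) == 0 and len(child_pids) == 0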
@@ -923,12 +950,37 @@ void handle_worker_available(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
local_scheduler_client *worker) {
CHECK(worker->task_in_progress == NULL);
/* Check that the worker isn't in the pool of available workers. */
for (local_scheduler_client **p = (local_scheduler_client **) utarray_front(
algorithm_state->available_workers);
p != NULL; p = (local_scheduler_client **) utarray_next(
algorithm_state->available_workers, p)) {
DCHECK(*p != worker);
}
/* Check that the worker isn't in the list of blocked workers. */
for (local_scheduler_client **p = (local_scheduler_client **) utarray_front(
algorithm_state->blocked_workers);
p != NULL; p = (local_scheduler_client **) utarray_next(
algorithm_state->blocked_workers, p)) {
DCHECK(*p != worker);
}
/* If the worker was executing a task, it must have finished, so remove it
* from the list of executing workers. */
for (int i = 0; i < utarray_len(algorithm_state->executing_workers); ++i) {
local_scheduler_client **p = (local_scheduler_client **) utarray_eltptr(
algorithm_state->executing_workers, i);
if (*p == worker) {
utarray_erase(algorithm_state->executing_workers, i, 1);
break;
}
}
/* Check that we actually erased the worker. */
for (int i = 0; i < utarray_len(algorithm_state->executing_workers); ++i) {
local_scheduler_client **p = (local_scheduler_client **) utarray_eltptr(
algorithm_state->executing_workers, i);
DCHECK(*p != worker);
}
/* Add worker to the list of available workers. */
utarray_push_back(algorithm_state->available_workers, &worker);
@@ -954,6 +1006,88 @@ void handle_actor_worker_available(local_scheduler_state *state,
dispatch_actor_task(state, algorithm_state, actor_id);
}
void handle_worker_blocked(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
local_scheduler_client *worker) {
/* Find the worker in the list of executing workers. */
for (int i = 0; i < utarray_len(algorithm_state->executing_workers); ++i) {
local_scheduler_client **p = (local_scheduler_client **) utarray_eltptr(
algorithm_state->executing_workers, i);
if (*p == worker) {
/* Remove the worker from the list of executing workers. */
utarray_erase(algorithm_state->executing_workers, i, 1);
/* Check that the worker isn't in the list of blocked workers. */
for (local_scheduler_client **q =
(local_scheduler_client **) utarray_front(
algorithm_state->blocked_workers);
q != NULL; q = (local_scheduler_client **) utarray_next(
algorithm_state->blocked_workers, q)) {
DCHECK(*q != worker);
}
/* Return the resources that the blocked worker was using. */
CHECK(worker->task_in_progress != NULL);
task_spec *spec = task_task_spec(worker->task_in_progress);
update_dynamic_resources(state, spec, true);
/* Add the worker to the list of blocked workers. */
worker->is_blocked = true;
utarray_push_back(algorithm_state->blocked_workers, &worker);
return;
}
}
/* The worker should have been in the list of executing workers, so this line
* should be unreachable. */
LOG_FATAL(
"Worker registered as blocked, but it was not in the list of executing "
"workers.");
}
void handle_worker_unblocked(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
local_scheduler_client *worker) {
/* Find the worker in the list of blocked workers. */
for (int i = 0; i < utarray_len(algorithm_state->blocked_workers); ++i) {
local_scheduler_client **p = (local_scheduler_client **) utarray_eltptr(
algorithm_state->blocked_workers, i);
if (*p == worker) {
/* Remove the worker from the list of blocked workers. */
utarray_erase(algorithm_state->blocked_workers, i, 1);
/* Check that the worker isn't in the list of executing workers. */
for (local_scheduler_client **q =
(local_scheduler_client **) utarray_front(
algorithm_state->executing_workers);
q != NULL; q = (local_scheduler_client **) utarray_next(
algorithm_state->executing_workers, q)) {
DCHECK(*q != worker);
}
/* Lease back the resources that the blocked worker will need. */
/* TODO(swang): Leasing back the resources to blocked workers can cause
* us to transiently exceed the maximum number of resources. This can be
* fixed by having blocked workers explicitly yield and wait to be given
* back resources before continuing execution. */
CHECK(worker->task_in_progress != NULL);
task_spec *spec = task_task_spec(worker->task_in_progress);
update_dynamic_resources(state, spec, false);
/* Add the worker to the list of executing workers. */
worker->is_blocked = false;
utarray_push_back(algorithm_state->executing_workers, &worker);
return;
}
}
/* The worker should have been in the list of blocked workers, so this line
* should be unreachable. */
LOG_FATAL(
"Worker registered as unblocked, but it was not in the list of blocked "
"workers.");
}
void handle_object_available(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
object_id object_id) {
@@ -1064,3 +1198,11 @@ int num_dispatch_tasks(scheduling_algorithm_state *algorithm_state) {
DL_COUNT(algorithm_state->dispatch_task_queue, elt, count);
return count;
}
void print_worker_info(const char *message,
scheduling_algorithm_state *algorithm_state) {
LOG_DEBUG("%s: %d available, %d executing, %d blocked", message,
utarray_len(algorithm_state->available_workers),
utarray_len(algorithm_state->executing_workers),
utarray_len(algorithm_state->blocked_workers));
}
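As called from process_message below, print_worker_info("Reconstructing", algorithm_state) produces a debug line of the form (counts illustrative):

Reconstructing: 1 available, 2 executing, 1 blocked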

View file

@@ -139,7 +139,7 @@ void handle_object_available(local_scheduler_state *state,
void handle_object_removed(local_scheduler_state *state, object_id object_id);
/**
* This function is called when a new worker becomes available
* This function is called when a new worker becomes available.
*
* @param state The state of the local scheduler.
* @param algorithm_state State maintained by the scheduling algorithm.
@@ -189,6 +189,32 @@ void handle_actor_worker_disconnect(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
actor_id actor_id);
/**
* This function is called when a worker that was executing a task becomes
* blocked on an object that isn't available locally yet.
*
* @param state The state of the local scheduler.
* @param algorithm_state State maintained by the scheduling algorithm.
* @param worker The worker that is blocked.
* @return Void.
*/
void handle_worker_blocked(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
local_scheduler_client *worker);
/**
* This function is called when a worker that was blocked on an object that
* wasn't available locally yet becomes unblocked.
*
* @param state The state of the local scheduler.
* @param algorithm_state State maintained by the scheduling algorithm.
* @param worker The worker that is now unblocked.
* @return Void.
*/
void handle_worker_unblocked(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
local_scheduler_client *worker);
/**
* This function fetches queued tasks' missing object dependencies. It is
* called every LOCAL_SCHEDULER_FETCH_TIMEOUT_MILLISECONDS.
@@ -201,6 +227,17 @@ void handle_actor_worker_disconnect(local_scheduler_state *state,
*/
int fetch_object_timeout_handler(event_loop *loop, timer_id id, void *context);
/**
* A helper function to print debug information about the current state and
* number of workers.
*
* @param message A prefix to identify the log message.
* @param algorithm_state State maintained by the scheduling algorithm.
* @return Void.
*/
void print_worker_info(const char *message,
scheduling_algorithm_state *algorithm_state);
/** The following methods are for testing purposes only. */
#ifdef PHOTON_TEST
/**

View file

@@ -76,3 +76,7 @@ void photon_reconstruct_object(photon_conn *conn, object_id object_id) {
void photon_log_message(photon_conn *conn) {
write_message(conn->conn, LOG_MESSAGE, 0, NULL);
}
void photon_notify_unblocked(photon_conn *conn) {
write_message(conn->conn, NOTIFY_UNBLOCKED, 0, NULL);
}

View file

@@ -93,4 +93,12 @@ void photon_reconstruct_object(photon_conn *conn, object_id object_id);
*/
void photon_log_message(photon_conn *conn);
/**
* Notify the local scheduler that this client (worker) is no longer blocked.
*
* @param conn The connection information.
* @return Void.
*/
void photon_notify_unblocked(photon_conn *conn);
#endif

View file

@@ -80,6 +80,11 @@ static PyObject *PyPhotonClient_log_event(PyObject *self, PyObject *args) {
Py_RETURN_NONE;
}
static PyObject *PyPhotonClient_notify_unblocked(PyObject *self) {
photon_notify_unblocked(((PyPhotonClient *) self)->photon_connection);
Py_RETURN_NONE;
}
static PyMethodDef PyPhotonClient_methods[] = {
{"submit", (PyCFunction) PyPhotonClient_submit, METH_VARARGS,
"Submit a task to the local scheduler."},
@@ -89,6 +94,8 @@ static PyMethodDef PyPhotonClient_methods[] = {
METH_VARARGS, "Ask the local scheduler to reconstruct an object."},
{"log_event", (PyCFunction) PyPhotonClient_log_event, METH_VARARGS,
"Log an event to the event log through the local scheduler."},
{"notify_unblocked", (PyCFunction) PyPhotonClient_notify_unblocked,
METH_NOARGS, "Notify the local scheduler that we are unblocked."},
{NULL} /* Sentinel */
};

View file

@@ -68,6 +68,9 @@ void print_resource_info(const local_scheduler_state *state,
* @return Void.
*/
void kill_worker(local_scheduler_client *worker, bool wait) {
/* TODO(swang): This method should also propagate changes to other parts of
* the system to reflect the killed task in progress, if there was one. This
* includes updating dynamic resources and updating the task table. */
/* Erase the worker from the array of workers. */
local_scheduler_state *state = worker->local_scheduler_state;
int num_workers = utarray_len(state->workers);
@@ -96,6 +99,7 @@ void kill_worker(local_scheduler_client *worker, bool wait) {
/* Wait for the process to exit. */
waitpid(worker->pid, NULL, 0);
}
LOG_INFO("Killed worker with pid %d", worker->pid);
}
/* Clean up the client socket after killing the worker so that the worker
@@ -187,12 +191,15 @@ void free_local_scheduler(local_scheduler_state *state) {
*/
void start_worker(local_scheduler_state *state, actor_id actor_id) {
/* We can't start a worker if we don't have the path to the worker script. */
CHECK(state->config.start_worker_command != NULL);
if (state->config.start_worker_command == NULL) {
LOG_WARN("No valid command to start worker provided. Cannot start worker.");
return;
}
/* Launch the process to create the worker. */
pid_t pid = fork();
if (pid != 0) {
utarray_push_back(state->child_pids, &pid);
LOG_DEBUG("Started worker with pid %d", pid);
LOG_INFO("Started worker with pid %d", pid);
return;
}
@@ -279,6 +286,11 @@ local_scheduler_state *init_local_scheduler(
} else {
state->config.start_worker_command = NULL;
}
if (start_worker_command == NULL) {
LOG_WARN(
"No valid command to start a worker provided, local scheduler will not "
"start any workers.");
}
state->config.global_scheduler_exists = global_scheduler_exists;
state->loop = loop;
@@ -343,6 +355,29 @@
return state;
}
void update_dynamic_resources(local_scheduler_state *state,
task_spec *spec,
bool return_resources) {
for (int i = 0; i < MAX_RESOURCE_INDEX; ++i) {
double resource = task_spec_get_required_resource(spec, i);
if (!return_resources) {
/* If we are not returning resources, we are leasing them, so we want to
* subtract the resource quantities from our accounting. */
resource *= -1;
}
/* Add or subtract the task's resources from our count. */
state->dynamic_resources[i] += resource;
if (!return_resources && state->dynamic_resources[i] < 0) {
/* We are using more resources than we have been allocated. */
LOG_WARN("photon dynamic resources dropped to %8.4f\t%8.4f\n",
state->dynamic_resources[0], state->dynamic_resources[1]);
}
CHECK(state->dynamic_resources[i] <= state->static_resources[i]);
}
print_resource_info(state, spec);
}
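A worked example of the lease/return cycle this function implements, assuming a single resource type with static_resources = [1.0] and a task that requires 1 CPU (numbers illustrative):

dynamic = [1.0]    # all of the statically configured CPU is free
# assign_task_to_worker -> update_dynamic_resources(state, spec, false):
dynamic[0] -= 1.0  # the worker leases the CPU; dynamic == [0.0]
# The worker blocks (RECONSTRUCT_OBJECT) -> update_dynamic_resources(state, spec, true):
dynamic[0] += 1.0  # resources returned; dynamic == [1.0], so another task can
                   # be dispatched while the first worker waits
# NOTIFY_UNBLOCKED -> update_dynamic_resources(state, spec, false):
dynamic[0] -= 1.0  # the CPU is leased back; if another task claimed it in the
                   # meantime, this transiently exceeds capacity (the TODO in
                   # handle_worker_unblocked)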
void assign_task_to_worker(local_scheduler_state *state,
task_spec *spec,
local_scheduler_client *worker) {
@@ -362,13 +397,7 @@ void assign_task_to_worker(local_scheduler_state *state,
/* Resource accounting:
* Update dynamic resource vector in the local scheduler state. */
for (int i = 0; i < MAX_RESOURCE_INDEX; i++) {
state->dynamic_resources[i] -= task_spec_get_required_resource(spec, i);
CHECKM(state->dynamic_resources[i] >= 0,
"photon dynamic resources dropped to %8.4f\t%8.4f\n",
state->dynamic_resources[0], state->dynamic_resources[1]);
}
print_resource_info(state, spec);
update_dynamic_resources(state, spec, false);
task *task = alloc_task(spec, TASK_STATUS_RUNNING,
state->db ? get_db_client_id(state->db) : NIL_ID);
/* Record which task this worker is executing. This will be freed in
@@ -580,12 +609,7 @@ void process_message(event_loop *loop,
if (worker->task_in_progress != NULL) {
task_spec *spec = task_task_spec(worker->task_in_progress);
/* Return dynamic resources back for the task in progress. */
for (int i = 0; i < MAX_RESOURCE_INDEX; i++) {
state->dynamic_resources[i] += task_spec_get_required_resource(spec, i);
/* Sanity-check resource vector boundary conditions. */
CHECK(state->dynamic_resources[i] <= state->static_resources[i]);
}
print_resource_info(state, spec);
update_dynamic_resources(state, spec, true);
/* If we're connected to Redis, update tables. */
if (state->db != NULL) {
/* Update control state tables. */
@@ -608,6 +632,16 @@
}
} break;
case RECONSTRUCT_OBJECT: {
if (worker->task_in_progress != NULL && !worker->is_blocked) {
/* TODO(swang): For now, we don't handle blocked actors. */
if (actor_ids_equal(worker->actor_id, NIL_ACTOR_ID)) {
/* If the worker was executing a task (i.e. non-driver) and it wasn't
* already blocked on an object that's not locally available, update its
* state to blocked. */
handle_worker_blocked(state, state->algorithm_state, worker);
print_worker_info("Reconstructing", state->algorithm_state);
}
}
object_id *obj_id = (object_id *) utarray_front(state->input_buffer);
reconstruct_object(state, *obj_id);
} break;
@@ -622,6 +656,18 @@
} break;
case LOG_MESSAGE: {
} break;
case NOTIFY_UNBLOCKED: {
if (worker->task_in_progress != NULL) {
/* TODO(swang): For now, we don't handle blocked actors. */
if (actor_ids_equal(worker->actor_id, NIL_ACTOR_ID)) {
/* If the worker was executing a task (i.e. non-driver), update its
* state to not blocked. */
CHECK(worker->is_blocked);
handle_worker_unblocked(state, state->algorithm_state, worker);
}
}
print_worker_info("Worker unblocked", state->algorithm_state);
} break;
default:
/* This code should be unreachable. */
CHECK(0);
@@ -639,6 +685,7 @@ void new_client_connection(event_loop *loop,
local_scheduler_client *worker = malloc(sizeof(local_scheduler_client));
worker->sock = new_socket;
worker->task_in_progress = NULL;
worker->is_blocked = false;
worker->pid = 0;
worker->is_child = false;
worker->actor_id = NIL_ACTOR_ID;

View file

@@ -67,6 +67,44 @@ void reconstruct_object(local_scheduler_state *state, object_id object_id);
void print_resource_info(const local_scheduler_state *s, const task_spec *spec);
/**
* Kill a worker.
*
* @param worker The local scheduler client to kill.
* @param wait A boolean representing whether to wait for the killed worker to
* exit.
* @return Void.
*/
void kill_worker(local_scheduler_client *worker, bool wait);
/**
* Start a worker. This forks a new worker process that can be added to the
* pool of available workers, pending registration of its PID with the local
* scheduler.
*
* @param state The local scheduler state.
* @param actor_id The ID of the actor for this worker. If this worker is not an
* actor, then NIL_ACTOR_ID should be used.
* @param Void.
*/
void start_worker(local_scheduler_state *state, actor_id actor_id);
/**
* Update our accounting for the current resources being used, according to
* some task that is starting or finishing execution.
*
* @param state The local scheduler state.
* @param spec The specification for the task that is or was using resources.
* @param return_resources A boolean representing whether the task is starting
* or finishing execution. If true, then the task is finishing execution
* (possibly temporarily), so it will add to the dynamic resources
* available. Else, it will take from the dynamic resources available.
* @return Void.
*/
void update_dynamic_resources(local_scheduler_state *state,
task_spec *spec,
bool return_resources);
/** The following methods are for testing purposes only. */
#ifdef PHOTON_TEST
local_scheduler_state *init_local_scheduler(
@@ -92,17 +130,6 @@ void process_message(event_loop *loop,
void *context,
int events);
void kill_worker(local_scheduler_client *worker, bool wait);
/**
* Start a new worker by forking.
*
* @param state The local scheduler state.
* @param actor_id The ID of the actor for this worker. If this worker is not an
* actor, then NIL_ACTOR_ID should be used.
* @return Void.
*/
void start_worker(local_scheduler_state *state, actor_id actor_id);
#endif
#endif /* PHOTON_SCHEDULER_H */

View file

@@ -367,7 +367,7 @@ class APITest(unittest.TestCase):
ray.worker.cleanup()
def testWait(self):
ray.init(num_workers=1)
ray.init(num_workers=1, num_cpus=1)
@ray.remote
def f(delay):
@@ -1115,6 +1115,55 @@ class ResourcesTest(unittest.TestCase):
ray.worker.cleanup()
class WorkerPoolTests(unittest.TestCase):
def tearDown(self):
ray.worker.cleanup()
def testNoWorkers(self):
ray.init(num_workers=0)
@ray.remote
def f():
return 1
# Make sure we can call a remote function. This will require starting a new
# worker.
ray.get(f.remote())
ray.get([f.remote() for _ in range(100)])
def testBlockingTasks(self):
ray.init(num_workers=1)
@ray.remote
def f(i, j):
return (i, j)
@ray.remote
def g(i):
# Each instance of g submits and blocks on the result of another remote
# task.
object_ids = [f.remote(i, j) for j in range(10)]
return ray.get(object_ids)
ray.get([g.remote(i) for i in range(100)])
@ray.remote
def _sleep(i):
time.sleep(1)
return i
@ray.remote
def sleep():
# Each instance of sleep submits and blocks on the result of another
# remote task, which takes one second to execute.
ray.get([_sleep.remote(i) for i in range(10)])
ray.get(sleep.remote())
ray.worker.cleanup()
class SchedulingAlgorithm(unittest.TestCase):
def attempt_to_load_balance(self, remote_function, args, total_tasks,

View file

@@ -300,5 +300,27 @@ class ReconstructionTestsMultinode(ReconstructionTests):
# one worker each.
num_local_schedulers = 4
# NOTE(swang): This test tries to launch 1000 workers and breaks.
#class WorkerPoolTests(unittest.TestCase):
#
# def tearDown(self):
# ray.worker.cleanup()
#
# def testBlockingTasks(self):
# @ray.remote
# def f(i, j):
# return (i, j)
#
# @ray.remote
# def g(i):
# # Each instance of g submits and blocks on the result of another remote
# # task.
# object_ids = [f.remote(i, j) for j in range(10)]
# return ray.get(object_ids)
#
# ray.init(num_workers=1)
# ray.get([g.remote(i) for i in range(1000)])
# ray.worker.cleanup()
if __name__ == "__main__":
unittest.main(verbosity=2)