Availability after worker failure (#316)
* Availability after a killed worker
* Workers exit cleanly
* Memory cleanup in photon C tests
* Worker failure in multinode
* Consolidate worker cleanup handlers
* Update the result table before handling a task submission
* KILL_WORKER_TIMEOUT -> KILL_WORKER_TIMEOUT_MILLISECONDS
* Log a warning instead of crashing if no result table entry found
parent 232601f90d
commit be1618f041
14 changed files with 307 additions and 118 deletions
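
Before the diff, a short sketch of the semantics this commit introduces: RAY.TASK_TABLE_TEST_AND_UPDATE now treats its test argument as a bitmask over the scheduling_state values, and the update only happens when the task's current state AND-ed with that bitmask is nonzero. The Python below is an illustrative model of that check, not Ray code; the state constants are the ones defined in the diff.

# Minimal model of the bitmask test-and-update added in this commit.
# The constants mirror the scheduling_state values in the diff; the helper
# is for illustration only and does not talk to Redis.
TASK_STATUS_WAITING = 1
TASK_STATUS_SCHEDULED = 2
TASK_STATUS_QUEUED = 4
TASK_STATUS_RUNNING = 8
TASK_STATUS_DONE = 16
TASK_STATUS_LOST = 32
TASK_STATUS_RECONSTRUCTING = 64


def test_and_update(current_state, test_state_bitmask, update_state):
    # The update happens if and only if the current scheduling state AND-ed
    # with the bitmask is nonzero; otherwise the caller sees nil.
    if current_state & test_state_bitmask == 0:
        return None
    return update_state


# A local scheduler reconstructing an evicted object claims the task only if
# the task is DONE or LOST.
assert test_and_update(TASK_STATUS_DONE,
                       TASK_STATUS_DONE | TASK_STATUS_LOST,
                       TASK_STATUS_RECONSTRUCTING) == TASK_STATUS_RECONSTRUCTING

# If another scheduler already marked the task RECONSTRUCTING, the bitmask
# does not match and the update is suppressed.
assert test_and_update(TASK_STATUS_RECONSTRUCTING,
                       TASK_STATUS_DONE | TASK_STATUS_LOST,
                       TASK_STATUS_RECONSTRUCTING) is None
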
@@ -216,14 +216,18 @@ class TestGlobalStateStore(unittest.TestCase):
 "node_id")
 def testTaskTableAddAndLookup(self):
+TASK_STATUS_WAITING = 1
+TASK_STATUS_SCHEDULED = 2
+TASK_STATUS_QUEUED = 4
+
 # Check that task table adds, updates, and lookups work correctly.
-task_args = [1, b"node_id", b"task_spec"]
+task_args = [TASK_STATUS_WAITING, b"node_id", b"task_spec"]
 response = self.redis.execute_command("RAY.TASK_TABLE_ADD", "task_id",
 *task_args)
 response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
 self.assertEqual(response, task_args)

-task_args[0] = 2
+task_args[0] = TASK_STATUS_SCHEDULED
 self.redis.execute_command("RAY.TASK_TABLE_UPDATE", "task_id", *task_args[:2])
 response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
 self.assertEqual(response, task_args)

@@ -241,7 +245,7 @@ class TestGlobalStateStore(unittest.TestCase):

 # If the current value is the same as the test value, and the set value is
 # different, the update happens, and the response is the entire task.
-task_args[1] += 1
+task_args[1] = TASK_STATUS_QUEUED
 response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
 "task_id",
 *task_args[:3])

@@ -252,7 +256,7 @@ class TestGlobalStateStore(unittest.TestCase):

 # If the current value is no longer the same as the test value, the
 # response is nil.
-task_args[1] += 1
+task_args[1] = TASK_STATUS_WAITING
 response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
 "task_id",
 *task_args[:3])

@@ -262,6 +266,27 @@ class TestGlobalStateStore(unittest.TestCase):
 self.assertEqual(get_response2, get_response)
 self.assertNotEqual(get_response2, task_args[1:])

+# If the test value is a bitmask that matches the current value, the update
+# happens.
+task_args[0] = TASK_STATUS_SCHEDULED | TASK_STATUS_QUEUED
+response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
+"task_id",
+*task_args[:3])
+self.assertEqual(response, task_args[1:])
+
+# If the test value is a bitmask that does not match the current value, the
+# update does not happen.
+task_args[1] = TASK_STATUS_SCHEDULED
+old_response = response
+response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
+"task_id",
+*task_args[:3])
+self.assertEqual(response, None)
+# Check that the update did not happen.
+get_response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
+self.assertEqual(get_response, old_response)
+self.assertNotEqual(get_response, task_args[1:])
+
 def testTaskTableSubscribe(self):
 scheduling_state = 1
 node_id = "node_id"

@@ -14,6 +14,7 @@ import numpy as np
 import os
 import random
 import redis
+import signal
 import string
 import sys
 import threading

@@ -936,23 +937,31 @@ def init(redis_address=None, node_ip_address=None, object_id_seed=None,
 num_gpus=num_gpus)

 def cleanup(worker=global_worker):
-"""Disconnect the driver, and terminate any processes started in init.
+"""Disconnect the worker, and terminate any processes started in init.

 This will automatically run at the end when a Python process that uses Ray
 exits. It is ok to run this twice in a row. Note that we manually call
 services.cleanup() in the tests because we need to start and stop many
 clusters in the tests, but the import and exit only happen once.
 """
-# If this is a driver, push the finish time to Redis.
-if worker.mode in [SCRIPT_MODE, SILENT_MODE]:
-worker.redis_client.hmset(b"Drivers:" + worker.worker_id,
-{"end_time": time.time()})

 disconnect(worker)
-worker.set_mode(None)
 if hasattr(worker, "photon_client"):
 del worker.photon_client
 if hasattr(worker, "plasma_client"):
 worker.plasma_client.shutdown()
+
+if worker.mode in [SCRIPT_MODE, SILENT_MODE]:
+# If this is a driver, push the finish time to Redis and clean up any
+# other services that were started with the driver.
+worker.redis_client.hmset(b"Drivers:" + worker.worker_id,
+{"end_time": time.time()})
+services.cleanup()
+else:
+# If this is not a driver, make sure there are no orphan processes.
+for process_type, processes in services.all_processes.items():
+assert(len(processes) == 0)
+
+worker.set_mode(None)

 atexit.register(cleanup)

@@ -1559,6 +1568,12 @@ def main_loop(worker=global_worker):
 that occurred while executing the command, and waits for the next command.
 """

+def exit(signum, frame):
+cleanup(worker=worker)
+sys.exit(0)
+
+signal.signal(signal.SIGTERM, exit)
+
 def process_task(task): # wrapping these lines in a function should cause the local variables to go out of scope more quickly, which is useful for inspecting reference counts
 """Execute a task assigned to this worker.

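
The worker-side hunks above register cleanup() with atexit and install a SIGTERM handler so a worker killed by the local scheduler exits cleanly. Below is a standalone sketch of that shutdown pattern, with a placeholder cleanup body rather than Ray's; it only illustrates the flow, not the actual module.

# Sketch of the worker shutdown path: cleanup runs both at normal interpreter
# exit and when the process receives SIGTERM. As in the worker code above,
# cleanup must be safe to run twice, because sys.exit() triggers the atexit
# handler again.
import atexit
import signal
import sys


def cleanup():
    # Placeholder for disconnecting clients and releasing resources.
    print("cleaning up worker state")


atexit.register(cleanup)


def exit_handler(signum, frame):
    cleanup()
    sys.exit(0)


signal.signal(signal.SIGTERM, exit_handler)
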
@@ -844,17 +844,18 @@ int TaskTableUpdate_RedisCommand(RedisModuleCtx *ctx,

 /**
 * Test and update an entry in the task table if the current value matches the
-* test value. This does not update the task specification in the table.
+* test value bitmask. This does not update the task specification in the
+* table.
 *
 * This is called from a client with the command:
 *
-*     RAY.TASK_TABLE_TEST_AND_UPDATE <task ID> <test state> <state>
+*     RAY.TASK_TABLE_TEST_AND_UPDATE <task ID> <test state bitmask> <state>
 *         <local scheduler ID>
 *
 * @param task_id A string that is the ID of the task.
-* @param test_state A string that is the test value for the scheduling state.
-*        The update happens if and only if the current scheduling state
-*        matches this value.
+* @param test_state_bitmask A string that is the test bitmask for the
+*        scheduling state. The update happens if and only if the current
+*        scheduling state AND-ed with the bitmask is greater than 0.
 * @param state A string that is the scheduling state (a scheduling_state enum
 *        instance) to update the task entry with. The string's value must be a
 *        nonnegative integer less than 100, so that it has width at most 2. If

@@ -862,7 +863,7 @@ int TaskTableUpdate_RedisCommand(RedisModuleCtx *ctx,
 * 2.
 * @param ray_client_id A string that is the ray client ID of the associated
 *        local scheduler, if any, to update the task entry with.
-* @return If the current scheduling state does not match the test value,
+* @return If the current scheduling state does not match the test bitmask,
 *        returns nil. Else, returns the same as RAY.TASK_TABLE_GET: an array
 *        of strings representing the updated task fields in the following
 *        order: 1) (integer) scheduling state 2) (string) associated node ID,

@@ -903,16 +904,16 @@ int TaskTableTestAndUpdate_RedisCommand(RedisModuleCtx *ctx,
 "Found invalid scheduling state (must "
 "be an integer of width 2");
 }
-long long test_state_integer;
-int status = RedisModule_StringToLongLong(argv[2], &test_state_integer);
+long long test_state_bitmask;
+int status = RedisModule_StringToLongLong(argv[2], &test_state_bitmask);
 if (status != REDISMODULE_OK) {
 RedisModule_CloseKey(key);
 RedisModule_FreeString(ctx, state);
 return RedisModule_ReplyWithError(
 ctx, "Invalid test value for scheduling state");
 }
-if (current_state_integer != test_state_integer) {
-/* The current value does not match the test value, so do not perform the
+if ((current_state_integer & test_state_bitmask) == 0) {
+/* The current value does not match the test bitmask, so do not perform the
 * update. */
 RedisModule_CloseKey(key);
 RedisModule_FreeString(ctx, state);

@@ -864,8 +864,8 @@ void redis_task_table_test_and_update(table_callback_data *callback_data) {
 db->context, redis_task_table_test_and_update_callback,
 (void *) callback_data->timer_id,
 "RAY.TASK_TABLE_TEST_AND_UPDATE %b %d %d %b", task_id.id,
-sizeof(task_id.id), update_data->test_state, update_data->update_state,
-update_data->local_scheduler_id.id,
+sizeof(task_id.id), update_data->test_state_bitmask,
+update_data->update_state, update_data->local_scheduler_id.id,
 sizeof(update_data->local_scheduler_id.id));
 if ((status == REDIS_ERR) || db->context->err) {
 LOG_REDIS_DEBUG(db->context, "error in redis_task_table_test_and_update");

@@ -32,14 +32,14 @@ void task_table_update(db_handle *db_handle,

 void task_table_test_and_update(db_handle *db_handle,
 task_id task_id,
-int test_state,
+int test_state_bitmask,
 int update_state,
 retry_info *retry,
 task_table_get_callback done_callback,
 void *user_context) {
 task_table_test_and_update_data *update_data =
 malloc(sizeof(task_table_test_and_update_data));
-update_data->test_state = test_state;
+update_data->test_state_bitmask = test_state_bitmask;
 update_data->update_state = update_state;
 /* Update the task entry's local scheduler with this client's ID. */
 update_data->local_scheduler_id = db_handle->client;

@@ -96,10 +96,11 @@ void task_table_update(db_handle *db_handle,
 *
 * @param db_handle Database handle.
 * @param task_id The task ID of the task entry to update.
-* @param test_state The value to test the current task entry's scheduling
-*        state against.
+* @param test_state_bitmask The bitmask to apply to the task entry's current
+*        scheduling state. The update happens if and only if the current
+*        scheduling state AND-ed with the bitmask is greater than 0.
 * @param update_state The value to update the task entry's scheduling state
-*        with, if the current state matches test_state.
+*        with, if the current state matches test_state_bitmask.
 * @param retry Information about retrying the request to the database.
 * @param done_callback Function to be called when database returns result.
 * @param user_context Data that will be passed to done_callback and

@@ -108,7 +109,7 @@ void task_table_update(db_handle *db_handle,
 */
 void task_table_test_and_update(db_handle *db_handle,
 task_id task_id,
-int test_state,
+int test_state_bitmask,
 int update_state,
 retry_info *retry,
 task_table_get_callback done_callback,

@@ -116,7 +117,7 @@ void task_table_test_and_update(db_handle *db_handle,

 /* Data that is needed to test and set the task's scheduling state. */
 typedef struct {
-int test_state;
+int test_state_bitmask;
 int update_state;
 db_client_id local_scheduler_id;
 } task_table_test_and_update_data;

@@ -348,8 +348,10 @@ typedef enum {
 TASK_STATUS_RUNNING = 8,
 /** The task is done executing. */
 TASK_STATUS_DONE = 16,
+/** The task was not able to finish. */
+TASK_STATUS_LOST = 32,
 /** The task will be submitted for reexecution. */
-TASK_STATUS_RECONSTRUCTING = 32
+TASK_STATUS_RECONSTRUCTING = 64
 } scheduling_state;

 /** A task is an execution of a task specification. It has a state of execution

@@ -803,24 +803,6 @@ bool resource_constraints_satisfied(local_scheduler_state *state,
 return true;
 }

-/**
-* Update the result table, which holds mappings of object ID -> ID of the
-* task that created it.
-*
-* @param state The scheduler state.
-* @param spec The task spec in question.
-* @return Void.
-*/
-void update_result_table(local_scheduler_state *state, task_spec *spec) {
-if (state->db != NULL) {
-task_id task_id = task_spec_id(spec);
-for (int64_t i = 0; i < task_num_returns(spec); ++i) {
-object_id return_id = task_return(spec, i);
-result_table_add(state->db, return_id, task_id, NULL, NULL, NULL);
-}
-}
-}
-
 void handle_task_submitted(local_scheduler_state *state,
 scheduling_algorithm_state *algorithm_state,
 task_spec *spec) {

@@ -843,10 +825,6 @@ void handle_task_submitted(local_scheduler_state *state,

 /* Try to dispatch tasks, since we may have added one to the queue. */
 dispatch_tasks(state, algorithm_state);
-
-/* Update the result table, which holds mappings of object ID -> ID of the
-* task that created it. */
-update_result_table(state, spec);
 }

 void handle_actor_task_submitted(local_scheduler_state *state,

@@ -881,10 +859,6 @@ void handle_actor_task_submitted(local_scheduler_state *state,
 give_task_to_local_scheduler(state, algorithm_state, spec,
 entry->local_scheduler_id);
 }
-
-/* Update the result table, which holds mappings of object ID -> ID of the
-* task that created it. */
-update_result_table(state, spec);
 }

 void handle_actor_creation_notification(

@@ -994,6 +968,50 @@ void handle_worker_available(local_scheduler_state *state,
 dispatch_tasks(state, algorithm_state);
 }

+void handle_worker_removed(local_scheduler_state *state,
+scheduling_algorithm_state *algorithm_state,
+local_scheduler_client *worker) {
+/* Make sure that we remove the worker at most once. */
+bool removed = false;
+int64_t num_workers;
+
+/* Remove the worker from available workers, if it's there. */
+num_workers = utarray_len(algorithm_state->available_workers);
+for (int64_t i = num_workers - 1; i >= 0; --i) {
+local_scheduler_client **p = (local_scheduler_client **) utarray_eltptr(
+algorithm_state->available_workers, i);
+DCHECK(!((*p == worker) && removed));
+if (*p == worker) {
+utarray_erase(algorithm_state->available_workers, i, 1);
+removed = true;
+}
+}
+
+/* Remove the worker from executing workers, if it's there. */
+num_workers = utarray_len(algorithm_state->executing_workers);
+for (int64_t i = num_workers - 1; i >= 0; --i) {
+local_scheduler_client **p = (local_scheduler_client **) utarray_eltptr(
+algorithm_state->executing_workers, i);
+DCHECK(!((*p == worker) && removed));
+if (*p == worker) {
+utarray_erase(algorithm_state->executing_workers, i, 1);
+removed = true;
+}
+}
+
+/* Remove the worker from blocked workers, if it's there. */
+num_workers = utarray_len(algorithm_state->blocked_workers);
+for (int64_t i = num_workers - 1; i >= 0; --i) {
+local_scheduler_client **p = (local_scheduler_client **) utarray_eltptr(
+algorithm_state->blocked_workers, i);
+DCHECK(!((*p == worker) && removed));
+if (*p == worker) {
+utarray_erase(algorithm_state->blocked_workers, i, 1);
+removed = true;
+}
+}
+}
+
 void handle_actor_worker_available(local_scheduler_state *state,
 scheduling_algorithm_state *algorithm_state,
 local_scheduler_client *worker) {

@@ -150,6 +150,18 @@ void handle_worker_available(local_scheduler_state *state,
 scheduling_algorithm_state *algorithm_state,
 local_scheduler_client *worker);

+/**
+* This function is called when a worker is removed.
+*
+* @param state The state of the local scheduler.
+* @param algorithm_state State maintained by the scheduling algorithm.
+* @param worker The worker that is removed.
+* @return Void.
+*/
+void handle_worker_removed(local_scheduler_state *state,
+scheduling_algorithm_state *algorithm_state,
+local_scheduler_client *worker);
+
 /**
 * This version of handle_worker_available is called whenever the worker that is
 * available is running an actor.

@@ -28,7 +28,7 @@ static int PyPhotonClient_init(PyPhotonClient *self,
 }

 static void PyPhotonClient_dealloc(PyPhotonClient *self) {
-free(((PyPhotonClient *) self)->photon_connection);
+photon_disconnect(((PyPhotonClient *) self)->photon_connection);
 Py_TYPE(self)->tp_free((PyObject *) self);
 }

@@ -57,21 +57,27 @@ void print_resource_info(const local_scheduler_state *state,
 #endif
 }

+int force_kill_worker(event_loop *loop, timer_id id, void *context) {
+local_scheduler_client *worker = (local_scheduler_client *) context;
+kill(worker->pid, SIGKILL);
+close(worker->sock);
+free(worker);
+return EVENT_LOOP_TIMER_DONE;
+}
+
 /**
 * Kill a worker, if it is a child process, and clean up all of its associated
 * state.
 *
 * @param worker A pointer to the worker we want to kill.
-* @param wait A bool representing whether we should wait for the worker's
-*        process to exit. If the worker is not a child process, this flag is
-*        ignored.
+* @param cleanup A bool representing whether we're cleaning up the entire local
+*        scheduler's state, or just this worker. If true, then the worker will
+*        be force-killed immediately. Else, the worker will be given a chance
+*        to clean up its own state.
 * @return Void.
 */
-void kill_worker(local_scheduler_client *worker, bool wait) {
-/* TODO(swang): This method should also propagate changes to other parts of
-* the system to reflect the killed task in progress, if there was one. This
-* includes updating dynamic resources and updating the task table. */
-/* Erase the worker from the array of workers. */
+void kill_worker(local_scheduler_client *worker, bool cleanup) {
+/* Erase the local scheduler's reference to the worker. */
 local_scheduler_state *state = worker->local_scheduler_state;
 int num_workers = utarray_len(state->workers);
 for (int i = 0; i < utarray_len(state->workers); ++i) {

@@ -86,6 +92,8 @@ void kill_worker(local_scheduler_client *worker, bool wait) {
 "Found duplicate workers");
 CHECKM(utarray_len(state->workers) != num_workers,
 "Tried to kill worker that doesn't exist");
+/* Erase the algorithm state's reference to the worker. */
+handle_worker_removed(state, state->algorithm_state, worker);

 /* Remove the client socket from the event loop so that we don't process the
 * SIGPIPE when the worker is killed. */

@@ -93,27 +101,47 @@ void kill_worker(local_scheduler_client *worker, bool wait) {

 /* If the worker has registered a process ID with us and it's a child
 * process, use it to send a kill signal. */
+bool free_worker = true;
 if (worker->is_child && worker->pid != 0) {
+if (cleanup) {
+/* If we're exiting the local scheduler anyway, it's okay to force kill
+* the worker immediately. Wait for the process to exit. */
 kill(worker->pid, SIGKILL);
-if (wait) {
-/* Wait for the process to exit. */
 waitpid(worker->pid, NULL, 0);
+close(worker->sock);
+} else {
+/* If we're just cleaning up a single worker, allow it some time to clean
+* up its state before force killing. The client socket will be closed
+* and the worker struct will be freed after the timeout. */
+kill(worker->pid, SIGTERM);
+event_loop_add_timer(state->loop, KILL_WORKER_TIMEOUT_MILLISECONDS,
+force_kill_worker, (void *) worker);
+free_worker = false;
 }
 LOG_INFO("Killed worker with pid %d", worker->pid);
 }

-/* Clean up the client socket after killing the worker so that the worker
-* can't receive the SIGPIPE before exiting. */
-close(worker->sock);
-
 /* Clean up the task in progress. */
 if (worker->task_in_progress) {
-/* TODO(swang): Update the task table to mark the task as lost. */
+/* Return the resources that the worker was using. */
+task_spec *spec = task_task_spec(worker->task_in_progress);
+update_dynamic_resources(state, spec, true);
+/* Update the task table to reflect that the task failed to complete. */
+if (state->db != NULL) {
+task_set_state(worker->task_in_progress, TASK_STATUS_LOST);
+task_table_update(state->db, worker->task_in_progress, NULL, NULL, NULL);
+} else {
 free_task(worker->task_in_progress);
+}
 }

 LOG_DEBUG("Killed worker with pid %d", worker->pid);
+if (free_worker) {
+/* Clean up the client socket after killing the worker so that the worker
+* can't receive the SIGPIPE before exiting. */
+close(worker->sock);
 free(worker);
+}
 }

 void free_local_scheduler(local_scheduler_state *state) {

@@ -130,15 +158,6 @@ void free_local_scheduler(local_scheduler_state *state) {
 state->config.start_worker_command = NULL;
 }

-/* Disconnect from the database. */
-if (state->db != NULL) {
-db_disconnect(state->db);
-state->db = NULL;
-}
-/* Disconnect from plasma. */
-plasma_disconnect(state->plasma_conn);
-state->plasma_conn = NULL;
-
 /* Kill any child processes that didn't register as a worker yet. */
 pid_t *worker_pid;
 for (worker_pid = (pid_t *) utarray_front(state->child_pids);

@@ -152,6 +171,8 @@ void free_local_scheduler(local_scheduler_state *state) {

 /* Free the list of workers and any tasks that are still in progress on those
 * workers. */
+/* TODO(swang): It's possible that the local scheduler will exit before all
+* of its task table updates make it to redis. */
 for (local_scheduler_client **worker =
 (local_scheduler_client **) utarray_front(state->workers);
 worker != NULL;

@@ -161,6 +182,15 @@ void free_local_scheduler(local_scheduler_state *state) {
 utarray_free(state->workers);
 state->workers = NULL;

+/* Disconnect from the database. */
+if (state->db != NULL) {
+db_disconnect(state->db);
+state->db = NULL;
+}
+/* Disconnect from plasma. */
+plasma_disconnect(state->plasma_conn);
+state->plasma_conn = NULL;
+
 /* Free the mapping from the actor ID to the ID of the local scheduler
 * responsible for that actor. */
 actor_map_entry *current_actor_map_entry, *temp_actor_map_entry;

@@ -480,7 +510,7 @@ void reconstruct_task_update_callback(task *task, void *user_context) {
 }
 }

-void reconstruct_result_lookup_callback(object_id reconstruct_object_id,
+void reconstruct_evicted_result_lookup_callback(object_id reconstruct_object_id,
 task_id task_id,
 void *user_context) {
 /* TODO(swang): The following check will fail if an object was created by a

@@ -488,13 +518,33 @@ void reconstruct_result_lookup_callback(object_id reconstruct_object_id,
 CHECKM(!IS_NIL_ID(task_id),
 "No task information found for object during reconstruction");
 local_scheduler_state *state = user_context;
-/* Try to claim the responsibility for reconstruction by doing a test-and-set
-* of the task's scheduling state in the global state. If the task's
-* scheduling state is pending completion, assume that reconstruction is
-* already being taken care of. NOTE: This codepath is not responsible for
-* detecting failure of the other reconstruction, or updating the
-* scheduling_state accordingly. */
-task_table_test_and_update(state->db, task_id, TASK_STATUS_DONE,
+/* If there are no other instances of the task running, it's safe for us to
+* claim responsibility for reconstruction. */
+task_table_test_and_update(state->db, task_id,
+(TASK_STATUS_DONE | TASK_STATUS_LOST),
 TASK_STATUS_RECONSTRUCTING, NULL,
 reconstruct_task_update_callback, state);
 }
+
+void reconstruct_failed_result_lookup_callback(object_id reconstruct_object_id,
+task_id task_id,
+void *user_context) {
+/* TODO(swang): The following check will fail if an object was created by a
+* put. */
+if (IS_NIL_ID(task_id)) {
+/* NOTE(swang): For some reason, the result table update sometimes happens
+* after this lookup returns, possibly due to concurrent clients. In most
+* cases, this is okay because the initial execution is probably still
+* pending, so for now, we log a warning and suppress reconstruction. */
+LOG_WARN(
+"No task information found for object during reconstruction (no object "
+"entry yet)");
+return;
+}
+local_scheduler_state *state = user_context;
+/* If the task failed to finish, it's safe for us to claim responsibility for
+* reconstruction. */
+task_table_test_and_update(state->db, task_id, TASK_STATUS_LOST,
+TASK_STATUS_RECONSTRUCTING, NULL,
+reconstruct_task_update_callback, state);
+}

@@ -508,10 +558,19 @@ void reconstruct_object_lookup_callback(object_id reconstruct_object_id,
 * any nodes. NOTE: This codepath is not responsible for checking if the
 * object table entry is up-to-date. */
 local_scheduler_state *state = user_context;
-if (manager_count == 0) {
-/* Look up the task that created the object in the result table. */
+if (manager_count == 0) {
+/* If the object was created and later evicted, we reconstruct the object
+* if and only if there are no other instances of the task running. */
 result_table_lookup(state->db, reconstruct_object_id, NULL,
-reconstruct_result_lookup_callback, (void *) state);
+reconstruct_evicted_result_lookup_callback,
+(void *) state);
+} else if (manager_count == -1) {
+/* If the object has not been created yet, we reconstruct the object if and
+* only if the task that created the object failed to complete. */
+result_table_lookup(state->db, reconstruct_object_id, NULL,
+reconstruct_failed_result_lookup_callback,
+(void *) state);
 }
 }

@@ -541,6 +600,17 @@ void process_message(event_loop *loop,
 switch (type) {
 case SUBMIT_TASK: {
 task_spec *spec = (task_spec *) utarray_front(state->input_buffer);
+/* Update the result table, which holds mappings of object ID -> ID of the
+* task that created it. */
+if (state->db != NULL) {
+task_id task_id = task_spec_id(spec);
+for (int64_t i = 0; i < task_num_returns(spec); ++i) {
+object_id return_id = task_return(spec, i);
+result_table_add(state->db, return_id, task_id, NULL, NULL, NULL);
+}
+}
+
+/* Handle the task submission. */
 if (actor_ids_equal(task_spec_actor_id(spec), NIL_ACTOR_ID)) {
 handle_task_submitted(state, state->algorithm_state, spec);
 } else {

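
kill_worker() above now sends SIGTERM first and schedules force_kill_worker() on the event loop after KILL_WORKER_TIMEOUT_MILLISECONDS, so a worker gets a chance to clean up before being force-killed. Below is a rough Python illustration of the same two-phase kill, polling instead of using an event-loop timer, with a "sleep" command standing in for the worker; it assumes a POSIX system and is not the scheduler's actual code.

# Two-phase kill: SIGTERM, a bounded grace period, then SIGKILL as a fallback.
import signal
import subprocess
import time

KILL_WORKER_TIMEOUT_MILLISECONDS = 100

worker = subprocess.Popen(["sleep", "1000"])
worker.send_signal(signal.SIGTERM)

# Give the worker the timeout to exit on its own.
deadline = time.time() + KILL_WORKER_TIMEOUT_MILLISECONDS / 1000.0
while worker.poll() is None and time.time() < deadline:
    time.sleep(0.01)

if worker.poll() is None:
    # The worker did not exit in time, so force-kill it.
    worker.kill()
worker.wait()
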
@@ -7,6 +7,10 @@
 /* The duration between local scheduler heartbeats. */
 #define LOCAL_SCHEDULER_HEARTBEAT_TIMEOUT_MILLISECONDS 100

+/* The duration that we wait after sending a worker SIGTERM before sending the
+* worker SIGKILL. */
+#define KILL_WORKER_TIMEOUT_MILLISECONDS 100
+
 #define DEFAULT_NUM_CPUS INT16_MAX
 #define DEFAULT_NUM_GPUS 0

@@ -50,19 +50,12 @@ typedef struct {
 photon_conn **conns;
 } photon_mock;

-photon_mock *init_photon_mock(bool connect_to_redis,
-int num_workers,
-int num_mock_workers) {
+photon_mock *init_photon_mock(int num_workers, int num_mock_workers) {
 const char *node_ip_address = "127.0.0.1";
-const char *redis_addr = NULL;
-int redis_port = -1;
+const char *redis_addr = node_ip_address;
+int redis_port = 6379;
 const double static_resource_conf[MAX_RESOURCE_INDEX] = {DEFAULT_NUM_CPUS,
 DEFAULT_NUM_GPUS};
-if (connect_to_redis) {
-redis_addr = node_ip_address;
-redis_port = 6379;
-}
-
 photon_mock *mock = malloc(sizeof(photon_mock));
 memset(mock, 0, sizeof(photon_mock));
 mock->loop = event_loop_create();

@@ -114,10 +107,25 @@ photon_mock *init_photon_mock(bool connect_to_redis,
 }

 void destroy_photon_mock(photon_mock *mock) {
 /* Disconnect clients. */
 for (int i = 0; i < mock->num_photon_conns; ++i) {
 photon_disconnect(mock->conns[i]);
 }
 free(mock->conns);
+
+/* Kill all the workers and run the event loop again so that the task table
+* updates propagate and the tasks in progress are freed. */
+local_scheduler_client **worker = (local_scheduler_client **) utarray_eltptr(
+mock->photon_state->workers, 0);
+while (worker != NULL) {
+kill_worker(*worker, true);
+worker = (local_scheduler_client **) utarray_eltptr(
+mock->photon_state->workers, 0);
+}
+event_loop_add_timer(mock->loop, 500,
+(event_loop_timer_handler) timeout_handler, NULL);
+event_loop_run(mock->loop);
+
+/* This also frees mock->loop. */
 free_local_scheduler(mock->photon_state);
 close(mock->plasma_store_fd);

@@ -138,7 +146,7 @@ void reset_worker(photon_mock *mock, local_scheduler_client *worker) {
 * value, the task should get assigned to a worker again.
 */
 TEST object_reconstruction_test(void) {
-photon_mock *photon = init_photon_mock(true, 0, 1);
+photon_mock *photon = init_photon_mock(0, 1);
 photon_conn *worker = photon->conns[0];

 /* Create a task with zero dependencies and one return value. */

@@ -207,7 +215,7 @@ TEST object_reconstruction_test(void) {
 * should trigger reconstruction of all previous tasks in the lineage.
 */
 TEST object_reconstruction_recursive_test(void) {
-photon_mock *photon = init_photon_mock(true, 0, 1);
+photon_mock *photon = init_photon_mock(0, 1);
 photon_conn *worker = photon->conns[0];
 /* Create a chain of tasks, each one dependent on the one before it. Mark
 * each object as available so that tasks will run immediately. */

@@ -316,7 +324,7 @@ void object_reconstruction_suppression_callback(object_id object_id,
 }

 TEST object_reconstruction_suppression_test(void) {
-photon_mock *photon = init_photon_mock(true, 0, 1);
+photon_mock *photon = init_photon_mock(0, 1);
 photon_conn *worker = photon->conns[0];

 object_reconstruction_suppression_spec = example_task_spec(0, 1);

@@ -365,7 +373,7 @@ TEST object_reconstruction_suppression_test(void) {
 }

 TEST task_dependency_test(void) {
-photon_mock *photon = init_photon_mock(false, 0, 1);
+photon_mock *photon = init_photon_mock(0, 1);
 local_scheduler_state *state = photon->photon_state;
 scheduling_algorithm_state *algorithm_state = state->algorithm_state;
 /* Get the first worker. */

@@ -440,7 +448,7 @@ TEST task_dependency_test(void) {
 }

 TEST task_multi_dependency_test(void) {
-photon_mock *photon = init_photon_mock(false, 0, 1);
+photon_mock *photon = init_photon_mock(0, 1);
 local_scheduler_state *state = photon->photon_state;
 scheduling_algorithm_state *algorithm_state = state->algorithm_state;
 /* Get the first worker. */

@@ -516,7 +524,7 @@ TEST task_multi_dependency_test(void) {
 TEST start_kill_workers_test(void) {
 /* Start some workers. */
 int num_workers = 4;
-photon_mock *photon = init_photon_mock(true, num_workers, 0);
+photon_mock *photon = init_photon_mock(num_workers, 0);
 /* We start off with num_workers children processes, but no workers
 * registered yet. */
 ASSERT_EQ(utarray_len(photon->photon_state->child_pids), num_workers);

@@ -548,7 +556,7 @@ TEST start_kill_workers_test(void) {
 /* After killing a worker, its state is cleaned up. */
 local_scheduler_client *worker = *(local_scheduler_client **) utarray_eltptr(
 photon->photon_state->workers, 0);
-kill_worker(worker, true);
+kill_worker(worker, false);
 ASSERT_EQ(utarray_len(photon->photon_state->child_pids), 0);
 ASSERT_EQ(utarray_len(photon->photon_state->workers), num_workers - 1);

@@ -581,9 +589,9 @@ SUITE(photon_tests) {
 RUN_REDIS_TEST(object_reconstruction_test);
 RUN_REDIS_TEST(object_reconstruction_recursive_test);
 RUN_REDIS_TEST(object_reconstruction_suppression_test);
-RUN_TEST(task_dependency_test);
-RUN_TEST(task_multi_dependency_test);
-RUN_TEST(start_kill_workers_test);
+RUN_REDIS_TEST(task_dependency_test);
+RUN_REDIS_TEST(task_multi_dependency_test);
+RUN_REDIS_TEST(start_kill_workers_test);
 }

 GREATEST_MAIN_DEFS();

@@ -70,5 +70,38 @@ class ComponentFailureTest(unittest.TestCase):
 self.assertTrue(ray.services.all_processes_alive(exclude=[ray.services.PROCESS_TYPE_WORKER]))
 ray.worker.cleanup()

+def _testWorkerFailed(self, num_local_schedulers):
+@ray.remote
+def f(x):
+time.sleep(0.5)
+return x
+
+num_initial_workers = 4
+ray.worker._init(num_workers=num_initial_workers * num_local_schedulers,
+num_local_schedulers=num_local_schedulers,
+start_workers_from_local_scheduler=False,
+start_ray_local=True,
+num_cpus=[num_initial_workers] * num_local_schedulers)
+# Submit more tasks than there are workers so that all workers and cores
+# are utilized.
+object_ids = [f.remote(i) for i in range(num_initial_workers * num_local_schedulers)]
+object_ids += [f.remote(object_id) for object_id in object_ids]
+# Allow the tasks some time to begin executing.
+time.sleep(0.1)
+# Kill the workers as the tasks execute.
+for worker in ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER]:
+worker.terminate()
+time.sleep(0.1)
+# Make sure that we can still get the objects after the executing tasks died.
+ray.get(object_ids)
+
+ray.worker.cleanup()
+
+def testWorkerFailed(self):
+self._testWorkerFailed(1)
+
+def testWorkerFailedMultinode(self):
+self._testWorkerFailed(4)
+
 if __name__ == "__main__":
 unittest.main(verbosity=2)