Availability after worker failure (#316)

* Availability after a killed worker

* Workers exit cleanly

* Memory cleanup in photon C tests

* Worker failure in multinode

* Consolidate worker cleanup handlers

* Update the result table before handling a task submission

* KILL_WORKER_TIMEOUT -> KILL_WORKER_TIMEOUT_MILLISECONDS

* Log a warning instead of crashing if no result table entry found
Author: Stephanie Wang, 2017-02-25 20:19:36 -08:00 (committed by Robert Nishihara)
Parent: 232601f90d
Commit: be1618f041
14 changed files with 307 additions and 118 deletions


@@ -216,14 +216,18 @@ class TestGlobalStateStore(unittest.TestCase):
"node_id")
def testTaskTableAddAndLookup(self):
TASK_STATUS_WAITING = 1
TASK_STATUS_SCHEDULED = 2
TASK_STATUS_QUEUED = 4
# Check that task table adds, updates, and lookups work correctly.
task_args = [1, b"node_id", b"task_spec"]
task_args = [TASK_STATUS_WAITING, b"node_id", b"task_spec"]
response = self.redis.execute_command("RAY.TASK_TABLE_ADD", "task_id",
*task_args)
response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
self.assertEqual(response, task_args)
task_args[0] = 2
task_args[0] = TASK_STATUS_SCHEDULED
self.redis.execute_command("RAY.TASK_TABLE_UPDATE", "task_id", *task_args[:2])
response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
self.assertEqual(response, task_args)
@@ -241,7 +245,7 @@ class TestGlobalStateStore(unittest.TestCase):
# If the current value is the same as the test value, and the set value is
# different, the update happens, and the response is the entire task.
task_args[1] += 1
task_args[1] = TASK_STATUS_QUEUED
response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
"task_id",
*task_args[:3])
@@ -252,7 +256,7 @@ class TestGlobalStateStore(unittest.TestCase):
# If the current value is no longer the same as the test value, the
# response is nil.
task_args[1] += 1
task_args[1] = TASK_STATUS_WAITING
response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
"task_id",
*task_args[:3])
@@ -262,6 +266,27 @@ class TestGlobalStateStore(unittest.TestCase):
self.assertEqual(get_response2, get_response)
self.assertNotEqual(get_response2, task_args[1:])
# If the test value is a bitmask that matches the current value, the update
# happens.
task_args[0] = TASK_STATUS_SCHEDULED | TASK_STATUS_QUEUED
response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
"task_id",
*task_args[:3])
self.assertEqual(response, task_args[1:])
# If the test value is a bitmask that does not match the current value, the
# update does not happen.
task_args[1] = TASK_STATUS_SCHEDULED
old_response = response
response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
"task_id",
*task_args[:3])
self.assertEqual(response, None)
# Check that the update did not happen.
get_response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
self.assertEqual(get_response, old_response)
self.assertNotEqual(get_response, task_args[1:])
def testTaskTableSubscribe(self):
scheduling_state = 1
node_id = "node_id"


@@ -14,6 +14,7 @@ import numpy as np
import os
import random
import redis
import signal
import string
import sys
import threading
@@ -936,23 +937,31 @@ def init(redis_address=None, node_ip_address=None, object_id_seed=None,
num_gpus=num_gpus)
def cleanup(worker=global_worker):
"""Disconnect the driver, and terminate any processes started in init.
"""Disconnect the worker, and terminate any processes started in init.
This will automatically run at the end when a Python process that uses Ray
exits. It is ok to run this twice in a row. Note that we manually call
services.cleanup() in the tests because we need to start and stop many
clusters in the tests, but the import and exit only happen once.
"""
# If this is a driver, push the finish time to Redis.
if worker.mode in [SCRIPT_MODE, SILENT_MODE]:
worker.redis_client.hmset(b"Drivers:" + worker.worker_id,
{"end_time": time.time()})
disconnect(worker)
worker.set_mode(None)
if hasattr(worker, "photon_client"):
del worker.photon_client
if hasattr(worker, "plasma_client"):
worker.plasma_client.shutdown()
services.cleanup()
if worker.mode in [SCRIPT_MODE, SILENT_MODE]:
# If this is a driver, push the finish time to Redis and clean up any
# other services that were started with the driver.
worker.redis_client.hmset(b"Drivers:" + worker.worker_id,
{"end_time": time.time()})
services.cleanup()
else:
# If this is not a driver, make sure there are no orphan processes.
for process_type, processes in services.all_processes.items():
assert(len(processes) == 0)
worker.set_mode(None)
atexit.register(cleanup)
@@ -1559,6 +1568,12 @@ def main_loop(worker=global_worker):
that occurred while executing the command, and waits for the next command.
"""
def exit(signum, frame):
cleanup(worker=worker)
sys.exit(0)
signal.signal(signal.SIGTERM, exit)
def process_task(task): # wrapping these lines in a function should cause the local variables to go out of scope more quickly, which is useful for inspecting reference counts
"""Execute a task assigned to this worker.


@@ -844,17 +844,18 @@ int TaskTableUpdate_RedisCommand(RedisModuleCtx *ctx,
/**
* Test and update an entry in the task table if the current value matches the
* test value. This does not update the task specification in the table.
* test value bitmask. This does not update the task specification in the
* table.
*
* This is called from a client with the command:
*
* RAY.TASK_TABLE_TEST_AND_UPDATE <task ID> <test state> <state>
* RAY.TASK_TABLE_TEST_AND_UPDATE <task ID> <test state bitmask> <state>
* <local scheduler ID>
*
* @param task_id A string that is the ID of the task.
* @param test_state A string that is the test value for the scheduling state.
* The update happens if and only if the current scheduling state
* matches this value.
* @param test_state_bitmask A string that is the test bitmask for the
* scheduling state. The update happens if and only if the current
* scheduling state AND-ed with the bitmask is greater than 0.
* @param state A string that is the scheduling state (a scheduling_state enum
* instance) to update the task entry with. The string's value must be a
* nonnegative integer less than 100, so that it has width at most 2. If
@@ -862,7 +863,7 @@ int TaskTableUpdate_RedisCommand(RedisModuleCtx *ctx,
* 2.
* @param ray_client_id A string that is the ray client ID of the associated
* local scheduler, if any, to update the task entry with.
* @return If the current scheduling state does not match the test value,
* @return If the current scheduling state does not match the test bitmask,
* returns nil. Else, returns the same as RAY.TASK_TABLE_GET: an array
* of strings representing the updated task fields in the following
* order: 1) (integer) scheduling state 2) (string) associated node ID,
@@ -903,16 +904,16 @@ int TaskTableTestAndUpdate_RedisCommand(RedisModuleCtx *ctx,
"Found invalid scheduling state (must "
"be an integer of width 2");
}
long long test_state_integer;
int status = RedisModule_StringToLongLong(argv[2], &test_state_integer);
long long test_state_bitmask;
int status = RedisModule_StringToLongLong(argv[2], &test_state_bitmask);
if (status != REDISMODULE_OK) {
RedisModule_CloseKey(key);
RedisModule_FreeString(ctx, state);
return RedisModule_ReplyWithError(
ctx, "Invalid test value for scheduling state");
}
if (current_state_integer != test_state_integer) {
/* The current value does not match the test value, so do not perform the
if ((current_state_integer & test_state_bitmask) == 0) {
/* The current value does not match the test bitmask, so do not perform the
* update. */
RedisModule_CloseKey(key);
RedisModule_FreeString(ctx, state);
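As a usage illustration of the command documented above, the test-and-update can be issued from Python with redis-py, much like the unit test at the top of this commit. This is a hypothetical snippet: it assumes a Redis server on 127.0.0.1:6379 with the Ray Redis module loaded, and the task ID and local scheduler ID are placeholders.

```python
import redis

# Scheduling-state values used by the reconstruction path in this commit.
TASK_STATUS_DONE = 16
TASK_STATUS_LOST = 32
TASK_STATUS_RECONSTRUCTING = 64

r = redis.StrictRedis(host="127.0.0.1", port=6379)

# Try to claim responsibility for reconstructing a task: the update happens
# only if the current scheduling state AND-ed with the test bitmask is nonzero.
reply = r.execute_command(
    "RAY.TASK_TABLE_TEST_AND_UPDATE",
    b"task_id",                            # placeholder task ID
    TASK_STATUS_DONE | TASK_STATUS_LOST,   # test state bitmask
    TASK_STATUS_RECONSTRUCTING,            # new scheduling state
    b"local_scheduler_id")                 # placeholder local scheduler ID

# None means the test failed (another client already claimed the task);
# otherwise the reply is the updated task entry, as with RAY.TASK_TABLE_GET.
print(reply)
```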


@@ -864,8 +864,8 @@ void redis_task_table_test_and_update(table_callback_data *callback_data) {
db->context, redis_task_table_test_and_update_callback,
(void *) callback_data->timer_id,
"RAY.TASK_TABLE_TEST_AND_UPDATE %b %d %d %b", task_id.id,
sizeof(task_id.id), update_data->test_state, update_data->update_state,
update_data->local_scheduler_id.id,
sizeof(task_id.id), update_data->test_state_bitmask,
update_data->update_state, update_data->local_scheduler_id.id,
sizeof(update_data->local_scheduler_id.id));
if ((status == REDIS_ERR) || db->context->err) {
LOG_REDIS_DEBUG(db->context, "error in redis_task_table_test_and_update");


@@ -32,14 +32,14 @@ void task_table_update(db_handle *db_handle,
void task_table_test_and_update(db_handle *db_handle,
task_id task_id,
int test_state,
int test_state_bitmask,
int update_state,
retry_info *retry,
task_table_get_callback done_callback,
void *user_context) {
task_table_test_and_update_data *update_data =
malloc(sizeof(task_table_test_and_update_data));
update_data->test_state = test_state;
update_data->test_state_bitmask = test_state_bitmask;
update_data->update_state = update_state;
/* Update the task entry's local scheduler with this client's ID. */
update_data->local_scheduler_id = db_handle->client;


@@ -96,10 +96,11 @@ void task_table_update(db_handle *db_handle,
*
* @param db_handle Database handle.
* @param task_id The task ID of the task entry to update.
* @param test_state The value to test the current task entry's scheduling
* state against.
* @param test_state_bitmask The bitmask to apply to the task entry's current
* scheduling state. The update happens if and only if the current
* scheduling state AND-ed with the bitmask is greater than 0.
* @param update_state The value to update the task entry's scheduling state
* with, if the current state matches test_state.
* with, if the current state matches test_state_bitmask.
* @param retry Information about retrying the request to the database.
* @param done_callback Function to be called when database returns result.
* @param user_context Data that will be passed to done_callback and
@@ -108,7 +109,7 @@ void task_table_update(db_handle *db_handle,
*/
void task_table_test_and_update(db_handle *db_handle,
task_id task_id,
int test_state,
int test_state_bitmask,
int update_state,
retry_info *retry,
task_table_get_callback done_callback,
@@ -116,7 +117,7 @@ void task_table_test_and_update(db_handle *db_handle,
/* Data that is needed to test and set the task's scheduling state. */
typedef struct {
int test_state;
int test_state_bitmask;
int update_state;
db_client_id local_scheduler_id;
} task_table_test_and_update_data;


@@ -348,8 +348,10 @@ typedef enum {
TASK_STATUS_RUNNING = 8,
/** The task is done executing. */
TASK_STATUS_DONE = 16,
/** The task was not able to finish. */
TASK_STATUS_LOST = 32,
/** The task will be submitted for reexecution. */
TASK_STATUS_RECONSTRUCTING = 32
TASK_STATUS_RECONSTRUCTING = 64
} scheduling_state;
/** A task is an execution of a task specification. It has a state of execution
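The renumbering above matters because every scheduling state must occupy its own bit for the new bitmask test to be meaningful. A short illustrative sketch of the flag semantics in Python (the WAITING/SCHEDULED/QUEUED values are taken from the Python test in this commit; only RUNNING through RECONSTRUCTING appear in this header hunk):

```python
from enum import IntFlag

class SchedulingState(IntFlag):
    WAITING = 1
    SCHEDULED = 2
    QUEUED = 4
    RUNNING = 8
    DONE = 16
    LOST = 32
    RECONSTRUCTING = 64  # Previously 32, which is now taken by LOST.

# The evicted-object reconstruction path tests against DONE | LOST.
mask = SchedulingState.DONE | SchedulingState.LOST
print(bool(SchedulingState.LOST & mask))     # True: update happens.
print(bool(SchedulingState.RUNNING & mask))  # False: update is skipped.
```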


@@ -803,24 +803,6 @@ bool resource_constraints_satisfied(local_scheduler_state *state,
return true;
}
/**
* Update the result table, which holds mappings of object ID -> ID of the
* task that created it.
*
* @param state The scheduler state.
* @param spec The task spec in question.
* @return Void.
*/
void update_result_table(local_scheduler_state *state, task_spec *spec) {
if (state->db != NULL) {
task_id task_id = task_spec_id(spec);
for (int64_t i = 0; i < task_num_returns(spec); ++i) {
object_id return_id = task_return(spec, i);
result_table_add(state->db, return_id, task_id, NULL, NULL, NULL);
}
}
}
void handle_task_submitted(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
task_spec *spec) {
@@ -843,10 +825,6 @@ void handle_task_submitted(local_scheduler_state *state,
/* Try to dispatch tasks, since we may have added one to the queue. */
dispatch_tasks(state, algorithm_state);
/* Update the result table, which holds mappings of object ID -> ID of the
* task that created it. */
update_result_table(state, spec);
}
void handle_actor_task_submitted(local_scheduler_state *state,
@@ -881,10 +859,6 @@ void handle_actor_task_submitted(local_scheduler_state *state,
give_task_to_local_scheduler(state, algorithm_state, spec,
entry->local_scheduler_id);
}
/* Update the result table, which holds mappings of object ID -> ID of the
* task that created it. */
update_result_table(state, spec);
}
void handle_actor_creation_notification(
@@ -994,6 +968,50 @@ void handle_worker_available(local_scheduler_state *state,
dispatch_tasks(state, algorithm_state);
}
void handle_worker_removed(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
local_scheduler_client *worker) {
/* Make sure that we remove the worker at most once. */
bool removed = false;
int64_t num_workers;
/* Remove the worker from available workers, if it's there. */
num_workers = utarray_len(algorithm_state->available_workers);
for (int64_t i = num_workers - 1; i >= 0; --i) {
local_scheduler_client **p = (local_scheduler_client **) utarray_eltptr(
algorithm_state->available_workers, i);
DCHECK(!((*p == worker) && removed));
if (*p == worker) {
utarray_erase(algorithm_state->available_workers, i, 1);
removed = true;
}
}
/* Remove the worker from executing workers, if it's there. */
num_workers = utarray_len(algorithm_state->executing_workers);
for (int64_t i = num_workers - 1; i >= 0; --i) {
local_scheduler_client **p = (local_scheduler_client **) utarray_eltptr(
algorithm_state->executing_workers, i);
DCHECK(!((*p == worker) && removed));
if (*p == worker) {
utarray_erase(algorithm_state->executing_workers, i, 1);
removed = true;
}
}
/* Remove the worker from blocked workers, if it's there. */
num_workers = utarray_len(algorithm_state->blocked_workers);
for (int64_t i = num_workers - 1; i >= 0; --i) {
local_scheduler_client **p = (local_scheduler_client **) utarray_eltptr(
algorithm_state->blocked_workers, i);
DCHECK(!((*p == worker) && removed));
if (*p == worker) {
utarray_erase(algorithm_state->blocked_workers, i, 1);
removed = true;
}
}
}
void handle_actor_worker_available(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
local_scheduler_client *worker) {
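A condensed Python sketch of what handle_worker_removed does with the three worker queues; names are illustrative, and the real code walks utarrays of local_scheduler_client pointers with a DCHECK instead of an assert.

```python
def remove_worker(worker, available, executing, blocked):
    """Remove a worker from every queue, asserting it appears at most once."""
    removed = False
    for queue in (available, executing, blocked):
        while worker in queue:
            assert not removed, "worker appeared in more than one queue"
            queue.remove(worker)
            removed = True
    return removed

# Example: a worker sitting in the executing queue is removed exactly once.
available, executing, blocked = [], ["w1"], []
print(remove_worker("w1", available, executing, blocked))  # True
```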


@@ -150,6 +150,18 @@ void handle_worker_available(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
local_scheduler_client *worker);
/**
* This function is called when a worker is removed.
*
* @param state The state of the local scheduler.
* @param algorithm_state State maintained by the scheduling algorithm.
* @param worker The worker that is removed.
* @return Void.
*/
void handle_worker_removed(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
local_scheduler_client *worker);
/**
* This version of handle_worker_available is called whenever the worker that is
* available is running an actor.


@@ -28,7 +28,7 @@ static int PyPhotonClient_init(PyPhotonClient *self,
}
static void PyPhotonClient_dealloc(PyPhotonClient *self) {
free(((PyPhotonClient *) self)->photon_connection);
photon_disconnect(((PyPhotonClient *) self)->photon_connection);
Py_TYPE(self)->tp_free((PyObject *) self);
}


@@ -57,21 +57,27 @@ void print_resource_info(const local_scheduler_state *state,
#endif
}
int force_kill_worker(event_loop *loop, timer_id id, void *context) {
local_scheduler_client *worker = (local_scheduler_client *) context;
kill(worker->pid, SIGKILL);
close(worker->sock);
free(worker);
return EVENT_LOOP_TIMER_DONE;
}
/**
* Kill a worker, if it is a child process, and clean up all of its associated
* state.
*
* @param worker A pointer to the worker we want to kill.
* @param wait A bool representing whether we should wait for the worker's
* process to exit. If the worker is not a child process, this flag is
* ignored.
* @param cleanup A bool representing whether we're cleaning up the entire local
* scheduler's state, or just this worker. If true, then the worker will
* be force-killed immediately. Else, the worker will be given a chance
* to clean up its own state.
* @return Void.
*/
void kill_worker(local_scheduler_client *worker, bool wait) {
/* TODO(swang): This method should also propagate changes to other parts of
* the system to reflect the killed task in progress, if there was one. This
* includes updating dynamic resources and updating the task table. */
/* Erase the worker from the array of workers. */
void kill_worker(local_scheduler_client *worker, bool cleanup) {
/* Erase the local scheduler's reference to the worker. */
local_scheduler_state *state = worker->local_scheduler_state;
int num_workers = utarray_len(state->workers);
for (int i = 0; i < utarray_len(state->workers); ++i) {
@@ -86,6 +92,8 @@ void kill_worker(local_scheduler_client *worker, bool wait) {
"Found duplicate workers");
CHECKM(utarray_len(state->workers) != num_workers,
"Tried to kill worker that doesn't exist");
/* Erase the algorithm state's reference to the worker. */
handle_worker_removed(state, state->algorithm_state, worker);
/* Remove the client socket from the event loop so that we don't process the
* SIGPIPE when the worker is killed. */
@@ -93,27 +101,47 @@ void kill_worker(local_scheduler_client *worker, bool wait) {
/* If the worker has registered a process ID with us and it's a child
* process, use it to send a kill signal. */
bool free_worker = true;
if (worker->is_child && worker->pid != 0) {
kill(worker->pid, SIGKILL);
if (wait) {
/* Wait for the process to exit. */
if (cleanup) {
/* If we're exiting the local scheduler anyway, it's okay to force kill
* the worker immediately. Wait for the process to exit. */
kill(worker->pid, SIGKILL);
waitpid(worker->pid, NULL, 0);
close(worker->sock);
} else {
/* If we're just cleaning up a single worker, allow it some time to clean
* up its state before force killing. The client socket will be closed
* and the worker struct will be freed after the timeout. */
kill(worker->pid, SIGTERM);
event_loop_add_timer(state->loop, KILL_WORKER_TIMEOUT_MILLISECONDS,
force_kill_worker, (void *) worker);
free_worker = false;
}
LOG_INFO("Killed worker with pid %d", worker->pid);
}
/* Clean up the client socket after killing the worker so that the worker
* can't receive the SIGPIPE before exiting. */
close(worker->sock);
/* Clean up the task in progress. */
if (worker->task_in_progress) {
/* TODO(swang): Update the task table to mark the task as lost. */
free_task(worker->task_in_progress);
/* Return the resources that the worker was using. */
task_spec *spec = task_task_spec(worker->task_in_progress);
update_dynamic_resources(state, spec, true);
/* Update the task table to reflect that the task failed to complete. */
if (state->db != NULL) {
task_set_state(worker->task_in_progress, TASK_STATUS_LOST);
task_table_update(state->db, worker->task_in_progress, NULL, NULL, NULL);
} else {
free_task(worker->task_in_progress);
}
}
LOG_DEBUG("Killed worker with pid %d", worker->pid);
free(worker);
if (free_worker) {
/* Clean up the client socket after killing the worker so that the worker
* can't receive the SIGPIPE before exiting. */
close(worker->sock);
free(worker);
}
}
void free_local_scheduler(local_scheduler_state *state) {
@@ -130,15 +158,6 @@ void free_local_scheduler(local_scheduler_state *state) {
state->config.start_worker_command = NULL;
}
/* Disconnect from the database. */
if (state->db != NULL) {
db_disconnect(state->db);
state->db = NULL;
}
/* Disconnect from plasma. */
plasma_disconnect(state->plasma_conn);
state->plasma_conn = NULL;
/* Kill any child processes that didn't register as a worker yet. */
pid_t *worker_pid;
for (worker_pid = (pid_t *) utarray_front(state->child_pids);
@@ -152,6 +171,8 @@ void free_local_scheduler(local_scheduler_state *state) {
/* Free the list of workers and any tasks that are still in progress on those
* workers. */
/* TODO(swang): It's possible that the local scheduler will exit before all
* of its task table updates make it to redis. */
for (local_scheduler_client **worker =
(local_scheduler_client **) utarray_front(state->workers);
worker != NULL;
@@ -161,6 +182,15 @@ void free_local_scheduler(local_scheduler_state *state) {
utarray_free(state->workers);
state->workers = NULL;
/* Disconnect from the database. */
if (state->db != NULL) {
db_disconnect(state->db);
state->db = NULL;
}
/* Disconnect from plasma. */
plasma_disconnect(state->plasma_conn);
state->plasma_conn = NULL;
/* Free the mapping from the actor ID to the ID of the local scheduler
* responsible for that actor. */
actor_map_entry *current_actor_map_entry, *temp_actor_map_entry;
@@ -480,21 +510,41 @@ void reconstruct_task_update_callback(task *task, void *user_context) {
}
}
void reconstruct_result_lookup_callback(object_id reconstruct_object_id,
task_id task_id,
void *user_context) {
void reconstruct_evicted_result_lookup_callback(object_id reconstruct_object_id,
task_id task_id,
void *user_context) {
/* TODO(swang): The following check will fail if an object was created by a
* put. */
CHECKM(!IS_NIL_ID(task_id),
"No task information found for object during reconstruction");
local_scheduler_state *state = user_context;
/* Try to claim the responsibility for reconstruction by doing a test-and-set
* of the task's scheduling state in the global state. If the task's
* scheduling state is pending completion, assume that reconstruction is
* already being taken care of. NOTE: This codepath is not responsible for
* detecting failure of the other reconstruction, or updating the
* scheduling_state accordingly. */
task_table_test_and_update(state->db, task_id, TASK_STATUS_DONE,
/* If there are no other instances of the task running, it's safe for us to
* claim responsibility for reconstruction. */
task_table_test_and_update(state->db, task_id,
(TASK_STATUS_DONE | TASK_STATUS_LOST),
TASK_STATUS_RECONSTRUCTING, NULL,
reconstruct_task_update_callback, state);
}
void reconstruct_failed_result_lookup_callback(object_id reconstruct_object_id,
task_id task_id,
void *user_context) {
/* TODO(swang): The following check will fail if an object was created by a
* put. */
if (IS_NIL_ID(task_id)) {
/* NOTE(swang): For some reason, the result table update sometimes happens
* after this lookup returns, possibly due to concurrent clients. In most
* cases, this is okay because the initial execution is probably still
* pending, so for now, we log a warning and suppress reconstruction. */
LOG_WARN(
"No task information found for object during reconstruction (no object "
"entry yet)");
return;
}
local_scheduler_state *state = user_context;
/* If the task failed to finish, it's safe for us to claim responsibility for
* reconstruction. */
task_table_test_and_update(state->db, task_id, TASK_STATUS_LOST,
TASK_STATUS_RECONSTRUCTING, NULL,
reconstruct_task_update_callback, state);
}
@@ -508,10 +558,19 @@ void reconstruct_object_lookup_callback(object_id reconstruct_object_id,
* any nodes. NOTE: This codepath is not responsible for checking if the
* object table entry is up-to-date. */
local_scheduler_state *state = user_context;
/* Look up the task that created the object in the result table. */
if (manager_count == 0) {
/* Look up the task that created the object in the result table. */
/* If the object was created and later evicted, we reconstruct the object
* if and only if there are no other instances of the task running. */
result_table_lookup(state->db, reconstruct_object_id, NULL,
reconstruct_result_lookup_callback, (void *) state);
reconstruct_evicted_result_lookup_callback,
(void *) state);
} else if (manager_count == -1) {
/* If the object has not been created yet, we reconstruct the object if and
* only if the task that created the object failed to complete. */
result_table_lookup(state->db, reconstruct_object_id, NULL,
reconstruct_failed_result_lookup_callback,
(void *) state);
}
}
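To summarize the two result-table callbacks and the manager_count dispatch above, here is a condensed, illustrative Python sketch. The names and the return convention are hypothetical; the real code issues task_table_test_and_update calls from the C callbacks.

```python
TASK_STATUS_DONE = 16
TASK_STATUS_LOST = 32
TASK_STATUS_RECONSTRUCTING = 64

def reconstruction_plan(manager_count, creating_task_id):
    """Return (test_bitmask, new_state) for the test-and-update, or None to
    suppress reconstruction."""
    if manager_count == 0:
        # Object was created and later evicted: reconstruct only if no other
        # instance of the task is pending (state must be DONE or LOST).
        return (TASK_STATUS_DONE | TASK_STATUS_LOST, TASK_STATUS_RECONSTRUCTING)
    if manager_count == -1:
        if creating_task_id is None:
            # The result table entry may not have propagated yet: log a
            # warning and skip reconstruction instead of crashing.
            print("WARNING: no task information found for object during "
                  "reconstruction (no object entry yet)")
            return None
        # Object was never created: reconstruct only if the task was LOST.
        return (TASK_STATUS_LOST, TASK_STATUS_RECONSTRUCTING)
    # The object is available on at least one node: nothing to do.
    return None

print(reconstruction_plan(0, b"task_id"))  # Evicted object.
print(reconstruction_plan(-1, None))       # Result table entry not found yet.
```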
@@ -541,6 +600,17 @@ void process_message(event_loop *loop,
switch (type) {
case SUBMIT_TASK: {
task_spec *spec = (task_spec *) utarray_front(state->input_buffer);
/* Update the result table, which holds mappings of object ID -> ID of the
* task that created it. */
if (state->db != NULL) {
task_id task_id = task_spec_id(spec);
for (int64_t i = 0; i < task_num_returns(spec); ++i) {
object_id return_id = task_return(spec, i);
result_table_add(state->db, return_id, task_id, NULL, NULL, NULL);
}
}
/* Handle the task submission. */
if (actor_ids_equal(task_spec_actor_id(spec), NIL_ACTOR_ID)) {
handle_task_submitted(state, state->algorithm_state, spec);
} else {


@@ -7,6 +7,10 @@
/* The duration between local scheduler heartbeats. */
#define LOCAL_SCHEDULER_HEARTBEAT_TIMEOUT_MILLISECONDS 100
/* The duration that we wait after sending a worker SIGTERM before sending the
* worker SIGKILL. */
#define KILL_WORKER_TIMEOUT_MILLISECONDS 100
#define DEFAULT_NUM_CPUS INT16_MAX
#define DEFAULT_NUM_GPUS 0
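For illustration, the escalation that this timeout controls, SIGTERM first and SIGKILL if the worker has not exited within KILL_WORKER_TIMEOUT_MILLISECONDS, can be sketched in Python. This is only a sketch: the local scheduler schedules force_kill_worker on an event-loop timer rather than blocking in a wait.

```python
import signal
import subprocess

KILL_WORKER_TIMEOUT_MILLISECONDS = 100  # Same value as the define above.

def kill_worker(proc):
    """Ask the worker to exit cleanly, then force-kill it after the timeout."""
    proc.send_signal(signal.SIGTERM)  # Give the worker a chance to clean up.
    try:
        proc.wait(timeout=KILL_WORKER_TIMEOUT_MILLISECONDS / 1000.0)
    except subprocess.TimeoutExpired:
        proc.kill()  # SIGKILL, the equivalent of force_kill_worker.
        proc.wait()

if __name__ == "__main__":
    worker = subprocess.Popen(["sleep", "60"])  # Stand-in worker (Unix).
    kill_worker(worker)
    print("worker exited with", worker.returncode)
```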


@@ -50,19 +50,12 @@ typedef struct {
photon_conn **conns;
} photon_mock;
photon_mock *init_photon_mock(bool connect_to_redis,
int num_workers,
int num_mock_workers) {
photon_mock *init_photon_mock(int num_workers, int num_mock_workers) {
const char *node_ip_address = "127.0.0.1";
const char *redis_addr = NULL;
int redis_port = -1;
const char *redis_addr = node_ip_address;
int redis_port = 6379;
const double static_resource_conf[MAX_RESOURCE_INDEX] = {DEFAULT_NUM_CPUS,
DEFAULT_NUM_GPUS};
if (connect_to_redis) {
redis_addr = node_ip_address;
redis_port = 6379;
}
photon_mock *mock = malloc(sizeof(photon_mock));
memset(mock, 0, sizeof(photon_mock));
mock->loop = event_loop_create();
@@ -114,10 +107,25 @@ photon_mock *init_photon_mock(bool connect_to_redis,
}
void destroy_photon_mock(photon_mock *mock) {
/* Disconnect clients. */
for (int i = 0; i < mock->num_photon_conns; ++i) {
photon_disconnect(mock->conns[i]);
}
free(mock->conns);
/* Kill all the workers and run the event loop again so that the task table
* updates propagate and the tasks in progress are freed. */
local_scheduler_client **worker = (local_scheduler_client **) utarray_eltptr(
mock->photon_state->workers, 0);
while (worker != NULL) {
kill_worker(*worker, true);
worker = (local_scheduler_client **) utarray_eltptr(
mock->photon_state->workers, 0);
}
event_loop_add_timer(mock->loop, 500,
(event_loop_timer_handler) timeout_handler, NULL);
event_loop_run(mock->loop);
/* This also frees mock->loop. */
free_local_scheduler(mock->photon_state);
close(mock->plasma_store_fd);
@@ -138,7 +146,7 @@ void reset_worker(photon_mock *mock, local_scheduler_client *worker) {
* value, the task should get assigned to a worker again.
*/
TEST object_reconstruction_test(void) {
photon_mock *photon = init_photon_mock(true, 0, 1);
photon_mock *photon = init_photon_mock(0, 1);
photon_conn *worker = photon->conns[0];
/* Create a task with zero dependencies and one return value. */
@@ -207,7 +215,7 @@ TEST object_reconstruction_test(void) {
* should trigger reconstruction of all previous tasks in the lineage.
*/
TEST object_reconstruction_recursive_test(void) {
photon_mock *photon = init_photon_mock(true, 0, 1);
photon_mock *photon = init_photon_mock(0, 1);
photon_conn *worker = photon->conns[0];
/* Create a chain of tasks, each one dependent on the one before it. Mark
* each object as available so that tasks will run immediately. */
@@ -316,7 +324,7 @@ void object_reconstruction_suppression_callback(object_id object_id,
}
TEST object_reconstruction_suppression_test(void) {
photon_mock *photon = init_photon_mock(true, 0, 1);
photon_mock *photon = init_photon_mock(0, 1);
photon_conn *worker = photon->conns[0];
object_reconstruction_suppression_spec = example_task_spec(0, 1);
@@ -365,7 +373,7 @@ TEST object_reconstruction_suppression_test(void) {
}
TEST task_dependency_test(void) {
photon_mock *photon = init_photon_mock(false, 0, 1);
photon_mock *photon = init_photon_mock(0, 1);
local_scheduler_state *state = photon->photon_state;
scheduling_algorithm_state *algorithm_state = state->algorithm_state;
/* Get the first worker. */
@@ -440,7 +448,7 @@ TEST task_dependency_test(void) {
}
TEST task_multi_dependency_test(void) {
photon_mock *photon = init_photon_mock(false, 0, 1);
photon_mock *photon = init_photon_mock(0, 1);
local_scheduler_state *state = photon->photon_state;
scheduling_algorithm_state *algorithm_state = state->algorithm_state;
/* Get the first worker. */
@@ -516,7 +524,7 @@ TEST task_multi_dependency_test(void) {
TEST start_kill_workers_test(void) {
/* Start some workers. */
int num_workers = 4;
photon_mock *photon = init_photon_mock(true, num_workers, 0);
photon_mock *photon = init_photon_mock(num_workers, 0);
/* We start off with num_workers children processes, but no workers
* registered yet. */
ASSERT_EQ(utarray_len(photon->photon_state->child_pids), num_workers);
@@ -548,7 +556,7 @@ TEST start_kill_workers_test(void) {
/* After killing a worker, its state is cleaned up. */
local_scheduler_client *worker = *(local_scheduler_client **) utarray_eltptr(
photon->photon_state->workers, 0);
kill_worker(worker, true);
kill_worker(worker, false);
ASSERT_EQ(utarray_len(photon->photon_state->child_pids), 0);
ASSERT_EQ(utarray_len(photon->photon_state->workers), num_workers - 1);
@@ -581,9 +589,9 @@ SUITE(photon_tests) {
RUN_REDIS_TEST(object_reconstruction_test);
RUN_REDIS_TEST(object_reconstruction_recursive_test);
RUN_REDIS_TEST(object_reconstruction_suppression_test);
RUN_TEST(task_dependency_test);
RUN_TEST(task_multi_dependency_test);
RUN_TEST(start_kill_workers_test);
RUN_REDIS_TEST(task_dependency_test);
RUN_REDIS_TEST(task_multi_dependency_test);
RUN_REDIS_TEST(start_kill_workers_test);
}
GREATEST_MAIN_DEFS();


@@ -70,5 +70,38 @@ class ComponentFailureTest(unittest.TestCase):
self.assertTrue(ray.services.all_processes_alive(exclude=[ray.services.PROCESS_TYPE_WORKER]))
ray.worker.cleanup()
def _testWorkerFailed(self, num_local_schedulers):
@ray.remote
def f(x):
time.sleep(0.5)
return x
num_initial_workers = 4
ray.worker._init(num_workers=num_initial_workers * num_local_schedulers,
num_local_schedulers=num_local_schedulers,
start_workers_from_local_scheduler=False,
start_ray_local=True,
num_cpus=[num_initial_workers] * num_local_schedulers)
# Submit more tasks than there are workers so that all workers and cores
# are utilized.
object_ids = [f.remote(i) for i in range(num_initial_workers * num_local_schedulers)]
object_ids += [f.remote(object_id) for object_id in object_ids]
# Allow the tasks some time to begin executing.
time.sleep(0.1)
# Kill the workers as the tasks execute.
for worker in ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER]:
worker.terminate()
time.sleep(0.1)
# Make sure that we can still get the objects after the executing tasks died.
ray.get(object_ids)
ray.worker.cleanup()
def testWorkerFailed(self):
self._testWorkerFailed(1)
def testWorkerFailedMultinode(self):
self._testWorkerFailed(4)
if __name__ == "__main__":
unittest.main(verbosity=2)