Mirror of https://github.com/vale981/ray, synced 2025-03-06 02:21:39 -05:00
Dynamically grow worker pool to partially solve hanging workloads (#286)
* First pass at a policy to solve deadlock
* Address Robert's comments
* stress test
* unit test
* Fix test cases
* Fix test for python3
* add more logging
* White space.
Parent: 0bbf08a4ac
Commit: a0dd3a44c0
11 changed files with 393 additions and 38 deletions
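The hanging workload this change targets is easiest to reproduce from the driver side. The following sketch uses the Python API of this commit's era (ray.init(num_workers=...), ray.worker.cleanup()) and is adapted from the testBlockingTasks case added below; before this change, a fixed-size pool could end up with every worker blocked inside ray.get, leaving no worker free to run the nested tasks. The new policy lets the local scheduler fork an extra worker when tasks are dispatchable but the pool is empty.

    import ray

    ray.init(num_workers=1)

    @ray.remote
    def f(i, j):
      return (i, j)

    @ray.remote
    def g(i):
      # Each g blocks inside ray.get on ten nested f tasks. With a fixed pool,
      # every worker can end up blocked here, with nothing left to run f.
      return ray.get([f.remote(i, j) for j in range(10)])

    ray.get([g.remote(i) for i in range(100)])
    ray.worker.cleanup()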
@@ -471,6 +471,7 @@ class Worker(object):
# their original index in the object_ids argument.
unready_ids = dict((object_id, i) for (i, (object_id, val)) in
enumerate(final_results) if val is None)
was_blocked = (len(unready_ids) > 0)
# Try reconstructing any objects we haven't gotten yet. Try to get them
# until GET_TIMEOUT_MILLISECONDS milliseconds passes, then repeat.
while len(unready_ids) > 0:

@@ -487,6 +488,11 @@ class Worker(object):
final_results[index] = (object_id, val)
unready_ids.pop(object_id)

# If there were objects that we weren't able to get locally, let the local
# scheduler know that we're now unblocked.
if was_blocked:
self.photon_client.notify_unblocked()

# Unwrap the object from the list (it was wrapped put_object).
assert len(final_results) == len(object_ids)
for i in range(len(final_results)):
@@ -29,7 +29,10 @@ enum photon_message_type {
EVENT_LOG_MESSAGE,
/** Send an initial connection message to the local scheduler.
* This contains the worker's process ID and actor ID. */
REGISTER_WORKER_INFO
REGISTER_WORKER_INFO,
/** For a worker that was blocked on some object(s), tell the local scheduler
* that the worker is now unblocked. */
NOTIFY_UNBLOCKED,
};

/* These are needed to define the UT_arrays. */

@@ -112,6 +115,9 @@ typedef struct {
* no task is running on the worker, this will be NULL. This is used to
* update the task table. */
task *task_in_progress;
/** A flag to indicate whether this worker is currently blocking on an
* object(s) that isn't available locally yet. */
bool is_blocked;
/** The process ID of the client. If this is set to zero, the client has not
* yet registered a process ID. */
pid_t pid;
@@ -83,9 +83,19 @@ struct scheduling_algorithm_state {
* about a new local scheduler arrives, we will resubmit all of these tasks
* locally. */
UT_array *cached_submitted_actor_tasks;
/** An array of worker indices corresponding to clients that are
* waiting for tasks. */
/** An array of pointers to workers in the worker pool. These are workers
* that have registered a PID with us and that are now waiting to be
* assigned a task to execute. */
UT_array *available_workers;
/** An array of pointers to workers that are currently executing a task,
* unblocked. These are the workers that are leasing some number of
* resources. */
UT_array *executing_workers;
/** An array of pointers to workers that are currently executing a task,
* blocked on some object(s) that isn't available locally yet. These are the
* workers that are executing a task, but that have temporarily returned the
* task's required resources. */
UT_array *blocked_workers;
/** A hash map of the objects that are available in the local Plasma store.
* The key is the object ID. This information could be a little stale. */
object_entry *local_objects;

@@ -107,9 +117,13 @@ scheduling_algorithm_state *make_scheduling_algorithm_state(void) {
/* Initialize the local data structures used for queuing tasks and workers. */
algorithm_state->waiting_task_queue = NULL;
algorithm_state->dispatch_task_queue = NULL;
utarray_new(algorithm_state->available_workers, &worker_icd);

utarray_new(algorithm_state->cached_submitted_actor_tasks, &task_spec_icd);
algorithm_state->local_actor_infos = NULL;

utarray_new(algorithm_state->available_workers, &worker_icd);
utarray_new(algorithm_state->executing_workers, &worker_icd);
utarray_new(algorithm_state->blocked_workers, &worker_icd);
return algorithm_state;
}

@@ -146,6 +160,8 @@ void free_scheduling_algorithm_state(
utarray_free(algorithm_state->cached_submitted_actor_tasks);
/* Free the list of available workers. */
utarray_free(algorithm_state->available_workers);
utarray_free(algorithm_state->executing_workers);
utarray_free(algorithm_state->blocked_workers);
/* Free the cached information about which objects are present locally. */
object_entry *obj_entry, *tmp_obj_entry;
HASH_ITER(hh, algorithm_state->local_objects, obj_entry, tmp_obj_entry) {

@@ -556,10 +572,6 @@ void dispatch_tasks(local_scheduler_state *state,

/* Assign as many tasks as we can, while there are workers available. */
DL_FOREACH_SAFE(algorithm_state->dispatch_task_queue, elt, tmp) {
if (utarray_len(algorithm_state->available_workers) <= 0) {
/* There are no more available workers, so we're done. */
break;
}
/* TODO(atumanov): as an optimization, we can also check if all dynamic
* capacity is zero and bail early. */
bool task_satisfied = true;

@@ -574,6 +586,19 @@ void dispatch_tasks(local_scheduler_state *state,
if (!task_satisfied) {
continue; /* Proceed to the next task. */
}

/* If there is a task to assign, but there are no more available workers in
* the worker pool, then exit. Ensure that there will be an available
* worker during a future invocation of dispatch_tasks. */
if (utarray_len(algorithm_state->available_workers) == 0) {
if (utarray_len(state->child_pids) == 0) {
/* If there are no workers, including those pending PID registration,
* then we must start a new one to replenish the worker pool. */
start_worker(state, NIL_ACTOR_ID);
}
break;
}

/* Dispatch this task to an available worker and dequeue the task. */
LOG_DEBUG("Dispatching task");
/* Get the last available worker in the available worker queue. */
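The heart of the new policy is the check added to dispatch_tasks above: when a task is ready to dispatch but the worker pool is empty, start at most one new worker, and only if no previously forked child is still waiting to register its PID. A toy Python model of that decision (illustrative names, not the C identifiers):

    def should_start_worker(available_workers, child_pids):
      # Mirrors the check added to dispatch_tasks: grow the pool only when no
      # worker is idle and no forked child is still pending PID registration.
      return len(available_workers) == 0 and len(child_pids) == 0

    # One task ready, empty pool, nothing pending: start a worker.
    assert should_start_worker([], []) is True
    # A forked child that has not registered yet counts as capacity on the way.
    assert should_start_worker([], [1234]) is False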
@@ -581,10 +606,12 @@ void dispatch_tasks(local_scheduler_state *state,
algorithm_state->available_workers);
/* Tell the available worker to execute the task. */
assign_task_to_worker(state, elt->spec, *worker);
/* Remove the available worker from the queue and free the struct. */
/* Remove the worker from the available queue, and add it to the executing
* workers. */
utarray_pop_back(algorithm_state->available_workers);
utarray_push_back(algorithm_state->executing_workers, worker);
/* Dequeue the task and free the struct. */
print_resource_info(state, elt->spec);
/* Deque the task. */
DL_DELETE(algorithm_state->dispatch_task_queue, elt);
free_task_spec(elt->spec);
free(elt);

@@ -923,12 +950,37 @@ void handle_worker_available(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
local_scheduler_client *worker) {
CHECK(worker->task_in_progress == NULL);
/* Check that the worker isn't in the pool of available workers. */
for (local_scheduler_client **p = (local_scheduler_client **) utarray_front(
algorithm_state->available_workers);
p != NULL; p = (local_scheduler_client **) utarray_next(
algorithm_state->available_workers, p)) {
DCHECK(*p != worker);
}
/* Check that the worker isn't in the list of blocked workers. */
for (local_scheduler_client **p = (local_scheduler_client **) utarray_front(
algorithm_state->blocked_workers);
p != NULL; p = (local_scheduler_client **) utarray_next(
algorithm_state->blocked_workers, p)) {
DCHECK(*p != worker);
}
/* If the worker was executing a task, it must have finished, so remove it
* from the list of executing workers. */
for (int i = 0; i < utarray_len(algorithm_state->executing_workers); ++i) {
local_scheduler_client **p = (local_scheduler_client **) utarray_eltptr(
algorithm_state->executing_workers, i);
if (*p == worker) {
utarray_erase(algorithm_state->executing_workers, i, 1);
break;
}
}
/* Check that we actually erased the worker. */
for (int i = 0; i < utarray_len(algorithm_state->executing_workers); ++i) {
local_scheduler_client **p = (local_scheduler_client **) utarray_eltptr(
algorithm_state->executing_workers, i);
DCHECK(*p != worker);
}

/* Add worker to the list of available workers. */
utarray_push_back(algorithm_state->available_workers, &worker);

@@ -954,6 +1006,88 @@ void handle_actor_worker_available(local_scheduler_state *state,
dispatch_actor_task(state, algorithm_state, actor_id);
}

void handle_worker_blocked(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
local_scheduler_client *worker) {
/* Find the worker in the list of executing workers. */
for (int i = 0; i < utarray_len(algorithm_state->executing_workers); ++i) {
local_scheduler_client **p = (local_scheduler_client **) utarray_eltptr(
algorithm_state->executing_workers, i);
if (*p == worker) {
/* Remove the worker from the list of executing workers. */
utarray_erase(algorithm_state->executing_workers, i, 1);

/* Check that the worker isn't in the list of blocked workers. */
for (local_scheduler_client **q =
(local_scheduler_client **) utarray_front(
algorithm_state->blocked_workers);
q != NULL; q = (local_scheduler_client **) utarray_next(
algorithm_state->blocked_workers, q)) {
DCHECK(*q != worker);
}

/* Return the resources that the blocked worker was using. */
CHECK(worker->task_in_progress != NULL);
task_spec *spec = task_task_spec(worker->task_in_progress);
update_dynamic_resources(state, spec, true);
/* Add the worker to the list of blocked workers. */
worker->is_blocked = true;
utarray_push_back(algorithm_state->blocked_workers, &worker);

return;
}
}

/* The worker should have been in the list of executing workers, so this line
* should be unreachable. */
LOG_FATAL(
"Worker registered as blocked, but it was not in the list of executing "
"workers.");
}

void handle_worker_unblocked(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
local_scheduler_client *worker) {
/* Find the worker in the list of blocked workers. */
for (int i = 0; i < utarray_len(algorithm_state->blocked_workers); ++i) {
local_scheduler_client **p = (local_scheduler_client **) utarray_eltptr(
algorithm_state->blocked_workers, i);
if (*p == worker) {
/* Remove the worker from the list of blocked workers. */
utarray_erase(algorithm_state->blocked_workers, i, 1);

/* Check that the worker isn't in the list of executing workers. */
for (local_scheduler_client **q =
(local_scheduler_client **) utarray_front(
algorithm_state->executing_workers);
q != NULL; q = (local_scheduler_client **) utarray_next(
algorithm_state->executing_workers, q)) {
DCHECK(*q != worker);
}

/* Lease back the resources that the blocked worker will need. */
/* TODO(swang): Leasing back the resources to blocked workers can cause
* us to transiently exceed the maximum number of resources. This can be
* fixed by having blocked workers explicitly yield and wait to be given
* back resources before continuing execution. */
CHECK(worker->task_in_progress != NULL);
task_spec *spec = task_task_spec(worker->task_in_progress);
update_dynamic_resources(state, spec, false);
/* Add the worker to the list of executing workers. */
worker->is_blocked = false;
utarray_push_back(algorithm_state->executing_workers, &worker);

return;
}
}

/* The worker should have been in the list of blocked workers, so this line
* should be unreachable. */
LOG_FATAL(
"Worker registered as unblocked, but it was not in the list of blocked "
"workers.");
}

void handle_object_available(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
object_id object_id) {

@@ -1064,3 +1198,11 @@ int num_dispatch_tasks(scheduling_algorithm_state *algorithm_state) {
DL_COUNT(algorithm_state->dispatch_task_queue, elt, count);
return count;
}

void print_worker_info(const char *message,
scheduling_algorithm_state *algorithm_state) {
LOG_DEBUG("%s: %d available, %d executing, %d blocked", message,
utarray_len(algorithm_state->available_workers),
utarray_len(algorithm_state->executing_workers),
utarray_len(algorithm_state->blocked_workers));
}
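The handlers above move a worker between the three lists introduced at the top of this file: available (idle), executing (holding leased resources), and blocked (task in progress but waiting on a remote object), returning the task's resources on block and re-leasing them on unblock. A resource-free Python sketch of just the list transitions (illustrative names, not the C structures):

    class WorkerPools(object):
      """Toy model of the three worker lists in scheduling_algorithm_state."""

      def __init__(self):
        self.available = []  # Registered with a PID, waiting for a task.
        self.executing = []  # Running a task and holding leased resources.
        self.blocked = []    # Running a task but waiting on a remote object.

      def assign(self, worker):
        # dispatch_tasks: pop from available, push onto executing.
        self.available.remove(worker)
        self.executing.append(worker)

      def block(self, worker):
        # handle_worker_blocked: the worker gives back its task's resources
        # while it waits, so other tasks can still be dispatched.
        self.executing.remove(worker)
        self.blocked.append(worker)

      def unblock(self, worker):
        # handle_worker_unblocked: re-lease the resources (this can transiently
        # oversubscribe, per the TODO in the diff) and resume executing.
        self.blocked.remove(worker)
        self.executing.append(worker)

      def finish(self, worker):
        # handle_worker_available: the task finished; rejoin the idle pool.
        self.executing.remove(worker)
        self.available.append(worker)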
@@ -139,7 +139,7 @@ void handle_object_available(local_scheduler_state *state,
void handle_object_removed(local_scheduler_state *state, object_id object_id);

/**
* This function is called when a new worker becomes available
* This function is called when a new worker becomes available.
*
* @param state The state of the local scheduler.
* @param algorithm_state State maintained by the scheduling algorithm.

@@ -189,6 +189,32 @@ void handle_actor_worker_disconnect(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
actor_id actor_id);

/**
* This function is called when a worker that was executing a task becomes
* blocked on an object that isn't available locally yet.
*
* @param state The state of the local scheduler.
* @param algorithm_state State maintained by the scheduling algorithm.
* @param worker The worker that is blocked.
* @return Void.
*/
void handle_worker_blocked(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
local_scheduler_client *worker);

/**
* This function is called when a worker that was blocked on an object that
* wasn't available locally yet becomes unblocked.
*
* @param state The state of the local scheduler.
* @param algorithm_state State maintained by the scheduling algorithm.
* @param worker The worker that is now unblocked.
* @return Void.
*/
void handle_worker_unblocked(local_scheduler_state *state,
scheduling_algorithm_state *algorithm_state,
local_scheduler_client *worker);

/**
* This function fetches queued task's missing object dependencies. It is
* called every LOCAL_SCHEDULER_FETCH_TIMEOUT_MILLISECONDS.

@@ -201,6 +227,17 @@ void handle_actor_worker_disconnect(local_scheduler_state *state,
*/
int fetch_object_timeout_handler(event_loop *loop, timer_id id, void *context);

/**
* A helper function to print debug information about the current state and
* number of workers.
*
* @param message A message to identify the log message.
* @param algorithm_state State maintained by the scheduling algorithm.
* @return Void.
*/
void print_worker_info(const char *message,
scheduling_algorithm_state *algorithm_state);

/** The following methods are for testing purposes only. */
#ifdef PHOTON_TEST
/**
@@ -76,3 +76,7 @@ void photon_reconstruct_object(photon_conn *conn, object_id object_id) {
void photon_log_message(photon_conn *conn) {
write_message(conn->conn, LOG_MESSAGE, 0, NULL);
}

void photon_notify_unblocked(photon_conn *conn) {
write_message(conn->conn, NOTIFY_UNBLOCKED, 0, NULL);
}

@@ -93,4 +93,12 @@ void photon_reconstruct_object(photon_conn *conn, object_id object_id);
*/
void photon_log_message(photon_conn *conn);

/**
* Notify the local scheduler that this client (worker) is no longer blocked.
*
* @param conn The connection information.
* @return Void.
*/
void photon_notify_unblocked(photon_conn *conn);

#endif

@@ -80,6 +80,11 @@ static PyObject *PyPhotonClient_log_event(PyObject *self, PyObject *args) {
Py_RETURN_NONE;
}

static PyObject *PyPhotonClient_notify_unblocked(PyObject *self) {
photon_notify_unblocked(((PyPhotonClient *) self)->photon_connection);
Py_RETURN_NONE;
}

static PyMethodDef PyPhotonClient_methods[] = {
{"submit", (PyCFunction) PyPhotonClient_submit, METH_VARARGS,
"Submit a task to the local scheduler."},

@@ -89,6 +94,8 @@ static PyMethodDef PyPhotonClient_methods[] = {
METH_VARARGS, "Ask the local scheduler to reconstruct an object."},
{"log_event", (PyCFunction) PyPhotonClient_log_event, METH_VARARGS,
"Log an event to the event log through the local scheduler."},
{"notify_unblocked", (PyCFunction) PyPhotonClient_notify_unblocked,
METH_NOARGS, "Notify the local scheduler that we are unblocked."},
{NULL} /* Sentinel */
};
@@ -68,6 +68,9 @@ void print_resource_info(const local_scheduler_state *state,
* @return Void.
*/
void kill_worker(local_scheduler_client *worker, bool wait) {
/* TODO(swang): This method should also propagate changes to other parts of
* the system to reflect the killed task in progress, if there was one. This
* includes updating dynamic resources and updating the task table. */
/* Erase the worker from the array of workers. */
local_scheduler_state *state = worker->local_scheduler_state;
int num_workers = utarray_len(state->workers);

@@ -96,6 +99,7 @@ void kill_worker(local_scheduler_client *worker, bool wait) {
/* Wait for the process to exit. */
waitpid(worker->pid, NULL, 0);
}
LOG_INFO("Killed worker with pid %d", worker->pid);
}

/* Clean up the client socket after killing the worker so that the worker

@@ -187,12 +191,15 @@ void free_local_scheduler(local_scheduler_state *state) {
*/
void start_worker(local_scheduler_state *state, actor_id actor_id) {
/* We can't start a worker if we don't have the path to the worker script. */
CHECK(state->config.start_worker_command != NULL);
if (state->config.start_worker_command == NULL) {
LOG_WARN("No valid command to start worker provided. Cannot start worker.");
return;
}
/* Launch the process to create the worker. */
pid_t pid = fork();
if (pid != 0) {
utarray_push_back(state->child_pids, &pid);
LOG_DEBUG("Started worker with pid %d", pid);
LOG_INFO("Started worker with pid %d", pid);
return;
}

@@ -279,6 +286,11 @@ local_scheduler_state *init_local_scheduler(
} else {
state->config.start_worker_command = NULL;
}
if (start_worker_command == NULL) {
LOG_WARN(
"No valid command to start a worker provided, local scheduler will not "
"start any workers.");
}
state->config.global_scheduler_exists = global_scheduler_exists;

state->loop = loop;

@@ -343,6 +355,29 @@ local_scheduler_state *init_local_scheduler(
return state;
}

void update_dynamic_resources(local_scheduler_state *state,
task_spec *spec,
bool return_resources) {
for (int i = 0; i < MAX_RESOURCE_INDEX; ++i) {
double resource = task_spec_get_required_resource(spec, i);
if (!return_resources) {
/* If we are not returning resources, we are leasing them, so we want to
* subtract the resource quantities from our accounting. */
resource *= -1;
}
/* Add or subtract the task's resources from our count. */
state->dynamic_resources[i] += resource;

if (!return_resources && state->dynamic_resources[i] < 0) {
/* We are using more resources than we have been allocated. */
LOG_WARN("photon dynamic resources dropped to %8.4f\t%8.4f\n",
state->dynamic_resources[0], state->dynamic_resources[1]);
}
CHECK(state->dynamic_resources[i] <= state->static_resources[i]);
}
print_resource_info(state, spec);
}

void assign_task_to_worker(local_scheduler_state *state,
task_spec *spec,
local_scheduler_client *worker) {
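update_dynamic_resources above replaces the two hand-rolled accounting loops (leasing resources when a task is assigned, returning them when it finishes or blocks) with a single helper keyed by return_resources. A rough Python equivalent, assuming the same two-element resource vector that the log message prints:

    def update_dynamic_resources(dynamic, static_limits, demand, return_resources):
      # Add the task's demand back when returning resources; subtract it when
      # leasing them out. Warn on over-lease, as the C code does.
      for i, amount in enumerate(demand):
        dynamic[i] += amount if return_resources else -amount
        if not return_resources and dynamic[i] < 0:
          print("photon dynamic resources dropped to", dynamic)
        assert dynamic[i] <= static_limits[i]
      return dynamic

    # Lease resources for a task needing 1 CPU and 0 GPUs, then return them.
    dynamic = update_dynamic_resources([2.0, 0.0], [2.0, 0.0], [1.0, 0.0], False)
    dynamic = update_dynamic_resources(dynamic, [2.0, 0.0], [1.0, 0.0], True)
    assert dynamic == [2.0, 0.0]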
@@ -362,13 +397,7 @@ void assign_task_to_worker(local_scheduler_state *state,

/* Resource accounting:
* Update dynamic resource vector in the local scheduler state. */
for (int i = 0; i < MAX_RESOURCE_INDEX; i++) {
state->dynamic_resources[i] -= task_spec_get_required_resource(spec, i);
CHECKM(state->dynamic_resources[i] >= 0,
"photon dynamic resources dropped to %8.4f\t%8.4f\n",
state->dynamic_resources[0], state->dynamic_resources[1]);
}
print_resource_info(state, spec);
update_dynamic_resources(state, spec, false);
task *task = alloc_task(spec, TASK_STATUS_RUNNING,
state->db ? get_db_client_id(state->db) : NIL_ID);
/* Record which task this worker is executing. This will be freed in

@@ -580,12 +609,7 @@ void process_message(event_loop *loop,
if (worker->task_in_progress != NULL) {
task_spec *spec = task_task_spec(worker->task_in_progress);
/* Return dynamic resources back for the task in progress. */
for (int i = 0; i < MAX_RESOURCE_INDEX; i++) {
state->dynamic_resources[i] += task_spec_get_required_resource(spec, i);
/* Sanity-check resource vector boundary conditions. */
CHECK(state->dynamic_resources[i] <= state->static_resources[i]);
}
print_resource_info(state, spec);
update_dynamic_resources(state, spec, true);
/* If we're connected to Redis, update tables. */
if (state->db != NULL) {
/* Update control state tables. */

@@ -608,6 +632,16 @@ void process_message(event_loop *loop,
}
} break;
case RECONSTRUCT_OBJECT: {
if (worker->task_in_progress != NULL && !worker->is_blocked) {
/* TODO(swang): For now, we don't handle blocked actors. */
if (actor_ids_equal(worker->actor_id, NIL_ACTOR_ID)) {
/* If the worker was executing a task (i.e. non-driver) and it wasn't
* already blocked on an object that's not locally available, update its
* state to blocked. */
handle_worker_blocked(state, state->algorithm_state, worker);
print_worker_info("Reconstructing", state->algorithm_state);
}
}
object_id *obj_id = (object_id *) utarray_front(state->input_buffer);
reconstruct_object(state, *obj_id);
} break;

@@ -622,6 +656,18 @@ void process_message(event_loop *loop,
} break;
case LOG_MESSAGE: {
} break;
case NOTIFY_UNBLOCKED: {
if (worker->task_in_progress != NULL) {
/* TODO(swang): For now, we don't handle blocked actors. */
if (actor_ids_equal(worker->actor_id, NIL_ACTOR_ID)) {
/* If the worker was executing a task (i.e. non-driver), update its
* state to not blocked. */
CHECK(worker->is_blocked);
handle_worker_unblocked(state, state->algorithm_state, worker);
}
}
print_worker_info("Worker unblocked", state->algorithm_state);
} break;
default:
/* This code should be unreachable. */
CHECK(0);

@@ -639,6 +685,7 @@ void new_client_connection(event_loop *loop,
local_scheduler_client *worker = malloc(sizeof(local_scheduler_client));
worker->sock = new_socket;
worker->task_in_progress = NULL;
worker->is_blocked = false;
worker->pid = 0;
worker->is_child = false;
worker->actor_id = NIL_ACTOR_ID;
@@ -67,6 +67,44 @@ void reconstruct_object(local_scheduler_state *state, object_id object_id);

void print_resource_info(const local_scheduler_state *s, const task_spec *spec);

/**
* Kill a worker.
*
* @param worker The local scheduler client to kill.
* @param wait A boolean representing whether to wait for the killed worker to
* exit.
* @param Void.
*/
void kill_worker(local_scheduler_client *worker, bool wait);

/**
* Start a worker. This forks a new worker process that can be added to the
* pool of available workers, pending registration of its PID with the local
* scheduler.
*
* @param state The local scheduler state.
* @param actor_id The ID of the actor for this worker. If this worker is not an
* actor, then NIL_ACTOR_ID should be used.
* @param Void.
*/
void start_worker(local_scheduler_state *state, actor_id actor_id);

/**
* Update our accounting for the current resources being used, according to
* some task that is starting or finishing execution.
*
* @param state The local scheduler state.
* @param spec The specification for the task that is or was using resources.
* @param return_resources A boolean representing whether the task is starting
* or finishing execution. If true, then the task is finishing execution
* (possibly temporarily), so it will add to the dynamic resources
* available. Else, it will take from the dynamic resources available.
* @return Void.
*/
void update_dynamic_resources(local_scheduler_state *state,
task_spec *spec,
bool return_resources);

/** The following methods are for testing purposes only. */
#ifdef PHOTON_TEST
local_scheduler_state *init_local_scheduler(

@@ -92,17 +130,6 @@ void process_message(event_loop *loop,
void *context,
int events);

void kill_worker(local_scheduler_client *worker, bool wait);

/**
* Start a new worker by forking.
*
* @param state The local scheduler state.
* @param actor_id The ID of the actor for this worker. If this worker is not an
* actor, then NIL_ACTOR_ID should be used.
* @return Void.
*/
void start_worker(local_scheduler_state *state, actor_id actor_id);
#endif

#endif /* PHOTON_SCHEDULER_H */
@@ -367,7 +367,7 @@ class APITest(unittest.TestCase):
ray.worker.cleanup()

def testWait(self):
ray.init(num_workers=1)
ray.init(num_workers=1, num_cpus=1)

@ray.remote
def f(delay):

@@ -1115,6 +1115,55 @@ class ResourcesTest(unittest.TestCase):

ray.worker.cleanup()

class WorkerPoolTests(unittest.TestCase):

def tearDown(self):
ray.worker.cleanup()

def testNoWorkers(self):
ray.init(num_workers=0)

@ray.remote
def f():
return 1

# Make sure we can call a remote function. This will require starting a new
# worker.
ray.get(f.remote())

ray.get([f.remote() for _ in range(100)])

def testBlockingTasks(self):
ray.init(num_workers=1)

@ray.remote
def f(i, j):
return (i, j)

@ray.remote
def g(i):
# Each instance of g submits and blocks on the result of another remote
# task.
object_ids = [f.remote(i, j) for j in range(10)]
return ray.get(object_ids)

ray.get([g.remote(i) for i in range(100)])

@ray.remote
def _sleep(i):
time.sleep(1)
return (i)

@ray.remote
def sleep():
# Each instance of sleep submits and blocks on the result of another
# remote task, which takes one second to execute.
ray.get([_sleep.remote(i) for i in range(10)])

ray.get(sleep.remote())

ray.worker.cleanup()

class SchedulingAlgorithm(unittest.TestCase):

def attempt_to_load_balance(self, remote_function, args, total_tasks,
@@ -300,5 +300,27 @@ class ReconstructionTestsMultinode(ReconstructionTests):
# one worker each.
num_local_schedulers = 4

# NOTE(swang): This test tries to launch 1000 workers and breaks.
#class WorkerPoolTests(unittest.TestCase):
#
# def tearDown(self):
# ray.worker.cleanup()
#
# def testBlockingTasks(self):
# @ray.remote
# def f(i, j):
# return (i, j)
#
# @ray.remote
# def g(i):
# # Each instance of g submits and blocks on the result of another remote
# # task.
# object_ids = [f.remote(i, j) for j in range(10)]
# return ray.get(object_ids)
#
# ray.init(num_workers=1)
# ray.get([g.remote(i) for i in range(1000)])
# ray.worker.cleanup()

if __name__ == "__main__":
unittest.main(verbosity=2)