[Object Spilling] Implement level triggered logic to make streaming shuffle work + additional cleanup (#12773)

SangBin Cho 2020-12-18 19:31:14 -08:00 committed by GitHub
parent 404161a3ff
commit 9d939e6674
41 changed files with 654 additions and 543 deletions

View file

@ -638,9 +638,11 @@ cdef c_vector[c_string] spill_objects_handler(
return return_urls
cdef void restore_spilled_objects_handler(
cdef int64_t restore_spilled_objects_handler(
const c_vector[CObjectID]& object_ids_to_restore,
const c_vector[c_string]& object_urls) nogil:
cdef:
int64_t bytes_restored = 0
with gil:
urls = []
size = object_urls.size()
@ -651,7 +653,8 @@ cdef void restore_spilled_objects_handler(
with ray.worker._changeproctitle(
ray_constants.WORKER_PROCESS_TYPE_RESTORE_WORKER,
ray_constants.WORKER_PROCESS_TYPE_RESTORE_WORKER_IDLE):
external_storage.restore_spilled_objects(object_refs, urls)
bytes_restored = external_storage.restore_spilled_objects(
object_refs, urls)
except Exception:
exception_str = (
"An unexpected internal error occurred while the IO worker "
@ -662,6 +665,7 @@ cdef void restore_spilled_objects_handler(
"restore_spilled_objects_error",
traceback.format_exc() + exception_str,
job_id=None)
return bytes_restored
cdef void delete_spilled_objects_handler(
@ -873,7 +877,8 @@ cdef class CoreWorker:
return self.plasma_event_handler
def get_objects(self, object_refs, TaskID current_task_id,
int64_t timeout_ms=-1, plasma_objects_only=False):
int64_t timeout_ms=-1,
plasma_objects_only=False):
cdef:
c_vector[shared_ptr[CRayObject]] results
CTaskID c_task_id = current_task_id.native()
@ -1573,17 +1578,6 @@ cdef class CoreWorker:
resource_name.encode("ascii"), capacity,
CNodeID.FromBinary(client_id.binary()))
def force_spill_objects(self, object_refs):
cdef c_vector[CObjectID] object_ids
object_ids = ObjectRefsToVector(object_refs)
assert not RayConfig.instance().automatic_object_deletion_enabled(), (
"Automatic object deletion is not supported for"
"force_spill_objects yet. Please set"
"automatic_object_deletion_enabled: False in Ray's system config.")
with nogil:
check_status(CCoreWorkerProcess.GetCoreWorker()
.SpillObjects(object_ids))
cdef void async_set_result(shared_ptr[CRayObject] obj,
CObjectID object_ref,
void *future) with gil:

View file

@ -1,6 +1,4 @@
from .dynamic_resources import set_resource
from .object_spilling import force_spill_objects
__all__ = [
"set_resource",
"force_spill_objects",
]

View file

@ -1,18 +0,0 @@
import ray
def force_spill_objects(object_refs):
"""Force spilling objects to external storage.
Args:
object_refs: Object refs of the objects to be
spilled.
"""
core_worker = ray.worker.global_worker.core_worker
# Make sure that the values are object refs.
for object_ref in object_refs:
if not isinstance(object_ref, ray.ObjectRef):
raise TypeError(
f"Attempting to call `force_spill_objects` on the "
f"value {object_ref}, which is not an ray.ObjectRef.")
return core_worker.force_spill_objects(object_refs)
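With `force_spill_objects` removed, spilling is driven automatically by the plasma store whenever it runs out of space. A minimal sketch of how a user relies on the new behavior; the config keys mirror the ones used in the tests in this commit, and the spill directory path is just an example:

import json
import numpy as np
import ray

# Sketch: rely on automatic spilling instead of ray.experimental.force_spill_objects.
ray.init(
    object_store_memory=75 * 1024 * 1024,
    _system_config={
        "automatic_object_spilling_enabled": True,
        "max_io_workers": 4,
        "min_spilling_size": 0,
        "object_spilling_config": json.dumps({
            "type": "filesystem",
            "params": {"directory_path": "/tmp/ray_spill"},
        }),
    })

# Putting more data than fits in the store now triggers spilling transparently;
# ray.get restores spilled objects on demand.
refs = [ray.put(np.random.rand(1024 * 1024)) for _ in range(25)]  # ~200 MB total
_ = ray.get(refs[0])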

View file

@ -157,12 +157,15 @@ class ExternalStorage(metaclass=abc.ABCMeta):
@abc.abstractmethod
def restore_spilled_objects(self, object_refs: List[ObjectRef],
url_with_offset_list: List[str]):
url_with_offset_list: List[str]) -> int:
"""Restore objects from the external storage.
Args:
object_refs: List of object IDs (note that it is not ref).
url_with_offset_list: List of url_with_offset.
Returns:
The total number of bytes restored.
"""
@abc.abstractmethod
@ -215,6 +218,7 @@ class FileSystemStorage(ExternalStorage):
def restore_spilled_objects(self, object_refs: List[ObjectRef],
url_with_offset_list: List[str]):
total = 0
for i in range(len(object_refs)):
object_ref = object_refs[i]
url_with_offset = url_with_offset_list[i].decode()
@ -228,9 +232,11 @@ class FileSystemStorage(ExternalStorage):
metadata_len = int.from_bytes(f.read(8), byteorder="little")
buf_len = int.from_bytes(f.read(8), byteorder="little")
self._size_check(metadata_len, buf_len, parsed_result.size)
total += buf_len
metadata = f.read(metadata_len)
# read remaining data to our buffer
self._put_object_to_store(metadata, buf_len, f, object_ref)
return total
def delete_spilled_objects(self, urls: List[str]):
for url in urls:
@ -297,6 +303,7 @@ class ExternalStorageSmartOpenImpl(ExternalStorage):
def restore_spilled_objects(self, object_refs: List[ObjectRef],
url_with_offset_list: List[str]):
from smart_open import open
total = 0
for i in range(len(object_refs)):
object_ref = object_refs[i]
url_with_offset = url_with_offset_list[i].decode()
@ -315,9 +322,11 @@ class ExternalStorageSmartOpenImpl(ExternalStorage):
metadata_len = int.from_bytes(f.read(8), byteorder="little")
buf_len = int.from_bytes(f.read(8), byteorder="little")
self._size_check(metadata_len, buf_len, parsed_result.size)
total += buf_len
metadata = f.read(metadata_len)
# read remaining data to our buffer
self._put_object_to_store(metadata, buf_len, f, object_ref)
return total
def delete_spilled_objects(self, urls: List[str]):
pass
@ -367,8 +376,8 @@ def restore_spilled_objects(object_refs: List[ObjectRef],
object_refs: List of object IDs (note that it is not ref).
url_with_offset_list: List of url_with_offset.
"""
_external_storage.restore_spilled_objects(object_refs,
url_with_offset_list)
return _external_storage.restore_spilled_objects(object_refs,
url_with_offset_list)
def delete_spilled_objects(urls: List[str]):
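`restore_spilled_objects` now reports how many bytes it brought back so the raylet can track restore throughput. A minimal sketch of the restore loop the file-system implementations run (seek to the parsed offset, read an 8-byte little-endian metadata length, an 8-byte little-endian buffer length, then the metadata and the payload); `put_to_store` and the (path, offset) pairs are hypothetical stand-ins for `_put_object_to_store` and the parsed url_with_offset:

from typing import Callable, List, Tuple

def restore_from_files(paths_with_offsets: List[Tuple[str, int]],
                       put_to_store: Callable[[bytes, bytes], None]) -> int:
    """Sketch of a restore loop that returns the total payload bytes restored."""
    total = 0
    for path, offset in paths_with_offsets:
        with open(path, "rb") as f:
            f.seek(offset)
            metadata_len = int.from_bytes(f.read(8), byteorder="little")
            buf_len = int.from_bytes(f.read(8), byteorder="little")
            metadata = f.read(metadata_len)
            payload = f.read(buf_len)
            put_to_store(metadata, payload)
            total += buf_len  # only payload bytes are counted, matching the diff
    return total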

View file

@ -233,7 +233,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
(CRayStatus() nogil) check_signals
(void() nogil) gc_collect
(c_vector[c_string](const c_vector[CObjectID] &) nogil) spill_objects
(void(
(int64_t(
const c_vector[CObjectID] &,
const c_vector[c_string] &) nogil) restore_spilled_objects
(void(

View file

@ -23,7 +23,7 @@ def get_default_fixure_system_config():
"object_timeout_milliseconds": 200,
"num_heartbeats_timeout": 10,
"object_store_full_max_retries": 3,
"object_store_full_initial_delay_ms": 100,
"object_store_full_delay_ms": 100,
}
return system_config

View file

@ -4,11 +4,9 @@ import os
import random
import platform
import sys
import time
import numpy as np
import pytest
import psutil
import ray
from ray.external_storage import (create_url_with_offset,
parse_url_with_offset)
@ -43,57 +41,6 @@ def object_spilling_config(request, tmpdir):
yield json.dumps(request.param)
@pytest.mark.skip("This test is for local benchmark.")
def test_sample_benchmark(object_spilling_config, shutdown_only):
# --Config values--
max_io_workers = 10
object_store_limit = 500 * 1024 * 1024
eight_mb = 1024 * 1024
object_size = 12 * eight_mb
spill_cnt = 50
# Limit our object store to 200 MiB of memory.
ray.init(
object_store_memory=object_store_limit,
_system_config={
"object_store_full_max_retries": 0,
"max_io_workers": max_io_workers,
"object_spilling_config": object_spilling_config,
"automatic_object_deletion_enabled": False,
})
arr = np.random.rand(object_size)
replay_buffer = []
pinned_objects = set()
# Create objects of more than 200 MiB.
spill_start = time.perf_counter()
for _ in range(spill_cnt):
ref = None
while ref is None:
try:
ref = ray.put(arr)
replay_buffer.append(ref)
pinned_objects.add(ref)
except ray.exceptions.ObjectStoreFullError:
ref_to_spill = pinned_objects.pop()
ray.experimental.force_spill_objects([ref_to_spill])
spill_end = time.perf_counter()
# Make sure to remove unpinned objects.
del pinned_objects
restore_start = time.perf_counter()
while replay_buffer:
ref = replay_buffer.pop()
sample = ray.get(ref) # noqa
restore_end = time.perf_counter()
print(f"Object spilling benchmark for the config {object_spilling_config}")
print(f"Spilling {spill_cnt} number of objects of size {object_size}B "
f"takes {spill_end - spill_start} seconds with {max_io_workers} "
"number of io workers.")
print(f"Getting all objects takes {restore_end - restore_start} seconds.")
def test_invalid_config_raises_exception(shutdown_only):
# Make sure ray.init raises an exception before
# it starts processes when invalid object spilling
@ -127,123 +74,38 @@ def test_url_generation_and_parse():
@pytest.mark.skipif(
platform.system() == "Windows", reason="Failing on Windows.")
def test_spill_objects_manually(object_spilling_config, shutdown_only):
def test_spilling_not_done_for_pinned_object(tmp_path, shutdown_only):
# Limit our object store to 75 MiB of memory.
temp_folder = tmp_path / "spill"
temp_folder.mkdir()
ray.init(
object_store_memory=75 * 1024 * 1024,
_system_config={
"object_store_full_max_retries": 0,
"automatic_object_spilling_enabled": False,
"max_io_workers": 4,
"object_spilling_config": object_spilling_config,
"automatic_object_spilling_enabled": True,
"object_store_full_max_retries": 4,
"object_store_full_delay_ms": 100,
"object_spilling_config": json.dumps({
"type": "filesystem",
"params": {
"directory_path": str(temp_folder)
}
}),
"min_spilling_size": 0,
"automatic_object_deletion_enabled": False,
})
arr = np.random.rand(1024 * 1024) # 8 MB data
replay_buffer = []
pinned_objects = set()
arr = np.random.rand(5 * 1024 * 1024) # 40 MB
ref = ray.get(ray.put(arr)) # noqa
# Since the ref exists, it should raise OOM.
with pytest.raises(ray.exceptions.ObjectStoreFullError):
ref2 = ray.put(arr) # noqa
# Create objects of more than 200 MiB.
for _ in range(25):
ref = None
while ref is None:
try:
ref = ray.put(arr)
replay_buffer.append(ref)
pinned_objects.add(ref)
except ray.exceptions.ObjectStoreFullError:
ref_to_spill = pinned_objects.pop()
ray.experimental.force_spill_objects([ref_to_spill])
def is_dir_empty():
num_files = 0
for path in temp_folder.iterdir():
num_files += 1
return num_files == 0
def is_worker(cmdline):
return cmdline and cmdline[0].startswith("ray::")
# Make sure io workers are spawned with proper name.
processes = [
x.cmdline()[0] for x in psutil.process_iter(attrs=["cmdline"])
if is_worker(x.info["cmdline"])
]
assert (
ray.ray_constants.WORKER_PROCESS_TYPE_SPILL_WORKER_IDLE in processes)
# Spill 2 more objects so we will always have enough space for
# restoring objects back.
refs_to_spill = (pinned_objects.pop(), pinned_objects.pop())
ray.experimental.force_spill_objects(refs_to_spill)
# randomly sample objects
for _ in range(100):
ref = random.choice(replay_buffer)
sample = ray.get(ref)
assert np.array_equal(sample, arr)
# Make sure io workers are spawned with proper name.
processes = [
x.cmdline()[0] for x in psutil.process_iter(attrs=["cmdline"])
if is_worker(x.info["cmdline"])
]
assert (
ray.ray_constants.WORKER_PROCESS_TYPE_RESTORE_WORKER_IDLE in processes)
@pytest.mark.skipif(
platform.system() == "Windows", reason="Failing on Windows.")
def test_spill_objects_manually_from_workers(object_spilling_config,
shutdown_only):
# Limit our object store to 100 MiB of memory.
ray.init(
object_store_memory=100 * 1024 * 1024,
_system_config={
"object_store_full_max_retries": 0,
"automatic_object_spilling_enabled": False,
"max_io_workers": 4,
"object_spilling_config": object_spilling_config,
"min_spilling_size": 0,
"automatic_object_deletion_enabled": False,
})
@ray.remote
def _worker():
arr = np.random.rand(1024 * 1024) # 8 MB data
ref = ray.put(arr)
ray.experimental.force_spill_objects([ref])
return ref
# Create objects of more than 200 MiB.
replay_buffer = [ray.get(_worker.remote()) for _ in range(25)]
values = {ref: np.copy(ray.get(ref)) for ref in replay_buffer}
# Randomly sample objects.
for _ in range(100):
ref = random.choice(replay_buffer)
sample = ray.get(ref)
assert np.array_equal(sample, values[ref])
@pytest.mark.skip(reason="Not implemented yet.")
def test_spill_objects_manually_with_workers(object_spilling_config,
shutdown_only):
# Limit our object store to 75 MiB of memory.
ray.init(
object_store_memory=100 * 1024 * 1024,
_system_config={
"object_store_full_max_retries": 0,
"automatic_object_spilling_enabled": False,
"max_io_workers": 4,
"object_spilling_config": object_spilling_config,
"min_spilling_size": 0,
"automatic_object_deletion_enabled": False,
})
arrays = [np.random.rand(100 * 1024) for _ in range(50)]
objects = [ray.put(arr) for arr in arrays]
@ray.remote
def _worker(object_refs):
ray.experimental.force_spill_objects(object_refs)
ray.get([_worker.remote([o]) for o in objects])
for restored, arr in zip(ray.get(objects), arrays):
assert np.array_equal(restored, arr)
wait_for_condition(is_dir_empty)
@pytest.mark.skipif(
@ -255,7 +117,7 @@ def test_spill_objects_manually_with_workers(object_spilling_config,
"_system_config": {
"automatic_object_spilling_enabled": True,
"object_store_full_max_retries": 4,
"object_store_full_initial_delay_ms": 100,
"object_store_full_delay_ms": 100,
"max_io_workers": 4,
"object_spilling_config": json.dumps({
"type": "filesystem",
@ -308,7 +170,7 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only):
"max_io_workers": 4,
"automatic_object_spilling_enabled": True,
"object_store_full_max_retries": 4,
"object_store_full_initial_delay_ms": 100,
"object_store_full_delay_ms": 100,
"object_spilling_config": object_spilling_config,
"min_spilling_size": 0
})
@ -344,7 +206,7 @@ def test_spill_during_get(object_spilling_config, shutdown_only):
object_store_memory=100 * 1024 * 1024,
_system_config={
"automatic_object_spilling_enabled": True,
"object_store_full_initial_delay_ms": 100,
"object_store_full_delay_ms": 100,
# NOTE(swang): Use infinite retries because the OOM timer can still
# get accidentally triggered when objects are released too slowly
# (see github.com/ray-project/ray/issues/12040).
@ -381,7 +243,7 @@ def test_spill_deadlock(object_spilling_config, shutdown_only):
"max_io_workers": 1,
"automatic_object_spilling_enabled": True,
"object_store_full_max_retries": 4,
"object_store_full_initial_delay_ms": 100,
"object_store_full_delay_ms": 100,
"object_spilling_config": object_spilling_config,
"min_spilling_size": 0,
})
@ -411,10 +273,11 @@ def test_delete_objects(tmp_path, shutdown_only):
ray.init(
object_store_memory=75 * 1024 * 1024,
_system_config={
"max_io_workers": 4,
"max_io_workers": 1,
"min_spilling_size": 0,
"automatic_object_spilling_enabled": True,
"object_store_full_max_retries": 4,
"object_store_full_initial_delay_ms": 100,
"object_store_full_delay_ms": 100,
"object_spilling_config": json.dumps({
"type": "filesystem",
"params": {
@ -454,9 +317,10 @@ def test_delete_objects_delete_while_creating(tmp_path, shutdown_only):
object_store_memory=75 * 1024 * 1024,
_system_config={
"max_io_workers": 4,
"min_spilling_size": 0,
"automatic_object_spilling_enabled": True,
"object_store_full_max_retries": 4,
"object_store_full_initial_delay_ms": 100,
"object_store_full_delay_ms": 100,
"object_spilling_config": json.dumps({
"type": "filesystem",
"params": {
@ -506,7 +370,7 @@ def test_delete_objects_on_worker_failure(tmp_path, shutdown_only):
"max_io_workers": 4,
"automatic_object_spilling_enabled": True,
"object_store_full_max_retries": 4,
"object_store_full_initial_delay_ms": 100,
"object_store_full_delay_ms": 100,
"object_spilling_config": json.dumps({
"type": "filesystem",
"params": {
@ -579,9 +443,10 @@ def test_delete_objects_multi_node(tmp_path, ray_start_cluster):
object_store_memory=75 * 1024 * 1024,
_system_config={
"max_io_workers": 2,
"min_spilling_size": 20 * 1024 * 1024,
"automatic_object_spilling_enabled": True,
"object_store_full_max_retries": 4,
"object_store_full_initial_delay_ms": 100,
"object_store_full_delay_ms": 100,
"object_spilling_config": json.dumps({
"type": "filesystem",
"params": {
@ -648,14 +513,14 @@ def test_fusion_objects(tmp_path, shutdown_only):
# Limit our object store to 75 MiB of memory.
temp_folder = tmp_path / "spill"
temp_folder.mkdir()
min_spilling_size = 30 * 1024 * 1024
min_spilling_size = 10 * 1024 * 1024
ray.init(
object_store_memory=75 * 1024 * 1024,
_system_config={
"max_io_workers": 4,
"max_io_workers": 3,
"automatic_object_spilling_enabled": True,
"object_store_full_max_retries": 4,
"object_store_full_initial_delay_ms": 100,
"object_store_full_delay_ms": 100,
"object_spilling_config": json.dumps({
"type": "filesystem",
"params": {

View file

@ -19,7 +19,6 @@ logger = logging.getLogger(__name__)
@pytest.fixture
def one_worker_100MiB(request):
config = {
"object_store_full_max_retries": 2,
"task_retry_delay_ms": 0,
}
yield ray.init(

View file

@ -243,10 +243,9 @@ RAY_CONFIG(int64_t, gcs_dump_debug_log_interval_minutes, 1)
/// Maximum number of times to retry putting an object when the plasma store is full.
/// Can be set to -1 to enable unlimited retries.
RAY_CONFIG(int32_t, object_store_full_max_retries, 5)
RAY_CONFIG(int32_t, object_store_full_max_retries, 1000)
/// Duration to sleep after failing to put an object in plasma because it is full.
/// This will be exponentially increased for each retry.
RAY_CONFIG(uint32_t, object_store_full_initial_delay_ms, 1000)
RAY_CONFIG(uint32_t, object_store_full_delay_ms, 10)
/// The amount of time to wait between logging plasma space usage debug messages.
RAY_CONFIG(uint64_t, object_store_usage_log_interval_s, 10 * 60)
@ -254,6 +253,9 @@ RAY_CONFIG(uint64_t, object_store_usage_log_interval_s, 10 * 60)
/// The amount of time between automatic local Python GC triggers.
RAY_CONFIG(uint64_t, local_gc_interval_s, 10 * 60)
/// The min amount of time between local GCs (whether auto or mem pressure triggered).
RAY_CONFIG(uint64_t, local_gc_min_interval_s, 10)
/// Duration to wait between retries for failed tasks.
RAY_CONFIG(uint32_t, task_retry_delay_ms, 5000)
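The exponentially growing `object_store_full_initial_delay_ms` is replaced with a flat `object_store_full_delay_ms`, so the worst-case time a create request spends retrying before it fails with OOM is roughly retries × delay. A rough back-of-the-envelope sketch, assuming the new defaults shown above:

# Rough sketch: with a flat delay there is no exponential growth, so the upper
# bound on time spent retrying a plasma create request is linear in the retry count.
object_store_full_max_retries = 1000   # new default from this diff
object_store_full_delay_ms = 10        # new default from this diff

worst_case_wait_s = object_store_full_max_retries * object_store_full_delay_ms / 1000.0
print(f"worst-case retry wait ~= {worst_case_wait_s:.1f}s")  # ~10.0s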

View file

@ -575,7 +575,8 @@ void CoreWorker::Exit(bool intentional) {
<< " received, this process will exit after all outstanding tasks have finished";
exiting_ = true;
// Release the resources early in case draining takes a long time.
RAY_CHECK_OK(local_raylet_client_->NotifyDirectCallTaskBlocked());
RAY_CHECK_OK(
local_raylet_client_->NotifyDirectCallTaskBlocked(/*release_resources*/ true));
// Callback to shutdown.
auto shutdown = [this, intentional]() {
@ -2369,7 +2370,9 @@ void CoreWorker::HandleRestoreSpilledObjects(
for (const auto &url : request.spilled_objects_url()) {
spilled_objects_url.push_back(url);
}
options_.restore_spilled_objects(object_ids_to_restore, spilled_objects_url);
auto total =
options_.restore_spilled_objects(object_ids_to_restore, spilled_objects_url);
reply->set_bytes_restored_total(total);
send_reply_callback(Status::OK(), nullptr, nullptr);
} else {
send_reply_callback(

View file

@ -139,7 +139,7 @@ struct CoreWorkerOptions {
/// Application-language callback to spill objects to external storage.
std::function<std::vector<std::string>(const std::vector<ObjectID> &)> spill_objects;
/// Application-language callback to restore objects from external storage.
std::function<void(const std::vector<ObjectID> &, const std::vector<std::string> &)>
std::function<int64_t(const std::vector<ObjectID> &, const std::vector<std::string> &)>
restore_spilled_objects;
/// Application-language callback to delete objects from external storage.
std::function<void(const std::vector<std::string> &, rpc::WorkerType)>

View file

@ -232,16 +232,18 @@ bool CoreWorkerMemoryStore::Put(const RayObject &object, const ObjectID &object_
Status CoreWorkerMemoryStore::Get(const std::vector<ObjectID> &object_ids,
int num_objects, int64_t timeout_ms,
const WorkerContext &ctx, bool remove_after_get,
std::vector<std::shared_ptr<RayObject>> *results) {
std::vector<std::shared_ptr<RayObject>> *results,
bool release_resources) {
return GetImpl(object_ids, num_objects, timeout_ms, ctx, remove_after_get, results,
/*abort_if_any_object_is_exception=*/true);
/*abort_if_any_object_is_exception=*/true, release_resources);
}
Status CoreWorkerMemoryStore::GetImpl(const std::vector<ObjectID> &object_ids,
int num_objects, int64_t timeout_ms,
const WorkerContext &ctx, bool remove_after_get,
std::vector<std::shared_ptr<RayObject>> *results,
bool abort_if_any_object_is_exception) {
bool abort_if_any_object_is_exception,
bool release_resources) {
(*results).resize(object_ids.size(), nullptr);
std::shared_ptr<GetRequest> get_request;
@ -299,7 +301,8 @@ Status CoreWorkerMemoryStore::GetImpl(const std::vector<ObjectID> &object_ids,
// Wait for remaining objects (or timeout).
if (should_notify_raylet) {
RAY_CHECK_OK(raylet_client_->NotifyDirectCallTaskBlocked());
// SANG-TODO Implement memory store get
RAY_CHECK_OK(raylet_client_->NotifyDirectCallTaskBlocked(release_resources));
}
bool done = false;
@ -374,11 +377,11 @@ Status CoreWorkerMemoryStore::Get(
const absl::flat_hash_set<ObjectID> &object_ids, int64_t timeout_ms,
const WorkerContext &ctx,
absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> *results,
bool *got_exception) {
bool *got_exception, bool release_resources) {
const std::vector<ObjectID> id_vector(object_ids.begin(), object_ids.end());
std::vector<std::shared_ptr<RayObject>> result_objects;
RAY_RETURN_NOT_OK(Get(id_vector, id_vector.size(), timeout_ms, ctx,
/*remove_after_get=*/false, &result_objects));
/*remove_after_get=*/false, &result_objects, release_resources));
for (size_t i = 0; i < id_vector.size(); i++) {
if (result_objects[i] != nullptr) {
@ -401,8 +404,9 @@ Status CoreWorkerMemoryStore::Wait(const absl::flat_hash_set<ObjectID> &object_i
std::vector<ObjectID> id_vector(object_ids.begin(), object_ids.end());
std::vector<std::shared_ptr<RayObject>> result_objects;
RAY_CHECK(object_ids.size() == id_vector.size());
auto status = GetImpl(id_vector, num_objects, timeout_ms, ctx, false, &result_objects,
/*abort_if_any_object_is_exception=*/false);
auto status =
GetImpl(id_vector, num_objects, timeout_ms, ctx, false, &result_objects,
/*abort_if_any_object_is_exception=*/false, /*release_resources=*/true);
// Ignore TimedOut statuses since we return ready objects explicitly.
if (!status.IsTimedOut()) {
RAY_RETURN_NOT_OK(status);

View file

@ -58,13 +58,14 @@ class CoreWorkerMemoryStore {
/// \return Status.
Status Get(const std::vector<ObjectID> &object_ids, int num_objects, int64_t timeout_ms,
const WorkerContext &ctx, bool remove_after_get,
std::vector<std::shared_ptr<RayObject>> *results);
std::vector<std::shared_ptr<RayObject>> *results,
bool release_resources = true);
/// Convenience wrapper around Get() that stores results in a given result map.
Status Get(const absl::flat_hash_set<ObjectID> &object_ids, int64_t timeout_ms,
const WorkerContext &ctx,
absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> *results,
bool *got_exception);
bool *got_exception, bool release_resources = true);
/// Convenience wrapper around Get() that stores ready objects in a given result set.
Status Wait(const absl::flat_hash_set<ObjectID> &object_ids, int num_objects,
@ -137,11 +138,12 @@ class CoreWorkerMemoryStore {
private:
/// See the public version of `Get` for meaning of the other arguments.
/// \param[in] abort_if_any_object_is_exception Whether we should abort if any object
/// is an exception.
/// \param[in] release_resources true if a blocking memory store get needs to release
/// resources while it waits.
Status GetImpl(const std::vector<ObjectID> &object_ids, int num_objects,
int64_t timeout_ms, const WorkerContext &ctx, bool remove_after_get,
std::vector<std::shared_ptr<RayObject>> *results,
bool abort_if_any_object_is_exception);
bool abort_if_any_object_is_exception, bool release_resources);
/// Optional callback for putting objects into the plasma store.
std::function<void(const RayObject &, const ObjectID &)> store_in_plasma_;
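The new `release_resources` flag defaults to true and is threaded from the public `Get` overloads down to `NotifyDirectCallTaskBlocked`, so a caller can opt out of releasing the worker's resources while it blocks. A small Python sketch of the same default-parameter plumbing, with hypothetical stand-in names:

class MemoryStoreSketch:
    def __init__(self, notify_blocked):
        # notify_blocked(release_resources: bool) stands in for
        # raylet_client_->NotifyDirectCallTaskBlocked(release_resources).
        self._notify_blocked = notify_blocked

    def get(self, object_ids, timeout_ms, release_resources=True):
        # Public entry point; keeps the old behavior unless a caller opts out.
        return self._get_impl(object_ids, timeout_ms, release_resources)

    def _get_impl(self, object_ids, timeout_ms, release_resources):
        # Before blocking, tell the raylet whether it may release this
        # worker's resources while the get is outstanding.
        self._notify_blocked(release_resources)
        return [None for _ in object_ids]  # placeholder for the blocking wait


store = MemoryStoreSketch(notify_blocked=lambda release: print("blocked, release =", release))
store.get(["obj1"], timeout_ms=100)                           # blocked, release = True
store.get(["obj1"], timeout_ms=100, release_resources=False)  # blocked, release = False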

View file

@ -35,6 +35,7 @@ CoreWorkerPlasmaStoreProvider::CoreWorkerPlasmaStoreProvider(
} else {
get_current_call_site_ = []() { return "<no callsite callback>"; };
}
object_store_full_delay_ms_ = RayConfig::instance().object_store_full_delay_ms();
buffer_tracker_ = std::make_shared<BufferTracker>();
RAY_CHECK_OK(store_client_.Connect(store_socket));
if (warmup) {
@ -95,7 +96,8 @@ Status CoreWorkerPlasmaStoreProvider::Create(const std::shared_ptr<Buffer> &meta
}
while (retry_with_request_id > 0) {
std::this_thread::sleep_for(std::chrono::milliseconds(10));
// TODO(sang): Use exponential backoff instead.
std::this_thread::sleep_for(std::chrono::milliseconds(object_store_full_delay_ms_));
{
std::lock_guard<std::mutex> guard(store_client_mutex_);
RAY_LOG(DEBUG) << "Retrying request for object " << object_id << " with request ID "
@ -224,7 +226,7 @@ Status CoreWorkerPlasmaStoreProvider::Get(
const absl::flat_hash_set<ObjectID> &object_ids, int64_t timeout_ms,
const WorkerContext &ctx,
absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> *results,
bool *got_exception) {
bool *got_exception, bool release_resources) {
int64_t batch_size = RayConfig::instance().worker_fetch_request_size();
std::vector<ObjectID> batch_ids;
absl::flat_hash_set<ObjectID> remaining(object_ids.begin(), object_ids.end());
@ -275,7 +277,7 @@ Status CoreWorkerPlasmaStoreProvider::Get(
size_t previous_size = remaining.size();
// This is a separate IPC from the FetchAndGet in direct call mode.
if (ctx.CurrentTaskIsDirectCall() && ctx.ShouldReleaseResourcesOnBlockingCalls()) {
RAY_RETURN_NOT_OK(raylet_client_->NotifyDirectCallTaskBlocked());
RAY_RETURN_NOT_OK(raylet_client_->NotifyDirectCallTaskBlocked(release_resources));
}
RAY_RETURN_NOT_OK(
FetchAndGetFromPlasmaStore(remaining, batch_ids, batch_timeout,
@ -332,7 +334,9 @@ Status CoreWorkerPlasmaStoreProvider::Wait(
// This is a separate IPC from the Wait in direct call mode.
if (ctx.CurrentTaskIsDirectCall() && ctx.ShouldReleaseResourcesOnBlockingCalls()) {
RAY_RETURN_NOT_OK(raylet_client_->NotifyDirectCallTaskBlocked());
// SANG-TODO Implement wait
RAY_RETURN_NOT_OK(
raylet_client_->NotifyDirectCallTaskBlocked(/*release_resources*/ true));
}
const auto owner_addresses = reference_counter_->GetOwnerAddresses(id_vector);
RAY_RETURN_NOT_OK(

View file

@ -90,7 +90,7 @@ class CoreWorkerPlasmaStoreProvider {
Status Get(const absl::flat_hash_set<ObjectID> &object_ids, int64_t timeout_ms,
const WorkerContext &ctx,
absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> *results,
bool *got_exception);
bool *got_exception, bool release_resources = true);
Status Contains(const ObjectID &object_id, bool *has_object);
@ -154,6 +154,7 @@ class CoreWorkerPlasmaStoreProvider {
std::mutex store_client_mutex_;
std::function<Status()> check_signals_;
std::function<std::string()> get_current_call_site_;
uint32_t object_store_full_delay_ms_;
// Active buffers tracker. This must be allocated as a separate structure since its
// lifetime can exceed that of the store provider due to callback references.

View file

@ -72,7 +72,9 @@ void GcsObjectManager::HandleAddObjectLocation(
AddObjectLocationInCache(object_id, node_id);
} else {
absl::MutexLock lock(&mutex_);
object_to_locations_[object_id].spilled_url = request.spilled_url();
RAY_CHECK(!request.spilled_url().empty());
spilled_url = request.spilled_url();
object_to_locations_[object_id].spilled_url = spilled_url;
RAY_LOG(DEBUG) << "Adding object spilled location, object id = " << object_id;
}
@ -91,7 +93,8 @@ void GcsObjectManager::HandleAddObjectLocation(
notification.SerializeAsString(), nullptr));
RAY_LOG(DEBUG) << "Finished adding object location, job id = "
<< object_id.TaskId().JobId() << ", object id = " << object_id
<< ", node id = " << node_id << ", task id = " << object_id.TaskId();
<< ", node id = " << node_id << ", task id = " << object_id.TaskId()
<< ", spilled_url = " << spilled_url;
} else {
RAY_LOG(ERROR) << "Failed to add object location: " << status.ToString()
<< ", job id = " << object_id.TaskId().JobId()

View file

@ -10,14 +10,8 @@
namespace ray {
/// A callback to asynchronously spill objects when space is needed.
/// The callback tries to spill objects as much as num_bytes_to_spill and returns
/// the amount of space needed after the spilling is complete.
/// The returned value is calculated based off of min_bytes_to_spill. That says,
/// although it fails to spill num_bytes_to_spill, as long as it spills more than
/// min_bytes_to_spill, it will return the value that is less than 0 (meaning we
/// don't need any more additional space).
using SpillObjectsCallback =
std::function<int64_t(int64_t num_bytes_to_spill, int64_t min_bytes_to_spill)>;
/// It spills enough objects to saturate all spill IO workers.
using SpillObjectsCallback = std::function<bool()>;
/// A callback to call when space has been released.
using SpaceReleasedCallback = std::function<void()>;
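The spill callback is now level-triggered: instead of being asked for a byte amount, it keeps dispatching spill batches until every spill IO worker is busy and reports whether any spilling could be started. An illustrative Python sketch of that saturation loop, modeled on `SpillObjectUptoMaxThroughput` in local_object_manager.cc further down (names are illustrative):

import threading

class SpillerSketch:
    """Illustrative level-triggered spiller: keep spilling until IO workers saturate."""

    def __init__(self, max_io_workers, spill_one_batch):
        # spill_one_batch() -> bool stands in for SpillObjectsOfSize(min_spilling_size_):
        # it returns True if a batch of spillable objects was handed to an IO worker.
        self._max_io_workers = max_io_workers
        self._spill_one_batch = spill_one_batch
        self._num_active_workers = 0
        self._lock = threading.Lock()

    def spill_up_to_max_throughput(self) -> bool:
        started_any = False
        can_spill_more = True
        while can_spill_more:
            if not self._spill_one_batch():
                break  # nothing left that is safe to spill right now
            started_any = True
            with self._lock:
                self._num_active_workers += 1
                can_spill_more = self._num_active_workers < self._max_io_workers
        return started_any

    def on_batch_done(self):
        # Called when an IO worker finishes its batch, freeing a slot.
        with self._lock:
            self._num_active_workers -= 1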

View file

@ -118,6 +118,13 @@ void ObjectManager::Stop() {
}
}
bool ObjectManager::IsPlasmaObjectSpillable(const ObjectID &object_id) {
if (plasma::plasma_store_runner != nullptr) {
return plasma::plasma_store_runner->IsPlasmaObjectSpillable(object_id);
}
return false;
}
void ObjectManager::RunRpcService() { rpc_service_.run(); }
void ObjectManager::StartRpcService() {

View file

@ -206,6 +206,13 @@ class ObjectManager : public ObjectManagerInterface,
/// signals from Raylet.
void Stop();
/// This method calls into the plasma store, which runs in a separate thread.
/// Check if the given object id is spillable by directly calling the plasma store.
/// The plasma store returns true if the object is spillable, meaning it is only
/// pinned by the raylet, so it can safely be evicted after the local object manager
/// spills it. Returns false otherwise.
bool IsPlasmaObjectSpillable(const ObjectID &object_id);
/// Subscribe to notifications of objects added to local store.
/// Upon subscribing, the callback will be invoked for all objects that
///

View file

@ -69,17 +69,21 @@ std::pair<PlasmaObject, PlasmaError> CreateRequestQueue::TryRequestImmediately(
auto req_id = AddRequest(object_id, client, create_callback);
if (!ProcessRequests().ok()) {
// If the request was not immediately fulfillable, finish it.
RAY_CHECK(!queue_.empty());
FinishRequest(queue_.begin());
if (!queue_.empty()) {
// Some errors, such as a transient OOM error, don't finish the request, so we
// should finish it here.
FinishRequest(queue_.begin());
}
}
PlasmaError error;
RAY_CHECK(GetRequestResult(req_id, &result, &error));
return {result, error};
}
Status CreateRequestQueue::ProcessRequest(std::unique_ptr<CreateRequest> &request) {
bool CreateRequestQueue::ProcessRequest(std::unique_ptr<CreateRequest> &request) {
// Return an OOM error to the client if we have hit the maximum number of
// retries.
// TODO(sang): Delete this logic?
bool evict_if_full = evict_if_full_;
if (max_retries_ == 0) {
// If we cannot retry, then always evict on the first attempt.
@ -88,50 +92,36 @@ Status CreateRequestQueue::ProcessRequest(std::unique_ptr<CreateRequest> &reques
// Always try to evict after the first attempt.
evict_if_full = true;
}
request->error = request->create_callback(evict_if_full, &request->result);
Status status;
auto should_retry_on_oom = max_retries_ == -1 || num_retries_ < max_retries_;
if (request->error == PlasmaError::TransientOutOfMemory) {
// The object store is full, but we should wait for space to be made
// through spilling, so do nothing. The caller must guarantee that
// ProcessRequests is called again so that we can try this request again.
// NOTE(swang): There could be other requests behind this one that are
// actually serviceable. This may be inefficient, but eventually this
// request will get served and unblock the following requests, once
// enough objects have been spilled.
// TODO(swang): Ask the raylet to spill enough space for multiple requests
// at once, instead of just the head of the queue.
num_retries_ = 0;
status =
Status::TransientObjectStoreFull("Object store full, queueing creation request");
} else if (request->error == PlasmaError::OutOfMemory && should_retry_on_oom) {
num_retries_++;
RAY_LOG(DEBUG) << "Not enough memory to create the object, after " << num_retries_
<< " tries";
if (trigger_global_gc_) {
trigger_global_gc_();
}
status = Status::ObjectStoreFull("Object store full, should retry on timeout");
} else if (request->error == PlasmaError::OutOfMemory) {
RAY_LOG(ERROR) << "Not enough memory to create object " << request->object_id
<< " after " << num_retries_
<< " tries, will return OutOfMemory to the client";
}
return status;
return request->error != PlasmaError::OutOfMemory;
}
Status CreateRequestQueue::ProcessRequests() {
while (!queue_.empty()) {
auto request_it = queue_.begin();
auto status = ProcessRequest(*request_it);
if (status.IsTransientObjectStoreFull() || status.IsObjectStoreFull()) {
return status;
auto create_ok = ProcessRequest(*request_it);
if (create_ok) {
FinishRequest(request_it);
} else {
if (trigger_global_gc_) {
trigger_global_gc_();
}
if (spill_objects_callback_()) {
return Status::TransientObjectStoreFull("Waiting for spilling.");
} else if (num_retries_ < max_retries_ || max_retries_ == -1) {
// We need a grace period since (1) global GC takes a bit of time to
// kick in, and (2) there is a race between spilling finishing and space
// actually freeing up in the object store.
// If max_retries == -1, we retry infinitely.
num_retries_ += 1;
return Status::ObjectStoreFull("Waiting for grace period.");
} else {
// Raise OOM. In this case, the request will be marked as OOM.
// We don't return so that we can process the next entry right away.
FinishRequest(request_it);
}
}
FinishRequest(request_it);
}
return Status::OK();
}
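Instead of relying on a transient OOM error from the allocator, the queue itself now decides what a failed create means: trigger global GC, ask the spill callback for space, and then either wait for spilling, burn a retry as a grace period, or finally fail the request with OOM. A compact Python sketch of that decision flow, with illustrative names and string stand-ins for the Status results:

def process_requests(queue, process_request, spill_objects_callback,
                     trigger_global_gc, max_retries, state):
    """Sketch of the new level-triggered retry flow (names are illustrative).

    process_request(req) -> bool mimics ProcessRequest: True means the create
    succeeded (or failed permanently) and the request can be finished.
    state is a dict holding num_retries, mirroring num_retries_.
    """
    while queue:
        request = queue[0]
        if process_request(request):
            queue.pop(0)  # FinishRequest: reply to the client and move on
            continue
        trigger_global_gc()
        if spill_objects_callback():
            # Spilling is in flight; keep the request queued and try again later.
            return "TransientObjectStoreFull"
        if max_retries == -1 or state["num_retries"] < max_retries:
            # Grace period: global GC and finished spills need time to free space.
            state["num_retries"] += 1
            return "ObjectStoreFull"
        # Out of retries and nothing left to spill: fail this request with OOM
        # and immediately continue with the next one.
        queue.pop(0)
    return "OK"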

View file

@ -21,6 +21,7 @@
#include "absl/container/flat_hash_map.h"
#include "ray/common/status.h"
#include "ray/object_manager/common.h"
#include "ray/object_manager/plasma/common.h"
#include "ray/object_manager/plasma/connection.h"
#include "ray/object_manager/plasma/plasma.h"
@ -34,9 +35,11 @@ class CreateRequestQueue {
std::function<PlasmaError(bool evict_if_full, PlasmaObject *result)>;
CreateRequestQueue(int32_t max_retries, bool evict_if_full,
ray::SpillObjectsCallback spill_objects_callback,
std::function<void()> trigger_global_gc)
: max_retries_(max_retries),
evict_if_full_(evict_if_full),
spill_objects_callback_(spill_objects_callback),
trigger_global_gc_(trigger_global_gc) {
RAY_LOG(DEBUG) << "Starting plasma::CreateRequestQueue with " << max_retries_
<< " retries on OOM, evict if full? " << (evict_if_full_ ? 1 : 0);
@ -136,7 +139,7 @@ class CreateRequestQueue {
/// Process a single request. Sets the request's error result to the error
/// returned by the request handler inside. Returns true if the request can be
/// finished.
Status ProcessRequest(std::unique_ptr<CreateRequest> &request);
bool ProcessRequest(std::unique_ptr<CreateRequest> &request);
/// Finish a queued request and remove it from the queue.
void FinishRequest(std::list<std::unique_ptr<CreateRequest>>::iterator request_it);
@ -156,6 +159,11 @@ class CreateRequestQueue {
/// always try to evict.
const bool evict_if_full_;
/// A callback to trigger object spilling. It tries to spill objects up to the max
/// throughput. It returns true if space is being made by object spilling, and false
/// if no more space can be made.
ray::SpillObjectsCallback spill_objects_callback_;
/// A callback to trigger global GC in the cluster if the object store is
/// full.
const std::function<void()> trigger_global_gc_;
@ -178,6 +186,9 @@ class CreateRequestQueue {
/// finished.
absl::flat_hash_map<uint64_t, std::unique_ptr<CreateRequest>> fulfilled_requests_;
/// Last time global gc was invoked in ms.
uint64_t last_global_gc_ms_;
friend class CreateRequestQueueTest;
};

View file

@ -132,10 +132,10 @@ int64_t EvictionPolicy::RequireSpace(int64_t size,
RAY_LOG(DEBUG) << "not enough space to create this object, so evicting objects";
// Choose some objects to evict, and update the return pointers.
int64_t num_bytes_evicted = ChooseObjectsToEvict(space_to_free, objects_to_evict);
RAY_LOG(INFO) << "There is not enough space to create this object, so evicting "
<< objects_to_evict->size() << " objects to free up " << num_bytes_evicted
<< " bytes. The number of bytes in use (before "
<< "this eviction) is " << PlasmaAllocator::Allocated() << ".";
RAY_LOG(DEBUG) << "There is not enough space to create this object, so evicting "
<< objects_to_evict->size() << " objects to free up "
<< num_bytes_evicted << " bytes. The number of bytes in use (before "
<< "this eviction) is " << PlasmaAllocator::Allocated() << ".";
return required_space - num_bytes_evicted;
}

View file

@ -82,11 +82,6 @@ enum PlasmaError:int {
ObjectNonexistent,
// Trying to create an object but there isn't enough space in the store.
OutOfMemory,
// Trying to create an object but there isn't enough space in the store.
// However, objects are currently being spilled to make enough space. The
// client should try again soon, and there will be enough space (assuming the
// space is not taken by another client).
TransientOutOfMemory,
// Trying to delete an object but it's not sealed.
ObjectNotSealed,
// Trying to delete an object but it's in use.
@ -162,7 +157,7 @@ table PlasmaCreateRetryRequest {
object_id: string;
// The ID of the request to retry.
request_id: uint64;
}
}
table CudaHandle {
handle: [ubyte];

View file

@ -131,9 +131,6 @@ Status PlasmaErrorStatus(fb::PlasmaError plasma_error) {
return Status::ObjectNotFound("object does not exist in the plasma store");
case fb::PlasmaError::OutOfMemory:
return Status::ObjectStoreFull("object does not fit in the plasma store");
case fb::PlasmaError::TransientOutOfMemory:
return Status::ObjectStoreFull(
"object does not fit in the plasma store, spilling objects to make room");
case fb::PlasmaError::UnexpectedError:
return Status::UnknownError(
"an unexpected error occurred, likely due to a bug in the system or caller");

View file

@ -138,7 +138,7 @@ PlasmaStore::PlasmaStore(boost::asio::io_service &main_service, std::string dire
create_request_queue_(
RayConfig::instance().object_store_full_max_retries(),
/*evict_if_full=*/RayConfig::instance().object_pinning_enabled(),
object_store_full_callback) {
spill_objects_callback, object_store_full_callback) {
store_info_.directory = directory;
store_info_.hugepages_enabled = hugepages_enabled;
#ifdef PLASMA_CUDA
@ -223,34 +223,7 @@ uint8_t *PlasmaStore::AllocateMemory(size_t size, bool evict_if_full, MEMFD_TYPE
// More space is still needed. Try to spill objects to external storage to
// make room.
if (space_needed > 0) {
if (spill_objects_callback_) {
// If the space needed is too small, we'd like to bump up to the minimum
// size. Cap the max size to be lower than the plasma store limit.
int64_t byte_to_spill =
std::min(PlasmaAllocator::GetFootprintLimit(),
std::max(space_needed, RayConfig::instance().min_spilling_size()));
// Object spilling is asynchronous so that we do not block the plasma
// store thread. Therefore the client must try again, even if enough
// space will be made after the spill is complete.
// TODO(swang): Only respond to the client with OutOfMemory if we could not
// make enough space through spilling. If we could make enough space,
// respond to the plasma client once spilling is complete.
space_needed = spill_objects_callback_(byte_to_spill, space_needed);
}
if (space_needed > 0) {
// There is still not enough space, even once all evictable objects
// were evicted and all pending object spills have finished. The
// client may choose to try again, or throw an OutOfMemory error to
// the application immediately.
*error = PlasmaError::OutOfMemory;
} else {
// Once all pending object spills have finished, there should be
// enough space for this allocation. Return a transient error to the
// client so that they try again soon.
*error = PlasmaError::TransientOutOfMemory;
}
// Return an error to the client if not enough space could be freed to
// create the object.
*error = PlasmaError::OutOfMemory;
break;
}
}
@ -311,9 +284,8 @@ PlasmaError PlasmaStore::HandleCreateObjectRequest(const std::shared_ptr<Client>
owner_worker_id, evict_if_full, data_size, metadata_size,
device_num, client, object);
if (error == PlasmaError::OutOfMemory) {
RAY_LOG(WARNING) << "Not enough memory to create the object " << object_id
<< ", data_size=" << data_size
<< ", metadata_size=" << metadata_size;
RAY_LOG(DEBUG) << "Not enough memory to create the object " << object_id
<< ", data_size=" << data_size << ", metadata_size=" << metadata_size;
}
return error;
}
@ -551,8 +523,8 @@ void PlasmaStore::ProcessGetRequest(const std::shared_ptr<Client> &client,
std::vector<ObjectID> evicted_ids;
std::vector<ObjectTableEntry *> evicted_entries;
for (auto object_id : object_ids) {
// Check if this object is already present locally. If so, record that the
// object is being used and mark it as accounted for.
// Check if this object is already present
// locally. If so, record that the object is being used and mark it as accounted for.
auto entry = GetObjectTableEntry(&store_info_, object_id);
if (entry && entry->state == ObjectState::PLASMA_SEALED) {
// Update the get request to take into account the present object.
@ -972,6 +944,9 @@ void PlasmaStore::SubscribeToUpdates(const std::shared_ptr<Client> &client) {
Status PlasmaStore::ProcessMessage(const std::shared_ptr<Client> &client,
fb::MessageType type,
const std::vector<uint8_t> &message) {
// A global lock is used here so that the raylet can directly access some of the
// methods required for object spilling without the store releasing the lock.
std::lock_guard<std::recursive_mutex> guard(mutex_);
// TODO(suquark): We should convert these interfaces to const later.
uint8_t *input = (uint8_t *)message.data();
size_t input_size = message.size();
@ -1116,9 +1091,7 @@ void PlasmaStore::ProcessCreateRequests() {
auto status = create_request_queue_.ProcessRequests();
uint32_t retry_after_ms = 0;
if (status.IsTransientObjectStoreFull()) {
retry_after_ms = delay_on_transient_oom_ms_;
} else if (status.IsObjectStoreFull()) {
if (!status.ok()) {
retry_after_ms = delay_on_oom_ms_;
}
@ -1151,4 +1124,12 @@ void PlasmaStore::ReplyToCreateClient(const std::shared_ptr<Client> &client,
}
}
bool PlasmaStore::IsObjectSpillable(const ObjectID &object_id) {
// The lock is acquired whenever the plasma store receives a request.
// A recursive mutex is used here so this method can also be called on a code path
// that already holds the lock.
std::lock_guard<std::recursive_mutex> guard(mutex_);
auto entry = GetObjectTableEntry(&store_info_, object_id);
return entry->ref_count == 1;
}
} // namespace plasma
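An object is considered spillable when its only remaining reference is the raylet's pin, i.e. `ref_count == 1`; any additional reference means a client may still be reading the buffer, which is why `test_spilling_not_done_for_pinned_object` above expects an OOM error instead of a spill. A toy illustration of the check, assuming a simple entry record:

from dataclasses import dataclass

@dataclass
class EntrySketch:
    # Stand-in for the plasma object table entry: ref_count counts the raylet pin
    # plus any clients that currently have the buffer mapped.
    ref_count: int

def is_object_spillable(entry: EntrySketch) -> bool:
    # Only the raylet pin remains, so no worker can be reading the buffer and
    # it is safe to spill and then evict the object.
    return entry.ref_count == 1

assert is_object_spillable(EntrySketch(ref_count=1))      # pinned only by the raylet
assert not is_object_spillable(EntrySketch(ref_count=2))  # a client still holds it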

View file

@ -99,9 +99,6 @@ class PlasmaStore {
/// - PlasmaError::OutOfMemory, if the store is out of memory and
/// cannot create the object. In this case, the client should not call
/// plasma_release.
/// - PlasmaError::TransientOutOfMemory, if the store is temporarily out of
/// memory but there may be space soon to create the object. In this
/// case, the client should not call plasma_release.
PlasmaError CreateObject(const ObjectID &object_id, const NodeID &owner_raylet_id,
const std::string &owner_ip_address, int owner_port,
const WorkerID &owner_worker_id, bool evict_if_full,
@ -186,6 +183,14 @@ class PlasmaStore {
plasma::flatbuf::MessageType type,
const std::vector<uint8_t> &message);
/// Return true if the given object id has only one reference.
/// Only one reference means only the raylet pins the object, so it is safe to
/// spill the object.
/// NOTE: Avoid using this method outside the object spilling context (i.e., unless you
/// absolutely know what's going on). This method won't work correctly if it is used
/// before the object is pinned by the raylet for the first time.
bool IsObjectSpillable(const ObjectID &object_id);
void SetNotificationListener(
const std::shared_ptr<ray::ObjectStoreNotificationManager> &notification_listener) {
notification_listener_ = notification_listener;
@ -286,16 +291,14 @@ class PlasmaStore {
/// A callback to asynchronously spill objects when space is needed. The
/// callback returns true if space is being made by spilling, and false if no
/// more space can be made.
/// NOTE: This function should guarantee the thread-safety because the callback is
/// shared with the main raylet thread.
ray::SpillObjectsCallback spill_objects_callback_;
/// The amount of time to wait before retrying a creation request after an
/// OOM error.
const uint32_t delay_on_oom_ms_;
/// The amount of time to wait before retrying a creation request after a
/// transient OOM error.
const uint32_t delay_on_transient_oom_ms_ = 10;
/// The amount of time to wait between logging space usage debug messages.
const uint64_t usage_log_interval_ns_;
@ -309,6 +312,14 @@ class PlasmaStore {
/// Queue of object creation requests.
CreateRequestQueue create_request_queue_;
/// This mutex is used in order to make the plasma store thread-safe with the raylet.
/// The raylet's local_object_manager needs to directly access the plasma store's methods
/// in order to see a correct view of the object store. A recursive_mutex is used to avoid
/// deadlock while keeping the change as simple as possible. NOTE(sang): Avoid adding more
/// interfaces through which the node manager or object manager access the plasma store
/// under this mutex unless it is absolutely necessary.
std::recursive_mutex mutex_;
};
} // namespace plasma
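The recursive mutex matters because the store can be re-entered on a path where its lock is already held (for example, the spill callback invoked while a create request is being processed eventually asks whether an object is spillable); a plain mutex would self-deadlock there. Python's `threading.RLock` shows the same property in miniature, as an illustration only:

import threading

# Illustration: a re-entrant lock lets the same thread re-acquire a lock it
# already holds, which is what the recursive_mutex above relies on.
lock = threading.RLock()

def is_object_spillable():
    with lock:          # re-acquired by the same thread: fine with an RLock,
        return True     # a plain threading.Lock() would block forever here

def process_message():
    with lock:
        # ... handling a create request may trigger spilling, which eventually
        # asks whether an object is spillable on this same thread:
        return is_object_spillable()

print(process_message())  # True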

View file

@ -94,10 +94,10 @@ void PlasmaStoreRunner::Start(ray::SpillObjectsCallback spill_objects_callback,
{
absl::MutexLock lock(&store_runner_mutex_);
store_.reset(new PlasmaStore(
main_service_, plasma_directory_, hugepages_enabled_, socket_name_,
external_store, RayConfig::instance().object_store_full_initial_delay_ms(),
spill_objects_callback, object_store_full_callback));
store_.reset(new PlasmaStore(main_service_, plasma_directory_, hugepages_enabled_,
socket_name_, external_store,
RayConfig::instance().object_store_full_delay_ms(),
spill_objects_callback, object_store_full_callback));
plasma_config = store_->GetPlasmaStoreInfo();
// We are using a single memory-mapped file by mallocing and freeing a single
@ -134,6 +134,10 @@ void PlasmaStoreRunner::Shutdown() {
}
}
bool PlasmaStoreRunner::IsPlasmaObjectSpillable(const ObjectID &object_id) {
return store_->IsObjectSpillable(object_id);
}
std::unique_ptr<PlasmaStoreRunner> plasma_store_runner;
} // namespace plasma

View file

@ -22,6 +22,7 @@ class PlasmaStoreRunner {
const std::shared_ptr<ray::ObjectStoreNotificationManager> &notification_listener) {
store_->SetNotificationListener(notification_listener);
}
bool IsPlasmaObjectSpillable(const ObjectID &object_id);
private:
void Shutdown();

View file

@ -46,6 +46,8 @@ void PullManager::OnLocationChange(const ObjectID &object_id,
// before.
it->second.client_locations = std::vector<NodeID>(client_ids.begin(), client_ids.end());
if (!spilled_url.empty()) {
RAY_LOG(DEBUG) << "OnLocationChange " << spilled_url << " num clients "
<< client_ids.size();
// Try to restore the spilled object.
restore_spilled_object_(object_id, spilled_url,
[this, object_id](const ray::Status &status) {

View file

@ -49,6 +49,7 @@ class CreateRequestQueueTest : public ::testing::Test {
: queue_(
/*max_retries=*/2,
/*evict_if_full=*/true,
/*spill_object_callback=*/[&]() { return false; },
/*on_global_gc=*/[&]() { num_global_gc_++; }) {}
void AssertNoLeaks() {
@ -117,7 +118,7 @@ TEST_F(CreateRequestQueueTest, TestOom) {
// Retries used up. The first request should reply with OOM and the second
// request should also be served.
ASSERT_TRUE(queue_.ProcessRequests().ok());
ASSERT_EQ(num_global_gc_, 2);
ASSERT_EQ(num_global_gc_, 3);
// Both requests fulfilled.
ASSERT_REQUEST_FINISHED(queue_, req_id1, PlasmaError::OutOfMemory);
@ -131,6 +132,8 @@ TEST(CreateRequestQueueParameterTest, TestOomInfiniteRetry) {
CreateRequestQueue queue(
/*max_retries=*/-1,
/*evict_if_full=*/true,
// Spilling is failing.
/*spill_object_callback=*/[&]() { return false; },
/*on_global_gc=*/[&]() { num_global_gc_++; });
auto oom_request = [&](bool evict_if_full, PlasmaObject *result) {
@ -156,7 +159,13 @@ TEST(CreateRequestQueueParameterTest, TestOomInfiniteRetry) {
}
TEST_F(CreateRequestQueueTest, TestTransientOom) {
auto return_status = PlasmaError::TransientOutOfMemory;
CreateRequestQueue queue(
/*max_retries=*/2,
/*evict_if_full=*/false,
/*spill_object_callback=*/[&]() { return true; },
/*on_global_gc=*/[&]() { num_global_gc_++; });
auto return_status = PlasmaError::OutOfMemory;
auto oom_request = [&](bool evict_if_full, PlasmaObject *result) {
if (return_status == PlasmaError::OK) {
result->data_size = 1234;
@ -169,28 +178,35 @@ TEST_F(CreateRequestQueueTest, TestTransientOom) {
};
auto client = std::make_shared<MockClient>();
auto req_id1 = queue_.AddRequest(ObjectID::Nil(), client, oom_request);
auto req_id2 = queue_.AddRequest(ObjectID::Nil(), client, blocked_request);
auto req_id1 = queue.AddRequest(ObjectID::Nil(), client, oom_request);
auto req_id2 = queue.AddRequest(ObjectID::Nil(), client, blocked_request);
// Transient OOM should not use up any retries.
for (int i = 0; i < 3; i++) {
ASSERT_TRUE(queue_.ProcessRequests().IsTransientObjectStoreFull());
ASSERT_REQUEST_UNFINISHED(queue_, req_id1);
ASSERT_REQUEST_UNFINISHED(queue_, req_id2);
ASSERT_EQ(num_global_gc_, 0);
ASSERT_TRUE(queue.ProcessRequests().IsTransientObjectStoreFull());
ASSERT_REQUEST_UNFINISHED(queue, req_id1);
ASSERT_REQUEST_UNFINISHED(queue, req_id2);
ASSERT_EQ(num_global_gc_, i + 1);
}
// Return OK for the first request. The second request should also be served.
return_status = PlasmaError::OK;
ASSERT_TRUE(queue_.ProcessRequests().ok());
ASSERT_REQUEST_FINISHED(queue_, req_id1, PlasmaError::OK);
ASSERT_REQUEST_FINISHED(queue_, req_id2, PlasmaError::OK);
ASSERT_TRUE(queue.ProcessRequests().ok());
ASSERT_REQUEST_FINISHED(queue, req_id1, PlasmaError::OK);
ASSERT_REQUEST_FINISHED(queue, req_id2, PlasmaError::OK);
AssertNoLeaks();
}
TEST_F(CreateRequestQueueTest, TestTransientOomThenOom) {
auto return_status = PlasmaError::TransientOutOfMemory;
bool is_spilling_possible = true;
CreateRequestQueue queue(
/*max_retries=*/2,
/*evict_if_full=*/false,
/*spill_object_callback=*/[&]() { return is_spilling_possible; },
/*on_global_gc=*/[&]() { num_global_gc_++; });
auto return_status = PlasmaError::OutOfMemory;
auto oom_request = [&](bool evict_if_full, PlasmaObject *result) {
if (return_status == PlasmaError::OK) {
result->data_size = 1234;
@ -203,31 +219,31 @@ TEST_F(CreateRequestQueueTest, TestTransientOomThenOom) {
};
auto client = std::make_shared<MockClient>();
auto req_id1 = queue_.AddRequest(ObjectID::Nil(), client, oom_request);
auto req_id2 = queue_.AddRequest(ObjectID::Nil(), client, blocked_request);
auto req_id1 = queue.AddRequest(ObjectID::Nil(), client, oom_request);
auto req_id2 = queue.AddRequest(ObjectID::Nil(), client, blocked_request);
// Transient OOM should not use up any retries.
for (int i = 0; i < 3; i++) {
ASSERT_TRUE(queue_.ProcessRequests().IsTransientObjectStoreFull());
ASSERT_REQUEST_UNFINISHED(queue_, req_id1);
ASSERT_REQUEST_UNFINISHED(queue_, req_id2);
ASSERT_EQ(num_global_gc_, 0);
ASSERT_TRUE(queue.ProcessRequests().IsTransientObjectStoreFull());
ASSERT_REQUEST_UNFINISHED(queue, req_id1);
ASSERT_REQUEST_UNFINISHED(queue, req_id2);
ASSERT_EQ(num_global_gc_, i + 1);
}
// Now we are actually OOM.
return_status = PlasmaError::OutOfMemory;
ASSERT_TRUE(queue_.ProcessRequests().IsObjectStoreFull());
ASSERT_TRUE(queue_.ProcessRequests().IsObjectStoreFull());
ASSERT_REQUEST_UNFINISHED(queue_, req_id1);
ASSERT_REQUEST_UNFINISHED(queue_, req_id2);
ASSERT_EQ(num_global_gc_, 2);
// Now spilling is not possible. We should start raising OOM with retry.
is_spilling_possible = false;
ASSERT_TRUE(queue.ProcessRequests().IsObjectStoreFull());
ASSERT_TRUE(queue.ProcessRequests().IsObjectStoreFull());
ASSERT_REQUEST_UNFINISHED(queue, req_id1);
ASSERT_REQUEST_UNFINISHED(queue, req_id2);
ASSERT_EQ(num_global_gc_, 5);
// Retries used up. The first request should reply with OOM and the second
// request should also be served.
ASSERT_TRUE(queue_.ProcessRequests().ok());
ASSERT_REQUEST_FINISHED(queue_, req_id1, PlasmaError::OutOfMemory);
ASSERT_REQUEST_FINISHED(queue_, req_id2, PlasmaError::OK);
ASSERT_EQ(num_global_gc_, 2);
ASSERT_TRUE(queue.ProcessRequests().ok());
ASSERT_REQUEST_FINISHED(queue, req_id1, PlasmaError::OutOfMemory);
ASSERT_REQUEST_FINISHED(queue, req_id2, PlasmaError::OK);
ASSERT_EQ(num_global_gc_, 6);
AssertNoLeaks();
}
@ -248,6 +264,7 @@ TEST(CreateRequestQueueParameterTest, TestNoEvictIfFull) {
CreateRequestQueue queue(
/*max_retries=*/2,
/*evict_if_full=*/false,
/*spill_object_callback=*/[&]() { return false; },
/*on_global_gc=*/[&]() {});
bool first_try = true;

View file

@ -315,6 +315,7 @@ message RestoreSpilledObjectsRequest {
}
message RestoreSpilledObjectsReply {
int64 bytes_restored_total = 1;
}
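`bytes_restored_total` flows back from the IO worker so the raylet can log restore throughput; the accounting further down in local_object_manager.cc subtracts any overlap with the previously finished operation before adding to the cumulative time. A small Python sketch of that concurrency-adjusted bookkeeping, with illustrative names:

class ThroughputStatsSketch:
    """Sketch of the concurrency-adjusted bookkeeping used for spills and restores."""

    def __init__(self):
        self.bytes_total = 0
        self.time_total_s = 0.0
        self.last_finish_ns = 0

    def record(self, start_ns, now_ns, num_bytes):
        self.bytes_total += num_bytes
        # If another operation finished after we started, that overlap is
        # already counted, so start the clock at whichever is later.
        self.time_total_s += (now_ns - max(start_ns, self.last_finish_ns)) / 1e9
        self.last_finish_ns = now_ns

    def mib_per_s(self):
        return self.bytes_total / (1024 * 1024) / self.time_total_s


stats = ThroughputStatsSketch()
stats.record(start_ns=0, now_ns=int(2e9), num_bytes=200 * 1024 * 1024)
stats.record(start_ns=int(1e9), now_ns=int(3e9), num_bytes=200 * 1024 * 1024)  # 1s overlap
print(round(stats.mib_per_s()))  # ~133 MiB/s over 3s of wall-clock time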
message DeleteSpilledObjectsRequest {

View file

@ -194,6 +194,7 @@ table NotifyUnblocked {
}
table NotifyDirectCallTaskBlocked {
release_resources: bool;
}
table NotifyDirectCallTaskUnblocked {

View file

@ -22,7 +22,6 @@ namespace raylet {
void LocalObjectManager::PinObjects(const std::vector<ObjectID> &object_ids,
std::vector<std::unique_ptr<RayObject>> &&objects) {
absl::MutexLock lock(&mutex_);
RAY_CHECK(object_pinning_enabled_);
for (size_t i = 0; i < object_ids.size(); i++) {
const auto &object_id = object_ids[i];
@ -62,7 +61,6 @@ void LocalObjectManager::WaitForObjectFree(const rpc::Address &owner_address,
void LocalObjectManager::ReleaseFreedObject(const ObjectID &object_id) {
// object_pinning_enabled_ flag is off when the --lru-evict flag is on.
if (object_pinning_enabled_) {
absl::MutexLock lock(&mutex_);
RAY_LOG(DEBUG) << "Unpinning object " << object_id;
// The object should be in one of these stats. pinned, spilling, or spilled.
RAY_CHECK((pinned_objects_.count(object_id) > 0) ||
@ -104,50 +102,85 @@ void LocalObjectManager::FlushFreeObjectsIfNeeded(int64_t now_ms) {
}
}
int64_t LocalObjectManager::SpillObjectsOfSize(int64_t num_bytes_to_spill,
int64_t min_bytes_to_spill) {
RAY_CHECK(num_bytes_to_spill >= min_bytes_to_spill);
void LocalObjectManager::SpillObjectUptoMaxThroughput() {
if (RayConfig::instance().object_spilling_config().empty() ||
!RayConfig::instance().automatic_object_spilling_enabled()) {
return min_bytes_to_spill;
return;
}
absl::MutexLock lock(&mutex_);
// Spill as fast as we can using all our spill workers.
bool can_spill_more = true;
while (can_spill_more) {
if (!SpillObjectsOfSize(min_spilling_size_)) {
break;
}
{
absl::MutexLock lock(&mutex_);
num_active_workers_ += 1;
can_spill_more = num_active_workers_ < max_active_workers_;
}
}
}
RAY_LOG(INFO) << "Choosing objects to spill of total size " << num_bytes_to_spill;
bool LocalObjectManager::IsSpillingInProgress() {
absl::MutexLock lock(&mutex_);
return num_active_workers_ > 0;
}
bool LocalObjectManager::SpillObjectsOfSize(int64_t num_bytes_to_spill) {
if (RayConfig::instance().object_spilling_config().empty() ||
!RayConfig::instance().automatic_object_spilling_enabled()) {
return false;
}
RAY_LOG(DEBUG) << "Choosing objects to spill of total size " << num_bytes_to_spill;
int64_t bytes_to_spill = 0;
auto it = pinned_objects_.begin();
std::vector<ObjectID> objects_to_spill;
while (bytes_to_spill < num_bytes_to_spill && it != pinned_objects_.end()) {
bytes_to_spill += it->second->GetSize();
objects_to_spill.push_back(it->first);
while (bytes_to_spill <= num_bytes_to_spill && it != pinned_objects_.end()) {
if (is_plasma_object_spillable_(it->first)) {
bytes_to_spill += it->second->GetSize();
objects_to_spill.push_back(it->first);
}
it++;
}
if (!objects_to_spill.empty()) {
RAY_LOG(INFO) << "Spilling objects of total size " << bytes_to_spill;
auto start_time = current_time_ms();
SpillObjectsInternal(
objects_to_spill, [bytes_to_spill, start_time](const Status &status) {
if (!status.ok()) {
RAY_LOG(ERROR) << "Error spilling objects " << status.ToString();
} else {
RAY_LOG(INFO) << "Spilled " << bytes_to_spill << " in "
<< (current_time_ms() - start_time) << "ms";
}
});
RAY_LOG(DEBUG) << "Spilling objects of total size " << bytes_to_spill
<< " num objects " << objects_to_spill.size();
auto start_time = absl::GetCurrentTimeNanos();
SpillObjectsInternal(objects_to_spill, [this, bytes_to_spill, objects_to_spill,
start_time](const Status &status) {
if (!status.ok()) {
RAY_LOG(ERROR) << "Error spilling objects " << status.ToString();
} else {
auto now = absl::GetCurrentTimeNanos();
RAY_LOG(DEBUG) << "Spilled " << bytes_to_spill << " bytes in "
<< (now - start_time) / 1e6 << "ms";
spilled_bytes_total_ += bytes_to_spill;
spilled_objects_total_ += objects_to_spill.size();
// Adjust throughput timing to account for concurrent spill operations.
spill_time_total_s_ += (now - std::max(start_time, last_spill_finish_ns_)) / 1e9;
if (now - last_spill_log_ns_ > 1e9) {
last_spill_log_ns_ = now;
// TODO(ekl) logging at error level until we add a better UX indicator.
RAY_LOG(ERROR) << "Spilled "
<< static_cast<int>(spilled_bytes_total_ / (1024 * 1024))
<< " MiB, " << spilled_objects_total_
<< " objects, write throughput "
<< static_cast<int>(spilled_bytes_total_ / (1024 * 1024) /
spill_time_total_s_)
<< " MiB/s";
}
last_spill_finish_ns_ = now;
}
});
return true;
}
// We do not track a mapping between objects that need to be created to
// objects that are being spilled, so we just subtract the total number of
// bytes that are currently being spilled from the amount of space
// requested. If the space is claimed by another client, this client may
// need to request space again.
return min_bytes_to_spill - num_bytes_pending_spill_;
return false;
}
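The spill completion callback above keeps throughput statistics meaningful when batches overlap: only the portion of each batch's duration that extends past the previously recorded finish time is added to `spill_time_total_s_`. The self-contained sketch below works through that accounting for three overlapping 100 MiB batches; the variable names are illustrative assumptions, not the raylet's.

```cpp
// Hedged sketch of throughput accounting for overlapping operations.
// Only the time not already covered by an earlier finish is accumulated,
// so the aggregate MiB/s figure is not distorted by concurrency.
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  int64_t last_finish_ns = 0;  // When the previous operation finished.
  double busy_time_s = 0;      // Total non-overlapping wall time, in seconds.
  int64_t bytes_total = 0;

  // (start_ns, end_ns, bytes) for three overlapping 100 MiB spill batches,
  // processed in completion order.
  struct Op {
    int64_t start_ns, end_ns, bytes;
  };
  const Op ops[] = {{0, 2000000000, 100 << 20},
                    {500000000, 2500000000, 100 << 20},
                    {1000000000, 3000000000, 100 << 20}};

  for (const auto &op : ops) {
    // Count only the part of this op that extends past the last finish time.
    busy_time_s += (op.end_ns - std::max(op.start_ns, last_finish_ns)) / 1e9;
    last_finish_ns = op.end_ns;
    bytes_total += op.bytes;
  }

  // 300 MiB over 3 s of wall time -> 100 MiB/s. A naive sum of per-batch
  // durations (6 s) would understate this as 50 MiB/s.
  std::cout << (bytes_total / (1024.0 * 1024.0)) / busy_time_s << " MiB/s\n";
  return 0;
}
```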
void LocalObjectManager::SpillObjects(const std::vector<ObjectID> &object_ids,
std::function<void(const ray::Status &)> callback) {
absl::MutexLock lock(&mutex_);
SpillObjectsInternal(object_ids, callback);
}
@ -196,7 +229,10 @@ void LocalObjectManager::SpillObjectsInternal(
io_worker->rpc_client()->SpillObjects(
request, [this, objects_to_spill, callback, io_worker](
const ray::Status &status, const rpc::SpillObjectsReply &r) {
absl::MutexLock lock(&mutex_);
{
absl::MutexLock lock(&mutex_);
num_active_workers_ -= 1;
}
io_worker_pool_.PushSpillWorker(io_worker);
if (!status.ok()) {
for (const auto &object_id : objects_to_spill) {
@ -222,7 +258,6 @@ void LocalObjectManager::AddSpilledUrls(
const std::vector<ObjectID> &object_ids, const rpc::SpillObjectsReply &worker_reply,
std::function<void(const ray::Status &)> callback) {
auto num_remaining = std::make_shared<size_t>(object_ids.size());
auto num_bytes_spilled = std::make_shared<size_t>(0);
for (size_t i = 0; i < object_ids.size(); ++i) {
const ObjectID &object_id = object_ids[i];
const std::string &object_url = worker_reply.spilled_objects_url(i);
@ -232,15 +267,12 @@ void LocalObjectManager::AddSpilledUrls(
// be retrieved by other raylets.
RAY_CHECK_OK(object_info_accessor_.AsyncAddSpilledUrl(
object_id, object_url,
[this, object_id, object_url, callback, num_remaining,
num_bytes_spilled](Status status) {
[this, object_id, object_url, callback, num_remaining](Status status) {
RAY_CHECK_OK(status);
absl::MutexLock lock(&mutex_);
// Unpin the object.
auto it = objects_pending_spill_.find(object_id);
RAY_CHECK(it != objects_pending_spill_.end());
num_bytes_pending_spill_ -= it->second->GetSize();
*num_bytes_spilled += it->second->GetSize();
objects_pending_spill_.erase(it);
// Update the object_id -> url_ref_count to use it for deletion later.
@ -273,20 +305,41 @@ void LocalObjectManager::AsyncRestoreSpilledObject(
<< object_url;
io_worker_pool_.PopRestoreWorker([this, object_id, object_url, callback](
std::shared_ptr<WorkerInterface> io_worker) {
auto start_time = absl::GetCurrentTimeNanos();
RAY_LOG(DEBUG) << "Sending restore spilled object request";
rpc::RestoreSpilledObjectsRequest request;
request.add_spilled_objects_url(std::move(object_url));
request.add_object_ids_to_restore(object_id.Binary());
io_worker->rpc_client()->RestoreSpilledObjects(
request,
[this, object_id, callback, io_worker](const ray::Status &status,
const rpc::RestoreSpilledObjectsReply &r) {
[this, start_time, object_id, callback, io_worker](
const ray::Status &status, const rpc::RestoreSpilledObjectsReply &r) {
io_worker_pool_.PushRestoreWorker(io_worker);
if (!status.ok()) {
RAY_LOG(ERROR) << "Failed to send restore spilled object request: "
<< status.ToString();
} else {
RAY_LOG(DEBUG) << "Restored object " << object_id;
auto now = absl::GetCurrentTimeNanos();
auto restored_bytes = r.bytes_restored_total();
RAY_LOG(DEBUG) << "Restored " << restored_bytes << " in "
<< (now - start_time) / 1e6 << "ms. Object id:" << object_id;
restored_bytes_total_ += restored_bytes;
restored_objects_total_ += 1;
// Adjust throughput timing to account for concurrent restore operations.
restore_time_total_s_ +=
(now - std::max(start_time, last_restore_finish_ns_)) / 1e9;
if (now - last_restore_log_ns_ > 1e9) {
last_restore_log_ns_ = now;
// TODO(ekl) logging at error level until we add a better UX indicator.
RAY_LOG(ERROR) << "Restored "
<< static_cast<int>(restored_bytes_total_ / (1024 * 1024))
<< " MiB, " << restored_objects_total_
<< " objects, read throughput "
<< static_cast<int>(restored_bytes_total_ / (1024 * 1024) /
restore_time_total_s_)
<< " MiB/s";
}
last_restore_finish_ns_ = now;
}
if (callback) {
callback(status);
@ -296,7 +349,6 @@ void LocalObjectManager::AsyncRestoreSpilledObject(
}
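Both the spill and restore paths also rate-limit their progress output: the cumulative MiB and MiB/s summary is logged at most once per second, gated by `last_spill_log_ns_` and `last_restore_log_ns_`. Below is a hedged, standalone sketch of such a once-per-second throttle; the class name is made up for illustration.

```cpp
// Hypothetical sketch of the once-per-second throttle that guards the
// spill/restore progress lines. Names here are illustrative only.
#include <chrono>
#include <cstdint>
#include <iostream>
#include <string>

class ThrottledProgressLogger {
 public:
  // Logs at most once per `min_interval_ns` nanoseconds (default: one second).
  explicit ThrottledProgressLogger(int64_t min_interval_ns = 1000000000)
      : min_interval_ns_(min_interval_ns) {}

  void MaybeLog(const std::string &message) {
    const int64_t now =
        std::chrono::duration_cast<std::chrono::nanoseconds>(
            std::chrono::steady_clock::now().time_since_epoch())
            .count();
    if (now - last_log_ns_ > min_interval_ns_) {
      last_log_ns_ = now;
      std::cout << message << std::endl;
    }
  }

 private:
  const int64_t min_interval_ns_;
  int64_t last_log_ns_ = 0;
};
```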
void LocalObjectManager::ProcessSpilledObjectsDeleteQueue(uint32_t max_batch_size) {
absl::MutexLock lock(&mutex_);
std::vector<std::string> object_urls_to_delete;
// Process up to batch size of objects to delete.

View file

@ -33,13 +33,15 @@ namespace raylet {
/// have been freed, and objects that have been spilled.
class LocalObjectManager {
public:
LocalObjectManager(boost::asio::io_service &io_context, size_t free_objects_batch_size,
int64_t free_objects_period_ms,
IOWorkerPoolInterface &io_worker_pool,
gcs::ObjectInfoAccessor &object_info_accessor,
rpc::CoreWorkerClientPool &owner_client_pool,
bool object_pinning_enabled, bool automatic_object_deletion_enabled,
std::function<void(const std::vector<ObjectID> &)> on_objects_freed)
LocalObjectManager(
boost::asio::io_service &io_context, size_t free_objects_batch_size,
int64_t free_objects_period_ms, IOWorkerPoolInterface &io_worker_pool,
gcs::ObjectInfoAccessor &object_info_accessor,
rpc::CoreWorkerClientPool &owner_client_pool, bool object_pinning_enabled,
bool automatic_object_deletion_enabled, int max_io_workers,
int64_t min_spilling_size,
std::function<void(const std::vector<ObjectID> &)> on_objects_freed,
std::function<bool(const ray::ObjectID &)> is_plasma_object_spillable)
: free_objects_period_ms_(free_objects_period_ms),
free_objects_batch_size_(free_objects_batch_size),
io_worker_pool_(io_worker_pool),
@ -48,7 +50,11 @@ class LocalObjectManager {
object_pinning_enabled_(object_pinning_enabled),
automatic_object_deletion_enabled_(automatic_object_deletion_enabled),
on_objects_freed_(on_objects_freed),
last_free_objects_at_ms_(current_time_ms()) {}
last_free_objects_at_ms_(current_time_ms()),
min_spilling_size_(min_spilling_size),
num_active_workers_(0),
max_active_workers_(max_io_workers),
is_plasma_object_spillable_(is_plasma_object_spillable) {}
/// Pin objects.
///
@ -67,22 +73,10 @@ class LocalObjectManager {
void WaitForObjectFree(const rpc::Address &owner_address,
const std::vector<ObjectID> &object_ids);
/// Asynchronously spill objects when space is needed.
/// The callback tries to spill objects as much as num_bytes_to_spill and returns
/// the amount of space needed after the spilling is complete.
/// The returned value is calculated based off of min_bytes_to_spill. That says,
/// although it fails to spill num_bytes_to_spill, as long as it spills more than
/// min_bytes_to_spill, it will return the value that is less than 0 (meaning we
/// don't need any more additional space).
/// Spill as many objects as possible, as fast as possible, up to the max throughput.
///
/// \param num_bytes_to_spill The total number of bytes to spill. The method tries to
/// spill bytes as much as this value.
/// \param min_bytes_to_spill The minimum bytes that
/// need to be spilled.
/// \return The number of bytes of space still required after the
/// spill is complete. This return the value is less than 0 if it satifies the
/// min_bytes_to_spill.
int64_t SpillObjectsOfSize(int64_t num_bytes_to_spill, int64_t min_bytes_to_spill);
/// Each call dispatches spill requests until the max number of IO workers is reached.
void SpillObjectUptoMaxThroughput();
/// Spill objects to external storage.
///
@ -114,11 +108,33 @@ class LocalObjectManager {
/// invocation.
void ProcessSpilledObjectsDeleteQueue(uint32_t max_batch_size);
/// Return True if spilling is in progress.
/// This is a narrow interface that is accessed by the plasma store.
/// We use a narrow interface here because the plasma store runs in a
/// different thread, and we'd like to avoid making this component thread-safe,
/// which is against the general raylet design.
///
/// \return True if spilling is still in progress. False otherwise.
bool IsSpillingInProgress();
private:
FRIEND_TEST(LocalObjectManagerTest, TestSpillObjectsOfSize);
FRIEND_TEST(LocalObjectManagerTest,
TestSpillObjectsOfSizeNumBytesToSpillHigherThanMinBytesToSpill);
FRIEND_TEST(LocalObjectManagerTest, TestSpillObjectNotEvictable);
/// Asynchronously spill objects when space is needed.
/// It tries to spill objects totaling at least num_bytes_to_spill and returns
/// true if spill requests for the corresponding bytes were submitted.
/// NOTE(sang): If 0 is given, this method spills a single object.
///
/// \param num_bytes_to_spill The total number of bytes to spill.
/// \return True if it can spill num_bytes_to_spill. False otherwise.
bool SpillObjectsOfSize(int64_t num_bytes_to_spill);
/// Internal helper method for spilling objects.
void SpillObjectsInternal(const std::vector<ObjectID> &objects_ids,
std::function<void(const ray::Status &)> callback)
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
std::function<void(const ray::Status &)> callback);
/// Release an object that has been freed by its owner.
void ReleaseFreedObject(const ObjectID &object_id);
@ -164,14 +180,12 @@ class LocalObjectManager {
std::function<void(const std::vector<ObjectID> &)> on_objects_freed_;
// Objects that are pinned on this node.
absl::flat_hash_map<ObjectID, std::unique_ptr<RayObject>> pinned_objects_
GUARDED_BY(mutex_);
absl::flat_hash_map<ObjectID, std::unique_ptr<RayObject>> pinned_objects_;
// Objects that were pinned on this node but that are being spilled.
// These objects will be released once spilling is complete and the URL is
// written to the object directory.
absl::flat_hash_map<ObjectID, std::unique_ptr<RayObject>> objects_pending_spill_
GUARDED_BY(mutex_);
absl::flat_hash_map<ObjectID, std::unique_ptr<RayObject>> objects_pending_spill_;
/// The time that we last sent a FreeObjects request to other nodes for
/// objects that have gone out of scope in the application.
@ -185,7 +199,7 @@ class LocalObjectManager {
/// The total size of the objects that are currently being
/// spilled from this node, in bytes.
size_t num_bytes_pending_spill_ GUARDED_BY(mutex_) = 0;
size_t num_bytes_pending_spill_;
/// This class is accessed by both the raylet and plasma store threads. The
/// mutex protects private members that relate to object spilling.
@ -198,16 +212,63 @@ class LocalObjectManager {
/// A list of object id and url pairs that need to be deleted.
/// We don't delete objects from external storage as soon as they go out of scope
/// because those objects could still be in the process of being spilled.
std::queue<ObjectID> spilled_object_pending_delete_ GUARDED_BY(mutex_);
std::queue<ObjectID> spilled_object_pending_delete_;
/// Mapping from object id to url_with_offsets. We cannot reuse pinned_objects_ because
/// pinned_objects_ entries are deleted when spilling happens.
absl::flat_hash_map<ObjectID, std::string> spilled_objects_url_ GUARDED_BY(mutex_);
absl::flat_hash_map<ObjectID, std::string> spilled_objects_url_;
/// Base URL -> ref_count. It is used because there could be multiple objects
/// within a single spilled file. We need to ref count to avoid deleting the file
/// before all objects within that file are out of scope.
absl::flat_hash_map<std::string, uint64_t> url_ref_count_ GUARDED_BY(mutex_);
absl::flat_hash_map<std::string, uint64_t> url_ref_count_;
/// Minimum bytes to spill to a single IO spill worker.
int64_t min_spilling_size_;
/// The current number of active spill workers.
int64_t num_active_workers_ GUARDED_BY(mutex_);
/// The max number of active spill workers.
const int64_t max_active_workers_;
/// Callback to check if a plasma object is pinned in workers.
/// Return true if unpinned, meaning we can safely spill the object. False otherwise.
std::function<bool(const ray::ObjectID &)> is_plasma_object_spillable_;
///
/// Stats
///
/// The last time a spill operation finished.
int64_t last_spill_finish_ns_ = 0;
/// The total wall time in seconds spent in spilling.
double spill_time_total_s_ = 0;
/// The total number of bytes spilled.
int64_t spilled_bytes_total_ = 0;
/// The total number of objects spilled.
int64_t spilled_objects_total_ = 0;
/// The last time a restore operation finished.
int64_t last_restore_finish_ns_ = 0;
/// The total wall time in seconds spent in restoring.
double restore_time_total_s_ = 0;
/// The total number of bytes restored.
int64_t restored_bytes_total_ = 0;
/// The total number of objects restored.
int64_t restored_objects_total_ = 0;
/// The last time a spill log finished.
int64_t last_spill_log_ns_ = 0;
/// The last time a restore log finished.
int64_t last_restore_log_ns_ = 0;
};
}; // namespace raylet

View file

@ -229,6 +229,8 @@ int main(int argc, char *argv[]) {
node_manager_config.store_socket_name = store_socket_name;
node_manager_config.temp_dir = temp_dir;
node_manager_config.session_dir = session_dir;
node_manager_config.max_io_workers = RayConfig::instance().max_io_workers();
node_manager_config.min_spilling_size = RayConfig::instance().min_spilling_size();
// Configuration for the object manager.
ray::ObjectManagerConfig object_manager_config;

View file

@ -116,7 +116,8 @@ std::string WorkerOwnerString(std::shared_ptr<WorkerInterface> &worker) {
NodeManager::NodeManager(boost::asio::io_service &io_service, const NodeID &self_node_id,
const NodeManagerConfig &config, ObjectManager &object_manager,
std::shared_ptr<gcs::GcsClient> gcs_client,
std::shared_ptr<ObjectDirectoryInterface> object_directory)
std::shared_ptr<ObjectDirectoryInterface> object_directory,
std::function<bool(const ObjectID &)> is_plasma_object_spillable)
: self_node_id_(self_node_id),
io_service_(io_service),
object_manager_(object_manager),
@ -171,14 +172,18 @@ NodeManager::NodeManager(boost::asio::io_service &io_service, const NodeID &self
/* object_pinning_enabled */ config.object_pinning_enabled,
/* automatic_object_deletion_enabled */
config.automatic_object_deletion_enabled,
/*max_io_workers*/ config.max_io_workers,
/*min_spilling_size*/ config.min_spilling_size,
[this](const std::vector<ObjectID> &object_ids) {
object_manager_.FreeObjects(object_ids,
/*local_only=*/false);
}),
},
is_plasma_object_spillable),
new_scheduler_enabled_(RayConfig::instance().new_scheduler_enabled()),
report_worker_backlog_(RayConfig::instance().report_worker_backlog()),
last_local_gc_ns_(absl::GetCurrentTimeNanos()),
local_gc_interval_ns_(RayConfig::instance().local_gc_interval_s() * 1e9),
local_gc_min_interval_ns_(RayConfig::instance().local_gc_min_interval_s() * 1e9),
record_metrics_period_(config.record_metrics_period_ms) {
RAY_LOG(INFO) << "Initializing NodeManager with ID " << self_node_id_;
RAY_CHECK(heartbeat_period_.count() > 0);
@ -553,7 +558,8 @@ void NodeManager::ReportResourceUsage() {
// Trigger local GC if needed. This throttles the frequency of local GC calls
// to at most once per heartbeat interval.
auto now = absl::GetCurrentTimeNanos();
if (should_local_gc_ || now - last_local_gc_ns_ > local_gc_interval_ns_) {
if ((should_local_gc_ || now - last_local_gc_ns_ > local_gc_interval_ns_) &&
now - last_local_gc_ns_ > local_gc_min_interval_ns_) {
DoLocalGC();
should_local_gc_ = false;
last_local_gc_ns_ = now;
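This widened condition means a local GC pass runs either on demand (`should_local_gc_`) or when the periodic interval elapses, but in both cases never more often than the configured minimum interval. A tiny sketch of that predicate, with assumed parameter names, makes the two thresholds explicit.

```cpp
// Hedged sketch of the dual-threshold GC trigger; parameter names are
// illustrative, not the raylet's.
#include <cstdint>

bool ShouldRunLocalGc(bool gc_requested, int64_t now_ns, int64_t last_gc_ns,
                      int64_t interval_ns, int64_t min_interval_ns) {
  const bool due = gc_requested || (now_ns - last_gc_ns > interval_ns);
  const bool not_too_soon = now_ns - last_gc_ns > min_interval_ns;
  // Requested or overdue, but never back-to-back within the minimum interval.
  return due && not_too_soon;
}
```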
@ -1186,8 +1192,7 @@ void NodeManager::ProcessClientMessage(const std::shared_ptr<ClientConnection> &
ProcessFetchOrReconstructMessage(client, message_data);
} break;
case protocol::MessageType::NotifyDirectCallTaskBlocked: {
std::shared_ptr<WorkerInterface> worker = worker_pool_.GetRegisteredWorker(client);
HandleDirectCallTaskBlocked(worker);
ProcessDirectCallTaskBlocked(client, message_data);
} break;
case protocol::MessageType::NotifyDirectCallTaskUnblocked: {
std::shared_ptr<WorkerInterface> worker = worker_pool_.GetRegisteredWorker(client);
@ -1534,6 +1539,15 @@ void NodeManager::ProcessFetchOrReconstructMessage(
}
}
void NodeManager::ProcessDirectCallTaskBlocked(
const std::shared_ptr<ClientConnection> &client, const uint8_t *message_data) {
auto message =
flatbuffers::GetRoot<protocol::NotifyDirectCallTaskBlocked>(message_data);
bool release_resources = message->release_resources();
std::shared_ptr<WorkerInterface> worker = worker_pool_.GetRegisteredWorker(client);
HandleDirectCallTaskBlocked(worker, release_resources);
}
void NodeManager::ProcessWaitRequestMessage(
const std::shared_ptr<ClientConnection> &client, const uint8_t *message_data) {
// Read the data.
@ -2148,9 +2162,9 @@ void NodeManager::SubmitTask(const Task &task) {
}
void NodeManager::HandleDirectCallTaskBlocked(
const std::shared_ptr<WorkerInterface> &worker) {
const std::shared_ptr<WorkerInterface> &worker, bool release_resources) {
if (new_scheduler_enabled_) {
if (!worker || worker->IsBlocked()) {
if (!worker || worker->IsBlocked() || !release_resources) {
return;
}
std::vector<double> cpu_instances;
@ -2169,7 +2183,8 @@ void NodeManager::HandleDirectCallTaskBlocked(
return;
}
if (!worker || worker->GetAssignedTaskId().IsNil() || worker->IsBlocked()) {
if (!worker || worker->GetAssignedTaskId().IsNil() || worker->IsBlocked() ||
!release_resources) {
return; // The worker may have died or is no longer processing the task.
}
auto const cpu_resource_ids = worker->ReleaseTaskCpuResources();
@ -2297,7 +2312,6 @@ void NodeManager::AsyncResolveObjectsFinish(
const std::shared_ptr<ClientConnection> &client, const TaskID &current_task_id,
bool was_blocked) {
std::shared_ptr<WorkerInterface> worker = worker_pool_.GetRegisteredWorker(client);
// TODO(swang): Because the object dependencies are tracked in the task
// dependency manager, we could actually remove this message entirely and
// instead unblock the worker once all the objects become available.
@ -3154,9 +3168,6 @@ void NodeManager::HandleGlobalGC(const rpc::GlobalGCRequest &request,
}
void NodeManager::TriggerGlobalGC() {
RAY_LOG(INFO) << "Broadcasting Python GC request to all raylets since the cluster "
<< "is low on resources. This removes Ray actor and object refs "
<< "that are stuck in Python reference cycles.";
should_global_gc_ = true;
// We won't see our own request, so trigger local GC in the next heartbeat.
should_local_gc_ = true;

View file

@ -104,6 +104,10 @@ struct NodeManagerConfig {
std::unordered_map<std::string, std::string> raylet_config;
// The time between record metrics in milliseconds, or -1 to disable.
uint64_t record_metrics_period_ms;
// The max number of IO workers.
int max_io_workers;
// The minimum object size that can be spilled by each spill operation.
int64_t min_spilling_size;
};
class NodeManager : public rpc::NodeManagerServiceHandler {
@ -115,7 +119,8 @@ class NodeManager : public rpc::NodeManagerServiceHandler {
NodeManager(boost::asio::io_service &io_service, const NodeID &self_node_id,
const NodeManagerConfig &config, ObjectManager &object_manager,
std::shared_ptr<gcs::GcsClient> gcs_client,
std::shared_ptr<ObjectDirectoryInterface> object_directory);
std::shared_ptr<ObjectDirectoryInterface> object_directory_,
std::function<bool(const ObjectID &)> is_plasma_object_spillable);
/// Process a new client connection.
///
@ -375,7 +380,8 @@ class NodeManager : public rpc::NodeManagerServiceHandler {
/// arrive after the worker lease has been returned to the node manager.
///
/// \param worker Shared ptr to the worker, or nullptr if lost.
void HandleDirectCallTaskBlocked(const std::shared_ptr<WorkerInterface> &worker);
void HandleDirectCallTaskBlocked(const std::shared_ptr<WorkerInterface> &worker,
bool release_resources);
/// Handle a direct call task that is unblocked. Note that this callback may
/// arrive after the worker lease has been returned to the node manager.
@ -437,6 +443,13 @@ class NodeManager : public rpc::NodeManagerServiceHandler {
/// \return Void.
void ProcessSubmitTaskMessage(const uint8_t *message_data);
/// Process client message of NotifyDirectCallTaskBlocked
///
/// \param message_data A pointer to the message data.
/// \return Void.
void ProcessDirectCallTaskBlocked(const std::shared_ptr<ClientConnection> &client,
const uint8_t *message_data);
/// Process client message of RegisterClientRequest
///
/// \param client The client that sent the message.
@ -745,11 +758,15 @@ class NodeManager : public rpc::NodeManagerServiceHandler {
/// on all local workers of this raylet.
bool should_local_gc_ = false;
/// The last time local GC was triggered.
/// The last time local GC was run.
int64_t last_local_gc_ns_ = 0;
/// The interval in nanoseconds between local GC automatic triggers.
const int64_t local_gc_interval_ns_ = 10 * 60 * 1e9;
const int64_t local_gc_interval_ns_;
/// The min interval in nanoseconds between local GC runs (auto + memory pressure
/// triggered).
const int64_t local_gc_min_interval_ns_;
/// These two classes make up the new scheduler. ClusterResourceScheduler is
/// responsible for maintaining a view of the cluster state w.r.t resource

View file

@ -70,24 +70,33 @@ Raylet::Raylet(boost::asio::io_service &main_service, const std::string &socket_
gcs_client_))
: std::dynamic_pointer_cast<ObjectDirectoryInterface>(
std::make_shared<ObjectDirectory>(main_service, gcs_client_))),
object_manager_(main_service, self_node_id_, object_manager_config,
object_directory_,
[this](const ObjectID &object_id, const std::string &spilled_url,
std::function<void(const ray::Status &)> callback) {
node_manager_.GetLocalObjectManager().AsyncRestoreSpilledObject(
object_id, spilled_url, callback);
},
[this](int64_t num_bytes_to_spill, int64_t min_bytes_to_spill) {
return node_manager_.GetLocalObjectManager().SpillObjectsOfSize(
num_bytes_to_spill, min_bytes_to_spill);
},
[this]() {
// Post on the node manager's event loop since this
// will be called from the plasma store thread.
main_service_.post([this]() { node_manager_.TriggerGlobalGC(); });
}),
object_manager_(
main_service, self_node_id_, object_manager_config, object_directory_,
[this](const ObjectID &object_id, const std::string &spilled_url,
std::function<void(const ray::Status &)> callback) {
node_manager_.GetLocalObjectManager().AsyncRestoreSpilledObject(
object_id, spilled_url, callback);
},
[this]() {
// This callback is called from the plasma store thread.
// NOTE: It means the local object manager should be thread-safe.
main_service_.post([this]() {
node_manager_.GetLocalObjectManager().SpillObjectUptoMaxThroughput();
});
return node_manager_.GetLocalObjectManager().IsSpillingInProgress();
},
[this]() {
// Post on the node manager's event loop since this
// callback is called from the plasma store thread.
// This will help keep node manager lock-less.
main_service_.post([this]() { node_manager_.TriggerGlobalGC(); });
}),
node_manager_(main_service, self_node_id_, node_manager_config, object_manager_,
gcs_client_, object_directory_),
gcs_client_, object_directory_,
[this](const ObjectID &object_id) {
// It is used by the local object manager.
return object_manager_.IsPlasmaObjectSpillable(object_id);
}),
socket_name_(socket_name),
acceptor_(main_service, ParseUrlEndpoint(socket_name)),
socket_(main_service) {
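The spill callback wired into the object manager here carries the level-triggered contract across threads: the plasma store can invoke it on any allocation attempt that finds the store full, the raylet event loop is asked (via `post`) to spill more, and the boolean return tells the store whether spilling is still in progress and therefore worth waiting for. The following is a hypothetical sketch of how a store-side caller might consume such a callback; the names and control flow are assumptions, not the plasma store code.

```cpp
// Hypothetical store-side consumer of a level-triggered spill callback.
// Names are illustrative; this is not the plasma store implementation.
#include <functional>

// Posts more spill work as a side effect and reports whether spilling is
// (still) in progress.
using SpillObjectsCallback = std::function<bool()>;

enum class AllocationRetry { kRetryLater, kOutOfMemory };

AllocationRetry OnAllocationFailure(
    const SpillObjectsCallback &spill_objects_callback) {
  // Level triggered: re-invoke on every failed allocation. The callback posts
  // additional spill work to the raylet event loop and returns whether any
  // spilling is still in flight.
  if (spill_objects_callback()) {
    return AllocationRetry::kRetryLater;  // Space should free up soon; retry.
  }
  // Nothing is spilling and nothing more can be spilled right now, so the
  // caller has to fall back to eviction or report an out-of-memory error.
  return AllocationRetry::kOutOfMemory;
}
```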

View file

@ -236,15 +236,23 @@ class LocalObjectManagerTest : public ::testing::Test {
/*free_objects_period_ms=*/1000, worker_pool, object_table, client_pool,
/*object_pinning_enabled=*/true,
/*automatic_object_delete_enabled=*/true,
/*max_io_workers=*/2,
/*min_spilling_size=*/0,
[&](const std::vector<ObjectID> &object_ids) {
for (const auto &object_id : object_ids) {
freed.insert(object_id);
}
},
/*is_plasma_object_spillable=*/
[&](const ray::ObjectID &object_id) {
return unevictable_objects_.count(object_id) == 0;
}),
unpins(std::make_shared<std::unordered_map<ObjectID, int>>()) {
RayConfig::instance().initialize({{"object_spilling_config", "mock_config"}});
}
void TearDown() { unevictable_objects_.clear(); }
std::string BuildURL(const std::string url, int offset = 0, int num_objects = 1) {
return url + "?" + "num_objects=" + std::to_string(num_objects) +
"&offset=" + std::to_string(offset);
@ -262,6 +270,8 @@ class LocalObjectManagerTest : public ::testing::Test {
// This hashmap is incremented when objects are unpinned by destroying their
// unique_ptr.
std::shared_ptr<std::unordered_map<ObjectID, int>> unpins;
// Object ids in this field won't be evictable.
std::unordered_set<ObjectID> unevictable_objects_;
};
TEST_F(LocalObjectManagerTest, TestPin) {
@ -416,17 +426,11 @@ TEST_F(LocalObjectManagerTest, TestSpillObjectsOfSize) {
objects.push_back(std::move(object));
}
manager.PinObjects(object_ids, std::move(objects));
int64_t num_bytes_required = manager.SpillObjectsOfSize(total_size / 2, total_size / 2);
ASSERT_EQ(num_bytes_required, -object_size / 2);
ASSERT_TRUE(manager.SpillObjectsOfSize(total_size / 2));
for (const auto &id : object_ids) {
ASSERT_EQ((*unpins)[id], 0);
}
// Check that this returns the total number of bytes currently being spilled.
num_bytes_required = manager.SpillObjectsOfSize(0, 0);
ASSERT_EQ(num_bytes_required, -2 * object_size);
// Check that half the objects get spilled and the URLs get added to the
// global object directory.
std::vector<std::string> urls;
@ -447,9 +451,124 @@ TEST_F(LocalObjectManagerTest, TestSpillObjectsOfSize) {
ASSERT_EQ((*unpins)[object_url.first], 1);
}
// Check that this returns the total number of bytes currently being spilled.
num_bytes_required = manager.SpillObjectsOfSize(0, 0);
ASSERT_EQ(num_bytes_required, 0);
// Make sure providing 0 bytes to SpillObjectsOfSize will spill one object.
// This is important to cover min_spilling_size_ == 0.
ASSERT_TRUE(manager.SpillObjectsOfSize(0));
EXPECT_CALL(worker_pool, PushSpillWorker(_));
const std::string url = BuildURL("url" + std::to_string(object_ids.size()));
ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects({url}));
ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl());
ASSERT_EQ(object_table.object_urls.size(), 3);
urls.push_back(url);
for (auto &object_url : object_table.object_urls) {
auto it = std::find(urls.begin(), urls.end(), object_url.second);
ASSERT_TRUE(it != urls.end());
ASSERT_EQ((*unpins)[object_url.first], 1);
}
// Since there are no more objects to spill, this should fail.
ASSERT_FALSE(manager.SpillObjectsOfSize(0));
}
TEST_F(LocalObjectManagerTest, TestSpillObjectNotEvictable) {
rpc::Address owner_address;
owner_address.set_worker_id(WorkerID::FromRandom().Binary());
std::vector<ObjectID> object_ids;
std::vector<std::unique_ptr<RayObject>> objects;
int64_t total_size = 0;
int64_t object_size = 1000;
const ObjectID object_id = ObjectID::FromRandom();
object_ids.push_back(object_id);
unevictable_objects_.emplace(object_id);
auto data_buffer = std::make_shared<MockObjectBuffer>(object_size, object_id, unpins);
total_size += object_size;
std::unique_ptr<RayObject> object(
new RayObject(data_buffer, nullptr, std::vector<ObjectID>()));
objects.push_back(std::move(object));
manager.PinObjects(object_ids, std::move(objects));
ASSERT_FALSE(manager.SpillObjectsOfSize(1000));
for (const auto &id : object_ids) {
ASSERT_EQ((*unpins)[id], 0);
}
// Now object is evictable. Spill should succeed.
unevictable_objects_.erase(object_id);
ASSERT_TRUE(manager.SpillObjectsOfSize(1000));
}
TEST_F(LocalObjectManagerTest, TestSpillUptoMaxThroughput) {
rpc::Address owner_address;
owner_address.set_worker_id(WorkerID::FromRandom().Binary());
std::vector<ObjectID> object_ids;
std::vector<std::unique_ptr<RayObject>> objects;
int64_t object_size = 1000;
size_t total_objects = 3;
// Pin 3 objects.
for (size_t i = 0; i < total_objects; i++) {
ObjectID object_id = ObjectID::FromRandom();
object_ids.push_back(object_id);
auto data_buffer = std::make_shared<MockObjectBuffer>(object_size, object_id, unpins);
std::unique_ptr<RayObject> object(
new RayObject(data_buffer, nullptr, std::vector<ObjectID>()));
objects.push_back(std::move(object));
}
manager.PinObjects(object_ids, std::move(objects));
// This will spill until 2 workers are occupied.
manager.SpillObjectUptoMaxThroughput();
ASSERT_TRUE(manager.IsSpillingInProgress());
// Spilling is still in progress, so IsSpillingInProgress should keep returning true.
manager.SpillObjectUptoMaxThroughput();
ASSERT_TRUE(manager.IsSpillingInProgress());
// No object ids are spilled yet.
for (const auto &id : object_ids) {
ASSERT_EQ((*unpins)[id], 0);
}
// Spill one object.
std::vector<std::string> urls;
urls.push_back(BuildURL("url" + std::to_string(0)));
ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects({urls[0]}));
ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl());
// Make sure object is spilled.
ASSERT_EQ(object_table.object_urls.size(), 1);
for (auto &object_url : object_table.object_urls) {
if (urls[0] == object_url.second) {
ASSERT_EQ((*unpins)[object_url.first], 1);
}
}
// Now, there's only one object that is currently spilling.
// SpillObjectUptoMaxThroughput will spill one more object (since one worker is
// available).
manager.SpillObjectUptoMaxThroughput();
ASSERT_TRUE(manager.IsSpillingInProgress());
manager.SpillObjectUptoMaxThroughput();
ASSERT_TRUE(manager.IsSpillingInProgress());
// Spilling is done for all objects.
for (size_t i = 1; i < object_ids.size(); i++) {
urls.push_back(BuildURL("url" + std::to_string(i)));
}
for (size_t i = 1; i < urls.size(); i++) {
ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects({urls[i]}));
ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl());
}
ASSERT_EQ(object_table.object_urls.size(), 3);
for (auto &object_url : object_table.object_urls) {
auto it = std::find(urls.begin(), urls.end(), object_url.second);
ASSERT_TRUE(it != urls.end());
ASSERT_EQ((*unpins)[object_url.first], 1);
}
// We cannot spill any more as there are no more pinned objects.
manager.SpillObjectUptoMaxThroughput();
ASSERT_FALSE(manager.IsSpillingInProgress());
}
TEST_F(LocalObjectManagerTest, TestSpillError) {
@ -739,52 +858,6 @@ TEST_F(LocalObjectManagerTest, TestDeleteMaxObjects) {
ASSERT_EQ(deleted_urls_size, free_objects_batch_size);
}
TEST_F(LocalObjectManagerTest,
TestSpillObjectsOfSizeNumBytesToSpillHigherThanMinBytesToSpill) {
/// Test the case SpillObjectsOfSize(num_bytes_to_spill, min_bytes_to_spill
/// where num_bytes_to_spill > min_bytes_to_spill.
rpc::Address owner_address;
owner_address.set_worker_id(WorkerID::FromRandom().Binary());
std::vector<ObjectID> object_ids;
std::vector<std::unique_ptr<RayObject>> objects;
int64_t total_size = 0;
int64_t object_size = 1000;
size_t object_len = 3;
for (size_t i = 0; i < object_len; i++) {
ObjectID object_id = ObjectID::FromRandom();
object_ids.push_back(object_id);
auto data_buffer = std::make_shared<MockObjectBuffer>(object_size, object_id, unpins);
total_size += object_size;
std::unique_ptr<RayObject> object(
new RayObject(data_buffer, nullptr, std::vector<ObjectID>()));
objects.push_back(std::move(object));
}
manager.PinObjects(object_ids, std::move(objects));
// First test when num_bytes_to_spill > min_bytes to spill.
// It means that we cannot spill the num_bytes_required, but we at least spilled the
// required amount, which is the min_bytes_to_spill.
int64_t num_bytes_required = manager.SpillObjectsOfSize(8000, object_size);
// only min bytes to spill is considered.
ASSERT_TRUE(num_bytes_required <= 0);
// Make sure the spilling is done properly.
std::vector<std::string> urls;
for (size_t i = 0; i < object_ids.size(); i++) {
urls.push_back(BuildURL("url" + std::to_string(i)));
}
EXPECT_CALL(worker_pool, PushSpillWorker(_));
ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls));
for (size_t i = 0; i < object_ids.size(); i++) {
ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl());
}
for (size_t i = 0; i < object_ids.size(); i++) {
ASSERT_EQ((*unpins).size(), object_len);
}
}
} // namespace raylet
} // namespace ray

View file

@ -189,9 +189,9 @@ Status raylet::RayletClient::NotifyUnblocked(const TaskID &current_task_id) {
return conn_->WriteMessage(MessageType::NotifyUnblocked, &fbb);
}
Status raylet::RayletClient::NotifyDirectCallTaskBlocked() {
Status raylet::RayletClient::NotifyDirectCallTaskBlocked(bool release_resources) {
flatbuffers::FlatBufferBuilder fbb;
auto message = protocol::CreateNotifyDirectCallTaskBlocked(fbb);
auto message = protocol::CreateNotifyDirectCallTaskBlocked(fbb, release_resources);
fbb.Finish(message);
return conn_->WriteMessage(MessageType::NotifyDirectCallTaskBlocked, &fbb);
}
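With the new argument, a blocked worker can tell the raylet whether its resources should be released while it waits. A hedged caller-side fragment is shown below; the helper name and the choice of `false` are illustrative assumptions, not code from this change.

```cpp
// Hypothetical caller-side fragment; assumes `raylet_client` is a connected
// raylet::RayletClient (include of its header omitted here).
ray::Status NotifyBlockedWithoutReleasingResources(
    raylet::RayletClient &raylet_client) {
  // Tell the raylet this worker is blocked, but ask it to keep the worker's
  // resources allocated while it waits.
  return raylet_client.NotifyDirectCallTaskBlocked(/*release_resources=*/false);
}
```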

View file

@ -256,8 +256,9 @@ class RayletClient : public RayletClientInterface {
/// Notify the raylet that this client is blocked. This is only used for direct task
/// calls. Note that ordering of this with respect to Unblock calls is important.
///
/// \return ray::Status.
ray::Status NotifyDirectCallTaskBlocked();
/// \param release_resources: true if the direct call blocking needs to release
/// resources.
/// \return ray::Status.
ray::Status NotifyDirectCallTaskBlocked(bool release_resources);
/// Notify the raylet that this client is unblocked. This is only used for direct task
/// calls. Note that ordering of this with respect to Block calls is important.