Remove memory quota enforcement from actors (#11480)

* wip

* fix

* deprecate
Author: Eric Liang, 2020-10-21 14:29:03 -07:00 (committed by GitHub)
parent 8c82369cad
commit e8c77e2847
5 changed files with 1 addition and 81 deletions


@@ -249,8 +249,6 @@ In the above example, the memory quota is specified statically by the decorator,
     # override the memory quota to 1GiB when creating the actor
     SomeActor.options(memory=1000 * 1024 * 1024).remote(a=1, b=2)
-**Enforcement**: If an actor exceeds its memory quota, calls to it will throw ``RayOutOfMemoryError`` and it may be killed. Memory quota is currently enforced on a best-effort basis for actors only (but quota is taken into account during scheduling in all cases).
 Questions or Issues?
 --------------------
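
For reference, a minimal sketch of the user-facing API the docs hunk above touches, assuming a local ray.init() (the 500 MiB decorator default is an arbitrary illustration): after this change, the memory value attached to an actor still counts toward scheduling, but it is no longer enforced as a heap limit on the actor process.

import ray

ray.init()

# Default memory quota set by the decorator (500 MiB here, chosen only for
# illustration); used for scheduling decisions, not enforced as a heap limit.
@ray.remote(memory=500 * 1024 * 1024)
class SomeActor:
    def __init__(self, a, b):
        self.a, self.b = a, b

# Override the memory quota to 1GiB when creating the actor.
actor = SomeActor.options(memory=1000 * 1024 * 1024).remote(a=1, b=2)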


@@ -397,11 +397,6 @@ cdef execute_task(
         next_title = f"ray::{class_name}"
         pid = os.getpid()
         worker_name = f"ray_{class_name}_{pid}"
-        if c_resources.find(b"memory") != c_resources.end():
-            worker.memory_monitor.set_heap_limit(
-                worker_name,
-                ray_constants.from_memory_units(
-                    dereference(c_resources.find(b"memory")).second))
         if c_resources.find(b"object_store_memory") != c_resources.end():
             worker.core_worker.set_object_store_client_options(
                 worker_name,


@@ -78,8 +78,6 @@ class MemoryMonitor:
         # throttle this check at most once a second or so.
         self.check_interval = check_interval
         self.last_checked = 0
-        self.heap_limit = None
-        self.worker_name = None
         try:
             self.error_threshold = float(
                 os.getenv("RAY_MEMORY_MONITOR_ERROR_THRESHOLD"))
@@ -98,10 +96,6 @@ class MemoryMonitor:
                 "`pip install psutil` (or ray[debug]) to enable "
                 "debugging of memory-related crashes.")
-    def set_heap_limit(self, worker_name, limit_bytes):
-        self.heap_limit = limit_bytes
-        self.worker_name = worker_name
     def get_memory_usage(self):
         psutil_mem = psutil.virtual_memory()
         total_gb = psutil_mem.total / (1024**3)
@@ -140,17 +134,3 @@ class MemoryMonitor:
                         self.error_threshold))
             else:
                 logger.debug(f"Memory usage is {used_gb} / {total_gb}")
-            if self.heap_limit:
-                mem_info = psutil.Process(os.getpid()).memory_info()
-                heap_size = get_rss(mem_info)
-                if heap_size > self.heap_limit:
-                    raise RayOutOfMemoryError(
-                        "Heap memory usage for {} is {} / {} GiB limit".format(
-                            self.worker_name, round(heap_size / (1024**3), 4),
-                            round(self.heap_limit / (1024**3), 4)))
-                elif heap_size > 0.8 * self.heap_limit:
-                    logger.warning(
-                        "Heap memory usage for {} is {} / {} GiB limit".format(
-                            self.worker_name, round(heap_size / (1024**3), 4),
-                            round(self.heap_limit / (1024**3), 4)))
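
For context, a minimal standalone sketch of the psutil calls this monitor builds on: psutil.virtual_memory() backs the node-wide check that remains, while per-process RSS from psutil.Process(...).memory_info() was the input to the heap-limit check deleted above (read there via the get_rss() helper).

import os

import psutil

# Node-wide usage, roughly what MemoryMonitor.get_memory_usage() reports.
vm = psutil.virtual_memory()
total_gb = vm.total / (1024**3)
used_gb = (vm.total - vm.available) / (1024**3)
print(f"Memory usage is {round(used_gb, 2)} / {round(total_gb, 2)} GiB")

# Per-process resident set size (RSS); this is the heap figure the removed
# check compared against self.heap_limit.
heap_gb = psutil.Process(os.getpid()).memory_info().rss / (1024**3)
print(f"Heap memory usage for this process is {round(heap_gb, 4)} GiB")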


@@ -68,26 +68,6 @@ class TestMemoryScheduling(unittest.TestCase):
         finally:
             ray.shutdown()
-    def testTuneDriverHeapLimit(self):
-        try:
-            ray.init(num_cpus=4, _memory=100 * MB)
-            _register_all()
-            result = tune.run(
-                "PG",
-                stop={"timesteps_total": 10000},
-                config={
-                    "env": "CartPole-v0",
-                    "memory": 100 * 1024 * 1024,  # too little
-                    "framework": "tf",
-                },
-                raise_on_failed_trial=False)
-            self.assertEqual(result.trials[0].status, "ERROR")
-            self.assertTrue(
-                "RayOutOfMemoryError: Heap memory usage for ray_PG_" in
-                result.trials[0].error_msg)
-        finally:
-            ray.shutdown()
     def testTuneDriverStoreLimit(self):
         try:
             ray.init(
@@ -111,27 +91,6 @@ class TestMemoryScheduling(unittest.TestCase):
         finally:
             ray.shutdown()
-    def testTuneWorkerHeapLimit(self):
-        try:
-            ray.init(num_cpus=4, _memory=100 * MB)
-            _register_all()
-            result = tune.run(
-                "PG",
-                stop={"timesteps_total": 10000},
-                config={
-                    "env": "CartPole-v0",
-                    "num_workers": 1,
-                    "memory_per_worker": 100 * 1024 * 1024,  # too little
-                    "framework": "tf",
-                },
-                raise_on_failed_trial=False)
-            self.assertEqual(result.trials[0].status, "ERROR")
-            self.assertTrue(
-                "RayOutOfMemoryError: Heap memory usage for ray_Rollout" in
-                result.trials[0].error_msg)
-        finally:
-            ray.shutdown()
     def testTuneWorkerStoreLimit(self):
         try:
             ray.init(


@@ -293,22 +293,10 @@ COMMON_CONFIG: TrainerConfigDict = {
     # Number of CPUs to allocate for the trainer. Note: this only takes effect
     # when running in Tune. Otherwise, the trainer runs in the main program.
     "num_cpus_for_driver": 1,
-    # You can set these memory quotas to tell Ray to reserve memory for your
-    # training run. This guarantees predictable execution, but the tradeoff is
-    # if your workload exceeeds the memory quota it will fail.
-    # Heap memory to reserve for the trainer process (0 for unlimited). This
-    # can be large if your are using large train batches, replay buffers, etc.
+    # Deprecated.
     "memory": 0,
-    # Object store memory to reserve for the trainer process. Being large
-    # enough to fit a few copies of the model weights should be sufficient.
-    # This is enabled by default since models are typically quite small.
     "object_store_memory": 0,
-    # Heap memory to reserve for each worker. Should generally be small unless
-    # your environment is very heavyweight.
    "memory_per_worker": 0,
-    # Object store memory to reserve for each worker. This only needs to be
-    # large enough to fit a few sample batches at a time. This is enabled
-    # by default since it almost never needs to be larger than ~200MB.
     "object_store_memory_per_worker": 0,
     # === Offline Datasets ===
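
For illustration only (not part of this diff), a hedged sketch of a Tune run modeled on the deleted heap-limit tests above, using the updated COMMON_CONFIG defaults: the memory-quota keys are now marked deprecated and can simply be left at their default of 0 or omitted.

import ray
from ray import tune

ray.init(num_cpus=4)

# With heap-quota enforcement removed, omitting "memory" and
# "memory_per_worker" (or leaving them at 0) is the expected configuration.
tune.run(
    "PG",
    stop={"timesteps_total": 10000},
    config={
        "env": "CartPole-v0",
        "num_workers": 1,
        "framework": "tf",
    },
    raise_on_failed_trial=False)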