diff --git a/doc/source/memory-management.rst b/doc/source/memory-management.rst
index 96e4f7e15..a8ab32098 100644
--- a/doc/source/memory-management.rst
+++ b/doc/source/memory-management.rst
@@ -249,8 +249,6 @@ In the above example, the memory quota is specified statically by the decorator,
     # override the memory quota to 1GiB when creating the actor
     SomeActor.options(memory=1000 * 1024 * 1024).remote(a=1, b=2)
 
-**Enforcement**: If an actor exceeds its memory quota, calls to it will throw ``RayOutOfMemoryError`` and it may be killed. Memory quota is currently enforced on a best-effort basis for actors only (but quota is taken into account during scheduling in all cases).
-
 Questions or Issues?
 --------------------
 
diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx
index 7723a69c9..efbbd5f2e 100644
--- a/python/ray/_raylet.pyx
+++ b/python/ray/_raylet.pyx
@@ -397,11 +397,6 @@ cdef execute_task(
         next_title = f"ray::{class_name}"
         pid = os.getpid()
         worker_name = f"ray_{class_name}_{pid}"
-        if c_resources.find(b"memory") != c_resources.end():
-            worker.memory_monitor.set_heap_limit(
-                worker_name,
-                ray_constants.from_memory_units(
-                    dereference(c_resources.find(b"memory")).second))
         if c_resources.find(b"object_store_memory") != c_resources.end():
             worker.core_worker.set_object_store_client_options(
                 worker_name,
diff --git a/python/ray/memory_monitor.py b/python/ray/memory_monitor.py
index f6d6f352c..c65f46d79 100644
--- a/python/ray/memory_monitor.py
+++ b/python/ray/memory_monitor.py
@@ -78,8 +78,6 @@ class MemoryMonitor:
        # throttle this check at most once a second or so.
         self.check_interval = check_interval
         self.last_checked = 0
-        self.heap_limit = None
-        self.worker_name = None
         try:
             self.error_threshold = float(
                 os.getenv("RAY_MEMORY_MONITOR_ERROR_THRESHOLD"))
@@ -98,10 +96,6 @@ class MemoryMonitor:
                 "`pip install psutil` (or ray[debug]) to enable "
                 "debugging of memory-related crashes.")
 
-    def set_heap_limit(self, worker_name, limit_bytes):
-        self.heap_limit = limit_bytes
-        self.worker_name = worker_name
-
     def get_memory_usage(self):
         psutil_mem = psutil.virtual_memory()
         total_gb = psutil_mem.total / (1024**3)
@@ -140,17 +134,3 @@ class MemoryMonitor:
                         self.error_threshold))
         else:
             logger.debug(f"Memory usage is {used_gb} / {total_gb}")
-
-        if self.heap_limit:
-            mem_info = psutil.Process(os.getpid()).memory_info()
-            heap_size = get_rss(mem_info)
-            if heap_size > self.heap_limit:
-                raise RayOutOfMemoryError(
-                    "Heap memory usage for {} is {} / {} GiB limit".format(
-                        self.worker_name, round(heap_size / (1024**3), 4),
-                        round(self.heap_limit / (1024**3), 4)))
-            elif heap_size > 0.8 * self.heap_limit:
-                logger.warning(
-                    "Heap memory usage for {} is {} / {} GiB limit".format(
-                        self.worker_name, round(heap_size / (1024**3), 4),
-                        round(self.heap_limit / (1024**3), 4)))
diff --git a/python/ray/tests/test_memory_scheduling.py b/python/ray/tests/test_memory_scheduling.py
index 8da6026d8..45ddac50d 100644
--- a/python/ray/tests/test_memory_scheduling.py
+++ b/python/ray/tests/test_memory_scheduling.py
@@ -68,26 +68,6 @@ class TestMemoryScheduling(unittest.TestCase):
         finally:
             ray.shutdown()
 
-    def testTuneDriverHeapLimit(self):
-        try:
-            ray.init(num_cpus=4, _memory=100 * MB)
-            _register_all()
-            result = tune.run(
-                "PG",
-                stop={"timesteps_total": 10000},
-                config={
-                    "env": "CartPole-v0",
-                    "memory": 100 * 1024 * 1024,  # too little
-                    "framework": "tf",
-                },
-                raise_on_failed_trial=False)
-            self.assertEqual(result.trials[0].status, "ERROR")
-            self.assertTrue(
-                "RayOutOfMemoryError: Heap memory usage for ray_PG_" in
-                result.trials[0].error_msg)
-        finally:
-            ray.shutdown()
-
     def testTuneDriverStoreLimit(self):
         try:
             ray.init(
@@ -111,27 +91,6 @@ class TestMemoryScheduling(unittest.TestCase):
         finally:
             ray.shutdown()
 
-    def testTuneWorkerHeapLimit(self):
-        try:
-            ray.init(num_cpus=4, _memory=100 * MB)
-            _register_all()
-            result = tune.run(
-                "PG",
-                stop={"timesteps_total": 10000},
-                config={
-                    "env": "CartPole-v0",
-                    "num_workers": 1,
-                    "memory_per_worker": 100 * 1024 * 1024,  # too little
-                    "framework": "tf",
-                },
-                raise_on_failed_trial=False)
-            self.assertEqual(result.trials[0].status, "ERROR")
-            self.assertTrue(
-                "RayOutOfMemoryError: Heap memory usage for ray_Rollout" in
-                result.trials[0].error_msg)
-        finally:
-            ray.shutdown()
-
     def testTuneWorkerStoreLimit(self):
         try:
             ray.init(
diff --git a/rllib/agents/trainer.py b/rllib/agents/trainer.py
index d50e2e96f..e6925674b 100644
--- a/rllib/agents/trainer.py
+++ b/rllib/agents/trainer.py
@@ -293,22 +293,10 @@ COMMON_CONFIG: TrainerConfigDict = {
     # Number of CPUs to allocate for the trainer. Note: this only takes effect
     # when running in Tune. Otherwise, the trainer runs in the main program.
     "num_cpus_for_driver": 1,
-    # You can set these memory quotas to tell Ray to reserve memory for your
-    # training run. This guarantees predictable execution, but the tradeoff is
-    # if your workload exceeeds the memory quota it will fail.
-    # Heap memory to reserve for the trainer process (0 for unlimited). This
-    # can be large if your are using large train batches, replay buffers, etc.
+    # Deprecated.
     "memory": 0,
-    # Object store memory to reserve for the trainer process. Being large
-    # enough to fit a few copies of the model weights should be sufficient.
-    # This is enabled by default since models are typically quite small.
     "object_store_memory": 0,
-    # Heap memory to reserve for each worker. Should generally be small unless
-    # your environment is very heavyweight.
     "memory_per_worker": 0,
-    # Object store memory to reserve for each worker. This only needs to be
-    # large enough to fit a few sample batches at a time. This is enabled
-    # by default since it almost never needs to be larger than ~200MB.
     "object_store_memory_per_worker": 0,
 
     # === Offline Datasets ===
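
For context when reviewing: below is a minimal, standalone sketch of the heap-limit enforcement that this patch removes (it is not part of the patch itself). It assumes psutil is installed; RayOutOfMemoryError and check_heap_limit are illustrative stand-ins for the real names in ray.memory_monitor, and Ray's internal get_rss() helper is approximated here with psutil's memory_info().rss.

    import os

    import psutil  # assumed available (`pip install psutil`)


    class RayOutOfMemoryError(Exception):
        """Stand-in for ray.memory_monitor.RayOutOfMemoryError."""


    def check_heap_limit(worker_name, heap_limit_bytes):
        # RSS of the current process; the removed code computed this with a
        # get_rss() helper over psutil.Process().memory_info().
        heap_size = psutil.Process(os.getpid()).memory_info().rss
        usage = "Heap memory usage for {} is {} / {} GiB limit".format(
            worker_name, round(heap_size / (1024**3), 4),
            round(heap_limit_bytes / (1024**3), 4))
        if heap_size > heap_limit_bytes:
            # Hard failure once the quota is exceeded.
            raise RayOutOfMemoryError(usage)
        elif heap_size > 0.8 * heap_limit_bytes:
            # The removed code logged a warning above 80% of the limit.
            print("WARNING: " + usage)


    # Example: the removed tests set a 100 MiB quota via the "memory" config.
    check_heap_limit("ray_PG_1234", 100 * 1024 * 1024)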