Mirror of https://github.com/vale981/ray (synced 2025-03-06 10:31:39 -05:00)

Remove memory quota enforcement from actors (#11480)

* wip
* fix
* deprecate

Parent commit: 8c82369cad
This commit: e8c77e2847
5 changed files with 1 addition and 81 deletions
@@ -249,8 +249,6 @@ In the above example, the memory quota is specified statically by the decorator,

     # override the memory quota to 1GiB when creating the actor
     SomeActor.options(memory=1000 * 1024 * 1024).remote(a=1, b=2)

-**Enforcement**: If an actor exceeds its memory quota, calls to it will throw ``RayOutOfMemoryError`` and it may be killed. Memory quota is currently enforced on a best-effort basis for actors only (but quota is taken into account during scheduling in all cases).
-
 Questions or Issues?
 --------------------
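For orientation, here is a minimal, hedged sketch of what the surviving documentation describes: "memory" remains a resource request that Ray takes into account during scheduling, and it can still be overridden per actor via .options(). Only the options(...) call comes from the doc snippet above; the SomeActor definition and the values below are illustrative, and after this commit exceeding the request no longer raises RayOutOfMemoryError.

import ray

ray.init()

# Illustrative actor definition; only the .options(...) call below appears in
# the documentation snippet in the hunk above.
@ray.remote(memory=500 * 1024 * 1024)
class SomeActor:
    def __init__(self, a, b):
        self.a, self.b = a, b

# Override the memory request to 1000 MiB at creation time. The value still
# informs placement, but exceeding it is no longer enforced at runtime.
actor = SomeActor.options(memory=1000 * 1024 * 1024).remote(a=1, b=2)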
@@ -397,11 +397,6 @@ cdef execute_task(
         next_title = f"ray::{class_name}"
         pid = os.getpid()
         worker_name = f"ray_{class_name}_{pid}"
-        if c_resources.find(b"memory") != c_resources.end():
-            worker.memory_monitor.set_heap_limit(
-                worker_name,
-                ray_constants.from_memory_units(
-                    dereference(c_resources.find(b"memory")).second))
         if c_resources.find(b"object_store_memory") != c_resources.end():
             worker.core_worker.set_object_store_client_options(
                 worker_name,
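The deleted branch above is Cython and wired a task's "memory" resource into the worker's MemoryMonitor. As a rough Python sketch of that pre-commit behavior (the function name and the resources dict are illustrative; set_heap_limit and from_memory_units are the real calls visible in the hunks, and both belong to the code path this commit removes):

import ray.ray_constants as ray_constants


def arm_heap_limit(resources, memory_monitor, worker_name):
    # Sketch of the removed logic: if the task declares a "memory" resource,
    # convert it from scheduler memory units to bytes and register it as the
    # worker's heap limit so the monitor can enforce it.
    memory_units = resources.get("memory")
    if memory_units is not None:
        limit_bytes = ray_constants.from_memory_units(memory_units)
        memory_monitor.set_heap_limit(worker_name, limit_bytes)
    # After this commit, the "memory" resource only affects scheduling.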
@@ -78,8 +78,6 @@ class MemoryMonitor:
         # throttle this check at most once a second or so.
         self.check_interval = check_interval
         self.last_checked = 0
-        self.heap_limit = None
-        self.worker_name = None
         try:
             self.error_threshold = float(
                 os.getenv("RAY_MEMORY_MONITOR_ERROR_THRESHOLD"))
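As the context lines show, the monitor's error threshold is read from an environment variable and parsed with float(). A minimal usage sketch; the value 0.95 is illustrative and assumes the usual fraction-of-total-memory interpretation defined elsewhere in this class:

import os

# Must be set before the MemoryMonitor is constructed, since the value is
# read once in __init__ via float(os.getenv(...)).
os.environ["RAY_MEMORY_MONITOR_ERROR_THRESHOLD"] = "0.95"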
@@ -98,10 +96,6 @@ class MemoryMonitor:
                 "`pip install psutil` (or ray[debug]) to enable "
                 "debugging of memory-related crashes.")

-    def set_heap_limit(self, worker_name, limit_bytes):
-        self.heap_limit = limit_bytes
-        self.worker_name = worker_name
-
     def get_memory_usage(self):
         psutil_mem = psutil.virtual_memory()
         total_gb = psutil_mem.total / (1024**3)
@@ -140,17 +134,3 @@ class MemoryMonitor:
                     self.error_threshold))
             else:
                 logger.debug(f"Memory usage is {used_gb} / {total_gb}")
-
-        if self.heap_limit:
-            mem_info = psutil.Process(os.getpid()).memory_info()
-            heap_size = get_rss(mem_info)
-            if heap_size > self.heap_limit:
-                raise RayOutOfMemoryError(
-                    "Heap memory usage for {} is {} / {} GiB limit".format(
-                        self.worker_name, round(heap_size / (1024**3), 4),
-                        round(self.heap_limit / (1024**3), 4)))
-            elif heap_size > 0.8 * self.heap_limit:
-                logger.warning(
-                    "Heap memory usage for {} is {} / {} GiB limit".format(
-                        self.worker_name, round(heap_size / (1024**3), 4),
-                        round(self.heap_limit / (1024**3), 4)))
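With this per-worker heap check removed, an application that still wants a similar guard has to implement it itself. A minimal sketch using the same psutil call the deleted code relied on; the plain .rss read, the built-in MemoryError, and the 0.8 warning fraction only approximate the removed logic:

import os

import psutil


def check_heap(limit_bytes, worker_name="self", warn_fraction=0.8):
    """Approximate, user-level stand-in for the removed heap check."""
    heap_size = psutil.Process(os.getpid()).memory_info().rss
    if heap_size > limit_bytes:
        raise MemoryError(
            "Heap memory usage for {} is {} / {} GiB limit".format(
                worker_name, round(heap_size / (1024**3), 4),
                round(limit_bytes / (1024**3), 4)))
    elif heap_size > warn_fraction * limit_bytes:
        print("Warning: heap memory usage for {} is {} / {} GiB limit".format(
            worker_name, round(heap_size / (1024**3), 4),
            round(limit_bytes / (1024**3), 4)))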
@@ -68,26 +68,6 @@ class TestMemoryScheduling(unittest.TestCase):
         finally:
             ray.shutdown()

-    def testTuneDriverHeapLimit(self):
-        try:
-            ray.init(num_cpus=4, _memory=100 * MB)
-            _register_all()
-            result = tune.run(
-                "PG",
-                stop={"timesteps_total": 10000},
-                config={
-                    "env": "CartPole-v0",
-                    "memory": 100 * 1024 * 1024,  # too little
-                    "framework": "tf",
-                },
-                raise_on_failed_trial=False)
-            self.assertEqual(result.trials[0].status, "ERROR")
-            self.assertTrue(
-                "RayOutOfMemoryError: Heap memory usage for ray_PG_" in
-                result.trials[0].error_msg)
-        finally:
-            ray.shutdown()
-
     def testTuneDriverStoreLimit(self):
         try:
             ray.init(
@@ -111,27 +91,6 @@ class TestMemoryScheduling(unittest.TestCase):
         finally:
             ray.shutdown()

-    def testTuneWorkerHeapLimit(self):
-        try:
-            ray.init(num_cpus=4, _memory=100 * MB)
-            _register_all()
-            result = tune.run(
-                "PG",
-                stop={"timesteps_total": 10000},
-                config={
-                    "env": "CartPole-v0",
-                    "num_workers": 1,
-                    "memory_per_worker": 100 * 1024 * 1024,  # too little
-                    "framework": "tf",
-                },
-                raise_on_failed_trial=False)
-            self.assertEqual(result.trials[0].status, "ERROR")
-            self.assertTrue(
-                "RayOutOfMemoryError: Heap memory usage for ray_Rollout" in
-                result.trials[0].error_msg)
-        finally:
-            ray.shutdown()
-
     def testTuneWorkerStoreLimit(self):
         try:
             ray.init(
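The two deleted tests asserted that a deliberately small heap quota made the trial fail with RayOutOfMemoryError; the object-store-limit tests are kept. After this commit, memory requests still influence placement but no longer fail a running task. A small, hedged sketch of that remaining behavior (the values are illustrative):

import ray

# Cap the cluster's schedulable memory resource at 100 MiB.
ray.init(num_cpus=1, _memory=100 * 1024 * 1024)


@ray.remote(memory=50 * 1024 * 1024)
def fits():
    # The request fits within the cluster's memory resource, so it schedules.
    return "ok"


@ray.remote(memory=200 * 1024 * 1024)
def too_big():
    # The request exceeds what any node offers, so this task stays pending
    # (infeasible) instead of raising RayOutOfMemoryError at runtime.
    return "never runs"


print(ray.get(fits.remote()))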
@@ -293,22 +293,10 @@ COMMON_CONFIG: TrainerConfigDict = {
     # Number of CPUs to allocate for the trainer. Note: this only takes effect
     # when running in Tune. Otherwise, the trainer runs in the main program.
     "num_cpus_for_driver": 1,
-    # You can set these memory quotas to tell Ray to reserve memory for your
-    # training run. This guarantees predictable execution, but the tradeoff is
-    # if your workload exceeeds the memory quota it will fail.
-    # Heap memory to reserve for the trainer process (0 for unlimited). This
-    # can be large if your are using large train batches, replay buffers, etc.
+    # Deprecated.
     "memory": 0,
-    # Object store memory to reserve for the trainer process. Being large
-    # enough to fit a few copies of the model weights should be sufficient.
-    # This is enabled by default since models are typically quite small.
     "object_store_memory": 0,
-    # Heap memory to reserve for each worker. Should generally be small unless
-    # your environment is very heavyweight.
     "memory_per_worker": 0,
-    # Object store memory to reserve for each worker. This only needs to be
-    # large enough to fit a few sample batches at a time. This is enabled
-    # by default since it almost never needs to be larger than ~200MB.
     "object_store_memory_per_worker": 0,

     # === Offline Datasets ===
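After this commit the four memory-related keys remain in COMMON_CONFIG, but their documentation collapses to "Deprecated.": the quota is no longer enforced while a trainer runs. An illustrative user config showing the keys as they now appear (the env and num_workers entries are taken from the removed tests above, not from this hunk):

config = {
    "env": "CartPole-v0",
    "num_workers": 1,
    "num_cpus_for_driver": 1,
    # Deprecated: kept for compatibility, no longer an enforced runtime quota.
    "memory": 0,
    "object_store_memory": 0,
    "memory_per_worker": 0,
    "object_store_memory_per_worker": 0,
}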