mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
Raise better error message when workers are killed with SIGTERM in k8s (#20557)
In Kubernetes (K8s), SIGTERM almost always means the pod was killed for exceeding its memory limits, so raise a more informative error message in that case.
This commit is contained in:
parent
f0e8d66a85
commit
79911510d3
1 changed file with 7 additions and 1 deletion
|
@@ -799,7 +799,13 @@ cdef CRayStatus task_execution_handler(
|
|||
# https://docs.python.org/3/library/sys.html#sys.exit
|
||||
return CRayStatus.IntentionalSystemExit()
|
||||
else:
|
||||
logger.exception("SystemExit was raised from the worker")
|
||||
msg = "SystemExit was raised from the worker."
|
||||
# In K8s, SIGTERM likely means we hit memory limits, so print
|
||||
# a more informative message there.
|
||||
if "KUBERNETES_SERVICE_HOST" in os.environ:
|
||||
msg += (
|
||||
" The worker may have exceeded K8s pod memory limits.")
|
||||
logger.exception(msg)
|
||||
return CRayStatus.UnexpectedSystemExit()
|
||||
|
||||
return CRayStatus.OK()
|
||||
|
|
Loading…
Add table
Reference in a new issue