[serve] Don't halt main control loop due to exceptions in snapshot logic (#20151)

This commit is contained in:
Edward Oakes 2021-11-09 14:46:15 -08:00 committed by GitHub
parent 215f47bc53
commit 39b3eb9763
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@@ -181,21 +181,30 @@ class ServeController:
deployment_name, new_deployment_info)
async def run_control_loop(self) -> None:
    """Run the controller's periodic control loop; never returns normally.

    Each iteration performs, in order:

    1. ``self.autoscale()``
    2. ``self.http_state.update()`` and
       ``self.deployment_state_manager.update()`` while holding
       ``self.write_lock``
    3. ``self._put_serve_snapshot()``
    4. ``await asyncio.sleep(CONTROL_LOOP_PERIOD_S)``

    Every step is wrapped in its own ``try``/``except Exception`` so that a
    failure is logged with its traceback and the remaining steps (and all
    later iterations) still run.
    """
    # NOTE(edoakes): we catch all exceptions here and simply log them,
    # because an unhandled exception would cause the main control loop to
    # halt, which should *never* happen.
    while True:
        try:
            self.autoscale()
        except Exception:
            logger.exception("Exception in autoscaling.")

        async with self.write_lock:
            try:
                self.http_state.update()
            except Exception:
                logger.exception("Exception updating HTTP state.")

            try:
                self.deployment_state_manager.update()
            except Exception:
                logger.exception("Exception updating deployment state.")

        # NOTE(review): the original indentation was not recoverable from
        # the rendered diff; the snapshot write is assumed to run after the
        # write lock has been released -- confirm against the repository.
        try:
            self._put_serve_snapshot()
        except Exception:
            logger.exception("Exception putting serve snapshot.")

        await asyncio.sleep(CONTROL_LOOP_PERIOD_S)
def _put_serve_snapshot(self) -> None: