From 4ab059eaa1158137ea3b6fd0e2319a3836283388 Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Fri, 7 Jan 2022 22:54:50 -0800 Subject: [PATCH] [gcs] Fix the serve standalone tests in HA mode (#21480) CoreWorker hangs before exiting if gcs exits first, due to incorrect ordering of destruction. This PR fixes this. It'll stop the gcs client first and then join the thread. --- .buildkite/pipeline.yml | 2 +- src/ray/core_worker/core_worker.cc | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 2a4b0712a..f2390ce6b 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -331,7 +331,7 @@ --test_env=RAY_gcs_grpc_based_pubsub=1 --test_env=RAY_bootstrap_with_gcs=1 --test_env=RAY_gcs_storage=memory - -- python/ray/serve/... -//python/ray/serve:test_cli -//python/ray/serve:test_standalone + -- python/ray/serve/... # Re-enable after fixing. #- bazel test --config=ci $(./scripts/bazel_export_options) # --test_tag_filters=team:serve diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 1f21046f5..aa2512c1b 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -594,12 +594,13 @@ void CoreWorker::OnNodeRemoved(const NodeID &node_id) { } void CoreWorker::WaitForShutdown() { - if (io_thread_.joinable()) { - io_thread_.join(); - } + // Stop gcs client first since it runs in io_thread_ if (gcs_client_) { gcs_client_->Disconnect(); } + if (io_thread_.joinable()) { + io_thread_.join(); + } if (options_.worker_type == WorkerType::WORKER) { RAY_CHECK(task_execution_service_.stopped()); // Asyncio coroutines could still run after CoreWorker is removed because it is