[gcs] Fix the server standalone tests in HA mode (#21480)

CoreWorker hangs before exiting if GCS exits first, due to incorrect ordering of destruction. This PR fixes this: it stops the GCS client first and then joins the thread.
This commit is contained in:
Yi Cheng 2022-01-07 22:54:50 -08:00 committed by GitHub
parent bdfba88082
commit 4ab059eaa1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 5 additions and 4 deletions

View file

@ -331,7 +331,7 @@
--test_env=RAY_gcs_grpc_based_pubsub=1
--test_env=RAY_bootstrap_with_gcs=1
--test_env=RAY_gcs_storage=memory
-- python/ray/serve/... -//python/ray/serve:test_cli -//python/ray/serve:test_standalone
-- python/ray/serve/...
# Re-enable after fixing.
#- bazel test --config=ci $(./scripts/bazel_export_options)
# --test_tag_filters=team:serve

View file

@ -594,12 +594,13 @@ void CoreWorker::OnNodeRemoved(const NodeID &node_id) {
}
void CoreWorker::WaitForShutdown() {
if (io_thread_.joinable()) {
io_thread_.join();
}
// Stop gcs client first since it runs in io_thread_
if (gcs_client_) {
gcs_client_->Disconnect();
}
if (io_thread_.joinable()) {
io_thread_.join();
}
if (options_.worker_type == WorkerType::WORKER) {
RAY_CHECK(task_execution_service_.stopped());
// Asyncio coroutines could still run after CoreWorker is removed because it is