From f86c4f992cb49418c93cc642cd8a23488b2628fd Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 5 Nov 2020 16:02:04 -0800 Subject: [PATCH] Fix RAY_ENABLE_NEW_SCHEDULER=1 pytest test_advanced_2.py::test_zero_cpus_actor (#11817) --- python/ray/tests/BUILD | 23 ++++++------------- python/ray/tests/test_advanced_2.py | 2 -- src/ray/raylet/node_manager.cc | 8 ++----- .../raylet/scheduling/cluster_task_manager.cc | 12 ++++++---- 4 files changed, 16 insertions(+), 29 deletions(-) diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index a5250eef0..3f64254cc 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -16,13 +16,16 @@ py_test_module_list( "test_actor_advanced.py", "test_advanced.py", "test_advanced_2.py", + "test_array.py", "test_basic.py", "test_basic_2.py", + "test_cancel.py", "test_cli.py", "test_component_failures_3.py", "test_error_ray_not_initialized.py", "test_gcs_fault_tolerance.py", "test_iter.py", + "test_joblib.py", "test_resource_demand_scheduler.py", ], size = "medium", @@ -35,13 +38,10 @@ py_test_module_list( files = [ "test_actor_resources.py", "test_advanced_3.py", - "test_array.py", - "test_cancel.py", "test_component_failures_2.py", "test_dynres.py", "test_global_gc.py", "test_global_state.py", - "test_joblib.py", ], size = "medium", extra_srcs = SRCS, @@ -60,6 +60,7 @@ py_test_module_list( "test_output.py", "test_reference_counting_2.py", "test_unreconstructable_errors.py", + "test_serialization.py", "test_tensorflow.py", "test_object_spilling.py", ], @@ -76,7 +77,6 @@ py_test_module_list( "test_object_manager.py", "test_reconstruction.py", "test_reference_counting.py", - "test_serialization.py", "test_stress.py", "test_stress_sharded.py", "test_multi_tenancy.py", @@ -89,6 +89,7 @@ py_test_module_list( py_test_module_list( files = [ + "test_actor_pool.py", "test_args.py", "test_asyncio.py", "test_asyncio_cluster.py", @@ -102,7 +103,9 @@ py_test_module_list( "test_dask_callback.py", "test_debug_tools.py", "test_job.py", + "test_memstat.py", "test_metrics_agent.py", + "test_microbenchmarks.py", "test_mini.py", "test_monitor.py", "test_node_manager.py", @@ -118,18 +121,6 @@ py_test_module_list( deps = ["//:ray_lib"], ) -py_test_module_list( - files = [ - "test_actor_pool.py", - "test_memstat.py", - "test_microbenchmarks.py", - ], - size = "small", - extra_srcs = SRCS, - tags = ["exclusive", "new_scheduler_broken"], - deps = ["//:ray_lib"], -) - py_test_module_list( files = [ "test_stress_failure.py", diff --git a/python/ray/tests/test_advanced_2.py b/python/ray/tests/test_advanced_2.py index 60fccec7e..f1f96031d 100644 --- a/python/ray/tests/test_advanced_2.py +++ b/python/ray/tests/test_advanced_2.py @@ -14,7 +14,6 @@ import ray.test_utils from ray.test_utils import ( RayTestTimeoutException, wait_for_condition, - new_scheduler_enabled, ) logger = logging.getLogger(__name__) @@ -252,7 +251,6 @@ def test_zero_cpus(shutdown_only): ray.get(x) -@pytest.mark.skipif(new_scheduler_enabled(), reason="zero cpu handling") def test_zero_cpus_actor(ray_start_cluster): cluster = ray_start_cluster cluster.add_node(num_cpus=0) diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 2bbbe35d2..ebc66aa21 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -610,9 +610,7 @@ void NodeManager::WarnResourceDeadlock() { SchedulingResources &local_resources = cluster_resource_map_[self_node_id_]; error_message << "The actor or task with ID " << exemplar.GetTaskSpecification().TaskId() - << " is pending and cannot currently be scheduled. It requires " - << exemplar.GetTaskSpecification().GetRequiredResources().ToString() - << " for execution and " + << " cannot be scheduled right now. It requires " << exemplar.GetTaskSpecification().GetRequiredPlacementResources().ToString() << " for placement, but this node only has remaining " << local_resources.GetAvailableResources().ToString() << ". In total there are " @@ -2087,9 +2085,7 @@ void NodeManager::ScheduleTasks( std::ostringstream error_message; error_message << "The actor or task with ID " << task.GetTaskSpecification().TaskId() - << " is infeasible and cannot currently be scheduled. It requires " - << task.GetTaskSpecification().GetRequiredResources().ToString() - << " for execution and " + << " cannot be scheduled right now. It requires " << task.GetTaskSpecification().GetRequiredPlacementResources().ToString() << " for placement, however the cluster currently cannot provide the requested " "resources. The required resources may be added as autoscaling takes place " diff --git a/src/ray/raylet/scheduling/cluster_task_manager.cc b/src/ray/raylet/scheduling/cluster_task_manager.cc index e138618dd..482cf6eb4 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager.cc +++ b/src/ray/raylet/scheduling/cluster_task_manager.cc @@ -30,13 +30,14 @@ bool ClusterTaskManager::SchedulePendingTasks() { // tasks from being scheduled. Work work = *work_it; Task task = std::get<0>(work); - auto request_resources = - task.GetTaskSpecification().GetRequiredResources().GetResourceMap(); + auto placement_resources = + task.GetTaskSpecification().GetRequiredPlacementResources().GetResourceMap(); int64_t _unused; // TODO (Alex): We should distinguish between infeasible tasks and a fully // utilized cluster. std::string node_id_string = cluster_resource_scheduler_->GetBestSchedulableNode( - request_resources, task.GetTaskSpecification().IsActorCreationTask(), &_unused); + placement_resources, task.GetTaskSpecification().IsActorCreationTask(), + &_unused); if (node_id_string.empty()) { // There is no node that has available resources to run the request. // Move on to the next shape. @@ -49,8 +50,9 @@ bool ClusterTaskManager::SchedulePendingTasks() { did_schedule = task_scheduled || did_schedule; } else { // Should spill over to a different node. - cluster_resource_scheduler_->AllocateRemoteTaskResources(node_id_string, - request_resources); + cluster_resource_scheduler_->AllocateRemoteTaskResources( + node_id_string, + task.GetTaskSpecification().GetRequiredResources().GetResourceMap()); NodeID node_id = NodeID::FromBinary(node_id_string); auto node_info_opt = get_node_info_(node_id);