Fix RAY_ENABLE_NEW_SCHEDULER=1 pytest test_advanced_2.py::test_zero_cpus_actor (#11817)

This commit is contained in:
Eric Liang 2020-11-05 16:02:04 -08:00 committed by GitHub
parent 347e871409
commit f86c4f992c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 16 additions and 29 deletions

View file

@@ -16,13 +16,16 @@ py_test_module_list(
"test_actor_advanced.py",
"test_advanced.py",
"test_advanced_2.py",
"test_array.py",
"test_basic.py",
"test_basic_2.py",
"test_cancel.py",
"test_cli.py",
"test_component_failures_3.py",
"test_error_ray_not_initialized.py",
"test_gcs_fault_tolerance.py",
"test_iter.py",
"test_joblib.py",
"test_resource_demand_scheduler.py",
],
size = "medium",
@@ -35,13 +38,10 @@ py_test_module_list(
files = [
"test_actor_resources.py",
"test_advanced_3.py",
"test_array.py",
"test_cancel.py",
"test_component_failures_2.py",
"test_dynres.py",
"test_global_gc.py",
"test_global_state.py",
"test_joblib.py",
],
size = "medium",
extra_srcs = SRCS,
@@ -60,6 +60,7 @@ py_test_module_list(
"test_output.py",
"test_reference_counting_2.py",
"test_unreconstructable_errors.py",
"test_serialization.py",
"test_tensorflow.py",
"test_object_spilling.py",
],
@@ -76,7 +77,6 @@ py_test_module_list(
"test_object_manager.py",
"test_reconstruction.py",
"test_reference_counting.py",
"test_serialization.py",
"test_stress.py",
"test_stress_sharded.py",
"test_multi_tenancy.py",
@@ -89,6 +89,7 @@ py_test_module_list(
py_test_module_list(
files = [
"test_actor_pool.py",
"test_args.py",
"test_asyncio.py",
"test_asyncio_cluster.py",
@@ -102,7 +103,9 @@ py_test_module_list(
"test_dask_callback.py",
"test_debug_tools.py",
"test_job.py",
"test_memstat.py",
"test_metrics_agent.py",
"test_microbenchmarks.py",
"test_mini.py",
"test_monitor.py",
"test_node_manager.py",
@@ -118,18 +121,6 @@ py_test_module_list(
deps = ["//:ray_lib"],
)
py_test_module_list(
files = [
"test_actor_pool.py",
"test_memstat.py",
"test_microbenchmarks.py",
],
size = "small",
extra_srcs = SRCS,
tags = ["exclusive", "new_scheduler_broken"],
deps = ["//:ray_lib"],
)
py_test_module_list(
files = [
"test_stress_failure.py",

View file

@@ -14,7 +14,6 @@ import ray.test_utils
from ray.test_utils import (
RayTestTimeoutException,
wait_for_condition,
new_scheduler_enabled,
)
logger = logging.getLogger(__name__)
@@ -252,7 +251,6 @@ def test_zero_cpus(shutdown_only):
ray.get(x)
@pytest.mark.skipif(new_scheduler_enabled(), reason="zero cpu handling")
def test_zero_cpus_actor(ray_start_cluster):
cluster = ray_start_cluster
cluster.add_node(num_cpus=0)

View file

@@ -610,9 +610,7 @@ void NodeManager::WarnResourceDeadlock() {
SchedulingResources &local_resources = cluster_resource_map_[self_node_id_];
error_message
<< "The actor or task with ID " << exemplar.GetTaskSpecification().TaskId()
<< " is pending and cannot currently be scheduled. It requires "
<< exemplar.GetTaskSpecification().GetRequiredResources().ToString()
<< " for execution and "
<< " cannot be scheduled right now. It requires "
<< exemplar.GetTaskSpecification().GetRequiredPlacementResources().ToString()
<< " for placement, but this node only has remaining "
<< local_resources.GetAvailableResources().ToString() << ". In total there are "
@@ -2087,9 +2085,7 @@ void NodeManager::ScheduleTasks(
std::ostringstream error_message;
error_message
<< "The actor or task with ID " << task.GetTaskSpecification().TaskId()
<< " is infeasible and cannot currently be scheduled. It requires "
<< task.GetTaskSpecification().GetRequiredResources().ToString()
<< " for execution and "
<< " cannot be scheduled right now. It requires "
<< task.GetTaskSpecification().GetRequiredPlacementResources().ToString()
<< " for placement, however the cluster currently cannot provide the requested "
"resources. The required resources may be added as autoscaling takes place "

View file

@@ -30,13 +30,14 @@ bool ClusterTaskManager::SchedulePendingTasks() {
// tasks from being scheduled.
Work work = *work_it;
Task task = std::get<0>(work);
auto request_resources =
task.GetTaskSpecification().GetRequiredResources().GetResourceMap();
auto placement_resources =
task.GetTaskSpecification().GetRequiredPlacementResources().GetResourceMap();
int64_t _unused;
// TODO (Alex): We should distinguish between infeasible tasks and a fully
// utilized cluster.
std::string node_id_string = cluster_resource_scheduler_->GetBestSchedulableNode(
request_resources, task.GetTaskSpecification().IsActorCreationTask(), &_unused);
placement_resources, task.GetTaskSpecification().IsActorCreationTask(),
&_unused);
if (node_id_string.empty()) {
// There is no node that has available resources to run the request.
// Move on to the next shape.
@@ -49,8 +50,9 @@ bool ClusterTaskManager::SchedulePendingTasks() {
did_schedule = task_scheduled || did_schedule;
} else {
// Should spill over to a different node.
cluster_resource_scheduler_->AllocateRemoteTaskResources(node_id_string,
request_resources);
cluster_resource_scheduler_->AllocateRemoteTaskResources(
node_id_string,
task.GetTaskSpecification().GetRequiredResources().GetResourceMap());
NodeID node_id = NodeID::FromBinary(node_id_string);
auto node_info_opt = get_node_info_(node_id);