2020-10-02 17:58:44 -07:00
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import logging
|
|
|
|
import requests
|
|
|
|
import time
|
|
|
|
import traceback
|
|
|
|
import ray
|
2020-10-23 16:52:14 -04:00
|
|
|
import pytest
|
2020-10-02 17:58:44 -07:00
|
|
|
from ray.new_dashboard.tests.conftest import * # noqa
|
|
|
|
from ray.test_utils import (
|
|
|
|
format_web_url,
|
|
|
|
wait_until_server_available,
|
|
|
|
)
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
def test_actor_groups(ray_start_with_dashboard):
|
|
|
|
@ray.remote
|
|
|
|
class Foo:
|
|
|
|
def __init__(self, num):
|
|
|
|
self.num = num
|
|
|
|
|
|
|
|
def do_task(self):
|
|
|
|
return self.num
|
|
|
|
|
|
|
|
@ray.remote(num_gpus=1)
|
|
|
|
class InfeasibleActor:
|
|
|
|
pass
|
|
|
|
|
|
|
|
foo_actors = [Foo.remote(4), Foo.remote(5)]
|
|
|
|
infeasible_actor = InfeasibleActor.remote() # noqa
|
|
|
|
results = [actor.do_task.remote() for actor in foo_actors] # noqa
|
|
|
|
webui_url = ray_start_with_dashboard["webui_url"]
|
2020-10-23 16:52:14 -04:00
|
|
|
assert wait_until_server_available(webui_url)
|
2020-10-02 17:58:44 -07:00
|
|
|
webui_url = format_web_url(webui_url)
|
|
|
|
|
2020-12-19 13:22:24 -08:00
|
|
|
timeout_seconds = 10
|
2020-10-02 17:58:44 -07:00
|
|
|
start_time = time.time()
|
|
|
|
last_ex = None
|
|
|
|
while True:
|
|
|
|
time.sleep(1)
|
|
|
|
try:
|
|
|
|
response = requests.get(webui_url + "/logical/actor_groups")
|
|
|
|
response.raise_for_status()
|
|
|
|
actor_groups_resp = response.json()
|
|
|
|
assert actor_groups_resp["result"] is True, actor_groups_resp[
|
|
|
|
"msg"]
|
|
|
|
actor_groups = actor_groups_resp["data"]["actorGroups"]
|
|
|
|
assert "Foo" in actor_groups
|
|
|
|
summary = actor_groups["Foo"]["summary"]
|
|
|
|
# 2 __init__ tasks and 2 do_task tasks
|
|
|
|
assert summary["numExecutedTasks"] == 4
|
|
|
|
assert summary["stateToCount"]["ALIVE"] == 2
|
|
|
|
|
|
|
|
entries = actor_groups["Foo"]["entries"]
|
2020-12-12 23:34:24 -08:00
|
|
|
foo_entry = entries[0]
|
|
|
|
assert type(foo_entry["gpus"]) is list
|
|
|
|
assert "timestamp" in foo_entry
|
|
|
|
assert "actorConstructor" in foo_entry
|
|
|
|
assert "actorClass" in foo_entry
|
|
|
|
assert "actorId" in foo_entry
|
|
|
|
assert "ipAddress" in foo_entry
|
2020-10-02 17:58:44 -07:00
|
|
|
assert len(entries) == 2
|
|
|
|
assert "InfeasibleActor" in actor_groups
|
|
|
|
|
|
|
|
entries = actor_groups["InfeasibleActor"]["entries"]
|
|
|
|
assert "requiredResources" in entries[0]
|
|
|
|
assert "GPU" in entries[0]["requiredResources"]
|
|
|
|
break
|
|
|
|
except Exception as ex:
|
|
|
|
last_ex = ex
|
2020-12-23 11:14:23 +08:00
|
|
|
finally:
|
|
|
|
if time.time() > start_time + timeout_seconds:
|
|
|
|
ex_stack = traceback.format_exception(
|
|
|
|
type(last_ex), last_ex,
|
|
|
|
last_ex.__traceback__) if last_ex else []
|
|
|
|
ex_stack = "".join(ex_stack)
|
|
|
|
raise Exception(f"Timed out while testing, {ex_stack}")
|
|
|
|
|
|
|
|
|
|
|
|
def test_actors(disable_aiohttp_cache, ray_start_with_dashboard):
|
|
|
|
@ray.remote
|
|
|
|
class Foo:
|
|
|
|
def __init__(self, num):
|
|
|
|
self.num = num
|
|
|
|
|
|
|
|
def do_task(self):
|
|
|
|
return self.num
|
|
|
|
|
|
|
|
@ray.remote(num_gpus=1)
|
|
|
|
class InfeasibleActor:
|
|
|
|
pass
|
|
|
|
|
|
|
|
foo_actors = [Foo.remote(4), Foo.remote(5)]
|
|
|
|
infeasible_actor = InfeasibleActor.remote() # noqa
|
|
|
|
results = [actor.do_task.remote() for actor in foo_actors] # noqa
|
|
|
|
webui_url = ray_start_with_dashboard["webui_url"]
|
|
|
|
assert wait_until_server_available(webui_url)
|
|
|
|
webui_url = format_web_url(webui_url)
|
|
|
|
|
|
|
|
timeout_seconds = 5
|
|
|
|
start_time = time.time()
|
|
|
|
last_ex = None
|
|
|
|
while True:
|
|
|
|
time.sleep(1)
|
|
|
|
try:
|
|
|
|
resp = requests.get(f"{webui_url}/logical/actors")
|
|
|
|
resp_json = resp.json()
|
|
|
|
resp_data = resp_json["data"]
|
|
|
|
actors = resp_data["actors"]
|
|
|
|
assert len(actors) == 3
|
|
|
|
one_entry = list(actors.values())[0]
|
|
|
|
assert "jobId" in one_entry
|
|
|
|
assert "taskSpec" in one_entry
|
|
|
|
assert "functionDescriptor" in one_entry["taskSpec"]
|
|
|
|
assert type(one_entry["taskSpec"]["functionDescriptor"]) is dict
|
|
|
|
assert "address" in one_entry
|
|
|
|
assert type(one_entry["address"]) is dict
|
|
|
|
assert "state" in one_entry
|
|
|
|
assert "name" in one_entry
|
|
|
|
assert "numRestarts" in one_entry
|
|
|
|
assert "pid" in one_entry
|
|
|
|
all_pids = [entry["pid"] for entry in actors.values()]
|
|
|
|
assert 0 in all_pids # The infeasible actor
|
|
|
|
assert len(all_pids) > 1
|
|
|
|
break
|
|
|
|
except Exception as ex:
|
|
|
|
last_ex = ex
|
2020-10-02 17:58:44 -07:00
|
|
|
finally:
|
|
|
|
if time.time() > start_time + timeout_seconds:
|
|
|
|
ex_stack = traceback.format_exception(
|
|
|
|
type(last_ex), last_ex,
|
|
|
|
last_ex.__traceback__) if last_ex else []
|
|
|
|
ex_stack = "".join(ex_stack)
|
|
|
|
raise Exception(f"Timed out while testing, {ex_stack}")
|
|
|
|
|
|
|
|
|
2020-10-23 16:52:14 -04:00
|
|
|
def test_kill_actor(ray_start_with_dashboard):
|
|
|
|
@ray.remote
|
|
|
|
class Actor:
|
|
|
|
def __init__(self):
|
|
|
|
pass
|
|
|
|
|
|
|
|
def f(self):
|
|
|
|
ray.show_in_dashboard("test")
|
|
|
|
return os.getpid()
|
|
|
|
|
|
|
|
a = Actor.remote()
|
|
|
|
worker_pid = ray.get(a.f.remote()) # noqa
|
|
|
|
|
|
|
|
webui_url = ray_start_with_dashboard["webui_url"]
|
|
|
|
assert wait_until_server_available(webui_url)
|
|
|
|
webui_url = format_web_url(webui_url)
|
|
|
|
|
|
|
|
def actor_killed(pid):
|
|
|
|
"""Check For the existence of a unix pid."""
|
|
|
|
try:
|
|
|
|
os.kill(pid, 0)
|
|
|
|
except OSError:
|
|
|
|
return True
|
|
|
|
else:
|
|
|
|
return False
|
|
|
|
|
|
|
|
def get_actor():
|
|
|
|
resp = requests.get(f"{webui_url}/logical/actor_groups")
|
|
|
|
resp.raise_for_status()
|
|
|
|
actor_groups_resp = resp.json()
|
|
|
|
assert actor_groups_resp["result"] is True, actor_groups_resp["msg"]
|
|
|
|
actor_groups = actor_groups_resp["data"]["actorGroups"]
|
|
|
|
actor = actor_groups["Actor"]["entries"][0]
|
|
|
|
return actor
|
|
|
|
|
|
|
|
def kill_actor_using_dashboard(actor):
|
|
|
|
resp = requests.get(
|
|
|
|
webui_url + "/logical/kill_actor",
|
|
|
|
params={
|
|
|
|
"actorId": actor["actorId"],
|
|
|
|
"ipAddress": actor["ipAddress"],
|
|
|
|
"port": actor["port"]
|
|
|
|
})
|
|
|
|
resp.raise_for_status()
|
|
|
|
resp_json = resp.json()
|
|
|
|
assert resp_json["result"] is True, "msg" in resp_json
|
|
|
|
|
|
|
|
start = time.time()
|
|
|
|
last_exc = None
|
|
|
|
while time.time() - start <= 10:
|
|
|
|
try:
|
|
|
|
actor = get_actor()
|
|
|
|
kill_actor_using_dashboard(actor)
|
|
|
|
last_exc = None
|
|
|
|
break
|
|
|
|
except (KeyError, AssertionError) as e:
|
|
|
|
last_exc = e
|
|
|
|
time.sleep(.1)
|
|
|
|
assert last_exc is None
|
|
|
|
|
|
|
|
|
2020-10-02 17:58:44 -07:00
|
|
|
if __name__ == "__main__":
|
|
|
|
sys.exit(pytest.main(["-v", __file__]))
|