2022-07-13 10:53:24 +01:00
|
|
|
import os
|
|
|
|
import subprocess
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
import ray
|
2022-07-14 22:08:43 +01:00
|
|
|
from typing import List, Dict, Union, Callable
|
2022-07-13 10:53:24 +01:00
|
|
|
|
|
|
|
|
|
|
|
def _schedule_remote_fn_on_node(node_ip: str, remote_fn, *args, **kwargs):
|
|
|
|
return remote_fn.options(resources={f"node:{node_ip}": 0.01}).remote(
|
|
|
|
*args,
|
|
|
|
**kwargs,
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def _schedule_remote_fn_on_all_nodes(
|
|
|
|
remote_fn, exclude_head: bool = False, *args, **kwargs
|
|
|
|
):
|
|
|
|
head_ip = ray.util.get_node_ip_address()
|
|
|
|
|
|
|
|
futures = []
|
|
|
|
for node in ray.nodes():
|
|
|
|
if not node["Alive"]:
|
|
|
|
continue
|
|
|
|
|
|
|
|
node_ip = node["NodeManagerAddress"]
|
|
|
|
|
|
|
|
if exclude_head and node_ip == head_ip:
|
|
|
|
continue
|
|
|
|
|
|
|
|
future = _schedule_remote_fn_on_node(node_ip, remote_fn, *args, **kwargs)
|
|
|
|
futures.append(future)
|
|
|
|
return futures
|
|
|
|
|
|
|
|
|
|
|
|
@ray.remote
|
|
|
|
def _write(stream: bytes, path: str):
|
|
|
|
Path(path).parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
with open(path, "wb") as f:
|
|
|
|
f.write(stream)
|
|
|
|
|
|
|
|
|
|
|
|
def upload_file_to_all_nodes(path: str):
|
|
|
|
path = os.path.abspath(path)
|
|
|
|
|
|
|
|
with open(path, "rb") as f:
|
|
|
|
stream = f.read()
|
|
|
|
|
|
|
|
futures = _schedule_remote_fn_on_all_nodes(
|
|
|
|
_write, exclude_head=True, stream=stream, path=path
|
|
|
|
)
|
|
|
|
return ray.get(futures)
|
|
|
|
|
|
|
|
|
|
|
|
@ray.remote
|
|
|
|
def _run_command(cmd: str):
|
|
|
|
return subprocess.check_call(cmd)
|
|
|
|
|
|
|
|
|
|
|
|
def run_command_on_all_nodes(cmd: List[str]):
|
|
|
|
futures = _schedule_remote_fn_on_all_nodes(_run_command, cmd=cmd)
|
|
|
|
return ray.get(futures)
|
|
|
|
|
|
|
|
|
|
|
|
def run_commands_with_resources(
|
2022-07-14 22:08:43 +01:00
|
|
|
cmds: List[List[str]], resources: Dict[str, Union[float, int]]
|
2022-07-13 10:53:24 +01:00
|
|
|
):
|
|
|
|
num_cpus = resources.pop("CPU", 1)
|
|
|
|
num_gpus = resources.pop("GPU", 0)
|
|
|
|
futures = []
|
|
|
|
for cmd in cmds:
|
|
|
|
future = _run_command.options(
|
|
|
|
num_cpus=num_cpus, num_gpus=num_gpus, resources=resources
|
|
|
|
).remote(cmd=cmd)
|
|
|
|
futures.append(future)
|
|
|
|
return ray.get(futures)
|
2022-07-14 22:08:43 +01:00
|
|
|
|
|
|
|
|
|
|
|
@ray.remote
|
|
|
|
class CommandRunner:
|
|
|
|
def run_command(self, cmd: str):
|
|
|
|
return subprocess.check_call(cmd)
|
|
|
|
|
|
|
|
def run_fn(self, fn: Callable, *args, **kwargs):
|
|
|
|
return fn(*args, **kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
def create_actors_with_resources(
|
|
|
|
num_actors: int, resources: Dict[str, Union[float, int]]
|
|
|
|
) -> List[ray.actor.ActorHandle]:
|
|
|
|
num_cpus = resources.pop("CPU", 1)
|
|
|
|
num_gpus = resources.pop("GPU", 0)
|
|
|
|
|
|
|
|
return [
|
|
|
|
CommandRunner.options(
|
|
|
|
num_cpus=num_cpus, num_gpus=num_gpus, resources=resources
|
|
|
|
).remote()
|
|
|
|
for _ in range(num_actors)
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
def run_commands_on_actors(actors: List[ray.actor.ActorHandle], cmds: List[List[str]]):
|
|
|
|
assert len(actors) == len(cmds)
|
|
|
|
futures = []
|
|
|
|
for actor, cmd in zip(actors, cmds):
|
|
|
|
futures.append(actor.run_command.remote(cmd))
|
|
|
|
return ray.get(futures)
|
|
|
|
|
|
|
|
|
|
|
|
def run_fn_on_actors(
|
|
|
|
actors: List[ray.actor.ActorHandle], fn: Callable, *args, **kwargs
|
|
|
|
):
|
|
|
|
futures = []
|
|
|
|
for actor in actors:
|
|
|
|
futures.append(actor.run_fn.remote(fn, *args, **kwargs))
|
|
|
|
return ray.get(futures)
|