ray/doc/kubernetes/example_scripts/job_example.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

56 lines
1.3 KiB
Python
Raw Normal View History

import sys
import time
from collections import Counter
import ray
""" This script is meant to be run from a pod in the same Kubernetes namespace
as your Ray cluster.
"""
@ray.remote
def gethostname(x):
import platform
import time
time.sleep(0.01)
return x + (platform.node(),)
def wait_for_nodes(expected):
# Wait for all nodes to join the cluster.
while True:
resources = ray.cluster_resources()
node_keys = [key for key in resources if "node" in key]
num_nodes = sum(resources[node_key] for node_key in node_keys)
if num_nodes < expected:
print(
"{} nodes have joined so far, waiting for {} more.".format(
num_nodes, expected - num_nodes
)
)
sys.stdout.flush()
time.sleep(1)
else:
break
def main():
wait_for_nodes(3)
# Check that objects can be transferred from each node to each other node.
for i in range(10):
print("Iteration {}".format(i))
results = [gethostname.remote(gethostname.remote(())) for _ in range(100)]
print(Counter(ray.get(results)))
sys.stdout.flush()
print("Success!")
sys.stdout.flush()
if __name__ == "__main__":
ray.init("ray://example-cluster-ray-head:10001")
main()