# ray/release/long_running_tests/workloads/actor_deaths.py

# This workload tests repeatedly killing actors and submitting tasks to them.

import json
import numpy as np
import os
import sys
import time

import ray
from ray.cluster_utils import Cluster


def update_progress(result):
    """Stamp ``result`` with the current time and dump it to disk as JSON.

    The destination is read from the ``TEST_OUTPUT_JSON`` environment
    variable and falls back to ``/tmp/release_test_output.json``. The file is
    overwritten on every call.

    Args:
        result: JSON-serializable dict of progress metrics. It is modified in
            place: a ``"last_update"`` key holding ``time.time()`` is added.
    """
    output_path = os.environ.get("TEST_OUTPUT_JSON", "/tmp/release_test_output.json")
    # The timestamp is written into the caller's dict, not into a copy.
    result["last_update"] = time.time()
    with open(output_path, "wt") as output_file:
        json.dump(result, output_file)


# Sizing of the simulated cluster. Both memory limits are passed to every
# node that is added to the cluster further down.
num_redis_shards = 1
redis_max_memory = 10 ** 8
object_store_memory = 10 ** 8
num_nodes = 2

# Fail fast if the configured object stores and Redis shards would need more
# than half of what ray reports as the system memory.
required_memory = (
    num_nodes * object_store_memory + num_redis_shards * redis_max_memory
)
memory_budget = ray._private.utils.get_system_memory() / 2
message = (
    "Make sure there is enough memory on this machine to run this "
    "workload. We divide the system memory by 2 to provide a buffer."
)
assert required_memory < memory_budget, message

# Simulate a cluster on one machine.

# Options that are identical for every node in the simulated cluster.
shared_node_options = {
    "num_cpus": 8,
    "num_gpus": 0,
    "object_store_memory": object_store_memory,
    "redis_max_memory": redis_max_memory,
    "dashboard_host": "0.0.0.0",
}

cluster = Cluster()
for node_index in range(num_nodes):
    # Only the first node is given a Redis port and shard count; every other
    # node passes None for both.
    is_first_node = node_index == 0
    cluster.add_node(
        redis_port=6379 if is_first_node else None,
        num_redis_shards=num_redis_shards if is_first_node else None,
        # Custom resource named after the node's index, with quantity 2.
        resources={str(node_index): 2},
        **shared_node_options,
    )
# Connect this driver to the cluster that was just assembled.
ray.init(address=cluster.address)

# Run the workload.

# Number of Parent actors the driver creates and keeps in its `parents` list.
num_parents = 5
# Number of Child actors each Parent creates.
num_children = 5
# NOTE: despite the name, this is the threshold a random draw from
# np.random.rand() must exceed for an actor to be killed, so each check kills
# with probability 1 - death_probability (5% with the value below).
death_probability = 0.95


@ray.remote
class Child(object):
    """Actor whose ``ping`` method randomly terminates it.

    Args:
        death_probability: Threshold compared against a random draw on every
            ping. The actor exits when the draw is greater than this value,
            so larger values make an exit less likely.
    """

    def __init__(self, death_probability):
        self.death_probability = death_probability

    def ping(self):
        """Draw a random number and call ``sys.exit(-1)`` if it exceeds the threshold."""
        survived = np.random.rand() <= self.death_probability
        if not survived:
            # Exit process with some probability.
            sys.exit(-1)


@ray.remote
class Parent(object):
    """Actor that owns a group of ``Child`` actors and pings them in batches.

    Args:
        num_children: Number of ``Child`` actors to create.
        death_probability: Threshold forwarded to every ``Child``; it is also
            stored so that replacement children can be created with the same
            value.
    """

    def __init__(self, num_children, death_probability):
        self.death_probability = death_probability
        spawned = [Child.remote(death_probability) for _ in range(num_children)]
        self.children = spawned

    def ping(self, num_pings):
        """Ping every child ``num_pings`` times and wait for all the results.

        All pings are submitted before any result is awaited. If waiting
        raises any ``Exception``, the whole set of children is replaced by
        re-running ``__init__`` with the same child count and threshold.
        """
        # Same submission order as a round-by-round loop: for each round,
        # one ping per child.
        pending_pings = [
            child.ping.remote()
            for _ in range(num_pings)
            for child in self.children
        ]
        try:
            ray.get(pending_pings)
        except Exception:
            # Replace the children if one of them died.
            child_count = len(self.children)
            self.__init__(child_count, self.death_probability)

    def kill(self):
        """Ask every child to terminate and wait on those requests."""
        # Clean up children.
        termination_requests = [
            child.__ray_terminate__.remote() for child in self.children
        ]
        ray.get(termination_requests)


parents = [Parent.remote(num_children, death_probability) for _ in range(num_parents)]

# Driver loop: runs forever, reporting progress after every iteration.
iteration = 0
start_time = time.time()
previous_time = start_time
while True:
    # Have every parent send 10 rounds of pings to its children and wait for
    # all parents to finish.
    ping_results = [parent.ping.remote(10) for parent in parents]
    ray.get(ping_results)

    # Kill a parent actor with some probability.
    if np.random.rand() > death_probability:
        victim_index = np.random.randint(len(parents))
        # The kill request is not awaited; the slot is refilled immediately
        # with a newly created Parent.
        parents[victim_index].kill.remote()
        parents[victim_index] = Parent.remote(num_children, death_probability)

    new_time = time.time()
    iteration_time = new_time - previous_time
    elapsed_time = new_time - start_time

    report_template = (
        "Iteration {}:\n"
        "  - Iteration time: {}.\n"
        "  - Absolute time: {}.\n"
        "  - Total elapsed time: {}."
    )
    print(report_template.format(iteration, iteration_time, new_time, elapsed_time))

    progress = {
        "iteration": iteration,
        "iteration_time": iteration_time,
        "absolute_time": new_time,
        "elapsed_time": elapsed_time,
    }
    update_progress(progress)

    # Advance the bookkeeping only after this iteration has been reported.
    iteration += 1
    previous_time = new_time
Update long running stress tests and add actor death test. (#4275) 2019-03-06 14:26:45 -08:00			`# This workload tests repeatedly killing actors and submitting tasks to them.`

[release tests] Fix microbenchmark base image, network overhead cluster wait time, add long running tests (#16355) 2021-06-16 21:37:17 +01:00			`import json`
Update long running stress tests and add actor death test. (#4275) 2019-03-06 14:26:45 -08:00			`import numpy as np`
[release tests] Fix microbenchmark base image, network overhead cluster wait time, add long running tests (#16355) 2021-06-16 21:37:17 +01:00			`import os`
Update long running stress tests and add actor death test. (#4275) 2019-03-06 14:26:45 -08:00			`import sys`
			`import time`

			`import ray`
Fix long running stress tests (#6374) 2019-12-05 18:29:41 -08:00			`from ray.cluster_utils import Cluster`
Update long running stress tests and add actor death test. (#4275) 2019-03-06 14:26:45 -08:00
[release tests] Fix microbenchmark base image, network overhead cluster wait time, add long running tests (#16355) 2021-06-16 21:37:17 +01:00
			`def update_progress(result):`
			`result["last_update"] = time.time()`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`test_output_json = os.environ.get(`
			`"TEST_OUTPUT_JSON", "/tmp/release_test_output.json"`
			`)`
[release tests] Fix microbenchmark base image, network overhead cluster wait time, add long running tests (#16355) 2021-06-16 21:37:17 +01:00			`with open(test_output_json, "wt") as f:`
			`json.dump(result, f)`


Update long running stress tests and add actor death test. (#4275) 2019-03-06 14:26:45 -08:00			`num_redis_shards = 1`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`redis_max_memory = 10 ** 8`
			`object_store_memory = 10 ** 8`
Update long running stress tests and add actor death test. (#4275) 2019-03-06 14:26:45 -08:00			`num_nodes = 2`

[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`message = (`
			`"Make sure there is enough memory on this machine to run this "`
			`"workload. We divide the system memory by 2 to provide a buffer."`
			`)`
			`assert (`
			`num_nodes * object_store_memory + num_redis_shards * redis_max_memory`
			`< ray._private.utils.get_system_memory() / 2`
			`), message`
Update long running stress tests and add actor death test. (#4275) 2019-03-06 14:26:45 -08:00
			`# Simulate a cluster on one machine.`

			`cluster = Cluster()`
			`for i in range(num_nodes):`
			`cluster.add_node(`
			`redis_port=6379 if i == 0 else None,`
			`num_redis_shards=num_redis_shards if i == 0 else None,`
			`num_cpus=8,`
			`num_gpus=0,`
			`resources={str(i): 2},`
			`object_store_memory=object_store_memory,`
Use 2xlarge instances in long running tests (#6802) 2020-01-15 19:47:59 -06:00			`redis_max_memory=redis_max_memory,`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`dashboard_host="0.0.0.0",`
			`)`
Replace --redis-address with --address in test, docs, tune, rllib (#5602) * wip * add tests and tune * add ci * test fix * lint * fix tests * wip * sugar dep 2019-09-01 16:53:02 -07:00			`ray.init(address=cluster.address)`
Update long running stress tests and add actor death test. (#4275) 2019-03-06 14:26:45 -08:00
			`# Run the workload.`

			`num_parents = 5`
			`num_children = 5`
			`death_probability = 0.95`


			`@ray.remote`
			`class Child(object):`
			`def __init__(self, death_probability):`
			`self.death_probability = death_probability`

			`def ping(self):`
			`# Exit process with some probability.`
			`exit_chance = np.random.rand()`
			`if exit_chance > self.death_probability:`
			`sys.exit(-1)`


			`@ray.remote`
			`class Parent(object):`
			`def __init__(self, num_children, death_probability):`
			`self.death_probability = death_probability`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`self.children = [Child.remote(death_probability) for _ in range(num_children)]`
Update long running stress tests and add actor death test. (#4275) 2019-03-06 14:26:45 -08:00
			`def ping(self, num_pings):`
			`children_outputs = []`
			`for _ in range(num_pings):`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`children_outputs += [child.ping.remote() for child in self.children]`
Update long running stress tests and add actor death test. (#4275) 2019-03-06 14:26:45 -08:00			`try:`
			`ray.get(children_outputs)`
			`except Exception:`
			`# Replace the children if one of them died.`
			`self.__init__(len(self.children), self.death_probability)`

			`def kill(self):`
			`# Clean up children.`
			`ray.get([child.__ray_terminate__.remote() for child in self.children])`


[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`parents = [Parent.remote(num_children, death_probability) for _ in range(num_parents)]`
Update long running stress tests and add actor death test. (#4275) 2019-03-06 14:26:45 -08:00
			`iteration = 0`
			`start_time = time.time()`
			`previous_time = start_time`
			`while True:`
			`ray.get([parent.ping.remote(10) for parent in parents])`

			`# Kill a parent actor with some probability.`
			`exit_chance = np.random.rand()`
			`if exit_chance > death_probability:`
			`parent_index = np.random.randint(len(parents))`
			`parents[parent_index].kill.remote()`
			`parents[parent_index] = Parent.remote(num_children, death_probability)`

			`new_time = time.time()`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`print(`
			`"Iteration {}:\n"`
			`" - Iteration time: {}.\n"`
			`" - Absolute time: {}.\n"`
			`" - Total elapsed time: {}.".format(`
			`iteration, new_time - previous_time, new_time, new_time - start_time`
			`)`
			`)`
			`update_progress(`
			`{`
			`"iteration": iteration,`
			`"iteration_time": new_time - previous_time,`
			`"absolute_time": new_time,`
			`"elapsed_time": new_time - start_time,`
			`}`
			`)`
move variable updates from middle of loop to end (#17591) 2021-08-05 01:53:01 -07:00			`previous_time = new_time`
			`iteration += 1`