ray/rllib/utils/tests/run_memory_leak_tests.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

128 lines
4.1 KiB
Python
Raw Normal View History

#!/usr/bin/env python
# Runs one or more memory leak tests.
#
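# Example usage (the path is resolved relative to the rllib directory and may
# be a single yaml file or a directory containing yaml files):
# $ python run_memory_leak_tests.py --yaml-dir tuned_examples/ppo/memory-leak-test-ppo.yaml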
#
# When using in BAZEL (with py_test), e.g. see in ray/rllib/BUILD:
# py_test(
#     name = "memory_leak_ppo",
#     main = "utils/tests/run_memory_leak_tests.py",
#     tags = ["memory_leak_tests"],
#     size = "medium",  # 5min timeout
#     srcs = ["utils/tests/run_memory_leak_tests.py"],
#     data = glob(["tuned_examples/ppo/*.yaml"]),
#     # Pass the path (relative to the rllib dir) to look for yaml files.
#     args = ["--yaml-dir=tuned_examples/ppo/memory-leak-test-ppo.yaml"]
# )
import argparse
import os
from pathlib import Path
import sys
import yaml
import ray
from ray.rllib.algorithms.registry import get_algorithm_class
from ray.rllib.utils.debug.memory import check_memory_leaks
# Command-line interface of the memory-leak test runner.
parser = argparse.ArgumentParser()

# Optional framework override; `None` (the default) means no framework was
# requested on the command line.
parser.add_argument(
    "--framework",
    help="The deep learning framework to use.",
    default=None,
    choices=["jax", "tf2", "tf", "tfe", "torch", None],
)

# Mandatory location of the yaml experiment file(s) to run.
parser.add_argument(
    "--yaml-dir",
    help="The directory in which to find all yamls to test.",
    type=str,
    required=True,
)

# Boolean switch, off unless the flag is present.
parser.add_argument(
    "--local-mode",
    help="Run ray in local mode for easier debugging.",
    action="store_true",
)

# One or more component names to check for leaks.
parser.add_argument(
    "--to-check",
    help="List of 'env', 'policy', 'rollout_worker', 'model'.",
    default=["env", "policy", "rollout_worker"],
    nargs="+",
)
if __name__ == "__main__":
    # Script entry point: collect the requested yaml experiment files, then
    # run a memory-leak check for each of them. The process exits with
    # status 1 as soon as one experiment reports leaks.
    args = parser.parse_args()

    # `--yaml-dir` (a directory or a single yaml file) is resolved relative
    # to the rllib directory, i.e. three `parent` hops up from this file.
    rllib_dir = Path(__file__).parent.parent.parent
    print("rllib dir={}".format(rllib_dir))

    abs_yaml_path = os.path.join(rllib_dir, args.yaml_dir)
    # Single file given.
    if os.path.isfile(abs_yaml_path):
        yaml_files = [abs_yaml_path]
    # Given path/file does not exist.
    elif not os.path.isdir(abs_yaml_path):
        raise ValueError("yaml-dir ({}) not found!".format(args.yaml_dir))
    # Directory given -> Get all yaml files in there via rglob.
    else:
        yaml_files = sorted(
            (
                str(path.absolute())
                for path in rllib_dir.rglob(args.yaml_dir + "/*.yaml")
            ),
            reverse=True,
        )
        # Without this guard an empty directory would make the whole run
        # "pass" with exit code 0 although nothing was actually tested.
        if not yaml_files:
            raise ValueError(
                "No yaml files found in yaml-dir ({})!".format(args.yaml_dir)
            )

    print("Will run the following memory-leak tests:")
    for yaml_file in yaml_files:
        print("->", yaml_file)

    # Loop through all collected files.
    for yaml_file in yaml_files:
        # Use a context manager so the file handle is closed right away
        # instead of being left to the garbage collector.
        with open(yaml_file) as yaml_stream:
            experiments = yaml.safe_load(yaml_stream)

        # An empty yaml file loads as `None`; the isinstance check turns that
        # case into the same readable error as a multi-experiment file.
        assert (
            isinstance(experiments, dict) and len(experiments) == 1
        ), "Error, can only run a single experiment per yaml file!"

        experiment = next(iter(experiments.values()))
        config = experiment["config"]

        # Add framework option to exp configs.
        if args.framework:
            config["framework"] = args.framework
        # Create env on local_worker for memory leak testing just the env.
        config["create_env_on_driver"] = True
        # Always run with eager-tracing when framework=tf2 if not in local-mode.
        if args.framework in ["tf2", "tfe"] and not args.local_mode:
            config["eager_tracing"] = True
        # experiment["config"]["callbacks"] = MemoryTrackingCallbacks

        # Move "env" specifier into config.
        config["env"] = experiment.pop("env")

        # Print out the actual config.
        print("== Test config ==")
        print(yaml.dump(experiment))

        # Construct the trainer instance based on the given config and run
        # the leak check. Ray is always shut down again, even if the check
        # raises, so that the next experiment starts from a fresh `ray.init`.
        try:
            ray.init(num_cpus=5, local_mode=args.local_mode)
            trainer = get_algorithm_class(experiment["run"])(config)
            results = check_memory_leaks(
                trainer,
                to_check=set(args.to_check),
            )
        finally:
            ray.shutdown()

        # A non-empty result from `check_memory_leaks` is treated as a
        # detected leak; fail fast with a non-zero exit code.
        if results:
            print("Memory leak test FAILED. Exiting with Error.")
            sys.exit(1)
        print("Memory leak test PASSED")