ray/ci/pipeline/py_dep_analysis.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

385 lines
12 KiB
Python
Raw Normal View History

#!/usr/bin/env python
#
# This file contains utilities for understanding dependencies between python
# source files and tests.
#
# Utils are assumed to be used from top level ray/ folder, since that is how
# our tests are defined today.
#
# Example usage:
# To find all circular dependencies under ray/python/:
# python ci/pipeline/py_dep_analysis.py --mode=circular-dep
# To find all the RLlib tests that depend on a file:
# python ci/pipeline/py_dep_analysis.py --mode=test-dep \
# --file=python/ray/tune/tune.py
# For testing, add --smoke-test to any commands, so it doesn't spend
# tons of time querying for available RLlib tests.
import argparse
import ast
import os
import re
import subprocess
import sys
from typing import Dict, List, Tuple
class DepGraph(object):
def __init__(self):
self.edges: Dict[str, Dict[str, bool]] = {}
self.ids: Dict[str, int] = {}
self.inv_ids: Dict[int, str] = {}
def _run_shell(args: List[str]) -> str:
return subprocess.check_output(args).decode(sys.stdout.encoding)
def list_rllib_tests(n: int = -1, test: str = None) -> Tuple[str, List[str]]:
"""List RLlib tests.
Args:
n: return at most n tests. all tests if n = -1.
test: only return information about a specific test.
"""
tests_res = _run_shell(
["bazel", "query", "tests(//python/ray/rllib:*)", "--output", "label"]
)
all_tests = []
# Strip, also skip any empty lines
tests = [t.strip() for t in tests_res.splitlines() if t.strip()]
for t in tests:
if test and t != test:
continue
src_out = _run_shell(
[
"bazel",
"query",
'kind("source file", deps({}))'.format(t),
"--output",
"label",
]
)
srcs = [f.strip() for f in src_out.splitlines()]
srcs = [f for f in srcs if f.startswith("//python") and f.endswith(".py")]
if srcs:
all_tests.append((t, srcs))
# Break early if smoke test.
if n > 0 and len(all_tests) >= n:
break
return all_tests
def _new_dep(graph: DepGraph, src_module: str, dep: str):
"""Create a new dependency between src_module and dep."""
if dep not in graph.ids:
graph.ids[dep] = len(graph.ids)
src_id = graph.ids[src_module]
dep_id = graph.ids[dep]
if src_id not in graph.edges:
graph.edges[src_id] = {}
graph.edges[src_id][dep_id] = True
def _new_import(graph: DepGraph, src_module: str, dep_module: str):
"""Process a new import statement in src_module."""
# We don't care about system imports.
if not dep_module.startswith("ray"):
return
_new_dep(graph, src_module, dep_module)
def _is_path_module(module: str, name: str, _base_dir: str) -> bool:
"""Figure out if base.sub is a python module or not."""
# Special handling for _raylet, which is a C++ lib.
if module == "ray._raylet":
return False
bps = ["python"] + module.split(".")
path = os.path.join(_base_dir, os.path.join(*bps), name + ".py")
if os.path.isfile(path):
return True # file module
return False
def _new_from_import(
graph: DepGraph, src_module: str, dep_module: str, dep_name: str, _base_dir: str
):
"""Process a new "from ... import ..." statement in src_module."""
# We don't care about imports outside of ray package.
if not dep_module or not dep_module.startswith("ray"):
return
if _is_path_module(dep_module, dep_name, _base_dir):
# dep_module.dep_name points to a file.
_new_dep(graph, src_module, _full_module_path(dep_module, dep_name))
else:
# sub is an obj on base dir/file.
_new_dep(graph, src_module, dep_module)
def _process_file(graph: DepGraph, src_path: str, src_module: str, _base_dir=""):
"""Create dependencies from src_module to all the valid imports in src_path.
Args:
graph: the DepGraph to be added to.
src_path: .py file to be processed.
src_module: full module path of the source file.
_base_dir: use a different base dir than current dir. For unit testing.
"""
with open(os.path.join(_base_dir, src_path), "r") as in_f:
tree = ast.parse(in_f.read())
for node in ast.walk(tree):
if isinstance(node, ast.Import):
for alias in node.names:
_new_import(graph, src_module, alias.name)
elif isinstance(node, ast.ImportFrom):
for alias in node.names:
_new_from_import(
graph, src_module, node.module, alias.name, _base_dir
)
def build_dep_graph() -> DepGraph:
"""Build index from py files to their immediate dependees."""
graph = DepGraph()
# Assuming we run from root /ray directory.
# Follow links since rllib is linked to /rllib.
for root, sub_dirs, files in os.walk("python", followlinks=True):
if _should_skip(root):
continue
module = _bazel_path_to_module_path(root)
# Process files first.
for f in files:
if not f.endswith(".py"):
continue
full = _full_module_path(module, f)
if full not in graph.ids:
graph.ids[full] = len(graph.ids)
# Process file:
_process_file(graph, os.path.join(root, f), full)
# Build reverse index for convenience.
graph.inv_ids = {v: k for k, v in graph.ids.items()}
return graph
def _full_module_path(module, f) -> str:
if f == "__init__.py":
# __init__ file for this module.
# Full path is the same as the module name.
return module
fn = re.sub(r"\.py$", "", f)
if not module:
return fn
return module + "." + fn
def _should_skip(d: str) -> bool:
"""Skip directories that should not contain py sources."""
if d.startswith("python/.eggs/"):
return True
if d.startswith("python/."):
return True
if d.startswith("python/build"):
return True
if d.startswith("python/ray/cpp"):
return True
return False
def _bazel_path_to_module_path(d: str) -> str:
"""Convert a Bazel file path to python module path.
Example: //python/ray/rllib:xxx/yyy/dd -> ray.rllib.xxx.yyy.dd
"""
# Do this in 3 steps, so all of 'python:', 'python/', or '//python', etc
# will get stripped.
d = re.sub(r"^\/\/", "", d)
d = re.sub(r"^python", "", d)
d = re.sub(r"^[\/:]", "", d)
return d.replace("/", ".").replace(":", ".")
def _file_path_to_module_path(f: str) -> str:
"""Return the corresponding module path for a .py file."""
dir, fn = os.path.split(f)
return _full_module_path(_bazel_path_to_module_path(dir), fn)
def _depends(
graph: DepGraph, visited: Dict[int, bool], tid: int, qid: int
) -> List[int]:
"""Whether there is a dependency path from module tid to module qid.
Given graph, and without going through visited.
"""
if tid not in graph.edges or qid not in graph.edges:
return []
if qid in graph.edges[tid]:
# tid directly depends on qid.
return [tid, qid]
for c in graph.edges[tid]:
if c in visited:
continue
visited[c] = True
# Reduce to a question of whether there is a path from c to qid.
ds = _depends(graph, visited, c, qid)
if ds:
# From tid -> c -> qid.
return [tid] + ds
return []
def test_depends_on_file(
graph: DepGraph, test: Tuple[str, Tuple[str]], path: str
) -> List[int]:
"""Give dependency graph, check if a test depends on a specific .py file.
Args:
graph: the dependency graph.
test: information about a test, in the format of:
[test_name, (src files for the test)]
"""
query = _file_path_to_module_path(path)
if query not in graph.ids:
# Not a file that we care about.
return []
t, srcs = test
# Skip tuned_examples/ and examples/ tests.
if t.startswith("//python/ray/rllib:examples/"):
return []
for src in srcs:
if src == "ray.rllib.tests.run_regression_tests":
return []
tid = _file_path_to_module_path(src)
if tid not in graph.ids:
# Not a test that we care about.
# TODO(jungong): What tests are these?????
continue
branch = _depends(graph, {}, graph.ids[tid], graph.ids[query])
if branch:
return branch
# Does not depend on file.
return []
def _find_circular_dep_impl(graph: DepGraph, id: str, branch: str) -> bool:
if id not in graph.edges:
return False
for c in graph.edges[id]:
if c in branch:
# Found a circle.
branch.append(c)
return True
branch.append(c)
if _find_circular_dep_impl(graph, c, branch):
return True
branch.pop()
return False
def find_circular_dep(graph: DepGraph) -> Dict[str, List[int]]:
"""Find circular dependencies among a dependency graph."""
known = {}
circles = {}
for m, id in graph.ids.items():
branch = []
if _find_circular_dep_impl(graph, id, branch):
if branch[-1] in known:
# Already knew, skip.
continue
# Since this is a cycle dependency, any step along the circle
# will form a different circle.
# So we mark every entry on this circle known.
for n in branch:
known[n] = True
# Mark that module m contains a potential circular dep.
circles[m] = branch
return circles
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--mode",
type=str,
default="test-dep",
help=(
"test-dep: find dependencies for a specified test. "
"circular-dep: find circular dependencies in "
"the specific codebase."
),
)
parser.add_argument(
"--file", type=str, help="Path of a .py source file relative to --base_dir."
)
parser.add_argument("--test", type=str, help="Specific test to check.")
parser.add_argument(
"--smoke-test", action="store_true", help="Load only a few tests for testing."
)
args = parser.parse_args()
print("building dep graph ...")
graph = build_dep_graph()
print(
"done. total {} files, {} of which have dependencies.".format(
len(graph.ids), len(graph.edges)
)
)
if args.mode == "circular-dep":
circles = find_circular_dep(graph)
print("Found following circular dependencies: \n")
for m, b in circles.items():
print(m)
for n in b:
print(" ", graph.inv_ids[n])
print()
if args.mode == "test-dep":
assert args.file, "Must specify --file for the query."
# Only support RLlib tests for now.
# The way Tune tests are defined, they all depend on
# the entire tune codebase.
tests = list_rllib_tests(5 if args.smoke_test else -1, args.test)
print("Total # of tests: ", len(tests))
for t in tests:
branch = test_depends_on_file(graph, t, args.file)
if branch:
print("{} depends on {}".format(t[0], args.file))
# Print some debugging info.
for n in branch:
print(" ", graph.inv_ids[n])
else:
print("{} does not depend on {}".format(t[0], args.file))