ray/ci/pipeline/py_dep_analysis.py

#!/usr/bin/env python
#
# This file contains utilities for understanding dependencies between python
# source files and tests.
#
# Utils are assumed to be used from top level ray/ folder, since that is how
# our tests are defined today.
#
# Example usage:
#     To find all circular dependencies under ray/python/:
#         python ci/pipeline/py_dep_analysis.py --mode=circular-dep
#     To find all the RLlib tests that depend on a file:
#         python ci/pipeline/py_dep_analysis.py --mode=test-dep \
#             --file=python/ray/tune/tune.py
#     For testing, add --smoke-test to any commands, so it doesn't spend
#     tons of time querying for available RLlib tests.

import argparse
import ast
import os
import re
import subprocess
import sys
from typing import Dict, List, Tuple


class DepGraph(object):
    def __init__(self):
        self.edges: Dict[str, Dict[str, bool]] = {}
        self.ids: Dict[str, int] = {}
        self.inv_ids: Dict[int, str] = {}


def _run_shell(args: List[str]) -> str:
    return subprocess.check_output(args).decode(sys.stdout.encoding)


def list_rllib_tests(n: int = -1, test: str = None) -> Tuple[str, List[str]]:
    """List RLlib tests.

    Args:
        n: return at most n tests. all tests if n = -1.
        test: only return information about a specific test.
    """
    tests_res = _run_shell(
        ["bazel", "query", "tests(//python/ray/rllib:*)", "--output", "label"]
    )

    all_tests = []

    # Strip, also skip any empty lines
    tests = [t.strip() for t in tests_res.splitlines() if t.strip()]
    for t in tests:
        if test and t != test:
            continue

        src_out = _run_shell(
            [
                "bazel",
                "query",
                'kind("source file", deps({}))'.format(t),
                "--output",
                "label",
            ]
        )

        srcs = [f.strip() for f in src_out.splitlines()]
        srcs = [f for f in srcs if f.startswith("//python") and f.endswith(".py")]
        if srcs:
            all_tests.append((t, srcs))

        # Break early if smoke test.
        if n > 0 and len(all_tests) >= n:
            break

    return all_tests


def _new_dep(graph: DepGraph, src_module: str, dep: str):
    """Create a new dependency between src_module and dep."""
    if dep not in graph.ids:
        graph.ids[dep] = len(graph.ids)

    src_id = graph.ids[src_module]
    dep_id = graph.ids[dep]

    if src_id not in graph.edges:
        graph.edges[src_id] = {}
    graph.edges[src_id][dep_id] = True


def _new_import(graph: DepGraph, src_module: str, dep_module: str):
    """Process a new import statement in src_module."""
    # We don't care about system imports.
    if not dep_module.startswith("ray"):
        return

    _new_dep(graph, src_module, dep_module)


def _is_path_module(module: str, name: str, _base_dir: str) -> bool:
    """Figure out if base.sub is a python module or not."""
    # Special handling for _raylet, which is a C++ lib.
    if module == "ray._raylet":
        return False

    bps = ["python"] + module.split(".")
    path = os.path.join(_base_dir, os.path.join(*bps), name + ".py")
    if os.path.isfile(path):
        return True  # file module
    return False


def _new_from_import(
    graph: DepGraph, src_module: str, dep_module: str, dep_name: str, _base_dir: str
):
    """Process a new "from ... import ..." statement in src_module."""
    # We don't care about imports outside of ray package.
    if not dep_module or not dep_module.startswith("ray"):
        return

    if _is_path_module(dep_module, dep_name, _base_dir):
        # dep_module.dep_name points to a file.
        _new_dep(graph, src_module, _full_module_path(dep_module, dep_name))
    else:
        # sub is an obj on base dir/file.
        _new_dep(graph, src_module, dep_module)


def _process_file(graph: DepGraph, src_path: str, src_module: str, _base_dir=""):
    """Create dependencies from src_module to all the valid imports in src_path.

    Args:
        graph: the DepGraph to be added to.
        src_path: .py file to be processed.
        src_module: full module path of the source file.
        _base_dir: use a different base dir than current dir. For unit testing.
    """
    with open(os.path.join(_base_dir, src_path), "r") as in_f:
        tree = ast.parse(in_f.read())

        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                for alias in node.names:
                    _new_import(graph, src_module, alias.name)
            elif isinstance(node, ast.ImportFrom):
                for alias in node.names:
                    _new_from_import(
                        graph, src_module, node.module, alias.name, _base_dir
                    )


def build_dep_graph() -> DepGraph:
    """Build index from py files to their immediate dependees."""
    graph = DepGraph()

    # Assuming we run from root /ray directory.
    # Follow links since rllib is linked to /rllib.
    for root, sub_dirs, files in os.walk("python", followlinks=True):
        if _should_skip(root):
            continue

        module = _bazel_path_to_module_path(root)

        # Process files first.
        for f in files:
            if not f.endswith(".py"):
                continue

            full = _full_module_path(module, f)

            if full not in graph.ids:
                graph.ids[full] = len(graph.ids)

            # Process file:
            _process_file(graph, os.path.join(root, f), full)

    # Build reverse index for convenience.
    graph.inv_ids = {v: k for k, v in graph.ids.items()}

    return graph


def _full_module_path(module, f) -> str:
    if f == "__init__.py":
        # __init__ file for this module.
        # Full path is the same as the module name.
        return module

    fn = re.sub(r"\.py$", "", f)

    if not module:
        return fn
    return module + "." + fn


def _should_skip(d: str) -> bool:
    """Skip directories that should not contain py sources."""
    if d.startswith("python/.eggs/"):
        return True
    if d.startswith("python/."):
        return True
    if d.startswith("python/build"):
        return True
    if d.startswith("python/ray/cpp"):
        return True
    return False


def _bazel_path_to_module_path(d: str) -> str:
    """Convert a Bazel file path to python module path.

    Example: //python/ray/rllib:xxx/yyy/dd -> ray.rllib.xxx.yyy.dd
    """
    # Do this in 3 steps, so all of 'python:', 'python/', or '//python', etc
    # will get stripped.
    d = re.sub(r"^\/\/", "", d)
    d = re.sub(r"^python", "", d)
    d = re.sub(r"^[\/:]", "", d)
    return d.replace("/", ".").replace(":", ".")


def _file_path_to_module_path(f: str) -> str:
    """Return the corresponding module path for a .py file."""
    dir, fn = os.path.split(f)
    return _full_module_path(_bazel_path_to_module_path(dir), fn)


def _depends(
    graph: DepGraph, visited: Dict[int, bool], tid: int, qid: int
) -> List[int]:
    """Whether there is a dependency path from module tid to module qid.

    Given graph, and without going through visited.
    """
    if tid not in graph.edges or qid not in graph.edges:
        return []
    if qid in graph.edges[tid]:
        # tid directly depends on qid.
        return [tid, qid]
    for c in graph.edges[tid]:
        if c in visited:
            continue
        visited[c] = True
        # Reduce to a question of whether there is a path from c to qid.
        ds = _depends(graph, visited, c, qid)
        if ds:
            # From tid -> c -> qid.
            return [tid] + ds
    return []


def test_depends_on_file(
    graph: DepGraph, test: Tuple[str, Tuple[str]], path: str
) -> List[int]:
    """Give dependency graph, check if a test depends on a specific .py file.

    Args:
        graph: the dependency graph.
        test: information about a test, in the format of:
            [test_name, (src files for the test)]
    """
    query = _file_path_to_module_path(path)
    if query not in graph.ids:
        # Not a file that we care about.
        return []

    t, srcs = test

    # Skip tuned_examples/ and examples/ tests.
    if t.startswith("//python/ray/rllib:examples/"):
        return []

    for src in srcs:
        if src == "ray.rllib.tests.run_regression_tests":
            return []

        tid = _file_path_to_module_path(src)
        if tid not in graph.ids:
            # Not a test that we care about.
            # TODO(jungong): What tests are these?????
            continue

        branch = _depends(graph, {}, graph.ids[tid], graph.ids[query])
        if branch:
            return branch

    # Does not depend on file.
    return []


def _find_circular_dep_impl(graph: DepGraph, id: str, branch: str) -> bool:
    if id not in graph.edges:
        return False
    for c in graph.edges[id]:
        if c in branch:
            # Found a circle.
            branch.append(c)
            return True
        branch.append(c)
        if _find_circular_dep_impl(graph, c, branch):
            return True
        branch.pop()
    return False


def find_circular_dep(graph: DepGraph) -> Dict[str, List[int]]:
    """Find circular dependencies among a dependency graph."""
    known = {}
    circles = {}
    for m, id in graph.ids.items():
        branch = []
        if _find_circular_dep_impl(graph, id, branch):
            if branch[-1] in known:
                # Already knew, skip.
                continue
            # Since this is a cycle dependency, any step along the circle
            # will form a different circle.
            # So we mark every entry on this circle known.
            for n in branch:
                known[n] = True
            # Mark that module m contains a potential circular dep.
            circles[m] = branch
    return circles


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--mode",
        type=str,
        default="test-dep",
        help=(
            "test-dep: find dependencies for a specified test. "
            "circular-dep: find circular dependencies in "
            "the specific codebase."
        ),
    )
    parser.add_argument(
        "--file", type=str, help="Path of a .py source file relative to --base_dir."
    )
    parser.add_argument("--test", type=str, help="Specific test to check.")
    parser.add_argument(
        "--smoke-test", action="store_true", help="Load only a few tests for testing."
    )

    args = parser.parse_args()

    print("building dep graph ...")
    graph = build_dep_graph()
    print(
        "done. total {} files, {} of which have dependencies.".format(
            len(graph.ids), len(graph.edges)
        )
    )

    if args.mode == "circular-dep":
        circles = find_circular_dep(graph)
        print("Found following circular dependencies: \n")
        for m, b in circles.items():
            print(m)
            for n in b:
                print("    ", graph.inv_ids[n])
            print()

    if args.mode == "test-dep":
        assert args.file, "Must specify --file for the query."

        # Only support RLlib tests for now.
        # The way Tune tests are defined, they all depend on
        # the entire tune codebase.
        tests = list_rllib_tests(5 if args.smoke_test else -1, args.test)
        print("Total # of tests: ", len(tests))

        for t in tests:
            branch = test_depends_on_file(graph, t, args.file)
            if branch:
                print("{} depends on {}".format(t[0], args.file))
                # Print some debugging info.
                for n in branch:
                    print("    ", graph.inv_ids[n])
            else:
                print("{} does not depend on {}".format(t[0], args.file))
Add a script to analyze python module dependencies using static analysis (#18965) 2021-10-19 13:33:02 -07:00			`#!/usr/bin/env python`
			`#`
			`# This file contains utilities for understanding dependencies between python`
			`# source files and tests.`
			`#`
			`# Utils are assumed to be used from top level ray/ folder, since that is how`
			`# our tests are defined today.`
			`#`
			`# Example usage:`
			`# To find all circular dependencies under ray/python/:`
[ci] Clean up ci/ directory (refactor ci/travis) (#23866) Clean up the ci/ directory. This means getting rid of the travis/ path completely and moving the files into sensible subdirectories. Details: - Moves everything under ci/travis into subdirectories, e.g. ci/build, ci/lint, etc. - Minor adjustments to some scripts (variable renames) - Removes the outdated (unused) asan tests 2022-04-13 18:11:30 +01:00			`# python ci/pipeline/py_dep_analysis.py --mode=circular-dep`
Add a script to analyze python module dependencies using static analysis (#18965) 2021-10-19 13:33:02 -07:00			`# To find all the RLlib tests that depend on a file:`
[ci] Clean up ci/ directory (refactor ci/travis) (#23866) Clean up the ci/ directory. This means getting rid of the travis/ path completely and moving the files into sensible subdirectories. Details: - Moves everything under ci/travis into subdirectories, e.g. ci/build, ci/lint, etc. - Minor adjustments to some scripts (variable renames) - Removes the outdated (unused) asan tests 2022-04-13 18:11:30 +01:00			`# python ci/pipeline/py_dep_analysis.py --mode=test-dep \`
Add a script to analyze python module dependencies using static analysis (#18965) 2021-10-19 13:33:02 -07:00			`# --file=python/ray/tune/tune.py`
			`# For testing, add --smoke-test to any commands, so it doesn't spend`
			`# tons of time querying for available RLlib tests.`

			`import argparse`
			`import ast`
			`import os`
			`import re`
			`import subprocess`
			`import sys`
			`from typing import Dict, List, Tuple`


			`class DepGraph(object):`
			`def __init__(self):`
			`self.edges: Dict[str, Dict[str, bool]] = {}`
			`self.ids: Dict[str, int] = {}`
			`self.inv_ids: Dict[int, str] = {}`


			`def _run_shell(args: List[str]) -> str:`
			`return subprocess.check_output(args).decode(sys.stdout.encoding)`


			`def list_rllib_tests(n: int = -1, test: str = None) -> Tuple[str, List[str]]:`
			`"""List RLlib tests.`

			`Args:`
			`n: return at most n tests. all tests if n = -1.`
			`test: only return information about a specific test.`
			`"""`
			`tests_res = _run_shell(`
			`["bazel", "query", "tests(//python/ray/rllib:*)", "--output", "label"]`
			`)`

			`all_tests = []`

			`# Strip, also skip any empty lines`
			`tests = [t.strip() for t in tests_res.splitlines() if t.strip()]`
			`for t in tests:`
			`if test and t != test:`
			`continue`

			`src_out = _run_shell(`
			`[`
			`"bazel",`
			`"query",`
			`'kind("source file", deps({}))'.format(t),`
			`"--output",`
			`"label",`
			`]`
			`)`

			`srcs = [f.strip() for f in src_out.splitlines()]`
			`srcs = [f for f in srcs if f.startswith("//python") and f.endswith(".py")]`
			`if srcs:`
			`all_tests.append((t, srcs))`

			`# Break early if smoke test.`
			`if n > 0 and len(all_tests) >= n:`
			`break`

			`return all_tests`


			`def _new_dep(graph: DepGraph, src_module: str, dep: str):`
			`"""Create a new dependency between src_module and dep."""`
			`if dep not in graph.ids:`
			`graph.ids[dep] = len(graph.ids)`

			`src_id = graph.ids[src_module]`
			`dep_id = graph.ids[dep]`

			`if src_id not in graph.edges:`
			`graph.edges[src_id] = {}`
			`graph.edges[src_id][dep_id] = True`


			`def _new_import(graph: DepGraph, src_module: str, dep_module: str):`
			`"""Process a new import statement in src_module."""`
			`# We don't care about system imports.`
			`if not dep_module.startswith("ray"):`
			`return`

			`_new_dep(graph, src_module, dep_module)`


			`def _is_path_module(module: str, name: str, _base_dir: str) -> bool:`
			`"""Figure out if base.sub is a python module or not."""`
			`# Special handling for _raylet, which is a C++ lib.`
			`if module == "ray._raylet":`
			`return False`

			`bps = ["python"] + module.split(".")`
			`path = os.path.join(_base_dir, os.path.join(*bps), name + ".py")`
			`if os.path.isfile(path):`
			`return True # file module`
			`return False`


			`def _new_from_import(`
			`graph: DepGraph, src_module: str, dep_module: str, dep_name: str, _base_dir: str`
			`):`
			`"""Process a new "from ... import ..." statement in src_module."""`
			`# We don't care about imports outside of ray package.`
			`if not dep_module or not dep_module.startswith("ray"):`
			`return`

			`if _is_path_module(dep_module, dep_name, _base_dir):`
			`# dep_module.dep_name points to a file.`
			`_new_dep(graph, src_module, _full_module_path(dep_module, dep_name))`
			`else:`
			`# sub is an obj on base dir/file.`
			`_new_dep(graph, src_module, dep_module)`


			`def _process_file(graph: DepGraph, src_path: str, src_module: str, _base_dir=""):`
			`"""Create dependencies from src_module to all the valid imports in src_path.`

			`Args:`
			`graph: the DepGraph to be added to.`
			`src_path: .py file to be processed.`
			`src_module: full module path of the source file.`
			`_base_dir: use a different base dir than current dir. For unit testing.`
			`"""`
			`with open(os.path.join(_base_dir, src_path), "r") as in_f:`
			`tree = ast.parse(in_f.read())`

			`for node in ast.walk(tree):`
			`if isinstance(node, ast.Import):`
			`for alias in node.names:`
			`_new_import(graph, src_module, alias.name)`
			`elif isinstance(node, ast.ImportFrom):`
			`for alias in node.names:`
			`_new_from_import(`
			`graph, src_module, node.module, alias.name, _base_dir`
			`)`


			`def build_dep_graph() -> DepGraph:`
			`"""Build index from py files to their immediate dependees."""`
			`graph = DepGraph()`

			`# Assuming we run from root /ray directory.`
			`# Follow links since rllib is linked to /rllib.`
			`for root, sub_dirs, files in os.walk("python", followlinks=True):`
			`if _should_skip(root):`
			`continue`

			`module = _bazel_path_to_module_path(root)`

			`# Process files first.`
			`for f in files:`
			`if not f.endswith(".py"):`
			`continue`

			`full = _full_module_path(module, f)`

			`if full not in graph.ids:`
			`graph.ids[full] = len(graph.ids)`

			`# Process file:`
			`_process_file(graph, os.path.join(root, f), full)`

			`# Build reverse index for convenience.`
			`graph.inv_ids = {v: k for k, v in graph.ids.items()}`

			`return graph`


			`def _full_module_path(module, f) -> str:`
			`if f == "__init__.py":`
			`# __init__ file for this module.`
			`# Full path is the same as the module name.`
			`return module`

			`fn = re.sub(r"\.py$", "", f)`

			`if not module:`
			`return fn`
			`return module + "." + fn`


			`def _should_skip(d: str) -> bool:`
			`"""Skip directories that should not contain py sources."""`
			`if d.startswith("python/.eggs/"):`
			`return True`
			`if d.startswith("python/."):`
			`return True`
			`if d.startswith("python/build"):`
			`return True`
			`if d.startswith("python/ray/cpp"):`
			`return True`
			`return False`


			`def _bazel_path_to_module_path(d: str) -> str:`
			`"""Convert a Bazel file path to python module path.`

			`Example: //python/ray/rllib:xxx/yyy/dd -> ray.rllib.xxx.yyy.dd`
			`"""`
			`# Do this in 3 steps, so all of 'python:', 'python/', or '//python', etc`
			`# will get stripped.`
			`d = re.sub(r"^\/\/", "", d)`
			`d = re.sub(r"^python", "", d)`
			`d = re.sub(r"^[\/:]", "", d)`
			`return d.replace("/", ".").replace(":", ".")`


			`def _file_path_to_module_path(f: str) -> str:`
			`"""Return the corresponding module path for a .py file."""`
			`dir, fn = os.path.split(f)`
			`return _full_module_path(_bazel_path_to_module_path(dir), fn)`


			`def _depends(`
			`graph: DepGraph, visited: Dict[int, bool], tid: int, qid: int`
			`) -> List[int]:`
			`"""Whether there is a dependency path from module tid to module qid.`

			`Given graph, and without going through visited.`
			`"""`
			`if tid not in graph.edges or qid not in graph.edges:`
			`return []`
			`if qid in graph.edges[tid]:`
			`# tid directly depends on qid.`
			`return [tid, qid]`
			`for c in graph.edges[tid]:`
			`if c in visited:`
			`continue`
			`visited[c] = True`
			`# Reduce to a question of whether there is a path from c to qid.`
			`ds = _depends(graph, visited, c, qid)`
			`if ds:`
			`# From tid -> c -> qid.`
			`return [tid] + ds`
			`return []`


			`def test_depends_on_file(`
			`graph: DepGraph, test: Tuple[str, Tuple[str]], path: str`
			`) -> List[int]:`
			`"""Give dependency graph, check if a test depends on a specific .py file.`

			`Args:`
			`graph: the dependency graph.`
			`test: information about a test, in the format of:`
			`[test_name, (src files for the test)]`
			`"""`
			`query = _file_path_to_module_path(path)`
			`if query not in graph.ids:`
			`# Not a file that we care about.`
			`return []`

			`t, srcs = test`

			`# Skip tuned_examples/ and examples/ tests.`
			`if t.startswith("//python/ray/rllib:examples/"):`
			`return []`

			`for src in srcs:`
			`if src == "ray.rllib.tests.run_regression_tests":`
			`return []`

			`tid = _file_path_to_module_path(src)`
			`if tid not in graph.ids:`
			`# Not a test that we care about.`
			`# TODO(jungong): What tests are these?????`
			`continue`

			`branch = _depends(graph, {}, graph.ids[tid], graph.ids[query])`
			`if branch:`
			`return branch`

			`# Does not depend on file.`
			`return []`


			`def _find_circular_dep_impl(graph: DepGraph, id: str, branch: str) -> bool:`
			`if id not in graph.edges:`
			`return False`
			`for c in graph.edges[id]:`
			`if c in branch:`
			`# Found a circle.`
			`branch.append(c)`
			`return True`
			`branch.append(c)`
			`if _find_circular_dep_impl(graph, c, branch):`
			`return True`
			`branch.pop()`
			`return False`


			`def find_circular_dep(graph: DepGraph) -> Dict[str, List[int]]:`
			`"""Find circular dependencies among a dependency graph."""`
			`known = {}`
			`circles = {}`
			`for m, id in graph.ids.items():`
			`branch = []`
			`if _find_circular_dep_impl(graph, id, branch):`
			`if branch[-1] in known:`
			`# Already knew, skip.`
			`continue`
			`# Since this is a cycle dependency, any step along the circle`
			`# will form a different circle.`
			`# So we mark every entry on this circle known.`
			`for n in branch:`
			`known[n] = True`
			`# Mark that module m contains a potential circular dep.`
			`circles[m] = branch`
			`return circles`


			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser()`

			`parser.add_argument(`
			`"--mode",`
			`type=str,`
			`default="test-dep",`
			`help=(`
			`"test-dep: find dependencies for a specified test. "`
			`"circular-dep: find circular dependencies in "`
			`"the specific codebase."`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`),`
Add a script to analyze python module dependencies using static analysis (#18965) 2021-10-19 13:33:02 -07:00			`)`
			`parser.add_argument(`
			`"--file", type=str, help="Path of a .py source file relative to --base_dir."`
			`)`
			`parser.add_argument("--test", type=str, help="Specific test to check.")`
			`parser.add_argument(`
			`"--smoke-test", action="store_true", help="Load only a few tests for testing."`
			`)`

			`args = parser.parse_args()`

			`print("building dep graph ...")`
			`graph = build_dep_graph()`
			`print(`
			`"done. total {} files, {} of which have dependencies.".format(`
			`len(graph.ids), len(graph.edges)`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`)`
			`)`
Add a script to analyze python module dependencies using static analysis (#18965) 2021-10-19 13:33:02 -07:00
			`if args.mode == "circular-dep":`
			`circles = find_circular_dep(graph)`
			`print("Found following circular dependencies: \n")`
			`for m, b in circles.items():`
			`print(m)`
			`for n in b:`
			`print(" ", graph.inv_ids[n])`
			`print()`

			`if args.mode == "test-dep":`
			`assert args.file, "Must specify --file for the query."`

			`# Only support RLlib tests for now.`
			`# The way Tune tests are defined, they all depend on`
			`# the entire tune codebase.`
			`tests = list_rllib_tests(5 if args.smoke_test else -1, args.test)`
			`print("Total # of tests: ", len(tests))`

			`for t in tests:`
			`branch = test_depends_on_file(graph, t, args.file)`
			`if branch:`
			`print("{} depends on {}".format(t[0], args.file))`
			`# Print some debugging info.`
			`for n in branch:`
			`print(" ", graph.inv_ids[n])`
			`else:`
			`print("{} does not depend on {}".format(t[0], args.file))`