modvis.py

#!/usr/bin/env python3
# -*- coding: utf-8; -*-
"""A simple import analyzer. Visualize dependencies between modules."""

import ast
from glob import glob
import logging
from optparse import OptionParser  # TODO: migrate to argparse
import os

import pyan.node
import pyan.visgraph
import pyan.writers

# from pyan.anutils import get_module_name


def filename_to_module_name(fullpath):  # we need to see __init__, hence we don't use anutils.get_module_name.
    """'some/path/module.py' -> 'some.path.module'"""
    if not fullpath.endswith(".py"):
        raise ValueError("Expected a .py filename, got '{}'".format(fullpath))
    rel = ".{}".format(os.path.sep)  # ./
    if fullpath.startswith(rel):
        fullpath = fullpath[len(rel) :]
    fullpath = fullpath[:-3]  # remove .py
    return fullpath.replace(os.path.sep, ".")


def split_module_name(m):
    """'fully.qualified.name' -> ('fully.qualified', 'name')"""
    k = m.rfind(".")
    if k == -1:
        return ("", m)
    return (m[:k], m[(k + 1) :])


# blacklist = (".git", "build", "dist", "test")
# def find_py_files(basedir):
#     py_files = []
#     for root, dirs, files in os.walk(basedir):
#         for x in blacklist:  # don't visit blacklisted dirs
#             if x in dirs:
#                 dirs.remove(x)
#         for filename in files:
#             if filename.endswith(".py"):
#                 fullpath = os.path.join(root, filename)
#                 py_files.append(fullpath)
#     return py_files


def resolve(current_module, target_module, level):
    """Return fully qualified name of the target_module in an import.

    If level == 0, the import is absolute, hence target_module is already the
    fully qualified name (and will be returned as-is).

    Relative imports (level > 0) are resolved using current_module as the
    starting point. Usually this is good enough (especially if you analyze your
    project by invoking modvis in its top-level directory).

    For the exact implications, see the section "Import sibling packages" in:
        https://alex.dzyoba.com/blog/python-import/
    and this SO discussion:
        https://stackoverflow.com/questions/14132789/relative-imports-for-the-billionth-time
    """
    if level < 0:
        raise ValueError("Relative import level must be >= 0, got {}".format(level))
    if level == 0:  # absolute import
        return target_module
    # level > 0 (let's have some simplistic support for relative imports)
    if level > current_module.count(".") + 1:  # foo.bar.baz -> max level 3, pointing to top level
        raise ValueError("Relative import level {} too large for module name {}".format(level, current_module))
    base = current_module
    for _ in range(level):
        k = base.rfind(".")
        if k == -1:
            base = ""
            break
        base = base[:k]
    return ".".join((base, target_module))


class ImportVisitor(ast.NodeVisitor):
    def __init__(self, filenames, logger):
        self.modules = {}  # modname: {dep0, dep1, ...}
        self.fullpaths = {}  # modname: fullpath
        self.logger = logger
        self.analyze(filenames)

    def analyze(self, filenames):
        for fullpath in filenames:
            with open(fullpath, "rt", encoding="utf-8") as f:
                content = f.read()
            m = filename_to_module_name(fullpath)
            self.current_module = m
            self.fullpaths[m] = fullpath
            self.visit(ast.parse(content, fullpath))

    def add_dependency(self, target_module):  # source module is always self.current_module
        m = self.current_module
        if m not in self.modules:
            self.modules[m] = set()
        self.modules[m].add(target_module)
        # Just in case the target (or one or more of its parents) is a package
        # (we don't know that), add a dependency on the relevant __init__ module.
        #
        # If there's no matching __init__ (either no __init__.py provided, or
        # the target is just a module), this is harmless - we just generate a
        # spurious dependency on a module that doesn't even exist.
        #
        # Since nonexistent modules are not in the analyzed set (i.e. do not
        # appear as keys of self.modules), prepare_graph will ignore them.
        #
        # TODO: This would be a problem for a simple plain-text output that doesn't use the graph.
        modpath = target_module.split(".")
        for k in range(1, len(modpath) + 1):
            base = ".".join(modpath[:k])
            possible_init = base + ".__init__"
            if possible_init != m:  # will happen when current_module is somepackage.__init__ itself
                self.modules[m].add(possible_init)
                self.logger.debug("    added possible implicit use of '{}'".format(possible_init))

    def visit_Import(self, node):
        self.logger.debug(
            "{}:{}: Import {}".format(self.current_module, node.lineno, [alias.name for alias in node.names])
        )
        for alias in node.names:
            self.add_dependency(alias.name)  # alias.asname not relevant for our purposes

    def visit_ImportFrom(self, node):
        # from foo import some_symbol
        if node.module:
            self.logger.debug(
                "{}:{}: ImportFrom '{}', relative import level {}".format(
                    self.current_module, node.lineno, node.module, node.level
                )
            )
            absname = resolve(self.current_module, node.module, node.level)
            if node.level > 0:
                self.logger.debug("    resolved relative import to '{}'".format(absname))
            self.add_dependency(absname)

        # from . import foo  -->  module = None; now the **names** refer to modules
        else:
            for alias in node.names:
                self.logger.debug(
                    "{}:{}: ImportFrom '{}', target module '{}', relative import level {}".format(
                        self.current_module, node.lineno, "." * node.level, alias.name, node.level
                    )
                )
                absname = resolve(self.current_module, alias.name, node.level)
                if node.level > 0:
                    self.logger.debug("    resolved relative import to '{}'".format(absname))
                self.add_dependency(absname)

    # --------------------------------------------------------------------------------

    def detect_cycles(self):
        """Postprocessing. Detect import cycles.

        Return format is `[(prefix, cycle), ...]` where `prefix` is the
        non-cyclic prefix of the import chain, and `cycle` contains only
        the cyclic part (where the first and last elements are the same).
        """
        cycles = []

        def walk(m, seen=None, trace=None):
            trace = (trace or []) + [m]
            seen = seen or set()
            if m in seen:
                cycles.append(trace)
                return
            seen = seen | {m}
            deps = self.modules[m]
            for d in sorted(deps):
                if d in self.modules:
                    walk(d, seen, trace)

        for root in sorted(self.modules):
            walk(root)

        # For each detected cycle, report the non-cyclic prefix and the cycle separately
        out = []
        for cycle in cycles:
            offender = cycle[-1]
            k = cycle.index(offender)
            out.append((cycle[:k], cycle[k:]))
        return out

    def prepare_graph(self):  # same format as in pyan.analyzer
        """Postprocessing. Prepare data for pyan.visgraph for graph file generation."""
        self.nodes = {}  # Node name: list of Node objects (in possibly different namespaces)
        self.uses_edges = {}
        # we have no defines_edges, which doesn't matter as long as we don't enable that option in visgraph.

        # TODO: Right now we care only about modules whose files we read.
        # TODO: If we want to include in the graph also targets that are not in the analyzed set,
        # TODO: then we could create nodes also for the modules listed in the *values* of self.modules.
        for m in self.modules:
            ns, mod = split_module_name(m)
            package = os.path.dirname(self.fullpaths[m])
            # print("{}: ns={}, mod={}, fn={}".format(m, ns, mod, fn))
            # HACK: The `filename` attribute of the node determines the visual color.
            # HACK: We are visualizing at module level, so color by package.
            # TODO: If we are analyzing files from several projects in the same run,
            # TODO: it could be useful to decide the hue by the top-level directory name
            # TODO: (after the './' if any), and lightness by the depth in each tree.
            # TODO: This would be most similar to how Pyan does it for functions/classes.
            n = pyan.node.Node(namespace=ns, name=mod, ast_node=None, filename=package, flavor=pyan.node.Flavor.MODULE)
            n.defined = True
            # Pyan's analyzer.py allows several nodes to share the same short name,
            # which is used as the key to self.nodes; but we use the fully qualified
            # name as the key. Nevertheless, visgraph expects a format where the
            # values in the visitor's `nodes` attribute are lists.
            self.nodes[m] = [n]

        def add_uses_edge(from_node, to_node):
            if from_node not in self.uses_edges:
                self.uses_edges[from_node] = set()
            self.uses_edges[from_node].add(to_node)

        for m, deps in self.modules.items():
            for d in deps:
                n_from = self.nodes.get(m)
                n_to = self.nodes.get(d)
                if n_from and n_to:
                    add_uses_edge(n_from[0], n_to[0])

        # sanity check output
        for m, deps in self.uses_edges.items():
            assert m.get_name() in self.nodes
            for d in deps:
                assert d.get_name() in self.nodes


def main():
    usage = """usage: %prog FILENAME... [--dot|--tgf|--yed]"""
    desc = "Analyse one or more Python source files and generate an approximate module dependency graph."
    parser = OptionParser(usage=usage, description=desc)
    parser.add_option("--dot", action="store_true", default=False, help="output in GraphViz dot format")
    parser.add_option("--tgf", action="store_true", default=False, help="output in Trivial Graph Format")
    parser.add_option("--yed", action="store_true", default=False, help="output in yEd GraphML Format")
    parser.add_option("-f", "--file", dest="filename", help="write graph to FILE", metavar="FILE", default=None)
    parser.add_option("-l", "--log", dest="logname", help="write log to LOG", metavar="LOG")
    parser.add_option("-v", "--verbose", action="store_true", default=False, dest="verbose", help="verbose output")
    parser.add_option(
        "-V",
        "--very-verbose",
        action="store_true",
        default=False,
        dest="very_verbose",
        help="even more verbose output (mainly for debug)",
    )
    parser.add_option(
        "-c",
        "--colored",
        action="store_true",
        default=False,
        dest="colored",
        help="color nodes according to namespace [dot only]",
    )
    parser.add_option(
        "-g",
        "--grouped",
        action="store_true",
        default=False,
        dest="grouped",
        help="group nodes (create subgraphs) according to namespace [dot only]",
    )
    parser.add_option(
        "-e",
        "--nested-groups",
        action="store_true",
        default=False,
        dest="nested_groups",
        help="create nested groups (subgraphs) for nested namespaces (implies -g) [dot only]",
    )
    parser.add_option(
        "-C",
        "--cycles",
        action="store_true",
        default=False,
        dest="cycles",
        help="detect import cycles and print report to stdout",
    )
    parser.add_option(
        "--dot-rankdir",
        default="TB",
        dest="rankdir",
        help=(
            "specifies the dot graph 'rankdir' property for "
            "controlling the direction of the graph. "
            "Allowed values: ['TB', 'LR', 'BT', 'RL']. "
            "[dot only]"
        ),
    )
    parser.add_option(
        "-a", "--annotated", action="store_true", default=False, dest="annotated", help="annotate with module location"
    )

    options, args = parser.parse_args()
    filenames = [fn2 for fn in args for fn2 in glob(fn, recursive=True)]
    if len(args) == 0:
        parser.error("Need one or more filenames to process")

    if options.nested_groups:
        options.grouped = True

    graph_options = {
        "draw_defines": False,  # we have no defines edges
        "draw_uses": True,
        "colored": options.colored,
        "grouped_alt": False,
        "grouped": options.grouped,
        "nested_groups": options.nested_groups,
        "annotated": options.annotated,
    }

    # TODO: use an int argument for verbosity
    logger = logging.getLogger(__name__)
    if options.very_verbose:
        logger.setLevel(logging.DEBUG)
    elif options.verbose:
        logger.setLevel(logging.INFO)
    else:
        logger.setLevel(logging.WARN)
    logger.addHandler(logging.StreamHandler())
    if options.logname:
        handler = logging.FileHandler(options.logname)
        logger.addHandler(handler)

    # run the analysis
    v = ImportVisitor(filenames, logger)

    # Postprocessing: detect import cycles
    #
    # NOTE: Because this is a static analysis, it doesn't care about the order
    # the code runs in any particular invocation of the software. Every
    # analyzed module is considered as a possible entry point to the program,
    # and all cycles (considering *all* possible branches *at any step* of
    # *each* import chain) will be mapped recursively.
    #
    # Obviously, this easily leads to a combinatoric explosion. In a mid-size
    # project (~20k SLOC), the analysis may find thousands of unique import
    # cycles, most of which are harmless.
    #
    # Many cycles appear due to package A importing something from package B
    # (possibly from one of its submodules) and vice versa, when both packages
    # have an __init__ module. If they don't actually try to import any names
    # that only become defined after the init has finished running, it's
    # usually fine.
    #
    # (Init modules often import names from their submodules to the package's
    # top-level namespace; those names can be reliably accessed only after the
    # init module has finished running. But importing names directly from the
    # submodule where they are defined is fine also during the init.)
    #
    # But if your program is crashing due to a cyclic import, you already know
    # in any case *which* import cycle is causing it, just by looking at the
    # stack trace. So this analysis is just extra information that says what
    # other cycles exist, if any.
    if options.cycles:
        cycles = v.detect_cycles()
        if not cycles:
            print("No import cycles detected.")
        else:
            unique_cycles = set()
            for prefix, cycle in cycles:
                unique_cycles.add(tuple(cycle))
            print("Detected the following import cycles (n_results={}).".format(len(unique_cycles)))

            def stats():
                lengths = [len(x) - 1 for x in unique_cycles]  # number of modules in the cycle

                def mean(lst):
                    return sum(lst) / len(lst)

                def median(lst):
                    tmp = list(sorted(lst))
                    n = len(lst)
                    if n % 2 == 1:
                        return tmp[n // 2]  # e.g. tmp[5] if n = 11
                    else:
                        return (tmp[n // 2 - 1] + tmp[n // 2]) / 2  # e.g. avg of tmp[4] and tmp[5] if n = 10

                return min(lengths), mean(lengths), median(lengths), max(lengths)

            print(
                "Number of modules in a cycle: min = {}, average = {:0.2g}, median = {:0.2g}, max = {}".format(*stats())
            )
            for c in sorted(unique_cycles):
                print("    {}".format(c))

    # # we could generate a plaintext report like this (with caveats; see TODO above)
    # ms = v.modules
    # for m in sorted(ms):
    #     print(m)
    #     for d in sorted(ms[m]):
    #         print("    {}".format(d))

    # Postprocessing: format graph report
    make_graph = options.dot or options.tgf or options.yed
    if make_graph:
        v.prepare_graph()
        # print(v.nodes, v.uses_edges)
        graph = pyan.visgraph.VisualGraph.from_visitor(v, options=graph_options, logger=logger)

    if options.dot:
        writer = pyan.writers.DotWriter(
            graph, options=["rankdir=" + options.rankdir], output=options.filename, logger=logger
        )
    if options.tgf:
        writer = pyan.writers.TgfWriter(graph, output=options.filename, logger=logger)
    if options.yed:
        writer = pyan.writers.YedWriter(graph, output=options.filename, logger=logger)
    if make_graph:
        writer.run()


if __name__ == "__main__":
    main()