Commit a0465fa2, authored by James Long, committed by Commit Bot

Count cycles in Android package dependency graphs

Add a new script that counts, and optionally outputs, the cycles in
package graphs. A maximum cycle length must be supplied, ideally around 5
or 6 to keep the script runtime low, since the number of cycles grows
exponentially with the cycle length.

For more detailed usage, see `./count_cycles.py -h`.

Bug: 1106484
Change-Id: I01923eaab24eb3c6641a23039c80f60a0bc0e212
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2303054
Commit-Queue: James Long <yjlong@google.com>
Reviewed-by: Samuel Huang <huangs@chromium.org>
Reviewed-by: Mohamed Heikal <mheikal@chromium.org>
Cr-Commit-Position: refs/heads/master@{#791996}
parent d6265a0a
#!/usr/bin/env python3
# Copyright 2020 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Command-line tool to enumerate cycles in Graph structures."""
import argparse
import collections
from typing import Dict, List
import serialization
import graph
def find_cycles_from_node(
        start_node: graph.Node,
        max_cycle_length: int,
        node_to_id: Dict[graph.Node, int],
) -> List[List[List[graph.Node]]]:
    """Enumerates every cycle that begins and ends at |start_node|.

    The search only walks through nodes whose ID is >= the ID of
    |start_node|. When this function is called once per node of a graph, that
    restriction makes each cycle show up exactly once: a cycle whose lowest-ID
    node is n is found by the call on n (all of its nodes have ID >= n), and it
    cannot be found by a call on any higher-ID node because n is then outside
    the search space.

    The search recursively follows outbound edges from |start_node| up to a
    bounded depth. It resembles DFS, but a node is only off-limits while it is
    on the current path, so the same node may be reached again along a
    different path. A classic DFS (each node visited at most once) would miss
    cycles, e.g. in this graph when starting at a:

        a <-> b <-> c
        ^           ^
        |           |
        +-----------+

        DFS(a)
          DFS(b)
            DFS(a) (cycle aba, return)
            DFS(c)
              DFS(b) (already seen, return)
              DFS(a) (cycle abca, return)
          DFS(c) (already seen, return)

    Because DFS(c) cannot proceed the second time, the cycles aca and acba
    would be missed.

    Args:
        start_node: The node every returned cycle starts and ends at. Only
            nodes with ID >= this node's ID are considered.
        max_cycle_length: The maximum length of cycles to be found.
        node_to_id: A map from a Node to a generated ID.

    Returns:
        A list of length |max_cycle_length| + 1 whose entry i holds all
        cycles of length i that were found. Each cycle is a list of nodes
        that starts and ends with |start_node|, so a cycle of length i has
        i + 1 entries.
    """
    lowest_allowed_id = node_to_id[start_node]
    cycles_by_length = [[] for _ in range(max_cycle_length + 1)]

    # |path| holds the nodes leading to (but excluding) the node currently
    # being expanded; |nodes_on_path| mirrors it for fast membership tests.
    path = []
    nodes_on_path = set()

    def visit(node: graph.Node, depth: int):
        next_depth = depth + 1
        for successor in node.outbound:
            if node == successor:
                # Self-loops never count as cycles.
                continue
            if node_to_id[successor] < lowest_allowed_id:
                # Nodes with an ID below the start are out of scope.
                continue
            if successor == start_node:
                # The path closes back on the start: record the cycle.
                cycles_by_length[next_depth].append(
                    path + [node, start_node])
                continue
            # A successor already on the path would close a cycle that does
            # not start at |start_node|, which is not this call's job to
            # report. Going deeper is also pointless once the depth limit
            # would be exceeded.
            if successor in nodes_on_path or next_depth >= max_cycle_length:
                continue
            path.append(node)
            nodes_on_path.add(node)
            visit(successor, next_depth)
            nodes_on_path.discard(node)
            path.pop()

    visit(start_node, 0)
    return cycles_by_length
def find_cycles(base_graph: graph.Graph,
                max_cycle_length: int) -> List[List[List[graph.Node]]]:
    """Finds all cycles in the graph within a certain length.

    The algorithm is as such: number the nodes by their sorted order. For each
    node i, in increasing order, find all cycles starting and ending at node i
    using only nodes with numbers >= i (see find_cycles_from_node). Taking the
    union of the results gives all relevant cycles in the graph.

    Args:
        base_graph: The graph to search. Its nodes must be sortable.
        max_cycle_length: The maximum length of cycles to be found.

    Returns:
        A list |cycles| of length |max_cycle_length| + 1, where cycles[i]
        contains all cycles of length i.
    """
    sorted_base_graph_nodes = sorted(base_graph.nodes)

    # Map every node to a unique number, since the algorithm needs some way to
    # decide when a node is 'bigger'. Nodes with a lower number are processed
    # first, which influences the output cycles. For example, the cycle abca
    # is also valid as the cycle bcab or cabc. By numbering node a lower than
    # b and c, it is guaranteed that the cycle will be output as abca.
    node_to_id = {
        node: generated_node_id
        for generated_node_id, node in enumerate(sorted_base_graph_nodes)
    }

    cycles = [[] for _ in range(max_cycle_length + 1)]
    for start_node in sorted_base_graph_nodes:
        start_node_cycles = find_cycles_from_node(start_node, max_cycle_length,
                                                  node_to_id)
        # Merge this node's cycles into the overall result, bucket by bucket.
        for cycle_length, cycle_list in enumerate(start_node_cycles):
            cycles[cycle_length].extend(cycle_list)
    return cycles
def main():
    """Counts the cycles of bounded length in a package dependency graph.

    Reads the JSON dependency graph given by --file, finds all cycles in its
    package graph up to --cycle-length, prints the total and per-length
    counts, and, if --output was given, writes the sorted cycles grouped by
    length to that file.
    """
    parser = argparse.ArgumentParser(
        description='Given a JSON dependency graph, count the number of cycles '
        'in the package graph.')
    required_group = parser.add_argument_group('required arguments')
    required_group.add_argument(
        '-f',
        '--file',
        required=True,
        help='Path to the JSON file containing the dependency graph. '
        'See the README on how to generate this file.')
    required_group.add_argument(
        '-l',
        '--cycle-length',
        type=int,
        required=True,
        help='The maximum length of cycles to find, at most 5 or 6 to keep the '
        'script runtime low.')
    parser.add_argument(
        '-o',
        '--output',
        type=argparse.FileType('w'),
        help='Path to the file to write the list of cycles to.')
    cli_args = parser.parse_args()

    _, package_graph = serialization.load_class_and_package_graphs_from_file(
        cli_args.file)
    cycles_by_length = find_cycles(package_graph, cli_args.cycle_length)

    # Lengths 0 and 1 are skipped: self-loops are disallowed, so no cycle can
    # be that short. Pair each remaining bucket with its cycle length.
    reportable = list(enumerate(cycles_by_length[2:], 2))

    total_found = sum(len(found) for _, found in reportable)
    print(f'Found {total_found} cycles.')
    for length, found in reportable:
        print(f'Found {len(found)} cycles of length {length}.')

    if cli_args.output is None:
        return

    print(f'Dumping cycles to {cli_args.output.name}.')
    with cli_args.output as sink:
        for length, found in reportable:
            sink.write(f'Cycles of length {length}:\n')
            rendered = sorted(' > '.join(node.name for node in cycle)
                              for cycle in found)
            sink.write('\n'.join(rendered))
            sink.write('\n')


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
# Copyright 2020 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Unit tests for dependency_analysis.count_cycles."""
import itertools
import unittest
import count_cycles
import graph
class TestFindCycles(unittest.TestCase):
    """Unit tests for find_cycles."""
    KEY_0 = '0'
    KEY_1 = '1'
    KEY_2 = '2'
    KEY_3 = '3'
    MAX_CYCLE_LENGTH = 10

    def _assert_cycle_counts(self, edges, max_cycle_length, expected_cycles):
        """Builds a graph from |edges| and checks the per-length cycle counts.

        Args:
            edges: An iterable of (begin_key, end_key) pairs, added in order.
            max_cycle_length: The limit passed on to find_cycles.
            expected_cycles: A map from cycle length to the expected number of
                cycles; lengths that are absent are expected to have none.
        """
        test_graph = graph.Graph()
        for begin_key, end_key in edges:
            test_graph.add_edge_if_new(begin_key, end_key)
        res = count_cycles.find_cycles(test_graph, max_cycle_length)
        for cycle_length, cycles in enumerate(res):
            self.assertEqual(len(cycles), expected_cycles.get(cycle_length, 0))

    def test_no_self_cycles(self):
        """Tests that self-cycles are not considered.

        0 <---+
        ^     |
        |     v
        +---> 1 (plus, 0 and 1 have self-cycles)

        (one cycle, 010)
        """
        edges = [
            (self.KEY_0, self.KEY_1),
            (self.KEY_1, self.KEY_0),
            (self.KEY_0, self.KEY_0),
            (self.KEY_1, self.KEY_1),
        ]
        self._assert_cycle_counts(edges, self.MAX_CYCLE_LENGTH, {2: 1})

    def test_big_cycle(self):
        """Tests using a graph with one big cycle.

        0 -> 1
        ^    |
        |    v
        3 <- 2

        (one cycle, 01230)
        """
        edges = [
            (self.KEY_0, self.KEY_1),
            (self.KEY_1, self.KEY_2),
            (self.KEY_2, self.KEY_3),
            (self.KEY_3, self.KEY_0),
        ]
        self._assert_cycle_counts(edges, self.MAX_CYCLE_LENGTH, {4: 1})

    def test_multiple_cycles(self):
        """Tests using a graph with multiple cycles.

        0 -> 1
        ^    ^
        |    v
        +--- 2 -> 3

        (two cycles, 0120 and 121)
        """
        edges = [
            (self.KEY_0, self.KEY_1),
            (self.KEY_1, self.KEY_2),
            (self.KEY_2, self.KEY_0),
            (self.KEY_2, self.KEY_1),
            (self.KEY_2, self.KEY_3),
        ]
        self._assert_cycle_counts(edges, self.MAX_CYCLE_LENGTH, {2: 1, 3: 1})

    def test_complete_graph(self):
        """Tests using a complete graph on 4 nodes.

        +------------+
        v            |
        0 <> 1 <--+  |
        ^    ^    |  |
        |    v    v  |
        +--> 2 <> 3 <+

        (20 cycles,
         010, 020, 030, 121, 131, 232,
         0120, 0130, 0210, 0230, 0310, 0320, 1231, 1321,
         01230, 01320, 02130, 02310, 03120, 03210)
        """
        all_keys = [self.KEY_0, self.KEY_1, self.KEY_2, self.KEY_3]
        self._assert_cycle_counts(itertools.permutations(all_keys, 2),
                                  self.MAX_CYCLE_LENGTH, {
                                      2: 6,
                                      3: 8,
                                      4: 6
                                  })

    def test_complete_graph_restricted_length(self):
        """Tests using a complete graph on 4 nodes with maximum cycle length 2.

        +------------+
        v            |
        0 <> 1 <--+  |
        ^    ^    |  |
        |    v    v  |
        +--> 2 <> 3 <+

        (6 cycles, 010, 020, 030, 121, 131, 232)
        """
        all_keys = [self.KEY_0, self.KEY_1, self.KEY_2, self.KEY_3]
        self._assert_cycle_counts(itertools.permutations(all_keys, 2), 2,
                                  {2: 6})
......@@ -3,6 +3,7 @@
# found in the LICENSE file.
"""Utility classes (and functions, in the future) for graph operations."""
import functools
from typing import Dict, List, Optional, Tuple
......@@ -18,6 +19,7 @@ def sorted_edges_by_name(edges):
return sorted(edges, key=lambda edge: (edge[0].name, edge[1].name))
@functools.total_ordering
class Node(object): # pylint: disable=useless-object-inheritance
"""A node/vertex in a directed graph."""
def __init__(self, unique_key: str):
......@@ -33,6 +35,9 @@ class Node(object): # pylint: disable=useless-object-inheritance
def __eq__(self, other: 'Node'): # pylint: disable=missing-function-docstring
return self._unique_key == other._unique_key
def __lt__(self, other: 'Node'): # pylint: disable=missing-function-docstring
return self._unique_key < other._unique_key
def __hash__(self): # pylint: disable=missing-function-docstring
return hash(self._unique_key)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment