Commit cb2757a7 authored by Mohamed Heikal's avatar Mohamed Heikal Committed by Commit Bot

Update to suggest_owners.py script

* Fixes a bunch of bugs, I should probably add an automated test.
* script now also keeps track of number of line modifications rather than
  just commits.
* Because of asking git-log for number of lines modified, it takes 10x the
  amount of time. Might make sense to add option to disable.
* script now tries to cache git-log result.
* script now ignores trivial directories that are empty except for one child
  directory.
* Fix handling of merge commits.
* Improve parsing of owners files (now understands file:// import and set
  noparent directives).
* Now also shows owners commit stats for reference.

Change-Id: I23bcf0645b3b2d330cb781ba7daa15ac9cfcb695
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1810256Reviewed-by: default avatarMohamed Heikal <mheikal@chromium.org>
Reviewed-by: default avatarDirk Pranke <dpranke@chromium.org>
Commit-Queue: Mohamed Heikal <mheikal@chromium.org>
Cr-Commit-Position: refs/heads/master@{#698975}
parent ac80349d
per-file move_source_file.py=satorux@chromium.org per-file move_source_file.py=satorux@chromium.org
per-file suggest_owners.py=mheikal@chromium.org
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
import argparse import argparse
import subprocess import subprocess
import pickle
import os import os
from os import path from os import path
from datetime import date, timedelta from datetime import date, timedelta
...@@ -12,166 +13,340 @@ from collections import namedtuple, defaultdict, Counter ...@@ -12,166 +13,340 @@ from collections import namedtuple, defaultdict, Counter
Commit = namedtuple('Commit', ['hash', 'author', 'commit_date', 'dirs']) Commit = namedtuple('Commit', ['hash', 'author', 'commit_date', 'dirs'])
# Takes a git command arguments and runs it returning the output (throwing an # dict mapping each subdirectory and author to the number of their commits and
# exception on error). # modifications in that directory
DIRECTORY_AUTHORS = defaultdict(dict)
# cache for directory owners for memoisation purposes
OWNERS_CACHE = {}
# filename for pickle cache
CACHE_FILENAME = 'suggest_owners.cache'
def _RunGitCommand(options, cmd_args): def _RunGitCommand(options, cmd_args):
repo_path = os.path.join(options.repo_path, '.git') repo_path = path.join(options.repo_path, '.git')
cmd = ['git', '--git-dir', repo_path] + cmd_args cmd = ['git', '--git-dir', repo_path] + cmd_args
print '>', ' '.join(cmd)
return subprocess.check_output(cmd) return subprocess.check_output(cmd)
# return true if this author is a chromium dev and is not a bot. Pretty naive, def _ValidAuthor(author):
# looks for roller in the username.
def _IsValidAuthor(author):
return author.find('@chromium.org') > -1 and author.find('roller') == -1 return author.find('@chromium.org') > -1 and author.find('roller') == -1
# Get a list of commits from the repo and return a nested dictionary # Returns additions/deletions by a commit to a directory (and its descendants).
# directory -> author -> num_commits def getEditsForDirectory(commit, directory):
def processAllCommits(options): additions = deletions = 0
for commit_directory, (directory_additions, directory_deletions) \
in commit.dirs.items():
# check if commit_directory is same as or a descendant of directory
if isSubDirectory(directory, commit_directory):
additions += directory_additions
deletions += directory_deletions
return additions, deletions
# This propagates a commit touching a directory to also be touching all
# ancesstor directories.
def _PropagateCommit(options, commit):
touched_dirs = set()
# first get all the touched dirs and their ancestors
for directory in commit.dirs.iterkeys():
while directory != '':
touched_dirs.add(directory)
# get the parent directory
directory = path.dirname(directory)
# loop over them and calculate the edits per directory
for directory in touched_dirs:
author_commits, author_additions, author_deletions = \
DIRECTORY_AUTHORS[directory].get(commit.author, (0,0,0))
directory_additions, directory_deletions = \
getEditsForDirectory(commit, directory)
DIRECTORY_AUTHORS[directory][commit.author] = \
(author_commits + 1, author_additions + directory_additions,
author_deletions + directory_deletions)
# Checks if child_directory is same as or below parent_directory. For some
# reason the os.path module does not have this functionality.
def isSubDirectory(parent_directory, child_directory):
parent_directory = parent_directory + '/'
child_directory = child_directory + '/'
return child_directory.startswith(parent_directory)
def _GetGitLogCmd(options):
# TODO(mheikal): git-log with --numstat vs --name-only takes 10x the time to
# complete. It takes >15 mins for git log --numstat to return the 1 year git
# history of the full repo. Should probably add a script flag to switch off
# keeping track of number of modifications per commit.
date_limit = date.today() - timedelta(days=options.days_ago) date_limit = date.today() - timedelta(days=options.days_ago)
format_string = "%h,%ae,%cI" format_string = "%h,%ae,%cI"
cmd_args = [ cmd_args = [
'log', 'log',
'--since', date_limit.isoformat(), '--since', date_limit.isoformat(),
'--name-only', '--numstat',
'--pretty=format:%s'%format_string, '--pretty=format:%s'%format_string,
] ]
# has to be last arg # has to be last arg
if options.subdirectory: if options.subdirectory:
cmd_args += ['--', options.subdirectory] cmd_args += ['--', options.subdirectory]
return cmd_args
def _ParseCommitLine(line):
commit_hash, author, commit_date = line.split(",")
return Commit(hash=commit_hash, author=author, commit_date=commit_date,
dirs={})
output = _RunGitCommand(options, cmd_args) def _ParseFileStatsLine(current_commit, line):
try:
additions, deletions, filepath = line.split('\t')
except ValueError:
return False
if additions == '-':
additions = 0
else:
additions = int(additions)
if deletions == '-':
deletions = 0
else:
deletions = int(deletions)
dir_path = path.dirname(filepath)
commit_additions, commit_deletions = \
current_commit.dirs.get(dir_path, (0,0))
current_commit.dirs[dir_path] = (
additions + commit_additions, deletions + commit_deletions)
return True
def processAllCommits(options):
if not options.subdirectory and options.days_ago > 100:
print ('git log for your query might take > 5 minutes, limit by a '
'subdirectory or reduce the number of days of history to low double '
'digits to make this faster. There is no progress indicator, it is '
'all waiting for single git log to finish.')
output = _RunGitCommand(options, _GetGitLogCmd(options))
current_commit = None current_commit = None
author = None
directory_authors = defaultdict(Counter)
for line in output.splitlines(): for line in output.splitlines():
if current_commit is None: if current_commit is None:
commit_hash, author, commit_date = line.split(",") current_commit = _ParseCommitLine(line)
current_commit = Commit(hash=commit_hash, author=author,
commit_date=commit_date, dirs=set())
else: else:
if line == '': # all commit details read if line == '': # all commit details read
if _IsValidAuthor(current_commit.author): if _ValidAuthor(current_commit.author):
for directory in current_commit.dirs: _PropagateCommit(options, current_commit)
if directory == '':
continue
directory_authors[directory][author] += 1
current_commit = None current_commit = None
else: else:
current_commit.dirs.add(os.path.dirname(line)) # Merge commits weird out git-log. If we fail to parse the line, then
return directory_authors # the last commit was a merge and this line is actually another commit
# description line.
if not _ParseFileStatsLine(current_commit, line):
# Return a list of owners for a given directory by reading OWNERS files in its current_commit = _ParseCommitLine(line)
# ancestors. The parsing of OWNERS files is pretty naive, it does not handle # process the final commit
# file imports. if _ValidAuthor(current_commit.author):
def _GetOwners(options, repo_subdir): _PropagateCommit(options, current_commit)
directory_path = os.path.join(options.repo_path, repo_subdir)
owners_path = os.path.join(directory_path, 'OWNERS')
owners = [] def _CountCommits(directory):
while directory_path != '': return sum(
if os.path.isfile(owners_path): [count for (count, _a, _d) in DIRECTORY_AUTHORS[directory].itervalues()])
with open(owners_path) as f:
owners.extend([line.strip() for line in f.readlines() if
line.find('@chromium.org') > -1]) def _GetOwnerLevel(options, author, directory):
directory_path = path.dirname(directory_path) sorted_owners = sorted(_GetOwners(options, directory), key=lambda (o,l): l)
owners_path = os.path.join(directory_path, 'OWNERS') for owner, level in sorted_owners:
if author == owner:
return level
else:
return -1
# Returns the owners for a repo subdirectory. This does not understand per-file
# directives.
# TODO(mheikal): use depot_tools owners.py for parsing owners files.
def _GetOwners(options, directory_path):
if directory_path in OWNERS_CACHE:
return OWNERS_CACHE[directory_path]
owners_path = path.join(options.repo_path, directory_path, 'OWNERS')
owners = set()
parent_dir = directory_path
owner_level = 0
while parent_dir != '':
if path.isfile(owners_path):
parsed_owners, noparent = _ParseOwnersFile(options, owners_path)
owners.update([(owner, owner_level) for owner in parsed_owners])
owner_level += 1
if noparent:
break
parent_dir = path.dirname(parent_dir)
owners_path = path.join(parent_dir, 'OWNERS')
OWNERS_CACHE[directory_path] = set(owners)
return owners return owners
# Return the number of commits for a given directory # Parse an OWNERS file, returns set of owners and if the file sets noparent
def _CountDirectoryCommits(directory_authors, directory): def _ParseOwnersFile(options, filepath):
return sum(directory_authors[directory].values()) owners = set()
noparent = False
with open(filepath) as f:
for line in f.readlines():
line = line.strip()
# The script deals with directories so per-files are ignored.
if line == '' or line[0] == '#' or line.startswith('per-file'):
continue
if line.startswith('file://'):
relpath = line[7:]
abspath = path.join(options.repo_path, relpath)
parsed_owners, _ = _ParseOwnersFile(options, abspath)
owners.update(parsed_owners)
if line == 'set noparent':
noparent = True
index = line.find('@chromium.org')
if index > -1:
owners.add(line[:index + len('@chromium.org')])
return owners, noparent
# Given a directory merge all its children's commits into its own, then delete # Trivial directories are ones that just contain a single child subdir and
# each child subdirectory's entry if it has too few commits. # nothing else.
def _GroupToParentDirectory(options, directory_authors, parent): def _IsTrivialDirectory(options, repo_subdir):
global DIRECTORY_AUTHORS try:
parent_path = path.join(options.repo_path, parent) return len(os.listdir(path.join(options.repo_path, repo_subdir))) == 1
except OSError:
for entry in os.listdir(parent_path): # directory no longer exists
if path.isdir(os.path.join(parent_path, entry)): return False
entry_dir = path.join(parent, entry)
directory_authors[parent].update(directory_authors[entry_dir])
commit_count = _CountDirectoryCommits(directory_authors, entry_dir) def computeSuggestions(options):
if commit_count < options.dir_commit_limit: directory_suggestions = []
directory_authors.pop(entry_dir) for directory, authors in sorted(
DIRECTORY_AUTHORS.iteritems(), key=lambda (d, a): d):
if _IsTrivialDirectory(options, directory):
# Merge directories with too few commits into their parent directory. This
# method changes the directory_authors dict in-place.
def mergeDirectories(options, directory_authors):
changed = False
for directory in directory_authors.keys():
if not path.exists(path.join(options.repo_path, directory)):
del directory_authors[directory]
continue continue
num_commits = _CountDirectoryCommits(directory_authors, directory) if _CountCommits(directory) < options.dir_commit_limit:
if num_commits == 0:
continue continue
elif num_commits < options.dir_commit_limit: # skip suggestions for directories outside the passed in directory
parent = os.path.dirname(directory) if (options.subdirectory
_GroupToParentDirectory(options, directory_authors, parent) and not isSubDirectory(options.subdirectory, directory)):
changed = True continue
return changed # sort authors by descending number of commits
sorted_authors = sorted(authors.items(),
key=lambda (author, details): -details[0])
# keep only authors above the limit
suggestions = [(a,c) for a,c in sorted_authors if \
a not in options.ignore_authors \
and c[0] >= options.author_cl_limit]
directory_suggestions.append((directory, suggestions))
return directory_suggestions
# Retrieves a set of authors that should not be suggested for a directory def _PrintSettings(options):
def _GetIgnoredAuthors(options, repo_subdir): print ('Showing directories with at least ({}) commits in the last ({}) '
if options.ignore_authors: 'days.'.format(options.dir_commit_limit, options.days_ago))
ignored_authors = set(map(str.strip, options.ignore_authors.split(','))) print ('Showing top ({}) committers who have commited at least ({}) commits '
else: 'to the directory in the last ({}) days.'.format(
ignored_authors = set() options.max_suggestions, options.author_cl_limit,
ignored_authors.update(_GetOwners(options, repo_subdir)) options.days_ago))
return ignored_authors print '(owners+N) represents distance through OWNERS files for said owner\n'
# Prints out a list of suggested new owners for each directory with a high def printSuggestions(options, directory_suggestions):
# enough commit count. print '\nCommit stats:'
def outputSuggestions(options, directory_authors): _PrintSettings(options)
for directory, authors in sorted(directory_authors.iteritems()): for directory, suggestions in directory_suggestions:
commit_count = _CountDirectoryCommits(directory_authors, directory) print '{}: {} commits in the last {} days'.format(
if commit_count < options.dir_commit_limit: directory, _CountCommits(directory), options.days_ago)
continue non_owner_suggestions = 0
ignored_authors = _GetIgnoredAuthors(options, directory) for author, (commit_count, additions, deletions) in suggestions:
suggestions = [(a,c) for a,c in authors.most_common() owner_level = _GetOwnerLevel(options, author, directory)
if a not in ignored_authors and c >= options.author_cl_limit] if owner_level > -1:
print "%s: %d commits in the last %d days" % \ owner_string = ' (owner+{})'.format(owner_level)
(directory, commit_count, options.days_ago) else:
for author, commit_count in suggestions[:options.max_suggestions]: non_owner_suggestions +=1
print author, commit_count owner_string = ''
print '{}{}, commits: {}, additions:{}, deletions: {}'.format(
author, owner_string, commit_count, additions, deletions)
if non_owner_suggestions >= options.max_suggestions:
break
print print
# main 2.0 def _GetHeadCommitHash(options):
return _RunGitCommand(options, ['rev-parse', 'HEAD']).strip()
def _GetCacheMetadata(options):
return _GetHeadCommitHash(options), options.days_ago, options.subdirectory
def _IsCacheValid(options, metadata):
head_hash, days_ago, cached_subdirectory = metadata
if head_hash != _GetHeadCommitHash(options):
return False
if days_ago != options.days_ago:
return False
if (cached_subdirectory is not None
and not isSubDirectory(cached_subdirectory, options.subdirectory)):
return False
return True
def cacheProcessedCommits(options):
metadata = _GetCacheMetadata(options)
with open(CACHE_FILENAME, 'w') as f:
pickle.dump((metadata, DIRECTORY_AUTHORS), f)
def maybeRestoreProcessedCommits(options):
global DIRECTORY_AUTHORS
if not path.exists(CACHE_FILENAME):
return False
with open(CACHE_FILENAME) as f:
stored_metadata, cached_directory_authors = pickle.load(f)
if _IsCacheValid(options, stored_metadata):
print 'Loading from cache'
DIRECTORY_AUTHORS = cached_directory_authors
return True
else:
print 'Cache is stale or invalid, must rerun `git log`'
return False
def do(options): def do(options):
directory_authors = processAllCommits(options) if options.skip_cache or not maybeRestoreProcessedCommits(options):
while mergeDirectories(options, directory_authors): processAllCommits(options)
pass cacheProcessedCommits(options)
outputSuggestions(options, directory_authors) directory_suggestions = computeSuggestions(options)
printSuggestions(options, directory_suggestions)
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('repo_path') parser.add_argument('repo_path')
parser.add_argument('--days-ago', help='Number of days of history to search' parser.add_argument('--days-ago', type=int,
' through.', default=365) help='Number of days of history to search through.',
parser.add_argument('--subdirectory', help='Limit to this subdirectory') default=365, metavar='DAYS_AGO')
parser.add_argument('--ignore-authors', help='Ignore this comma separated' parser.add_argument('--subdirectory',
' list of authors') help='Limit suggestions to this subdirectory', default='')
parser.add_argument('--max-suggestions', help='Maximum number of suggested' parser.add_argument('--ignore-authors',
' authors per directory.', default=5) help='Ignore this comma separated list of authors')
parser.add_argument('--author-cl-limit', help='Do not suggest authors who' parser.add_argument('--max-suggestions', type=int, help='Maximum number of '
' have commited less than this to the directory.', 'suggested authors per directory.', default=5)
default=10) parser.add_argument('--author-cl-limit', type=int, help='Do not suggest '
parser.add_argument('--dir-commit-limit', help='Merge directories with less' 'authors who have commited less than this to the '
' than this number of commits into their parent' 'directory in the last DAYS_AGO days.', default=10)
' directory.', default=100) parser.add_argument('--dir-commit-limit', type=int, help='Skip directories '
'with less than this number of commits in the last '
'DAYS_AGO days.', default=100)
parser.add_argument('--skip-cache', action='store_true',
help='Do not read from cache.', default=False)
options = parser.parse_args() options = parser.parse_args()
if options.ignore_authors:
options.ignore_authors = set(
map(str.strip, options.ignore_authors.split(',')))
else:
options.ignore_authors = set()
do(options) do(options)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment