Commit f8af6334 authored by mnaganov's avatar mnaganov Committed by Commit bot

[Android WebView] Rewrite copyrights scanner in Python

This is to prepare for running the scanner as a presubmit check.

I have added some manual tests to make sure that the new scanner is
compatible with the old one. I plan to replace them with automated
unit tests once I start using presubmit-style input API objects that
are easily mockable.

BUG=343104

Review URL: https://codereview.chromium.org/622493004

Cr-Commit-Position: refs/heads/master@{#297828}
parent dc445f21
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Utilities for scanning source files to determine code authorship.
"""
import itertools
import os
import re
def FindFiles(root_dir, start_paths_list, excluded_dirs_list):
  """Similar to UNIX utility find(1), searches for files in the directories.

  Only source code files (recognized by their file name extension) are kept.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    start_paths_list: The list of paths to start search from. Each path can
      be a file or a directory.
    excluded_dirs_list: The list of directories to skip. A directory is
      skipped when its name, delimited by '/', occurs anywhere in the
      directory path relative to |root_dir|.
  Returns:
    The list of source code files found, relative to |root_dir|.
  """
  # Surround the directory names with separators so that only whole path
  # components can match. Entries that are empty after stripping separators
  # carry no directory name and are dropped.
  dirs_blacklist = [
      '/' + d.strip('/') + '/' for d in excluded_dirs_list if d.strip('/')]

  def IsBlacklistedDir(d):
    # |d| is a directory path relative to |root_dir|. It is delimited on both
    # sides before matching, otherwise an excluded directory would neither be
    # recognized at the very start of the path nor as the last component.
    delimited = '/' + d + '/'
    return any(item in delimited for item in dirs_blacklist)

  files_whitelist_re = re.compile(
      r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
      '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
      '|tex|mli?)$')
  files = []
  # Length of the |root_dir| prefix including its separator. Joining with ''
  # appends a separator only when one is missing, so an empty |root_dir| and
  # a |root_dir| that already ends with a separator are both handled.
  base_path_len = len(os.path.join(root_dir, ''))
  for path in start_paths_list:
    full_path = os.path.join(root_dir, path)
    if os.path.isfile(full_path):
      # Explicitly listed files obey the exclusion list too.
      if (files_whitelist_re.search(path) and
          not IsBlacklistedDir(os.path.dirname(path))):
        files.append(path)
      continue
    for dirpath, dirnames, filenames in os.walk(full_path):
      rel_dirpath = dirpath[base_path_len:]
      # Remove excluded subdirs for faster scanning. The list must be
      # modified in place for os.walk to honor the pruning.
      dirnames[:] = [
          item for item in dirnames
          if not IsBlacklistedDir(os.path.join(rel_dirpath, item))]
      # The directory check is repeated per file because the start path
      # itself may lie inside an excluded directory.
      if IsBlacklistedDir(rel_dirpath):
        continue
      for filename in filenames:
        filepath = os.path.join(rel_dirpath, filename)
        if files_whitelist_re.search(filepath):
          files.append(filepath)
  return files
python_multiline_string_double_re = re.compile(
r'"""[^"]*(?:"""|$)', flags=re.MULTILINE)
python_multiline_string_single_re = re.compile(
r"'''[^']*(?:'''|$)", flags=re.MULTILINE)
automatically_generated_re = re.compile(
r'(All changes made in this file will be lost'
'|DO NOT (EDIT|delete this file)'
'|Generated (at|automatically|data)'
'|Automatically generated'
'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re.IGNORECASE)
def _IsGeneratedFile(header):
header = header.upper()
if '"""' in header:
header = python_multiline_string_double_re.sub('', header)
if "'''" in header:
header = python_multiline_string_single_re.sub('', header)
# First do simple strings lookup to save time.
if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
return True
if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
'GENERATED' in header:
return automatically_generated_re.search(header)
return False
# Marker used by FindCopyrights in place of the copyrights list for a file
# that is recognized as automatically generated.
GENERATED_FILE = 'GENERATED FILE'
# Marker used by FindCopyrights in place of the copyrights list for a file
# in which no copyright statement has been found.
NO_COPYRIGHT = '*No copyright*'
class _CopyrightsScanner(object):
_c_comment_re = re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
_copyright_indicator = r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
_full_copyright_indicator_re = \
re.compile(r'(?:\W|^)' + _copyright_indicator + r'(?::\s*|\s+)(\w.*)$', \
re.IGNORECASE)
_copyright_disindicator_re = \
re.compile(r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', re.IGNORECASE)
def __init__(self):
self.max_line_numbers_proximity = 3
self.last_a_item_line_number = -200
self.last_b_item_line_number = -100
def _CloseLineNumbers(self, a, b):
return 0 <= a - b <= self.max_line_numbers_proximity
def MatchLine(self, line_number, line):
if '"' in line:
line = _CopyrightsScanner._c_comment_re.sub('', line)
upcase_line = line.upper()
# Record '(a)' and '(b)' last occurences in C++ comments.
# This is to filter out '(c)' used as a list item inside C++ comments.
# E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
cpp_comment_idx = upcase_line.find('//')
if cpp_comment_idx != -1:
if upcase_line.find('(A)') > cpp_comment_idx:
self.last_a_item_line_number = line_number
if upcase_line.find('(B)') > cpp_comment_idx:
self.last_b_item_line_number = line_number
# Fast bailout, uses the same patterns as _copyright_indicator regexp.
if not 'COPYRIGHT' in upcase_line and not 'COPR.' in upcase_line \
and not '\xc2\xa9' in upcase_line:
c_item_index = upcase_line.find('(C)')
if c_item_index == -1:
return None
if c_item_index > cpp_comment_idx and \
self._CloseLineNumbers(line_number,
self.last_b_item_line_number) and \
self._CloseLineNumbers(self.last_b_item_line_number,
self.last_a_item_line_number):
return None
copyr = None
m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
if m and \
not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
copyr = m.group(0)
# Prettify the authorship string.
copyr = re.sub(r'([,.])?\s*$/', '', copyr)
copyr = re.sub(self._copyright_indicator, '', copyr, flags=re.IGNORECASE)
copyr = re.sub(r'^\s+', '', copyr)
copyr = re.sub(r'\s{2,}', ' ', copyr)
copyr = re.sub(r'\\@', '@', copyr)
return copyr
def FindCopyrights(root_dir, files_to_scan):
  """Determines code authorship, and finds generated files.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    A list with one entry per scanned file, in the order of |files_to_scan|.
    Each entry is the list of copyright strings found in the file. For a
    generated file the entry is a list holding only the GENERATED_FILE
    string; for a file without copyright info it is a list holding only the
    NO_COPYRIGHT string.
  """
  results = []
  for file_name in files_to_scan:
    header_lines = []
    found = []
    scanner = _CopyrightsScanner()
    with open(os.path.join(root_dir, file_name), 'r') as source:
      for number, text in enumerate(source.readlines(), 1):
        # Only the top of the file is checked for "generated file" markers.
        if number <= 25:
          header_lines.append(text)
        match = scanner.MatchLine(number, text)
        if match:
          found.append(match)
    if _IsGeneratedFile(''.join(header_lines)):
      entry = [GENERATED_FILE]
    elif found:
      entry = found
    else:
      entry = [NO_COPYRIGHT]
    results.append(entry)
  return results
def FindCopyrightViolations(root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Generated files and files without any copyright statement are not
  considered to be violations.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of normalized names of the files that contain non-Chromium
    copyrights.
  """
  allowed_copyrights_re = re.compile(
      r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
      'All rights reserved.*)$')
  copyrights = FindCopyrights(root_dir, files_to_scan)
  offending_files = []
  # The builtin zip() is used instead of itertools.izip(), because the latter
  # does not exist in Python 3. The pairs produced are the same.
  for f, cs in zip(files_to_scan, copyrights):
    if cs[0] == GENERATED_FILE or cs[0] == NO_COPYRIGHT:
      continue
    # A single foreign copyright is enough to report the file.
    if not all(allowed_copyrights_re.match(c) for c in cs):
      offending_files.append(os.path.normpath(f))
  return offending_files
#!/usr/bin/perl -w
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Use: echo filename1.cc ... | find_copyrights.pl
# or: find_copyrights.pl list_file
# or: find_files.pl ... | find_copyrights.pl
use strict;
use warnings;
use File::Basename;
sub check_is_generated_file($);
sub start_copyright_parsing();
# Name of this script, used as a prefix of error messages.
my $progname = basename($0);
# Only this many first lines of a file are checked for "generated file"
# markers.
my $generated_file_scan_boundary = 25;
# Every input line is the name of a file to scan. For each file a line
# "<name>\t<result>" is printed to stdout.
while (<>) {
chomp;
my $file = $_;
my $file_header = '';
# Found copyrights, keyed by their lowercased text, so that entries which
# differ only in letter case are stored once.
my %copyrights;
open (F, "<$file") or die "$progname: Unable to access $file\n";
# A fresh parser closure is created for every file.
my $parse_copyright = start_copyright_parsing();
while (<F>) {
# $. is the number of the line just read from F.
$file_header .= $_ unless $. > $generated_file_scan_boundary;
my $copyright_match = $parse_copyright->($_, $.);
if ($copyright_match) {
$copyrights{lc("$copyright_match")} = "$copyright_match";
}
}
close(F);
my $copyright = join(" / ", sort values %copyrights);
print "$file\t";
# The "generated file" verdict takes precedence over any copyrights found.
if (check_is_generated_file($file_header)) {
print "GENERATED FILE";
} else {
print ($copyright or "*No copyright*");
}
print "\n";
}
# Checks whether the given file header marks the file as automatically
# generated. Returns 1 if it does, 0 otherwise.
sub check_is_generated_file($) {
  my $license = uc($_[0]);
  # Remove Python multiline comments to avoid false positives
  if (index($license, '"""') != -1) {
    $license =~ s/"""[^"]*(?:"""|$)//mg;
  }
  if (index($license, "'''") != -1) {
    $license =~ s/'''[^']*(?:'''|$)//mg;
  }
  # Quick checks using index.
  if (index($license, 'ALL CHANGES MADE IN THIS FILE WILL BE LOST') != -1) {
    return 1;
  }
  if (index($license, 'DO NOT EDIT') != -1 ||
      index($license, 'DO NOT DELETE') != -1 ||
      index($license, 'GENERATED') != -1) {
    # The alternatives are joined into a pattern string first. Writing the
    # string concatenation inside of a regexp literal would make the quotes
    # and the dots a part of the pattern.
    my $generated_regex = join('|',
        'All changes made in this file will be lost',
        'DO NOT (?:EDIT|delete this file)',
        'Generated (?:at|automatically|data)',
        'Automatically generated',
        '\Wgenerated\s+(?:\w+\s+)*file\W');
    return ($license =~ m%(?:$generated_regex)%i) ? 1 : 0;
  }
  return 0;
}
# Tells whether the first number is not less than the second one, and exceeds
# it by no more than the third number.
sub are_within_increasing_progression($$$) {
  my ($later, $earlier, $max_gap) = @_;
  my $gap = $later - $earlier;
  return $gap >= 0 && $gap <= $max_gap;
}
# Creates a parser for the lines of a single file. Returns a closure that
# takes the text of a line and its number, and returns the authorship string
# found in the line, or an empty string if there is none. The closure keeps
# the line numbers of the last '(a)' and '(b)' items seen between calls.
sub start_copyright_parsing() {
my $max_line_numbers_proximity = 3;
# Set up the defaults the way that proximity checks will not succeed.
my $last_a_item_line_number = -200;
my $last_b_item_line_number = -100;
return sub {
my $line = $_[0];
my $line_number = $_[1];
# Remove C / C++ strings to avoid false positives.
if (index($line, '"') != -1) {
$line =~ s/"[^"\\]*(?:\\.[^"\\]*)*"//g;
}
my $uc_line = uc($line);
# Record '(a)' and '(b)' last occurrences in C++ comments.
my $cpp_comment_idx = index($uc_line, '//');
if ($cpp_comment_idx != -1) {
if (index($uc_line, '(A)') > $cpp_comment_idx) {
$last_a_item_line_number = $line_number;
}
if (index($uc_line, '(B)') > $cpp_comment_idx) {
$last_b_item_line_number = $line_number;
}
}
# Fast bailout, uses the same patterns as the regexp.
# NOTE(review): the two escape sequences below are in single quotes, where
# Perl does not interpret them, so they look for the literal backslash
# text rather than for the copyright sign -- confirm whether intended.
if (index($uc_line, 'COPYRIGHT') == -1 &&
index($uc_line, 'COPR.') == -1 &&
index($uc_line, '\x{00a9}') == -1 &&
index($uc_line, '\xc2\xa9') == -1) {
my $c_item_index = index($uc_line, '(C)');
return '' if ($c_item_index == -1);
# Filter out 'c' used as a list item inside C++ comments.
# E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
if ($c_item_index > $cpp_comment_idx &&
are_within_increasing_progression(
$line_number,
$last_b_item_line_number,
$max_line_numbers_proximity) &&
are_within_increasing_progression(
$last_b_item_line_number,
$last_a_item_line_number,
$max_line_numbers_proximity)) {
return '';
}
}
my $copyright_indicator_regex =
'(?:copyright|copr\.|\x{00a9}|\xc2\xa9|\(c\))';
# The authorship string is captured as the text that follows the indicator
# and starts with a word character.
my $full_copyright_indicator_regex =
sprintf '(?:\W|^)%s(?::\s*|\s+)(\w.*)$', $copyright_indicator_regex;
# Words that follow the indicator when it is not an authorship statement.
my $copyright_disindicator_regex =
'\b(?:info(?:rmation)?|notice|and|or)\b';
my $copyright = '';
if ($line =~ m%$full_copyright_indicator_regex%i) {
my $match = $1;
if ($match !~ m%^\s*$copyright_disindicator_regex%i) {
# Prettify the authorship string: drop a trailing comma or period, any
# further indicators, the leading whitespace, repeated whitespace, and
# the backslash of an escaped '@'.
$match =~ s/([,.])?\s*$//;
$match =~ s/$copyright_indicator_regex//ig;
$match =~ s/^\s+//;
$match =~ s/\s{2,}/ /g;
$match =~ s/\\@/@/g;
$copyright = $match;
}
}
return $copyright;
}
}
#!/usr/bin/perl -w
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Use: find_files.pl <start-from> [exclude-dir ...]
use strict;
use warnings;
use File::Basename;
# Name of this script, used as a prefix of error messages.
my $progname = basename($0);
# The first argument is the directory to start the search from, all the
# remaining arguments are directories to exclude.
my $root_dir = shift @ARGV;
my @find_args = ();
while (@ARGV) {
my $path = shift @ARGV;
# Every excluded directory becomes a '-not ( -path */<dir>/* -prune )'
# clause of the find command.
push @find_args, qw'-not ( -path', "*/$path/*", qw'-prune )'
}
push @find_args, qw(-follow -type f -print);
# find is run via the list form of open, so its arguments do not go through
# a shell.
open FIND, '-|', 'find', $root_dir, @find_args
or die "$progname: Couldn't exec find: $!\n";
# File name extensions of the files that are treated as source code.
my $check_regex = '\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)' .
'|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?' .
'|tex|mli?)$';
my @files = ();
while (<FIND>) {
chomp;
# Empty files and files with other extensions are not printed.
print "$_\n" unless (-z $_ || !m%$check_regex%);
}
close FIND;
#!/bin/sh
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Feeds the sorted names of all the files under the tests directory to the
# display_copyrights command of webview_licenses.py, which reads file names
# from stdin.
find android_webview/tools/tests -type f | sort \
| android_webview/tools/webview_licenses.py display_copyrights
// (c) 2014 Google Inc.
//
// (a) One
//
// (b) Two
//
ALL CHANGES MADE IN THIS FILE WILL BE LOST
Copyright 2014 Google
GENERATED FILE. DO NOT EDIT
Copyright 2014 Google
GENERATED. DO NOT DELETE THIS FILE.
Copyright 2014 Google
DO NOT EDIT
Copyright 2014 Google
DO NOT DELETE THIS FILE
Copyright 2014 Google
All changes made in this file will be lost
Copyright 2014 Google
Automatically generated file
Copyright 2014 Google
Synthetically generated dummy file
Copyright 2014 Google
Generated data (by gnugnu)
Copyright 2014 Google
std::cout << "Copyright 2014 Google"
// Several points can be made:
//
// (a) One
//
// (b) Two
//
// (c) Three
//
See 'foo' for copyright information.
See 'foo' for the copyright notice.
See 'foo' for the copyright and other things.
This file was prohibited from being generated.
Please do not delete our files! They are valuable to us.
Manually generated from dice rolls.
"""This Python script produces generated data
"""
'''This Python script produces generated data
'''
#!/usr/bin/python #!/usr/bin/python
# Copyright (c) 2012 The Chromium Authors. All rights reserved. # Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file. # found in the LICENSE file.
...@@ -22,7 +22,6 @@ import multiprocessing ...@@ -22,7 +22,6 @@ import multiprocessing
import optparse import optparse
import os import os
import re import re
import subprocess
import sys import sys
import textwrap import textwrap
...@@ -40,6 +39,7 @@ third_party = \ ...@@ -40,6 +39,7 @@ third_party = \
sys.path.append(os.path.join(REPOSITORY_ROOT, 'tools')) sys.path.append(os.path.join(REPOSITORY_ROOT, 'tools'))
import licenses import licenses
import copyright_scanner
import known_issues import known_issues
class InputApi(object): class InputApi(object):
...@@ -97,41 +97,12 @@ class ScanResult(object): ...@@ -97,41 +97,12 @@ class ScanResult(object):
Ok, Warnings, Errors = range(3) Ok, Warnings, Errors = range(3)
# Needs to be a top-level function for multiprocessing # Needs to be a top-level function for multiprocessing
def _FindCopyrights(files_to_scan): def _FindCopyrightViolations(files_to_scan_as_string):
args = [os.path.join('android_webview', 'tools', 'find_copyrights.pl')] return copyright_scanner.FindCopyrightViolations(
p = subprocess.Popen( REPOSITORY_ROOT, files_to_scan_as_string)
args=args, cwd=REPOSITORY_ROOT,
stdin=subprocess.PIPE, stdout=subprocess.PIPE) def _ShardList(l, shard_len):
lines = p.communicate(files_to_scan)[0].splitlines() return [l[i:i + shard_len] for i in range(0, len(l), shard_len)]
offending_files = []
allowed_copyrights = '^(?:\*No copyright\*' \
'|20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. ' \
'All rights reserved.*)$'
allowed_copyrights_re = re.compile(allowed_copyrights)
for l in lines:
entries = l.split('\t')
if entries[1] == "GENERATED FILE":
continue
copyrights = entries[1].split(' / ')
for c in copyrights:
if c and not allowed_copyrights_re.match(c):
offending_files.append(os.path.normpath(entries[0]))
break
return offending_files
def _ShardString(s, delimiter, shard_len):
result = []
index = 0
last_pos = 0
for m in re.finditer(delimiter, s):
index += 1
if index % shard_len == 0:
result.append(s[last_pos:m.end()])
last_pos = m.end()
if not index % shard_len == 0:
result.append(s[last_pos:])
return result
def _CheckLicenseHeaders(excluded_dirs_list, whitelisted_files): def _CheckLicenseHeaders(excluded_dirs_list, whitelisted_files):
"""Checks that all files which are not in a listed third-party directory, """Checks that all files which are not in a listed third-party directory,
...@@ -185,16 +156,12 @@ def _CheckLicenseHeaders(excluded_dirs_list, whitelisted_files): ...@@ -185,16 +156,12 @@ def _CheckLicenseHeaders(excluded_dirs_list, whitelisted_files):
# This is not part of open source chromium, but are included on some bots. # This is not part of open source chromium, but are included on some bots.
excluded_dirs_list.append('skia/tools/clusterfuzz-data') excluded_dirs_list.append('skia/tools/clusterfuzz-data')
args = [os.path.join('android_webview', 'tools', 'find_files.pl'), files_to_scan = copyright_scanner.FindFiles(
'.' REPOSITORY_ROOT, ['.'], excluded_dirs_list)
] + excluded_dirs_list sharded_files_to_scan = _ShardList(files_to_scan, 2000)
p = subprocess.Popen(args=args, cwd=REPOSITORY_ROOT, stdout=subprocess.PIPE)
files_to_scan = p.communicate()[0]
sharded_files_to_scan = _ShardString(files_to_scan, '\n', 2000)
pool = multiprocessing.Pool() pool = multiprocessing.Pool()
offending_files_chunks = pool.map_async( offending_files_chunks = pool.map_async(
_FindCopyrights, sharded_files_to_scan).get(999999) _FindCopyrightViolations, sharded_files_to_scan).get(999999)
pool.close() pool.close()
pool.join() pool.join()
# Flatten out the result # Flatten out the result
...@@ -234,7 +201,8 @@ def _ReadFile(path): ...@@ -234,7 +201,8 @@ def _ReadFile(path):
The contents of the file as a string. The contents of the file as a string.
""" """
return open(os.path.join(REPOSITORY_ROOT, path), 'rb').read() with open(os.path.join(REPOSITORY_ROOT, path), 'rb') as f:
return f.read()
def _FindThirdPartyDirs(): def _FindThirdPartyDirs():
...@@ -347,14 +315,16 @@ def main(): ...@@ -347,14 +315,16 @@ def main():
parser = optparse.OptionParser(formatter=FormatterWithNewLines(), parser = optparse.OptionParser(formatter=FormatterWithNewLines(),
usage='%prog [options]') usage='%prog [options]')
parser.description = (__doc__ + parser.description = (__doc__ +
'\nCommands:\n' \ '\nCommands:\n'
' scan Check licenses.\n' \ ' scan Check licenses.\n'
' notice Generate Android NOTICE file on stdout.\n' \ ' notice Generate Android NOTICE file on stdout.\n'
' incompatible_directories Scan for incompatibly' ' incompatible_directories Scan for incompatibly'
' licensed directories.\n' ' licensed directories.\n'
' all_incompatible_directories Scan for incompatibly' ' all_incompatible_directories Scan for incompatibly'
' licensed directories (even those in' ' licensed directories (even those in'
' known_issues.py).\n') ' known_issues.py).\n'
' display_copyrights Display autorship on the files'
' using names provided via stdin.\n')
(_, args) = parser.parse_args() (_, args) = parser.parse_args()
if len(args) != 1: if len(args) != 1:
parser.print_help() parser.print_help()
...@@ -372,6 +342,11 @@ def main(): ...@@ -372,6 +342,11 @@ def main():
return _ProcessIncompatibleResult(GetUnknownIncompatibleDirectories()) return _ProcessIncompatibleResult(GetUnknownIncompatibleDirectories())
elif args[0] == 'all_incompatible_directories': elif args[0] == 'all_incompatible_directories':
return _ProcessIncompatibleResult(GetIncompatibleDirectories()) return _ProcessIncompatibleResult(GetIncompatibleDirectories())
elif args[0] == 'display_copyrights':
files = sys.stdin.read().splitlines()
for f, c in zip(files, copyright_scanner.FindCopyrights('.', files)):
print f, '\t', ' / '.join(sorted(c))
return ScanResult.Ok
parser.print_help() parser.print_help()
return ScanResult.Errors return ScanResult.Errors
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment