Commit 98bfa880 authored by mnaganov's avatar mnaganov Committed by Commit bot

[Android WebView] Prepare the copyrights scanner to run from presubmit scripts

Make the copyrights scanner use InputApi instead of accessing system
modules directly.

Convert manual unit tests for the scanner into automatic ones that now run as
a presubmit check for related changes in android_webview/tools/*

BUG=343104
NOTRY=true

Review URL: https://codereview.chromium.org/667723002

Cr-Commit-Position: refs/heads/master@{#300670}
parent dc5d96aa
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
def CheckChangeOnUpload(input_api, output_api):
  """Runs the checks shared by upload and commit and returns their results."""
  upload_results = _CommonChecks(input_api, output_api)
  return upload_results
def CheckChangeOnCommit(input_api, output_api):
  """Runs the checks shared by upload and commit and returns their results."""
  commit_results = _CommonChecks(input_api, output_api)
  return commit_results
def _CommonChecks(input_api, output_api):
  """Checks common to both upload and commit.

  Runs the copyright scanner unit tests, but only when the change touches a
  file that can influence them.

  Args:
    input_api: Presubmit InputApi. Uses AffectedFiles(), os_path,
        PresubmitLocalPath() and canned_checks.
    output_api: Presubmit OutputApi, forwarded to the canned check.

  Returns:
    The list of results produced by canned_checks.RunUnitTests, or an empty
    list when no affected file can influence the unit tests.
  """
  # str.endswith() accepts a tuple, so one call tests every suffix.
  would_affect_tests = (
      'PRESUBMIT.py',
      'copyright_scanner.py',
      'copyright_scanner_unittest.py',
  )
  need_to_run_unittests = any(
      f.LocalPath().endswith(would_affect_tests)
      for f in input_api.AffectedFiles())
  # The flag used to be computed and then ignored, so the unit tests ran for
  # every change; honour it here.
  if not need_to_run_unittests:
    return []
  tests = [input_api.os_path.join(
      input_api.PresubmitLocalPath(), 'copyright_scanner_unittest.py')]
  results = []
  results.extend(
      input_api.canned_checks.RunUnitTests(input_api, output_api, tests))
  return results
......@@ -6,14 +6,13 @@
"""
import itertools
import os
import re
def FindFiles(root_dir, start_paths_list, excluded_dirs_list):
def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list):
"""Similar to UNIX utility find(1), searches for files in the directories.
Automatically keeps only source code files.
Args:
input_api: InputAPI, as in presubmit scripts.
root_dir: The root directory, to which all other paths are relative.
start_paths_list: The list of paths to start search from. Each path can
be a file or a directory.
......@@ -28,7 +27,7 @@ def FindFiles(root_dir, start_paths_list, excluded_dirs_list):
return True
return False
files_whitelist_re = re.compile(
files_whitelist_re = input_api.re.compile(
r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
'|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
'|tex|mli?)$')
......@@ -36,66 +35,75 @@ def FindFiles(root_dir, start_paths_list, excluded_dirs_list):
base_path_len = len(root_dir)
for path in start_paths_list:
full_path = os.path.join(root_dir, path)
if os.path.isfile(full_path):
full_path = input_api.os_path.join(root_dir, path)
if input_api.os_path.isfile(full_path):
if files_whitelist_re.search(path):
files.append(path)
else:
for dirpath, dirnames, filenames in os.walk(full_path):
for dirpath, dirnames, filenames in input_api.os_walk(full_path):
# Remove excluded subdirs for faster scanning.
for item in dirnames[:]:
if IsBlacklistedDir(os.path.join(dirpath, item)[base_path_len + 1:]):
if IsBlacklistedDir(
input_api.os_path.join(dirpath, item)[base_path_len + 1:]):
dirnames.remove(item)
for filename in filenames:
filepath = os.path.join(dirpath, filename)[base_path_len + 1:]
filepath = \
input_api.os_path.join(dirpath, filename)[base_path_len + 1:]
if files_whitelist_re.search(filepath) and \
not IsBlacklistedDir(filepath):
files.append(filepath)
return files
python_multiline_string_double_re = re.compile(
r'"""[^"]*(?:"""|$)', flags=re.MULTILINE)
python_multiline_string_single_re = re.compile(
r"'''[^']*(?:'''|$)", flags=re.MULTILINE)
automatically_generated_re = re.compile(
r'(All changes made in this file will be lost'
'|DO NOT (EDIT|delete this file)'
'|Generated (at|automatically|data)'
'|Automatically generated'
'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re.IGNORECASE)
def _IsGeneratedFile(header):
header = header.upper()
if '"""' in header:
header = python_multiline_string_double_re.sub('', header)
if "'''" in header:
header = python_multiline_string_single_re.sub('', header)
# First do simple strings lookup to save time.
if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
return True
if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
'GENERATED' in header:
return automatically_generated_re.search(header)
return False
GENERATED_FILE = 'GENERATED FILE'
NO_COPYRIGHT = '*No copyright*'
class _GeneratedFilesDetector(object):
  """Decides from a file's header text whether the file is generated."""

  # Marker strings; compared against by code elsewhere in this module.
  GENERATED_FILE = 'GENERATED FILE'
  NO_COPYRIGHT = '*No copyright*'

  def __init__(self, input_api):
    """Compiles the patterns used by IsGeneratedFile.

    Args:
      input_api: InputAPI, as in presubmit scripts. Only its `re` attribute
          is used here.
    """
    regex = input_api.re
    # These two match a triple-quoted Python string, terminated either by the
    # closing quotes or by an end of line.
    self.python_multiline_string_double_re = regex.compile(
        r'"""[^"]*(?:"""|$)', flags=regex.MULTILINE)
    self.python_multiline_string_single_re = regex.compile(
        r"'''[^']*(?:'''|$)", flags=regex.MULTILINE)
    # Every fragment is a raw string so that \W, \s and \w reach the regex
    # engine verbatim instead of relying on Python leaving unknown escapes
    # untouched.
    self.automatically_generated_re = regex.compile(
        r'(All changes made in this file will be lost'
        r'|DO NOT (EDIT|delete this file)'
        r'|Generated (at|automatically|data)'
        r'|Automatically generated'
        r'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=regex.IGNORECASE)

  def IsGeneratedFile(self, header):
    """Tells whether the header text marks the file as generated.

    Args:
      header: The text to inspect, as a single string.

    Returns:
      True if the header contains one of the known "generated file" phrases
      outside of triple-quoted Python strings, False otherwise.
    """
    header = header.upper()
    # Drop triple-quoted strings first so that phrases inside them do not
    # count as markers.
    if '"""' in header:
      header = self.python_multiline_string_double_re.sub('', header)
    if "'''" in header:
      header = self.python_multiline_string_single_re.sub('', header)
    # First do simple strings lookup to save time.
    if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
      return True
    if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
        'GENERATED' in header:
      # Always hand back a bool rather than a match object or None.
      return bool(self.automatically_generated_re.search(header))
    return False
class _CopyrightsScanner(object):
_c_comment_re = re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
_copyright_indicator = r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
_full_copyright_indicator_re = \
re.compile(r'(?:\W|^)' + _copyright_indicator + r'(?::\s*|\s+)(\w.*)$', \
re.IGNORECASE)
_copyright_disindicator_re = \
re.compile(r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', re.IGNORECASE)
def __init__(self):
@staticmethod
def StaticInit(input_api):
_CopyrightsScanner._c_comment_re = \
input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
_CopyrightsScanner._copyright_indicator = \
r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
_CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(
r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \
r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE)
_CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(
r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE)
def __init__(self, input_api):
self.max_line_numbers_proximity = 3
self.last_a_item_line_number = -200
self.last_b_item_line_number = -100
self.re = input_api.re
def _CloseLineNumbers(self, a, b):
return 0 <= a - b <= self.max_line_numbers_proximity
......@@ -131,17 +139,20 @@ class _CopyrightsScanner(object):
not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
copyr = m.group(0)
# Prettify the authorship string.
copyr = re.sub(r'([,.])?\s*$/', '', copyr)
copyr = re.sub(self._copyright_indicator, '', copyr, flags=re.IGNORECASE)
copyr = re.sub(r'^\s+', '', copyr)
copyr = re.sub(r'\s{2,}', ' ', copyr)
copyr = re.sub(r'\\@', '@', copyr)
copyr = self.re.sub(r'([,.])?\s*$/', '', copyr)
copyr = self.re.sub(
_CopyrightsScanner._copyright_indicator, '', copyr, \
flags=self.re.IGNORECASE)
copyr = self.re.sub(r'^\s+', '', copyr)
copyr = self.re.sub(r'\s{2,}', ' ', copyr)
copyr = self.re.sub(r'\\@', '@', copyr)
return copyr
def FindCopyrights(root_dir, files_to_scan):
def FindCopyrights(input_api, root_dir, files_to_scan):
"""Determines code autorship, and finds generated files.
Args:
input_api: InputAPI, as in presubmit scripts.
root_dir: The root directory, to which all other paths are relative.
files_to_scan: The list of file names to scan.
Returns:
......@@ -150,47 +161,52 @@ def FindCopyrights(root_dir, files_to_scan):
entry -- 'GENERATED_FILE' string. If the file has no copyright info,
the corresponding list contains 'NO_COPYRIGHT' string.
"""
generated_files_detector = _GeneratedFilesDetector(input_api)
_CopyrightsScanner.StaticInit(input_api)
copyrights = []
for file_name in files_to_scan:
linenum = 0
header = ''
header = []
file_copyrights = []
scanner = _CopyrightsScanner()
with open(os.path.join(root_dir, file_name), 'r') as f:
for l in f.readlines():
linenum += 1
if linenum <= 25:
header += l
c = scanner.MatchLine(linenum, l)
if c:
file_copyrights.append(c)
if _IsGeneratedFile(header):
copyrights.append([GENERATED_FILE])
elif file_copyrights:
copyrights.append(file_copyrights)
else:
copyrights.append([NO_COPYRIGHT])
scanner = _CopyrightsScanner(input_api)
contents = input_api.ReadFile(
input_api.os_path.join(root_dir, file_name), 'r')
for l in contents.split('\n'):
linenum += 1
if linenum <= 25:
header.append(l)
c = scanner.MatchLine(linenum, l)
if c:
file_copyrights.append(c)
if generated_files_detector.IsGeneratedFile('\n'.join(header)):
copyrights.append([_GeneratedFilesDetector.GENERATED_FILE])
elif file_copyrights:
copyrights.append(file_copyrights)
else:
copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT])
return copyrights
def FindCopyrightViolations(root_dir, files_to_scan):
def FindCopyrightViolations(input_api, root_dir, files_to_scan):
"""Looks for files that are not belong exlusively to the Chromium Authors.
Args:
input_api: InputAPI, as in presubmit scripts.
root_dir: The root directory, to which all other paths are relative.
files_to_scan: The list of file names to scan.
Returns:
The list of file names that contain non-Chromium copyrights.
"""
copyrights = FindCopyrights(root_dir, files_to_scan)
copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
offending_files = []
allowed_copyrights_re = re.compile(
allowed_copyrights_re = input_api.re.compile(
r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
'All rights reserved.*)$')
for f, cs in itertools.izip(files_to_scan, copyrights):
if cs[0] == GENERATED_FILE or cs[0] == NO_COPYRIGHT:
if cs[0] == _GeneratedFilesDetector.GENERATED_FILE or \
cs[0] == _GeneratedFilesDetector.NO_COPYRIGHT:
continue
for c in cs:
if not allowed_copyrights_re.match(c):
offending_files.append(os.path.normpath(f))
offending_files.append(input_api.os_path.normpath(f))
break
return offending_files
#!/usr/bin/env python
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Unit tests for Copyright Scanner utilities."""
import os
import re
import sys
import unittest
test_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.extend([
os.path.normpath(os.path.join(test_dir, '..', '..', 'tools')),
os.path.join(test_dir),
])
import find_depot_tools
from testing_support.super_mox import SuperMoxTestBase
import copyright_scanner
class FindCopyrightsTest(SuperMoxTestBase):
  """Feeds canned file contents to FindCopyrights and checks the output."""

  def setUp(self):
    """Builds a mock InputApi backed by the real re and os modules."""
    SuperMoxTestBase.setUp(self)
    api = self.mox.CreateMockAnything()
    api.re = re
    api.os_path = os.path
    api.os_walk = os.walk
    self.input_api = api

  def ShouldMatchReferenceOutput(self, test_data, expected_output):
    """Asserts that each sample in test_data scans to expected_output.

    Args:
      test_data: A list of strings, each used as the contents of one file.
      expected_output: The value FindCopyrights must return for every sample.
    """
    for contents in test_data:
      # Stub out ReadFile so the scanner sees `contents` for any path.
      self.input_api.ReadFile = lambda _path, _mode: contents
      actual_output = copyright_scanner.FindCopyrights(
          self.input_api, '', [''])
      failure_message = (
          'Input """\n%s""", expected output: "%s", actual: "%s"' %
          (contents, expected_output, actual_output))
      self.assertEqual(expected_output, actual_output, failure_message)

  def testCopyrightedFiles(self):
    # Each sample carries a copyright statement that must be extracted.
    samples = [
      '// (c) 2014 Google Inc.\n//\n// (a) One\n//\n// (b) Two\n//\n',
      'Copyright 2014 Google Inc.\n',
      'Copr. 2014 Google Inc.',
      '\xc2\xa9 2014 Google Inc.',
      'Copyright 2014 Google Inc.'
    ]
    self.ShouldMatchReferenceOutput(samples, [['2014 Google Inc.']])

  def testGeneratedFiles(self):
    # A "generated file" marker takes precedence over the copyright line.
    samples = [
      'ALL CHANGES MADE IN THIS FILE WILL BE LOST\nCopyright 2014 Google\n',
      'GENERATED FILE. DO NOT EDIT\nCopyright 2014 Google\n',
      'GENERATED. DO NOT DELETE THIS FILE.\nCopyright 2014 Google\n',
      'DO NOT EDIT\nCopyright 2014 Google\n',
      'DO NOT DELETE THIS FILE\nCopyright 2014 Google\n',
      'All changes made in this file will be lost\nCopyright 2014 Google\n',
      'Automatically generated file\nCopyright 2014 Google\n',
      'Synthetically generated dummy file\nCopyright 2014 Google\n',
      'Generated data (by gnugnu)\nCopyright 2014 Google\n'
    ]
    self.ShouldMatchReferenceOutput(samples, [['GENERATED FILE']])

  def testNonCopyrightedFiles(self):
    # Text that merely resembles a copyright statement must be ignored.
    samples = [
      'std::cout << "Copyright 2014 Google"\n',
      '// Several points can be made:\n//\n// (a) One\n//\n// (b) Two\n'
      '//\n// (c) Three\n//\n',
      'See \'foo\' for copyright information.\n',
      'See \'foo\' for the copyright notice.\n',
      'See \'foo\' for the copyright and other things.\n'
    ]
    self.ShouldMatchReferenceOutput(samples, [['*No copyright*']])

  def testNonGeneratedFiles(self):
    # Text that merely resembles a "generated file" marker must be ignored.
    samples = [
      'This file was prohibited from being generated.\n',
      'Please do not delete our files! They are valuable to us.\n',
      'Manually generated from dice rolls.\n',
      '"""This Python script produces generated data\n"""\n',
      '\'\'\'This Python script produces generated data\n\'\'\'\n'
    ]
    self.ShouldMatchReferenceOutput(samples, [['*No copyright*']])
if __name__ == '__main__':
unittest.main()
#!/bin/sh
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
find android_webview/tools/tests -type f | sort \
| android_webview/tools/webview_licenses.py display_copyrights
// (c) 2014 Google Inc.
//
// (a) One
//
// (b) Two
//
ALL CHANGES MADE IN THIS FILE WILL BE LOST
Copyright 2014 Google
GENERATED FILE. DO NOT EDIT
Copyright 2014 Google
GENERATED. DO NOT DELETE THIS FILE.
Copyright 2014 Google
DO NOT DELETE THIS FILE
Copyright 2014 Google
All changes made in this file will be lost
Copyright 2014 Google
Automatically generated file
Copyright 2014 Google
Synthetically generated dummy file
Copyright 2014 Google
Generated data (by gnugnu)
Copyright 2014 Google
std::cout << "Copyright 2014 Google"
// Several points can be made:
//
// (a) One
//
// (b) Two
//
// (c) Three
//
See 'foo' for copyright information.
See 'foo' for the copyright and other things.
This file was prohibited from being generated.
Please do not delete our files! They are valuable to us.
"""This Python script produces generated data
"""
'''This Python script produces generated data
'''
......@@ -9,6 +9,8 @@
# so additions to this file should be rare. See
# http://www.chromium.org/developers/adding-3rd-party-libraries.
# Contains test strings that look like copyrights.
android_webview/tools/copyright_scanner_unittest.py
# Copyright IBM; MIT license. This third-party code is taken from ICU, the
# license for which we already pick up from third_party/icu/.
base/i18n/icu_string_conversions.cc
......
......@@ -44,7 +44,10 @@ import known_issues
class InputApi(object):
  """Stand-in for the presubmit InputApi, handed to copyright_scanner.

  Carries only the attributes assigned in __init__.
  """

  def __init__(self):
    """Binds the os, re and file-reading facilities the scanner reaches for."""
    # File contents are read through the module-level _ReadFile helper.
    self.ReadFile = _ReadFile
    self.re = re
    self.os_walk = os.walk
    self.os_path = os.path
def GetIncompatibleDirectories():
"""Gets a list of third-party directories which use licenses incompatible
......@@ -99,7 +102,7 @@ class ScanResult(object):
# Needs to be a top-level function for multiprocessing
def _FindCopyrightViolations(files_to_scan_as_string):
return copyright_scanner.FindCopyrightViolations(
REPOSITORY_ROOT, files_to_scan_as_string)
InputApi(), REPOSITORY_ROOT, files_to_scan_as_string)
def _ShardList(l, shard_len):
  """Splits a sequence into consecutive slices of at most shard_len items.

  Args:
    l: The sequence to split; it must support len() and slicing.
    shard_len: The number of items per slice; the last slice may be shorter.

  Returns:
    A list of slices of `l`, in order.
  """
  shards = []
  for start in range(0, len(l), shard_len):
    shards.append(l[start:start + shard_len])
  return shards
......@@ -157,7 +160,7 @@ def _CheckLicenseHeaders(excluded_dirs_list, whitelisted_files):
excluded_dirs_list.append('skia/tools/clusterfuzz-data')
files_to_scan = copyright_scanner.FindFiles(
REPOSITORY_ROOT, ['.'], excluded_dirs_list)
InputApi(), REPOSITORY_ROOT, ['.'], excluded_dirs_list)
sharded_files_to_scan = _ShardList(files_to_scan, 2000)
pool = multiprocessing.Pool()
offending_files_chunks = pool.map_async(
......@@ -193,7 +196,19 @@ def _CheckLicenseHeaders(excluded_dirs_list, whitelisted_files):
return ScanResult.Ok
def _ReadFile(path):
def _ReadFile(full_path, mode='rU'):
  """Reads a file from disk. This emulates presubmit InputApi.ReadFile func.

  Args:
    full_path: The path of the file to read.
    mode: The mode string handed to open(); defaults to 'rU'.

  Returns:
    The contents of the file as a string.
  """
  with open(full_path, mode) as source:
    contents = source.read()
  return contents
def _ReadLocalFile(path, mode='rb'):
"""Reads a file from disk.
Args:
path: The path of the file to read, relative to the root of the repository.
......@@ -201,8 +216,7 @@ def _ReadFile(path):
The contents of the file as a string.
"""
with open(os.path.join(REPOSITORY_ROOT, path), 'rb') as f:
return f.read()
return _ReadFile(os.path.join(REPOSITORY_ROOT, path), mode)
def _FindThirdPartyDirs():
......@@ -262,8 +276,8 @@ def _Scan():
all_licenses_valid = False
# Second, check for non-standard license text.
files_data = _ReadFile(os.path.join('android_webview', 'tools',
'third_party_files_whitelist.txt'))
files_data = _ReadLocalFile(os.path.join('android_webview', 'tools',
'third_party_files_whitelist.txt'))
whitelisted_files = []
for line in files_data.splitlines():
match = re.match(r'([^#\s]+)', line)
......@@ -284,7 +298,7 @@ def GenerateNoticeFile():
third_party_dirs = _FindThirdPartyDirs()
# Don't forget Chromium's LICENSE file
content = [_ReadFile('LICENSE')]
content = [_ReadLocalFile('LICENSE')]
# We provide attribution for all third-party directories.
# TODO(steveblock): Limit this to only code used by the WebView binary.
......@@ -293,7 +307,7 @@ def GenerateNoticeFile():
require_license_file=False)
license_file = metadata['License File']
if license_file and license_file != licenses.NOT_SHIPPED:
content.append(_ReadFile(license_file))
content.append(_ReadLocalFile(license_file))
return '\n'.join(content)
......@@ -344,7 +358,8 @@ def main():
return _ProcessIncompatibleResult(GetIncompatibleDirectories())
elif args[0] == 'display_copyrights':
files = sys.stdin.read().splitlines()
for f, c in zip(files, copyright_scanner.FindCopyrights('.', files)):
for f, c in \
zip(files, copyright_scanner.FindCopyrights(InputApi(), '.', files)):
print f, '\t', ' / '.join(sorted(c))
return ScanResult.Ok
parser.print_help()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment