Commit f49b26bf authored by Andrew Grieve's avatar Andrew Grieve Committed by Commit Bot

md5_check.py: Don't hash files larger than 1mb.

I noticed that md5_check.py was taking up to 200ms for javac actions.
Turns out it was due to it hashing really big .jar files. The utility of
md5_check is that it tracks the md5s of often-changing source files
(e.g. .java files). It's not worth spending >100ms to check if rt.jar has
changed over and over again.

This now uses mtime for files greater than 1mb, and the md5_check
overhead for javac.py is down below 10ms on my machine.

Bug: 906803
Change-Id: I4956419e12b69ad025f12a79d82cf044fc9db34d
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1839238Reviewed-by: default avatarSam Maier <smaier@chromium.org>
Commit-Queue: Andrew Grieve <agrieve@chromium.org>
Cr-Commit-Position: refs/heads/master@{#702859}
parent 8c2c9fed
...@@ -662,8 +662,7 @@ def CallAndWriteDepfileIfStale(function, options, record_path=None, ...@@ -662,8 +662,7 @@ def CallAndWriteDepfileIfStale(function, options, record_path=None,
input_paths += python_deps input_paths += python_deps
output_paths += [options.depfile] output_paths += [options.depfile]
def on_stale_md5(changes): def on_stale_md5(*args):
args = (changes,) if pass_changes else ()
function(*args) function(*args)
if python_deps is not None: if python_deps is not None:
all_depfile_deps = list(python_deps) if add_pydeps else [] all_depfile_deps = list(python_deps) if add_pydeps else []
...@@ -679,4 +678,4 @@ def CallAndWriteDepfileIfStale(function, options, record_path=None, ...@@ -679,4 +678,4 @@ def CallAndWriteDepfileIfStale(function, options, record_path=None,
input_strings=input_strings, input_strings=input_strings,
output_paths=output_paths, output_paths=output_paths,
force=force, force=force,
pass_changes=True) pass_changes=pass_changes)
...@@ -58,11 +58,13 @@ def CallAndRecordIfStale( ...@@ -58,11 +58,13 @@ def CallAndRecordIfStale(
new_metadata.AddStrings(input_strings) new_metadata.AddStrings(input_strings)
for path in input_paths: for path in input_paths:
if _IsZipFile(path): # It's faster to md5 an entire zip file than it is to just locate & hash
# its central directory (which is what this used to do).
if pass_changes and _IsZipFile(path):
entries = _ExtractZipEntries(path) entries = _ExtractZipEntries(path)
new_metadata.AddZipFile(path, entries) new_metadata.AddZipFile(path, entries)
else: else:
new_metadata.AddFile(path, _Md5ForPath(path)) new_metadata.AddFile(path, _ComputeTagForPath(path))
old_metadata = None old_metadata = None
force = force or _FORCE_REBUILD force = force or _FORCE_REBUILD
...@@ -369,27 +371,15 @@ class _Metadata(object): ...@@ -369,27 +371,15 @@ class _Metadata(object):
return (entry['path'] for entry in subentries) return (entry['path'] for entry in subentries)
def _UpdateMd5ForFile(md5, path, block_size=2**16): def _ComputeTagForPath(path):
with open(path, 'rb') as infile: stat = os.stat(path)
while True: if stat.st_size > 1 * 1024 * 1024:
data = infile.read(block_size) # Fallback to mtime for large files so that md5_check does not take too long
if not data: # to run.
break return stat.st_mtime
md5.update(data)
def _UpdateMd5ForDirectory(md5, dir_path):
for root, _, files in os.walk(dir_path):
for f in files:
_UpdateMd5ForFile(md5, os.path.join(root, f))
def _Md5ForPath(path):
md5 = hashlib.md5() md5 = hashlib.md5()
if os.path.isdir(path): with open(path, 'rb') as f:
_UpdateMd5ForDirectory(md5, path) md5.update(f.read())
else:
_UpdateMd5ForFile(md5, path)
return md5.hexdigest() return md5.hexdigest()
......
...@@ -80,7 +80,11 @@ class TestMd5Check(unittest.TestCase): ...@@ -80,7 +80,11 @@ class TestMd5Check(unittest.TestCase):
CheckCallAndRecord(True, 'should call when record doesn\'t exist', CheckCallAndRecord(True, 'should call when record doesn\'t exist',
expected_changes='Previous stamp file not found.', expected_changes='Previous stamp file not found.',
added_or_modified_only=False) added_or_modified_only=False)
CheckCallAndRecord(True, 'pass_changes changed')
CheckCallAndRecord(False, 'should not call when nothing changed') CheckCallAndRecord(False, 'should not call when nothing changed')
input_files = input_files[::-1]
CheckCallAndRecord(False, 'reordering of inputs shouldn\'t trigger call')
CheckCallAndRecord(False, 'should not call when nothing changed #2', CheckCallAndRecord(False, 'should not call when nothing changed #2',
outputs_specified=True, outputs_missing=False) outputs_specified=True, outputs_missing=False)
CheckCallAndRecord(True, 'should call when output missing', CheckCallAndRecord(True, 'should call when output missing',
...@@ -97,9 +101,6 @@ class TestMd5Check(unittest.TestCase): ...@@ -97,9 +101,6 @@ class TestMd5Check(unittest.TestCase):
expected_changes='*Modified: %s' % input_file1.name, expected_changes='*Modified: %s' % input_file1.name,
added_or_modified_only=True) added_or_modified_only=True)
input_files = input_files[::-1]
CheckCallAndRecord(False, 'reordering of inputs shouldn\'t trigger call')
input_files = input_files[:1] input_files = input_files[:1]
CheckCallAndRecord(True, 'removing file should trigger call', CheckCallAndRecord(True, 'removing file should trigger call',
expected_changes='*Removed: %s' % input_file1.name, expected_changes='*Removed: %s' % input_file1.name,
...@@ -138,7 +139,6 @@ class TestMd5Check(unittest.TestCase): ...@@ -138,7 +139,6 @@ class TestMd5Check(unittest.TestCase):
expected_changes='*Modified: %s*Subpath modified: %s' % ( expected_changes='*Modified: %s*Subpath modified: %s' % (
input_file2.name, 'path/1.txt'), input_file2.name, 'path/1.txt'),
added_or_modified_only=True) added_or_modified_only=True)
CheckCallAndRecord(False, 'should not call when nothing changed')
_WriteZipFile(input_file2.name, []) _WriteZipFile(input_file2.name, [])
CheckCallAndRecord(True, 'removed subpath should trigger call', CheckCallAndRecord(True, 'removed subpath should trigger call',
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment