Commit 2405b447 authored by Daniel Bratell, committed by Commit Bot

Revert "jumbo: stable assignment of inputs to chunks"

This reverts commit 2c7a71c3.

Reason for revert: Linux with chunk size 50 broke in v8, which we
can't quickly fix, and Windows (chunk size 8) has a DWORD logging
error that looks non-trivial. We need to fix those before re-landing
this.

Original change's description:
> jumbo: stable assignment of inputs to chunks
> 
> Adding or removing a file from a jumbo source set causes on average
> half of the chunks to have their inputs reallocated.
> Derive chunk boundaries from a combination of list position and path
> content. This is so that when a file is added or removed, only the
> boundaries with adjacent chunks typically move.
> For a balance between maximum chunk size and stability of partitions:
> * Partition uniformly into the required number of chunks.
> * Pick a "center" from each chunk by minimum hash value.
> * Pick the boundaries between centers by maximum hash value.
> 
> Bug: 782863
> Change-Id: Ie71d82b132e8145b4ed3d1141f85886a12149d5a
> Reviewed-on: https://chromium-review.googlesource.com/1102218
> Reviewed-by: Bruce Dawson <brucedawson@chromium.org>
> Reviewed-by: Dirk Pranke <dpranke@chromium.org>
> Reviewed-by: Daniel Bratell <bratell@opera.com>
> Commit-Queue: Dirk Pranke <dpranke@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#570623}
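
For orientation while reading the diff below, here is a minimal sketch of the
scheme the bullets in the quoted description describe. It is not the reverted
code itself; the name stable_stops and the f%02d.cc file names are made up for
illustration, and it assumes plain Python with only hashlib:

  import hashlib

  def stable_stops(paths, chunks):
    n = len(paths)
    # Stable per-file rank: hash of the path, not of list position.
    rank = lambda i: hashlib.md5(paths[i].encode()).hexdigest()
    # 1. Partition uniformly: stop k is the ceiling of (k + 1) * n / chunks.
    uniform = [((k + 1) * n + chunks - 1) // chunks for k in range(chunks)]
    # 2. One "center" per uniform chunk: the index with the smallest hash.
    centers = [min(range(a, b), key=rank)
               for a, b in zip([0] + uniform, uniform)]
    # 3. Boundary between adjacent centers: the index with the largest hash.
    stops = [max(range(a, b), key=rank)
             for a, b in zip(centers, centers[1:])]
    return stops + [n]

  files = ['f%02d.cc' % i for i in range(12)]
  print(stable_stops(files, 4))      # four chunk boundaries
  print(stable_stops(files[1:], 4))  # after a deletion; typically only the
                                     # chunks near the change get new members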

TBR=dpranke@chromium.org,bratell@opera.com,brucedawson@chromium.org,david.barr@samsung.com

Change-Id: I8c81aca34ab2876aebea6b54e847cb24be3aa27f
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Bug: 782863
Reviewed-on: https://chromium-review.googlesource.com/1116658
Reviewed-by: Daniel Bratell <bratell@opera.com>
Commit-Queue: Daniel Bratell <bratell@opera.com>
Cr-Commit-Position: refs/heads/master@{#570701}
parent 28881e15
@@ -14,7 +14,7 @@ declare_args() {
# when frequently changing a set of cpp files.
jumbo_build_excluded = []
# How many files to group on average. Smaller numbers give more
# How many files to group at most. Smaller numbers give more
# parallelism, higher numbers give less total CPU usage. Higher
# numbers also give longer single-file recompilation times.
#
@@ -12,67 +12,12 @@ for compiling.
from __future__ import print_function
import argparse
import hashlib
import cStringIO
import os
def cut_ranges(boundaries):
  # Given an increasing sequence of boundary indices, generate a sequence of
  # non-overlapping ranges. The total range is inclusive of the first index
  # and exclusive of the last index from the given sequence.
  for start, stop in zip(boundaries, boundaries[1:]):
    yield range(start, stop)


def generate_chunk_stops(inputs, output_count, smart_merge=True):
  # Note: In the comments below, unique numeric labels are assigned to files.
  # Consider them as the sorted rank of the hash of each file path.
  # Simple jumbo chunking generates uniformly sized chunks with the ceiling of:
  # (output_index + 1) * input_count / output_count
  input_count = len(inputs)
  stops = [((i + 1) * input_count + output_count - 1) // output_count
           for i in range(output_count)]
  # This is disruptive at times because file insertions and removals can
  # invalidate many chunks as all files are offset by one.
  # For example, say we have 12 files in 4 uniformly sized chunks:
  # 9, 4, 0; 7, 1, 11; 5, 10, 2; 6, 3, 8
  # If we delete the first file we get:
  # 4, 0, 7; 1, 11, 5; 10, 2, 6; 3, 8
  # All of the chunks have new sets of inputs.
  # With path-aware chunking, we start with the uniformly sized chunks:
  # 9, 4, 0; 7, 1, 11; 5, 10, 2; 6, 3, 8
  # First we find the smallest rank in each of the chunks. Their indices are
  # stored in the |centers| list and in this example the ranks would be:
  # 0, 1, 2, 3
  # Then we find the largest rank between the centers. Their indices are stored
  # in the |stops| list and in this example the ranks would be:
  # 7, 11, 6
  # These files mark the boundaries between chunks and these boundary files are
  # often maintained even as files are added or deleted.
  # In this example, 7, 11, and 6 are the first files in each chunk:
  # 9, 4, 0; 7, 1; 11, 5, 10, 2; 6, 3, 8
  # If we delete the first file and repeat the process we get:
  # 4, 0; 7, 1; 11, 5, 10, 2; 6, 3, 8
  # Only the first chunk has a new set of inputs.
  if smart_merge:
    # Starting with the simple chunks, every file is assigned a rank.
    # This requires a hash function that is stable across runs.
    hasher = lambda n: hashlib.md5(inputs[n]).hexdigest()
    # In each chunk there is a key file with lowest rank; mark them.
    # Note that they will not easily change.
    centers = [min(indices, key=hasher) for indices in cut_ranges([0] + stops)]
    # Between each pair of key files there is a file with highest rank.
    # Mark these to be used as border files. They also will not easily change.
    # Forget the initial chunks and create new chunks by splitting the list at
    # every border file.
    stops = [max(indices, key=hasher) for indices in cut_ranges(centers)]
    stops.append(input_count)
  return stops


def write_jumbo_files(inputs, outputs, written_input_set, written_output_set):
  chunk_stops = generate_chunk_stops(inputs, len(outputs))
  output_count = len(outputs)
  input_count = len(inputs)

  written_inputs = 0
  for output_index, output_file in enumerate(outputs):
@@ -86,7 +31,7 @@ def write_jumbo_files(inputs, outputs, written_input_set, written_output_set):
    out = cStringIO.StringIO()
    out.write("/* This is a Jumbo file. Don't edit. */\n\n")
    out.write("/* Generated with merge_for_jumbo.py. */\n\n")
    input_limit = chunk_stops[output_index]
    input_limit = (output_index + 1) * input_count / output_count
    while written_inputs < input_limit:
      filename = inputs[written_inputs]
      written_inputs += 1
@@ -51,7 +51,7 @@ source files.
## Tuning
By default on average `50`, or `8` when using goma, files are merged at a
By default at most `50`, or `8` when using goma, files are merged at a
time. The more files that are merged, the less total CPU time is
needed, but parallelism is reduced. This number can be changed by
setting `jumbo_file_merge_limit`.
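
As a sketch (not part of this change), the limit is set through a GN build
argument; the values in this hypothetical args.gn are illustrative only and
assume a jumbo-enabled build:

  # args.gn (hypothetical values)
  use_jumbo_build = true
  # Merge fewer files per jumbo chunk to regain parallelism at some CPU cost.
  jumbo_file_merge_limit = 20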