[string_util] fix bug in ReplaceSubstringsAfterOffset()

The problem with DoReplaceSubstringsAfterOffset was that the following search would not terminate: ReplaceSubstringsAfterOffset(std::string("aaabaa"), 0, "aa", "ccc"); The root cause of this is an algorithmic bug, where it was assumed that string::rfind and string::find would return the same matches in reversed order. The fix is to only scan forward using find(). For the length-expansion case, if capacity is insufficient, we swap with a temporary and build the result into new memory. If existing capacity suffices, we'll shift the tail of the string down to the new size, and then use left-to-right memmove loop of the length- contraction case, with the shifted tail as the src. BUG=749228 Review-Url: https://codereview.chromium.org/2979393002 Cr-Commit-Position: refs/heads/master@{#491170}

[string_util] fix bug in ReplaceSubstringsAfterOffset()
The problem with DoReplaceSubstringsAfterOffset was that the following search would not terminate: ReplaceSubstringsAfterOffset(std::string("aaabaa"), 0, "aa", "ccc"); The root cause of this is an algorithmic bug, where it was assumed that string::rfind and string::find would return the same matches in reversed order. The fix is to only scan forward using find(). For the length-expansion case, if capacity is insufficient, we swap with a temporary and build the result into new memory. If existing capacity suffices, we'll shift the tail of the string down to the new size, and then use left-to-right memmove loop of the length- contraction case, with the shifted tail as the src. BUG=749228 Review-Url: https://codereview.chromium.org/2979393002 Cr-Commit-Position: refs/heads/master@{#491170}
09d9682b · nick · Commit Bot · 310fda3e · 09d9682b · 09d9682b
Commit 09d9682b authored Aug 01, 2017 by nick Committed by Commit Bot Aug 02, 2017
Hide whitespace changes
Inline Side-by-side

Showing with 168 additions and 97 deletions

base/strings/string_util.cc base/strings/string_util.cc +111 -74

base/strings/string_util_unittest.cc base/strings/string_util_unittest.cc +57 -23

No files found.
--- a/base/strings/string_util.cc
+++ b/base/strings/string_util.cc
@@ -712,111 +712,148 @@ string16 FormatBytesUnlocalized(int64_t bytes) {
 }
 // Runs in O(n) time in the length of |str|.
-template<class StringType>
+template <class StringType>
 void DoReplaceSubstringsAfterOffset(StringType* str,
-                                    size_t offset,
+                                    size_t initial_offset,
                                    BasicStringPiece<StringType> find_this,
                                    BasicStringPiece<StringType> replace_with,
                                    bool replace_all) {
+  using CharTraits = typename StringType::traits_type;
  DCHECK(!find_this.empty());
  // If the find string doesn't appear, there's nothing to do.
-  offset = str->find(find_this.data(), offset, find_this.size());
+  const size_t find_length = find_this.length();
-  if (offset == StringType::npos)
+  size_t first_match = str->find(find_this.data(), initial_offset, find_length);
+  if (first_match == StringType::npos)
    return;
  // If we're only replacing one instance, there's no need to do anything
  // complicated.
-  size_t find_length = find_this.length();
+  const size_t replace_length = replace_with.length();
  if (!replace_all) {
-    str->replace(offset, find_length, replace_with.data(), replace_with.size());
+    str->replace(first_match, find_length, replace_with.data(), replace_length);
    return;
  }
  // If the find and replace strings are the same length, we can simply use
  // replace() on each instance, and finish the entire operation in O(n) time.
-  size_t replace_length = replace_with.length();
  if (find_length == replace_length) {
-    do {
+    auto* buffer = &((*str)[0]);
-      str->replace(offset, find_length,
+    for (size_t offset = first_match; offset != StringType::npos;
-                   replace_with.data(), replace_with.size());
+         offset = str->find(find_this.data(), offset + replace_length,
-      offset = str->find(find_this.data(), offset + replace_length,
+                            find_length)) {
-                         find_this.size());
+      CharTraits::copy(buffer + offset, replace_with.data(), replace_length);
-    } while (offset != StringType::npos);
+    }
    return;
  }
  // Since the find and replace strings aren't the same length, a loop like the
  // one above would be O(n^2) in the worst case, as replace() will shift the
-  // entire remaining string each time.  We need to be more clever to keep
+  // entire remaining string each time. We need to be more clever to keep things
-  // things O(n).
+  // O(n).
  //
-  // If we're shortening the string, we can alternate replacements with shifting
+  // When the string is being shortened, it's possible to just shift the matches
-  // forward the intervening characters using memmove().
+  // down in one pass while finding, and truncate the length at the end of the
+  // search.
+  //
+  // If the string is being lengthened, more work is required. The strategy used
+  // here is to make two find() passes through the string. The first pass counts
+  // the number of matches to determine the new size. The second pass will
+  // either construct the new string into a new buffer (if the existing buffer
+  // lacked capacity), or else -- if there is room -- create a region of scratch
+  // space after |first_match| by shifting the tail of the string to a higher
+  // index, and doing in-place moves from the tail to lower indices thereafter.
  size_t str_length = str->length();
-  if (find_length > replace_length) {
+  size_t expansion = 0;
-    size_t write_offset = offset;
+  if (replace_length > find_length) {
-    do {
+    // This operation lengthens the string; determine the new length by counting
-      if (replace_length) {
+    // matches.
-        str->replace(write_offset, replace_length,
+    const size_t expansion_per_match = (replace_length - find_length);
-                     replace_with.data(), replace_with.size());
+    size_t num_matches = 0;
-        write_offset += replace_length;
+    for (size_t match = first_match; match != StringType::npos;
-      }
+         match =
-      size_t read_offset = offset + find_length;
+             str->find(find_this.data(), match + find_length, find_length)) {
-      offset = std::min(
+      expansion += expansion_per_match;
-          str->find(find_this.data(), read_offset, find_this.size()),
+      ++num_matches;
-          str_length);
+    }
-      size_t length = offset - read_offset;
+    const size_t final_length = str_length + expansion;
-      if (length) {
-        memmove(&(*str)[write_offset], &(*str)[read_offset],
+    if (str->capacity() < final_length) {
-                length * sizeof(typename StringType::value_type));
+      // If we'd have to allocate a new buffer to grow the string, build the
-        write_offset += length;
+      // result directly into the new allocation via append().
+      StringType src(str->get_allocator());
+      str->swap(src);
+      str->reserve(final_length);
+      size_t pos = 0;
+      for (size_t match = first_match;;
+           match = src.find(find_this.data(), pos, find_length)) {
+        str->append(src, pos, match - pos);
+        str->append(replace_with.data(), replace_length);
+        pos = match + find_length;
+        // A mid-loop test/break enables skipping the final find() call; the
+        // number of matches is known, so don't search past the last one.
+        if (!--num_matches)
+          break;
      }
-    } while (offset < str_length);
-    str->resize(write_offset);
+      // Handle substring after the final match.
-    return;
+      str->append(src, pos, str_length - pos);
+      return;
+    }
+    // Prepare for the copy/move loop below -- expand the string to its final
+    // size by shifting the data after the first match to the end of the resized
+    // string.
+    size_t shift_src = first_match + find_length;
+    size_t shift_dst = shift_src + expansion;
+    // Big |expansion| factors (relative to |str_length|) require padding up to
+    // |shift_dst|.
+    if (shift_dst > str_length)
+      str->resize(shift_dst);
+    str->replace(shift_dst, str_length - shift_src, *str, shift_src,
+                 str_length - shift_src);
+    str_length = final_length;
  }
-  // We're lengthening the string.  We can use alternating replacements and
+  // We can alternate replacement and move operations. This won't overwrite the
-  // memmove() calls like above, but we need to precalculate the final string
+  // unsearched region of the string so long as |write_offset| <= |read_offset|;
-  // length and then expand from back-to-front to avoid overwriting the string
+  // that condition is always satisfied because:
-  // as we're reading it, needing to shift, or having to copy to a second string
+  //
-  // temporarily.
+  //   (a) If the string is being shortened, |expansion| is zero and
-  size_t first_match = offset;
+  //       |write_offset| grows slower than |read_offset|.
+  //
-  // First, calculate the final length and resize the string.
+  //   (b) If the string is being lengthened, |write_offset| grows faster than
-  size_t final_length = str_length;
+  //       |read_offset|, but |expansion| is big enough so that |write_offset|
-  size_t expansion = replace_length - find_length;
+  //       will only catch up to |read_offset| at the point of the last match.
-  size_t current_match;
+  auto* buffer = &((*str)[0]);
+  size_t write_offset = first_match;
+  size_t read_offset = first_match + expansion;
  do {
-    final_length += expansion;
+    if (replace_length) {
-    // Minor optimization: save this offset into |current_match|, so that on
+      CharTraits::copy(buffer + write_offset, replace_with.data(),
-    // exit from the loop, |current_match| will point at the last instance of
+                       replace_length);
-    // the find string, and we won't need to find() it again immediately.
+      write_offset += replace_length;
-    current_match = offset;
+    }
-    offset = str->find(find_this.data(), offset + find_length,
+    read_offset += find_length;
-                       find_this.size());
-  } while (offset != StringType::npos);
+    // min() clamps StringType::npos (the largest unsigned value) to str_length.
-  str->resize(final_length);
+    size_t match = std::min(
+        str->find(find_this.data(), read_offset, find_length), str_length);
-  // Now do the replacement loop, working backwards through the string.
-  for (size_t prev_match = str_length, write_offset = final_length; ;
+    size_t length = match - read_offset;
-       current_match = str->rfind(find_this.data(), current_match - 1,
-                                  find_this.size())) {
-    size_t read_offset = current_match + find_length;
-    size_t length = prev_match - read_offset;
    if (length) {
-      write_offset -= length;
+      CharTraits::move(buffer + write_offset, buffer + read_offset, length);
-      memmove(&(*str)[write_offset], &(*str)[read_offset],
+      write_offset += length;
-              length * sizeof(typename StringType::value_type));
+      read_offset += length;
    }
-    write_offset -= replace_length;
+  } while (read_offset < str_length);
-    str->replace(write_offset, replace_length,
-                 replace_with.data(), replace_with.size());
+  // If we're shortening the string, truncate it now.
-    if (current_match == first_match)
+  str->resize(write_offset);
-      return;
-    prev_match = current_match;
-  }
 }
 void ReplaceFirstSubstringAfterOffset(string16* str,

--- a/base/strings/string_util_unittest.cc
+++ b/base/strings/string_util_unittest.cc
@@ -585,32 +585,66 @@ TEST(StringUtilTest, FormatBytesUnlocalized) {
 }
 TEST(StringUtilTest, ReplaceSubstringsAfterOffset) {
  static const struct {
-    const char* str;
+    StringPiece str;
-    string16::size_type start_offset;
+    size_t start_offset;
-    const char* find_this;
+    StringPiece find_this;
-    const char* replace_with;
+    StringPiece replace_with;
-    const char* expected;
+    StringPiece expected;
  } cases[] = {
-    {"aaa", 0, "a", "b", "bbb"},
+      {"aaa", 0, "a", "b", "bbb"},
-    {"abb", 0, "ab", "a", "ab"},
+      {"aaa", 0, "aa", "b", "ba"},
-    {"Removing some substrings inging", 0, "ing", "", "Remov some substrs "},
+      {"aaa", 0, "aa", "bbb", "bbba"},
-    {"Not found", 0, "x", "0", "Not found"},
+      {"aaaaa", 0, "aa", "b", "bba"},
-    {"Not found again", 5, "x", "0", "Not found again"},
+      {"ababaaababa", 0, "aba", "", "baaba"},
-    {" Making it much longer ", 0, " ", "Four score and seven years ago",
+      {"ababaaababa", 0, "aba", "_", "_baa_ba"},
-     "Four score and seven years agoMakingFour score and seven years agoit"
+      {"ababaaababa", 0, "aba", "__", "__baa__ba"},
-     "Four score and seven years agomuchFour score and seven years agolonger"
+      {"ababaaababa", 0, "aba", "___", "___baa___ba"},
-     "Four score and seven years ago"},
+      {"ababaaababa", 0, "aba", "____", "____baa____ba"},
-    {"Invalid offset", 9999, "t", "foobar", "Invalid offset"},
+      {"ababaaababa", 0, "aba", "_____", "_____baa_____ba"},
-    {"Replace me only me once", 9, "me ", "", "Replace me only once"},
+      {"abb", 0, "ab", "a", "ab"},
-    {"abababab", 2, "ab", "c", "abccc"},
+      {"Removing some substrings inging", 0, "ing", "", "Remov some substrs "},
+      {"Not found", 0, "x", "0", "Not found"},
+      {"Not found again", 5, "x", "0", "Not found again"},
+      {" Making it much longer ", 0, " ", "Four score and seven years ago",
+       "Four score and seven years agoMakingFour score and seven years agoit"
+       "Four score and seven years agomuchFour score and seven years agolonger"
+       "Four score and seven years ago"},
+      {" Making it much much much much shorter ", 0,
+       "Making it much much much much shorter", "", "  "},
+      {"so much much much much much very much much much shorter", 0, "much ",
+       "", "so very shorter"},
+      {"Invalid offset", 9999, "t", "foobar", "Invalid offset"},
+      {"Replace me only me once", 9, "me ", "", "Replace me only once"},
+      {"abababab", 2, "ab", "c", "abccc"},
  };
-  for (size_t i = 0; i < arraysize(cases); i++) {
+  // base::string16 variant
-    string16 str = ASCIIToUTF16(cases[i].str);
+  for (const auto& scenario : cases) {
-    ReplaceSubstringsAfterOffset(&str, cases[i].start_offset,
+    string16 str = ASCIIToUTF16(scenario.str);
-                                 ASCIIToUTF16(cases[i].find_this),
+    ReplaceSubstringsAfterOffset(&str, scenario.start_offset,
-                                 ASCIIToUTF16(cases[i].replace_with));
+                                 ASCIIToUTF16(scenario.find_this),
-    EXPECT_EQ(ASCIIToUTF16(cases[i].expected), str);
+                                 ASCIIToUTF16(scenario.replace_with));
+    EXPECT_EQ(ASCIIToUTF16(scenario.expected), str);
+  }
+  // std::string with insufficient capacity: expansion must realloc the buffer.
+  for (const auto& scenario : cases) {
+    std::string str = scenario.str.as_string();
+    str.shrink_to_fit();  // This is nonbinding, but it's the best we've got.
+    ReplaceSubstringsAfterOffset(&str, scenario.start_offset,
+                                 scenario.find_this, scenario.replace_with);
+    EXPECT_EQ(scenario.expected, str);
+  }
+  // std::string with ample capacity: should be possible to grow in-place.
+  for (const auto& scenario : cases) {
+    std::string str = scenario.str.as_string();
+    str.reserve(std::max(scenario.str.length(), scenario.expected.length()) *
+                2);
+    ReplaceSubstringsAfterOffset(&str, scenario.start_offset,
+                                 scenario.find_this, scenario.replace_with);
+    EXPECT_EQ(scenario.expected, str);
  }
 }