Commit 09d9682b authored by nick's avatar nick Committed by Commit Bot

[string_util] fix bug in ReplaceSubstringsAfterOffset()

The problem with DoReplaceSubstringsAfterOffset was that the following search
would not terminate:

  ReplaceSubstringsAfterOffset(std::string("aaabaa"), 0, "aa", "ccc");

The root cause of this is an algorithmic bug, where it was assumed that
string::rfind and string::find would return the same matches in reversed order.

The fix is to only scan forward using find(). For the length-expansion case, if
capacity is insufficient, we swap with a temporary and build the result into new
memory. If existing capacity suffices, we'll shift the tail of the string down
to the new size, and then use left-to-right memmove loop of the length-
contraction case, with the shifted tail as the src.

BUG=749228

Review-Url: https://codereview.chromium.org/2979393002
Cr-Commit-Position: refs/heads/master@{#491170}
parent 310fda3e
...@@ -712,111 +712,148 @@ string16 FormatBytesUnlocalized(int64_t bytes) { ...@@ -712,111 +712,148 @@ string16 FormatBytesUnlocalized(int64_t bytes) {
} }
// Runs in O(n) time in the length of |str|. // Runs in O(n) time in the length of |str|.
template<class StringType> template <class StringType>
void DoReplaceSubstringsAfterOffset(StringType* str, void DoReplaceSubstringsAfterOffset(StringType* str,
size_t offset, size_t initial_offset,
BasicStringPiece<StringType> find_this, BasicStringPiece<StringType> find_this,
BasicStringPiece<StringType> replace_with, BasicStringPiece<StringType> replace_with,
bool replace_all) { bool replace_all) {
using CharTraits = typename StringType::traits_type;
DCHECK(!find_this.empty()); DCHECK(!find_this.empty());
// If the find string doesn't appear, there's nothing to do. // If the find string doesn't appear, there's nothing to do.
offset = str->find(find_this.data(), offset, find_this.size()); const size_t find_length = find_this.length();
if (offset == StringType::npos) size_t first_match = str->find(find_this.data(), initial_offset, find_length);
if (first_match == StringType::npos)
return; return;
// If we're only replacing one instance, there's no need to do anything // If we're only replacing one instance, there's no need to do anything
// complicated. // complicated.
size_t find_length = find_this.length(); const size_t replace_length = replace_with.length();
if (!replace_all) { if (!replace_all) {
str->replace(offset, find_length, replace_with.data(), replace_with.size()); str->replace(first_match, find_length, replace_with.data(), replace_length);
return; return;
} }
// If the find and replace strings are the same length, we can simply use // If the find and replace strings are the same length, we can simply use
// replace() on each instance, and finish the entire operation in O(n) time. // replace() on each instance, and finish the entire operation in O(n) time.
size_t replace_length = replace_with.length();
if (find_length == replace_length) { if (find_length == replace_length) {
do { auto* buffer = &((*str)[0]);
str->replace(offset, find_length, for (size_t offset = first_match; offset != StringType::npos;
replace_with.data(), replace_with.size()); offset = str->find(find_this.data(), offset + replace_length,
offset = str->find(find_this.data(), offset + replace_length, find_length)) {
find_this.size()); CharTraits::copy(buffer + offset, replace_with.data(), replace_length);
} while (offset != StringType::npos); }
return; return;
} }
// Since the find and replace strings aren't the same length, a loop like the // Since the find and replace strings aren't the same length, a loop like the
// one above would be O(n^2) in the worst case, as replace() will shift the // one above would be O(n^2) in the worst case, as replace() will shift the
// entire remaining string each time. We need to be more clever to keep // entire remaining string each time. We need to be more clever to keep things
// things O(n). // O(n).
// //
// If we're shortening the string, we can alternate replacements with shifting // When the string is being shortened, it's possible to just shift the matches
// forward the intervening characters using memmove(). // down in one pass while finding, and truncate the length at the end of the
// search.
//
// If the string is being lengthened, more work is required. The strategy used
// here is to make two find() passes through the string. The first pass counts
// the number of matches to determine the new size. The second pass will
// either construct the new string into a new buffer (if the existing buffer
// lacked capacity), or else -- if there is room -- create a region of scratch
// space after |first_match| by shifting the tail of the string to a higher
// index, and doing in-place moves from the tail to lower indices thereafter.
size_t str_length = str->length(); size_t str_length = str->length();
if (find_length > replace_length) { size_t expansion = 0;
size_t write_offset = offset; if (replace_length > find_length) {
do { // This operation lengthens the string; determine the new length by counting
if (replace_length) { // matches.
str->replace(write_offset, replace_length, const size_t expansion_per_match = (replace_length - find_length);
replace_with.data(), replace_with.size()); size_t num_matches = 0;
write_offset += replace_length; for (size_t match = first_match; match != StringType::npos;
} match =
size_t read_offset = offset + find_length; str->find(find_this.data(), match + find_length, find_length)) {
offset = std::min( expansion += expansion_per_match;
str->find(find_this.data(), read_offset, find_this.size()), ++num_matches;
str_length); }
size_t length = offset - read_offset; const size_t final_length = str_length + expansion;
if (length) {
memmove(&(*str)[write_offset], &(*str)[read_offset], if (str->capacity() < final_length) {
length * sizeof(typename StringType::value_type)); // If we'd have to allocate a new buffer to grow the string, build the
write_offset += length; // result directly into the new allocation via append().
StringType src(str->get_allocator());
str->swap(src);
str->reserve(final_length);
size_t pos = 0;
for (size_t match = first_match;;
match = src.find(find_this.data(), pos, find_length)) {
str->append(src, pos, match - pos);
str->append(replace_with.data(), replace_length);
pos = match + find_length;
// A mid-loop test/break enables skipping the final find() call; the
// number of matches is known, so don't search past the last one.
if (!--num_matches)
break;
} }
} while (offset < str_length);
str->resize(write_offset); // Handle substring after the final match.
return; str->append(src, pos, str_length - pos);
return;
}
// Prepare for the copy/move loop below -- expand the string to its final
// size by shifting the data after the first match to the end of the resized
// string.
size_t shift_src = first_match + find_length;
size_t shift_dst = shift_src + expansion;
// Big |expansion| factors (relative to |str_length|) require padding up to
// |shift_dst|.
if (shift_dst > str_length)
str->resize(shift_dst);
str->replace(shift_dst, str_length - shift_src, *str, shift_src,
str_length - shift_src);
str_length = final_length;
} }
// We're lengthening the string. We can use alternating replacements and // We can alternate replacement and move operations. This won't overwrite the
// memmove() calls like above, but we need to precalculate the final string // unsearched region of the string so long as |write_offset| <= |read_offset|;
// length and then expand from back-to-front to avoid overwriting the string // that condition is always satisfied because:
// as we're reading it, needing to shift, or having to copy to a second string //
// temporarily. // (a) If the string is being shortened, |expansion| is zero and
size_t first_match = offset; // |write_offset| grows slower than |read_offset|.
//
// First, calculate the final length and resize the string. // (b) If the string is being lengthened, |write_offset| grows faster than
size_t final_length = str_length; // |read_offset|, but |expansion| is big enough so that |write_offset|
size_t expansion = replace_length - find_length; // will only catch up to |read_offset| at the point of the last match.
size_t current_match; auto* buffer = &((*str)[0]);
size_t write_offset = first_match;
size_t read_offset = first_match + expansion;
do { do {
final_length += expansion; if (replace_length) {
// Minor optimization: save this offset into |current_match|, so that on CharTraits::copy(buffer + write_offset, replace_with.data(),
// exit from the loop, |current_match| will point at the last instance of replace_length);
// the find string, and we won't need to find() it again immediately. write_offset += replace_length;
current_match = offset; }
offset = str->find(find_this.data(), offset + find_length, read_offset += find_length;
find_this.size());
} while (offset != StringType::npos); // min() clamps StringType::npos (the largest unsigned value) to str_length.
str->resize(final_length); size_t match = std::min(
str->find(find_this.data(), read_offset, find_length), str_length);
// Now do the replacement loop, working backwards through the string.
for (size_t prev_match = str_length, write_offset = final_length; ; size_t length = match - read_offset;
current_match = str->rfind(find_this.data(), current_match - 1,
find_this.size())) {
size_t read_offset = current_match + find_length;
size_t length = prev_match - read_offset;
if (length) { if (length) {
write_offset -= length; CharTraits::move(buffer + write_offset, buffer + read_offset, length);
memmove(&(*str)[write_offset], &(*str)[read_offset], write_offset += length;
length * sizeof(typename StringType::value_type)); read_offset += length;
} }
write_offset -= replace_length; } while (read_offset < str_length);
str->replace(write_offset, replace_length,
replace_with.data(), replace_with.size()); // If we're shortening the string, truncate it now.
if (current_match == first_match) str->resize(write_offset);
return;
prev_match = current_match;
}
} }
void ReplaceFirstSubstringAfterOffset(string16* str, void ReplaceFirstSubstringAfterOffset(string16* str,
......
...@@ -585,32 +585,66 @@ TEST(StringUtilTest, FormatBytesUnlocalized) { ...@@ -585,32 +585,66 @@ TEST(StringUtilTest, FormatBytesUnlocalized) {
} }
TEST(StringUtilTest, ReplaceSubstringsAfterOffset) { TEST(StringUtilTest, ReplaceSubstringsAfterOffset) {
static const struct { static const struct {
const char* str; StringPiece str;
string16::size_type start_offset; size_t start_offset;
const char* find_this; StringPiece find_this;
const char* replace_with; StringPiece replace_with;
const char* expected; StringPiece expected;
} cases[] = { } cases[] = {
{"aaa", 0, "a", "b", "bbb"}, {"aaa", 0, "a", "b", "bbb"},
{"abb", 0, "ab", "a", "ab"}, {"aaa", 0, "aa", "b", "ba"},
{"Removing some substrings inging", 0, "ing", "", "Remov some substrs "}, {"aaa", 0, "aa", "bbb", "bbba"},
{"Not found", 0, "x", "0", "Not found"}, {"aaaaa", 0, "aa", "b", "bba"},
{"Not found again", 5, "x", "0", "Not found again"}, {"ababaaababa", 0, "aba", "", "baaba"},
{" Making it much longer ", 0, " ", "Four score and seven years ago", {"ababaaababa", 0, "aba", "_", "_baa_ba"},
"Four score and seven years agoMakingFour score and seven years agoit" {"ababaaababa", 0, "aba", "__", "__baa__ba"},
"Four score and seven years agomuchFour score and seven years agolonger" {"ababaaababa", 0, "aba", "___", "___baa___ba"},
"Four score and seven years ago"}, {"ababaaababa", 0, "aba", "____", "____baa____ba"},
{"Invalid offset", 9999, "t", "foobar", "Invalid offset"}, {"ababaaababa", 0, "aba", "_____", "_____baa_____ba"},
{"Replace me only me once", 9, "me ", "", "Replace me only once"}, {"abb", 0, "ab", "a", "ab"},
{"abababab", 2, "ab", "c", "abccc"}, {"Removing some substrings inging", 0, "ing", "", "Remov some substrs "},
{"Not found", 0, "x", "0", "Not found"},
{"Not found again", 5, "x", "0", "Not found again"},
{" Making it much longer ", 0, " ", "Four score and seven years ago",
"Four score and seven years agoMakingFour score and seven years agoit"
"Four score and seven years agomuchFour score and seven years agolonger"
"Four score and seven years ago"},
{" Making it much much much much shorter ", 0,
"Making it much much much much shorter", "", " "},
{"so much much much much much very much much much shorter", 0, "much ",
"", "so very shorter"},
{"Invalid offset", 9999, "t", "foobar", "Invalid offset"},
{"Replace me only me once", 9, "me ", "", "Replace me only once"},
{"abababab", 2, "ab", "c", "abccc"},
}; };
for (size_t i = 0; i < arraysize(cases); i++) { // base::string16 variant
string16 str = ASCIIToUTF16(cases[i].str); for (const auto& scenario : cases) {
ReplaceSubstringsAfterOffset(&str, cases[i].start_offset, string16 str = ASCIIToUTF16(scenario.str);
ASCIIToUTF16(cases[i].find_this), ReplaceSubstringsAfterOffset(&str, scenario.start_offset,
ASCIIToUTF16(cases[i].replace_with)); ASCIIToUTF16(scenario.find_this),
EXPECT_EQ(ASCIIToUTF16(cases[i].expected), str); ASCIIToUTF16(scenario.replace_with));
EXPECT_EQ(ASCIIToUTF16(scenario.expected), str);
}
// std::string with insufficient capacity: expansion must realloc the buffer.
for (const auto& scenario : cases) {
std::string str = scenario.str.as_string();
str.shrink_to_fit(); // This is nonbinding, but it's the best we've got.
ReplaceSubstringsAfterOffset(&str, scenario.start_offset,
scenario.find_this, scenario.replace_with);
EXPECT_EQ(scenario.expected, str);
}
// std::string with ample capacity: should be possible to grow in-place.
for (const auto& scenario : cases) {
std::string str = scenario.str.as_string();
str.reserve(std::max(scenario.str.length(), scenario.expected.length()) *
2);
ReplaceSubstringsAfterOffset(&str, scenario.start_offset,
scenario.find_this, scenario.replace_with);
EXPECT_EQ(scenario.expected, str);
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment