Remove UnescapeURLComponent() overload that takes a base::string16.

The method has some safe-for-display safety checks that assume both the input and the output are UTF-8. This change makes it at least a little harder to bypass those checks, and makes the output no longer vary based on whether the caller passes in a std::string or a string16 (by removing the latter option entirely).

Bug: 831321
Change-Id: Ib39a2cccd71861213341e92932525e8ecafc60cd
Reviewed-on: https://chromium-review.googlesource.com/1004855
Reviewed-by: Matt Giuca <mgiuca@chromium.org>
Reviewed-by: Justin Donnelly <jdonnelly@chromium.org>
Commit-Queue: Matt Menke <mmenke@chromium.org>
Cr-Commit-Position: refs/heads/master@{#550720}

Remove UnescapeURLComponent() overload that takes a base::string16.
The method has some safe-for-display safety checks that assume both the input and the output are UTF-8. This change makes it at least a little harder to bypass those checks, and makes the output no longer vary based on whether the caller passes in a std::string or a string16 (by removing the latter option entirely).

Bug: 831321
Change-Id: Ib39a2cccd71861213341e92932525e8ecafc60cd
Reviewed-on: https://chromium-review.googlesource.com/1004855
Reviewed-by: Matt Giuca <mgiuca@chromium.org>
Reviewed-by: Justin Donnelly <jdonnelly@chromium.org>
Commit-Queue: Matt Menke <mmenke@chromium.org>
Cr-Commit-Position: refs/heads/master@{#550720}
4858757b · Matt Menke · Commit Bot · f3e10319 · 4858757b · 4858757b
Commit 4858757b authored Apr 13, 2018 by Matt Menke Committed by Commit Bot Apr 13, 2018
5 changed files
--- a/components/omnibox/browser/url_index_private_data.cc
+++ b/components/omnibox/browser/url_index_private_data.cc
@@ -190,10 +190,13 @@ ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
    // the final filtering we need whitespace separated substrings possibly
    // containing escaped characters.
    base::string16 lower_raw_string(base::i18n::ToLower(search_string));
-    base::string16 lower_unescaped_string = net::UnescapeURLComponent(
-        lower_raw_string,
-        net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |
-            net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);
+    // Have to convert to UTF-8 and back, because UnescapeURLComponent doesn't
+    // support unescaping UTF-8 characters and converting them to UTF-16.
+    base::string16 lower_unescaped_string =
+        base::UTF8ToUTF16(net::UnescapeURLComponent(
+            base::UTF16ToUTF8(lower_raw_string),
+            net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |
+                net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS));

    // Extract individual 'words' (as opposed to 'terms'; see comment in
    // HistoryIdsToScoredMatches()) from the search string. When the user types

--- a/net/base/escape.cc
+++ b/net/base/escape.cc
@@ -101,18 +101,15 @@ const char kUrlUnescape[128] = {
 // Attempts to unescape the sequence at |index| within |escaped_text|.  If
 // successful, sets |value| to the unescaped value.  Returns whether
 // unescaping succeeded.
-template <typename STR>
-bool UnescapeUnsignedCharAtIndex(STR escaped_text,
+bool UnescapeUnsignedCharAtIndex(base::StringPiece escaped_text,
                                 size_t index,
                                 unsigned char* value) {
  if ((index + 2) >= escaped_text.size())
    return false;
  if (escaped_text[index] != '%')
    return false;
-  const typename STR::value_type most_sig_digit(
-      static_cast<typename STR::value_type>(escaped_text[index + 1]));
-  const typename STR::value_type least_sig_digit(
-      static_cast<typename STR::value_type>(escaped_text[index + 2]));
+  char most_sig_digit(escaped_text[index + 1]);
+  char least_sig_digit(escaped_text[index + 2]);
  if (base::IsHexDigit(most_sig_digit) && base::IsHexDigit(least_sig_digit)) {
    *value = base::HexDigitToInt(most_sig_digit) * 16 +
             base::HexDigitToInt(least_sig_digit);
@@ -123,8 +120,7 @@ bool UnescapeUnsignedCharAtIndex(STR escaped_text,

 // Returns true if there is an Arabic Language Mark at |index|. |first_byte|
 // is the byte at |index|.
-template <typename STR>
-bool HasArabicLanguageMarkAtIndex(STR escaped_text,
+bool HasArabicLanguageMarkAtIndex(base::StringPiece escaped_text,
                                  unsigned char first_byte,
                                  size_t index) {
  if (first_byte != 0xD8)
@@ -137,8 +133,7 @@ bool HasArabicLanguageMarkAtIndex(STR escaped_text,

 // Returns true if there is a BiDi control char at |index|. |first_byte| is the
 // byte at |index|.
-template <typename STR>
-bool HasThreeByteBidiControlCharAtIndex(STR escaped_text,
+bool HasThreeByteBidiControlCharAtIndex(base::StringPiece escaped_text,
                                        unsigned char first_byte,
                                        size_t index) {
  if (first_byte != 0xE2)
@@ -161,8 +156,7 @@ bool HasThreeByteBidiControlCharAtIndex(STR escaped_text,

 // Returns true if there is a four-byte banned char at |index|. |first_byte| is
 // the byte at |index|.
-template <typename STR>
-bool HasFourByteBannedCharAtIndex(STR escaped_text,
+bool HasFourByteBannedCharAtIndex(base::StringPiece escaped_text,
                                  unsigned char first_byte,
                                  size_t index) {
  // The following characters are blacklisted for spoofability concerns.
@@ -196,9 +190,8 @@ bool HasFourByteBannedCharAtIndex(STR escaped_text,
 // the alterations done to the string that are not one-character-to-one-
 // character.  The resulting |adjustments| will always be sorted by increasing
 // offset.
-template <typename STR>
-STR UnescapeURLWithAdjustmentsImpl(
-    base::BasicStringPiece<STR> escaped_text,
+std::string UnescapeURLWithAdjustmentsImpl(
+    base::StringPiece escaped_text,
    UnescapeRule::Type rules,
    base::OffsetAdjuster::Adjustments* adjustments) {
  if (adjustments)
@@ -210,7 +203,7 @@ STR UnescapeURLWithAdjustmentsImpl(
  // The output of the unescaping is always smaller than the input, so we can
  // reserve the input size to make sure we have enough buffer and don't have
  // to allocate in the loop below.
-  STR result;
+  std::string result;
  result.reserve(escaped_text.length());

  // Locations of adjusted text.
@@ -436,11 +429,6 @@ std::string UnescapeURLComponent(base::StringPiece escaped_text,
  return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, NULL);
 }

-base::string16 UnescapeURLComponent(base::StringPiece16 escaped_text,
-                                    UnescapeRule::Type rules) {
-  return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, NULL);
-}
-
 base::string16 UnescapeAndDecodeUTF8URLComponent(base::StringPiece text,
                                                 UnescapeRule::Type rules) {
  return UnescapeAndDecodeUTF8URLComponentWithAdjustments(text, rules, NULL);

--- a/net/base/escape.h
+++ b/net/base/escape.h
@@ -118,15 +118,13 @@ class UnescapeRule {
 // a hex digit, and converting to the character with the numerical value of
 // those digits. Thus "i%20=%203%3b" unescapes to "i = 3;".
 //
-// Watch out: this doesn't necessarily result in the correct final result,
-// because the encoding may be unknown. For example, the input might be ASCII,
-// which, after unescaping, is supposed to be interpreted as UTF-8, and then
-// converted into full UTF-16 chars. This function won't tell you if any
-// conversions need to take place, it only unescapes.
+// This method does not ensure that the output is a valid string using any
+// character encoding. However, unless SPOOFING_AND_CONTROL_CHARS is set, it
+// does leave escaped certain byte sequences that would be dangerous to display
+// to the user, because if interpreted as UTF-8, they could be used to mislead
+// the user.
 NET_EXPORT std::string UnescapeURLComponent(base::StringPiece escaped_text,
                                            UnescapeRule::Type rules);
-NET_EXPORT base::string16 UnescapeURLComponent(base::StringPiece16 escaped_text,
-                                               UnescapeRule::Type rules);

 // Unescapes the given substring as a URL, and then tries to interpret the
 // result as being encoded as UTF-8. If the result is convertable into UTF-8, it

--- a/net/base/escape_unittest.cc
+++ b/net/base/escape_unittest.cc
--- a/net/base/unescape_url_component_fuzzer.cc
+++ b/net/base/unescape_url_component_fuzzer.cc
@@ -18,18 +18,5 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
    net::UnescapeURLComponent(path, static_cast<net::UnescapeRule::Type>(i));
  }

-  // When non-empty, align |data| to sizeof(char16).
-  if ((size > 0) && ((size % 2) == 1)) {
-    data++;
-    size--;
-  }
-
-  // Test for StringPiece16.
-  base::StringPiece16 path16(reinterpret_cast<const base::char16*>(data),
-                             size / 2);
-  for (int i = 0; i <= kMaxUnescapeRule; i++) {
-    net::UnescapeURLComponent(path16, static_cast<net::UnescapeRule::Type>(i));
-  }
-
  return 0;
 }