Restructure UnescapeURLWithAdjustmentsImpl().

In particular, unescape entire unicode characters at once, and then compare against unescape blacklists, rather than the other way around, to simplify code and avoid the tree structure of the old code. This will also allow the method to use icu's code point classification logic, at some point in the future. Also separate out comparing against the character blacklist and UTF-8 character decoding into separate methods, and add a few more test cases to unittest. The method itself should behave exactly the same as before. Bug: 824715 Change-Id: I5311f25bfda4132b122ec4a079740adf093099a3 Reviewed-on: https://chromium-review.googlesource.com/998014 Commit-Queue: Matt Menke <mmenke@chromium.org> Reviewed-by: Matt Giuca <mgiuca@chromium.org> Reviewed-by: Helen Li <xunjieli@chromium.org> Cr-Commit-Position: refs/heads/master@{#551029}

Restructure UnescapeURLWithAdjustmentsImpl().
In particular, unescape entire unicode characters at once, and then compare against unescape blacklists, rather than the other way around, to simplify code and avoid the tree structure of the old code. This will also allow the method to use icu's code point classification logic, at some point in the future. Also separate out comparing against the character blacklist and UTF-8 character decoding into separate methods, and add a few more test cases to unittest. The method itself should behave exactly the same as before. Bug: 824715 Change-Id: I5311f25bfda4132b122ec4a079740adf093099a3 Reviewed-on: https://chromium-review.googlesource.com/998014 Commit-Queue: Matt Menke <mmenke@chromium.org> Reviewed-by: Matt Giuca <mgiuca@chromium.org> Reviewed-by: Helen Li <xunjieli@chromium.org> Cr-Commit-Position: refs/heads/master@{#551029}
6e8dbd1d · Matt Menke · Commit Bot · dd942765 · 6e8dbd1d · 6e8dbd1d
Commit 6e8dbd1d authored Apr 16, 2018 by Matt Menke Committed by Commit Bot Apr 16, 2018
Show whitespace changes
Inline Side-by-side

Showing with 208 additions and 186 deletions

net/base/escape.cc net/base/escape.cc +151 -146

net/base/escape.h net/base/escape.h +4 -2

net/base/escape_unittest.cc net/base/escape_unittest.cc +53 -38

No files found.
--- a/net/base/escape.cc
+++ b/net/base/escape.cc
@@ -6,7 +6,9 @@
 #include "base/logging.h"
 #include "base/strings/string_util.h"
+#include "base/strings/utf_string_conversion_utils.h"
 #include "base/strings/utf_string_conversions.h"
+#include "base/third_party/icu/icu_utf.h"
 namespace net {
@@ -101,7 +103,7 @@ const char kUrlUnescape[128] = {
 // Attempts to unescape the sequence at |index| within |escaped_text|.  If
 // successful, sets |value| to the unescaped value.  Returns whether
 // unescaping succeeded.
-bool UnescapeUnsignedCharAtIndex(base::StringPiece escaped_text,
+bool UnescapeUnsignedByteAtIndex(base::StringPiece escaped_text,
                                 size_t index,
                                 unsigned char* value) {
  if ((index + 2) >= escaped_text.size())
@@ -118,71 +120,115 @@ bool UnescapeUnsignedCharAtIndex(base::StringPiece escaped_text,
  return false;
 }
-// Returns true if there is an Arabic Language Mark at |index|. |first_byte|
+// Attempts to unescape and decode a UTF-8-encoded percent-escaped character at
-// is the byte at |index|.
+// the specified index. On success, returns true, sets |code_point_out| to be
-bool HasArabicLanguageMarkAtIndex(base::StringPiece escaped_text,
+// the character's code point and |unescaped_out| to be the unescaped UTF-8
-                                  unsigned char first_byte,
+// string. |unescaped_out| will always be 1/3rd the length of the substring of
-                                  size_t index) {
+// |escaped_text| that corresponds to the unescaped character.
-  if (first_byte != 0xD8)
+bool UnescapeUTF8CharacterAtIndex(base::StringPiece escaped_text,
-    return false;
+                                  size_t index,
-  unsigned char second_byte;
+                                  uint32_t* code_point_out,
-  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))
+                                  std::string* unescaped_out) {
-    return false;
+  DCHECK(unescaped_out->empty());
-  return second_byte == 0x9c;
-}
-// Returns true if there is a BiDi control char at |index|. |first_byte| is the
+  unsigned char bytes[CBU8_MAX_LENGTH];
-// byte at |index|.
+  if (!UnescapeUnsignedByteAtIndex(escaped_text, index, &bytes[0]))
-bool HasThreeByteBidiControlCharAtIndex(base::StringPiece escaped_text,
-                                        unsigned char first_byte,
-                                        size_t index) {
-  if (first_byte != 0xE2)
-    return false;
-  unsigned char second_byte;
-  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))
-    return false;
-  if (second_byte != 0x80 && second_byte != 0x81)
-    return false;
-  unsigned char third_byte;
-  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte))
    return false;
-  if (second_byte == 0x80) {
-    return third_byte == 0x8E ||
-           third_byte == 0x8F ||
-           (third_byte >= 0xAA && third_byte <= 0xAE);
-  }
-  return third_byte >= 0xA6 && third_byte <= 0xA9;
-}
-// Returns true if there is a four-byte banned char at |index|. |first_byte| is
+  size_t num_bytes = 1;
-// the byte at |index|.
-bool HasFourByteBannedCharAtIndex(base::StringPiece escaped_text,
+  // If this is a lead byte, need to collect trail bytes as well.
-                                  unsigned char first_byte,
+  if (CBU8_IS_LEAD(bytes[0])) {
-                                  size_t index) {
+    // Look for the last trail byte of the UTF-8 character.  Give up once
-  // The following characters are blacklisted for spoofability concerns.
+    // reach max character length number of bytes, or hit an unescaped
-  // U+1F50F LOCK WITH INK PEN         (%F0%9F%94%8F)
+    // character. No need to check length of escaped_text, as
-  // U+1F510 CLOSED LOCK WITH KEY      (%F0%9F%94%90)
+    // UnescapeUnsignedByteAtIndex checks lengths.
-  // U+1F512 LOCK                      (%F0%9F%94%92)
+    while (num_bytes < arraysize(bytes) &&
-  // U+1F513 OPEN LOCK                 (%F0%9F%94%93)
+           UnescapeUnsignedByteAtIndex(escaped_text, index + num_bytes * 3,
-  if (first_byte != 0xF0)
+                                       &bytes[num_bytes]) &&
-    return false;
+           CBU8_IS_TRAIL(bytes[num_bytes])) {
+      ++num_bytes;
+    }
+  }
-  unsigned char second_byte;
+  int32_t char_index = 0;
-  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte) ||
+  // Check if the unicode "character" that was just unescaped is valid.
-      second_byte != 0x9F) {
+  if (!base::ReadUnicodeCharacter(reinterpret_cast<char*>(bytes), num_bytes,
+                                  &char_index, code_point_out)) {
    return false;
  }
-  unsigned char third_byte;
+  // It's possible that a prefix of |bytes| forms a valid UTF-8 character,
-  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte) ||
+  // and the rest are not valid UTF-8, so need to update |num_bytes| based
-      third_byte != 0x94) {
+  // on the result of ReadUnicodeCharacter().
-    return false;
+  num_bytes = char_index + 1;
+  *unescaped_out = std::string(reinterpret_cast<char*>(bytes), num_bytes);
+  return true;
+}
+// This method takes a Unicode code point and returns true if it should be
+// unescaped, based on |rules|.
+bool ShouldUnescapeCodePoint(UnescapeRule::Type rules, uint32_t code_point) {
+  // If this is an ASCII character, use the lookup table.
+  if (code_point < 0x80) {
+    return kUrlUnescape[code_point] ||
+           // Allow some additional unescaping when flags are set.
+           (code_point == ' ' && (rules & UnescapeRule::SPACES)) ||
+           // Allow any of the prohibited but non-control characters when doing
+           // "special" chars.
+           ((code_point == '/' || code_point == '\\') &&
+            (rules & UnescapeRule::PATH_SEPARATORS)) ||
+           (code_point > ' ' && code_point != '/' && code_point != '\\' &&
+            (rules & UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS)) ||
+           // Additionally allow non-display characters if requested.
+           (code_point < ' ' &&
+            (rules & UnescapeRule::SPOOFING_AND_CONTROL_CHARS));
  }
-  unsigned char fourth_byte;
+  // Some schemes such as data: and file: need to parse the exact binary data
-  return UnescapeUnsignedCharAtIndex(escaped_text, index + 9, &fourth_byte) &&
+  // when loading the URL. For that reason, SPOOFING_AND_CONTROL_CHARS allows
-         (fourth_byte == 0x8F || fourth_byte == 0x90 || fourth_byte == 0x92 ||
+  // unescaping UTF-8 byte sequences that are not safe to display. DO NOT use
-          fourth_byte == 0x93);
+  // SPOOFING_AND_CONTROL_CHARS if the parsed URL is going to be displayed in
+  // the UI.
+  if (rules & UnescapeRule::SPOOFING_AND_CONTROL_CHARS)
+    return true;
+  // Compare the codepoint against a list of characters that can be used
+  // to spoof other URLs.
+  //
+  // Can't use icu to make this cleaner, because Cronet cannot depend on
+  // icu, and currently uses this file.
+  // TODO(https://crbug.com/829873): Try to make this use icu, both to
+  // protect against regressions as the Unicode stndard is updated and to
+  // reduce the number of long lists of characters.
+  // TODO(https://crbug.com/824715): Add default ignoreable and formatting
+  // codepoints.
+  return !(
+      // Per http://tools.ietf.org/html/rfc3987#section-4.1, certain BiDi
+      // control characters are not allowed to appear unescaped in URLs.
+      code_point == 0x200E ||  // LEFT-TO-RIGHT MARK         (%E2%80%8E)
+      code_point == 0x200F ||  // RIGHT-TO-LEFT MARK         (%E2%80%8F)
+      code_point == 0x202A ||  // LEFT-TO-RIGHT EMBEDDING    (%E2%80%AA)
+      code_point == 0x202B ||  // RIGHT-TO-LEFT EMBEDDING    (%E2%80%AB)
+      code_point == 0x202C ||  // POP DIRECTIONAL FORMATTING (%E2%80%AC)
+      code_point == 0x202D ||  // LEFT-TO-RIGHT OVERRIDE     (%E2%80%AD)
+      code_point == 0x202E ||  // RIGHT-TO-LEFT OVERRIDE     (%E2%80%AE)
+      // The Unicode Technical Report (TR9) as referenced by RFC 3987 above has
+      // since added some new BiDi control characters that are not safe to
+      // unescape. http://www.unicode.org/reports/tr9
+      code_point == 0x061C ||  // ARABIC LETTER MARK         (%D8%9C)
+      code_point == 0x2066 ||  // LEFT-TO-RIGHT ISOLATE      (%E2%81%A6)
+      code_point == 0x2067 ||  // RIGHT-TO-LEFT ISOLATE      (%E2%81%A7)
+      code_point == 0x2068 ||  // FIRST STRONG ISOLATE       (%E2%81%A8)
+      code_point == 0x2069 ||  // POP DIRECTIONAL ISOLATE    (%E2%81%A9)
+      // The following spoofable characters are also banned in unescaped URLs,
+      // because they could be used to imitate parts of a web browser's UI.
+      code_point == 0x1F50F ||  // LOCK WITH INK PEN    (%F0%9F%94%8F)
+      code_point == 0x1F510 ||  // CLOSED LOCK WITH KEY (%F0%9F%94%90)
+      code_point == 0x1F512 ||  // LOCK                 (%F0%9F%94%92)
+      code_point == 0x1F513);   // OPEN LOCK            (%F0%9F%94%93)
 }
 // Unescapes |escaped_text| according to |rules|, returning the resulting
@@ -207,101 +253,60 @@ std::string UnescapeURLWithAdjustmentsImpl(
  result.reserve(escaped_text.length());
  // Locations of adjusted text.
-  for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
+  for (size_t i = 0, max = escaped_text.size(); i < max;) {
-    if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
+    // Try to unescape the character.
-      // Non ASCII character, append as is.
+    uint32_t code_point;
-      result.push_back(escaped_text[i]);
+    std::string unescaped;
+    if (!UnescapeUTF8CharacterAtIndex(escaped_text, i, &code_point,
+                                      &unescaped)) {
+      // Check if the next character can be unescaped, but not as a valid UTF-8
+      // character. In that case, just unescaped and write the non-sense
+      // character.
+      //
+      // TODO(https://crbug.com/829868): Do not unescape illegal UTF-8 sequences
+      // unless SPOOFING_AND_CONTROL_CHARS is given. Should also split that
+      // behaviour off into a separate function.
+      unsigned char non_utf8_byte;
+      if (UnescapeUnsignedByteAtIndex(escaped_text, i, &non_utf8_byte)) {
+        result.push_back(non_utf8_byte);
+        if (adjustments)
+          adjustments->push_back(base::OffsetAdjuster::Adjustment(i, 3, 1));
+        i += 3;
        continue;
      }
-    unsigned char first_byte;
+      // Character is not escaped, so append as is, unless it's a '+' and
-    if (UnescapeUnsignedCharAtIndex(escaped_text, i, &first_byte)) {
+      // REPLACE_PLUS_WITH_SPACE is being applied.
-      // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi
+      if (escaped_text[i] == '+' &&
-      // control characters are not allowed to appear unescaped in URLs:
+          (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)) {
-      //
+        result.push_back(' ');
-      // U+200E LEFT-TO-RIGHT MARK         (%E2%80%8E)
+      } else {
-      // U+200F RIGHT-TO-LEFT MARK         (%E2%80%8F)
+        result.push_back(escaped_text[i]);
-      // U+202A LEFT-TO-RIGHT EMBEDDING    (%E2%80%AA)
-      // U+202B RIGHT-TO-LEFT EMBEDDING    (%E2%80%AB)
-      // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC)
-      // U+202D LEFT-TO-RIGHT OVERRIDE     (%E2%80%AD)
-      // U+202E RIGHT-TO-LEFT OVERRIDE     (%E2%80%AE)
-      //
-      // Additionally, the Unicode Technical Report (TR9) as referenced by RFC
-      // 3987 above has since added some new BiDi control characters.
-      // http://www.unicode.org/reports/tr9
-      //
-      // U+061C ARABIC LETTER MARK         (%D8%9C)
-      // U+2066 LEFT-TO-RIGHT ISOLATE      (%E2%81%A6)
-      // U+2067 RIGHT-TO-LEFT ISOLATE      (%E2%81%A7)
-      // U+2068 FIRST STRONG ISOLATE       (%E2%81%A8)
-      // U+2069 POP DIRECTIONAL ISOLATE    (%E2%81%A9)
-      //
-      // The following spoofable characters are also banned, because they could
-      // be used to imitate parts of a web browser's UI.
-      //
-      // U+1F50F LOCK WITH INK PEN         (%F0%9F%94%8F)
-      // U+1F510 CLOSED LOCK WITH KEY      (%F0%9F%94%90)
-      // U+1F512 LOCK                      (%F0%9F%94%92)
-      // U+1F513 OPEN LOCK                 (%F0%9F%94%93)
-      //
-      // However, some schemes such as data: and file: need to parse the exact
-      // binary data when loading the URL. For that reason,
-      // SPOOFING_AND_CONTROL_CHARS allows unescaping BiDi control characters.
-      // DO NOT use SPOOFING_AND_CONTROL_CHARS if the parsed URL is going to be
-      // displayed in the UI.
-      if (!(rules & UnescapeRule::SPOOFING_AND_CONTROL_CHARS)) {
-        if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) {
-          // Keep Arabic Language Mark escaped.
-          escaped_text.substr(i, 6).AppendToString(&result);
-          i += 5;
-          continue;
      }
-        if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) {
+      ++i;
-          // Keep BiDi control char escaped.
-          escaped_text.substr(i, 9).AppendToString(&result);
-          i += 8;
      continue;
    }
-        if (HasFourByteBannedCharAtIndex(escaped_text, first_byte, i)) {
-          // Keep banned char escaped.
+    DCHECK(!unescaped.empty());
-          escaped_text.substr(i, 12).AppendToString(&result);
-          i += 11;
+    if (!ShouldUnescapeCodePoint(rules, code_point)) {
+      // If it's a valid UTF-8 character, but not safe to unescape, copy all
+      // bytes directly.
+      result.append(escaped_text.begin() + i,
+                    escaped_text.begin() + i + 3 * unescaped.length());
+      i += unescaped.length() * 3;
      continue;
    }
-      }
-      if (first_byte >= 0x80 ||  // Unescape all high-bit characters.
+    // If the code point is allowed, and append the entire unescaped character.
-          // For 7-bit characters, the lookup table tells us all valid chars.
+    result.append(unescaped);
-          (kUrlUnescape[first_byte] ||
+    if (adjustments) {
-           // ...and we allow some additional unescaping when flags are set.
+      for (size_t j = 0; j < unescaped.length(); ++j) {
-           (first_byte == ' ' && (rules & UnescapeRule::SPACES)) ||
+        adjustments->push_back(
-           // Allow any of the prohibited but non-control characters when
+            base::OffsetAdjuster::Adjustment(i + j * 3, 3, 1));
-           // we're doing "special" chars.
-           ((first_byte == '/' || first_byte == '\\') &&
-            (rules & UnescapeRule::PATH_SEPARATORS)) ||
-           (first_byte > ' ' && first_byte != '/' && first_byte != '\\' &&
-            (rules & UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS)) ||
-           // Additionally allow non-display characters if requested.
-           (first_byte < ' ' &&
-            (rules & UnescapeRule::SPOOFING_AND_CONTROL_CHARS)))) {
-        // Use the unescaped version of the character.
-        if (adjustments)
-          adjustments->push_back(base::OffsetAdjuster::Adjustment(i, 3, 1));
-        result.push_back(first_byte);
-        i += 2;
-      } else {
-        // Keep escaped. Append a percent and we'll get the following two
-        // digits on the next loops through.
-        result.push_back('%');
      }
-    } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
-               escaped_text[i] == '+') {
-      result.push_back(' ');
-    } else {
-      // Normal case for unescaped characters.
-      result.push_back(escaped_text[i]);
    }
+    i += 3 * unescaped.length();
  }
  return result;

--- a/net/base/escape.h
+++ b/net/base/escape.h
@@ -81,7 +81,8 @@ class UnescapeRule {
    // Convert %20 to spaces. In some places where we're showing URLs, we may
    // want this. In places where the URL may be copied and pasted out, then
    // you wouldn't want this since it might not be interpreted in one piece
-    // by other applications.
+    // by other applications.  Other unicode spaces will not be unescaped unless
+    // SPOOFING_AND_CONTROL_CHARS is used.
    SPACES = 1 << 1,
    // Unescapes '/' and '\\'. If these characters were unescaped, the resulting
@@ -116,7 +117,8 @@ class UnescapeRule {
 // Unescapes |escaped_text| and returns the result.
 // Unescaping consists of looking for the exact pattern "%XX", where each X is
 // a hex digit, and converting to the character with the numerical value of
-// those digits. Thus "i%20=%203%3b" unescapes to "i = 3;".
+// those digits. Thus "i%20=%203%3b" unescapes to "i = 3;", if the
+// "UnescapeRule::SPACES" used.
 //
 // This method does not ensure that the output is a valid string using any
 // character encoding. However, unless SPOOFING_AND_CONTROL_CHARS is set, it

--- a/net/base/escape_unittest.cc
+++ b/net/base/escape_unittest.cc
@@ -236,6 +236,20 @@ TEST(EscapeTest, UnescapeURLComponent) {
       UnescapeRule::NORMAL | UnescapeRule::SPOOFING_AND_CONTROL_CHARS,
       "Some%20random text %25\xF0\x9F\x94\x93OK"},
+      // Two spoofing characters in a row should not be unescaped.
+      {"%D8%9C%D8%9C", UnescapeRule::NORMAL, "%D8%9C%D8%9C"},
+      // Non-spoofing characters surrounded by spoofing characters should be
+      // unescaped.
+      {"%D8%9C%C2%A1%D8%9C%C2%A1", UnescapeRule::NORMAL,
+       "%D8%9C\xC2\xA1%D8%9C\xC2\xA1"},
+      // Invalid UTF-8 characters surrounded by spoofing characters should be
+      // unescaped.
+      {"%D8%9C%85%D8%9C%85", UnescapeRule::NORMAL, "%D8%9C\x85%D8%9C\x85"},
+      // Test with enough trail bytes to overflow the CBU8_MAX_LENGTH-byte
+      // buffer. The first two bytes are a spoofing character as well.
+      {"%D8%9C%9C%9C%9C%9C%9C%9C%9C%9C%9C", UnescapeRule::NORMAL,
+       "%D8%9C\x9C\x9C\x9C\x9C\x9C\x9C\x9C\x9C\x9C"},
      {"Some%20random text %25%2dOK", UnescapeRule::SPACES,
       "Some random text %25-OK"},
      {"Some%20random text %25%2dOK", UnescapeRule::PATH_SEPARATORS,
@@ -381,6 +395,7 @@ TEST(EscapeTest, AdjustOffset) {
      {"%2dtest", 1, std::string::npos},
      {"%2dtest", 0, 0},
      {"test%2d", 2, 2},
+      {"test%2e", 2, 2},
      {"%E4%BD%A0+%E5%A5%BD", 9, 1},
      {"%E4%BD%A0+%E5%A5%BD", 6, std::string::npos},
      {"%E4%BD%A0+%E5%A5%BD", 0, 0},