Commit 6e8dbd1d authored by Matt Menke's avatar Matt Menke Committed by Commit Bot

Restructure UnescapeURLWithAdjustmentsImpl().

In particular, unescape entire unicode characters at once, and then
compare against unescape blacklists, rather than the other way around,
to simplify code and avoid the tree structure of the old code. This
will also allow the method to use icu's code point classification
logic, at some point in the future.

Also separate out comparing against the character blacklist and UTF-8
character decoding into separate methods, and add a few more test cases
to unittest.

The method itself should behave exactly the same as before.

Bug: 824715
Change-Id: I5311f25bfda4132b122ec4a079740adf093099a3
Reviewed-on: https://chromium-review.googlesource.com/998014
Commit-Queue: Matt Menke <mmenke@chromium.org>
Reviewed-by: default avatarMatt Giuca <mgiuca@chromium.org>
Reviewed-by: default avatarHelen Li <xunjieli@chromium.org>
Cr-Commit-Position: refs/heads/master@{#551029}
parent dd942765
...@@ -6,7 +6,9 @@ ...@@ -6,7 +6,9 @@
#include "base/logging.h" #include "base/logging.h"
#include "base/strings/string_util.h" #include "base/strings/string_util.h"
#include "base/strings/utf_string_conversion_utils.h"
#include "base/strings/utf_string_conversions.h" #include "base/strings/utf_string_conversions.h"
#include "base/third_party/icu/icu_utf.h"
namespace net { namespace net {
...@@ -101,7 +103,7 @@ const char kUrlUnescape[128] = { ...@@ -101,7 +103,7 @@ const char kUrlUnescape[128] = {
// Attempts to unescape the sequence at |index| within |escaped_text|. If // Attempts to unescape the sequence at |index| within |escaped_text|. If
// successful, sets |value| to the unescaped value. Returns whether // successful, sets |value| to the unescaped value. Returns whether
// unescaping succeeded. // unescaping succeeded.
bool UnescapeUnsignedCharAtIndex(base::StringPiece escaped_text, bool UnescapeUnsignedByteAtIndex(base::StringPiece escaped_text,
size_t index, size_t index,
unsigned char* value) { unsigned char* value) {
if ((index + 2) >= escaped_text.size()) if ((index + 2) >= escaped_text.size())
...@@ -118,71 +120,115 @@ bool UnescapeUnsignedCharAtIndex(base::StringPiece escaped_text, ...@@ -118,71 +120,115 @@ bool UnescapeUnsignedCharAtIndex(base::StringPiece escaped_text,
return false; return false;
} }
// Returns true if there is an Arabic Language Mark at |index|. |first_byte| // Attempts to unescape and decode a UTF-8-encoded percent-escaped character at
// is the byte at |index|. // the specified index. On success, returns true, sets |code_point_out| to be
bool HasArabicLanguageMarkAtIndex(base::StringPiece escaped_text, // the character's code point and |unescaped_out| to be the unescaped UTF-8
unsigned char first_byte, // string. |unescaped_out| will always be 1/3rd the length of the substring of
size_t index) { // |escaped_text| that corresponds to the unescaped character.
if (first_byte != 0xD8) bool UnescapeUTF8CharacterAtIndex(base::StringPiece escaped_text,
return false; size_t index,
unsigned char second_byte; uint32_t* code_point_out,
if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte)) std::string* unescaped_out) {
return false; DCHECK(unescaped_out->empty());
return second_byte == 0x9c;
}
// Returns true if there is a BiDi control char at |index|. |first_byte| is the unsigned char bytes[CBU8_MAX_LENGTH];
// byte at |index|. if (!UnescapeUnsignedByteAtIndex(escaped_text, index, &bytes[0]))
bool HasThreeByteBidiControlCharAtIndex(base::StringPiece escaped_text,
unsigned char first_byte,
size_t index) {
if (first_byte != 0xE2)
return false;
unsigned char second_byte;
if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))
return false;
if (second_byte != 0x80 && second_byte != 0x81)
return false;
unsigned char third_byte;
if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte))
return false; return false;
if (second_byte == 0x80) {
return third_byte == 0x8E ||
third_byte == 0x8F ||
(third_byte >= 0xAA && third_byte <= 0xAE);
}
return third_byte >= 0xA6 && third_byte <= 0xA9;
}
// Returns true if there is a four-byte banned char at |index|. |first_byte| is size_t num_bytes = 1;
// the byte at |index|.
bool HasFourByteBannedCharAtIndex(base::StringPiece escaped_text, // If this is a lead byte, need to collect trail bytes as well.
unsigned char first_byte, if (CBU8_IS_LEAD(bytes[0])) {
size_t index) { // Look for the last trail byte of the UTF-8 character. Give up once
// The following characters are blacklisted for spoofability concerns. // reach max character length number of bytes, or hit an unescaped
// U+1F50F LOCK WITH INK PEN (%F0%9F%94%8F) // character. No need to check length of escaped_text, as
// U+1F510 CLOSED LOCK WITH KEY (%F0%9F%94%90) // UnescapeUnsignedByteAtIndex checks lengths.
// U+1F512 LOCK (%F0%9F%94%92) while (num_bytes < arraysize(bytes) &&
// U+1F513 OPEN LOCK (%F0%9F%94%93) UnescapeUnsignedByteAtIndex(escaped_text, index + num_bytes * 3,
if (first_byte != 0xF0) &bytes[num_bytes]) &&
return false; CBU8_IS_TRAIL(bytes[num_bytes])) {
++num_bytes;
}
}
unsigned char second_byte; int32_t char_index = 0;
if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte) || // Check if the unicode "character" that was just unescaped is valid.
second_byte != 0x9F) { if (!base::ReadUnicodeCharacter(reinterpret_cast<char*>(bytes), num_bytes,
&char_index, code_point_out)) {
return false; return false;
} }
unsigned char third_byte; // It's possible that a prefix of |bytes| forms a valid UTF-8 character,
if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte) || // and the rest are not valid UTF-8, so need to update |num_bytes| based
third_byte != 0x94) { // on the result of ReadUnicodeCharacter().
return false; num_bytes = char_index + 1;
*unescaped_out = std::string(reinterpret_cast<char*>(bytes), num_bytes);
return true;
}
// This method takes a Unicode code point and returns true if it should be
// unescaped, based on |rules|.
bool ShouldUnescapeCodePoint(UnescapeRule::Type rules, uint32_t code_point) {
// If this is an ASCII character, use the lookup table.
if (code_point < 0x80) {
return kUrlUnescape[code_point] ||
// Allow some additional unescaping when flags are set.
(code_point == ' ' && (rules & UnescapeRule::SPACES)) ||
// Allow any of the prohibited but non-control characters when doing
// "special" chars.
((code_point == '/' || code_point == '\\') &&
(rules & UnescapeRule::PATH_SEPARATORS)) ||
(code_point > ' ' && code_point != '/' && code_point != '\\' &&
(rules & UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS)) ||
// Additionally allow non-display characters if requested.
(code_point < ' ' &&
(rules & UnescapeRule::SPOOFING_AND_CONTROL_CHARS));
} }
unsigned char fourth_byte; // Some schemes such as data: and file: need to parse the exact binary data
return UnescapeUnsignedCharAtIndex(escaped_text, index + 9, &fourth_byte) && // when loading the URL. For that reason, SPOOFING_AND_CONTROL_CHARS allows
(fourth_byte == 0x8F || fourth_byte == 0x90 || fourth_byte == 0x92 || // unescaping UTF-8 byte sequences that are not safe to display. DO NOT use
fourth_byte == 0x93); // SPOOFING_AND_CONTROL_CHARS if the parsed URL is going to be displayed in
// the UI.
if (rules & UnescapeRule::SPOOFING_AND_CONTROL_CHARS)
return true;
// Compare the codepoint against a list of characters that can be used
// to spoof other URLs.
//
// Can't use icu to make this cleaner, because Cronet cannot depend on
// icu, and currently uses this file.
// TODO(https://crbug.com/829873): Try to make this use icu, both to
// protect against regressions as the Unicode stndard is updated and to
// reduce the number of long lists of characters.
// TODO(https://crbug.com/824715): Add default ignoreable and formatting
// codepoints.
return !(
// Per http://tools.ietf.org/html/rfc3987#section-4.1, certain BiDi
// control characters are not allowed to appear unescaped in URLs.
code_point == 0x200E || // LEFT-TO-RIGHT MARK (%E2%80%8E)
code_point == 0x200F || // RIGHT-TO-LEFT MARK (%E2%80%8F)
code_point == 0x202A || // LEFT-TO-RIGHT EMBEDDING (%E2%80%AA)
code_point == 0x202B || // RIGHT-TO-LEFT EMBEDDING (%E2%80%AB)
code_point == 0x202C || // POP DIRECTIONAL FORMATTING (%E2%80%AC)
code_point == 0x202D || // LEFT-TO-RIGHT OVERRIDE (%E2%80%AD)
code_point == 0x202E || // RIGHT-TO-LEFT OVERRIDE (%E2%80%AE)
// The Unicode Technical Report (TR9) as referenced by RFC 3987 above has
// since added some new BiDi control characters that are not safe to
// unescape. http://www.unicode.org/reports/tr9
code_point == 0x061C || // ARABIC LETTER MARK (%D8%9C)
code_point == 0x2066 || // LEFT-TO-RIGHT ISOLATE (%E2%81%A6)
code_point == 0x2067 || // RIGHT-TO-LEFT ISOLATE (%E2%81%A7)
code_point == 0x2068 || // FIRST STRONG ISOLATE (%E2%81%A8)
code_point == 0x2069 || // POP DIRECTIONAL ISOLATE (%E2%81%A9)
// The following spoofable characters are also banned in unescaped URLs,
// because they could be used to imitate parts of a web browser's UI.
code_point == 0x1F50F || // LOCK WITH INK PEN (%F0%9F%94%8F)
code_point == 0x1F510 || // CLOSED LOCK WITH KEY (%F0%9F%94%90)
code_point == 0x1F512 || // LOCK (%F0%9F%94%92)
code_point == 0x1F513); // OPEN LOCK (%F0%9F%94%93)
} }
// Unescapes |escaped_text| according to |rules|, returning the resulting // Unescapes |escaped_text| according to |rules|, returning the resulting
...@@ -207,101 +253,60 @@ std::string UnescapeURLWithAdjustmentsImpl( ...@@ -207,101 +253,60 @@ std::string UnescapeURLWithAdjustmentsImpl(
result.reserve(escaped_text.length()); result.reserve(escaped_text.length());
// Locations of adjusted text. // Locations of adjusted text.
for (size_t i = 0, max = escaped_text.size(); i < max; ++i) { for (size_t i = 0, max = escaped_text.size(); i < max;) {
if (static_cast<unsigned char>(escaped_text[i]) >= 128) { // Try to unescape the character.
// Non ASCII character, append as is. uint32_t code_point;
result.push_back(escaped_text[i]); std::string unescaped;
if (!UnescapeUTF8CharacterAtIndex(escaped_text, i, &code_point,
&unescaped)) {
// Check if the next character can be unescaped, but not as a valid UTF-8
// character. In that case, just unescaped and write the non-sense
// character.
//
// TODO(https://crbug.com/829868): Do not unescape illegal UTF-8 sequences
// unless SPOOFING_AND_CONTROL_CHARS is given. Should also split that
// behaviour off into a separate function.
unsigned char non_utf8_byte;
if (UnescapeUnsignedByteAtIndex(escaped_text, i, &non_utf8_byte)) {
result.push_back(non_utf8_byte);
if (adjustments)
adjustments->push_back(base::OffsetAdjuster::Adjustment(i, 3, 1));
i += 3;
continue; continue;
} }
unsigned char first_byte; // Character is not escaped, so append as is, unless it's a '+' and
if (UnescapeUnsignedCharAtIndex(escaped_text, i, &first_byte)) { // REPLACE_PLUS_WITH_SPACE is being applied.
// Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi if (escaped_text[i] == '+' &&
// control characters are not allowed to appear unescaped in URLs: (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)) {
// result.push_back(' ');
// U+200E LEFT-TO-RIGHT MARK (%E2%80%8E) } else {
// U+200F RIGHT-TO-LEFT MARK (%E2%80%8F) result.push_back(escaped_text[i]);
// U+202A LEFT-TO-RIGHT EMBEDDING (%E2%80%AA)
// U+202B RIGHT-TO-LEFT EMBEDDING (%E2%80%AB)
// U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC)
// U+202D LEFT-TO-RIGHT OVERRIDE (%E2%80%AD)
// U+202E RIGHT-TO-LEFT OVERRIDE (%E2%80%AE)
//
// Additionally, the Unicode Technical Report (TR9) as referenced by RFC
// 3987 above has since added some new BiDi control characters.
// http://www.unicode.org/reports/tr9
//
// U+061C ARABIC LETTER MARK (%D8%9C)
// U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6)
// U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7)
// U+2068 FIRST STRONG ISOLATE (%E2%81%A8)
// U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9)
//
// The following spoofable characters are also banned, because they could
// be used to imitate parts of a web browser's UI.
//
// U+1F50F LOCK WITH INK PEN (%F0%9F%94%8F)
// U+1F510 CLOSED LOCK WITH KEY (%F0%9F%94%90)
// U+1F512 LOCK (%F0%9F%94%92)
// U+1F513 OPEN LOCK (%F0%9F%94%93)
//
// However, some schemes such as data: and file: need to parse the exact
// binary data when loading the URL. For that reason,
// SPOOFING_AND_CONTROL_CHARS allows unescaping BiDi control characters.
// DO NOT use SPOOFING_AND_CONTROL_CHARS if the parsed URL is going to be
// displayed in the UI.
if (!(rules & UnescapeRule::SPOOFING_AND_CONTROL_CHARS)) {
if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) {
// Keep Arabic Language Mark escaped.
escaped_text.substr(i, 6).AppendToString(&result);
i += 5;
continue;
} }
if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) { ++i;
// Keep BiDi control char escaped.
escaped_text.substr(i, 9).AppendToString(&result);
i += 8;
continue; continue;
} }
if (HasFourByteBannedCharAtIndex(escaped_text, first_byte, i)) {
// Keep banned char escaped. DCHECK(!unescaped.empty());
escaped_text.substr(i, 12).AppendToString(&result);
i += 11; if (!ShouldUnescapeCodePoint(rules, code_point)) {
// If it's a valid UTF-8 character, but not safe to unescape, copy all
// bytes directly.
result.append(escaped_text.begin() + i,
escaped_text.begin() + i + 3 * unescaped.length());
i += unescaped.length() * 3;
continue; continue;
} }
}
if (first_byte >= 0x80 || // Unescape all high-bit characters. // If the code point is allowed, and append the entire unescaped character.
// For 7-bit characters, the lookup table tells us all valid chars. result.append(unescaped);
(kUrlUnescape[first_byte] || if (adjustments) {
// ...and we allow some additional unescaping when flags are set. for (size_t j = 0; j < unescaped.length(); ++j) {
(first_byte == ' ' && (rules & UnescapeRule::SPACES)) || adjustments->push_back(
// Allow any of the prohibited but non-control characters when base::OffsetAdjuster::Adjustment(i + j * 3, 3, 1));
// we're doing "special" chars.
((first_byte == '/' || first_byte == '\\') &&
(rules & UnescapeRule::PATH_SEPARATORS)) ||
(first_byte > ' ' && first_byte != '/' && first_byte != '\\' &&
(rules & UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS)) ||
// Additionally allow non-display characters if requested.
(first_byte < ' ' &&
(rules & UnescapeRule::SPOOFING_AND_CONTROL_CHARS)))) {
// Use the unescaped version of the character.
if (adjustments)
adjustments->push_back(base::OffsetAdjuster::Adjustment(i, 3, 1));
result.push_back(first_byte);
i += 2;
} else {
// Keep escaped. Append a percent and we'll get the following two
// digits on the next loops through.
result.push_back('%');
} }
} else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
escaped_text[i] == '+') {
result.push_back(' ');
} else {
// Normal case for unescaped characters.
result.push_back(escaped_text[i]);
} }
i += 3 * unescaped.length();
} }
return result; return result;
......
...@@ -81,7 +81,8 @@ class UnescapeRule { ...@@ -81,7 +81,8 @@ class UnescapeRule {
// Convert %20 to spaces. In some places where we're showing URLs, we may // Convert %20 to spaces. In some places where we're showing URLs, we may
// want this. In places where the URL may be copied and pasted out, then // want this. In places where the URL may be copied and pasted out, then
// you wouldn't want this since it might not be interpreted in one piece // you wouldn't want this since it might not be interpreted in one piece
// by other applications. // by other applications. Other unicode spaces will not be unescaped unless
// SPOOFING_AND_CONTROL_CHARS is used.
SPACES = 1 << 1, SPACES = 1 << 1,
// Unescapes '/' and '\\'. If these characters were unescaped, the resulting // Unescapes '/' and '\\'. If these characters were unescaped, the resulting
...@@ -116,7 +117,8 @@ class UnescapeRule { ...@@ -116,7 +117,8 @@ class UnescapeRule {
// Unescapes |escaped_text| and returns the result. // Unescapes |escaped_text| and returns the result.
// Unescaping consists of looking for the exact pattern "%XX", where each X is // Unescaping consists of looking for the exact pattern "%XX", where each X is
// a hex digit, and converting to the character with the numerical value of // a hex digit, and converting to the character with the numerical value of
// those digits. Thus "i%20=%203%3b" unescapes to "i = 3;". // those digits. Thus "i%20=%203%3b" unescapes to "i = 3;", if the
// "UnescapeRule::SPACES" used.
// //
// This method does not ensure that the output is a valid string using any // This method does not ensure that the output is a valid string using any
// character encoding. However, unless SPOOFING_AND_CONTROL_CHARS is set, it // character encoding. However, unless SPOOFING_AND_CONTROL_CHARS is set, it
......
...@@ -236,6 +236,20 @@ TEST(EscapeTest, UnescapeURLComponent) { ...@@ -236,6 +236,20 @@ TEST(EscapeTest, UnescapeURLComponent) {
UnescapeRule::NORMAL | UnescapeRule::SPOOFING_AND_CONTROL_CHARS, UnescapeRule::NORMAL | UnescapeRule::SPOOFING_AND_CONTROL_CHARS,
"Some%20random text %25\xF0\x9F\x94\x93OK"}, "Some%20random text %25\xF0\x9F\x94\x93OK"},
// Two spoofing characters in a row should not be unescaped.
{"%D8%9C%D8%9C", UnescapeRule::NORMAL, "%D8%9C%D8%9C"},
// Non-spoofing characters surrounded by spoofing characters should be
// unescaped.
{"%D8%9C%C2%A1%D8%9C%C2%A1", UnescapeRule::NORMAL,
"%D8%9C\xC2\xA1%D8%9C\xC2\xA1"},
// Invalid UTF-8 characters surrounded by spoofing characters should be
// unescaped.
{"%D8%9C%85%D8%9C%85", UnescapeRule::NORMAL, "%D8%9C\x85%D8%9C\x85"},
// Test with enough trail bytes to overflow the CBU8_MAX_LENGTH-byte
// buffer. The first two bytes are a spoofing character as well.
{"%D8%9C%9C%9C%9C%9C%9C%9C%9C%9C%9C", UnescapeRule::NORMAL,
"%D8%9C\x9C\x9C\x9C\x9C\x9C\x9C\x9C\x9C\x9C"},
{"Some%20random text %25%2dOK", UnescapeRule::SPACES, {"Some%20random text %25%2dOK", UnescapeRule::SPACES,
"Some random text %25-OK"}, "Some random text %25-OK"},
{"Some%20random text %25%2dOK", UnescapeRule::PATH_SEPARATORS, {"Some%20random text %25%2dOK", UnescapeRule::PATH_SEPARATORS,
...@@ -381,6 +395,7 @@ TEST(EscapeTest, AdjustOffset) { ...@@ -381,6 +395,7 @@ TEST(EscapeTest, AdjustOffset) {
{"%2dtest", 1, std::string::npos}, {"%2dtest", 1, std::string::npos},
{"%2dtest", 0, 0}, {"%2dtest", 0, 0},
{"test%2d", 2, 2}, {"test%2d", 2, 2},
{"test%2e", 2, 2},
{"%E4%BD%A0+%E5%A5%BD", 9, 1}, {"%E4%BD%A0+%E5%A5%BD", 9, 1},
{"%E4%BD%A0+%E5%A5%BD", 6, std::string::npos}, {"%E4%BD%A0+%E5%A5%BD", 6, std::string::npos},
{"%E4%BD%A0+%E5%A5%BD", 0, 0}, {"%E4%BD%A0+%E5%A5%BD", 0, 0},
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment