Commit c3e87dbe authored by Christopher Thompson, committed by Commit Bot

Add more charsets to URL unescape banned list

Adds remaining characters from the Default Ignorable and Formatting
character sets to the URL unescape ban list.

Bug: 824715
Change-Id: If5efe179a7661667380566f89eacb282d711bddd
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1803833
Commit-Queue: Christopher Thompson <cthomp@chromium.org>
Reviewed-by: Matt Menke <mmenke@chromium.org>
Reviewed-by: Matt Giuca <mgiuca@chromium.org>
Cr-Commit-Position: refs/heads/master@{#698942}
parent 4d7a0014
......@@ -193,8 +193,6 @@ bool ShouldUnescapeCodePoint(UnescapeRule::Type rules, uint32_t code_point) {
// TODO(https://crbug.com/829873): Try to make this use icu, both to
// protect against regressions as the Unicode standard is updated and to
// reduce the number of long lists of characters.
// TODO(https://crbug.com/824715): Add default ignorable and formatting
// code points.
return !(
// Per http://tools.ietf.org/html/rfc3987#section-4.1, certain BiDi
// control characters are not allowed to appear unescaped in URLs.
......@@ -241,7 +239,62 @@ bool ShouldUnescapeCodePoint(UnescapeRule::Type rules, uint32_t code_point) {
code_point == 0x2029 || // PARAGRAPH SEPARATOR (%E2%80%A9)
code_point == 0x202F || // NARROW NO-BREAK SPACE (%E2%80%AF)
code_point == 0x205F || // MEDIUM MATHEMATICAL SPACE (%E2%81%9F)
code_point == 0x3000); // IDEOGRAPHIC SPACE (%E3%80%80)
code_point == 0x3000 || // IDEOGRAPHIC SPACE (%E3%80%80)
// Default Ignorable ([:Default_Ignorable_Code_Point=Yes:]) and Format
// characters ([:Cf:]) are also banned (see crbug.com/824715).
code_point == 0x00AD || // SOFT HYPHEN (%C2%AD)
code_point == 0x034F || // COMBINING GRAPHEME JOINER (%CD%8F)
// Arabic number formatting
(code_point >= 0x0600 && code_point <= 0x0605) ||
// U+061C is already banned as a BiDi control character.
code_point == 0x06DD || // ARABIC END OF AYAH (%DB%9D)
code_point == 0x070F || // SYRIAC ABBREVIATION MARK (%DC%8F)
code_point == 0x08E2 || // ARABIC DISPUTED END OF AYAH (%E0%A3%A2)
code_point == 0x115F || // HANGUL CHOSEONG FILLER (%E1%85%9F)
code_point == 0x1160 || // HANGUL JUNGSEONG FILLER (%E1%85%A0)
code_point == 0x17B4 || // KHMER VOWEL INHERENT AQ (%E1%9E%B4)
code_point == 0x17B5 || // KHMER VOWEL INHERENT AA (%E1%9E%B5)
code_point == 0x180B || // MONGOLIAN FREE VARIATION SELECTOR ONE
// (%E1%A0%8B)
code_point == 0x180C || // MONGOLIAN FREE VARIATION SELECTOR TWO
// (%E1%A0%8C)
code_point == 0x180D || // MONGOLIAN FREE VARIATION SELECTOR THREE
// (%E1%A0%8D)
code_point == 0x180E || // MONGOLIAN VOWEL SEPARATOR (%E1%A0%8E)
code_point == 0x200B || // ZERO WIDTH SPACE (%E2%80%8B)
code_point == 0x200C || // ZERO WIDTH NON-JOINER (%E2%80%8C)
code_point == 0x200D || // ZERO WIDTH JOINER (%E2%80%8D)
// U+200E, U+200F, U+202A--202E, and U+2066--2069 are already banned as
// BiDi control characters.
code_point == 0x2060 || // WORD JOINER (%E2%81%A0)
code_point == 0x2061 || // FUNCTION APPLICATION (%E2%81%A1)
code_point == 0x2062 || // INVISIBLE TIMES (%E2%81%A2)
code_point == 0x2063 || // INVISIBLE SEPARATOR (%E2%81%A3)
code_point == 0x2064 || // INVISIBLE PLUS (%E2%81%A4)
code_point == 0x2065 || // null (%E2%81%A5)
// 0x2066--0x2069 are already banned as BiDi control characters.
// General Punctuation - Deprecated (U+206A--206F)
(code_point >= 0x206A && code_point <= 0x206F) ||
code_point == 0x3164 || // HANGUL FILLER (%E3%85%A4)
(code_point >= 0xFFF0 && code_point <= 0xFFF8) || // null
// Variation selectors (%EF%B8%80 -- %EF%B8%8F)
(code_point >= 0xFE00 && code_point <= 0xFE0F) ||
code_point == 0xFEFF || // ZERO WIDTH NO-BREAK SPACE (%EF%BB%BF)
code_point == 0xFFA0 || // HALFWIDTH HANGUL FILLER (%EF%BE%A0)
code_point == 0xFFF9 || // INTERLINEAR ANNOTATION ANCHOR (%EF%BF%B9)
code_point == 0xFFFA || // INTERLINEAR ANNOTATION SEPARATOR (%EF%BF%BA)
code_point == 0xFFFB || // INTERLINEAR ANNOTATION TERMINATOR (%EF%BF%BB)
code_point == 0x110BD || // KAITHI NUMBER SIGN (%F0%91%82%BD)
code_point == 0x110CD || // KAITHI NUMBER SIGN ABOVE (%F0%91%83%8D)
// Egyptian hieroglyph formatting (%F0%93%90%B0 -- %F0%93%90%B8)
(code_point >= 0x13430 && code_point <= 0x13438) ||
// Shorthand format controls (%F0%9B%B2%A0 -- %F0%9B%B2%A3)
(code_point >= 0x1BCA0 && code_point <= 0x1BCA3) ||
// Beams and slurs (%F0%9D%85%B3 -- %F0%9D%85%BA)
(code_point >= 0x1D173 && code_point <= 0x1D17A) ||
// Tags, Variation Selectors, nulls
(code_point >= 0xE0000 && code_point <= 0xE0FFF));
}
// Unescapes |escaped_text| according to |rules|, returning the resulting
......
......@@ -204,6 +204,48 @@ TEST(EscapeTest, UnescapeURLComponent) {
{"(%E2%80%AF)(%E2%81%9F)(%E3%80%80)", UnescapeRule::NORMAL,
"(%E2%80%AF)(%E2%81%9F)(%E3%80%80)"},
// Default Ignorable and Formatting characters should not be unescaped.
{"(%E2%81%A5)(%EF%BF%B0)(%EF%BF%B8)", UnescapeRule::NORMAL,
"(%E2%81%A5)(%EF%BF%B0)(%EF%BF%B8)"},
{"(%F3%A0%82%80)(%F3%A0%83%BF)(%F3%A0%87%B0)", UnescapeRule::NORMAL,
"(%F3%A0%82%80)(%F3%A0%83%BF)(%F3%A0%87%B0)"},
{"(%F3%A0%BF%BF)(%C2%AD)(%CD%8F)", UnescapeRule::NORMAL,
"(%F3%A0%BF%BF)(%C2%AD)(%CD%8F)"},
{"(%D8%80%20)(%D8%85)(%DB%9D)(%DC%8F)(%E0%A3%A2)", UnescapeRule::NORMAL,
"(%D8%80%20)(%D8%85)(%DB%9D)(%DC%8F)(%E0%A3%A2)"},
{"(%E1%85%9F)(%E1%85%A0)(%E1%9E%B4)(%E1%9E%B5)", UnescapeRule::NORMAL,
"(%E1%85%9F)(%E1%85%A0)(%E1%9E%B4)(%E1%9E%B5)"},
{"(%E1%A0%8B)(%E1%A0%8C)(%E1%A0%8D)(%E1%A0%8E)", UnescapeRule::NORMAL,
"(%E1%A0%8B)(%E1%A0%8C)(%E1%A0%8D)(%E1%A0%8E)"},
{"(%E2%80%8B)(%E2%80%8C)(%E2%80%8D)(%E2%81%A0)", UnescapeRule::NORMAL,
"(%E2%80%8B)(%E2%80%8C)(%E2%80%8D)(%E2%81%A0)"},
{"(%E2%81%A1)(%E2%81%A2)(%E2%81%A3)(%E2%81%A4)", UnescapeRule::NORMAL,
"(%E2%81%A1)(%E2%81%A2)(%E2%81%A3)(%E2%81%A4)"},
{"(%E3%85%A4)(%EF%BB%BF)(%EF%BE%A0)(%EF%BF%B9)", UnescapeRule::NORMAL,
"(%E3%85%A4)(%EF%BB%BF)(%EF%BE%A0)(%EF%BF%B9)"},
{"(%EF%BF%BB)(%F0%91%82%BD)(%F0%91%83%8D)", UnescapeRule::NORMAL,
"(%EF%BF%BB)(%F0%91%82%BD)(%F0%91%83%8D)"},
{"(%F0%93%90%B0)(%F0%93%90%B8)", UnescapeRule::NORMAL,
"(%F0%93%90%B0)(%F0%93%90%B8)"},
// General Punctuation - Deprecated (U+206A--206F)
{"(%E2%81%AA)(%E2%81%AD)(%E2%81%AF)", UnescapeRule::NORMAL,
"(%E2%81%AA)(%E2%81%AD)(%E2%81%AF)"},
// Variation selectors (U+FE00--FE0F)
{"(%EF%B8%80)(%EF%B8%8C)(%EF%B8%8D)", UnescapeRule::NORMAL,
"(%EF%B8%80)(%EF%B8%8C)(%EF%B8%8D)"},
// Shorthand format controls (U+1BCA0--1BCA3)
{"(%F0%9B%B2%A0)(%F0%9B%B2%A1)(%F0%9B%B2%A3)", UnescapeRule::NORMAL,
"(%F0%9B%B2%A0)(%F0%9B%B2%A1)(%F0%9B%B2%A3)"},
// Musical symbols beams and slurs (U+1D173--1D17A)
{"(%F0%9D%85%B3)(%F0%9D%85%B9)(%F0%9D%85%BA)", UnescapeRule::NORMAL,
"(%F0%9D%85%B3)(%F0%9D%85%B9)(%F0%9D%85%BA)"},
// Tags block (U+E0000--E007F), includes unassigned points
{"(%F3%A0%80%80)(%F3%A0%80%81)(%F3%A0%81%8F)", UnescapeRule::NORMAL,
"(%F3%A0%80%80)(%F3%A0%80%81)(%F3%A0%81%8F)"},
// Ideographic-specific variation selectors (U+E0100--E01EF)
{"(%F3%A0%84%80)(%F3%A0%84%90)(%F3%A0%87%AF)", UnescapeRule::NORMAL,
"(%F3%A0%84%80)(%F3%A0%84%90)(%F3%A0%87%AF)"},
// Two spoofing characters in a row should not be unescaped.
{"%D8%9C%D8%9C", UnescapeRule::NORMAL, "%D8%9C%D8%9C"},
// Non-spoofing characters surrounded by spoofing characters should be
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment