Commit cc0bbcbe, authored by meacer, committed by Commit Bot

Restrict Latin Small Letter Thorn (U+00FE) to Icelandic domains

This character (þ) can be confused with both b and p when used in a domain
name. The IDN spoof checker doesn't have a good way of flagging a character as
confusable with multiple characters, so it can't catch spoofs containing
this character. As a practical fix, this CL restricts this character to
domains under Iceland's ccTLD (.is). With this change, a domain name containing
"þ" with a non-.is TLD will be displayed in Punycode in the UI.

This change affects fewer than 10 real-world domains, all with limited popularity.

Bug: 798892, 843352, 904327, 1017707
Change-Id: Ib07190dcde406bf62ce4413688a4fb4859a51030
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1879992
Commit-Queue: Mustafa Emre Acer <meacer@chromium.org>
Reviewed-by: Christopher Thompson <cthomp@chromium.org>
Cr-Commit-Position: refs/heads/master@{#709309}
parent b608449a
...@@ -183,7 +183,7 @@ IDNSpoofChecker::IDNSpoofChecker() { ...@@ -183,7 +183,7 @@ IDNSpoofChecker::IDNSpoofChecker() {
// Supplement the Unicode confusable list by the following mapping. // Supplement the Unicode confusable list by the following mapping.
// - {U+00E6 (æ), U+04D5 (ӕ)} => "ae" // - {U+00E6 (æ), U+04D5 (ӕ)} => "ae"
// - {U+00FE (þ), U+03FC (ϼ), U+048F (ҏ)} => p // - {U+03FC (ϼ), U+048F (ҏ)} => p
// - {U+0127 (ħ), U+043D (н), U+045B (ћ), U+04A3 (ң), U+04A5 (ҥ), // - {U+0127 (ħ), U+043D (н), U+045B (ћ), U+04A3 (ң), U+04A5 (ҥ),
// U+04C8 (ӈ), U+04CA (ӊ), U+050B (ԋ), U+0527 (ԧ), U+0529 (ԩ)} => h // U+04C8 (ӈ), U+04CA (ӊ), U+050B (ԋ), U+0527 (ԧ), U+0529 (ԩ)} => h
// - {U+0138 (ĸ), U+03BA (κ), U+043A (к), U+049B (қ), U+049D (ҝ), // - {U+0138 (ĸ), U+03BA (κ), U+043A (к), U+049B (қ), U+049D (ҝ),
...@@ -228,7 +228,7 @@ IDNSpoofChecker::IDNSpoofChecker() { ...@@ -228,7 +228,7 @@ IDNSpoofChecker::IDNSpoofChecker() {
extra_confusable_mapper_.reset(icu::Transliterator::createFromRules( extra_confusable_mapper_.reset(icu::Transliterator::createFromRules(
UNICODE_STRING_SIMPLE("ExtraConf"), UNICODE_STRING_SIMPLE("ExtraConf"),
icu::UnicodeString::fromUTF8( icu::UnicodeString::fromUTF8(
"[æӕ] > ae; [þϼҏ] > p; [ħнћңҥӈӊԋԧԩ] > h;" "[æӕ] > ae; [ϼҏ] > p; [ħнћңҥӈӊԋԧԩ] > h;"
"[ĸκкқҝҟҡӄԟ] > k; [ŋпԥกח] > n; œ > ce;" "[ĸκкқҝҟҡӄԟ] > k; [ŋпԥกח] > n; œ > ce;"
"[ŧтҭԏ七丅丆丁] > t; [ƅьҍв] > b; [ωшщพฟພຟ] > w;" "[ŧтҭԏ七丅丆丁] > t; [ƅьҍв] > b; [ωшщพฟພຟ] > w;"
"[мӎ] > m; [єҽҿၔ] > e; ґ > r; [ғӻ] > f;" "[мӎ] > m; [єҽҿၔ] > e; ґ > r; [ғӻ] > f;"
...@@ -255,8 +255,9 @@ IDNSpoofChecker::~IDNSpoofChecker() { ...@@ -255,8 +255,9 @@ IDNSpoofChecker::~IDNSpoofChecker() {
uspoof_close(checker_); uspoof_close(checker_);
} }
bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label, bool IDNSpoofChecker::SafeToDisplayAsUnicode(
bool is_tld_ascii) { base::StringPiece16 label,
base::StringPiece top_level_domain) {
UErrorCode status = U_ZERO_ERROR; UErrorCode status = U_ZERO_ERROR;
int32_t result = int32_t result =
uspoof_check(checker_, label.data(), uspoof_check(checker_, label.data(),
...@@ -283,6 +284,14 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label, ...@@ -283,6 +284,14 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,
if (deviation_characters_.containsSome(label_string)) if (deviation_characters_.containsSome(label_string))
return false; return false;
// Latin small letter thorn (U+00FE) can be used to spoof both b and p. It's
// used in modern Icelandic orthography, so allow it for the Icelandic ccTLD
// (.is) but block in any other TLD.
if (label_string.length() > 1 && label_string.indexOf("þ") != -1 &&
top_level_domain != ".is") {
return false;
}
// If there's no script mixing, the input is regarded as safe without any // If there's no script mixing, the input is regarded as safe without any
// extra check unless it falls into one of three categories: // extra check unless it falls into one of three categories:
// - contains Kana letter exceptions // - contains Kana letter exceptions
...@@ -300,6 +309,7 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label, ...@@ -300,6 +309,7 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,
if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE && if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&
kana_letters_exceptions_.containsNone(label_string) && kana_letters_exceptions_.containsNone(label_string) &&
combining_diacritics_exceptions_.containsNone(label_string)) { combining_diacritics_exceptions_.containsNone(label_string)) {
bool is_tld_ascii = !top_level_domain.starts_with(".xn--");
// Check Cyrillic confusable only for ASCII TLDs. // Check Cyrillic confusable only for ASCII TLDs.
return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string); return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string);
} }
......
...@@ -61,7 +61,8 @@ class IDNSpoofChecker { ...@@ -61,7 +61,8 @@ class IDNSpoofChecker {
// Returns true if |label| is safe to display as Unicode. In the event of // Returns true if |label| is safe to display as Unicode. In the event of
// library failure, all IDN inputs will be treated as unsafe. // library failure, all IDN inputs will be treated as unsafe.
// See the function body for details on the specific safety checks performed. // See the function body for details on the specific safety checks performed.
bool SafeToDisplayAsUnicode(base::StringPiece16 label, bool is_tld_ascii); bool SafeToDisplayAsUnicode(base::StringPiece16 label,
base::StringPiece top_level_domain);
// Returns the matching top domain if |hostname| or the last few components of // Returns the matching top domain if |hostname| or the last few components of
// |hostname| looks similar to one of top domains listed i // |hostname| looks similar to one of top domains listed i
......
...@@ -33,7 +33,7 @@ IDNConversionResult IDNToUnicodeWithAdjustments( ...@@ -33,7 +33,7 @@ IDNConversionResult IDNToUnicodeWithAdjustments(
bool IDNToUnicodeOneComponent(const base::char16* comp, bool IDNToUnicodeOneComponent(const base::char16* comp,
size_t comp_len, size_t comp_len,
bool is_tld_ascii, base::StringPiece top_level_domain,
bool enable_spoof_checks, bool enable_spoof_checks,
base::string16* out, base::string16* out,
bool* has_idn_component); bool* has_idn_component);
...@@ -244,11 +244,10 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl( ...@@ -244,11 +244,10 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
input16.reserve(host.length()); input16.reserve(host.length());
input16.insert(input16.end(), host.begin(), host.end()); input16.insert(input16.end(), host.begin(), host.end());
bool is_tld_ascii = true; base::StringPiece top_level_domain;
size_t last_dot = host.rfind('.'); size_t last_dot = host.rfind('.');
if (last_dot != base::StringPiece::npos && if (last_dot != base::StringPiece::npos) {
host.substr(last_dot).starts_with(".xn--")) { top_level_domain = host.substr(last_dot);
is_tld_ascii = false;
} }
IDNConversionResult result; IDNConversionResult result;
...@@ -269,7 +268,7 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl( ...@@ -269,7 +268,7 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
// Add the substring that we just found. // Add the substring that we just found.
bool has_idn_component = false; bool has_idn_component = false;
converted_idn = IDNToUnicodeOneComponent( converted_idn = IDNToUnicodeOneComponent(
input16.data() + component_start, component_length, is_tld_ascii, input16.data() + component_start, component_length, top_level_domain,
enable_spoof_checks, &out16, &has_idn_component); enable_spoof_checks, &out16, &has_idn_component);
result.has_idn_component |= has_idn_component; result.has_idn_component |= has_idn_component;
} }
...@@ -319,8 +318,10 @@ IDNConversionResult UnsafeIDNToUnicodeWithAdjustments( ...@@ -319,8 +318,10 @@ IDNConversionResult UnsafeIDNToUnicodeWithAdjustments(
// user. Note that this function does not deal with pure ASCII domain labels at // user. Note that this function does not deal with pure ASCII domain labels at
// all even though it's possible to make up look-alike labels with ASCII // all even though it's possible to make up look-alike labels with ASCII
// characters alone. // characters alone.
bool IsIDNComponentSafe(base::StringPiece16 label, bool is_tld_ascii) { bool IsIDNComponentSafe(base::StringPiece16 label,
return g_idn_spoof_checker.Get().SafeToDisplayAsUnicode(label, is_tld_ascii); base::StringPiece top_level_domain) {
return g_idn_spoof_checker.Get().SafeToDisplayAsUnicode(label,
top_level_domain);
} }
// A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to
...@@ -372,7 +373,7 @@ base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER; ...@@ -372,7 +373,7 @@ base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER;
// input has IDN, regardless of whether it was converted to unicode or not. // input has IDN, regardless of whether it was converted to unicode or not.
bool IDNToUnicodeOneComponent(const base::char16* comp, bool IDNToUnicodeOneComponent(const base::char16* comp,
size_t comp_len, size_t comp_len,
bool is_tld_ascii, base::StringPiece top_level_domain,
bool enable_spoof_checks, bool enable_spoof_checks,
base::string16* out, base::string16* out,
bool* has_idn_component) { bool* has_idn_component) {
...@@ -418,7 +419,7 @@ bool IDNToUnicodeOneComponent(const base::char16* comp, ...@@ -418,7 +419,7 @@ bool IDNToUnicodeOneComponent(const base::char16* comp,
if (IsIDNComponentSafe( if (IsIDNComponentSafe(
base::StringPiece16(out->data() + original_length, base::StringPiece16(out->data() + original_length,
base::checked_cast<size_t>(output_length)), base::checked_cast<size_t>(output_length)),
is_tld_ascii)) { top_level_domain)) {
return true; return true;
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment