Restrict Latin Small Letter Thorn (U+00FE) to Icelandic domains

This character (þ) can be confused with both b and p when used in a domain name. The IDN spoof checker doesn't have a good way of flagging a character as confusable with multiple characters, so it can't catch spoofs containing this character. As a practical fix, this CL restricts this character to domains under Iceland's ccTLD (.is). With this change, a domain name containing "þ" with a non-.is TLD will be displayed in punycode in the UI. This change affects fewer than 10 real-world domains with limited popularity. Bug: 798892, 843352, 904327, 1017707 Change-Id: Ib07190dcde406bf62ce4413688a4fb4859a51030 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1879992 Commit-Queue: Mustafa Emre Acer <meacer@chromium.org> Reviewed-by: Christopher Thompson <cthomp@chromium.org> Cr-Commit-Position: refs/heads/master@{#709309}

Restrict Latin Small Letter Thorn (U+00FE) to Icelandic domains
This character (þ) can be confused with both b and p when used in a domain name. The IDN spoof checker doesn't have a good way of flagging a character as confusable with multiple characters, so it can't catch spoofs containing this character. As a practical fix, this CL restricts this character to domains under Iceland's ccTLD (.is). With this change, a domain name containing "þ" with a non-.is TLD will be displayed in punycode in the UI. This change affects fewer than 10 real-world domains with limited popularity. Bug: 798892, 843352, 904327, 1017707 Change-Id: Ib07190dcde406bf62ce4413688a4fb4859a51030 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1879992 Commit-Queue: Mustafa Emre Acer <meacer@chromium.org> Reviewed-by: Christopher Thompson <cthomp@chromium.org> Cr-Commit-Position: refs/heads/master@{#709309}
cc0bbcbe · meacer · Commit Bot · b608449a · cc0bbcbe · cc0bbcbe
Commit cc0bbcbe authored Oct 25, 2019 by meacer Committed by Commit Bot Oct 25, 2019
3 changed files
--- a/components/url_formatter/spoof_checks/idn_spoof_checker.cc
+++ b/components/url_formatter/spoof_checks/idn_spoof_checker.cc
@@ -183,7 +183,7 @@ IDNSpoofChecker::IDNSpoofChecker() {
  // Supplement the Unicode confusable list by the following mapping.
  //   - {U+00E6 (æ), U+04D5 (ӕ)}  => "ae"
-  //   - {U+00FE (þ), U+03FC (ϼ), U+048F (ҏ)} => p
+  //   - {U+03FC (ϼ), U+048F (ҏ)} => p
  //   - {U+0127 (ħ), U+043D (н), U+045B (ћ), U+04A3 (ң), U+04A5 (ҥ),
  //      U+04C8 (ӈ), U+04CA (ӊ), U+050B (ԋ), U+0527 (ԧ), U+0529 (ԩ)} => h
  //   - {U+0138 (ĸ), U+03BA (κ), U+043A (к), U+049B (қ), U+049D (ҝ),
@@ -228,7 +228,7 @@ IDNSpoofChecker::IDNSpoofChecker() {
  extra_confusable_mapper_.reset(icu::Transliterator::createFromRules(
      UNICODE_STRING_SIMPLE("ExtraConf"),
      icu::UnicodeString::fromUTF8(
-          "[æӕ] > ae; [þϼҏ] > p; [ħнћңҥӈӊԋԧԩ] > h;"
+          "[æӕ] > ae; [ϼҏ] > p; [ħнћңҥӈӊԋԧԩ] > h;"
          "[ĸκкқҝҟҡӄԟ] > k; [ŋпԥกח] > n; œ > ce;"
          "[ŧтҭԏ七丅丆丁] > t; [ƅьҍв] > b;  [ωшщพฟພຟ] > w;"
          "[мӎ] > m; [єҽҿၔ] > e; ґ > r; [ғӻ] > f;"
@@ -255,8 +255,9 @@ IDNSpoofChecker::~IDNSpoofChecker() {
  uspoof_close(checker_);
 }
-bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,
+bool IDNSpoofChecker::SafeToDisplayAsUnicode(
-                                             bool is_tld_ascii) {
+    base::StringPiece16 label,
+    base::StringPiece top_level_domain) {
  UErrorCode status = U_ZERO_ERROR;
  int32_t result =
      uspoof_check(checker_, label.data(),
@@ -283,6 +284,14 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,
  if (deviation_characters_.containsSome(label_string))
    return false;
+  // Latin small letter thorn (U+00FE) can be used to spoof both b and p. It's
+  // used in modern Icelandic orthography, so allow it for the Icelandic ccTLD
+  // (.is) but block in any other TLD.
+  if (label_string.length() > 1 && label_string.indexOf("þ") != -1 &&
+      top_level_domain != ".is") {
+    return false;
+  }
  // If there's no script mixing, the input is regarded as safe without any
  // extra check unless it falls into one of three categories:
  //   - contains Kana letter exceptions
@@ -300,6 +309,7 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,
  if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&
      kana_letters_exceptions_.containsNone(label_string) &&
      combining_diacritics_exceptions_.containsNone(label_string)) {
+    bool is_tld_ascii = !top_level_domain.starts_with(".xn--");
    // Check Cyrillic confusable only for ASCII TLDs.
    return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string);
  }

--- a/components/url_formatter/spoof_checks/idn_spoof_checker.h
+++ b/components/url_formatter/spoof_checks/idn_spoof_checker.h
@@ -61,7 +61,8 @@ class IDNSpoofChecker {
  // Returns true if |label| is safe to display as Unicode. In the event of
  // library failure, all IDN inputs will be treated as unsafe.
  // See the function body for details on the specific safety checks performed.
-  bool SafeToDisplayAsUnicode(base::StringPiece16 label, bool is_tld_ascii);
+  bool SafeToDisplayAsUnicode(base::StringPiece16 label,
+                              base::StringPiece top_level_domain);
  // Returns the matching top domain if |hostname| or the last few components of
  // |hostname| looks similar to one of top domains listed i

--- a/components/url_formatter/url_formatter.cc
+++ b/components/url_formatter/url_formatter.cc
@@ -33,7 +33,7 @@ IDNConversionResult IDNToUnicodeWithAdjustments(
 bool IDNToUnicodeOneComponent(const base::char16* comp,
                              size_t comp_len,
-                              bool is_tld_ascii,
+                              base::StringPiece top_level_domain,
                              bool enable_spoof_checks,
                              base::string16* out,
                              bool* has_idn_component);
@@ -244,11 +244,10 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
  input16.reserve(host.length());
  input16.insert(input16.end(), host.begin(), host.end());
-  bool is_tld_ascii = true;
+  base::StringPiece top_level_domain;
  size_t last_dot = host.rfind('.');
-  if (last_dot != base::StringPiece::npos &&
+  if (last_dot != base::StringPiece::npos) {
-      host.substr(last_dot).starts_with(".xn--")) {
+    top_level_domain = host.substr(last_dot);
-    is_tld_ascii = false;
  }
  IDNConversionResult result;
@@ -269,7 +268,7 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
      // Add the substring that we just found.
      bool has_idn_component = false;
      converted_idn = IDNToUnicodeOneComponent(
-          input16.data() + component_start, component_length, is_tld_ascii,
+          input16.data() + component_start, component_length, top_level_domain,
          enable_spoof_checks, &out16, &has_idn_component);
      result.has_idn_component |= has_idn_component;
    }
@@ -319,8 +318,10 @@ IDNConversionResult UnsafeIDNToUnicodeWithAdjustments(
 // user. Note that this function does not deal with pure ASCII domain labels at
 // all even though it's possible to make up look-alike labels with ASCII
 // characters alone.
-bool IsIDNComponentSafe(base::StringPiece16 label, bool is_tld_ascii) {
+bool IsIDNComponentSafe(base::StringPiece16 label,
-  return g_idn_spoof_checker.Get().SafeToDisplayAsUnicode(label, is_tld_ascii);
+                        base::StringPiece top_level_domain) {
+  return g_idn_spoof_checker.Get().SafeToDisplayAsUnicode(label,
+                                                          top_level_domain);
 }
 // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to
@@ -372,7 +373,7 @@ base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER;
 // input has IDN, regardless of whether it was converted to unicode or not.
 bool IDNToUnicodeOneComponent(const base::char16* comp,
                              size_t comp_len,
-                              bool is_tld_ascii,
+                              base::StringPiece top_level_domain,
                              bool enable_spoof_checks,
                              base::string16* out,
                              bool* has_idn_component) {
@@ -418,7 +419,7 @@ bool IDNToUnicodeOneComponent(const base::char16* comp,
    if (IsIDNComponentSafe(
            base::StringPiece16(out->data() + original_length,
                                base::checked_cast<size_t>(output_length)),
-            is_tld_ascii)) {
+            top_level_domain)) {
      return true;
    }
  }