Allow whole-script confusable Cyrillic domains only on Cyrillic TLDs

A whole-script confusable Cyrillic domain consists of entirely Cyrillic characters that look identical to Latin characters (e.g. xn--80ak6aa92e[.]com decodes to аррӏе[.]com where аррӏе is in fact '\x0430\x0440\x0440\x04cf\x0435'). A previous change allowed whole-script confusable Cyrillic characters on non-ASCII top level domains only. This means that xn--80ak6aa92e[.]com remains punycode (TLD is .com) but xn--80ak6aa92e[.]xn--p1ai is decoded as аррӏе[.]рф (TLD is Cyrillic). However, this also allows spoofs in other non-ASCII TLDs such as аррӏе[.]中国 so it's not a sufficient measure. This change further limits allowable whole-script confusable Cyrillic domains to Cyrillic TLDs (instead of non-ASCII) and a small list of additional TLDs containing a large number of Cyrillic domains (bg, by, kz, pyc, ru, su, ua, uz). The idea is that users familiar with Cyrillic are more likely to encounter these TLDs and notice any discrepancies in the displayed domain name. Bug: 968505 Change-Id: Ib7462c9776f3640a5f60e5c79ac1a0c5d7b2028c Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1881887 Commit-Queue: Mustafa Emre Acer <meacer@chromium.org> Reviewed-by: Christopher Thompson <cthomp@chromium.org> Reviewed-by: Peter Kasting <pkasting@chromium.org> Cr-Commit-Position: refs/heads/master@{#712764}

Allow whole-script confusable Cyrillic domains only on Cyrillic TLDs
A whole-script confusable Cyrillic domain consists of entirely Cyrillic characters that look identical to Latin characters (e.g. xn--80ak6aa92e[.]com decodes to аррӏе[.]com where аррӏе is in fact '\x0430\x0440\x0440\x04cf\x0435'). A previous change allowed whole-script confusable Cyrillic characters on non-ASCII top level domains only. This means that xn--80ak6aa92e[.]com remains punycode (TLD is .com) but xn--80ak6aa92e[.]xn--p1ai is decoded as аррӏе[.]рф (TLD is Cyrillic). However, this also allows spoofs in other non-ASCII TLDs such as аррӏе[.]中国 so it's not a sufficient measure. This change further limits allowable whole-script confusable Cyrillic domains to Cyrillic TLDs (instead of non-ASCII) and a small list of additional TLDs containing a large number of Cyrillic domains (bg, by, kz, pyc, ru, su, ua, uz). The idea is that users familiar with Cyrillic are more likely to encounter these TLDs and notice any discrepancies in the displayed domain name. Bug: 968505 Change-Id: Ib7462c9776f3640a5f60e5c79ac1a0c5d7b2028c Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1881887 Commit-Queue: Mustafa Emre Acer <meacer@chromium.org> Reviewed-by: Christopher Thompson <cthomp@chromium.org> Reviewed-by: Peter Kasting <pkasting@chromium.org> Cr-Commit-Position: refs/heads/master@{#712764}
b52ac904 · meacer · Commit Bot · 2b199870 · b52ac904 · b52ac904
Commit b52ac904 authored Nov 05, 2019 by meacer Committed by Commit Bot Nov 05, 2019
4 changed files
--- a/components/url_formatter/spoof_checks/idn_spoof_checker.cc
+++ b/components/url_formatter/spoof_checks/idn_spoof_checker.cc
@@ -265,7 +265,8 @@ IDNSpoofChecker::~IDNSpoofChecker() {

 bool IDNSpoofChecker::SafeToDisplayAsUnicode(
    base::StringPiece16 label,
-    base::StringPiece top_level_domain) {
+    base::StringPiece top_level_domain,
+    base::StringPiece16 top_level_domain_unicode) {
  UErrorCode status = U_ZERO_ERROR;
  int32_t result =
      uspoof_check(checker_, label.data(),
@@ -275,7 +276,7 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(
  if (U_FAILURE(status) || (result & USPOOF_ALL_CHECKS))
    return false;

-  icu::UnicodeString label_string(FALSE, label.data(),
+  icu::UnicodeString label_string(FALSE /* isTerminated */, label.data(),
                                  base::checked_cast<int32_t>(label.size()));

  // A punycode label with 'xn--' prefix is not subject to the URL
@@ -293,7 +294,7 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(
    return false;

  // Disallow Icelandic confusables for domains outside Iceland's ccTLD (.is).
-  if (label_string.length() > 1 && top_level_domain != ".is" &&
+  if (label_string.length() > 1 && top_level_domain != "is" &&
      icelandic_characters_.containsSome(label_string))
    return false;

@@ -314,9 +315,11 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(
  if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&
      kana_letters_exceptions_.containsNone(label_string) &&
      combining_diacritics_exceptions_.containsNone(label_string)) {
-    bool is_tld_ascii = !top_level_domain.starts_with(".xn--");
-    // Check Cyrillic confusable only for ASCII TLDs.
-    return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string);
+    // Check Cyrillic confusable only for TLDs where Cyrillic characters are
+    // uncommon.
+    return IsCyrillicTopLevelDomain(top_level_domain,
+                                    top_level_domain_unicode) ||
+           !IsMadeOfLatinAlikeCyrillic(label_string);
  }

  // Additional checks for |label| with multiple scripts, one of which is Latin.
@@ -598,6 +601,21 @@ bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(
         cyrillic_letters_latin_alike_.containsAll(cyrillic_in_label);
 }

+bool IDNSpoofChecker::IsCyrillicTopLevelDomain(
+    base::StringPiece tld,
+    base::StringPiece16 tld_unicode) const {
+  icu::UnicodeString tld_string(
+      FALSE /* isTerminated */, tld_unicode.data(),
+      base::checked_cast<int32_t>(tld_unicode.size()));
+  if (cyrillic_letters_.containsSome(tld_string)) {
+    return true;
+  }
+  // These ASCII TLDs contain a large number of domains with Cyrillic
+  // characters.
+  return tld == "bg" || tld == "by" || tld == "kz" || tld == "pyc" ||
+         tld == "ru" || tld == "su" || tld == "ua" || tld == "uz";
+}
+
 // static
 void IDNSpoofChecker::SetTrieParamsForTesting(
    const HuffmanTrieParams& trie_params) {

--- a/components/url_formatter/spoof_checks/idn_spoof_checker.h
+++ b/components/url_formatter/spoof_checks/idn_spoof_checker.h
@@ -61,8 +61,11 @@ class IDNSpoofChecker {
  // Returns true if |label| is safe to display as Unicode. In the event of
  // library failure, all IDN inputs will be treated as unsafe.
  // See the function body for details on the specific safety checks performed.
+  // top_level_domain_unicode can be empty if top_level_domain is not well
+  // formed punycode.
  bool SafeToDisplayAsUnicode(base::StringPiece16 label,
-                              base::StringPiece top_level_domain);
+                              base::StringPiece top_level_domain,
+                              base::StringPiece16 top_level_domain_unicode);

  // Returns the matching top domain if |hostname| or the last few components of
  // |hostname| looks similar to one of top domains listed i
@@ -93,6 +96,11 @@ class IDNSpoofChecker {
  // Returns true if all the Cyrillic letters in |label| belong to a set of
  // Cyrillic letters that look like ASCII Latin letters.
  bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label);
+  // Returns true if |tld| is a top level domain most likely to contain a large
+  // number of Cyrillic domains. |tld_unicode| can be empty if |tld| is not well
+  // formed punycode.
+  bool IsCyrillicTopLevelDomain(base::StringPiece tld,
+                                base::StringPiece16 tld_unicode) const;

  USpoofChecker* checker_;
  icu::UnicodeSet deviation_characters_;

--- a/components/url_formatter/spoof_checks/idn_spoof_checker_unittest.cc
+++ b/components/url_formatter/spoof_checks/idn_spoof_checker_unittest.cc
@@ -356,34 +356,56 @@ const IDNTestCase kIdnCases[] = {
    {"xn--a1-eo4a.jp", L"a1\x30fe.jp", false},

    // Cyrillic labels made of Latin-look-alike Cyrillic letters.
-    // ѕсоре.com with ѕсоре in Cyrillic
+    // 1) ѕсоре.com with ѕсоре in Cyrillic.
    {"xn--e1argc3h.com", L"\x0455\x0441\x043e\x0440\x0435.com", false},
-    // ѕсоре123.com with ѕсоре in Cyrillic.
+    // 2) ѕсоре123.com with ѕсоре in Cyrillic.
    {"xn--123-qdd8bmf3n.com",
     L"\x0455\x0441\x043e\x0440\x0435"
     L"123.com",
     false},
-    // ѕсоре-рау.com with ѕсоре and рау in Cyrillic.
+    // 3) ѕсоре-рау.com with ѕсоре and рау in Cyrillic.
    {"xn----8sbn9akccw8m.com",
     L"\x0455\x0441\x043e\x0440\x0435-\x0440\x0430\x0443.com", false},
-    // ѕсоре·рау.com with scope and pay in Cyrillic and U+00B7 between them.
+    // 4) ѕсоре·рау.com with scope and pay in Cyrillic and U+00B7 between them.
    {"xn--uba29ona9akccw8m.com",
     L"\x0455\x0441\x043e\x0440\x0435\u00b7\x0440\x0430\x0443.com", false},

-    // The same as above three, but in IDN TLD.
+    // The same as above three, but in IDN TLD (рф).
+    // 1) ѕсоре.рф  with ѕсоре in Cyrillic.
    {"xn--e1argc3h.xn--p1ai", L"\x0455\x0441\x043e\x0440\x0435.\x0440\x0444",
     true},
+    // 2) ѕсоре123.рф with ѕсоре in Cyrillic.
    {"xn--123-qdd8bmf3n.xn--p1ai",
     L"\x0455\x0441\x043e\x0440\x0435"
     L"123.\x0440\x0444",
     true},
+    // 3) ѕсоре-рау.рф with ѕсоре and рау in Cyrillic.
+    {"xn----8sbn9akccw8m.xn--p1ai",
+     L"\x0455\x0441\x043e\x0440\x0435-\x0440\x0430\x0443.\x0440\x0444", true},
+    // 4) ѕсоре·рау.com with scope and pay in Cyrillic and U+00B7 between them.
    {"xn--uba29ona9akccw8m.xn--p1ai",
     L"\x0455\x0441\x043e\x0440\x0435\u00b7\x0440\x0430\x0443.\x0440\x0444",
     true},

-    // ѕсоре-рау.한국 with ѕсоре and рау in Cyrillic.
-    {"xn----8sbn9akccw8m.xn--3e0b707e",
-     L"\x0455\x0441\x043e\x0440\x0435-\x0440\x0430\x0443.\xd55c\xad6d", true},
+    // Same as above three, but in .ru TLD.
+    // 1) ѕсоре.ru  with ѕсоре in Cyrillic.
+    {"xn--e1argc3h.ru", L"\x0455\x0441\x043e\x0440\x0435.ru", true},
+    // 2) ѕсоре123.ru with ѕсоре in Cyrillic.
+    {"xn--123-qdd8bmf3n.ru",
+     L"\x0455\x0441\x043e\x0440\x0435"
+     L"123.ru",
+     true},
+    // 3) ѕсоре-рау.ru with ѕсоре and рау in Cyrillic.
+    {"xn----8sbn9akccw8m.ru",
+     L"\x0455\x0441\x043e\x0440\x0435-\x0440\x0430\x0443.ru", true},
+    // 4) ѕсоре·рау.ru with scope and pay in Cyrillic and U+00B7 between them.
+    {"xn--uba29ona9akccw8m.ru",
+     L"\x0455\x0441\x043e\x0440\x0435\u00b7\x0440\x0430\x0443.ru", true},
+
+    // ѕсоре-рау.한국 with ѕсоре and рау in Cyrillic. The label will remain
+    // punycode while the TLD will be decoded.
+    {"xn----8sbn9akccw8m.xn--3e0b707e", L"xn----8sbn9akccw8m.\xd55c\xad6d",
+     true},

    // музей (museum in Russian) has characters without a Latin-look-alike.
    {"xn--e1adhj9a.com", L"\x043c\x0443\x0437\x0435\x0439.com", true},
@@ -397,6 +419,11 @@ const IDNTestCase kIdnCases[] = {
    // сю.com is Cyrillic with Latin lookalikes.
    {"xn--q1a0a.com", L"\x0441\x044e.com", false},

+    // googlе.한국 where е is Cyrillic. This tests the generic case when one
+    // label is not allowed but  other labels in the domain name are still
+    // decoded. Here, googlе is left in punycode but the TLD is decoded.
+    {"xn--googl-3we.xn--3e0b707e", L"xn--googl-3we.\xd55c\xad6d", true},
+
    // Combining Diacritic marks after a script other than Latin-Greek-Cyrillic
    {"xn--rsa2568fvxya.com", L"\xd55c\x0301\xae00.com", false},  // 한́글.com
    {"xn--rsa0336bjom.com", L"\x6f22\x0307\x5b57.com", false},   // 漢̇字.com

--- a/components/url_formatter/url_formatter.cc
+++ b/components/url_formatter/url_formatter.cc
@@ -34,6 +34,7 @@ IDNConversionResult IDNToUnicodeWithAdjustments(
 bool IDNToUnicodeOneComponent(const base::char16* comp,
                              size_t comp_len,
                              base::StringPiece top_level_domain,
+                              base::StringPiece16 top_level_domain_unicode,
                              bool enable_spoof_checks,
                              base::string16* out,
                              bool* has_idn_component);
@@ -233,6 +234,28 @@ base::string16 FormatViewSourceUrl(
 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =
    LAZY_INSTANCE_INITIALIZER;

+// Computes the top level domain from |host|. top_level_domain_unicode will
+// contain the unicode version of top_level_domain. top_level_domain_unicode can
+// remain empty if the TLD is not well formed punycode.
+void GetTopLevelDomain(base::StringPiece host,
+                       base::StringPiece* top_level_domain,
+                       base::string16* top_level_domain_unicode) {
+  size_t last_dot = host.rfind('.');
+  if (last_dot == base::StringPiece::npos)
+    return;
+
+  *top_level_domain = host.substr(last_dot + 1);
+  base::string16 tld16;
+  tld16.reserve(top_level_domain->length());
+  tld16.insert(tld16.end(), top_level_domain->begin(), top_level_domain->end());
+
+  // Convert the TLD to unicode with the spoof checks disabled.
+  bool tld_has_idn_component = false;
+  IDNToUnicodeOneComponent(tld16.data(), tld16.size(), std::string(),
+                           base::string16(), false /* enable_spoof_checks */,
+                           top_level_domain_unicode, &tld_has_idn_component);
+}
+
 IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
    base::StringPiece host,
    base::OffsetAdjuster::Adjustments* adjustments,
@@ -240,27 +263,25 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
  if (adjustments)
    adjustments->clear();
  // Convert the ASCII input to a base::string16 for ICU.
-  base::string16 input16;
-  input16.reserve(host.length());
-  input16.insert(input16.end(), host.begin(), host.end());
+  base::string16 host16;
+  host16.reserve(host.length());
+  host16.insert(host16.end(), host.begin(), host.end());

+  // Compute the top level domain to be used in spoof checks later.
  base::StringPiece top_level_domain;
-  size_t last_dot = host.rfind('.');
-  if (last_dot != base::StringPiece::npos) {
-    top_level_domain = host.substr(last_dot);
-  }
+  base::string16 top_level_domain_unicode;
+  GetTopLevelDomain(host, &top_level_domain, &top_level_domain_unicode);

  IDNConversionResult result;
  // Do each component of the host separately, since we enforce script matching
  // on a per-component basis.
  base::string16 out16;
  for (size_t component_start = 0, component_end;
-       component_start < input16.length();
-       component_start = component_end + 1) {
+       component_start < host16.length(); component_start = component_end + 1) {
    // Find the end of the component.
-    component_end = input16.find('.', component_start);
+    component_end = host16.find('.', component_start);
    if (component_end == base::string16::npos)
-      component_end = input16.length();  // For getting the last component.
+      component_end = host16.length();  // For getting the last component.
    size_t component_length = component_end - component_start;
    size_t new_component_start = out16.length();
    bool converted_idn = false;
@@ -268,8 +289,9 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
      // Add the substring that we just found.
      bool has_idn_component = false;
      converted_idn = IDNToUnicodeOneComponent(
-          input16.data() + component_start, component_length, top_level_domain,
-          enable_spoof_checks, &out16, &has_idn_component);
+          host16.data() + component_start, component_length, top_level_domain,
+          top_level_domain_unicode, enable_spoof_checks, &out16,
+          &has_idn_component);
      result.has_idn_component |= has_idn_component;
    }
    size_t new_component_length = out16.length() - new_component_start;
@@ -280,7 +302,7 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
    }

    // Need to add the dot we just found (if we found one).
-    if (component_end < input16.length())
+    if (component_end < host16.length())
      out16.push_back('.');
  }

@@ -293,7 +315,7 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
    if (enable_spoof_checks && !result.matching_top_domain.domain.empty()) {
      if (adjustments)
        adjustments->clear();
-      result.result = input16;
+      result.result = host16;
    }
  }

@@ -319,9 +341,10 @@ IDNConversionResult UnsafeIDNToUnicodeWithAdjustments(
 // all even though it's possible to make up look-alike labels with ASCII
 // characters alone.
 bool IsIDNComponentSafe(base::StringPiece16 label,
-                        base::StringPiece top_level_domain) {
-  return g_idn_spoof_checker.Get().SafeToDisplayAsUnicode(label,
-                                                          top_level_domain);
+                        base::StringPiece top_level_domain,
+                        base::StringPiece16 top_level_domain_unicode) {
+  return g_idn_spoof_checker.Get().SafeToDisplayAsUnicode(
+      label, top_level_domain, top_level_domain_unicode);
 }

 // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to
@@ -374,6 +397,7 @@ base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER;
 bool IDNToUnicodeOneComponent(const base::char16* comp,
                              size_t comp_len,
                              base::StringPiece top_level_domain,
+                              base::StringPiece16 top_level_domain_unicode,
                              bool enable_spoof_checks,
                              base::string16* out,
                              bool* has_idn_component) {
@@ -410,8 +434,10 @@ bool IDNToUnicodeOneComponent(const base::char16* comp,

  if (U_SUCCESS(status) && info.errors == 0) {
    *has_idn_component = true;
-    // Converted successfully. Ensure that the converted component
-    // can be safely displayed to the user.
+    // Converted successfully. At this point the length of the output string
+    // is original_length + output_length which may be shorter than the current
+    // length of |out|. Trim |out| and ensure that the converted component can
+    // be safely displayed to the user.
    out->resize(original_length + output_length);
    if (!enable_spoof_checks) {
      return true;
@@ -419,7 +445,7 @@ bool IDNToUnicodeOneComponent(const base::char16* comp,
    if (IsIDNComponentSafe(
            base::StringPiece16(out->data() + original_length,
                                base::checked_cast<size_t>(output_length)),
-            top_level_domain)) {
+            top_level_domain, top_level_domain_unicode)) {
      return true;
    }
  }