Map U+04CF to lowercase L as well.

U+04CF (ӏ) has the confusability skeleton of 'i' (lowercase I), but it can be confused with 'l' (lowercase L) or '1' (digit one) when rendered in some fonts. If a host name contains it, calculate the confusability skeleton twice: once with the default mapping to 'i' (lowercase I), and a second time with an alternative mapping to 'l' (lowercase L). Mapping it to 'l' also gets it treated as similar to the digit 1, because the confusability skeleton of the digit 1 is 'l'.

Bug: 817247
Test: components_unittests --gtest_filter=*IDN*
Change-Id: I7442b950c9457eea285e17f01d1f43c9acc5d79c
Reviewed-on: https://chromium-review.googlesource.com/974165
Commit-Queue: Jungshik Shin <jshin@chromium.org>
Reviewed-by: Peter Kasting <pkasting@chromium.org>
Reviewed-by: Eric Lawrence <elawrence@chromium.org>
Cr-Commit-Position: refs/heads/master@{#551263}

Map U+04CF to lowercase L as well.
U+04CF (ӏ) has the confusability skeleton of 'i' (lowercase I), but it can be confused with 'l' (lowercase L) or '1' (digit one) when rendered in some fonts. If a host name contains it, calculate the confusability skeleton twice: once with the default mapping to 'i' (lowercase I), and a second time with an alternative mapping to 'l' (lowercase L). Mapping it to 'l' also gets it treated as similar to the digit 1, because the confusability skeleton of the digit 1 is 'l'.

Bug: 817247
Test: components_unittests --gtest_filter=*IDN*
Change-Id: I7442b950c9457eea285e17f01d1f43c9acc5d79c
Reviewed-on: https://chromium-review.googlesource.com/974165
Commit-Queue: Jungshik Shin <jshin@chromium.org>
Reviewed-by: Peter Kasting <pkasting@chromium.org>
Reviewed-by: Eric Lawrence <elawrence@chromium.org>
Cr-Commit-Position: refs/heads/master@{#551263}
f9b56bc5 · Jungshik Shin · Commit Bot · 5a45635a · f9b56bc5 · f9b56bc5
Commit f9b56bc5 authored Apr 17, 2018 by Jungshik Shin Committed by Commit Bot Apr 17, 2018
4 changed files
--- a/components/url_formatter/idn_spoof_checker.cc
+++ b/components/url_formatter/idn_spoof_checker.cc
@@ -38,7 +38,9 @@ const size_t kNumberOfLabelsToCheck = 3;
 const unsigned char* g_graph = kDafsa;
 size_t g_graph_length = sizeof(kDafsa);

-bool LookupMatchInTopDomains(base::StringPiece skeleton) {
+bool LookupMatchInTopDomains(const icu::UnicodeString& ustr_skeleton) {
+  std::string skeleton;
+  ustr_skeleton.toUTF8String(skeleton);
  DCHECK_NE(skeleton.back(), '.');
  auto labels = base::SplitStringPiece(skeleton, ".", base::KEEP_WHITESPACE,
                                       base::SPLIT_WANT_ALL);
@@ -169,7 +171,6 @@ IDNSpoofChecker::IDNSpoofChecker() {
  //   - {U+04AB (ҫ), U+1004 (င)} => c
  //   - U+04B1 (ұ) => y
  //   - U+03C7 (χ), U+04B3 (ҳ), U+04FD (ӽ), U+04FF (ӿ) => x
-  //   - U+04CF (ӏ) => i (on Windows), l (elsewhere)
  //   - U+0503 (ԃ) => d
  //   - {U+050D (ԍ), U+100c (ဌ)} => g
  //   - {U+0D1F (ട), U+0E23 (ร)} => s
@@ -182,11 +183,6 @@ IDNSpoofChecker::IDNSpoofChecker() {
                                   "[ƅьҍв] > b;  [ωшщฟ] > w; [мӎ] > m;"
                                   "[єҽҿၔ] > e; ґ > r; [ғӻ] > f; [ҫင] > c;"
                                   "ұ > y; [χҳӽӿ] > x;"
-#if defined(OS_WIN)
-                                   "ӏ > i;"
-#else
-                                   "ӏ > l;"
-#endif
                                   "ԃ  > d; [ԍဌ] > g; [ടร] > s; ၂ > j;"
                                   "[зӡ] > 3"),
      UTRANS_FORWARD, parse_error, status));
@@ -309,23 +305,36 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,

 bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {
  size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0);
-  icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length);
+  icu::UnicodeString host(FALSE, hostname.data(), hostname_length);
  // If input has any characters outside Latin-Greek-Cyrillic and [0-9._-],
  // there is no point in getting rid of diacritics because combining marks
  // attached to non-LGC characters are already blocked.
-  if (lgc_letters_n_ascii_.span(ustr_host, 0, USET_SPAN_CONTAINED) ==
-      ustr_host.length())
-    diacritic_remover_.get()->transliterate(ustr_host);
-  extra_confusable_mapper_.get()->transliterate(ustr_host);
+  if (lgc_letters_n_ascii_.span(host, 0, USET_SPAN_CONTAINED) == host.length())
+    diacritic_remover_.get()->transliterate(host);
+  extra_confusable_mapper_.get()->transliterate(host);

  UErrorCode status = U_ZERO_ERROR;
-  icu::UnicodeString ustr_skeleton;
-  uspoof_getSkeletonUnicodeString(checker_, 0, ustr_host, ustr_skeleton,
-                                  &status);
-  if (U_FAILURE(status))
-    return false;
-  std::string skeleton;
-  return LookupMatchInTopDomains(ustr_skeleton.toUTF8String(skeleton));
+  icu::UnicodeString skeleton;
+
+  // Map U+04CF (ӏ) to lowercase L in addition to what uspoof_getSkeleton does
+  // (mapping it to lowercase I).
+  int32_t u04cf_pos;
+  if ((u04cf_pos = host.indexOf(0x4CF)) != -1) {
+    icu::UnicodeString host_alt(host);
+    size_t length = host_alt.length();
+    char16_t* buffer = host_alt.getBuffer(-1);
+    for (char16_t* uc = buffer + u04cf_pos ; uc < buffer + length; ++uc) {
+      if (*uc == 0x4CF)
+        *uc = 0x6C;  // Lowercase L
+    }
+    host_alt.releaseBuffer(length);
+    uspoof_getSkeletonUnicodeString(checker_, 0, host_alt, skeleton, &status);
+    if (U_SUCCESS(status) && LookupMatchInTopDomains(skeleton))
+      return true;
+  }
+
+  uspoof_getSkeletonUnicodeString(checker_, 0, host, skeleton, &status);
+  return U_SUCCESS(status) && LookupMatchInTopDomains(skeleton);
 }

 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(

--- a/components/url_formatter/top_domains/test_domains.list
+++ b/components/url_formatter/top_domains/test_domains.list
@@ -12,6 +12,9 @@ rf.com
 cyxe.com
 ldg.com
 idg.com
+ig.com
+ld.com
+1gd.com
 cegjo.com
 wsws.com
 1234567890.com
--- a/components/url_formatter/top_domains/test_skeletons.gperf
+++ b/components/url_formatter/top_domains/test_skeletons.gperf
@@ -22,6 +22,9 @@ rf.corn, 1
 cyxe.corn, 1
 ldg.corn, 1
 idg.corn, 1
+ig.corn, 1
+ld.corn, 1
+lgd.corn, 1
 cegjo.corn, 1
 wsws.corn, 1
 l23456789O.corn, 1

--- a/components/url_formatter/url_formatter_unittest.cc
+++ b/components/url_formatter/url_formatter_unittest.cc
@@ -511,6 +511,16 @@ const IDNTestCase idn_cases[] = {
    // ӏԃԍ.com
    {"xn--s5a8h4a.com", L"\x04cf\x0503\x050d.com", false},

+    // U+04CF(ӏ) is mapped to multiple characters, lowercase L(l) and
+    // lowercase I(i). Lowercase L is also regarded as similar to digit 1.
+    // The test domain list has {ig, ld, 1gd}.com for Cyrillic.
+    // ӏԍ.com
+    {"xn--s5a8j.com", L"\x04cf\x050d.com", false},
+    // ӏԃ.com
+    {"xn--s5a8h.com", L"\x04cf\x0503.com", false},
+    // ӏԍԃ.com
+    {"xn--s5a8h3a.com", L"\x04cf\x050d\x0503.com", false},
+
    // ꓲ2345б7890.com
    {"xn--23457890-e7g93622b.com", L"\xa4f2" L"2345\x0431" L"7890.com", false},
    // 1ᒿ345б7890.com
@@ -533,7 +543,8 @@ const IDNTestCase idn_cases[] = {
    // ငၔဌ၂ဝ.com (entirely made of Myanmar characters)
    {"xn--ridq5c9hnd.com", L"\x1004\x1054\x100c" L"\x1042\x101d.com", false},

-    // ฟรฟร.com (made of two Thai characters)
+    // ฟรฟร.com (made of two Thai characters. similar to wsws.com in
+    // some fonts)
    {"xn--w3calb.com", L"\x0e1f\x0e23\x0e1f\x0e23.com", false},

    // At one point the skeleton of 'w' was 'vv', ensure that