Commit f9b56bc5 authored by Jungshik Shin, committed by Commit Bot

Map U+04CF to lowercase L as well.

U+04CF (ӏ) has the confusability skeleton of 'i' (lowercase
I), but it can be confused for 'l' (lowercase L) or '1' (digit) if rendered
in some fonts.

If a host name contains it, calculate the confusability skeleton
twice: once with the default mapping to 'i' (lowercase I), and a second
time with an alternative mapping to 'l' (lowercase L). Mapping it to 'l'
also makes it be treated as similar to the digit 1, because the
confusability skeleton of the digit 1 is 'l'.

Bug: 817247
Test: components_unittests --gtest_filter=*IDN*
Change-Id: I7442b950c9457eea285e17f01d1f43c9acc5d79c
Reviewed-on: https://chromium-review.googlesource.com/974165
Commit-Queue: Jungshik Shin <jshin@chromium.org>
Reviewed-by: Peter Kasting <pkasting@chromium.org>
Reviewed-by: Eric Lawrence <elawrence@chromium.org>
Cr-Commit-Position: refs/heads/master@{#551263}
parent 5a45635a
......@@ -38,7 +38,9 @@ const size_t kNumberOfLabelsToCheck = 3;
const unsigned char* g_graph = kDafsa;
size_t g_graph_length = sizeof(kDafsa);
bool LookupMatchInTopDomains(base::StringPiece skeleton) {
bool LookupMatchInTopDomains(const icu::UnicodeString& ustr_skeleton) {
std::string skeleton;
ustr_skeleton.toUTF8String(skeleton);
DCHECK_NE(skeleton.back(), '.');
auto labels = base::SplitStringPiece(skeleton, ".", base::KEEP_WHITESPACE,
base::SPLIT_WANT_ALL);
......@@ -169,7 +171,6 @@ IDNSpoofChecker::IDNSpoofChecker() {
// - {U+04AB (ҫ), U+1004 (င)} => c
// - U+04B1 (ұ) => y
// - U+03C7 (χ), U+04B3 (ҳ), U+04FD (ӽ), U+04FF (ӿ) => x
// - U+04CF (ӏ) => i (on Windows), l (elsewhere)
// - U+0503 (ԃ) => d
// - {U+050D (ԍ), U+100c (ဌ)} => g
// - {U+0D1F (ട), U+0E23 (ร)} => s
......@@ -182,11 +183,6 @@ IDNSpoofChecker::IDNSpoofChecker() {
"[ƅьҍв] > b; [ωшщฟ] > w; [мӎ] > m;"
"[єҽҿၔ] > e; ґ > r; [ғӻ] > f; [ҫင] > c;"
"ұ > y; [χҳӽӿ] > x;"
#if defined(OS_WIN)
"ӏ > i;"
#else
"ӏ > l;"
#endif
"ԃ > d; [ԍဌ] > g; [ടร] > s; ၂ > j;"
"[зӡ] > 3"),
UTRANS_FORWARD, parse_error, status));
......@@ -309,23 +305,36 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,
bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {
size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0);
icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length);
icu::UnicodeString host(FALSE, hostname.data(), hostname_length);
// If input has any characters outside Latin-Greek-Cyrillic and [0-9._-],
// there is no point in getting rid of diacritics because combining marks
// attached to non-LGC characters are already blocked.
if (lgc_letters_n_ascii_.span(ustr_host, 0, USET_SPAN_CONTAINED) ==
ustr_host.length())
diacritic_remover_.get()->transliterate(ustr_host);
extra_confusable_mapper_.get()->transliterate(ustr_host);
if (lgc_letters_n_ascii_.span(host, 0, USET_SPAN_CONTAINED) == host.length())
diacritic_remover_.get()->transliterate(host);
extra_confusable_mapper_.get()->transliterate(host);
UErrorCode status = U_ZERO_ERROR;
icu::UnicodeString ustr_skeleton;
uspoof_getSkeletonUnicodeString(checker_, 0, ustr_host, ustr_skeleton,
&status);
if (U_FAILURE(status))
return false;
std::string skeleton;
return LookupMatchInTopDomains(ustr_skeleton.toUTF8String(skeleton));
icu::UnicodeString skeleton;
// Map U+04CF (ӏ) to lowercase L in addition to what uspoof_getSkeleton does
// (mapping it to lowercase I).
int32_t u04cf_pos;
if ((u04cf_pos = host.indexOf(0x4CF)) != -1) {
icu::UnicodeString host_alt(host);
size_t length = host_alt.length();
char16_t* buffer = host_alt.getBuffer(-1);
for (char16_t* uc = buffer + u04cf_pos ; uc < buffer + length; ++uc) {
if (*uc == 0x4CF)
*uc = 0x6C; // Lowercase L
}
host_alt.releaseBuffer(length);
uspoof_getSkeletonUnicodeString(checker_, 0, host_alt, skeleton, &status);
if (U_SUCCESS(status) && LookupMatchInTopDomains(skeleton))
return true;
}
uspoof_getSkeletonUnicodeString(checker_, 0, host, skeleton, &status);
return U_SUCCESS(status) && LookupMatchInTopDomains(skeleton);
}
bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(
......
......@@ -12,6 +12,9 @@ rf.com
cyxe.com
ldg.com
idg.com
ig.com
ld.com
1gd.com
cegjo.com
wsws.com
1234567890.com
......@@ -22,6 +22,9 @@ rf.corn, 1
cyxe.corn, 1
ldg.corn, 1
idg.corn, 1
ig.corn, 1
ld.corn, 1
lgd.corn, 1
cegjo.corn, 1
wsws.corn, 1
l23456789O.corn, 1
......
......@@ -511,6 +511,16 @@ const IDNTestCase idn_cases[] = {
// ӏԃԍ.com
{"xn--s5a8h4a.com", L"\x04cf\x0503\x050d.com", false},
// U+04CF(ӏ) is mapped to multiple characters, lowercase L(l) and
// lowercase I(i). Lowercase L is also regarded as similar to digit 1.
// The test domain list has {ig, ld, 1gd}.com for Cyrillic.
// ӏԍ.com
{"xn--s5a8j.com", L"\x04cf\x050d.com", false},
// ӏԃ.com
{"xn--s5a8h.com", L"\x04cf\x0503.com", false},
// ӏԍԃ.com
{"xn--s5a8h3a.com", L"\x04cf\x050d\x0503.com", false},
// ꓲ2345б7890.com
{"xn--23457890-e7g93622b.com", L"\xa4f2" L"2345\x0431" L"7890.com", false},
// 1ᒿ345б7890.com
......@@ -533,7 +543,8 @@ const IDNTestCase idn_cases[] = {
// ငၔဌ၂ဝ.com (entirely made of Myanmar characters)
{"xn--ridq5c9hnd.com", L"\x1004\x1054\x100c" L"\x1042\x101d.com", false},
// ฟรฟร.com (made of two Thai characters)
// ฟรฟร.com (made of two Thai characters. similar to wsws.com in
// some fonts)
{"xn--w3calb.com", L"\x0e1f\x0e23\x0e1f\x0e23.com", false},
// At one point the skeleton of 'w' was 'vv', ensure that
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment