Commit af38308b, authored by Joe DeBlasio and committed by Commit Bot

Mapping several Indic characters to confusables.

A number of characters from several Indian scripts are confusable,
especially with numbers. This change maps these characters to their
ASCII lookalikes to allow fallback to punycode when displaying probable
spoofing URLs.

Bug: 849421
Bug: 892646
Bug: 896722
Change-Id: I6d463642f3541454dc39bf4b32b8291417697c52
Reviewed-on: https://chromium-review.googlesource.com/c/1295179
Reviewed-by: Tommy Li <tommycli@chromium.org>
Commit-Queue: Joe DeBlasio <jdeblasio@chromium.org>
Cr-Commit-Position: refs/heads/master@{#602032}
parent 3c776ae5
......@@ -237,9 +237,18 @@ IDNSpoofChecker::IDNSpoofChecker() {
// - {U+050D (ԍ), U+100c (ဌ)} => g
// - {U+0D1F (ട), U+0E23 (ร), U+0EA3 (ຣ), U+0EAE (ຮ)} => s
// - U+1042 (၂) => j
// - {U+0966 (०), U+09E6 (০), U+0A66 (੦), U+0AE6 (૦), U+0B20 (ଠ),
// U+0B66 (୦), U+0CE6 (೦)} => o,
// - {U+09ED (৭), U+0A67 (੧), U+0AE7 (૧)} => q,
// - {U+0E1A (บ), U+0E9A (ບ)} => u
// - {U+0968 (२), U+09E8 (২), U+0A68 (੨), U+0AE8 (૨),
// U+0ce9 (೩), U+0ced (೭)} => 2,
// - {U+0437 (з), U+0499 (ҙ), U+04E1 (ӡ), U+0909 (उ), U+0993 (ও),
// U+0A69 (੩), U+0AE9 (૩), U+0C69 (౩),
// U+1012 (ဒ), U+10D5 (ვ), U+10DE (პ)} => 3
// - {U+0E1A (บ), U+0E9A (ບ)} => u
// - {U+0A6B (੫)} => 4,
// - {U+09EA (৪), U+0A6A (੪), U+0b6b (୫)} => 8,
// - {U+0AED (૭), U+0b68 (୨), U+0C68 (౨)} => 9,
extra_confusable_mapper_.reset(icu::Transliterator::createFromRules(
UNICODE_STRING_SIMPLE("ExtraConf"),
icu::UnicodeString::fromUTF8(
......@@ -249,7 +258,15 @@ IDNSpoofChecker::IDNSpoofChecker() {
"[мӎ] > m; [єҽҿၔ] > e; ґ > r; [ғӻ] > f;"
"[ҫင] > c; ұ > y; [χҳӽӿ] > x;"
"ԃ > d; [ԍဌ] > g; [ടรຣຮ] > s; ၂ > j;"
"[зҙӡउওဒვპ] > 3; [บບ] > u"),
"[०০੦૦ଠ୦೦] > o;"
"[৭੧૧] > q;"
"[บບ] > u;"
"[२২੨੨૨೩೭] > 2;"
"[зҙӡउও੩૩౩ဒვპ] > 3;"
"[੫] > 4;"
"[৪੪୫] > 8;"
"[૭୨౨] > 9;"
),
UTRANS_FORWARD, parse_error, status));
DCHECK(U_SUCCESS(status))
<< "Spoofchecker initalization failed due to an error: "
......
......@@ -23,3 +23,10 @@ wsou.com
aece.com
aen.com
n11.com
o2.com
28.com
39.com
89.com
43.com
oo.com
qq.com
......@@ -34,3 +34,10 @@ l23456789O.corn, 1234567890.com
aece.corn, aece.com
aen.corn, aen.com
nll.corn, n11.com
o2.corn, o2.com
28.corn, 28.com
39.corn, 39.com
89.corn, 89.com
43.corn, 43.com
oo.corn, oo.com
qq.corn, qq.com
......@@ -421,6 +421,35 @@ const IDNTestCase idn_cases[] = {
L"123.com",
false},
// 'o2.com', '28.com', '39.com', '43.com', '89.com', 'oo.com' and 'qq.com'
// are all explicitly added to the test domain list to aid testing of
// Latin-lookalikes that are numerics in other character sets and similar
// edge cases.
//
// Bengali:
{"xn--07be.com", L"\x09e6\x09e8.com", false},
{"xn--27be.com", L"\x09e8\x09ea.com", false},
{"xn--77ba.com", L"\x09ed\x09ed.com", false},
// Gurmukhi:
{"xn--qcce.com", L"\x0a68\x0a6a.com", false},
{"xn--occe.com", L"\x0a66\x0a68.com", false},
{"xn--rccd.com", L"\x0a6b\x0a69.com", false},
{"xn--pcca.com", L"\x0a67\x0a67.com", false},
// Telugu:
{"xn--drcb.com", L"\x0c69\x0c68.com", false},
// Devanagari:
{"xn--d4be.com", L"\x0966\x0968.com", false},
// Kannada:
{"xn--yucg.com", L"\x0ce6\x0ce9.com", false},
{"xn--yuco.com", L"\x0ce6\x0ced.com", false},
// Oriya:
{"xn--1jcf.com", L"\x0b6b\x0b68.com", false},
{"xn--zjca.com", L"\x0b66\x0b66.com", false},
// Gujarati:
{"xn--cgce.com", L"\x0ae6\x0ae8.com", false},
{"xn--fgci.com", L"\x0ae9\x0aed.com", false},
{"xn--dgca.com", L"\x0ae7\x0ae7.com", false},
// wmhtb.com
{"xn--l1acpvx.com", L"\x0448\x043c\x043d\x0442\x044c.com", false},
// щмнть.com
......@@ -600,6 +629,8 @@ const IDNTestCase idn_cases[] = {
L"12345678\x0b68"
L"0.com",
false},
// 123456789ଠ.com
{"xn--123456789-v01b.com", L"123456789\x0b20.com", false},
// 123456789ꓳ.com
{"xn--123456789-tx75a.com", L"123456789\xa4f3.com", false},
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment