Commit 49b69f38 authored by meacer, committed by Commit Bot

Disallow confusable CJK ideographs in IDN display

This CL blocks the following characters when used next to a non-CJK
character:

U+4E00 (一), U+3127 (ㄧ), U+4E28 (丨), U+4E5B (乛), U+4E03 (七),
U+4E05 (丅), U+5341 (十), U+3007 (〇), U+3112 (ㄒ), U+311A (ㄚ),
U+311F (ㄟ), U+3128 (ㄨ), U+3129 (ㄩ), U+3108 (ㄈ), U+31BA (ㆺ),
U+31B3 (ㆳ), U+5DE5 (工), U+31B2 (ㆲ), U+8BA0 (讠), U+4E01 (丁)

According to usage logs, this change affects a small number of domains (<10)
with a small number of users (<300 over 28 days).

It does not block U+30A8 (エ), as doing so would convert more domains (~20) to punycode.

Bug: 990428
Change-Id: Ie8cad20f52446361439ea9e3e0a69b276f8115fd
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1927692
Commit-Queue: Mustafa Emre Acer <meacer@chromium.org>
Reviewed-by: Joe DeBlasio <jdeblasio@chromium.org>
Cr-Commit-Position: refs/heads/master@{#718375}
parent 9973884f
......@@ -429,14 +429,28 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(
R"([^\p{scx=kana}\p{scx=hira}]\u30fc|^\u30fc|)"
R"([a-z]\u30fb|\u30fb[a-z]|)"
// Disallow U+4E00 (CJK unified ideograph) and U+3127 (Bopomofo
// Letter I) unless they are next to Hiragana, Katakana or Han.
// U+2F00 (Kangxi Radical One) is similar, but it's normalized to
// U+4E00 so it's not explicitly checked here.
R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])"
R"([\u4e00\u3127]|)"
R"([\u4e00\u3127])"
R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]|)"
// Disallow these CJK ideographs if they are next to non-CJK
// characters. These characters can be used to spoof Latin
// characters or punctuation marks:
// U+4E00 (一), U+3127 (ㄧ), U+4E28 (丨), U+4E5B (乛), U+4E03 (七),
// U+4E05 (丅), U+5341 (十), U+3007 (〇), U+3112 (ㄒ), U+311A (ㄚ),
// U+311F (ㄟ), U+3128 (ㄨ), U+3129 (ㄩ), U+3108 (ㄈ), U+31BA (ㆺ),
// U+31B3 (ㆳ), U+5DE5 (工), U+31B2 (ㆲ), U+8BA0 (讠), U+4E01 (丁)
// These characters are already blocked:
// U+2F00 (⼀) (normalized to U+4E00), U+3192 (㆒), U+2F02 (⼂),
// U+2F17 (⼗) and U+3038 (〸) (both normalized to U+5341 (十)).
// Check if there is non-{Hiragana, Katakana, Han, Bopomofo} on the
// left.
R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}\p{scx=bopo}])"
R"([\u4e00\u3127\u4e28\u4e5b\u4e03\u4e05\u5341\u3007\u3112)"
R"(\u311a\u311f\u3128\u3129\u3108\u31ba\u31b3\u5dE5)"
R"(\u31b2\u8ba0\u4e01]|)"
// Check if there is non-{Hiragana, Katakana, Han, Bopomofo} on the
// right.
R"([\u4e00\u3127\u4e28\u4e5b\u4e03\u4e05\u5341\u3007\u3112)"
R"(\u311a\u311f\u3128\u3129\u3108\u31ba\u31b3\u5de5)"
R"(\u31b2\u8ba0\u4e01])"
R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}\p{scx=bopo}]|)"
// Disallow combining diacritical mark (U+0300-U+0339) after a
// non-LGC character. Other combining diacritical marks are not in
......
......@@ -1151,6 +1151,27 @@ const IDNTestCase kIdnCases[] = {
{"xn--l-fda.cat", L"·l.cat", false},
{"xn--l-gda.cat", L"l·.cat", false},
{"xn--googlecom-gk6n.com", L"google丨com.com", false}, // (U+4E28)
{"xn--googlecom-0y6n.com", L"google乛com.com", false}, // (U+4E5B)
{"xn--googlecom-v85n.com", L"google七com.com", false}, // (U+4E03)
{"xn--googlecom-g95n.com", L"google丅com.com", false}, // (U+4E05)
{"xn--googlecom-go6n.com", L"google⼂com.com", false}, // (U+2F02)
{"xn--googlecom-b76o.com", L"google⼗com.com", false}, // (U+2F17)
{"xn--googlecom-b76o.com", L"google〸com.com", false}, // (U+3038)
{"xn--googlecom-ql3h.com", L"google〇com.com", false}, // (U+3007)
{"xn--googlecom-0r5h.com", L"googleㄒcom.com", false}, // (U+3112)
{"xn--googlecom-bu5h.com", L"googleㄚcom.com", false}, // (U+311A)
{"xn--googlecom-qv5h.com", L"googleㄟcom.com", false}, // (U+311F)
{"xn--googlecom-0x5h.com", L"googleㄧcom.com", false}, // (U+3127)
{"xn--googlecom-by5h.com", L"googleㄨcom.com", false}, // (U+3128)
{"xn--googlecom-ly5h.com", L"googleㄩcom.com", false}, // (U+3129)
{"xn--googlecom-5o5h.com", L"googleㄈcom.com", false}, // (U+3108)
{"xn--googlecom-075n.com", L"google㆒com.com", false}, // (U+3192)
{"xn--googlecom-046h.com", L"googleㆺcom.com", false}, // (U+31BA)
{"xn--googlecom-026h.com", L"googleㆳcom.com", false}, // (U+31B3)
{"xn--googlecom-lg9q.com", L"google工com.com", false}, // (U+5DE5)
{"xn--googlecom-g040a.com", L"google讠com.com", false}, // (U+8BA0)
{"xn--googlecom-b85n.com", L"google丁com.com", false}, // (U+4E01)
}; // namespace
namespace test {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment