Commit 226971f2 authored by Mustafa Emre Acer, committed by Commit Bot

IDN: Map a few confusable CJK character skeletons

Bug: 990428
Change-Id: I65f3b0b3c2b0f3ed71a5431257636a3bd6bcd735
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1733806
Reviewed-by: Christopher Thompson <cthomp@chromium.org>
Commit-Queue: Mustafa Emre Acer <meacer@chromium.org>
Cr-Commit-Position: refs/heads/master@{#683759}
parent eebd5ebe
...@@ -192,7 +192,8 @@ IDNSpoofChecker::IDNSpoofChecker() { ...@@ -192,7 +192,8 @@ IDNSpoofChecker::IDNSpoofChecker() {
// - U+0153 (œ) => "ce" // - U+0153 (œ) => "ce"
// TODO: see https://crbug.com/843352 for further work on // TODO: see https://crbug.com/843352 for further work on
// U+0525 and U+0153. // U+0525 and U+0153.
// - {U+0167 (ŧ), U+0442 (т), U+04AD (ҭ), U+050F (ԏ)} => t // - {U+0167 (ŧ), U+0442 (т), U+04AD (ҭ), U+050F (ԏ), U+4E03 (七),
// U+4E05 (丅), U+4E06 (丆)} => t
// - {U+0185 (ƅ), U+044C (ь), U+048D (ҍ), U+0432 (в)} => b // - {U+0185 (ƅ), U+044C (ь), U+048D (ҍ), U+0432 (в)} => b
// - {U+03C9 (ω), U+0448 (ш), U+0449 (щ), U+0E1E (พ), // - {U+03C9 (ω), U+0448 (ш), U+0449 (щ), U+0E1E (พ),
// U+0E1F (ฟ), U+0E9E (ພ), U+0E9F (ຟ)} => w // U+0E1F (ฟ), U+0E9E (ພ), U+0E9F (ຟ)} => w
...@@ -201,8 +202,8 @@ IDNSpoofChecker::IDNSpoofChecker() { ...@@ -201,8 +202,8 @@ IDNSpoofChecker::IDNSpoofChecker() {
// - U+0491 (ґ) => r // - U+0491 (ґ) => r
// - {U+0493 (ғ), U+04FB (ӻ)} => f // - {U+0493 (ғ), U+04FB (ӻ)} => f
// - {U+04AB (ҫ), U+1004 (င)} => c // - {U+04AB (ҫ), U+1004 (င)} => c
// - U+04B1 (ұ) => y // - {U+04B1 (ұ), U+4E2B (丫)} => y
// - {U+03C7 (χ), U+04B3 (ҳ), U+04FD (ӽ), U+04FF (ӿ)} => x // - {U+03C7 (χ), U+04B3 (ҳ), U+04FD (ӽ), U+04FF (ӿ), U+4E42 (乂)} => x
// - {U+0503 (ԃ), U+10EB (ძ)} => d // - {U+0503 (ԃ), U+10EB (ძ)} => d
// - {U+050D (ԍ), U+100c (ဌ)} => g // - {U+050D (ԍ), U+100c (ဌ)} => g
// - {U+0D1F (ട), U+0E23 (ร), U+0EA3 (ຣ), U+0EAE (ຮ)} => s // - {U+0D1F (ട), U+0E23 (ร), U+0EA3 (ຣ), U+0EAE (ຮ)} => s
...@@ -217,7 +218,7 @@ IDNSpoofChecker::IDNSpoofChecker() { ...@@ -217,7 +218,7 @@ IDNSpoofChecker::IDNSpoofChecker() {
// - {U+0437 (з), U+0499 (ҙ), U+04E1 (ӡ), U+0909 (उ), U+0993 (ও), // - {U+0437 (з), U+0499 (ҙ), U+04E1 (ӡ), U+0909 (उ), U+0993 (ও),
// U+0A24 (ਤ), U+0A69 (੩), U+0AE9 (૩), U+0C69 (౩), // U+0A24 (ਤ), U+0A69 (੩), U+0AE9 (૩), U+0C69 (౩),
// U+1012 (ဒ), U+10D5 (ვ), U+10DE (პ)} => 3 // U+1012 (ဒ), U+10D5 (ვ), U+10DE (პ)} => 3
// - {U+0A6B (੫)} => 4, // - {U+0A6B (੫), U+4E29 (丩)} => 4,
// - {U+09EA (৪), U+0A6A (੪), U+0b6b (୫)} => 8, // - {U+09EA (৪), U+0A6A (੪), U+0b6b (୫)} => 8,
// - {U+0AED (૭), U+0b68 (୨), U+0C68 (౨)} => 9, // - {U+0AED (૭), U+0b68 (୨), U+0C68 (౨)} => 9,
// Map a few dashes that ICU doesn't map. These are already blocked by ICU, // Map a few dashes that ICU doesn't map. These are already blocked by ICU,
...@@ -228,9 +229,9 @@ IDNSpoofChecker::IDNSpoofChecker() { ...@@ -228,9 +229,9 @@ IDNSpoofChecker::IDNSpoofChecker() {
icu::UnicodeString::fromUTF8( icu::UnicodeString::fromUTF8(
"[æӕ] > ae; [þϼҏ] > p; [ħнћңҥӈӊԋԧԩ] > h;" "[æӕ] > ae; [þϼҏ] > p; [ħнћңҥӈӊԋԧԩ] > h;"
"[ĸκкқҝҟҡӄԟ] > k; [ŋпԥก] > n; œ > ce;" "[ĸκкқҝҟҡӄԟ] > k; [ŋпԥก] > n; œ > ce;"
"[ŧтҭԏ] > t; [ƅьҍв] > b; [ωшщพฟພຟ] > w;" "[ŧтҭԏ七丅丆] > t; [ƅьҍв] > b; [ωшщพฟພຟ] > w;"
"[мӎ] > m; [єҽҿၔ] > e; ґ > r; [ғӻ] > f;" "[мӎ] > m; [єҽҿၔ] > e; ґ > r; [ғӻ] > f;"
"[ҫင] > c; ұ > y; [χҳӽӿ] > x;" "[ҫင] > c; [ұ丫] > y; [χҳӽӿ乂] > x;"
"[ԃძ] > d; [ԍဌ] > g; [ടรຣຮ] > s; ၂ > j;" "[ԃძ] > d; [ԍဌ] > g; [ടรຣຮ] > s; ၂ > j;"
"[०০੦૦ଠ୦೦] > o;" "[०০੦૦ଠ୦೦] > o;"
"[৭੧૧] > q;" "[৭੧૧] > q;"
...@@ -238,7 +239,7 @@ IDNSpoofChecker::IDNSpoofChecker() { ...@@ -238,7 +239,7 @@ IDNSpoofChecker::IDNSpoofChecker() {
"[θ] > 0;" "[θ] > 0;"
"[२২੨੨૨೩೭] > 2;" "[२২੨੨૨೩೭] > 2;"
"[зҙӡउওਤ੩૩౩ဒვპ] > 3;" "[зҙӡउওਤ੩૩౩ဒვპ] > 3;"
"[੫] > 4;" "[੫丩] > 4;"
"[৪੪୫] > 8;" "[৪੪୫] > 8;"
"[૭୨౨] > 9;" "[૭୨౨] > 9;"
"[—一―⸺⸻] > \\-;"), "[—一―⸺⸻] > \\-;"),
...@@ -319,8 +320,8 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label, ...@@ -319,8 +320,8 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,
dangerous_pattern = new icu::RegexMatcher( dangerous_pattern = new icu::RegexMatcher(
icu::UnicodeString( icu::UnicodeString(
// Disallow the following as they may be mistaken for slashes when // Disallow the following as they may be mistaken for slashes when
// they're surrounded by non-Japanese scripts (i.e. scripts other // they're surrounded by non-Japanese scripts (i.e. has non-Katakana
// than Katakana, Hiragana or Han): // Hiragana or Han scripts on both sides):
// "ノ" (Katakana no, U+30ce), "ソ" (Katakana so, U+30bd), // "ノ" (Katakana no, U+30ce), "ソ" (Katakana so, U+30bd),
// "ゾ" (Katakana zo, U+30be), "ン" (Katakana n, U+30f3), // "ゾ" (Katakana zo, U+30be), "ン" (Katakana n, U+30f3),
// "丶" (CJK unified ideograph, U+4E36), // "丶" (CJK unified ideograph, U+4E36),
......
...@@ -1066,7 +1066,15 @@ const IDNTestCase kIdnCases[] = { ...@@ -1066,7 +1066,15 @@ const IDNTestCase kIdnCases[] = {
{"xn--google-8m4e.com", L"google\x309A.com", false}, {"xn--google-8m4e.com", L"google\x309A.com", false},
// Small letter theta looks like a zero. // Small letter theta looks like a zero.
{"xn--123456789-yzg.com", L"123456789θ.com", false}}; {"xn--123456789-yzg.com", L"123456789θ.com", false},
{"xn--est-118d.net", L"七est.net", false},
{"xn--est-918d.net", L"丅est.net", false},
{"xn--est-e28d.net", L"丆est.net", false},
{"xn--3-cq6a.com", L"丩3.com", false},
{"xn--cxe-n68d.com", L"c丫xe.com", false},
{"xn--cye-b98d.com", L"cy乂e.com", false},
}; // namespace
namespace test { namespace test {
#include "components/url_formatter/spoof_checks/top_domains/test_domains-trie-inc.cc" #include "components/url_formatter/spoof_checks/top_domains/test_domains-trie-inc.cc"
...@@ -1186,36 +1194,35 @@ TEST(IDNSpoofCheckerNoFixtureTest, UnsafeIDNToUnicodeWithDetails) { ...@@ -1186,36 +1194,35 @@ TEST(IDNSpoofCheckerNoFixtureTest, UnsafeIDNToUnicodeWithDetails) {
TEST(IDNSpoofCheckerNoFixtureTest, Skeletons) { TEST(IDNSpoofCheckerNoFixtureTest, Skeletons) {
// All of these should produce the same skeleton. Not all of these are // All of these should produce the same skeleton. Not all of these are
// explicitly mapped in idn_spoof_checker.cc, ICU already handles some. // explicitly mapped in idn_spoof_checker.cc, ICU already handles some.
const GURL kTestCases[] = { const char kDashSite[] = "test-site";
// U+2010 (Hyphen) const struct TestCase {
GURL("http://test‐site"), const GURL url;
// U+2011 (Non breaking hyphen) const char* const expected_skeleton;
GURL("http://test‑site"), } kTestCases[] = {
// U+2012 (Figure dash) {GURL("http://test‐site"), kDashSite}, // U+2010 (Hyphen)
GURL("http://test‒site"), {GURL("http://test‑site"), kDashSite}, // U+2011 (Non breaking hyphen)
// U+2013 (En dash) {GURL("http://test‒site"), kDashSite}, // U+2012 (Figure dash)
GURL("http://test–site"), {GURL("http://test–site"), kDashSite}, // U+2013 (En dash)
// U+2014 (Em dash) {GURL("http://test—site"), kDashSite}, // U+2014 (Em dash)
GURL("http://test—site"), {GURL("http://test―site"), kDashSite}, // U+2015 (Horizontal bar)
// U+2015 (Horizontal bar) {GURL("http://test一site"), kDashSite}, // U+4E00 (一)
GURL("http://test―site"), {GURL("http://test−site"), kDashSite}, // U+2212 (minus sign)
// U+4E00 (一) {GURL("http://test⸺site"), kDashSite}, // U+2E3A (two-em dash)
GURL("http://test一site"), {GURL("http://test⸻site"), kDashSite}, // U+2E3B (three-em dash)
// U+2212 (minus sign) {GURL("http://七est.net"), "test.net"},
GURL("http://test−site"), {GURL("http://丅est.net"), "test.net"},
// U+2E3A (two-em dash) {GURL("http://丆est.net"), "test.net"},
GURL("http://test⸺site"), {GURL("http://c丫xe.com"), "cyxe.corn"},
// U+2E3B (three-em dash) {GURL("http://cy乂e.com"), "cyxe.corn"},
GURL("http://test⸻site"), {GURL("http://丩3.com"), "43.corn"}};
};
IDNSpoofChecker checker; IDNSpoofChecker checker;
for (const GURL& url : kTestCases) { for (const TestCase& test_case : kTestCases) {
const url_formatter::IDNConversionResult result = const url_formatter::IDNConversionResult result =
UnsafeIDNToUnicodeWithDetails(url.host()); UnsafeIDNToUnicodeWithDetails(test_case.url.host());
Skeletons skeletons = checker.GetSkeletons(result.result); Skeletons skeletons = checker.GetSkeletons(result.result);
EXPECT_EQ(1u, skeletons.size()); EXPECT_EQ(1u, skeletons.size());
EXPECT_EQ("test-site", *skeletons.begin()); EXPECT_EQ(test_case.expected_skeleton, *skeletons.begin());
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment