Commit 77d0cf6c authored by meacer, committed by Commit Bot

IDN Display: Add whole-script-confusable characters for Georgian

This CL adds a list of Georgian characters that look like Latin
characters. A domain that consists only of these characters and is not
under the .ge ccTLD or another Georgian TLD such as .გე will be
displayed as punycode, as is already done for other whole-script
confusables such as Cyrillic and Hebrew.

This CL also reorders and regroups some of the recent test cases.

No popular domain is affected by this change.

Bug: 722167
Change-Id: Ib68b3ef68ac55576141b9cb2cd6096d149456227
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1990326
Commit-Queue: Mustafa Emre Acer <meacer@chromium.org>
Commit-Queue: Joe DeBlasio <jdeblasio@chromium.org>
Reviewed-by: Joe DeBlasio <jdeblasio@chromium.org>
Reviewed-by: Mustafa Emre Acer <meacer@chromium.org>
Cr-Commit-Position: refs/heads/master@{#732564}
parent 352a82ac
...@@ -193,33 +193,37 @@ IDNSpoofChecker::IDNSpoofChecker() { ...@@ -193,33 +193,37 @@ IDNSpoofChecker::IDNSpoofChecker() {
combining_diacritics_exceptions_.freeze(); combining_diacritics_exceptions_.freeze();
const WholeScriptConfusableData kWholeScriptConfusables[] = { const WholeScriptConfusableData kWholeScriptConfusables[] = {
{// Armenian
"[[:Armn:]]",
"[ագզէլհյոսւօՙ]",
{"am"}},
{// Cyrillic {// Cyrillic
"[[:Cyrl:]]", "[[:Cyrl:]]",
"[аысԁеԍһіюјӏорԗԛѕԝхуъЬҽпгѵѡ]", "[аысԁеԍһіюјӏорԗԛѕԝхуъЬҽпгѵѡ]",
// TLDs containing most of the Cyrillic domains. // TLDs containing most of the Cyrillic domains.
{"bg", "by", "kz", "pyc", "ru", "su", "ua", "uz"}}, {"bg", "by", "kz", "pyc", "ru", "su", "ua", "uz"}},
{// Hebrew
"[[:Hebr:]]",
"[דוחיןסװײ׳ﬦ]",
// TLDs containing most of the Hebrew domains.
{"il"}},
{// Ethiopic (Ge'ez). Variants of these characters such as ሁ and ሡ could {// Ethiopic (Ge'ez). Variants of these characters such as ሁ and ሡ could
// arguably be added to this list. However, we are only restricting // arguably be added to this list. However, we are only restricting
// the more obvious characters to keep the list short and to reduce the // the more obvious characters to keep the list short and to reduce the
// probability of false positives. // probability of false positives.
// (Potential set: [ሀሁሃሠሡሰሱሲስበቡቢተቱቲታነከኩኪካኬክዐዑዕዖዘዙዚዛዝዞጠጡጢጣጦፐፒꬁꬂꬅ]) // Potential set: [ሀሁሃሠሡሰሱሲስበቡቢተቱቲታነከኩኪካኬክዐዑዕዖዘዙዚዛዝዞጠጡጢጣጦፐፒꬁꬂꬅ]
"[[:Ethi:]]", "[[:Ethi:]]",
"[ሀሠሰስበነተከዐዕዘጠፐꬅ]", "[ሀሠሰስበነተከዐዕዘጠፐꬅ]",
{"er", "et"}}, {"er", "et"}},
{// Georgian
"[[:Geor:]]",
"[იოყძხჽჿ]",
{"ge"}},
{// Greek {// Greek
"[[:Grek:]]", "[[:Grek:]]",
// This ignores variants such as ά, έ, ή, ί. // This ignores variants such as ά, έ, ή, ί.
"[αικνρυωηοτ]", "[αικνρυωηοτ]",
{"gr"}}, {"gr"}},
{// Armenian {// Hebrew
"[[:Armn:]]", "[[:Hebr:]]",
"[ագզէլհյոսւօՙ]", "[דוחיןסװײ׳ﬦ]",
{"am"}}, // TLDs containing most of the Hebrew domains.
{"il"}},
// Indic scripts in the recommended set. No ccTLDs are allowlisted. // Indic scripts in the recommended set. No ccTLDs are allowlisted.
{// Bengali {// Bengali
"[[:Beng:]]", "[০৭]"}, "[[:Beng:]]", "[০৭]"},
......
...@@ -1201,40 +1201,46 @@ const IDNTestCase kIdnCases[] = { ...@@ -1201,40 +1201,46 @@ const IDNTestCase kIdnCases[] = {
{"xn--googlecom-g040a.com", L"google讠com.com", kUnsafe}, // (U+8BA0) {"xn--googlecom-g040a.com", L"google讠com.com", kUnsafe}, // (U+8BA0)
{"xn--googlecom-b85n.com", L"google丁com.com", kUnsafe}, // (U+4E01) {"xn--googlecom-b85n.com", L"google丁com.com", kUnsafe}, // (U+4E01)
{"xn--7dbh4a.com", L"חסד.com", kUnsafe}, // Whole-script-confusables. Cyrillic is sufficiently handled in cases above
{"xn--7dbh4a.il", L"חסד.il", kSafe}, // so it's not included here.
// Armenian:
// Whole-script-confusable in Ethiopic. {"xn--mbbkpm.com", L"ոսւօ.com", kUnsafe},
{"xn--mbbkpm.am", L"ոսւօ.am", kSafe},
{"xn--mbbkpm.xn--y9a3aq", L"ոսւօ.հայ", kSafe},
// Ethiopic:
{"xn--6xd66aa62c.com", L"ሠዐዐፐ.com", kUnsafe}, {"xn--6xd66aa62c.com", L"ሠዐዐፐ.com", kUnsafe},
{"xn--6xd66aa62c.et", L"ሠዐዐፐ.et", kSafe}, {"xn--6xd66aa62c.et", L"ሠዐዐፐ.et", kSafe},
{"xn--6xd66aa62c.xn--m0d3gwjla96a", L"ሠዐዐፐ.ኢትዮጵያ", kSafe},
// Whole-script-confusable in Greek. // Greek:
{"xn--mxapd.com", L"ικα.com", kUnsafe}, {"xn--mxapd.com", L"ικα.com", kUnsafe},
{"xn--mxapd.gr", L"ικα.gr", kSafe}, {"xn--mxapd.gr", L"ικα.gr", kSafe},
{"xn--mxapd.xn--qxam", L"ικα.ελ", kSafe}, {"xn--mxapd.xn--qxam", L"ικα.ελ", kSafe},
// Georgian:
// Whole-script-confusable in Armenian. {"xn--gpd3ag.com", L"ჽჿხ.com", kUnsafe},
{"xn--mbbkpm.com", L"ոսւօ.com", kUnsafe}, {"xn--gpd3ag.ge", L"ჽჿხ.ge", kSafe},
{"xn--mbbkpm.am", L"ոսւօ.am", kSafe}, {"xn--gpd3ag.xn--node", L"ჽჿხ.გე", kSafe},
{"xn--mbbkpm.xn--y9a3aq", L"ոսւօ.հայ", kSafe}, // Hebrew.
{"xn--7dbh4a.com", L"חסד.com", kUnsafe},
// Whole-script-confusable in Bengali: {"xn--7dbh4a.il", L"חסד.il", kSafe},
{"xn--9dbq2a.xn--7dbh4a", L"קום.חסד", kSafe},
// Indic scripts:
// Bengali:
{"xn--07baub.com", L"০৭০৭.com", kUnsafe}, {"xn--07baub.com", L"০৭০৭.com", kUnsafe},
// Whole-script-confusable in Devanagari: // Devanagari:
{"xn--62ba6j.com", L"ऽ०ऽ.com", kUnsafe}, {"xn--62ba6j.com", L"ऽ०ऽ.com", kUnsafe},
// Whole-script-confusable in Gujarati: // Gujarati:
{"xn--becd.com", L"ડટ.com", kUnsafe}, {"xn--becd.com", L"ડટ.com", kUnsafe},
// Whole-script-confusable in Gurmukhi: // Gurmukhi:
{"xn--occacb.com", L"੦੧੦੧.com", kUnsafe}, {"xn--occacb.com", L"੦੧੦੧.com", kUnsafe},
// Whole-script-confusable in Kannada: // Kannada:
{"xn--stca6jf.com", L"ಽ೦ಽ೧.com", kUnsafe}, {"xn--stca6jf.com", L"ಽ೦ಽ೧.com", kUnsafe},
// Whole-script-confusable in Malayalam: // Malayalam:
{"xn--lwccv.com", L"ടഠധ.com", kUnsafe}, {"xn--lwccv.com", L"ടഠധ.com", kUnsafe},
// Whole-script-confusable in Oriya: // Oriya:
{"xn--zhca6ub.com", L"୮ଠ୮ଠ.com", kUnsafe}, {"xn--zhca6ub.com", L"୮ଠ୮ଠ.com", kUnsafe},
// Whole-script-confusable in Tamil: // Tamil:
{"xn--mlca6ab.com", L"டபடப.com", kUnsafe}, {"xn--mlca6ab.com", L"டபடப.com", kUnsafe},
// Whole-script-confusable in Telugu: // Telugu:
{"xn--brcaabbb.com", L"౧౦౧౦౧౦.com", kUnsafe}, {"xn--brcaabbb.com", L"౧౦౧౦౧౦.com", kUnsafe},
// IDN domain matching an IDN top-domain (fóó.com) // IDN domain matching an IDN top-domain (fóó.com)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment