Commit 3308f680 authored by Koji Ishii, committed by Commit Bot

Update Unicode data in Character class

This patch updates Unicode data in Character class:
1. UAX#50 Unicode Vertical Text[1] to rev 19 (Unicode 10).
2. IsCJKIdeographOrSymbol had not been updated for a while; add
   ideographic characters to bring it in sync with UAX#50.
3. IsCJKIdeographOrSymbol now includes all of Plane 2 rather than
   listing each block individually.

[1] http://www.unicode.org/reports/tr50/

Change-Id: Ie8e7f3b45d340870be5d1316021b49231be9cdd2
Reviewed-on: https://chromium-review.googlesource.com/822551
Commit-Queue: Koji Ishii <kojii@chromium.org>
Reviewed-by: Dominik Röttsches <drott@chromium.org>
Reviewed-by: Jungshik Shin <jshin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#524661}
parent ba876ca4
...@@ -110,11 +110,7 @@ bool Character::IsUprightInMixedVertical(UChar32 character) { ...@@ -110,11 +110,7 @@ bool Character::IsUprightInMixedVertical(UChar32 character) {
RETURN_HAS_PROPERTY(character, kIsUprightInMixedVertical) RETURN_HAS_PROPERTY(character, kIsUprightInMixedVertical)
} }
bool Character::IsCJKIdeographOrSymbol(UChar32 c) { bool Character::IsCJKIdeographOrSymbolSlow(UChar32 c) {
// Likely common case
if (c < 0x2C7)
return false;
RETURN_HAS_PROPERTY(c, kIsCJKIdeographOrSymbol) RETURN_HAS_PROPERTY(c, kIsCJKIdeographOrSymbol)
} }
...@@ -219,16 +215,12 @@ bool Character::CanTextDecorationSkipInk(UChar32 codepoint) { ...@@ -219,16 +215,12 @@ bool Character::CanTextDecorationSkipInk(UChar32 codepoint) {
switch (block) { switch (block) {
// These blocks contain CJK characters we don't want to skip ink, but are // These blocks contain CJK characters we don't want to skip ink, but are
// not ideograph that IsCJKIdeographOrSymbol() does not cover. // not ideograph that IsCJKIdeographOrSymbol() does not cover.
case UBLOCK_BOPOMOFO:
case UBLOCK_BOPOMOFO_EXTENDED:
case UBLOCK_HANGUL_JAMO: case UBLOCK_HANGUL_JAMO:
case UBLOCK_HANGUL_COMPATIBILITY_JAMO: case UBLOCK_HANGUL_COMPATIBILITY_JAMO:
case UBLOCK_HANGUL_SYLLABLES: case UBLOCK_HANGUL_SYLLABLES:
case UBLOCK_HANGUL_JAMO_EXTENDED_A: case UBLOCK_HANGUL_JAMO_EXTENDED_A:
case UBLOCK_HANGUL_JAMO_EXTENDED_B: case UBLOCK_HANGUL_JAMO_EXTENDED_B:
case UBLOCK_LINEAR_B_IDEOGRAMS: case UBLOCK_LINEAR_B_IDEOGRAMS:
case UBLOCK_TANGUT:
case UBLOCK_TANGUT_COMPONENTS:
return false; return false;
default: default:
return true; return true;
......
...@@ -63,7 +63,10 @@ class PLATFORM_EXPORT Character { ...@@ -63,7 +63,10 @@ class PLATFORM_EXPORT Character {
0xE01EF); // VARIATION SELECTOR-17 to 256 0xE01EF); // VARIATION SELECTOR-17 to 256
} }
static bool IsCJKIdeographOrSymbol(UChar32); static bool IsCJKIdeographOrSymbol(UChar32 c) {
// Below U+02C7 is likely a common case.
return c < 0x2C7 ? false : IsCJKIdeographOrSymbolSlow(c);
}
static bool IsCJKIdeographOrSymbolBase(UChar32 c) { static bool IsCJKIdeographOrSymbolBase(UChar32 c) {
return IsCJKIdeographOrSymbol(c) && return IsCJKIdeographOrSymbol(c) &&
!(U_GET_GC_MASK(c) & (U_GC_M_MASK | U_GC_LM_MASK | U_GC_SK_MASK)); !(U_GET_GC_MASK(c) & (U_GC_M_MASK | U_GC_LM_MASK | U_GC_SK_MASK));
...@@ -172,6 +175,9 @@ class PLATFORM_EXPORT Character { ...@@ -172,6 +175,9 @@ class PLATFORM_EXPORT Character {
static String NormalizeSpaces(const UChar*, unsigned length); static String NormalizeSpaces(const UChar*, unsigned length);
static bool IsCommonOrInheritedScript(UChar32); static bool IsCommonOrInheritedScript(UChar32);
private:
static bool IsCJKIdeographOrSymbolSlow(UChar32);
}; };
} // namespace blink } // namespace blink
......
...@@ -47,13 +47,10 @@ static const UChar32 kIsCJKIdeographOrSymbolRanges[] = { ...@@ -47,13 +47,10 @@ static const UChar32 kIsCJKIdeographOrSymbolRanges[] = {
0x4E00, 0x9FFF, 0x4E00, 0x9FFF,
// CJK Compatibility Ideographs. // CJK Compatibility Ideographs.
0xF900, 0xFAFF, 0xF900, 0xFAFF,
// CJK Unified Ideographs Extension B. // Unicode Plane 2: Supplementary Ideographic Plane. This plane includes:
0x20000, 0x2A6DF, // CJK Unified Ideographs Extension B to F.
// CJK Unified Ideographs Extension C.
// CJK Unified Ideographs Extension D.
0x2A700, 0x2B81F,
// CJK Compatibility Ideographs Supplement. // CJK Compatibility Ideographs Supplement.
0x2F800, 0x2FA1F, 0x20000, 0x2FFFF,
// cjkSymbolRanges // cjkSymbolRanges
0x2156, 0x215A, 0x2160, 0x216B, 0x2170, 0x217B, 0x231A, 0x231B, 0x23E9, 0x2156, 0x215A, 0x2160, 0x216B, 0x2170, 0x217B, 0x231A, 0x231B, 0x23E9,
...@@ -80,13 +77,29 @@ static const UChar32 kIsCJKIdeographOrSymbolRanges[] = { ...@@ -80,13 +77,29 @@ static const UChar32 kIsCJKIdeographOrSymbolRanges[] = {
0x3190, 0x31BF, 0x3190, 0x31BF,
// Enclosed CJK Letters and Months (0x3200 .. 0x32FF). // Enclosed CJK Letters and Months (0x3200 .. 0x32FF).
// CJK Compatibility (0x3300 .. 0x33FF). // CJK Compatibility (0x3300 .. 0x33FF).
0x3200, 0x33FF, 0xF860, 0xF862, 0x3200, 0x33FF,
// Yijing Hexagram Symbols
0x4DC0, 0x4DFF,
// http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/JAPANESE.TXT
0xF860, 0xF862,
// CJK Compatibility Forms. // CJK Compatibility Forms.
// Small Form Variants (for CNS 11643). // Small Form Variants (for CNS 11643).
0xFE30, 0xFE6F, 0xFE30, 0xFE6F,
// Halfwidth and Fullwidth Forms // Halfwidth and Fullwidth Forms
// Usually only used in CJK // Usually only used in CJK
0xFF00, 0xFF0C, 0xFF0E, 0xFF1A, 0xFF1F, 0xFFEF, 0xFF00, 0xFF0C, 0xFF0E, 0xFF1A, 0xFF1F, 0xFFEF,
// Ideographic Symbols and Punctuation
0x16FE0, 0x16FFF,
// Tangut
0x17000, 0x187FF,
// Tangut Components
0x18800, 0x18AFF,
// Kana Supplement
0x1B000, 0x1B0FF,
// Kana Extended-A
0x1B100, 0x1B12F,
// Nushu
0x1B170, 0x1B2FF,
// Emoji. // Emoji.
0x1F110, 0x1F129, 0x1F130, 0x1F149, 0x1F150, 0x1F169, 0x1F170, 0x1F189, 0x1F110, 0x1F129, 0x1F130, 0x1F149, 0x1F150, 0x1F169, 0x1F170, 0x1F189,
0x1F191, 0x1F19A, 0x1F1E6, 0x1F1FF, 0x1F200, 0x1F6FF, 0x1F191, 0x1F19A, 0x1F1E6, 0x1F1FF, 0x1F200, 0x1F6FF,
...@@ -158,6 +171,8 @@ static const UChar32 kIsUprightInMixedVerticalRanges[] = { ...@@ -158,6 +171,8 @@ static const UChar32 kIsUprightInMixedVerticalRanges[] = {
0x10980, 0x1099F, 0x10980, 0x1099F,
// Siddham // Siddham
0x11580, 0x115FF, 0x11580, 0x115FF,
// Zanabazar Square
0x11A00, 0x11AAF,
// Egyptian Hieroglyphs // Egyptian Hieroglyphs
0x13000, 0x1342F, 0x13000, 0x1342F,
// Anatolian Hieroglyphs // Anatolian Hieroglyphs
...@@ -170,6 +185,10 @@ static const UChar32 kIsUprightInMixedVerticalRanges[] = { ...@@ -170,6 +185,10 @@ static const UChar32 kIsUprightInMixedVerticalRanges[] = {
0x18800, 0x18AFF, 0x18800, 0x18AFF,
// Kana Supplement // Kana Supplement
0x1B000, 0x1B0FF, 0x1B000, 0x1B0FF,
// Kana Extended-A
0x1B100, 0x1B12F,
// Nushu
0x1B170, 0x1B2FF,
// Byzantine Musical Symbols/Musical Symbols // Byzantine Musical Symbols/Musical Symbols
0x1D000, 0x1D1FF, 0x1D000, 0x1D1FF,
// Tai Xuan Jing Symbols/Counting Rod Numerals // Tai Xuan Jing Symbols/Counting Rod Numerals
......
...@@ -34,21 +34,30 @@ TEST(CharacterTest, HammerEmojiVsCJKIdeographOrSymbol) { ...@@ -34,21 +34,30 @@ TEST(CharacterTest, HammerEmojiVsCJKIdeographOrSymbol) {
static void TestSpecificUChar32RangeIdeograph(UChar32 range_start, static void TestSpecificUChar32RangeIdeograph(UChar32 range_start,
UChar32 range_end, UChar32 range_end,
bool before = true) { bool before = true,
if (before) bool after = true) {
EXPECT_FALSE(Character::IsCJKIdeographOrSymbol(range_start - 1)); if (before) {
EXPECT_TRUE(Character::IsCJKIdeographOrSymbol(range_start)); EXPECT_FALSE(Character::IsCJKIdeographOrSymbol(range_start - 1))
EXPECT_TRUE(Character::IsCJKIdeographOrSymbol( << std::hex << (range_start - 1);
(UChar32)((uint64_t)range_start + (uint64_t)range_end) / 2)); }
EXPECT_TRUE(Character::IsCJKIdeographOrSymbol(range_end)); EXPECT_TRUE(Character::IsCJKIdeographOrSymbol(range_start))
EXPECT_FALSE(Character::IsCJKIdeographOrSymbol(range_end + 1)); << std::hex << range_start;
UChar32 mid = static_cast<UChar32>(
(static_cast<uint64_t>(range_start) + range_end) / 2);
EXPECT_TRUE(Character::IsCJKIdeographOrSymbol(mid)) << std::hex << mid;
EXPECT_TRUE(Character::IsCJKIdeographOrSymbol(range_end))
<< std::hex << range_end;
if (after) {
EXPECT_FALSE(Character::IsCJKIdeographOrSymbol(range_end + 1))
<< std::hex << (range_end + 1);
}
} }
TEST(CharacterTest, TestIsCJKIdeograph) { TEST(CharacterTest, TestIsCJKIdeograph) {
// The basic CJK Unified Ideographs block. // The basic CJK Unified Ideographs block.
TestSpecificUChar32RangeIdeograph(0x4E00, 0x9FFF); TestSpecificUChar32RangeIdeograph(0x4E00, 0x9FFF, false);
// CJK Unified Ideographs Extension A. // CJK Unified Ideographs Extension A.
TestSpecificUChar32RangeIdeograph(0x3400, 0x4DBF, false); TestSpecificUChar32RangeIdeograph(0x3400, 0x4DBF, false, false);
// CJK Unified Ideographs Extension A and Kangxi Radicals. // CJK Unified Ideographs Extension A and Kangxi Radicals.
TestSpecificUChar32RangeIdeograph(0x2E80, 0x2FDF); TestSpecificUChar32RangeIdeograph(0x2E80, 0x2FDF);
// CJK Strokes. // CJK Strokes.
...@@ -56,12 +65,12 @@ TEST(CharacterTest, TestIsCJKIdeograph) { ...@@ -56,12 +65,12 @@ TEST(CharacterTest, TestIsCJKIdeograph) {
// CJK Compatibility Ideographs. // CJK Compatibility Ideographs.
TestSpecificUChar32RangeIdeograph(0xF900, 0xFAFF); TestSpecificUChar32RangeIdeograph(0xF900, 0xFAFF);
// CJK Unified Ideographs Extension B. // CJK Unified Ideographs Extension B.
TestSpecificUChar32RangeIdeograph(0x20000, 0x2A6DF); TestSpecificUChar32RangeIdeograph(0x20000, 0x2A6DF, true, false);
// CJK Unified Ideographs Extension C. // CJK Unified Ideographs Extension C.
// CJK Unified Ideographs Extension D. // CJK Unified Ideographs Extension D.
TestSpecificUChar32RangeIdeograph(0x2A700, 0x2B81F); TestSpecificUChar32RangeIdeograph(0x2A700, 0x2B81F, false, false);
// CJK Compatibility Ideographs Supplement. // CJK Compatibility Ideographs Supplement.
TestSpecificUChar32RangeIdeograph(0x2F800, 0x2FA1F); TestSpecificUChar32RangeIdeograph(0x2F800, 0x2FA1F, false, false);
} }
static void TestSpecificUChar32RangeIdeographSymbol(UChar32 range_start, static void TestSpecificUChar32RangeIdeographSymbol(UChar32 range_start,
...@@ -218,6 +227,23 @@ TEST(CharacterTest, TestIsCJKIdeographOrSymbol) { ...@@ -218,6 +227,23 @@ TEST(CharacterTest, TestIsCJKIdeographOrSymbol) {
TestSpecificUChar32RangeIdeographSymbol(0x1F1E6, 0x1F6FF); TestSpecificUChar32RangeIdeographSymbol(0x1F1E6, 0x1F6FF);
} }
TEST(CharacterTest, CanTextDecorationSkipInk) {
// ASCII
EXPECT_TRUE(Character::CanTextDecorationSkipInk('a'));
// Hangul Jamo
EXPECT_FALSE(Character::CanTextDecorationSkipInk(0x1100));
// Hiragana
EXPECT_FALSE(Character::CanTextDecorationSkipInk(0x3041));
// Bopomofo
EXPECT_FALSE(Character::CanTextDecorationSkipInk(0x31A0));
// The basic CJK Unified Ideographs block
EXPECT_FALSE(Character::CanTextDecorationSkipInk(0x4E01));
// Hangul Syllables
EXPECT_FALSE(Character::CanTextDecorationSkipInk(0xAC00));
// Plane 2 / CJK Ideograph Extension B
EXPECT_FALSE(Character::CanTextDecorationSkipInk(0x20000));
}
TEST(CharacterTest, TestEmojiTextDefault) { TEST(CharacterTest, TestEmojiTextDefault) {
// Text-default emoji, i.e. // Text-default emoji, i.e.
// Emoji=Yes and EmojiPresentation=No // Emoji=Yes and EmojiPresentation=No
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment