Commit f98f5db3, authored by Dominik Röttsches, committed by Commit Bot

Update CachingWordShapeIterator to more reliably handle emoji sequences

Instead of individual tests for tag sequences, use newer ICU methods to
detect xpicto-sequences as defined in Unicode TR#29 (compare
https://www.unicode.org/reports/tr29/#Regex_Definitions), by using ICU's
UCHAR_EMOJI_COMPONENT and UCHAR_EXTENDED_PICTOGRAPHIC property checks.

Bug: 1121420
Change-Id: I37c9cab51ea88c6be85c2df637af03dcf68c568f
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2390341
Reviewed-by: Koji Ishii <kojii@chromium.org>
Reviewed-by: Yoshifumi Inoue <yosin@chromium.org>
Commit-Queue: Dominik Röttsches <drott@chromium.org>
Cr-Commit-Position: refs/heads/master@{#804168}
parent 48174d35
......@@ -125,16 +125,10 @@ class PLATFORM_EXPORT CachingWordShapeIterator final {
bool has_any_script = !Character::IsCommonOrInheritedScript(ch);
for (unsigned next_end = end; end < length; end = next_end) {
ch = text_run_.CodepointAtAndNext(next_end);
// If ZWJ, include the next character.
if (ch == kZeroWidthJoinerCharacter) {
if (next_end < length)
text_run_.CodepointAtAndNext(next_end);
continue;
}
// Modifier check in order not to split those Emoji sequences.
// Modifier check in order not to split Emoji sequences.
if (U_GET_GC_MASK(ch) & (U_GC_M_MASK | U_GC_LM_MASK | U_GC_SK_MASK) ||
Character::IsModifier(ch) || Character::IsEmojiTagSequence(ch) ||
ch == kCancelTag)
ch == kZeroWidthJoinerCharacter || Character::IsEmojiComponent(ch) ||
Character::IsExtendedPictographic(ch))
continue;
// Avoid delimiting COMMON/INHERITED alone, which makes it harder to
// identify the script.
......
......@@ -213,82 +213,36 @@ TEST_F(CachingWordShaperTest, SegmentCJKAndNonCJKCommon) {
ASSERT_FALSE(iterator.Next(&word_result));
}
// Feeds the iterator a man-woman-girl-boy ZWJ sequence immediately followed
// by a woman-heart-kiss mark-man ZWJ sequence (the latter containing a
// variation selector) and expects both to come back as one single word.
TEST_F(CachingWordShaperTest, SegmentEmojiZWJCommon) {
  const UChar kStr[] = {0xD83D, 0xDC68, 0x200D, 0xD83D, 0xDC69, 0x200D,
                        0xD83D, 0xDC67, 0x200D, 0xD83D, 0xDC66, 0xD83D,
                        0xDC69, 0x200D, 0x2764, 0xFE0F, 0x200D, 0xD83D,
                        0xDC8B, 0x200D, 0xD83D, 0xDC68, 0x0};
  // UTF-16 code units in kStr, not counting the trailing 0x0.
  const unsigned kNumCodeUnits = 22;
  TextRun text_run(kStr, kNumCodeUnits);

  scoped_refptr<const ShapeResult> shaped_word;
  CachingWordShapeIterator word_iterator(cache.get(), text_run, &font);

  // The first call yields the whole run; the second call has nothing left.
  ASSERT_TRUE(word_iterator.Next(&shaped_word));
  EXPECT_EQ(kNumCodeUnits, shaped_word->NumCharacters());
  ASSERT_FALSE(word_iterator.Next(&shaped_word));
}
// A ZWJ must keep the character that follows it inside the same "word" so
// that both sides of the joiner are shaped together.
TEST_F(CachingWordShaperTest, SegmentEmojiZWJ) {
  String zwj_sequence(u"\U0001F3F4\u200D\u2620\uFE0F");
  TextRun text_run(zwj_sequence);

  scoped_refptr<const ShapeResult> shaped_word;
  CachingWordShapeIterator word_iterator(cache.get(), text_run, &font);

  // Exactly one word covering the entire string is expected.
  ASSERT_TRUE(word_iterator.Next(&shaped_word));
  EXPECT_EQ(zwj_sequence.length(), shaped_word->NumCharacters());
  ASSERT_FALSE(word_iterator.Next(&shaped_word));
}
// Two back-to-back ZWJ sequences, each of the form
// U+1F468, U+1F3FB (modifier), ZWJ, <symbol>, U+FE0F, where <symbol> is
// U+2696 in the first sequence and U+2708 in the second. Both sequences are
// expected to be returned together as one word.
TEST_F(CachingWordShaperTest, SegmentEmojiPilotJudgeSequence) {
  const UChar kStr[] = {0xD83D, 0xDC68, 0xD83C, 0xDFFB, 0x200D, 0x2696, 0xFE0F,
                        0xD83D, 0xDC68, 0xD83C, 0xDFFB, 0x200D, 0x2708, 0xFE0F};
  // kStr has no terminator, so the run covers the whole array.
  const auto kNumCodeUnits = base::size(kStr);
  TextRun text_run(kStr, kNumCodeUnits);

  scoped_refptr<const ShapeResult> shaped_word;
  CachingWordShapeIterator word_iterator(cache.get(), text_run, &font);

  ASSERT_TRUE(word_iterator.Next(&shaped_word));
  EXPECT_EQ(kNumCodeUnits, shaped_word->NumCharacters());
  ASSERT_FALSE(word_iterator.Next(&shaped_word));
}
// A single ZWJ sequence: U+1F469, ZWJ, U+2764 U+FE0F (heart with variation
// selector), ZWJ, U+1F48B, ZWJ, U+1F468. The three joiners must not cause
// the sequence to be split into several words.
TEST_F(CachingWordShaperTest, SegmentEmojiHeartZWJSequence) {
  const UChar kStr[] = {0xD83D, 0xDC69, 0x200D, 0x2764, 0xFE0F, 0x200D,
                        0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC68, 0x0};
  // UTF-16 code units in kStr, not counting the trailing 0x0.
  const unsigned kNumCodeUnits = 11;
  TextRun text_run(kStr, kNumCodeUnits);

  scoped_refptr<const ShapeResult> shaped_word;
  CachingWordShapeIterator word_iterator(cache.get(), text_run, &font);

  ASSERT_TRUE(word_iterator.Next(&shaped_word));
  EXPECT_EQ(kNumCodeUnits, shaped_word->NumCharacters());
  ASSERT_FALSE(word_iterator.Next(&shaped_word));
}
TEST_F(CachingWordShaperTest, SegmentEmojiSignsOfHornsModifier) {
// A Sign of the Horns emoji, followed by a Fitzpatrick modifier.
const UChar kStr[] = {0xD83E, 0xDD18, 0xD83C, 0xDFFB, 0x0};
TextRun text_run(kStr, 4);
scoped_refptr<const ShapeResult> word_result;
CachingWordShapeIterator iterator(cache.get(), text_run, &font);
ASSERT_TRUE(iterator.Next(&word_result));
EXPECT_EQ(4u, word_result->NumCharacters());
ASSERT_FALSE(iterator.Next(&word_result));
// Table-driven check that CachingWordShapeIterator never splits an emoji
// sequence: every UTF-8 encoded entry below is converted to a String, wrapped
// in a TextRun and must be returned by the iterator as exactly one word whose
// character count equals the length of the whole string.
TEST_F(CachingWordShaperTest, SegmentEmojiSequences) {
  const std::vector<std::string> test_strings = {
      // A family ZWJ sequence followed by a woman, heart, kiss mark, man ZWJ
      // sequence, the latter including a variation selector.
      u8"\U0001f468\u200D\U0001f469\u200D\U0001f467\u200D\U0001f466\U0001f469"
      u8"\u200D\u2764\uFE0F\u200D\U0001f48b\u200D\U0001f468",
      // Pirate flag
      u8"\U0001F3F4\u200D\u2620\uFE0F",
      // Pilot, judge sequence
      u8"\U0001f468\U0001f3fb\u200D\u2696\uFE0F\U0001f468\U0001f3fb\u200D\u2708"
      u8"\uFE0F",
      // Woman, Kiss, Man sequence
      u8"\U0001f469\u200D\u2764\uFE0F\u200D\U0001f48b\u200D\U0001f468",
      // Signs of horns with skin tone modifier
      u8"\U0001f918\U0001f3fb",
      // Man, dark skin tone, red hair
      u8"\U0001f468\U0001f3ff\u200D\U0001f9b0"};

  // Iterate by const reference: the entries are only read, so copying each
  // std::string per iteration would be wasted work.
  for (const std::string& test_string : test_strings) {
    String emoji_string = String::FromUTF8(test_string);
    TextRun text_run(emoji_string);
    scoped_refptr<const ShapeResult> word_result;
    CachingWordShapeIterator iterator(cache.get(), text_run, &font);
    // First call: one word spanning the complete sequence.
    ASSERT_TRUE(iterator.Next(&word_result));
    EXPECT_EQ(emoji_string.length(), word_result->NumCharacters())
        << " Length mismatch for sequence: " << test_string;
    // Second call: nothing may be left over.
    ASSERT_FALSE(iterator.Next(&word_result));
  }
}
TEST_F(CachingWordShaperTest, SegmentEmojiExtraZWJPrefix) {
......
......@@ -224,6 +224,14 @@ bool Character::IsEmojiTagSequence(UChar32 c) {
(c >= kTagLatinSmallLetterA && c <= kTagLatinSmallLetterZ);
}
// Returns true if ICU reports the UCHAR_EXTENDED_PICTOGRAPHIC binary
// property for code point |c|.
bool Character::IsExtendedPictographic(UChar32 c) {
  const bool is_extended_pictographic =
      u_hasBinaryProperty(c, UCHAR_EXTENDED_PICTOGRAPHIC);
  return is_extended_pictographic;
}
// Returns true if ICU reports the UCHAR_EMOJI_COMPONENT binary property for
// code point |c|.
bool Character::IsEmojiComponent(UChar32 c) {
  const bool is_emoji_component =
      u_hasBinaryProperty(c, UCHAR_EMOJI_COMPONENT);
  return is_emoji_component;
}
template <typename CharacterType>
static inline String NormalizeSpacesInternal(const CharacterType* characters,
unsigned length) {
......
......@@ -154,6 +154,8 @@ class PLATFORM_EXPORT Character {
// Returns true if |c| lies in the inclusive code point range
// U+1F3FB..U+1F3FF.
static bool IsModifier(UChar32 c) {
  if (c < 0x1F3FB)
    return false;
  return c <= 0x1F3FF;
}
// http://www.unicode.org/reports/tr51/proposed.html#flag-emoji-tag-sequences
static bool IsEmojiTagSequence(UChar32);
// Returns whether the code point has ICU's UCHAR_EMOJI_COMPONENT binary
// property.
static bool IsEmojiComponent(UChar32);
// Returns whether the code point has ICU's UCHAR_EXTENDED_PICTOGRAPHIC binary
// property.
static bool IsExtendedPictographic(UChar32);
static inline UChar NormalizeSpaces(UChar character) {
if (TreatAsSpace(character))
......
......@@ -443,4 +443,28 @@ TEST(CharacterTest, IsVerticalMathCharacter) {
}
}
// Boundary checks for Character::IsExtendedPictographic: U+00A9 and U+3299
// must be reported as extended pictographic, while the code points directly
// before and after each of them must not.
TEST(CharacterTest, ExtendedPictographic) {
  const UChar32 kNotPictographic[] = {0x00A8, 0x00AA, 0x3298, 0x329A};
  const UChar32 kPictographic[] = {0x00A9, 0x3299};

  for (UChar32 code_point : kNotPictographic)
    EXPECT_FALSE(Character::IsExtendedPictographic(code_point));
  for (UChar32 code_point : kPictographic)
    EXPECT_TRUE(Character::IsExtendedPictographic(code_point));
}
// Checks Character::IsEmojiComponent against two tables: code points that
// must be rejected and code points that must be accepted.
TEST(CharacterTest, EmojiComponents) {
  const UChar32 kNonComponents[] = {
      0x22,   0x2B,   0x29,    0x40,    0x200C,  0x200E,
      0x20E2, 0x20E4, 0xFE0E,  0xFE1A,  0x1F1E5, 0x1f200,
      0x1f3fa, 0x1f400, 0x1f9Af, 0x1f9b4, 0xe001F, 0xe0080};
  const UChar32 kComponents[] = {
      0x23,   0x2a,   0x30,    0x39,    0x200d,
      0x20e3, 0xfe0f, 0x1f1e6, 0x1f1ff, 0x1f3fb,
      0x1f3ff, 0x1f9b0, 0x1f9b3, 0xe0020, 0xe007f};

  for (UChar32 code_point : kNonComponents) {
    EXPECT_FALSE(Character::IsEmojiComponent(code_point));
  }
  for (UChar32 code_point : kComponents) {
    EXPECT_TRUE(Character::IsEmojiComponent(code_point));
  }
}
} // namespace blink
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment