Commit 05672d8d authored by Koji Ishii's avatar Koji Ishii Committed by Commit Bot

Normalize Katakana to Hiragana in ScriptRunIterator

This patch normalizes Katakana to Hiragana in ScriptData, and
thus in ScriptRunIterator.

There are 3 ICU script code for kana; USCRIPT_HIRAGANA,
USCRIPT_KATAKANA, and USCRIPT_KATAKANA_OR_HIRAGANA. However,
OpenType has only one 'kana' for all these 3 script codes.

When shaping, HarfBuzz handles the normalization. However,
Blink splits the incoming string by the ICU scripts.
Splitting these 3 scripts is not useful but consumes CPU
and memory.

By normalizing in ScriptRunIterator, all types of Kana are
in single run.

timeToFirstContentfulPaint:layout shows ~3% improvements
https://pinpoint-dot-chromeperf.appspot.com/job/10d9760fe40000

though bink_perf.layout does not show much differences
https://pinpoint-dot-chromeperf.appspot.com/job/14f19c00140000

This is probably because Katakana characters are not used
much in `japanese-kokoro-insert` (written in ~100 years ago,)
while they are more used in modern pages.

Bug: 636993
Change-Id: If692bb575f2232b8fca10cd2ea87e3022debdf05
Reviewed-on: https://chromium-review.googlesource.com/c/1337650
Commit-Queue: Koji Ishii <kojii@chromium.org>
Reviewed-by: default avatarDominik Röttsches <drott@chromium.org>
Cr-Commit-Position: refs/heads/master@{#609119}
parent 73d50729
......@@ -10,6 +10,26 @@
namespace blink {
namespace {
// UScriptCode and OpenType script are not 1:1; specifically, both Hiragana and
// Katakana map to 'kana' in OpenType. They will be mapped correctly in
// HarfBuzz, but normalizing earlier helps to reduce splitting runs between
// these scripts.
// https://docs.microsoft.com/en-us/typography/opentype/spec/scripttags
inline UScriptCode getScriptForOpenType(UChar32 ch, UErrorCode* status) {
UScriptCode script = uscript_getScript(ch, status);
if (UNLIKELY(U_FAILURE(*status)))
return script;
if (UNLIKELY(script == USCRIPT_KATAKANA ||
script == USCRIPT_KATAKANA_OR_HIRAGANA)) {
return USCRIPT_HIRAGANA;
}
return script;
}
} // namespace
typedef ScriptData::PairedBracketType PairedBracketType;
constexpr int ScriptRunIterator::kMaxScriptCount;
......@@ -27,6 +47,10 @@ void ICUScriptData::GetScripts(UChar32 ch, UScriptCodeList& dst) const {
// regardless of the capacity passed to the call. So count can be greater
// than dst->size(), if a later version of the unicode data has more
// than kMaxScriptCount items.
// |uscript_getScriptExtensions| do not need to be collated to
// USCRIPT_HIRAGANA because when ScriptExtensions contains Kana, it contains
// Hira as well, and Hira is always before Kana.
int count = uscript_getScriptExtensions(ch, &dst[0], dst.size(), &status);
if (status == U_BUFFER_OVERFLOW_ERROR) {
// Allow this, we'll just use what we have.
......@@ -35,7 +59,7 @@ void ICUScriptData::GetScripts(UChar32 ch, UScriptCodeList& dst) const {
count = dst.size();
status = U_ZERO_ERROR;
}
UScriptCode primary_script = uscript_getScript(ch, &status);
UScriptCode primary_script = getScriptForOpenType(ch, &status);
if (U_FAILURE(status)) {
DLOG(ERROR) << "Could not get icu script data: " << status << " for 0x"
......
......@@ -370,6 +370,50 @@ TEST_F(ScriptRunIteratorTest, Chinese) {
CHECK_SCRIPT_RUNS({{"萬國碼", USCRIPT_HAN}});
}
struct JapaneseMixedScript {
const char* string;
// The expected primary_script when the string alone was evaluated.
UScriptCode script;
} japanese_mixed_scripts[] = {{"あ", USCRIPT_HIRAGANA},
// Katakana should be normalized to Hiragana
{"ア", USCRIPT_HIRAGANA},
// Script_Extensions=Hira Kana
{"\u30FC", USCRIPT_HIRAGANA},
// Script_Extensions=Hani Hira Kana
{"\u303C", USCRIPT_HAN},
// Script_Extensions=Bopo Hang Hani Hira Kana
{"\u3003", USCRIPT_BOPOMOFO},
// Script_Extensions=Bopo Hang Hani Hira Kana Yiii
{"\u3001", USCRIPT_BOPOMOFO}};
class JapaneseMixedScriptTest
: public ScriptRunIteratorTest,
public testing::WithParamInterface<JapaneseMixedScript> {};
INSTANTIATE_TEST_CASE_P(ScriptRunIteratorTest,
JapaneseMixedScriptTest,
testing::ValuesIn(japanese_mixed_scripts));
TEST_P(JapaneseMixedScriptTest, Data) {
const auto& data = GetParam();
std::string string(data.string);
CheckRuns({{string.data(), data.script}});
// If the string follows Hiragana or Katakana, or is followed by Hiragnaa or
// Katakana, it should be normalized as Hiragana.
std::string hiragana("か");
std::string katakana("カ");
CheckRuns({{(hiragana + string).data(), USCRIPT_HIRAGANA}});
CheckRuns({{(string + hiragana).data(), USCRIPT_HIRAGANA}});
CheckRuns({{(katakana + string).data(), USCRIPT_HIRAGANA}});
CheckRuns({{(string + katakana).data(), USCRIPT_HIRAGANA}});
CheckRuns({{(hiragana + string + katakana).data(), USCRIPT_HIRAGANA}});
CheckRuns({{(katakana + string + hiragana).data(), USCRIPT_HIRAGANA}});
}
// Close bracket without matching open is ignored
TEST_F(ScriptRunIteratorTest, UnbalancedParens1) {
CHECK_SCRIPT_RUNS(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment