Commit 36c08c1f authored by Koji Ishii's avatar Koji Ishii Committed by Commit Bot

Reland "Normalize Katakana to Hiragana in ScriptRunIterator"

This is a reland of 05672d8d

The original change was reverted as suspected to cause
Android CFI builder to fail:

[ RUN      ] RenderViewImplTest.PreferredSizeZoomed
[FATAL:data_pack.cc(444)] Check failed: !handle->HasResource(resource_id). Duplicate resource 25400 with scale 1

I cannot find any relationship to the failure, and
android_cfi_rel_ng trybot passes. Relanding to see if this
was really the cause.

Original change's description:
> Normalize Katakana to Hiragana in ScriptRunIterator
>
> This patch normalizes Katakana to Hiragana in ScriptData, and
> thus in ScriptRunIterator.
>
> There are 3 ICU script code for kana; USCRIPT_HIRAGANA,
> USCRIPT_KATAKANA, and USCRIPT_KATAKANA_OR_HIRAGANA. However,
> OpenType has only one 'kana' for all these 3 script codes.
>
> When shaping, HarfBuzz handles the normalization. However,
> Blink splits the incoming string by the ICU scripts.
> Splitting these 3 scripts is not useful but consumes CPU
> and memory.
>
> By normalizing in ScriptRunIterator, all types of Kana are
> in single run.
>
> timeToFirstContentfulPaint:layout shows ~3% improvements
> https://pinpoint-dot-chromeperf.appspot.com/job/10d9760fe40000
>
> though bink_perf.layout does not show much differences
> https://pinpoint-dot-chromeperf.appspot.com/job/14f19c00140000
>
> This is probably because Katakana characters are not used
> much in `japanese-kokoro-insert` (written in ~100 years ago,)
> while they are more used in modern pages.
>
> Bug: 636993
> Change-Id: If692bb575f2232b8fca10cd2ea87e3022debdf05
> Reviewed-on: https://chromium-review.googlesource.com/c/1337650
> Commit-Queue: Koji Ishii <kojii@chromium.org>
> Reviewed-by: Dominik Röttsches <drott@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#609119}

TBR=drott@chromium.org

Bug: 636993
Change-Id: I1d741fa47556c5e7bf47e3badcdcf60a81fbc22b
Reviewed-on: https://chromium-review.googlesource.com/c/1343583Reviewed-by: default avatarKoji Ishii <kojii@chromium.org>
Commit-Queue: Koji Ishii <kojii@chromium.org>
Cr-Commit-Position: refs/heads/master@{#609680}
parent 478a17a5
...@@ -10,6 +10,26 @@ ...@@ -10,6 +10,26 @@
namespace blink { namespace blink {
namespace {
// UScriptCode and OpenType script are not 1:1; specifically, both Hiragana and
// Katakana map to 'kana' in OpenType. They will be mapped correctly in
// HarfBuzz, but normalizing earlier helps to reduce splitting runs between
// these scripts.
// https://docs.microsoft.com/en-us/typography/opentype/spec/scripttags
inline UScriptCode getScriptForOpenType(UChar32 ch, UErrorCode* status) {
UScriptCode script = uscript_getScript(ch, status);
if (UNLIKELY(U_FAILURE(*status)))
return script;
if (UNLIKELY(script == USCRIPT_KATAKANA ||
script == USCRIPT_KATAKANA_OR_HIRAGANA)) {
return USCRIPT_HIRAGANA;
}
return script;
}
} // namespace
typedef ScriptData::PairedBracketType PairedBracketType; typedef ScriptData::PairedBracketType PairedBracketType;
constexpr int ScriptRunIterator::kMaxScriptCount; constexpr int ScriptRunIterator::kMaxScriptCount;
...@@ -27,6 +47,10 @@ void ICUScriptData::GetScripts(UChar32 ch, UScriptCodeList& dst) const { ...@@ -27,6 +47,10 @@ void ICUScriptData::GetScripts(UChar32 ch, UScriptCodeList& dst) const {
// regardless of the capacity passed to the call. So count can be greater // regardless of the capacity passed to the call. So count can be greater
// than dst->size(), if a later version of the unicode data has more // than dst->size(), if a later version of the unicode data has more
// than kMaxScriptCount items. // than kMaxScriptCount items.
// |uscript_getScriptExtensions| do not need to be collated to
// USCRIPT_HIRAGANA because when ScriptExtensions contains Kana, it contains
// Hira as well, and Hira is always before Kana.
int count = uscript_getScriptExtensions(ch, &dst[0], dst.size(), &status); int count = uscript_getScriptExtensions(ch, &dst[0], dst.size(), &status);
if (status == U_BUFFER_OVERFLOW_ERROR) { if (status == U_BUFFER_OVERFLOW_ERROR) {
// Allow this, we'll just use what we have. // Allow this, we'll just use what we have.
...@@ -35,7 +59,7 @@ void ICUScriptData::GetScripts(UChar32 ch, UScriptCodeList& dst) const { ...@@ -35,7 +59,7 @@ void ICUScriptData::GetScripts(UChar32 ch, UScriptCodeList& dst) const {
count = dst.size(); count = dst.size();
status = U_ZERO_ERROR; status = U_ZERO_ERROR;
} }
UScriptCode primary_script = uscript_getScript(ch, &status); UScriptCode primary_script = getScriptForOpenType(ch, &status);
if (U_FAILURE(status)) { if (U_FAILURE(status)) {
DLOG(ERROR) << "Could not get icu script data: " << status << " for 0x" DLOG(ERROR) << "Could not get icu script data: " << status << " for 0x"
......
...@@ -370,6 +370,50 @@ TEST_F(ScriptRunIteratorTest, Chinese) { ...@@ -370,6 +370,50 @@ TEST_F(ScriptRunIteratorTest, Chinese) {
CHECK_SCRIPT_RUNS({{"萬國碼", USCRIPT_HAN}}); CHECK_SCRIPT_RUNS({{"萬國碼", USCRIPT_HAN}});
} }
struct JapaneseMixedScript {
const char* string;
// The expected primary_script when the string alone was evaluated.
UScriptCode script;
} japanese_mixed_scripts[] = {{"あ", USCRIPT_HIRAGANA},
// Katakana should be normalized to Hiragana
{"ア", USCRIPT_HIRAGANA},
// Script_Extensions=Hira Kana
{"\u30FC", USCRIPT_HIRAGANA},
// Script_Extensions=Hani Hira Kana
{"\u303C", USCRIPT_HAN},
// Script_Extensions=Bopo Hang Hani Hira Kana
{"\u3003", USCRIPT_BOPOMOFO},
// Script_Extensions=Bopo Hang Hani Hira Kana Yiii
{"\u3001", USCRIPT_BOPOMOFO}};
class JapaneseMixedScriptTest
: public ScriptRunIteratorTest,
public testing::WithParamInterface<JapaneseMixedScript> {};
INSTANTIATE_TEST_CASE_P(ScriptRunIteratorTest,
JapaneseMixedScriptTest,
testing::ValuesIn(japanese_mixed_scripts));
TEST_P(JapaneseMixedScriptTest, Data) {
const auto& data = GetParam();
std::string string(data.string);
CheckRuns({{string.data(), data.script}});
// If the string follows Hiragana or Katakana, or is followed by Hiragnaa or
// Katakana, it should be normalized as Hiragana.
std::string hiragana("か");
std::string katakana("カ");
CheckRuns({{(hiragana + string).data(), USCRIPT_HIRAGANA}});
CheckRuns({{(string + hiragana).data(), USCRIPT_HIRAGANA}});
CheckRuns({{(katakana + string).data(), USCRIPT_HIRAGANA}});
CheckRuns({{(string + katakana).data(), USCRIPT_HIRAGANA}});
CheckRuns({{(hiragana + string + katakana).data(), USCRIPT_HIRAGANA}});
CheckRuns({{(katakana + string + hiragana).data(), USCRIPT_HIRAGANA}});
}
// Close bracket without matching open is ignored // Close bracket without matching open is ignored
TEST_F(ScriptRunIteratorTest, UnbalancedParens1) { TEST_F(ScriptRunIteratorTest, UnbalancedParens1) {
CHECK_SCRIPT_RUNS( CHECK_SCRIPT_RUNS(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment