Commit edf3a719 authored by Maxim Kolosovskiy's avatar Maxim Kolosovskiy Committed by Commit Bot

Revert "Normalize Katakana to Hiragana in ScriptRunIterator"

This reverts commit 05672d8d.

Reason for revert: Test failures https://bugs.chromium.org/p/chromium/issues/detail?id=906587

Original change's description:
> Normalize Katakana to Hiragana in ScriptRunIterator
> 
> This patch normalizes Katakana to Hiragana in ScriptData, and
> thus in ScriptRunIterator.
> 
> There are 3 ICU script code for kana; USCRIPT_HIRAGANA,
> USCRIPT_KATAKANA, and USCRIPT_KATAKANA_OR_HIRAGANA. However,
> OpenType has only one 'kana' for all these 3 script codes.
> 
> When shaping, HarfBuzz handles the normalization. However,
> Blink splits the incoming string by the ICU scripts.
> Splitting these 3 scripts is not useful but consumes CPU
> and memory.
> 
> By normalizing in ScriptRunIterator, all types of Kana are
> in single run.
> 
> timeToFirstContentfulPaint:layout shows ~3% improvements
> https://pinpoint-dot-chromeperf.appspot.com/job/10d9760fe40000
> 
> though bink_perf.layout does not show much differences
> https://pinpoint-dot-chromeperf.appspot.com/job/14f19c00140000
> 
> This is probably because Katakana characters are not used
> much in `japanese-kokoro-insert` (written in ~100 years ago,)
> while they are more used in modern pages.
> 
> Bug: 636993
> Change-Id: If692bb575f2232b8fca10cd2ea87e3022debdf05
> Reviewed-on: https://chromium-review.googlesource.com/c/1337650
> Commit-Queue: Koji Ishii <kojii@chromium.org>
> Reviewed-by: Dominik Röttsches <drott@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#609119}

TBR=eae@chromium.org,kojii@chromium.org,drott@chromium.org

# Not skipping CQ checks because original CL landed > 1 day ago.

Bug: 636993
Change-Id: Icb14d6ad2a3453c9bfab9287e5c0a4995ac1c97f
Reviewed-on: https://chromium-review.googlesource.com/c/1341519Reviewed-by: default avatarMaxim Kolosovskiy <kolos@chromium.org>
Commit-Queue: Maxim Kolosovskiy <kolos@chromium.org>
Cr-Commit-Position: refs/heads/master@{#609266}
parent 3038dbde
......@@ -10,26 +10,6 @@
namespace blink {
namespace {
// UScriptCode and OpenType script are not 1:1; specifically, both Hiragana and
// Katakana map to 'kana' in OpenType. They will be mapped correctly in
// HarfBuzz, but normalizing earlier helps to reduce splitting runs between
// these scripts.
// https://docs.microsoft.com/en-us/typography/opentype/spec/scripttags
inline UScriptCode getScriptForOpenType(UChar32 ch, UErrorCode* status) {
UScriptCode script = uscript_getScript(ch, status);
if (UNLIKELY(U_FAILURE(*status)))
return script;
if (UNLIKELY(script == USCRIPT_KATAKANA ||
script == USCRIPT_KATAKANA_OR_HIRAGANA)) {
return USCRIPT_HIRAGANA;
}
return script;
}
} // namespace
typedef ScriptData::PairedBracketType PairedBracketType;
constexpr int ScriptRunIterator::kMaxScriptCount;
......@@ -47,10 +27,6 @@ void ICUScriptData::GetScripts(UChar32 ch, UScriptCodeList& dst) const {
// regardless of the capacity passed to the call. So count can be greater
// than dst->size(), if a later version of the unicode data has more
// than kMaxScriptCount items.
// |uscript_getScriptExtensions| do not need to be collated to
// USCRIPT_HIRAGANA because when ScriptExtensions contains Kana, it contains
// Hira as well, and Hira is always before Kana.
int count = uscript_getScriptExtensions(ch, &dst[0], dst.size(), &status);
if (status == U_BUFFER_OVERFLOW_ERROR) {
// Allow this, we'll just use what we have.
......@@ -59,7 +35,7 @@ void ICUScriptData::GetScripts(UChar32 ch, UScriptCodeList& dst) const {
count = dst.size();
status = U_ZERO_ERROR;
}
UScriptCode primary_script = getScriptForOpenType(ch, &status);
UScriptCode primary_script = uscript_getScript(ch, &status);
if (U_FAILURE(status)) {
DLOG(ERROR) << "Could not get icu script data: " << status << " for 0x"
......
......@@ -370,50 +370,6 @@ TEST_F(ScriptRunIteratorTest, Chinese) {
CHECK_SCRIPT_RUNS({{"萬國碼", USCRIPT_HAN}});
}
struct JapaneseMixedScript {
const char* string;
// The expected primary_script when the string alone was evaluated.
UScriptCode script;
} japanese_mixed_scripts[] = {{"あ", USCRIPT_HIRAGANA},
// Katakana should be normalized to Hiragana
{"ア", USCRIPT_HIRAGANA},
// Script_Extensions=Hira Kana
{"\u30FC", USCRIPT_HIRAGANA},
// Script_Extensions=Hani Hira Kana
{"\u303C", USCRIPT_HAN},
// Script_Extensions=Bopo Hang Hani Hira Kana
{"\u3003", USCRIPT_BOPOMOFO},
// Script_Extensions=Bopo Hang Hani Hira Kana Yiii
{"\u3001", USCRIPT_BOPOMOFO}};
class JapaneseMixedScriptTest
: public ScriptRunIteratorTest,
public testing::WithParamInterface<JapaneseMixedScript> {};
INSTANTIATE_TEST_CASE_P(ScriptRunIteratorTest,
JapaneseMixedScriptTest,
testing::ValuesIn(japanese_mixed_scripts));
TEST_P(JapaneseMixedScriptTest, Data) {
const auto& data = GetParam();
std::string string(data.string);
CheckRuns({{string.data(), data.script}});
// If the string follows Hiragana or Katakana, or is followed by Hiragnaa or
// Katakana, it should be normalized as Hiragana.
std::string hiragana("か");
std::string katakana("カ");
CheckRuns({{(hiragana + string).data(), USCRIPT_HIRAGANA}});
CheckRuns({{(string + hiragana).data(), USCRIPT_HIRAGANA}});
CheckRuns({{(katakana + string).data(), USCRIPT_HIRAGANA}});
CheckRuns({{(string + katakana).data(), USCRIPT_HIRAGANA}});
CheckRuns({{(hiragana + string + katakana).data(), USCRIPT_HIRAGANA}});
CheckRuns({{(katakana + string + hiragana).data(), USCRIPT_HIRAGANA}});
}
// Close bracket without matching open is ignored
TEST_F(ScriptRunIteratorTest, UnbalancedParens1) {
CHECK_SCRIPT_RUNS(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment