Commit d07fc52f authored by Etienne Bergeron's avatar Etienne Bergeron Committed by Commit Bot

Fix script intersection when performing ItemizeTextToRuns

This CL applying multiple change to fix the way ItemizeTextToRuns
split a sequence of codepoints into runs with same/compatible
scripts.

 *) Increase kMaxScripts to support codepoints with large amount of scripts.
    Some codepoint has much more than the previous maximum 5 scripts
    (see indic_fraction test u+A830, dev_danda u+0964).

 *) Only rely on Script extensions to find longest script interval
    see: http://www.unicode.org/reports/tr24/tr24-29.html
    (more specifically Table 7. Script_Extensions Examples)

    Cases where script was 'Common' and Script Extensions has a set of
    explicit script was not handled properly.

                 Script       Script Extensions     Results
     Previously: Common       {Hira Kana}       ->  {Hira Kana Common}
     Currently:  Common       {Hira Kana}       ->  {Hira Kana}

    This mistake has the consequene to put lot of codepoints from different
    scripts into the same bucket ('common') and there won't be a font that
    can renders (too) different scripts. Which is causing expensive fallback
    fonts lookups.

    ""
      U+30FC  Scx = {Hira Kana}  Script = Common

      Example 1. Mixed script detection for spoofing.
      Using the Script property alone, for example, will not detect that the U+30FC
      KATAKANA-HIRAGANA PROLONGED SOUND MARK (Script=Common) should not be mixed with
      Latin.
    ""

  *) Inherited script should only be applied to the previous codepoint.
     It was incorrectly intersected with the previous or the next one.

[ RUN      ] ItemizeTextToRunsScripts/RenderTextTestWithRunListCase.ItemizeTextToRuns/diac_lat
../../ui/gfx/render_text_unittest.cc(930): error: Expected equality of these values:
  param.expected
    Which is: "[0][1->2]"
  GetRunListStructureString()
    Which is: "[0->2]"

    That was causing more codepoints to be appended to long sequence of script
    when type was 'Common'.


Bug: 1017194
Change-Id: Ia1339697a44d0d7bbac389997e9fa2878e6eac1d
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1876489Reviewed-by: default avatarRobert Liao <robliao@chromium.org>
Reviewed-by: default avatarAlexei Svitkine <asvitkine@chromium.org>
Commit-Queue: Etienne Bergeron <etienneb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#710416}
parent 603a50e0
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#include "base/command_line.h" #include "base/command_line.h"
#include "base/containers/mru_cache.h" #include "base/containers/mru_cache.h"
#include "base/containers/span.h"
#include "base/feature_list.h" #include "base/feature_list.h"
#include "base/hash/hash.h" #include "base/hash/hash.h"
#include "base/i18n/base_i18n_switches.h" #include "base/i18n/base_i18n_switches.h"
...@@ -20,6 +21,7 @@ ...@@ -20,6 +21,7 @@
#include "base/message_loop/message_loop_current.h" #include "base/message_loop/message_loop_current.h"
#include "base/metrics/histogram_macros.h" #include "base/metrics/histogram_macros.h"
#include "base/no_destructor.h" #include "base/no_destructor.h"
#include "base/stl_util.h"
#include "base/strings/string_number_conversions.h" #include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h" #include "base/strings/string_util.h"
#include "base/strings/stringprintf.h" #include "base/strings/stringprintf.h"
...@@ -67,7 +69,7 @@ const size_t kMaxTextLength = 10000; ...@@ -67,7 +69,7 @@ const size_t kMaxTextLength = 10000;
// The maximum number of scripts a Unicode character can belong to. This value // The maximum number of scripts a Unicode character can belong to. This value
// is arbitrarily chosen to be a good limit because it is unlikely for a single // is arbitrarily chosen to be a good limit because it is unlikely for a single
// character to belong to more scripts. // character to belong to more scripts.
const size_t kMaxScripts = 5; const size_t kMaxScripts = 32;
// Font fallback mechanism used to Shape runs (see ShapeRuns(...)). // Font fallback mechanism used to Shape runs (see ShapeRuns(...)).
// These values are persisted to logs. Entries should not be renumbered and // These values are persisted to logs. Entries should not be renumbered and
...@@ -98,34 +100,18 @@ bool IsBracket(UChar32 codepoint) { ...@@ -98,34 +100,18 @@ bool IsBracket(UChar32 codepoint) {
U_BPT_NONE; U_BPT_NONE;
} }
// If the given scripts match, returns the one that isn't USCRIPT_INHERITED, // Writes the script and the script extensions of the Unicode |codepoint|.
// i.e. the more specific one. Otherwise returns USCRIPT_INVALID_CODE. This // Returns the number of written scripts.
// function is used to split runs between characters of different script codes, size_t GetScriptExtensions(UChar32 codepoint, UScriptCode* scripts) {
// unless either character has USCRIPT_INHERITED property. See crbug.com/448909. // Fill |scripts| with the script extensions.
UScriptCode ScriptIntersect(UScriptCode first, UScriptCode second) {
if (first == second || second == USCRIPT_INHERITED)
return first;
if (first == USCRIPT_INHERITED)
return second;
return USCRIPT_INVALID_CODE;
}
// Writes the script and the script extensions of the character with the
// Unicode |codepoint|. Returns the number of written scripts.
int GetScriptExtensions(UChar32 codepoint, UScriptCode* scripts) {
UErrorCode icu_error = U_ZERO_ERROR; UErrorCode icu_error = U_ZERO_ERROR;
// ICU documentation incorrectly states that the result of size_t count =
// |uscript_getScriptExtensions| will contain the regular script property. uscript_getScriptExtensions(codepoint, scripts, kMaxScripts, &icu_error);
// Write the character's script property to the first element. DCHECK_NE(icu_error, U_BUFFER_OVERFLOW_ERROR) << " #ext: " << count;
scripts[0] = uscript_getScript(codepoint, &icu_error);
if (U_FAILURE(icu_error)) if (U_FAILURE(icu_error))
return 0; return 0;
// Fill the rest of |scripts| with the extensions.
int count = uscript_getScriptExtensions(codepoint, scripts + 1, return count;
kMaxScripts - 1, &icu_error);
if (U_FAILURE(icu_error))
count = 0;
return count + 1;
} }
// Intersects the script extensions set of |codepoint| with |result| and writes // Intersects the script extensions set of |codepoint| with |result| and writes
...@@ -133,19 +119,51 @@ int GetScriptExtensions(UChar32 codepoint, UScriptCode* scripts) { ...@@ -133,19 +119,51 @@ int GetScriptExtensions(UChar32 codepoint, UScriptCode* scripts) {
void ScriptSetIntersect(UChar32 codepoint, void ScriptSetIntersect(UChar32 codepoint,
UScriptCode* result, UScriptCode* result,
size_t* result_size) { size_t* result_size) {
// Each codepoint has a Script property and a Script Extensions (Scx)
// property.
//
// The implicit Script property values 'Common' and 'Inherited' indicate that
// a codepoint is widely used in many scripts, rather than being associated
// to a specific script.
//
// However, some codepoints that are assigned a value of 'Common' or
// 'Inherited' are not commonly used with all scripts, but rather only with a
// limited set of scripts. The Script Extension property is used to specify
// the set of script which borrow the codepoint.
//
// Calls to GetScriptExtensions(...) return the set of scripts where the
// codepoints can be used.
// (see table 7 from http://www.unicode.org/reports/tr24/tr24-29.html)
//
// Script Script Extensions -> Results
// 1) Common {Common} -> {Common}
// Inherited {Inherited} -> {Inherited}
// 2) Latin {Latn} -> {Latn}
// Inherited {Latn} -> {Latn}
// 3) Common {Hira Kana} -> {Hira Kana}
// Inherited {Hira Kana} -> {Hira Kana}
// 4) Devanagari {Deva Dogr Kthi Mahj} -> {Deva Dogr Kthi Mahj}
// Myanmar {Cakm Mymr Tale} -> {Cakm Mymr Tale}
//
// For most of the codepoints, the script extensions set contains only one
// element. For CJK codepoints, it's common to see 3-4 scripts. For really
// rare cases, the set can go above 20 scripts.
UScriptCode scripts[kMaxScripts] = { USCRIPT_INVALID_CODE }; UScriptCode scripts[kMaxScripts] = { USCRIPT_INVALID_CODE };
int count = GetScriptExtensions(codepoint, scripts); size_t count = GetScriptExtensions(codepoint, scripts);
size_t out_size = 0; // Implicit script 'inherited' is inheriting scripts from preceding codepoint.
if (count == 1 && scripts[0] == USCRIPT_INHERITED)
return;
for (size_t i = 0; i < *result_size; ++i) { // Perform the intersection of both script set.
for (int j = 0; j < count; ++j) { auto scripts_span = base::span<UScriptCode>(scripts, count);
UScriptCode intersection = ScriptIntersect(result[i], scripts[j]); DCHECK(!base::Contains(scripts_span, USCRIPT_INHERITED));
if (intersection != USCRIPT_INVALID_CODE) { auto results_span = base::span<UScriptCode>(result, *result_size);
result[out_size++] = intersection;
break; size_t out_size = 0;
} for (UScriptCode current : results_span) {
} if (base::Contains(scripts_span, current))
result[out_size++] = current;
} }
*result_size = out_size; *result_size = out_size;
......
...@@ -1021,6 +1021,121 @@ INSTANTIATE_TEST_SUITE_P(ItemizeTextToRunsBrackets, ...@@ -1021,6 +1021,121 @@ INSTANTIATE_TEST_SUITE_P(ItemizeTextToRunsBrackets,
::testing::ValuesIn(kBracketsRunListCases), ::testing::ValuesIn(kBracketsRunListCases),
RenderTextTestWithRunListCase::ParamInfoToString); RenderTextTestWithRunListCase::ParamInfoToString);
// Test cases to ensure the extraction of script extensions are taken into
// account while performing the text itemization.
// See table 7 from http://www.unicode.org/reports/tr24/tr24-29.html
const RunListCase kScriptExtensionRunListCases[] = {
{"implicit_com_inherited", L"a\u0301", "[0->1]"},
{"explicit_lat", L"\u0061d", "[0->1]"},
{"explicit_inherited_lat", L"x\u0363d", "[0->2]"},
{"explicit_inherited_dev", L"क\u1CD1क", "[0->2]"},
{"multi_explicit_hira", L"は\u30FCz", "[0->1][2]"},
{"multi_explicit_kana", L"ハ\u30FCz", "[0->1][2]"},
{"multi_explicit_lat", L"a\u30FCz", "[0][1][2]"},
{"multi_explicit_impl_dev", L"क\u1CD0z", "[0->1][2]"},
{"multi_explicit_expl_dev", L"क\u096Fz", "[0->1][2]"},
};
INSTANTIATE_TEST_SUITE_P(ItemizeTextToRunsScriptExtension,
RenderTextTestWithRunListCase,
::testing::ValuesIn(kScriptExtensionRunListCases),
RenderTextTestWithRunListCase::ParamInfoToString);
// Test cases to ensure ItemizeTextToRuns is splitting text based on unicode
// script (intersection of script extensions).
// See ScriptExtensions.txt and Scripts.txt from
// http://www.unicode.org/reports/tr24/tr24-29.html
const RunListCase kScriptsRunListCases[] = {
{"lat", L"abc", "[0->2]"},
{"lat_diac", L"e\u0308f", "[0->2]"},
// Indic Fraction codepoints have large set of script extensions.
{"indic_fraction", L"\uA830\uA832\uA834\uA835", "[0->3]"},
// Devanagari Danda codepoints have large set of script extensions.
{"dev_danda", L"\u0964\u0965", "[0->1]"},
// Combining Diacritical Marks (inherited) should only merge with preceding.
{"diac_lat", L"\u0308fg", "[0][1->2]"},
{"diac_dev", L"क\u0308f", "[0->1][2]"},
// ZWJW has the inherited script.
{"lat_ZWNJ", L"ab\u200Ccd", "[0->4]"},
{"dev_ZWNJ", L"क\u200Cक", "[0->2]"},
{"lat_dev_ZWNJ", L"a\u200Cक", "[0->1][2]"},
// Invalid codepoints.
{"invalid_cp", L"\uFFFE", "[0]"},
{"invalid_cps", L"\uFFFE\uFFFF", "[0->1]"},
{"unknown", L"a\u243Fb", "[0][1][2]"},
// Codepoints from different code block should be in same run when part of
// the same script.
{"blocks_lat", L"aɒɠƉĚÑ", "[0->5]"},
{"blocks_lat_paren", L"([_!_])", "[0->1][2->4][5->6]"},
{"blocks_lat_sub", L"ₐₑaeꬱ", "[0->4]"},
{"blocks_lat_smallcap", L"ꟺM", "[0->1]"},
{"blocks_lat_small_letter", L"ᶓᶍᶓᴔᴟ", "[0->4]"},
{"blocks_lat_acc", L"eéěĕȩɇḕẻếⱻꞫ", "[0->10]"},
{"blocks_com", L"⟦ℳ¥¾⟾⁸⟧Ⓔ", "[0][1->5][6][7]"},
// Latin script.
{"latin_numbers", L"a1b2c3", "[0][1][2][3][4][5]"},
{"latin_puncts1", L"a,b,c.", "[0][1][2][3][4][5]"},
{"latin_puncts2", L"aa,bb,cc!!", "[0->1][2][3->4][5][6->7][8->9]"},
{"latin_diac_multi", L"a\u0300e\u0352i", "[0->4]"},
// Common script.
{"common_tm", L"•bug™", "[0][1->3][4]"},
{"common_copyright", L"chromium©", "[0->7][8]"},
{"common_math1", L"ℳ: ¬ƒ(x)=½×¾", "[0][1->2][3][4][5][6][7][8][9->11]"},
{"common_math2", L"𝟏×𝟑", "[0->4]"},
{"common_numbers", L"🄀𝟭𝟐⒓¹²", "[0->8]"},
{"common_puncts", L",.\u0083!", "[0->1][2][3]"},
// Arabic script.
{"arabic", L"\u0633\u069b\u0763\u077f\u08A2\uFB53", "[5<-0]"},
{"arabic_lat", L"\u0633\u069b\u0763\u077f\u08A2\uFB53abc", "[6->8][5<-0]"},
{"arabic_word_ligatures", L"\uFDFD\uFDF3", "[1<-0]"},
{"arabic_diac", L"\u069D\u0300", "[1<-0]"},
{"arabic_diac_lat", L"\u069D\u0300abc", "[2->4][1<-0]"},
{"arabic_diac_lat2", L"abc\u069D\u0300abc", "[0->2][4<-3][5->7]"},
{"arabic_lyd", L"\U00010935\U00010930\u06B0\u06B1", "[5<-4][3<-0]"},
{"arabic_numbers", L"12\u06D034", "[3->4][2][0->1]"},
{"arabic_letters", L"ab\u06D0cd", "[0->1][2][3->4]"},
{"arabic_mixed", L"a1\u06D02d", "[0][1][3][2][4]"},
{"arabic_coptic1", L"\u06D0\U000102E2\u2CB2", "[1->3][0]"},
{"arabic_coptic2", L"\u2CB2\U000102E2\u06D0", "[0->2][3]"},
// Devanagari script.
{"devanagari1", L"ञटठडढणतथ", "[0->7]"},
{"devanagari2", L"ढ꣸ꣴ", "[0->2]"},
{"devanagari_vowels", L"\u0915\u093F\u0915\u094C", "[0->3]"},
{"devanagari_consonants", L"\u0915\u094D\u0937", "[0->2]"},
// Ethiopic script.
{"ethiopic", L"መጩጪᎅⶹⶼꬣꬦ", "[0->7]"},
{"ethiopic_numbers", L"1ቨቤ2", "[0][1->2][3]"},
{"ethiopic_mixed1", L"abቨቤ12", "[0->1][2->3][4->5]"},
{"ethiopic_mixed2", L"a1ቨቤb2", "[0][1][2->3][4][5]"},
// Georgian script.
{"georgian1", L"ႼႽႾႿჀჁჂჳჴჵ", "[0->9]"},
{"georgian2", L"ლⴊⴅ", "[0->2]"},
{"georgian_numbers", L"1ლⴊⴅ2", "[0][1->3][4]"},
{"georgian_mixed", L"a1ლⴊⴅb2", "[0][1][2->4][5][6]"},
// Telugu script.
{"telugu_lat", L"aaఉయ!", "[0->1][2->3][4]"},
{"telugu_numbers", L"123౦౧౨456౩౪౫", "[0->2][3->5][6->8][9->11]"},
{"telugu_puncts", L"కురుచ, చిఱుత, చేరువ, చెఱువు!",
"[0->4][5->6][7->11][12->13][14->18][19->20][21->26][27]"},
// Control Pictures.
{"control_pictures", L"␑␒␓␔␕␖␗␘␙␚␛", "[0->10]"},
{"control_pictures_rewrite", L"␑\t␛", "[0->2]"},
};
INSTANTIATE_TEST_SUITE_P(ItemizeTextToRunsScripts,
RenderTextTestWithRunListCase,
::testing::ValuesIn(kScriptsRunListCases),
RenderTextTestWithRunListCase::ParamInfoToString);
TEST_F(RenderTextTest, ElidedText) { TEST_F(RenderTextTest, ElidedText) {
// TODO(skanuj) : Add more test cases for following // TODO(skanuj) : Add more test cases for following
// - RenderText styles. // - RenderText styles.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment