Commit 9176768a authored by drott@chromium.org's avatar drott@chromium.org

Upstream ScriptRunIterator for segmenting text runs by script

ScriptRunIterator takes a pointer to a UTF-16 text run, and a starting
script value, then consume() can be called on it to retrieve the limit
and script value of the next segmented script run. It takes care of
matching brackets when resolving script runs.

This functionality is needed for changing HarfBuzzShaper.cpp so that
we do not need to pre-split and store the TextRun into HarfBuzzRuns.
We can improve our script segmentation and integrate script splitting
and shaping in one loop.

Original code written by Doug Felt, big thanks!

BUG=526095
R=eae,behdad

Review URL: https://codereview.chromium.org/1323513006

git-svn-id: svn://svn.chromium.org/blink/trunk@201722 bbb929c8-8fbe-4397-9dbb-9b2b20218538
parent dd98ae68
...@@ -391,13 +391,15 @@ ...@@ -391,13 +391,15 @@
'fonts/GlyphPageTreeNode.cpp', 'fonts/GlyphPageTreeNode.cpp',
'fonts/GlyphPageTreeNode.h', 'fonts/GlyphPageTreeNode.h',
'fonts/Latin1TextIterator.h', 'fonts/Latin1TextIterator.h',
'fonts/UTF16TextIterator.cpp', 'fonts/ScriptRunIterator.h',
'fonts/UTF16TextIterator.h', 'fonts/ScriptRunIterator.cpp',
'fonts/SegmentedFontData.cpp', 'fonts/SegmentedFontData.cpp',
'fonts/SegmentedFontData.h', 'fonts/SegmentedFontData.h',
'fonts/SimpleFontData.cpp', 'fonts/SimpleFontData.cpp',
'fonts/SimpleFontData.h', 'fonts/SimpleFontData.h',
'fonts/TextBlob.h', 'fonts/TextBlob.h',
'fonts/UTF16TextIterator.cpp',
'fonts/UTF16TextIterator.h',
'fonts/VDMXParser.cpp', 'fonts/VDMXParser.cpp',
'fonts/VDMXParser.h', 'fonts/VDMXParser.h',
'fonts/android/FontCacheAndroid.cpp', 'fonts/android/FontCacheAndroid.cpp',
...@@ -961,6 +963,7 @@ ...@@ -961,6 +963,7 @@
'fonts/FontTest.cpp', 'fonts/FontTest.cpp',
'fonts/GlyphBufferTest.cpp', 'fonts/GlyphBufferTest.cpp',
'fonts/GlyphPageTreeNodeTest.cpp', 'fonts/GlyphPageTreeNodeTest.cpp',
'fonts/ScriptRunIteratorTest.cpp',
'fonts/android/FontCacheAndroidTest.cpp', 'fonts/android/FontCacheAndroidTest.cpp',
'fonts/mac/FontFamilyMatcherMacTest.mm', 'fonts/mac/FontFamilyMatcherMacTest.mm',
'fonts/shaping/CachingWordShaperTest.cpp', 'fonts/shaping/CachingWordShaperTest.cpp',
......
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "config.h"
#include "ScriptRunIterator.h"
#include "platform/Logging.h"
#include "wtf/Threading.h"
#include <ubidi_props.h>
namespace blink {
// Shorthand for the bracket classification enum declared on ScriptData.
using PairedBracketType = ScriptData::PairedBracketType;
// Out-of-class definition of the script-count limit declared on ScriptData.
// getScripts() sizes its scratch buffer from this value.
const int ScriptData::kMaxScriptCount = 20;
// Out-of-line definition of the virtual destructor; nothing to release.
ScriptData::~ScriptData() = default;
// Fills |dst| with the scripts ICU reports for |ch| (script extensions plus
// primary script), ordered so that the entry at index 0 is the one
// ScriptRunIterator should treat as the priority script. On an ICU failure
// the error is logged and |dst| is left empty.
void ICUScriptData::getScripts(UChar32 ch, Vector<UScriptCode>& dst) const
{
    UErrorCode status = U_ZERO_ERROR;
    // Leave room to insert primary script. It's not strictly necessary but
    // it ensures that the result won't ever be greater than kMaxScriptCount,
    // which some client someday might expect.
    dst.resize(kMaxScriptCount - 1);
    // Note, ICU convention is to return the number of available items
    // regardless of the capacity passed to the call. So count can be greater
    // than dst.size(), if a later version of the unicode data has more
    // than kMaxScriptCount items.
    int count = uscript_getScriptExtensions(
        ch, &dst[0], dst.size(), &status);
    if (status == U_BUFFER_OVERFLOW_ERROR) {
        // Allow this, we'll just use what we have.
        WTF_LOG_ERROR("Exceeded maximum script count of %d for 0x%x", kMaxScriptCount, ch);
        count = dst.size();
        status = U_ZERO_ERROR;
    }
    UScriptCode primaryScript = uscript_getScript(ch, &status);
    if (U_FAILURE(status)) {
        WTF_LOG_ERROR("Could not get icu script data: %d for 0x%x", status, ch);
        dst.clear();
        return;
    }

    dst.resize(count);

    if (primaryScript == dst.at(0)) {
        // Only one script (might be common or inherited -- these are never in
        // the extensions unless they're the only script), or extensions are in
        // priority order already.
        return;
    }

    if (primaryScript != USCRIPT_INHERITED
        && primaryScript != USCRIPT_COMMON
        && primaryScript != USCRIPT_INVALID_CODE) {
        // Not common or inherited, with extensions that are not in order. We
        // know the primary, so we put it at the front and move the previous
        // front to somewhere else in the list.
        auto it = std::find(dst.begin() + 1, dst.end(), primaryScript);
        if (it == dst.end()) {
            // append() may reallocate, which invalidates |it|; address the
            // new element by index instead of through the stale iterator.
            dst.append(primaryScript);
            std::swap(dst.at(0), dst.at(dst.size() - 1));
        } else {
            std::swap(dst.at(0), *it);
        }
        return;
    }

    if (primaryScript == USCRIPT_COMMON) {
        if (count == 1) {
            // Common with a preferred script. Keep common at head.
            dst.prepend(primaryScript);
            return;
        }

        // Ignore common. Find the preferred script of the multiple scripts that
        // remain, and ensure it is at the head. Just keep swapping them in,
        // there aren't likely to be many.
        for (size_t i = 1; i < dst.size(); ++i) {
            if (dst.at(0) == USCRIPT_LATIN || dst.at(i) < dst.at(0)) {
                std::swap(dst.at(0), dst.at(i));
            }
        }
        return;
    }

    // The primary is inherited (USCRIPT_INVALID_CODE also ends up here), and
    // there are other scripts. Put the primary at the front, the preferred
    // extension next, and then the others in no particular order.
    dst.append(dst.at(0));
    dst.at(0) = primaryScript;
    for (size_t i = 2; i < dst.size(); ++i) {
        if (dst.at(1) == USCRIPT_LATIN || dst.at(i) < dst.at(1)) {
            std::swap(dst.at(1), dst.at(i));
        }
    }
}
// Returns whatever ICU's u_getBidiPairedBracket() reports for |ch|.
UChar32 ICUScriptData::getPairedBracket(UChar32 ch) const
{
    const UChar32 counterpart = u_getBidiPairedBracket(ch);
    return counterpart;
}
// Looks up the UCHAR_BIDI_PAIRED_BRACKET_TYPE property of |ch| in ICU and
// converts the integer value to ScriptData::PairedBracketType.
// NOTE(review): the cast assumes ICU's numeric values line up with the
// None/Open/Close enumerators of PairedBracketType -- confirm.
PairedBracketType ICUScriptData::getPairedBracketType(UChar32 ch) const
{
    const auto icuBracketType = u_getIntPropertyValue(ch, UCHAR_BIDI_PAIRED_BRACKET_TYPE);
    return static_cast<PairedBracketType>(icuBracketType);
}
// Returns a pointer to the shared ICUScriptData object. The object is created
// with new on first use and never deleted here.
const ICUScriptData* ICUScriptData::instance()
{
// NOTE(review): judging by its name, the macro declares a static reference
// that is initialized once in a thread-safe way -- confirm in wtf/Threading.h.
AtomicallyInitializedStaticReference(const ICUScriptData, icuScriptDataInstance, (new ICUScriptData()));
return &icuScriptDataInstance;
}
// Sets up iteration over the |length| UTF-16 code units at |text|, using
// |data| to look up scripts and bracket properties. Neither pointer is owned.
// For non-empty text the first character is decoded immediately so that
// fetch() always has one character of lookahead available.
ScriptRunIterator::ScriptRunIterator(const UChar* text, size_t length, const ScriptData* data)
    : m_text(text)
    , m_length(length)
    , m_bracketsFixupDepth(0)
    // Placeholder only; this initial value is never used.
    , m_aheadCharacter(0)
    , m_aheadPos(0)
    , m_commonPreferred(USCRIPT_COMMON)
    , m_scriptData(data)
{
    ASSERT(text);
    ASSERT(data);

    const bool hasText = m_aheadPos < m_length;
    if (!hasText) {
        // m_currentSet stays empty, which makes consume() report no runs.
        return;
    }

    // Start the current set out as USCRIPT_COMMON so that the first merge of
    // m_currentSet and m_nextSet in mergeSets() ends up choosing the script
    // of the first consumed character.
    m_currentSet.clear();
    m_currentSet.append(USCRIPT_COMMON);

    // Prime the lookahead character and its script set.
    U16_NEXT(m_text, m_aheadPos, m_length, m_aheadCharacter);
    m_scriptData->getScripts(m_aheadCharacter, m_aheadSet);
}
// Convenience constructor: delegates to the three-argument constructor with
// the shared ICUScriptData instance as the script data source.
ScriptRunIterator::ScriptRunIterator(const UChar* text, size_t length)
: ScriptRunIterator(text, length, ICUScriptData::instance())
{
}
bool ScriptRunIterator::consume(unsigned& limit, UScriptCode& script)
{
if (m_currentSet.isEmpty()) {
return false;
}
size_t pos;
UChar32 ch;
while (fetch(&pos, &ch)) {
PairedBracketType pairedType = m_scriptData->getPairedBracketType(ch);
switch (pairedType) {
case PairedBracketType::BracketTypeOpen:
openBracket(ch);
break;
case PairedBracketType::BracketTypeClose:
closeBracket(ch);
break;
default:
break;
}
if (!mergeSets()) {
limit = pos;
script = resolveCurrentScript();
fixupStack(script);
m_currentSet = m_nextSet;
return true;
}
}
limit = m_length;
script = resolveCurrentScript();
m_currentSet.clear();
return true;
}
// Records the open bracket |ch| on the bracket stack with a provisional
// USCRIPT_COMMON script and counts it as awaiting fixup. When the stack
// already holds kMaxBrackets entries, the oldest entry is discarded first.
void ScriptRunIterator::openBracket(UChar32 ch)
{
    const bool stackIsFull = m_brackets.size() == kMaxBrackets;
    if (stackIsFull) {
        m_brackets.removeFirst();
        // If every entry was awaiting fixup, the dropped one was too.
        if (m_bracketsFixupDepth == kMaxBrackets)
            --m_bracketsFixupDepth;
    }

    m_brackets.append(BracketRec({ ch, USCRIPT_COMMON }));
    ++m_bracketsFixupDepth;
}
// Handles the close bracket |ch|. Searches the bracket stack from the most
// recent entry downwards for the paired open bracket. If one is found, the
// next set is replaced by that open bracket's recorded script and every entry
// above the match is popped (the matching entry itself stays on the stack).
// If there is no match the stack and the next set are left alone.
void ScriptRunIterator::closeBracket(UChar32 ch)
{
    if (m_brackets.size() == 0)
        return;

    const UChar32 target = m_scriptData->getPairedBracket(ch);
    for (auto it = m_brackets.rbegin(); it != m_brackets.rend(); ++it) {
        if (it->ch != target)
            continue;

        // Have a match, use open paren's resolved script.
        const UScriptCode script = it->script;
        m_nextSet.clear();
        m_nextSet.append(script);

        // And pop stack to this point.
        const size_t numPopped = static_cast<size_t>(std::distance(m_brackets.rbegin(), it));
        // TODO: No resize operation in WTF::Deque?
        for (size_t i = 0; i < numPopped; ++i)
            m_brackets.removeLast();

        // Saturating subtraction. The popped entries may include brackets
        // that were already fixed up in an earlier run, so numPopped can
        // exceed the fixup depth; a plain unsigned subtraction would wrap to
        // a huge value instead of clamping at zero.
        m_bracketsFixupDepth = m_bracketsFixupDepth > numPopped
            ? m_bracketsFixupDepth - numPopped
            : 0;
        return;
    }
    // leave stack alone, no match
}
// Keep items in m_currentSet that are in m_nextSet.
//
// If the sets are disjoint, return false and leave m_currentSet unchanged. Else
// return true and make current set the intersection. Make sure to maintain
// current priority script as priority if it remains, else retain next priority
// script if it remains.
//
// Also maintain a common preferred script. If current and next are both
// common, and there is no common preferred script and next has a preferred
// script, set the common preferred script to that of next.
//
// The first element of each set is treated as that set's priority script.
// Returns false as well when either set is empty.
bool ScriptRunIterator::mergeSets()
{
if (m_nextSet.isEmpty() || m_currentSet.isEmpty()) {
return false;
}
auto currentSetIt = m_currentSet.begin();
auto currentEnd = m_currentSet.end();
// Most of the time, this is the only one.
// Advance the current iterator, we won't need to check it again later.
UScriptCode priorityScript = *currentSetIt++;
// If next is common or inherited, the only thing that might change
// is the common preferred script.
// (The <= comparison treats every code up to USCRIPT_INHERITED as
// "common or inherited".)
if (m_nextSet.at(0) <= USCRIPT_INHERITED) {
// A two-element next set here is [common-or-inherited, one other script];
// that other script becomes the preferred script if none was chosen yet.
if (m_nextSet.size() == 2 && priorityScript <= USCRIPT_INHERITED && m_commonPreferred == USCRIPT_COMMON) {
m_commonPreferred = m_nextSet.at(1);
}
return true;
}
// If current is common or inherited, use the next script set.
if (priorityScript <= USCRIPT_INHERITED) {
m_currentSet = m_nextSet;
return true;
}
// Neither is common or inherited. If current is a singleton,
// just see if it exists in the next set. This is the common case.
auto next_it = m_nextSet.begin();
auto next_end = m_nextSet.end();
if (currentSetIt == currentEnd) {
return std::find(next_it, next_end, priorityScript) != next_end;
}
// Establish the priority script, if we have one.
// First try current priority script.
bool havePriority = std::find(next_it, next_end, priorityScript)
!= next_end;
if (!havePriority) {
// So try next priority script.
// Skip the first current script, we already know it's not there.
// Advance the next iterator, later we won't need to check it again.
priorityScript = *next_it++;
havePriority = std::find(currentSetIt, currentEnd, priorityScript) != currentEnd;
}
// Note that we can never write more scripts into the current vector than
// it already contains, so currentWriteIt won't ever exceed the size/capacity.
// The intersection is built in place: currentWriteIt trails currentSetIt.
auto currentWriteIt = m_currentSet.begin();
if (havePriority) {
// keep the priority script.
*currentWriteIt++ = priorityScript;
}
if (next_it != next_end) {
// Iterate over the remaining current scripts, and keep them if
// they occur in the remaining next scripts.
while (currentSetIt != currentEnd) {
UScriptCode sc = *currentSetIt++;
if (std::find(next_it, next_end, sc) != next_end) {
*currentWriteIt++ = sc;
}
}
}
// Only change current if the run continues.
// When nothing was kept the size is left alone and false is returned.
int written = std::distance(m_currentSet.begin(), currentWriteIt);
if (written > 0) {
m_currentSet.resize(written);
return true;
}
return false;
}
// Called when a run ends and its script has been resolved. Every open bracket
// pushed since the start of that run was stored with a provisional script;
// m_bracketsFixupDepth counts how many such entries sit at the top of the
// stack (openBracket() keeps the count in step when old entries fall off the
// bottom). This assigns |resolvedScript| to those topmost entries and resets
// the count.
void ScriptRunIterator::fixupStack(UScriptCode resolvedScript)
{
    if (!m_bracketsFixupDepth)
        return;

    if (m_bracketsFixupDepth > m_brackets.size()) {
        // Should never happen unless someone breaks the code.
        WTF_LOG_ERROR("Brackets fixup depth exceeds size of bracket vector.");
        m_bracketsFixupDepth = m_brackets.size();
    }

    // Walk down from the top of the stack, one entry per pending fixup.
    auto entry = m_brackets.rbegin();
    for (size_t remaining = m_bracketsFixupDepth; remaining > 0; --remaining, ++entry)
        entry->script = resolvedScript;

    m_bracketsFixupDepth = 0;
}
// Hands out the lookahead character: writes its start offset to |*pos| and
// its code point to |*ch|, moves its script set into m_nextSet, and then
// decodes the following character as the new lookahead. Returns false when
// all characters have been handed out, or when the newly decoded character
// has no scripts.
bool ScriptRunIterator::fetch(size_t* pos, UChar32* ch)
{
    if (m_aheadPos > m_length)
        return false;

    // m_aheadPos already points past the lookahead character; step back over
    // it (two code units for a supplementary code point, otherwise one).
    const size_t aheadLength = m_aheadCharacter >= 0x10000 ? 2 : 1;
    *pos = m_aheadPos - aheadLength;
    *ch = m_aheadCharacter;
    m_nextSet.swap(m_aheadSet);

    if (m_aheadPos == m_length) {
        // No more data to fetch, but last character still needs to be
        // processed. Advance m_aheadPos so that next time we will know
        // this has been done.
        m_aheadPos++;
        return true;
    }

    U16_NEXT(m_text, m_aheadPos, m_length, m_aheadCharacter);
    m_scriptData->getScripts(m_aheadCharacter, m_aheadSet);
    if (m_aheadSet.isEmpty()) {
        // No scripts for this character. This has already been logged, so
        // we just terminate processing this text.
        return false;
    }

    const bool inheritedWithExtensions = m_aheadSet[0] == USCRIPT_INHERITED && m_aheadSet.size() > 1;
    if (inheritedWithExtensions) {
        if (m_nextSet[0] == USCRIPT_COMMON) {
            // Overwrite the next set with the non-inherited portion of the
            // lookahead set.
            m_nextSet = m_aheadSet;
            m_nextSet.remove(0);
        }
        // Either way only "inherited" is kept for the lookahead character;
        // the remaining values are discarded, we'll inherit.
        m_aheadSet.resize(1);
    }
    return true;
}
// Returns the script to report for the current run: the first entry of
// m_currentSet, except that USCRIPT_COMMON is replaced by m_commonPreferred.
UScriptCode ScriptRunIterator::resolveCurrentScript() const
{
    const UScriptCode priority = m_currentSet.at(0);
    if (priority == USCRIPT_COMMON)
        return m_commonPreferred;
    return priority;
}
} // namespace blink
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef ScriptRunIterator_h
#define ScriptRunIterator_h
#include "platform/PlatformExport.h"
#include "wtf/Deque.h"
#include "wtf/Vector.h"
#include "wtf/dtoa/utils.h"
#include <unicode/uchar.h>
#include <unicode/uscript.h>
namespace blink {
class ScriptData;
// Splits a UTF-16 text into runs of a single script. Construct it over the
// text, then call consume() repeatedly; each successful call reports one run.
// Paired brackets are tracked so that a close bracket can take the script of
// its matching open bracket.
class PLATFORM_EXPORT ScriptRunIterator {
public:
// Iterates over |length| UTF-16 code units at |text| using the ICU-backed
// script data. The text is not copied.
ScriptRunIterator(const UChar* text, size_t length);
// This maintains a reference to data. It must exist for the lifetime of
// this object. Typically data is a singleton that exists for the life of
// the process.
ScriptRunIterator(const UChar* text, size_t length, const ScriptData*);
// Retrieves the next run. Returns true and sets |limit| to the offset one
// past the end of the run and the UScriptCode argument to the run's script;
// returns false when there are no more runs.
bool consume(unsigned& limit, UScriptCode&);
private:
// One open bracket on the bracket stack together with the script assigned
// to it.
struct BracketRec {
UChar32 ch;
UScriptCode script;
};
void openBracket(UChar32);
void closeBracket(UChar32);
bool mergeSets();
void fixupStack(UScriptCode resolvedScript);
bool fetch(size_t* pos, UChar32*);
UScriptCode resolveCurrentScript() const;
// Text being segmented (not owned) and its length in UTF-16 code units.
const UChar* m_text;
const size_t m_length;
// Stack of open brackets seen so far, most recent last.
Deque<BracketRec> m_brackets;
// Number of entries at the top of m_brackets whose script is still
// provisional and will be set by fixupStack().
size_t m_bracketsFixupDepth;
// Limit max brackets so that the bracket tracking buffer does not grow
// excessively large when processing long runs of text.
static const int kMaxBrackets = 32;
// Script sets of the run being built, of the character just fetched, and of
// the lookahead character, respectively.
Vector<UScriptCode> m_currentSet;
Vector<UScriptCode> m_nextSet;
Vector<UScriptCode> m_aheadSet;
// Lookahead character and the offset just past it.
UChar32 m_aheadCharacter;
size_t m_aheadPos;
// Script reported in place of USCRIPT_COMMON by resolveCurrentScript().
UScriptCode m_commonPreferred;
// Source of script and bracket data (not owned).
const ScriptData* m_scriptData;
DISALLOW_COPY_AND_ASSIGN(ScriptRunIterator);
};
// ScriptData is a wrapper which returns a set of scripts for a particular
// character retrieved from the character's primary script and script extensions,
// as per ICU / Unicode data. ScriptData maintains a certain priority order of
// the returned values, which is essential for the mergeSets method to work
// correctly.
//
// This is an abstract interface; ICUScriptData is the implementation declared
// in this header.
class PLATFORM_EXPORT ScriptData {
protected:
ScriptData() = default;
public:
virtual ~ScriptData();
// Bracket classification of a character as returned by
// getPairedBracketType().
enum PairedBracketType {
BracketTypeNone,
BracketTypeOpen,
BracketTypeClose,
BracketTypeCount
};
// Upper bound on the number of scripts getScripts() is meant to return.
static const int kMaxScriptCount;
// Stores the scripts of the given character in |dst|, priority script first.
virtual void getScripts(UChar32, Vector<UScriptCode>& dst) const = 0;
// Returns the bracket paired with the given character.
virtual UChar32 getPairedBracket(UChar32) const = 0;
// Returns whether the given character is an open bracket, a close bracket,
// or neither.
virtual PairedBracketType getPairedBracketType(UChar32) const = 0;
private:
DISALLOW_COPY_AND_ASSIGN(ScriptData);
};
// ScriptData implementation declared for use with ICU. Instances cannot be
// constructed directly; use instance() to obtain the shared object.
class PLATFORM_EXPORT ICUScriptData : public ScriptData {
public:
    // Accessor for the shared instance.
    static const ICUScriptData* instance();

    ~ICUScriptData() override {}

    // ScriptData interface.
    void getScripts(UChar32, Vector<UScriptCode>& dst) const override;
    UChar32 getPairedBracket(UChar32) const override;
    PairedBracketType getPairedBracketType(UChar32) const override;

private:
    // Private so that instance() is the only way to create the object.
    ICUScriptData() {}

    DISALLOW_COPY_AND_ASSIGN(ICUScriptData);
};
}
#endif
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "config.h"
#include "platform/fonts/ScriptRunIterator.h"
#include "platform/Logging.h"
#include "wtf/Assertions.h"
#include "wtf/Threading.h"
#include "wtf/text/WTFString.h"
#include <gtest/gtest.h>
#include <string>
#include <vector>
namespace blink {
// One expected run in a test case: the text of the run and the script the
// iterator is expected to report for it. Kept as a plain aggregate so the
// tests can brace-initialize arrays of it.
struct TestRun {
// Run text: UTF-8 for CheckRuns, mock notation for CheckMockRuns.
std::string text;
// Script expected for this run.
UScriptCode code;
};
// What consume() is expected to report for one run: the offset one past the
// end of the run and the run's script.
struct ExpectedRun {
    ExpectedRun(unsigned runLimit, UScriptCode runCode)
        : limit(runLimit)
        , code(runCode)
    {
    }

    unsigned limit;
    UScriptCode code;
};
// ScriptData test double. Script sets and bracket properties are derived
// purely from a code point's offset from kMockCharMin (see the bit layout
// described above the constants at the end of the class), which lets tests
// build characters with arbitrary script-extension lists.
class MockScriptData : public ScriptData {
public:
    ~MockScriptData() override {}

    // Returns the shared mock instance.
    static const MockScriptData* instance()
    {
        AtomicallyInitializedStaticReference(const MockScriptData, mockScriptData, (new MockScriptData()));
        return &mockScriptData;
    }

    // Decodes the script set of mock character |ch| into |dst|: an optional
    // leading common/inherited entry followed by the scripts listed in the
    // table entry selected by the low four bits. A character with neither
    // yields USCRIPT_UNKNOWN.
    void getScripts(UChar32 ch, Vector<UScriptCode>& dst) const override
    {
        ASSERT(ch >= kMockCharMin);
        ASSERT(ch < kMockCharLimit);
        int code = ch - kMockCharMin;
        dst.clear();
        switch (code & kCodeSpecialMask) {
        case kCodeSpecialCommon:
            dst.append(USCRIPT_COMMON);
            break;
        case kCodeSpecialInherited:
            dst.append(USCRIPT_INHERITED);
            break;
        default:
            break;
        }
        int listBits = kTable[code & kCodeListIndexMask];
        if (dst.isEmpty() && listBits == 0) {
            dst.append(USCRIPT_UNKNOWN);
            return;
        }
        // The list is packed two bits per script, first script in the lowest
        // bits.
        while (listBits) {
            switch (listBits & kListMask) {
            case 0:
                break;
            case kLatin:
                dst.append(USCRIPT_LATIN);
                break;
            case kHan:
                dst.append(USCRIPT_HAN);
                break;
            case kGreek:
                dst.append(USCRIPT_GREEK);
                break;
            }
            listBits >>= kListShift;
        }
    }

    // The paired bracket of a mock bracket is kBracketDelta code points away;
    // non-brackets are returned unchanged.
    UChar32 getPairedBracket(UChar32 ch) const override
    {
        switch (getPairedBracketType(ch)) {
        case PairedBracketType::BracketTypeClose:
            return ch - kBracketDelta;
        case PairedBracketType::BracketTypeOpen:
            return ch + kBracketDelta;
        default:
            return ch;
        }
    }

    // Classifies |ch| from its bracket and close bits.
    PairedBracketType getPairedBracketType(UChar32 ch) const override
    {
        ASSERT(ch >= kMockCharMin && ch < kMockCharLimit);
        int code = ch - kMockCharMin;
        if ((code & kCodeBracketBit) == 0) {
            return PairedBracketType::BracketTypeNone;
        }
        if (code & kCodeBracketCloseBit) {
            return PairedBracketType::BracketTypeClose;
        }
        return PairedBracketType::BracketTypeOpen;
    }

    // Returns the index of |value| in kTable. Logs an error and returns 0 if
    // the value is not in the table.
    static int TableLookup(int value)
    {
        for (int i = 0; i < 16; ++i) {
            if (kTable[i] == value) {
                return i;
            }
        }
        WTF_LOG_ERROR("Table does not contain value 0x%x", value);
        return 0;
    }

    // Converts the test notation in |input| into a string of mock characters.
    // Outside of <...>, each of ( [ ) ] i c l h g ? stands for one character
    // (brackets written this way are common). Inside <...>, an optional
    // bracket, an optional i or c, and up to three of l h g combine into a
    // single character whose script list keeps the order written.
    static String ToTestString(const std::string& input)
    {
        String result(String::make16BitFrom8BitSource(0, 0));
        bool inSet = false;
        int seen = 0;
        int code = 0;
        int list = 0;
        int currentShift = 0;
        for (char c : input) {
            if (inSet) {
                switch (c) {
                case '(':
                    ASSERT(seen == 0);
                    seen |= kSawBracket;
                    code |= kCodeBracketBit;
                    break;
                case '[':
                    ASSERT(seen == 0);
                    seen |= kSawBracket;
                    code |= kCodeBracketBit | kCodeSquareBracketBit;
                    break;
                case ')':
                    ASSERT(seen == 0);
                    seen |= kSawBracket;
                    code |= kCodeBracketBit | kCodeBracketCloseBit;
                    break;
                case ']':
                    ASSERT(seen == 0);
                    seen |= kSawBracket;
                    code |= kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit;
                    break;
                case 'i':
                    ASSERT(seen == 0); // brackets can't be inherited
                    seen |= kSawSpecial;
                    code |= kCodeSpecialInherited;
                    break;
                case 'c':
                    ASSERT((seen & ~kSawBracket) == 0);
                    seen |= kSawSpecial;
                    code |= kCodeSpecialCommon;
                    break;
                case 'l':
                    ASSERT((seen & kSawLatin) == 0);
                    ASSERT(currentShift < 3);
                    seen |= kSawLatin;
                    list |= kLatin << (2 * currentShift++);
                    break;
                case 'h':
                    ASSERT((seen & kSawHan) == 0);
                    ASSERT(currentShift < 3);
                    seen |= kSawHan;
                    list |= kHan << (2 * currentShift++);
                    break;
                case 'g':
                    ASSERT((seen & kSawGreek) == 0);
                    ASSERT(currentShift < 3);
                    seen |= kSawGreek;
                    list |= kGreek << (2 * currentShift++);
                    break;
                case '>':
                    ASSERT(seen != 0);
                    code |= TableLookup(list);
                    result.append(static_cast<UChar>(kMockCharMin + code));
                    inSet = false;
                    break;
                default:
                    WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);
                    break;
                }
                continue;
            }
            // not in set
            switch (c) {
            case '<':
                seen = 0;
                code = 0;
                list = 0;
                currentShift = 0;
                inSet = true;
                break;
            case '(':
                code = kCodeBracketBit | kCodeSpecialCommon;
                break;
            case '[':
                code = kCodeBracketBit | kCodeSquareBracketBit | kCodeSpecialCommon;
                break;
            case ')':
                code = kCodeBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon;
                break;
            case ']':
                code = kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon;
                break;
            case 'i':
                code = kCodeSpecialInherited;
                break;
            case 'c':
                code = kCodeSpecialCommon;
                break;
            case 'l':
                code = kLatin;
                break;
            case 'h':
                code = kHan;
                break;
            case 'g':
                code = kGreek;
                break;
            case '?':
                code = 0; // unknown
                break;
            default:
                WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);
            }
            if (!inSet) {
                result.append(static_cast<UChar>(kMockCharMin + code));
            }
        }
        return result;
    }

    // Inverse of ToTestString for a single mock character: returns the test
    // notation that describes |mockch|.
    static std::string MockCharString(UChar mockch)
    {
        ASSERT(mockch >= kMockCharMin && mockch < kMockCharLimit);
        int code = mockch - kMockCharMin;
        // We use set notation in these cases:
        // - more than one of special, kLatin, kHan, kGreek
        // - bracket and not common (since non-set brackets are common)
        // All property bits are read from |code|, the offset from
        // kMockCharMin, like everywhere else in this class.
        bool isBracket = (code & kCodeBracketBit) != 0;
        bool isSpecial = (code & kCodeSpecialMask) != 0;
        bool isCommon = (code & kCodeSpecialMask) == kCodeSpecialCommon;
        // Only meaningful when isBracket or isSpecial; initialized so that it
        // never holds an indeterminate value.
        char c = 0;
        if (isBracket) {
            if (code & kCodeSquareBracketBit) {
                if (code & kCodeBracketCloseBit) {
                    c = ']';
                } else {
                    c = '[';
                }
            } else {
                if (code & kCodeBracketCloseBit) {
                    c = ')';
                } else {
                    c = '(';
                }
            }
        } else if (isSpecial) {
            c = isCommon ? 'c' : 'i';
        }
        std::string result;
        int listBits = kTable[code & kCodeListIndexMask];
        while (listBits) {
            switch (listBits & kListMask) {
            case 0:
                break;
            case kLatin:
                result += 'l';
                break;
            case kHan:
                result += 'h';
                break;
            case kGreek:
                result += 'g';
                break;
            }
            listBits >>= kListShift;
        }
        bool needSet = result.length() + (isSpecial ? 1 : 0) > 1 || (isBracket && (result.length() > 0 || !isCommon));
        if (needSet) {
            std::string setResult("<");
            if (isBracket) {
                setResult += c;
            }
            if (isSpecial) {
                if (isCommon) {
                    setResult += "c";
                } else {
                    setResult += "i";
                }
            }
            setResult += result;
            setResult += ">";
            return setResult;
        }
        if (isBracket || isSpecial) {
            result = c;
        }
        return result;
    }

    // We determine properties based on the offset from kMockCharMin:
    // bits 0-3 represent the list of l, h, g scripts (index into table)
    // bit 4-5 means: 0 plain, 1 common, 2 inherited, 3 illegal
    // bit 6 clear means open bracket, set means close bracket
    // bit 7 clear means non-bracket, set means bracket
    // bit 8 clear means paren, set means square bracket
    // if it's a bracket, the matching bracket is 64 code points away
    static const UChar32 kMockCharMin = 0xe000;
    static const UChar32 kMockCharLimit = kMockCharMin + 0x200;
    // Two-bit script codes used in the packed script lists.
    static const int kLatin = 1;
    static const int kHan = 2;
    static const int kGreek = 3;
    static const int kCodeListIndexMask = 0xf;
    static const int kCodeSpecialMask = 0x30;
    static const int kCodeSpecialCommon = 0x10;
    static const int kCodeSpecialInherited = 0x20;
    static const int kCodeBracketCloseBit = 0x40;
    static const int kCodeBracketBit = 0x80;
    static const int kCodeSquareBracketBit = 0x100;
    static const int kListShift = 2;
    static const int kListMask = 0x3;
    static const int kBracketDelta = kCodeBracketCloseBit;
    // Packed script lists selectable by the low four bits of a mock code.
    static const int kTable[16];
    // Flags used by ToTestString to track what a <...> set has contained.
    static const int kSawBracket = 0x1;
    static const int kSawSpecial = 0x2;
    static const int kSawLatin = 0x4;
    static const int kSawHan = 0x8;
    static const int kSawGreek = 0x10;
};
// Script codes shifted into the second (<< 2) and third (<< 4) two-bit slot
// of a packed script list. MockScriptData::getScripts reads the slots from
// the lowest bits upwards, so the unshifted code is the first script of a
// list.
static const int kLatin2 = MockScriptData::kLatin << 2;
static const int kHan2 = MockScriptData::kHan << 2;
static const int kGreek2 = MockScriptData::kGreek << 2;
static const int kLatin3 = MockScriptData::kLatin << 4;
static const int kHan3 = MockScriptData::kHan << 4;
static const int kGreek3 = MockScriptData::kGreek << 4;
// The sixteen packed script lists a mock character can select with its low
// four bits: the empty list, the three single scripts, the six ordered pairs
// and the six orderings of all three scripts.
const int MockScriptData::kTable[] = {
0, kLatin, kHan, kGreek,
kLatin2 + kHan, kLatin2 + kGreek,
kHan2 + kLatin, kHan2 + kGreek,
kGreek2 + kLatin, kGreek2 + kHan,
kLatin3 + kHan2 + kGreek, kLatin3 + kGreek2 + kHan,
kHan3 + kLatin2 + kGreek, kHan3 + kGreek2 + kLatin,
kGreek3 + kLatin2 + kHan, kGreek3 + kHan2 + kLatin,
};
// Fixture with helpers that build a text from a list of expected runs, run a
// ScriptRunIterator over it and compare what the iterator reports against the
// expectation.
class ScriptRunIteratorTest : public testing::Test {
protected:
    // Concatenates the UTF-8 texts of |runs| and verifies that the iterator,
    // using the default script data, reports exactly those runs.
    void CheckRuns(const std::vector<TestRun>& runs)
    {
        String text(String::make16BitFrom8BitSource(0, 0));
        std::vector<ExpectedRun> expect;
        for (const TestRun& run : runs) {
            text.append(String::fromUTF8(run.text.c_str()));
            expect.push_back(ExpectedRun(text.length(), run.code));
        }
        ScriptRunIterator scriptRunIterator(text.characters16(), text.length());
        VerifyRuns(&scriptRunIterator, expect);
    }

    // FIXME crbug.com/527329 - CheckMockRuns should be replaced by finding
    // suitable equivalent real codepoint sequences instead.
    //
    // Same as CheckRuns, but the run texts are in MockScriptData notation and
    // the iterator is driven by MockScriptData.
    void CheckMockRuns(const std::vector<TestRun>& runs)
    {
        String text(String::make16BitFrom8BitSource(0, 0));
        std::vector<ExpectedRun> expect;
        for (const TestRun& run : runs) {
            text.append(MockScriptData::ToTestString(run.text));
            expect.push_back({ text.length(), run.code });
        }
        ScriptRunIterator scriptRunIterator(text.characters16(), text.length(),
            MockScriptData::instance());
        VerifyRuns(&scriptRunIterator, expect);
    }

    // Consumes every run from |scriptRunIterator| and asserts that limits,
    // scripts and the number of runs match |expect|.
    void VerifyRuns(ScriptRunIterator* scriptRunIterator,
        const std::vector<ExpectedRun>& expect)
    {
        unsigned limit;
        UScriptCode code;
        unsigned long runCount = 0;
        while (scriptRunIterator->consume(limit, code)) {
            ASSERT_LT(runCount, expect.size());
            ASSERT_EQ(expect[runCount].limit, limit);
            ASSERT_EQ(expect[runCount].code, code);
            ++runCount;
        }
        // Only report the counts as an error when they actually disagree, so
        // that passing tests do not fill the log with error messages.
        if (expect.size() != runCount)
            WTF_LOG_ERROR("Expected %zu runs, got %lu ", expect.size(), runCount);
        ASSERT_EQ(expect.size(), runCount);
    }
};
// For empty text, consume() must report no run and must leave both output
// arguments untouched.
TEST_F(ScriptRunIteratorTest, Empty)
{
    String empty(String::make16BitFrom8BitSource(0, 0));
    ScriptRunIterator scriptRunIterator(empty.characters16(), empty.length());
    unsigned limit = 0;
    UScriptCode code = USCRIPT_INVALID_CODE;
    // Use the gtest assertion rather than WTF's debug-only ASSERT so that the
    // call is made and checked in every build configuration.
    ASSERT_FALSE(scriptRunIterator.consume(limit, code));
    ASSERT_EQ(0u, limit);
    ASSERT_EQ(USCRIPT_INVALID_CODE, code);
}
// Some of our compilers cannot initialize a vector from an array yet.
// DECLARE_RUNSVECTOR therefore declares a static array |runsArray| from the
// braced list it is given and copies it into a local std::vector named |runs|.
// Because it declares fixed names, it can be used at most once per scope.
#define DECLARE_RUNSVECTOR(...) \
static const TestRun runsArray[] = __VA_ARGS__; \
std::vector<TestRun> runs(runsArray, runsArray + sizeof(runsArray) / sizeof(*runsArray));
// Declares |runs| from the braced list and passes it to the fixture's
// CheckRuns().
#define CHECK_RUNS(...) \
DECLARE_RUNSVECTOR(__VA_ARGS__); \
CheckRuns(runs);
// Declares |runs| from the braced list and passes it to the fixture's
// CheckMockRuns().
#define CHECK_MOCK_RUNS(...) \
DECLARE_RUNSVECTOR(__VA_ARGS__); \
CheckMockRuns(runs);
// Spaces and a tab are expected to come back as a single USCRIPT_COMMON run.
TEST_F(ScriptRunIteratorTest, Whitespace)
{
CHECK_RUNS({ { " \t ", USCRIPT_COMMON } });
}
// Spaces and ASCII punctuation are expected to come back as a single
// USCRIPT_COMMON run.
TEST_F(ScriptRunIteratorTest, Common)
{
CHECK_RUNS({ { " ... !?", USCRIPT_COMMON } });
}
// Plain ASCII letters are expected to form a single USCRIPT_LATIN run.
TEST_F(ScriptRunIteratorTest, Latin)
{
CHECK_RUNS({ { "latin", USCRIPT_LATIN } });
}
// Three Han characters are expected to form a single USCRIPT_HAN run.
TEST_F(ScriptRunIteratorTest, Chinese)
{
CHECK_RUNS({ { "萬國碼", USCRIPT_HAN } });
}
// Close bracket without matching open is ignored.
// Expected: the ']' stays in the Latin run, while the ')' gets the Han script
// of the run that contains the '('.
TEST_F(ScriptRunIteratorTest, UnbalancedParens1)
{
CHECK_RUNS({ { "(萬", USCRIPT_HAN },
{ "a]", USCRIPT_LATIN },
{ ")", USCRIPT_HAN } });
}
// Open bracket without matching close is popped when inside
// matching close brackets, so doesn't match later close.
// Expected: the ')' starts a Han run and the trailing ']' stays in it.
TEST_F(ScriptRunIteratorTest, UnbalancedParens2)
{
CHECK_RUNS({ { "(萬", USCRIPT_HAN },
{ "a[", USCRIPT_LATIN },
{ ")]", USCRIPT_HAN } });
}
// The space goes with the leading script: it is expected at the end of the
// Latin run rather than at the start of the Han run.
TEST_F(ScriptRunIteratorTest, LatinHan)
{
CHECK_RUNS({ { "Unicode ", USCRIPT_LATIN },
{ "萬國碼", USCRIPT_HAN } });
}
// The space goes with the leading script: it is expected at the end of the
// Han run rather than at the start of the Latin run.
TEST_F(ScriptRunIteratorTest, HanLatin)
{
CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },
{ "Unicode", USCRIPT_LATIN } });
}
// A bracket pair with no other text is expected to be one USCRIPT_COMMON run.
TEST_F(ScriptRunIteratorTest, ParenEmptyParen)
{
CHECK_RUNS({ { "()", USCRIPT_COMMON } });
}
// Parens enclosing only Han text are expected to be part of the Han run.
TEST_F(ScriptRunIteratorTest, ParenChineseParen)
{
CHECK_RUNS({ { "(萬國碼)", USCRIPT_HAN } });
}
// Parens enclosing only Latin text are expected to be part of the Latin run.
TEST_F(ScriptRunIteratorTest, ParenLatinParen)
{
CHECK_RUNS({ { "(Unicode)", USCRIPT_LATIN } });
}
// The open paren gets the leading script (Latin), and the matching close
// paren is expected to get that same script even though Han text lies
// in between.
TEST_F(ScriptRunIteratorTest, LatinParenChineseParen)
{
CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },
{ "萬國碼", USCRIPT_HAN },
{ ")", USCRIPT_LATIN } });
}
// The open paren gets the first trailing script if there is no leading
// script; the close paren and the following space stay in that run.
TEST_F(ScriptRunIteratorTest, ParenChineseParenLatin)
{
CHECK_RUNS({ { "(萬國碼) ", USCRIPT_HAN },
{ "Unicode", USCRIPT_LATIN } });
}
// Leading common and open paren get the first trailing script.
// TODO(dougfelt): we don't do quote matching, but probably should figure out
// something better than doing nothing.
TEST_F(ScriptRunIteratorTest, QuoteParenChineseParenLatinQuote)
{
CHECK_RUNS({ { "\"(萬國碼) ", USCRIPT_HAN },
{ "Unicode\"", USCRIPT_LATIN } });
}
// Unmatched close brace gets leading context: the ']' stays in the Han run,
// while the later ')' matches the '(' and is expected to start a Latin run.
TEST_F(ScriptRunIteratorTest, UnmatchedClose)
{
CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },
{ "萬國碼] ", USCRIPT_HAN },
{ ") Unicode\"", USCRIPT_LATIN } });
}
// Match up to 32 bracket pairs: the '[' that opens the text is still matched
// by the final ']' (expected Han) after a long nest of parens in between.
TEST_F(ScriptRunIteratorTest, Match32Brackets)
{
CHECK_RUNS({ { "[萬國碼 ", USCRIPT_HAN },
{ "Unicode (((((((((((((((((((((((((((((((!"
")))))))))))))))))))))))))))))))",
USCRIPT_LATIN },
{ "]", USCRIPT_HAN } });
}
// Matches 32 most recent bracket pairs. More than that, and we revert to
// surrounding script. Here the three leading '(' are expected to have been
// dropped from the bracket stack, so the final ")))" stays in the Latin run.
TEST_F(ScriptRunIteratorTest, Match32MostRecentBrackets)
{
CHECK_RUNS({ { "((([萬國碼 ", USCRIPT_HAN },
{ "Unicode (((((((((((((((((((((((((((((((", USCRIPT_LATIN },
{ "萬國碼!", USCRIPT_HAN },
{ ")))))))))))))))))))))))))))))))", USCRIPT_LATIN },
{ "]", USCRIPT_HAN },
{ "But )))", USCRIPT_LATIN } });
}
// A char with multiple scripts that match both leading and trailing context
// gets the leading context. Here <lh> follows Han and precedes Latin and is
// expected in the Han run.
TEST_F(ScriptRunIteratorTest, ExtensionsPreferLeadingContext)
{
CHECK_MOCK_RUNS({ { "h<lh>", USCRIPT_HAN },
{ "l", USCRIPT_LATIN } });
}
// A char with multiple scripts that only match trailing context gets the
// trailing context. Here <gl> has nothing in common with the preceding Han
// and is expected in the Latin run.
TEST_F(ScriptRunIteratorTest, ExtensionsMatchTrailingContext)
{
CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },
{ "<gl>l", USCRIPT_LATIN } });
}
// Retain the first established priority script. <lhg><gh> produce the script
// set <gh> with g as priority, because of the two priority scripts l and g,
// only g remains. Then <gh><hgl> retains g as priority, because of the two
// priority scripts g and h that remain, g was encountered first.
TEST_F(ScriptRunIteratorTest, ExtensionsRetainFirstPriorityScript)
{
CHECK_MOCK_RUNS({ { "<lhg><gh><hgl>", USCRIPT_GREEK } });
}
// Parens can have scripts that break script runs. Each close bracket is
// expected to get the script of the run containing its open bracket.
TEST_F(ScriptRunIteratorTest, ExtensionsParens)
{
CHECK_MOCK_RUNS({ { "<gl><(lg>", USCRIPT_GREEK },
{ "h<[hl>", USCRIPT_HAN },
{ "l", USCRIPT_LATIN },
{ "<]hl>", USCRIPT_HAN },
{ "<)lg>", USCRIPT_GREEK } });
}
// The close paren might be encountered before we've established the open
// paren's script, but when this is the case the current set is still valid, so
// this doesn't affect it nor break the run. The whole text is expected to be
// one Greek run.
TEST_F(ScriptRunIteratorTest, ExtensionsParens2)
{
CHECK_MOCK_RUNS({ { "<(lhg><gh><)lhg>", USCRIPT_GREEK } });
}
// A common script with a single extension should be treated as common, but
// with the extended script as a default. If we encounter anything other than
// common, that takes priority. If we encounter other common scripts with a
// single extension, the current priority remains.
// This case: a lone <ch> is expected to resolve to its extension, Han.
TEST_F(ScriptRunIteratorTest, CommonWithPriority)
{
CHECK_MOCK_RUNS({ { "<ch>", USCRIPT_HAN } });
}
// A non-common character after <ch> takes priority: <lh> is expected to make
// the whole run Latin.
TEST_F(ScriptRunIteratorTest, CommonWithPriority2)
{
CHECK_MOCK_RUNS({ { "<ch><lh>", USCRIPT_LATIN } });
}
// Further common characters with a single extension do not change the
// default: the run is expected to keep Han from the first character.
TEST_F(ScriptRunIteratorTest, CommonWithPriority3)
{
CHECK_MOCK_RUNS({ { "<ch><cl><cg>", USCRIPT_HAN } });
}
// UDatta (\xE0\xA5\x91) is inherited with LATIN and DEVANAGARI extensions.
// Since it has LATIN, and the dotted circle (\xE2\x97\x8C) is COMMON and has
// adopted the preceding LATIN, it gets the LATIN. This is standard.
// The whole text is expected to be a single Latin run.
TEST_F(ScriptRunIteratorTest, LatinDottedCircleUdatta)
{
CHECK_RUNS({ { "Latin \xE2\x97\x8C\xE0\xA5\x91", USCRIPT_LATIN } });
}
// In this situation, UDatta (\xE0\xA5\x91) doesn't share a script with the
// value inherited by the dotted circle (\xE2\x97\x8C). It captures the
// preceding dotted circle and breaks it from the run it would normally have
// been in. The dotted circle and UDatta are expected to form a Devanagari
// run of their own.
TEST_F(ScriptRunIteratorTest, HanDottedCircleUdatta)
{
CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },
{ "\xE2\x97\x8C\xE0\xA5\x91", USCRIPT_DEVANAGARI } });
}
// Tatweel is \xD9\x80 Lm, Fathatan is \xD9\x8B Mn. The script of tatweel is
// common, that of Fathatan is inherited. The script extensions for Fathatan
// are Arabic and Syriac. The Syriac script is 34 in ICU, Arabic is 2. So the
// preferred script for Fathatan is Arabic, according to Behdad's
// heuristic. This is exactly analogous to the Udatta tests above, except
// Tatweel is Lm. But we don't take properties into account, only scripts.
// Tatweel and Fathatan are expected to form an Arabic run after the Latin.
TEST_F(ScriptRunIteratorTest, LatinTatweelFathatan)
{
CHECK_RUNS({ { "Latin ", USCRIPT_LATIN },
{ "\xD9\x80\xD9\x8B", USCRIPT_ARABIC } });
}
// Another case where if the mark accepts a script that was inherited by the
// preceding common-script character, they both continue in that script.
// Expected: all three characters form one Syriac run.
// SYRIAC LETTER NUN \xDC\xA2 (U+0722)
// ARABIC TATWEEL \xD9\x80 (U+0640)
// ARABIC FATHATAN \xD9\x8B (U+064B)
TEST_F(ScriptRunIteratorTest, SyriacTatweelFathatan)
{
CHECK_RUNS({ { "\xDC\xA2\xD9\x80\xD9\x8B", USCRIPT_SYRIAC } });
}
// The Udatta (U+0951, \xE0\xA5\x91) is inherited, so will share runs with
// anything that is not common. Expected: the Han text and the trailing Udatta
// form one Han run.
TEST_F(ScriptRunIteratorTest, HanUdatta)
{
CHECK_RUNS({ { "萬國碼\xE0\xA5\x91", USCRIPT_HAN } });
}
// The Udatta (U+0951, \xE0\xA5\x91) is inherited, and will capture the space
// and turn it into Devanagari. Expected: a Han run without the space, then a
// Devanagari run made of the space and the Udatta.
TEST_F(ScriptRunIteratorTest, HanSpaceUdatta)
{
CHECK_RUNS({ { "萬國碼", USCRIPT_HAN },
{ " \xE0\xA5\x91", USCRIPT_DEVANAGARI } });
}
// Make sure Mock code works too. Mock counterpart of the Han + Udatta case:
// "h" followed by "<igl>" is expected to stay one Han run.
// NOTE(review): presumably "<igl>" is an inherited mock character with Greek
// and Latin extensions -- confirm against the mock definition.
TEST_F(ScriptRunIteratorTest, MockHanInheritedGL)
{
CHECK_MOCK_RUNS({ { "h<igl>", USCRIPT_HAN } });
}
// Mock counterpart of the Han + space + Udatta case: with "c" between "h" and
// "<igl>", the expected split is a Han run "h" followed by a Greek run that
// contains both "c" and "<igl>".
TEST_F(ScriptRunIteratorTest, MockHanCommonInheritedGL)
{
CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },
{ "c<igl>", USCRIPT_GREEK } });
}
// Leading inherited characters just act like common, except there's no
// preferred script. Expected: a lone "<igl>" yields a single Common run.
TEST_F(ScriptRunIteratorTest, MockLeadingInherited)
{
CHECK_MOCK_RUNS({ { "<igl>", USCRIPT_COMMON } });
}
// Leading inherited characters just act like common, except there's no
// preferred script. Expected: two of them in a row, "<igl>" then "<ih>", still
// yield one single Common run.
TEST_F(ScriptRunIteratorTest, MockLeadingInherited2)
{
CHECK_MOCK_RUNS({ { "<igl><ih>", USCRIPT_COMMON } });
}
// A leading inherited mark followed by Han text: the expected result is one
// Han run that includes the mark.
TEST_F(ScriptRunIteratorTest, LeadingInheritedHan)
{
// DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91 (U+0951)
CHECK_RUNS({ { "\xE0\xA5\x91萬國碼", USCRIPT_HAN } });
}
// Two leading inherited marks followed by Han text: the expected result is
// still one Han run that includes both marks.
TEST_F(ScriptRunIteratorTest, LeadingInheritedHan2)
{
// DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91 (U+0951)
// ARABIC FATHATAN \xD9\x8B (U+064B)
CHECK_RUNS({ { "\xE0\xA5\x91\xD9\x8B萬國碼", USCRIPT_HAN } });
}
// A Latin letter carrying combining marks is expected to stay one Latin run.
// NOTE(review): the literal is written as raw non-ASCII text, so its exact
// code point sequence (precomposed vs. combining) cannot be read off here --
// confirm against the file's bytes before editing it.
TEST_F(ScriptRunIteratorTest, OddLatinString)
{
CHECK_RUNS({ { "ç̈", USCRIPT_LATIN } });
}
// Fixture for tests that validate assumptions about ICU's script-extension
// data. Its constructor scans the code points, remembering the largest
// script-extension count reported by ICU and a code point that has it.
class ScriptRunIteratorICUDataTest : public testing::Test {
public:
    ScriptRunIteratorICUDataTest()
        : m_maxExtensions(0)
        , m_maxExtensionsCodepoint(0xffff)
    {
        // One past U+10FFFF, the last Unicode code point. The scan must cover
        // the full range (not just U+0000..U+10FFF), otherwise a character
        // with more extensions further up would go unnoticed.
        const UChar32 codepointLimit = 0x110000;

        int maxExtensions = 0;
        UChar32 maxExtensionsCodepoint = 0;
        for (UChar32 cp = 0; cp < codepointLimit; ++cp) {
            // Zero-capacity query: only the returned count is used. Per the
            // ICU documentation such a call reports the required length and
            // flags a buffer overflow in |status|, so |status| is
            // intentionally not inspected.
            UErrorCode status = U_ZERO_ERROR;
            int count = uscript_getScriptExtensions(cp, 0, 0, &status);
            // Strict comparison keeps the first code point that reaches the
            // maximum.
            if (count > maxExtensions) {
                maxExtensions = count;
                maxExtensionsCodepoint = cp;
            }
        }
        m_maxExtensions = maxExtensions;
        m_maxExtensionsCodepoint = maxExtensionsCodepoint;
    }

protected:
    // Returns a code point with the largest extension count found by the
    // constructor's scan. If |numExtensions| is non-null, that count is
    // written to it.
    UChar32 GetACharWithMaxExtensions(int* numExtensions)
    {
        if (numExtensions) {
            *numExtensions = m_maxExtensions;
        }
        return m_maxExtensionsCodepoint;
    }

private:
    // Largest count returned by uscript_getScriptExtensions during the scan.
    int m_maxExtensions;
    // First code point for which that count was returned.
    UChar32 m_maxExtensionsCodepoint;
};
// Guards the ScriptData::kMaxScriptCount assumption: the largest
// script-extension count found by the fixture's ICU scan must not exceed it.
// On failure the offending code point is printed in hex.
TEST_F(ScriptRunIteratorICUDataTest, ValidateICUMaxScriptExtensions)
{
    int maxExtensions = 0;
    const UChar32 widestCodepoint = GetACharWithMaxExtensions(&maxExtensions);

    ASSERT_LE(maxExtensions, ScriptData::kMaxScriptCount)
        << "char " << std::hex << widestCodepoint << std::dec;
}
// ICUScriptData::getScripts must report every script of a character. Only a
// single probe is checked -- the character the fixture found to have the most
// extensions -- so this is a spot check, not an exhaustive one.
TEST_F(ScriptRunIteratorICUDataTest, ICUDataGetScriptsReturnsAllExtensions)
{
    int maxExtensions = 0;
    const UChar32 probe = GetACharWithMaxExtensions(&maxExtensions);

    Vector<UScriptCode> extensions;
    ICUScriptData::instance()->getScripts(probe, extensions);

    // getScripts may add the primary script to the list it returns, giving
    // one entry more than the raw extension count; hence ">=" and not "==".
    ASSERT_GE(static_cast<int>(extensions.size()), maxExtensions)
        << "char " << std::hex << probe << std::dec;
}
// Walks every code point and checks that whenever the first script reported
// by ICUScriptData::getScripts is USCRIPT_COMMON, the list holds at most two
// entries. On failure the offending code point is printed in hex.
TEST_F(ScriptRunIteratorICUDataTest, CommonHaveNoMoreThanOneExtension)
{
    // One past U+10FFFF, the last Unicode code point.
    const UChar32 codepointLimit = 0x110000;

    Vector<UScriptCode> extensions;
    for (UChar32 cp = 0; cp < codepointLimit; ++cp) {
        ICUScriptData::instance()->getScripts(cp, extensions);

        // Only characters whose leading script is Common are constrained.
        if (extensions.at(0) != USCRIPT_COMMON)
            continue;

        ASSERT_LE(extensions.size(), 2ul)
            << "cp: " << std::hex << cp << std::dec;
    }
}
// ZWJ is \u200D Cf (Format, other) and its script is inherited. I'm going to
// ignore this for now, as I think it shouldn't matter which run it ends up
// in. HarfBuzz needs to be able to use it as context and shape each
// neighboring character appropriately no matter what run it got assigned to.
} // namespace blink
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment