Refactor CachingWordShapeIterator::nextWordEndIndex

This patch refactors CachingWordShapeIterator::nextWordEndIndex in order to: 1. Improve readability. 2. Split the 3 cases into 3 separate loops, so that the loop being executed can be identified from a stack trace. BUG=639085 Review-Url: https://codereview.chromium.org/2263083002 Cr-Commit-Position: refs/heads/master@{#413463}

Refactor CachingWordShapeIterator::nextWordEndIndex
This patch refactors CachingWordShapeIterator::nextWordEndIndex in order to: 1. Improve readability. 2. Split the 3 cases into 3 separate loops, so that the loop being executed can be identified from a stack trace. BUG=639085 Review-Url: https://codereview.chromium.org/2263083002 Cr-Commit-Position: refs/heads/master@{#413463}
103f7f70 · kojii · Commit bot · c6d602ef · 103f7f70 · 103f7f70
Commit 103f7f70 authored Aug 22, 2016 by kojii Committed by Commit bot Aug 22, 2016
2 changed files
--- a/third_party/WebKit/Source/platform/fonts/shaping/CachingWordShapeIterator.h
+++ b/third_party/WebKit/Source/platform/fonts/shaping/CachingWordShapeIterator.h
@@ -111,7 +111,7 @@ private:
        return ch == spaceCharacter || ch == tabulationCharacter;
    }
-    unsigned nextWordEndIndex()
+    unsigned nextWordEndIndex() const
    {
        const unsigned length = m_textRun.length();
        if (m_startIndex >= length)
@@ -120,47 +120,49 @@ private:
        if (m_startIndex + 1u == length || isWordDelimiter(m_textRun[m_startIndex]))
            return m_startIndex + 1;
-        // Delimit every CJK character because these scripts do not delimit
+        // 8Bit words end at isWordDelimiter().
-        // words by spaces, and not delimiting hits the performance.
+        if (m_textRun.is8Bit()) {
-        if (!m_textRun.is8Bit()) {
+            for (unsigned i = m_startIndex + 1; ; i++) {
-            UChar32 ch;
+                if (i == length || isWordDelimiter(m_textRun[i]))
-            unsigned end = m_startIndex;
+                    return i;
-            U16_NEXT(m_textRun.characters16(), end, length, ch);
-            if (Character::isCJKIdeographOrSymbol(ch)) {
-                bool hasAnyScript = !Character::isCommonOrInheritedScript(ch);
-                for (unsigned i = end; i < length; end = i) {
-                    U16_NEXT(m_textRun.characters16(), i, length, ch);
-                    // ZWJ and modifier check in order not to split those Emoji sequences.
-                    if (U_GET_GC_MASK(ch) & (U_GC_M_MASK | U_GC_LM_MASK | U_GC_SK_MASK)
-                        || ch == zeroWidthJoinerCharacter || Character::isModifier(ch))
-                        continue;
-                    // Avoid delimiting COMMON/INHERITED alone, which makes harder to
-                    // identify the script.
-                    if (Character::isCJKIdeographOrSymbol(ch)) {
-                        if (Character::isCommonOrInheritedScript(ch))
-                            continue;
-                        if (!hasAnyScript) {
-                            hasAnyScript = true;
-                            continue;
-                        }
-                    }
-                    return end;
-                }
-                return length;
            }
        }
-        for (unsigned i = m_startIndex + 1; ; i++) {
+        // Non-CJK/Emoji words end at isWordDelimiter() or CJK/Emoji characters.
-            if (i == length || isWordDelimiter(m_textRun[i])) {
+        unsigned end = m_startIndex;
-                return i;
+        UChar32 ch = m_textRun.codepointAtAndNext(end);
+        if (!Character::isCJKIdeographOrSymbol(ch)) {
+            for (unsigned nextEnd = end; end < length; end = nextEnd) {
+                ch = m_textRun.codepointAtAndNext(nextEnd);
+                if (isWordDelimiter(ch) || Character::isCJKIdeographOrSymbolBase(ch))
+                    return end;
            }
-            if (!m_textRun.is8Bit()) {
+            return length;
-                UChar32 nextChar;
+        }
-                U16_GET(m_textRun.characters16(), 0, i, length, nextChar);
-                if (Character::isCJKIdeographOrSymbolBase(nextChar))
+        // For CJK/Emoji words, delimit every character because these scripts do
-                    return i;
+        // not delimit words by spaces, and delimiting only at isWordDelimiter()
+        // worsen the cache efficiency.
+        bool hasAnyScript = !Character::isCommonOrInheritedScript(ch);
+        for (unsigned nextEnd = end; end < length; end = nextEnd) {
+            ch = m_textRun.codepointAtAndNext(nextEnd);
+            // ZWJ and modifier check in order not to split those Emoji sequences.
+            if (U_GET_GC_MASK(ch) & (U_GC_M_MASK | U_GC_LM_MASK | U_GC_SK_MASK)
+                || ch == zeroWidthJoinerCharacter || Character::isModifier(ch))
+                continue;
+            // Avoid delimiting COMMON/INHERITED alone, which makes harder to
+            // identify the script.
+            if (Character::isCJKIdeographOrSymbol(ch)) {
+                if (Character::isCommonOrInheritedScript(ch))
+                    continue;
+                if (!hasAnyScript) {
+                    hasAnyScript = true;
+                    continue;
+                }
            }
+            return end;
        }
+        return length;
    }
    bool shapeToEndIndex(RefPtr<const ShapeResult>* result, unsigned endIndex)
@@ -180,7 +182,7 @@ private:
        return result->get();
    }
-    unsigned endIndexUntil(UChar ch)
+    unsigned endIndexUntil(UChar ch) const
    {
        unsigned length = m_textRun.length();
        ASSERT(m_startIndex < length);

--- a/third_party/WebKit/Source/platform/text/TextRun.h
+++ b/third_party/WebKit/Source/platform/text/TextRun.h
@@ -35,6 +35,8 @@
 #include "wtf/text/StringView.h"
 #include "wtf/text/WTFString.h"
+#include <unicode/utf16.h>
 class SkTextBlob;
 namespace blink {
@@ -150,6 +152,16 @@ public:
    const LChar* characters8() const { ASSERT(is8Bit()); return m_data.characters8; }
    const UChar* characters16() const { ASSERT(!is8Bit()); return m_data.characters16; }
+    UChar32 codepointAtAndNext(unsigned& i) const // Returns the code point at index |i| and advances |i| past it (|i| is an in/out parameter).
+    {
+        if (is8Bit())
+            return (*this)[i++]; // 8-bit run: return the element at |i| as-is and step |i| by one. NOTE(review): bounds checking on this path is presumably left to operator[] -- confirm.
+        UChar32 codepoint;
+        SECURITY_DCHECK(i < m_len); // |i| must be a valid index into the run before decoding.
+        U16_NEXT(characters16(), i, m_len, codepoint); // ICU macro: reads the UTF-16 code point at |i| (combining a surrogate pair when present) and post-increments |i|, bounded by m_len.
+        return codepoint;
+    }
    bool is8Bit() const { return m_is8Bit; }
    unsigned length() const { return m_len; }
    unsigned charactersLength() const { return m_charactersLength; }