Smaller CSSParser UTF16 buffers for escaped strings.

If an ASCII stylesheet contains escaped characters, those need to be unescaped and stored somewhere. The code used to allocate a buffer the size of the whole CSS file * sizeof(UChar) for that purpose, which meant 700 KB on arstechnica.com to store a single character. Instead, store the (supposedly rare) strings containing such characters in dedicated buffers of appropriate size.
BUG=352544
Review URL: https://codereview.chromium.org/196353018
git-svn-id: svn://svn.chromium.org/blink/trunk@170451 bbb929c8-8fbe-4397-9dbb-9b2b20218538

Smaller CSSParser UTF16 buffers for escaped strings.
If an ASCII stylesheet contains escaped characters, those need to be unescaped and stored somewhere. The code used to allocate a buffer the size of the whole CSS file * sizeof(UChar) for that purpose, which meant 700 KB on arstechnica.com to store a single character. Instead, store the (supposedly rare) strings containing such characters in dedicated buffers of appropriate size.
BUG=352544
Review URL: https://codereview.chromium.org/196353018
git-svn-id: svn://svn.chromium.org/blink/trunk@170451 bbb929c8-8fbe-4397-9dbb-9b2b20218538
1aac036d · bratell@opera.com · f0c3863a · 1aac036d · 1aac036d
Commit 1aac036d authored Mar 31, 2014 by bratell@opera.com
2 changed files
--- a/third_party/WebKit/Source/core/css/CSSTokenizer-in.cpp
+++ b/third_party/WebKit/Source/core/css/CSSTokenizer-in.cpp
@@ -304,14 +304,17 @@ inline UChar*& CSSTokenizer::currentCharacter<UChar>()
    return m_currentCharacter16;
 }
-UChar*& CSSTokenizer::currentCharacter16()
+UChar* CSSTokenizer::allocateStringBuffer16(size_t len)
 {
-    if (!m_currentCharacter16) {
+    // Allocates a buffer of len UChars, transfers its ownership to
-        m_dataStart16 = adoptArrayPtr(new UChar[m_length]);
+    // m_cssStrings16 and returns a raw pointer to it. Gives UTF-16 strings,
-        m_currentCharacter16 = m_dataStart16.get();
+    // identifiers and URIs created by the tokenizer a suitable life span.
-    }
+    OwnPtr<UChar[]> buffer = adoptArrayPtr(new UChar[len]);
-    return m_currentCharacter16;
+    UChar* bufferPtr = buffer.get();
+    m_cssStrings16.append(buffer.release());
+    return bufferPtr;
 }
 template <>
@@ -412,7 +415,7 @@ unsigned CSSTokenizer::parseEscape(CharacterType*& src)
        return unicode;
    }
-    return *currentCharacter<CharacterType>()++;
+    return *src++;
 }
 template <>
@@ -438,6 +441,24 @@ inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode
    ++result;
 }
+template <typename SrcCharacterType>
+size_t CSSTokenizer::peekMaxIdentifierLen(SrcCharacterType* src)
+{
+    // Returns how many input characters the identifier at src spans. The
+    // decoded form (after resolving escape sequences) will not contain more
+    // characters (ASCII or UTF-16 codepoints) than the input, so escapes are
+    // merely skipped with parseEscape and their decoded values discarded.
+    SrcCharacterType* start = src;
+    do {
+        if (LIKELY(*src != '\\'))
+            src++;
+        else
+            parseEscape<SrcCharacterType>(src);
+    } while (isCSSLetter(src[0]) || (src[0] == '\\' && isCSSEscape(src[1])));
+    return src - start;
+}
 template <typename SrcCharacterType, typename DestCharacterType>
 inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCharacterType*& result, bool& hasEscape)
 {
@@ -471,7 +492,7 @@ inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserStrin
    if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), result, hasEscape))) {
        // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
        ASSERT(is8BitSource());
-        UChar*& result16 = currentCharacter16();
+        UChar* result16 = allocateStringBuffer16((result - start) + peekMaxIdentifierLen(result));
        UChar* start16 = result16;
        int i = 0;
        for (; i < result - start; i++)
@@ -489,6 +510,18 @@ inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserStrin
    resultString.init(start, result - start);
 }
+template <typename SrcCharacterType>
+size_t CSSTokenizer::peekMaxStringLen(SrcCharacterType* src, UChar quote)
+{
+    // The decoded form of a CSS string (after resolving escape
+    // sequences) will not contain more characters (ASCII or UTF-16
+    // codepoints) than the input. The result is therefore the distance
+    // from src to the position returned by checkAndSkipString (possibly
+    // including the terminating quote), or 0 if that call returns null.
+    SrcCharacterType* end = checkAndSkipString(src, quote);
+    return end ? end - src : 0;
+}
 template <typename SrcCharacterType, typename DestCharacterType>
 inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharacterType*& result, UChar quote)
 {
@@ -532,7 +565,7 @@ inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& r
    if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) {
        // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
        ASSERT(is8BitSource());
-        UChar*& result16 = currentCharacter16();
+        UChar* result16 = allocateStringBuffer16((result - start) + peekMaxStringLen(result, quote));
        UChar* start16 = result16;
        int i = 0;
        for (; i < result - start; i++)
@@ -580,6 +613,29 @@ inline bool CSSTokenizer::findURI(CharacterType*& start, CharacterType*& end, UC
    return true;
 }
+template <typename SrcCharacterType>
+inline size_t CSSTokenizer::peekMaxURILen(SrcCharacterType* src, UChar quote)
+{
+    // Returns an input-length bound for the URI at src: the decoded form will
+    // not contain more characters (ASCII or UTF-16 codepoints) than the input.
+    // A quoted URI is measured by peekMaxStringLen; an unquoted one is scanned
+    // while isURILetter holds, skipping escapes without decoding them.
+    SrcCharacterType* start = src;
+    if (quote) {
+        ASSERT(quote == '"' || quote == '\'');
+        return peekMaxStringLen(src, quote);
+    }
+    while (isURILetter(*src)) {
+        if (LIKELY(*src != '\\'))
+            src++;
+        else
+            parseEscape<SrcCharacterType>(src);
+    }
+    return src - start;
+}
 template <typename SrcCharacterType, typename DestCharacterType>
 inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacterType*& dest, UChar quote)
 {
@@ -593,7 +649,7 @@ inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacter
            *dest++ = *src++;
        } else {
            unsigned unicode = parseEscape<SrcCharacterType>(src);
-            if (unicode > 0xff && sizeof(SrcCharacterType) == 1)
+            if (unicode > 0xff && sizeof(DestCharacterType) == 1)
                return false;
            UnicodeToChars(dest, unicode);
        }
@@ -619,11 +675,12 @@ inline void CSSTokenizer::parseURI(CSSParserString& string)
        // Reset the current character to the start of the URI and re-parse with
        // a 16-bit destination.
        ASSERT(is8BitSource());
-        UChar* uriStart16 = currentCharacter16();
+        UChar* result16 = allocateStringBuffer16(peekMaxURILen(uriStart, quote));
+        UChar* uriStart16 = result16;
        currentCharacter<CharacterType>() = uriStart;
-        bool result = parseURIInternal(currentCharacter<CharacterType>(), currentCharacter16(), quote);
+        bool result = parseURIInternal(currentCharacter<CharacterType>(), result16, quote);
        ASSERT_UNUSED(result, result);
-        string.init(uriStart16, currentCharacter16() - uriStart16);
+        string.init(uriStart16, result16 - uriStart16);
    }
    currentCharacter<CharacterType>() = uriEnd + 1;

--- a/third_party/WebKit/Source/core/css/CSSTokenizer.h
+++ b/third_party/WebKit/Source/core/css/CSSTokenizer.h
@@ -74,7 +74,7 @@ public:
    inline unsigned tokenStartOffset();
 private:
-    UChar*& currentCharacter16();
+    UChar* allocateStringBuffer16(size_t len);
    template <typename CharacterType>
    inline CharacterType*& currentCharacter();
@@ -92,29 +92,33 @@ private:
    inline CSSParserLocation tokenLocation();
    template <typename CharacterType>
-    unsigned parseEscape(CharacterType*&);
+    static unsigned parseEscape(CharacterType*&);
    template <typename DestCharacterType>
-    inline void UnicodeToChars(DestCharacterType*&, unsigned);
+    static inline void UnicodeToChars(DestCharacterType*&, unsigned);
-    template <typename SrcCharacterType, typename DestCharacterType>
-    inline bool parseIdentifierInternal(SrcCharacterType*&, DestCharacterType*&, bool&);
+    template <typename SrcCharacterType, typename DestCharacterType>
+    static inline bool parseIdentifierInternal(SrcCharacterType*&, DestCharacterType*&, bool&);
+    template <typename SrcCharacterType>
+    static size_t peekMaxIdentifierLen(SrcCharacterType*);
    template <typename CharacterType>
    inline void parseIdentifier(CharacterType*&, CSSParserString&, bool&);
+    template <typename SrcCharacterType>
+    static size_t peekMaxStringLen(SrcCharacterType*, UChar quote);
    template <typename SrcCharacterType, typename DestCharacterType>
-    inline bool parseStringInternal(SrcCharacterType*&, DestCharacterType*&, UChar);
+    static inline bool parseStringInternal(SrcCharacterType*&, DestCharacterType*&, UChar);
    template <typename CharacterType>
    inline void parseString(CharacterType*&, CSSParserString& resultString, UChar);
    template <typename CharacterType>
    inline bool findURI(CharacterType*& start, CharacterType*& end, UChar& quote);
+    template <typename SrcCharacterType>
+    static size_t peekMaxURILen(SrcCharacterType*, UChar quote);
    template <typename SrcCharacterType, typename DestCharacterType>
-    inline bool parseURIInternal(SrcCharacterType*&, DestCharacterType*&, UChar quote);
+    static inline bool parseURIInternal(SrcCharacterType*&, DestCharacterType*&, UChar quote);
    template <typename CharacterType>
    inline void parseURI(CSSParserString&);
    template <typename CharacterType>
    inline bool parseUnicodeRange();
    template <typename CharacterType>
@@ -155,6 +159,13 @@ private:
    OwnPtr<UChar[]> m_dataStart16;
    LChar* m_currentCharacter8;
    UChar* m_currentCharacter16;
+    // During parsing of an ASCII stylesheet we might locate escape
+    // sequences that expand into UTF-16 code points. Strings,
+    // identifiers and URIs containing such escape sequences are
+    // stored in m_cssStrings16 so that we don't have to store the
+    // whole stylesheet as UTF-16.
+    Vector<OwnPtr<UChar[]> > m_cssStrings16;
    union {
        LChar* ptr8;
        UChar* ptr16;