Commit 1aac036d authored by bratell@opera.com

Smaller CSSParser UTF16 buffers for escaped strings.

If an ASCII stylesheet contains escaped characters, those need to be
unescaped and stored somewhere. The code used to allocate a buffer the
size of the whole CSS file * sizeof(UChar). That meant allocating 700 KB
on arstechnica.com to store a single character. Instead, store the
supposedly rare characters in special buffers of appropriate sizes.

BUG=352544

Review URL: https://codereview.chromium.org/196353018

git-svn-id: svn://svn.chromium.org/blink/trunk@170451 bbb929c8-8fbe-4397-9dbb-9b2b20218538
parent f0c3863a
...@@ -304,14 +304,17 @@ inline UChar*& CSSTokenizer::currentCharacter<UChar>() ...@@ -304,14 +304,17 @@ inline UChar*& CSSTokenizer::currentCharacter<UChar>()
return m_currentCharacter16; return m_currentCharacter16;
} }
UChar*& CSSTokenizer::currentCharacter16() UChar* CSSTokenizer::allocateStringBuffer16(size_t len)
{ {
if (!m_currentCharacter16) { // Allocates and returns a CSSTokenizer owned buffer for storing
m_dataStart16 = adoptArrayPtr(new UChar[m_length]); // UTF-16 data. Used to get a suitable life span for UTF-16
m_currentCharacter16 = m_dataStart16.get(); // strings, identifiers and URIs created by the tokenizer.
} OwnPtr<UChar[]> buffer = adoptArrayPtr(new UChar[len]);
return m_currentCharacter16; UChar* bufferPtr = buffer.get();
m_cssStrings16.append(buffer.release());
return bufferPtr;
} }
template <> template <>
...@@ -412,7 +415,7 @@ unsigned CSSTokenizer::parseEscape(CharacterType*& src) ...@@ -412,7 +415,7 @@ unsigned CSSTokenizer::parseEscape(CharacterType*& src)
return unicode; return unicode;
} }
return *currentCharacter<CharacterType>()++; return *src++;
} }
template <> template <>
...@@ -438,6 +441,24 @@ inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode ...@@ -438,6 +441,24 @@ inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode
++result; ++result;
} }
// Returns the number of source characters spanned by the identifier that
// begins at |src|. The caller-owned pointer is not advanced; |src| is a
// local copy.
//
// The result is used as an upper bound when sizing a buffer for the decoded
// identifier: resolving escape sequences never yields more characters
// (ASCII or UTF-16 code units) than the escaped input occupied, so the
// escapes only have to be stepped over here, not decoded into a buffer.
template <typename SrcCharacterType>
size_t CSSTokenizer::peekMaxIdentifierLen(SrcCharacterType* src)
{
    SrcCharacterType* const begin = src;
    for (;;) {
        // The character at the current position is always consumed, either
        // as a single plain character or as one whole escape sequence.
        if (LIKELY(*src != '\\'))
            ++src;
        else
            parseEscape<SrcCharacterType>(src);

        // Keep scanning while the identifier continues with a letter...
        if (isCSSLetter(src[0]))
            continue;
        // ...or with a backslash that starts a valid escape.
        if (src[0] == '\\' && isCSSEscape(src[1]))
            continue;
        break;
    }
    return src - begin;
}
template <typename SrcCharacterType, typename DestCharacterType> template <typename SrcCharacterType, typename DestCharacterType>
inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCharacterType*& result, bool& hasEscape) inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCharacterType*& result, bool& hasEscape)
{ {
...@@ -471,7 +492,7 @@ inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserStrin ...@@ -471,7 +492,7 @@ inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserStrin
if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), result, hasEscape))) { if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), result, hasEscape))) {
// Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
ASSERT(is8BitSource()); ASSERT(is8BitSource());
UChar*& result16 = currentCharacter16(); UChar* result16 = allocateStringBuffer16((result - start) + peekMaxIdentifierLen(result));
UChar* start16 = result16; UChar* start16 = result16;
int i = 0; int i = 0;
for (; i < result - start; i++) for (; i < result - start; i++)
...@@ -489,6 +510,18 @@ inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserStrin ...@@ -489,6 +510,18 @@ inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserStrin
resultString.init(start, result - start); resultString.init(start, result - start);
} }
// Returns the length of the raw CSS string starting at |src| and delimited
// by |quote|, measured up to the position reported by checkAndSkipString()
// (possibly including the terminating quote, if any). Returns 0 when
// checkAndSkipString() yields a null pointer.
// NOTE(review): presumably null means no well-formed string was found at
// |src| -- confirm against checkAndSkipString().
//
// The result is used as an upper bound when sizing a buffer for the decoded
// string: resolving escape sequences never yields more characters (ASCII or
// UTF-16 code units) than the input, so escapes need no special handling
// here.
template <typename SrcCharacterType>
size_t CSSTokenizer::peekMaxStringLen(SrcCharacterType* src, UChar quote)
{
    SrcCharacterType* const stringEnd = checkAndSkipString(src, quote);
    if (!stringEnd)
        return 0;
    return stringEnd - src;
}
template <typename SrcCharacterType, typename DestCharacterType> template <typename SrcCharacterType, typename DestCharacterType>
inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharacterType*& result, UChar quote) inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharacterType*& result, UChar quote)
{ {
...@@ -532,7 +565,7 @@ inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& r ...@@ -532,7 +565,7 @@ inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& r
if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) { if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) {
// Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
ASSERT(is8BitSource()); ASSERT(is8BitSource());
UChar*& result16 = currentCharacter16(); UChar* result16 = allocateStringBuffer16((result - start) + peekMaxStringLen(result, quote));
UChar* start16 = result16; UChar* start16 = result16;
int i = 0; int i = 0;
for (; i < result - start; i++) for (; i < result - start; i++)
...@@ -580,6 +613,29 @@ inline bool CSSTokenizer::findURI(CharacterType*& start, CharacterType*& end, UC ...@@ -580,6 +613,29 @@ inline bool CSSTokenizer::findURI(CharacterType*& start, CharacterType*& end, UC
return true; return true;
} }
// Returns the number of source characters spanned by the URI that begins at
// |src|. A non-zero |quote| must be '"' or '\'' and selects the quoted form,
// which is measured as a CSS string; a zero |quote| selects the unquoted
// form, which runs for as long as isURILetter() accepts the current
// character.
//
// The result is used as an upper bound when sizing a buffer for the decoded
// URI: resolving escape sequences never yields more characters (ASCII or
// UTF-16 code units) than the input, so the escapes only have to be stepped
// over here, not decoded into a buffer.
template <typename SrcCharacterType>
inline size_t CSSTokenizer::peekMaxURILen(SrcCharacterType* src, UChar quote)
{
    // Quoted URI: defer to the string measurement.
    if (quote) {
        ASSERT(quote == '"' || quote == '\'');
        return peekMaxStringLen(src, quote);
    }

    // Unquoted URI: walk plain characters one at a time and escape
    // sequences as a unit.
    SrcCharacterType* const begin = src;
    for (; isURILetter(*src);) {
        if (LIKELY(*src != '\\')) {
            ++src;
            continue;
        }
        parseEscape<SrcCharacterType>(src);
    }
    return src - begin;
}
template <typename SrcCharacterType, typename DestCharacterType> template <typename SrcCharacterType, typename DestCharacterType>
inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacterType*& dest, UChar quote) inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacterType*& dest, UChar quote)
{ {
...@@ -593,7 +649,7 @@ inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacter ...@@ -593,7 +649,7 @@ inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacter
*dest++ = *src++; *dest++ = *src++;
} else { } else {
unsigned unicode = parseEscape<SrcCharacterType>(src); unsigned unicode = parseEscape<SrcCharacterType>(src);
if (unicode > 0xff && sizeof(SrcCharacterType) == 1) if (unicode > 0xff && sizeof(DestCharacterType) == 1)
return false; return false;
UnicodeToChars(dest, unicode); UnicodeToChars(dest, unicode);
} }
...@@ -619,11 +675,12 @@ inline void CSSTokenizer::parseURI(CSSParserString& string) ...@@ -619,11 +675,12 @@ inline void CSSTokenizer::parseURI(CSSParserString& string)
// Reset the current character to the start of the URI and re-parse with // Reset the current character to the start of the URI and re-parse with
// a 16-bit destination. // a 16-bit destination.
ASSERT(is8BitSource()); ASSERT(is8BitSource());
UChar* uriStart16 = currentCharacter16(); UChar* result16 = allocateStringBuffer16(peekMaxURILen(uriStart, quote));
UChar* uriStart16 = result16;
currentCharacter<CharacterType>() = uriStart; currentCharacter<CharacterType>() = uriStart;
bool result = parseURIInternal(currentCharacter<CharacterType>(), currentCharacter16(), quote); bool result = parseURIInternal(currentCharacter<CharacterType>(), result16, quote);
ASSERT_UNUSED(result, result); ASSERT_UNUSED(result, result);
string.init(uriStart16, currentCharacter16() - uriStart16); string.init(uriStart16, result16 - uriStart16);
} }
currentCharacter<CharacterType>() = uriEnd + 1; currentCharacter<CharacterType>() = uriEnd + 1;
......
...@@ -74,7 +74,7 @@ public: ...@@ -74,7 +74,7 @@ public:
inline unsigned tokenStartOffset(); inline unsigned tokenStartOffset();
private: private:
UChar*& currentCharacter16(); UChar* allocateStringBuffer16(size_t len);
template <typename CharacterType> template <typename CharacterType>
inline CharacterType*& currentCharacter(); inline CharacterType*& currentCharacter();
...@@ -92,29 +92,33 @@ private: ...@@ -92,29 +92,33 @@ private:
inline CSSParserLocation tokenLocation(); inline CSSParserLocation tokenLocation();
template <typename CharacterType> template <typename CharacterType>
unsigned parseEscape(CharacterType*&); static unsigned parseEscape(CharacterType*&);
template <typename DestCharacterType> template <typename DestCharacterType>
inline void UnicodeToChars(DestCharacterType*&, unsigned); static inline void UnicodeToChars(DestCharacterType*&, unsigned);
template <typename SrcCharacterType, typename DestCharacterType>
inline bool parseIdentifierInternal(SrcCharacterType*&, DestCharacterType*&, bool&);
template <typename SrcCharacterType, typename DestCharacterType>
static inline bool parseIdentifierInternal(SrcCharacterType*&, DestCharacterType*&, bool&);
template <typename SrcCharacterType>
static size_t peekMaxIdentifierLen(SrcCharacterType*);
template <typename CharacterType> template <typename CharacterType>
inline void parseIdentifier(CharacterType*&, CSSParserString&, bool&); inline void parseIdentifier(CharacterType*&, CSSParserString&, bool&);
template <typename SrcCharacterType>
static size_t peekMaxStringLen(SrcCharacterType*, UChar quote);
template <typename SrcCharacterType, typename DestCharacterType> template <typename SrcCharacterType, typename DestCharacterType>
inline bool parseStringInternal(SrcCharacterType*&, DestCharacterType*&, UChar); static inline bool parseStringInternal(SrcCharacterType*&, DestCharacterType*&, UChar);
template <typename CharacterType> template <typename CharacterType>
inline void parseString(CharacterType*&, CSSParserString& resultString, UChar); inline void parseString(CharacterType*&, CSSParserString& resultString, UChar);
template <typename CharacterType> template <typename CharacterType>
inline bool findURI(CharacterType*& start, CharacterType*& end, UChar& quote); inline bool findURI(CharacterType*& start, CharacterType*& end, UChar& quote);
template <typename SrcCharacterType>
static size_t peekMaxURILen(SrcCharacterType*, UChar quote);
template <typename SrcCharacterType, typename DestCharacterType> template <typename SrcCharacterType, typename DestCharacterType>
inline bool parseURIInternal(SrcCharacterType*&, DestCharacterType*&, UChar quote); static inline bool parseURIInternal(SrcCharacterType*&, DestCharacterType*&, UChar quote);
template <typename CharacterType> template <typename CharacterType>
inline void parseURI(CSSParserString&); inline void parseURI(CSSParserString&);
template <typename CharacterType> template <typename CharacterType>
inline bool parseUnicodeRange(); inline bool parseUnicodeRange();
template <typename CharacterType> template <typename CharacterType>
...@@ -155,6 +159,13 @@ private: ...@@ -155,6 +159,13 @@ private:
OwnPtr<UChar[]> m_dataStart16; OwnPtr<UChar[]> m_dataStart16;
LChar* m_currentCharacter8; LChar* m_currentCharacter8;
UChar* m_currentCharacter16; UChar* m_currentCharacter16;
// During parsing of an ASCII stylesheet we might locate escape
// sequences that expand into UTF-16 code points. Strings,
// identifiers and URIs containing such escape sequences are
// stored in m_cssStrings16 so that we don't have to store the
// whole stylesheet as UTF-16.
Vector<OwnPtr<UChar[]> > m_cssStrings16;
union { union {
LChar* ptr8; LChar* ptr8;
UChar* ptr16; UChar* ptr16;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment