CSS Tokenizer: Fix handling of escaped newlines

This patch fixes our handling of escaped newlines in the css-syntax tokenizer. Since we don't perform preprocessing, the logic is slightly trickier than in the spec. The spec's preprocessing step replaces \r, \r\n, and \f with \n. Regarding escaped newlines in strings, the spec states: "Otherwise, if the next input code point is a newline, consume it." In this case, we may need to consume two code points if we have \r\n. The other cases don't require special handling, since \r\n starts with \r (also a newline) and in these cases the following token is just going to be a whitespace token. BUG=424988 Review URL: https://codereview.chromium.org/656033010 git-svn-id: svn://svn.chromium.org/blink/trunk@184347 bbb929c8-8fbe-4397-9dbb-9b2b20218538

CSS Tokenizer: Fix handling of escaped newlines
This patch fixes our handling of escaped newlines in the css-syntax tokenizer. Since we don't perform preprocessing, the logic is slightly trickier than in the spec. The spec's preprocessing step replaces \r, \r\n, and \f with \n. Regarding escaped newlines in strings, the spec states: "Otherwise, if the next input code point is a newline, consume it." In this case, we may need to consume two code points if we have \r\n. The other cases don't require special handling, since \r\n starts with \r (also a newline) and in these cases the following token is just going to be a whitespace token. BUG=424988 Review URL: https://codereview.chromium.org/656033010 git-svn-id: svn://svn.chromium.org/blink/trunk@184347 bbb929c8-8fbe-4397-9dbb-9b2b20218538
c1a02303 · timloh@chromium.org · f4cefe72 · c1a02303 · c1a02303
Commit c1a02303 authored Oct 24, 2014 by timloh@chromium.org
2 changed files
--- a/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp
+++ b/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp
@@ -31,10 +31,16 @@ static bool isNameChar(UChar c)
    return isNameStart(c) || isASCIIDigit(c) || c == '-';
 }
+static bool isNewLine(UChar cc)
+{
+    // We check \r and \f here, since we have no preprocessing stage
+    return (cc == '\r' || cc == '\n' || cc == '\f');
+}
 // http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape
 static bool twoCharsAreValidEscape(UChar first, UChar second)
 {
-    return ((first == '\\') && (second != '\n') && (second != kEndOfFileMarker));
+    return first == '\\' && !isNewLine(second) && second != kEndOfFileMarker;
 }
 CSSTokenizer::CSSTokenizer(CSSTokenizerInputStream& inputStream)
@@ -353,12 +359,6 @@ CSSParserToken CSSTokenizer::consumeIdentLikeToken()
    return CSSParserToken(IdentToken, name);
 }
-static bool isNewLine(UChar cc)
-{
-    // We check \r and \f here, since we have no preprocessing stage
-    return (cc == '\r' || cc == '\n' || cc == '\f');
-}
 // http://dev.w3.org/csswg/css-syntax/#consume-a-string-token
 CSSParserToken CSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint)
 {
@@ -379,7 +379,7 @@ CSSParserToken CSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint)
            if (m_input.nextInputChar() == kEndOfFileMarker)
                continue;
            if (isNewLine(m_input.nextInputChar()))
-                consume();
+                consumeSingleWhitespaceIfNext(); // This handles \r\n for us
            else
                output.append(consumeEscape());
        } else {
@@ -456,7 +456,7 @@ String CSSTokenizer::consumeName()
 UChar CSSTokenizer::consumeEscape()
 {
    UChar cc = consume();
-    ASSERT(cc != '\n');
+    ASSERT(!isNewLine(cc));
    if (isASCIIHexDigit(cc)) {
        unsigned consumedHexDigits = 1;
        StringBuilder hexChars;

--- a/third_party/WebKit/Source/core/css/parser/CSSTokenizerTest.cpp
+++ b/third_party/WebKit/Source/core/css/parser/CSSTokenizerTest.cpp
@@ -157,6 +157,9 @@ TEST(CSSTokenizerTest, Escapes)
    TEST_TOKENS("te\\s\\t", ident("test"));
    TEST_TOKENS("spaces\\ in\\\tident", ident("spaces in\tident"));
    TEST_TOKENS("\\.\\,\\:\\!", ident(".,:!"));
+    TEST_TOKENS("\\\r", delim('\\'), whitespace);
+    TEST_TOKENS("\\\f", delim('\\'), whitespace);
+    TEST_TOKENS("\\\r\n", delim('\\'), whitespace);
    // FIXME: We don't correctly return replacement characters
    // String replacement = fromUChar32(0xFFFD);
    // TEST_TOKENS("null\\0", ident("null" + replacement));
@@ -168,10 +171,6 @@ TEST(CSSTokenizerTest, Escapes)
    // FIXME: We don't correctly return supplementary plane characters
    // TEST_TOKENS("\\10fFfF", ident(fromUChar32(0x10ffff) + "0"));
    // TEST_TOKENS("\\10000000", ident(fromUChar32(0x100000) + "000"));
-    // FIXME: We don't correctly match newlines (normally handled in preprocessing)
-    // TEST_TOKENS("\\\r", delim('\\'), whitespace);
-    // TEST_TOKENS("\\\f", delim('\\'), whitespace);
-    // TEST_TOKENS("\\\r\n", delim('\\'), whitespace);
 }
 TEST(CSSTokenizerTest, IdentToken)
@@ -218,14 +217,13 @@ TEST(CSSTokenizerTest, StringToken)
    TEST_TOKENS("'esca\\\nped'", string("escaped"));
    TEST_TOKENS("\"esc\\\faped\"", string("escaped"));
    TEST_TOKENS("'new\\\rline'", string("newline"));
+    TEST_TOKENS("\"new\\\r\nline\"", string("newline"));
    TEST_TOKENS("'bad\nstring", badString, whitespace, ident("string"));
    TEST_TOKENS("'bad\rstring", badString, whitespace, ident("string"));
    TEST_TOKENS("'bad\r\nstring", badString, whitespace, ident("string"));
    TEST_TOKENS("'bad\fstring", badString, whitespace, ident("string"));
    // FIXME: Preprocessing is supposed to replace U+0000 with U+FFFD
    // TEST_TOKENS("'\0'", string(fromUChar32(0xFFFD)));
-    // FIXME: We don't correctly match newlines (normally handled in preprocessing)
-    // TEST_TOKENS("\"new\\\r\nline\"", string("newline"));
 }
 TEST(CSSTokenizerTest, NumberToken)