Script streaming: remove byte order marks, detect encoding based on them.

Byte order mark detection needs to be done on the Blink side, since it also affects encoding detection. This also enables the preparse data for streamed scripts again, because the corrupt data was caused by byte order marks not being detected properly. BUG=430890 Review URL: https://codereview.chromium.org/708093002 git-svn-id: svn://svn.chromium.org/blink/trunk@185117 bbb929c8-8fbe-4397-9dbb-9b2b20218538

Script streaming: remove byte order marks, detect encoding based on them.
Byte order mark detection needs to be done on the Blink side, since it also affects encoding detection. This also enables the preparse data for streamed scripts again, because the corrupt data was caused by byte order marks not being detected properly. BUG=430890 Review URL: https://codereview.chromium.org/708093002 git-svn-id: svn://svn.chromium.org/blink/trunk@185117 bbb929c8-8fbe-4397-9dbb-9b2b20218538
444b4466 · marja@chromium.org · 4adf734f · 444b4466 · 444b4466 · 444b4466
Commit 444b4466 authored Nov 11, 2014 by marja@chromium.org
4 changed files
--- a/third_party/WebKit/Source/bindings/core/v8/ScriptStreamer.cpp
+++ b/third_party/WebKit/Source/bindings/core/v8/ScriptStreamer.cpp
@@ -12,6 +12,7 @@
 #include "core/dom/PendingScript.h"
 #include "core/fetch/ScriptResource.h"
 #include "core/frame/Settings.h"
+#include "core/html/parser/TextResourceDecoder.h"
 #include "platform/SharedBuffer.h"
 #include "platform/TraceEvent.h"
 #include "public/platform/Platform.h"
@@ -123,10 +124,10 @@ public:
        m_dataQueue.finish();
    }
-    void didReceiveData()
+    void didReceiveData(size_t lengthOfBOM)
    {
        ASSERT(isMainThread());
-        prepareDataOnMainThread();
+        prepareDataOnMainThread(lengthOfBOM);
    }
    void cancel()
@@ -145,7 +146,7 @@ public:
    }
 private:
-    void prepareDataOnMainThread()
+    void prepareDataOnMainThread(size_t lengthOfBOM)
    {
        ASSERT(isMainThread());
        // The Resource must still be alive; otherwise we should've cancelled
@@ -153,6 +154,9 @@ private:
        // waiting).
        ASSERT(m_streamer->resource());
+        // BOM can only occur at the beginning of the data.
+        ASSERT(lengthOfBOM == 0 || m_dataPosition == 0);
        if (m_streamer->resource()->cachedMetadata(V8ScriptRunner::tagForCodeCache())) {
            // The resource has a code cache, so it's unnecessary to stream and
            // parse the code. Cancel the streaming and resume the non-streaming
@@ -169,8 +173,6 @@ private:
        if (!m_resourceBuffer) {
            // We don't have a buffer yet. Try to get it from the resource.
            SharedBuffer* buffer = m_streamer->resource()->resourceBuffer();
-            if (!buffer)
-                return;
            m_resourceBuffer = RefPtr<SharedBuffer>(buffer);
        }
@@ -190,12 +192,15 @@ private:
        }
        // Copy the data chunks into a new buffer, since we're going to give the
        // data to a background thread.
-        if (dataLength > 0) {
+        if (dataLength > lengthOfBOM) {
+            dataLength -= lengthOfBOM;
            uint8_t* copiedData = new uint8_t[dataLength];
            unsigned offset = 0;
            for (size_t i = 0; i < chunks.size(); ++i) {
-                memcpy(copiedData + offset, chunks[i], chunkLengths[i]);
+                memcpy(copiedData + offset, chunks[i] + lengthOfBOM, chunkLengths[i] - lengthOfBOM);
-                offset += chunkLengths[i];
+                offset += chunkLengths[i] - lengthOfBOM;
+                // BOM is only in the first chunk
+                lengthOfBOM = 0;
            }
            m_dataQueue.produce(copiedData, dataLength);
        }
@@ -286,21 +291,34 @@ void ScriptStreamer::notifyAppendData(ScriptResource* resource)
        if (m_streamingSuppressed)
            return;
    }
+    size_t lengthOfBOM = 0;
    if (!m_haveEnoughDataForStreaming) {
        // Even if the first data chunk is small, the script can still be big
        // enough - wait until the next data chunk comes before deciding whether
        // to start the streaming.
-        if (resource->resourceBuffer()->size() < kSmallScriptThreshold) {
+        ASSERT(resource->resourceBuffer());
+        if (resource->resourceBuffer()->size() < kSmallScriptThreshold)
            return;
-        }
        m_haveEnoughDataForStreaming = true;
        const char* histogramName = startedStreamingHistogramName(m_scriptType);
        // Encoding should be detected only when we have some data. It's
        // possible that resource->encoding() returns a different encoding
-        // before the loading has started and after we got some data.
+        // before the loading has started and after we got some data.  In
-        WTF::TextEncoding textEncoding(resource->encoding());
+        // addition, check for byte order marks. Note that checking the byte
-        const char* encodingName = textEncoding.name();
+        // order mark might change the encoding. We cannot decode the full text
+        // here, because it might contain incomplete UTF-8 characters. Also note
+        // that have at least kSmallScriptThreshold worth of data, which is more
+        // than enough for detecting a BOM.
+        const char* data = 0;
+        unsigned length = resource->resourceBuffer()->getSomeData(data, 0);
+        OwnPtr<TextResourceDecoder> decoder(TextResourceDecoder::create("application/javascript", resource->encoding()));
+        lengthOfBOM = decoder->checkForBOM(data, length);
+        // Maybe the encoding changed because we saw the BOM; get the encoding
+        // from the decoder.
+        const char* encodingName = decoder->encoding().name();
        // Here's a list of encodings we can use for streaming. These are
        // the canonical names.
@@ -313,9 +331,9 @@ void ScriptStreamer::notifyAppendData(ScriptResource* resource)
            encoding = v8::ScriptCompiler::StreamedSource::UTF8;
        } else {
            // We don't stream other encodings; especially we don't stream two
-            // byte scripts to avoid the handling of byte order marks. Most
+            // byte scripts to avoid the handling of endianness. Most scripts
-            // scripts are Latin1 or UTF-8 anyway, so this should be enough for
+            // are Latin1 or UTF-8 anyway, so this should be enough for most
-            // most real world purposes.
+            // real world purposes.
            suppressStreaming();
            blink::Platform::current()->histogramEnumeration(histogramName, 0, 2);
            return;
@@ -363,7 +381,7 @@ void ScriptStreamer::notifyAppendData(ScriptResource* resource)
        blink::Platform::current()->histogramEnumeration(histogramName, 1, 2);
    }
    if (m_stream)
-        m_stream->didReceiveData();
+        m_stream->didReceiveData(lengthOfBOM);
 }
 void ScriptStreamer::notifyFinished(Resource* resource)

--- a/third_party/WebKit/Source/bindings/core/v8/ScriptStreamerTest.cpp
+++ b/third_party/WebKit/Source/bindings/core/v8/ScriptStreamerTest.cpp
@@ -86,11 +86,11 @@ protected:
    void appendData(const char* data)
    {
        m_resource->appendData(data, strlen(data));
-        // Yield control to the background thread, so that V8 gets a change to
+        // Yield control to the background thread, so that V8 gets a chance to
        // process the data before the main thread adds more. Note that we
        // cannot fully control in what kind of chunks the data is passed to V8
-        // (if the V8 is not requesting more data between two appendData calls,
+        // (if V8 is not requesting more data between two appendData calls, it
-        // V8 will get both chunks together).
+        // will get both chunks together).
        Platform::current()->yieldCurrentThread();
    }
@@ -364,6 +364,34 @@ TEST_P(ScriptStreamingTest, EncodingChanges)
    EXPECT_FALSE(tryCatch.HasCaught());
 }
+TEST_P(ScriptStreamingTest, EncodingFromBOM)
+{
+    // Byte order marks should be removed before giving the data to V8. They
+    // will also affect encoding detection.
+    m_resource->setEncoding("windows-1252"); // This encoding is wrong on purpose.
+    ScriptStreamer::startStreaming(pendingScript(), m_settings.get(), m_scope.scriptState(), PendingScript::ParsingBlocking);
+    TestScriptResourceClient client;
+    pendingScript().watchForLoad(&client);
+    // \xef\xbb\xbf is the UTF-8 byte order mark. \xec\x92\x81 are the raw bytes
+    // for \uc481.
+    appendData("\xef\xbb\xbf function foo() { var foob\xec\x92\x81r = 13; return foob\xec\x92\x81r; } foo();");
+    finish();
+    processTasksUntilStreamingComplete();
+    EXPECT_TRUE(client.finished());
+    bool errorOccurred = false;
+    ScriptSourceCode sourceCode = pendingScript().getSource(KURL(), errorOccurred);
+    EXPECT_FALSE(errorOccurred);
+    EXPECT_TRUE(sourceCode.streamer());
+    v8::TryCatch tryCatch;
+    v8::Handle<v8::Script> script = V8ScriptRunner::compileScript(sourceCode, isolate());
+    EXPECT_FALSE(script.IsEmpty());
+    EXPECT_FALSE(tryCatch.HasCaught());
+}
 INSTANTIATE_TEST_CASE_P(ScriptStreamingInstantiation, ScriptStreamingTest, ::testing::Values(false, true));
 } // namespace

--- a/third_party/WebKit/Source/bindings/core/v8/V8ScriptRunner.cpp
+++ b/third_party/WebKit/Source/bindings/core/v8/V8ScriptRunner.cpp
@@ -127,9 +127,7 @@ v8::Local<v8::Script> V8ScriptRunner::compileScript(v8::Handle<v8::String> code,
        const v8::ScriptCompiler::CachedData* newCachedData = streamer->source()->GetCachedData();
        if (newCachedData) {
            resource->clearCachedMetadata();
-            // Temporarily disable cached metadata for streaming; it's
+            resource->setCachedMetadata(streamer->cachedDataType(), reinterpret_cast<const char*>(newCachedData->data), newCachedData->length, Resource::CacheLocally);
-            // producing broken data. FIXME: enable it again.
-            // resource->setCachedMetadata(streamer->cachedDataType(), reinterpret_cast<const char*>(newCachedData->data), newCachedData->length);
        }
    } else if (!resource || !resource->url().protocolIsInHTTPFamily() || code->Length() < 1024) {
        v8::ScriptCompiler::Source source(code, origin);

--- a/third_party/WebKit/Source/core/html/parser/TextResourceDecoder.h
+++ b/third_party/WebKit/Source/core/html/parser/TextResourceDecoder.h
@@ -68,6 +68,7 @@ public:
    void useLenientXMLDecoding() { m_useLenientXMLDecoding = true; }
    bool sawError() const { return m_sawError; }
+    size_t checkForBOM(const char*, size_t);
 private:
    TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& defaultEncoding, bool usesEncodingDetector);
@@ -76,7 +77,6 @@ private:
    static ContentType determineContentType(const String& mimeType);
    static const WTF::TextEncoding& defaultEncoding(ContentType, const WTF::TextEncoding& defaultEncoding);
-    size_t checkForBOM(const char*, size_t);
    bool checkForCSSCharset(const char*, size_t, bool& movedDataToBuffer);
    bool checkForXMLCharset(const char*, size_t, bool& movedDataToBuffer);
    void checkForMetaCharset(const char*, size_t);