Commit 444b4466 authored by marja@chromium.org's avatar marja@chromium.org

Script streaming: remove byte order marks, detect encoding based on them.

Byte order mark detection needs to be done on the Blink side, since it also
affects encoding detection.

This also enables the preparse data for streamed scripts again, because the
corrupt data was caused by byte order marks not being detected properly.

BUG=430890

Review URL: https://codereview.chromium.org/708093002

git-svn-id: svn://svn.chromium.org/blink/trunk@185117 bbb929c8-8fbe-4397-9dbb-9b2b20218538
parent 4adf734f
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include "core/dom/PendingScript.h" #include "core/dom/PendingScript.h"
#include "core/fetch/ScriptResource.h" #include "core/fetch/ScriptResource.h"
#include "core/frame/Settings.h" #include "core/frame/Settings.h"
#include "core/html/parser/TextResourceDecoder.h"
#include "platform/SharedBuffer.h" #include "platform/SharedBuffer.h"
#include "platform/TraceEvent.h" #include "platform/TraceEvent.h"
#include "public/platform/Platform.h" #include "public/platform/Platform.h"
...@@ -123,10 +124,10 @@ public: ...@@ -123,10 +124,10 @@ public:
m_dataQueue.finish(); m_dataQueue.finish();
} }
void didReceiveData() void didReceiveData(size_t lengthOfBOM)
{ {
ASSERT(isMainThread()); ASSERT(isMainThread());
prepareDataOnMainThread(); prepareDataOnMainThread(lengthOfBOM);
} }
void cancel() void cancel()
...@@ -145,7 +146,7 @@ public: ...@@ -145,7 +146,7 @@ public:
} }
private: private:
void prepareDataOnMainThread() void prepareDataOnMainThread(size_t lengthOfBOM)
{ {
ASSERT(isMainThread()); ASSERT(isMainThread());
// The Resource must still be alive; otherwise we should've cancelled // The Resource must still be alive; otherwise we should've cancelled
...@@ -153,6 +154,9 @@ private: ...@@ -153,6 +154,9 @@ private:
// waiting). // waiting).
ASSERT(m_streamer->resource()); ASSERT(m_streamer->resource());
// BOM can only occur at the beginning of the data.
ASSERT(lengthOfBOM == 0 || m_dataPosition == 0);
if (m_streamer->resource()->cachedMetadata(V8ScriptRunner::tagForCodeCache())) { if (m_streamer->resource()->cachedMetadata(V8ScriptRunner::tagForCodeCache())) {
// The resource has a code cache, so it's unnecessary to stream and // The resource has a code cache, so it's unnecessary to stream and
// parse the code. Cancel the streaming and resume the non-streaming // parse the code. Cancel the streaming and resume the non-streaming
...@@ -169,8 +173,6 @@ private: ...@@ -169,8 +173,6 @@ private:
if (!m_resourceBuffer) { if (!m_resourceBuffer) {
// We don't have a buffer yet. Try to get it from the resource. // We don't have a buffer yet. Try to get it from the resource.
SharedBuffer* buffer = m_streamer->resource()->resourceBuffer(); SharedBuffer* buffer = m_streamer->resource()->resourceBuffer();
if (!buffer)
return;
m_resourceBuffer = RefPtr<SharedBuffer>(buffer); m_resourceBuffer = RefPtr<SharedBuffer>(buffer);
} }
...@@ -190,12 +192,15 @@ private: ...@@ -190,12 +192,15 @@ private:
} }
// Copy the data chunks into a new buffer, since we're going to give the // Copy the data chunks into a new buffer, since we're going to give the
// data to a background thread. // data to a background thread.
if (dataLength > 0) { if (dataLength > lengthOfBOM) {
dataLength -= lengthOfBOM;
uint8_t* copiedData = new uint8_t[dataLength]; uint8_t* copiedData = new uint8_t[dataLength];
unsigned offset = 0; unsigned offset = 0;
for (size_t i = 0; i < chunks.size(); ++i) { for (size_t i = 0; i < chunks.size(); ++i) {
memcpy(copiedData + offset, chunks[i], chunkLengths[i]); memcpy(copiedData + offset, chunks[i] + lengthOfBOM, chunkLengths[i] - lengthOfBOM);
offset += chunkLengths[i]; offset += chunkLengths[i] - lengthOfBOM;
// BOM is only in the first chunk
lengthOfBOM = 0;
} }
m_dataQueue.produce(copiedData, dataLength); m_dataQueue.produce(copiedData, dataLength);
} }
...@@ -286,21 +291,34 @@ void ScriptStreamer::notifyAppendData(ScriptResource* resource) ...@@ -286,21 +291,34 @@ void ScriptStreamer::notifyAppendData(ScriptResource* resource)
if (m_streamingSuppressed) if (m_streamingSuppressed)
return; return;
} }
size_t lengthOfBOM = 0;
if (!m_haveEnoughDataForStreaming) { if (!m_haveEnoughDataForStreaming) {
// Even if the first data chunk is small, the script can still be big // Even if the first data chunk is small, the script can still be big
// enough - wait until the next data chunk comes before deciding whether // enough - wait until the next data chunk comes before deciding whether
// to start the streaming. // to start the streaming.
if (resource->resourceBuffer()->size() < kSmallScriptThreshold) { ASSERT(resource->resourceBuffer());
if (resource->resourceBuffer()->size() < kSmallScriptThreshold)
return; return;
}
m_haveEnoughDataForStreaming = true; m_haveEnoughDataForStreaming = true;
const char* histogramName = startedStreamingHistogramName(m_scriptType); const char* histogramName = startedStreamingHistogramName(m_scriptType);
// Encoding should be detected only when we have some data. It's // Encoding should be detected only when we have some data. It's
// possible that resource->encoding() returns a different encoding // possible that resource->encoding() returns a different encoding
// before the loading has started and after we got some data. // before the loading has started and after we got some data. In
WTF::TextEncoding textEncoding(resource->encoding()); // addition, check for byte order marks. Note that checking the byte
const char* encodingName = textEncoding.name(); // order mark might change the encoding. We cannot decode the full text
// here, because it might contain incomplete UTF-8 characters. Also note
// that have at least kSmallScriptThreshold worth of data, which is more
// than enough for detecting a BOM.
const char* data = 0;
unsigned length = resource->resourceBuffer()->getSomeData(data, 0);
OwnPtr<TextResourceDecoder> decoder(TextResourceDecoder::create("application/javascript", resource->encoding()));
lengthOfBOM = decoder->checkForBOM(data, length);
// Maybe the encoding changed because we saw the BOM; get the encoding
// from the decoder.
const char* encodingName = decoder->encoding().name();
// Here's a list of encodings we can use for streaming. These are // Here's a list of encodings we can use for streaming. These are
// the canonical names. // the canonical names.
...@@ -313,9 +331,9 @@ void ScriptStreamer::notifyAppendData(ScriptResource* resource) ...@@ -313,9 +331,9 @@ void ScriptStreamer::notifyAppendData(ScriptResource* resource)
encoding = v8::ScriptCompiler::StreamedSource::UTF8; encoding = v8::ScriptCompiler::StreamedSource::UTF8;
} else { } else {
// We don't stream other encodings; especially we don't stream two // We don't stream other encodings; especially we don't stream two
// byte scripts to avoid the handling of byte order marks. Most // byte scripts to avoid the handling of endianness. Most scripts
// scripts are Latin1 or UTF-8 anyway, so this should be enough for // are Latin1 or UTF-8 anyway, so this should be enough for most
// most real world purposes. // real world purposes.
suppressStreaming(); suppressStreaming();
blink::Platform::current()->histogramEnumeration(histogramName, 0, 2); blink::Platform::current()->histogramEnumeration(histogramName, 0, 2);
return; return;
...@@ -363,7 +381,7 @@ void ScriptStreamer::notifyAppendData(ScriptResource* resource) ...@@ -363,7 +381,7 @@ void ScriptStreamer::notifyAppendData(ScriptResource* resource)
blink::Platform::current()->histogramEnumeration(histogramName, 1, 2); blink::Platform::current()->histogramEnumeration(histogramName, 1, 2);
} }
if (m_stream) if (m_stream)
m_stream->didReceiveData(); m_stream->didReceiveData(lengthOfBOM);
} }
void ScriptStreamer::notifyFinished(Resource* resource) void ScriptStreamer::notifyFinished(Resource* resource)
......
...@@ -86,11 +86,11 @@ protected: ...@@ -86,11 +86,11 @@ protected:
void appendData(const char* data) void appendData(const char* data)
{ {
m_resource->appendData(data, strlen(data)); m_resource->appendData(data, strlen(data));
// Yield control to the background thread, so that V8 gets a change to // Yield control to the background thread, so that V8 gets a chance to
// process the data before the main thread adds more. Note that we // process the data before the main thread adds more. Note that we
// cannot fully control in what kind of chunks the data is passed to V8 // cannot fully control in what kind of chunks the data is passed to V8
// (if the V8 is not requesting more data between two appendData calls, // (if V8 is not requesting more data between two appendData calls, it
// V8 will get both chunks together). // will get both chunks together).
Platform::current()->yieldCurrentThread(); Platform::current()->yieldCurrentThread();
} }
...@@ -364,6 +364,34 @@ TEST_P(ScriptStreamingTest, EncodingChanges) ...@@ -364,6 +364,34 @@ TEST_P(ScriptStreamingTest, EncodingChanges)
EXPECT_FALSE(tryCatch.HasCaught()); EXPECT_FALSE(tryCatch.HasCaught());
} }
TEST_P(ScriptStreamingTest, EncodingFromBOM)
{
// Byte order marks should be removed before giving the data to V8. They
// will also affect encoding detection.
m_resource->setEncoding("windows-1252"); // This encoding is wrong on purpose.
ScriptStreamer::startStreaming(pendingScript(), m_settings.get(), m_scope.scriptState(), PendingScript::ParsingBlocking);
TestScriptResourceClient client;
pendingScript().watchForLoad(&client);
// \xef\xbb\xbf is the UTF-8 byte order mark. \xec\x92\x81 are the raw bytes
// for \uc481.
appendData("\xef\xbb\xbf function foo() { var foob\xec\x92\x81r = 13; return foob\xec\x92\x81r; } foo();");
finish();
processTasksUntilStreamingComplete();
EXPECT_TRUE(client.finished());
bool errorOccurred = false;
ScriptSourceCode sourceCode = pendingScript().getSource(KURL(), errorOccurred);
EXPECT_FALSE(errorOccurred);
EXPECT_TRUE(sourceCode.streamer());
v8::TryCatch tryCatch;
v8::Handle<v8::Script> script = V8ScriptRunner::compileScript(sourceCode, isolate());
EXPECT_FALSE(script.IsEmpty());
EXPECT_FALSE(tryCatch.HasCaught());
}
INSTANTIATE_TEST_CASE_P(ScriptStreamingInstantiation, ScriptStreamingTest, ::testing::Values(false, true)); INSTANTIATE_TEST_CASE_P(ScriptStreamingInstantiation, ScriptStreamingTest, ::testing::Values(false, true));
} // namespace } // namespace
......
...@@ -127,9 +127,7 @@ v8::Local<v8::Script> V8ScriptRunner::compileScript(v8::Handle<v8::String> code, ...@@ -127,9 +127,7 @@ v8::Local<v8::Script> V8ScriptRunner::compileScript(v8::Handle<v8::String> code,
const v8::ScriptCompiler::CachedData* newCachedData = streamer->source()->GetCachedData(); const v8::ScriptCompiler::CachedData* newCachedData = streamer->source()->GetCachedData();
if (newCachedData) { if (newCachedData) {
resource->clearCachedMetadata(); resource->clearCachedMetadata();
// Temporarily disable cached metadata for streaming; it's resource->setCachedMetadata(streamer->cachedDataType(), reinterpret_cast<const char*>(newCachedData->data), newCachedData->length, Resource::CacheLocally);
// producing broken data. FIXME: enable it again.
// resource->setCachedMetadata(streamer->cachedDataType(), reinterpret_cast<const char*>(newCachedData->data), newCachedData->length);
} }
} else if (!resource || !resource->url().protocolIsInHTTPFamily() || code->Length() < 1024) { } else if (!resource || !resource->url().protocolIsInHTTPFamily() || code->Length() < 1024) {
v8::ScriptCompiler::Source source(code, origin); v8::ScriptCompiler::Source source(code, origin);
......
...@@ -68,6 +68,7 @@ public: ...@@ -68,6 +68,7 @@ public:
void useLenientXMLDecoding() { m_useLenientXMLDecoding = true; } void useLenientXMLDecoding() { m_useLenientXMLDecoding = true; }
bool sawError() const { return m_sawError; } bool sawError() const { return m_sawError; }
size_t checkForBOM(const char*, size_t);
private: private:
TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& defaultEncoding, bool usesEncodingDetector); TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& defaultEncoding, bool usesEncodingDetector);
...@@ -76,7 +77,6 @@ private: ...@@ -76,7 +77,6 @@ private:
static ContentType determineContentType(const String& mimeType); static ContentType determineContentType(const String& mimeType);
static const WTF::TextEncoding& defaultEncoding(ContentType, const WTF::TextEncoding& defaultEncoding); static const WTF::TextEncoding& defaultEncoding(ContentType, const WTF::TextEncoding& defaultEncoding);
size_t checkForBOM(const char*, size_t);
bool checkForCSSCharset(const char*, size_t, bool& movedDataToBuffer); bool checkForCSSCharset(const char*, size_t, bool& movedDataToBuffer);
bool checkForXMLCharset(const char*, size_t, bool& movedDataToBuffer); bool checkForXMLCharset(const char*, size_t, bool& movedDataToBuffer);
void checkForMetaCharset(const char*, size_t); void checkForMetaCharset(const char*, size_t);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment