Commit 10eb9f5e authored by ricea's avatar ricea Committed by Commit bot

Mime sniffer: reduce table from 256 bytes to 4

The lookup table for sniffing text/plain messages for binary characters
was 256 bytes in size. The information it encoded could be stored in 32
bits.

Use a 32-bit lookup table instead of the 256-byte lookup table. This
reduces binary size.

This change is mostly performance neutral; micro-benchmarks show
no statistically significant change on Android. Micro-benchmarks on
high-performance Intel processors show an additional cost of around
400ns, apparently due to branch mis-prediction.

However, real-world performance is likely to be dominated by the time taken to
load the table into CPU cache, and on that measure the new implementation
should win easily.

Also add a micro-benchmark test.

BUG=
TEST=net_unittests

Review URL: https://codereview.chromium.org/1058003005

Cr-Commit-Position: refs/heads/master@{#330577}
parent 41b04f38
...@@ -1589,6 +1589,7 @@ if (!is_android && !is_mac) { ...@@ -1589,6 +1589,7 @@ if (!is_android && !is_mac) {
executable("net_perftests") { executable("net_perftests") {
testonly = true testonly = true
sources = [ sources = [
"base/mime_sniffer_perftest.cc",
"cookies/cookie_monster_perftest.cc", "cookies/cookie_monster_perftest.cc",
"disk_cache/blockfile/disk_cache_perftest.cc", "disk_cache/blockfile/disk_cache_perftest.cc",
"extras/sqlite/sqlite_persistent_cookie_store_perftest.cc", "extras/sqlite/sqlite_persistent_cookie_store_perftest.cc",
......
...@@ -647,27 +647,6 @@ static const MagicNumber kByteOrderMark[] = { ...@@ -647,27 +647,6 @@ static const MagicNumber kByteOrderMark[] = {
MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF") // UTF-8 MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF") // UTF-8
}; };
// Whether a given byte looks like it might be part of binary content, indexed
// by the unsigned value of the byte (1 = looks binary, 0 = plausible text).
// Only control codes below 0x20 are ever flagged; TAB (0x09), LF (0x0A),
// FF (0x0C), CR (0x0D) and ESC (0x1B) are exempt.
// The table is only ever read, so it is const, and it is explicitly sized to
// 256 entries so that every possible unsigned char index stays in bounds.
// Source: HTML5 spec
static const char kByteLooksBinary[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,  // 0x00 - 0x0F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,  // 0x10 - 0x1F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20 - 0x2F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30 - 0x3F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40 - 0x4F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50 - 0x5F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60 - 0x6F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70 - 0x7F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80 - 0x8F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90 - 0x9F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xA0 - 0xAF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xB0 - 0xBF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xC0 - 0xCF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xD0 - 0xDF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xE0 - 0xEF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0 - 0xFF
};
// Returns true and sets result to "application/octet-stream" if the content // Returns true and sets result to "application/octet-stream" if the content
// appears to be binary data. Otherwise, returns false and sets "text/plain". // appears to be binary data. Otherwise, returns false and sets "text/plain".
// Clears have_enough_content if more data could possibly change the result. // Clears have_enough_content if more data could possibly change the result.
...@@ -700,12 +679,9 @@ static bool SniffBinary(const char* content, ...@@ -700,12 +679,9 @@ static bool SniffBinary(const char* content,
} }
// Next we look to see if any of the bytes "look binary." // Next we look to see if any of the bytes "look binary."
for (size_t i = 0; i < size; ++i) { if (LooksLikeBinary(content, size)) {
// If we a see a binary-looking byte, we think the content is binary. result->assign("application/octet-stream");
if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) { return true;
result->assign("application/octet-stream");
return true;
}
} }
// No evidence either way. Default to non-binary and, if truncated, clear // No evidence either way. Default to non-binary and, if truncated, clear
...@@ -965,4 +941,22 @@ bool SniffMimeTypeFromLocalData(const char* content, ...@@ -965,4 +941,22 @@ bool SniffMimeTypeFromLocalData(const char* content,
arraysize(kMagicNumbers), NULL, result); arraysize(kMagicNumbers), NULL, result);
} }
bool LooksLikeBinary(const char* content, size_t size) {
// The definition of "binary bytes" is from the spec at
// https://mimesniff.spec.whatwg.org/#binary-data-byte
//
// The bytes which are considered to be "binary" are all < 0x20. Encode them
// one bit per byte, with 1 for a "binary" bit, and 0 for a "text" bit. The
// least-significant bit represents byte 0x00, the most-significant bit
// represents byte 0x1F.
const uint32 kBinaryBits =
~(1u << '\t' | 1u << '\n' | 1u << '\r' | 1u << '\f' | 1u << '\x1b');
for (size_t i = 0; i < size; ++i) {
uint8 byte = static_cast<uint8>(content[i]);
if (byte < 0x20 && (kBinaryBits & (1u << byte)))
return true;
}
return false;
}
} // namespace net } // namespace net
...@@ -57,6 +57,13 @@ NET_EXPORT bool SniffMimeTypeFromLocalData(const char* content, ...@@ -57,6 +57,13 @@ NET_EXPORT bool SniffMimeTypeFromLocalData(const char* content,
size_t content_size, size_t content_size,
std::string* result); std::string* result);
// Returns true if any of the first |size| bytes of |content| is a control code
// that does not usually appear in plain text, i.e. a "binary data byte" as
// defined by https://mimesniff.spec.whatwg.org/#binary-data-byte. TAB, LF, FF,
// CR and ESC count as text, as does every byte >= 0x20.
// @param content A buffer containing bytes that may be binary.
// @param size The number of bytes in the |content| buffer.
// @return true if at least one byte looks binary; false otherwise, including
//     when |size| is 0.
NET_EXPORT_PRIVATE bool LooksLikeBinary(const char* content, size_t size);
} // namespace net } // namespace net
#endif // NET_BASE_MIME_SNIFFER_H__ #endif // NET_BASE_MIME_SNIFFER_H__
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "net/base/mime_sniffer.h"
#include <vector>
#include "base/bits.h"
#include "base/logging.h"
#include "base/timer/elapsed_timer.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace net {
namespace {
// This text is supposed to be representative of a plain text file the browser
// might encounter, including a variation in line lengths and blank
// lines. CRLF is used as the line-terminator to make it slightly more
// difficult: CR and LF are both control codes below 0x20, so every line ending
// makes LooksLikeBinary() test two bytes against its bitmask instead of
// skipping them as ordinary text. It is roughly 1KB.
const char kRepresentativePlainText[] =
"The Tragedie of Hamlet\r\n"
"\r\n"
"Actus Primus. Scoena Prima.\r\n"
"\r\n"
"Enter Barnardo and Francisco two Centinels.\r\n"
"\r\n"
" Barnardo. Who's there?\r\n"
" Fran. Nay answer me: Stand & vnfold\r\n"
"your selfe\r\n"
"\r\n"
" Bar. Long liue the King\r\n"
"\r\n"
" Fran. Barnardo?\r\n"
" Bar. He\r\n"
"\r\n"
" Fran. You come most carefully vpon your houre\r\n"
"\r\n"
" Bar. 'Tis now strook twelue, get thee to bed Francisco\r\n"
"\r\n"
" Fran. For this releefe much thankes: 'Tis bitter cold,\r\n"
"And I am sicke at heart\r\n"
"\r\n"
" Barn. Haue you had quiet Guard?\r\n"
" Fran. Not a Mouse stirring\r\n"
"\r\n"
" Barn. Well, goodnight. If you do meet Horatio and\r\n"
"Marcellus, the Riuals of my Watch, bid them make hast.\r\n"
"Enter Horatio and Marcellus.\r\n"
"\r\n"
" Fran. I thinke I heare them. Stand: who's there?\r\n"
" Hor. Friends to this ground\r\n"
"\r\n"
" Mar. And Leige-men to the Dane\r\n"
"\r\n"
" Fran. Giue you good night\r\n"
"\r\n"
" Mar. O farwel honest Soldier, who hath relieu'd you?\r\n"
" Fra. Barnardo ha's my place: giue you goodnight.\r\n"
"\r\n"
"Exit Fran.\r\n"
"\r\n"
" Mar. Holla Barnardo\r\n"
"\r\n"
" Bar. Say, what is Horatio there?\r\n"
" Hor. A peece of him\r\n"
"\r\n"
" Bar. Welcome Horatio, welcome good Marcellus\r\n"
"\r\n";
// Scans |plaintext| with LooksLikeBinary() |iterations| times, always running
// every pass, and CHECKs afterwards that no pass reported binary content.
void RunLooksLikeBinary(const std::string& plaintext, size_t iterations) {
  // The local keeps this exact name because CHECK() prints the expression text
  // when it fails.
  bool looks_like_binary = false;
  for (size_t remaining = iterations; remaining > 0; --remaining)
    looks_like_binary |= LooksLikeBinary(plaintext.data(), plaintext.size());
  CHECK(!looks_like_binary);
}
// Micro-benchmark: times LooksLikeBinary() over a large plain-text buffer and
// logs the cost in nanoseconds per KB scanned.
TEST(MimeSnifferTest, PlainTextPerfTest) {
  // A short untimed warm-up precedes the timed run.
  const size_t kWarmupIterations = 16;
  const size_t kMeasuredIterations = 1 << 15;
  // 256KB. Android systems have a relatively small CPU cache (512KB to 2MB),
  // and the test data should fit in cache so that the benchmark is not merely
  // measuring bus bandwidth.
  const size_t kTargetSize = 1 << 18;

  std::string plaintext(kRepresentativePlainText);

  // The buffer is grown by repeated doubling, so its final length is the
  // sample length times a power of two. Working that length out up front lets
  // the string be reserved once and lets the result be sanity-checked below.
  // The static_cast<size_t>() keeps MSVC from complaining about an implicit
  // promotion to 64 bits when compiling 64-bit.
  const size_t growth_factor = static_cast<size_t>(
      1u << base::bits::Log2Ceiling(kTargetSize / plaintext.size()));
  size_t expected_size = plaintext.size() * growth_factor;
  plaintext.reserve(expected_size);
  while (plaintext.size() < kTargetSize)
    plaintext.append(plaintext);
  DCHECK_EQ(expected_size, plaintext.size());

  // Untimed passes first, then the measured passes.
  RunLooksLikeBinary(plaintext, kWarmupIterations);
  base::ElapsedTimer elapsed_timer;
  RunLooksLikeBinary(plaintext, kMeasuredIterations);

  // microseconds * 1000 gives nanoseconds; bytes / 1024 gives KB.
  LOG(INFO) << (elapsed_timer.Elapsed().InMicroseconds() * 1000 * 1024 /
                (static_cast<int64>(plaintext.size()) * kMeasuredIterations))
            << "ns per KB";
}
} // namespace
} // namespace net
...@@ -8,6 +8,11 @@ ...@@ -8,6 +8,11 @@
#include "url/gurl.h" #include "url/gurl.h"
namespace net { namespace net {
namespace {
using ::testing::Range;
using ::testing::Values;
using ::net::SniffMimeType; // It is shadowed by SniffMimeType(), below.
struct SnifferTest { struct SnifferTest {
const char* content; const char* content;
...@@ -484,4 +489,66 @@ TEST(MimeSnifferTest, AudioVideoTest) { ...@@ -484,4 +489,66 @@ TEST(MimeSnifferTest, AudioVideoTest) {
mime_type.clear(); mime_type.clear();
} }
// Fixture for tests parameterized on a single byte value that LooksLikeBinary()
// is expected to classify as binary.
// The tests need char parameters, but the ranges to test include 0xFF, and some
// platforms have signed chars and are noisy about it. Using an int parameter
// and casting it to char inside the test case solves both these problems.
class MimeSnifferBinaryTest : public ::testing::TestWithParam<int> {};
// From https://mimesniff.spec.whatwg.org/#binary-data-byte :
// A binary data byte is a byte in the range 0x00 to 0x08 (NUL to BS), the byte
// 0x0B (VT), a byte in the range 0x0E to 0x1A (SO to SUB), or a byte in the
// range 0x1C to 0x1F (FS to US).
TEST_P(MimeSnifferBinaryTest, IsBinaryControlCode) {
// Build a one-byte buffer holding only the value under test.
char param = static_cast<char>(GetParam());
EXPECT_TRUE(LooksLikeBinary(&param, 1));
}
// ::testing::Range(a, b) generates the half-open range [a, b), i.e. "b" is not
// included. Each upper bound below is therefore one past the last byte tested.
// Covers 0x00 to 0x08.
INSTANTIATE_TEST_CASE_P(MimeSnifferBinaryTestRange1,
MimeSnifferBinaryTest,
Range(0x00, 0x09));
// Covers 0x0B only; its neighbours 0x0A and 0x0C are text control codes.
INSTANTIATE_TEST_CASE_P(MimeSnifferBinaryTestByte0x0B,
MimeSnifferBinaryTest,
Values(0x0B));
// Covers 0x0E to 0x1A.
INSTANTIATE_TEST_CASE_P(MimeSnifferBinaryTestRange2,
MimeSnifferBinaryTest,
Range(0x0E, 0x1B));
// Covers 0x1C to 0x1F.
INSTANTIATE_TEST_CASE_P(MimeSnifferBinaryTestRange3,
MimeSnifferBinaryTest,
Range(0x1C, 0x20));
// Fixture for tests parameterized on a single byte value that LooksLikeBinary()
// is expected to classify as text. The parameter is an int that is cast to
// char inside the test, so that values up to 0xFF can be supplied.
class MimeSnifferPlainTextTest : public ::testing::TestWithParam<int> {};
TEST_P(MimeSnifferPlainTextTest, NotBinaryControlCode) {
// Build a one-byte buffer holding only the value under test.
char param = static_cast<char>(GetParam());
EXPECT_FALSE(LooksLikeBinary(&param, 1));
}
// The five control codes below 0x20 that are not treated as binary.
INSTANTIATE_TEST_CASE_P(MimeSnifferPlainTextTestPlainTextControlCodes,
MimeSnifferPlainTextTest,
Values(0x09, 0x0A, 0x0C, 0x0D, 0x1B));
// Every byte from 0x20 to 0xFF; the upper bound 0x100 is excluded by Range().
INSTANTIATE_TEST_CASE_P(MimeSnifferPlainTextTestNotControlCodeRange,
MimeSnifferPlainTextTest,
Range(0x20, 0x100));
// Fixture for tests parameterized on a NUL-terminated string in which one
// byte is binary; the cases differ in where that byte sits in the buffer.
class MimeSnifferControlCodesEdgeCaseTest
: public ::testing::TestWithParam<const char*> {};
TEST_P(MimeSnifferControlCodesEdgeCaseTest, EdgeCase) {
const char* param = GetParam();
// The length comes from strlen(), so parameters must not contain an embedded
// 0x00 byte.
EXPECT_TRUE(LooksLikeBinary(param, strlen(param)));
}
INSTANTIATE_TEST_CASE_P(MimeSnifferControlCodesEdgeCaseTest,
MimeSnifferControlCodesEdgeCaseTest,
Values("\x01__", // first byte is binary
"__\x03", // last byte is binary
"_\x02_" // a byte in the middle is binary
));
} // namespace
} // namespace net } // namespace net
...@@ -455,6 +455,7 @@ ...@@ -455,6 +455,7 @@
'net_test_support', 'net_test_support',
], ],
'sources': [ 'sources': [
'base/mime_sniffer_perftest.cc',
'cookies/cookie_monster_perftest.cc', 'cookies/cookie_monster_perftest.cc',
'disk_cache/blockfile/disk_cache_perftest.cc', 'disk_cache/blockfile/disk_cache_perftest.cc',
'extras/sqlite/sqlite_persistent_cookie_store_perftest.cc', 'extras/sqlite/sqlite_persistent_cookie_store_perftest.cc',
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment