Commit 6f500092 authored by Mikhail Naganov's avatar Mikhail Naganov

Implement fast path in UTF8ToUTF16 for pure ASCII strings

Blink's WebString::fromUTF8 is marginally faster than UTF8ToUTF16 on
pure ASCII input because it has a fast path for this case. Implement
the same thing for base::UTF8ToUTF16.

DoIsStringASCII is now a port of charactersAreAllASCII from Blink.

BUG=391492
R=brettw@chromium.org, torne@chromium.org

Review URL: https://codereview.chromium.org/543043002

Cr-Commit-Position: refs/heads/master@{#297158}
parent d0fa7a78
...@@ -64,6 +64,34 @@ static bool CompareParameter(const ReplacementOffset& elem1, ...@@ -64,6 +64,34 @@ static bool CompareParameter(const ReplacementOffset& elem1,
return elem1.parameter < elem2.parameter; return elem1.parameter < elem2.parameter;
} }
// A pointer is assumed to be exactly one "machine word" wide, which makes
// uintptr_t an integer type of machine-word size as well.
typedef uintptr_t MachineWord;

// Low-order address bits that must all be clear for a word-aligned pointer.
const uintptr_t kMachineWordAlignmentMask = sizeof(MachineWord) - 1;

// Returns true when |pointer| lies on a machine-word boundary.
inline bool IsAlignedToMachineWord(const void* pointer) {
  const MachineWord address = reinterpret_cast<MachineWord>(pointer);
  return (address & kMachineWordAlignmentMask) == 0;
}

// Rounds |pointer| down to the closest machine-word boundary at or below it.
template <typename T>
inline T* AlignToMachineWord(T* pointer) {
  const MachineWord address = reinterpret_cast<MachineWord>(pointer);
  const MachineWord aligned_address = address & ~kMachineWordAlignmentMask;
  return reinterpret_cast<T*>(aligned_address);
}
// Maps a (machine word size in bytes, character type) pair to a word-sized
// bit mask; DoIsStringASCII-style callers AND it with accumulated character
// bits to detect anything outside ASCII. Only the four specializations below
// exist; any other combination fails to compile.
template <size_t word_size, typename CharacterType>
struct NonASCIIMask;

// 4-byte word of |char|: 0x80 repeated in every byte.
template <>
struct NonASCIIMask<4, char> {
  static inline uint32_t value() {
    return 0x80808080U;
  }
};

// 4-byte word of |base::char16|: 0xFF80 repeated per 16 bits
// (presumably one repetition per character; confirm char16 is 16 bits wide).
template <>
struct NonASCIIMask<4, base::char16> {
  static inline uint32_t value() {
    return 0xFF80FF80U;
  }
};

// 8-byte word of |char|: 0x80 repeated in every byte.
template <>
struct NonASCIIMask<8, char> {
  static inline uint64_t value() {
    return 0x8080808080808080ULL;
  }
};

// 8-byte word of |base::char16|: 0xFF80 repeated per 16 bits.
template <>
struct NonASCIIMask<8, base::char16> {
  static inline uint64_t value() {
    return 0xFF80FF80FF80FF80ULL;
  }
};
} // namespace } // namespace
namespace base { namespace base {
...@@ -322,22 +350,46 @@ bool ContainsOnlyChars(const StringPiece16& input, ...@@ -322,22 +350,46 @@ bool ContainsOnlyChars(const StringPiece16& input,
return input.find_first_not_of(characters) == StringPiece16::npos; return input.find_first_not_of(characters) == StringPiece16::npos;
} }
// NOTE(review): each line of this hunk appears to hold the removed text
// followed by the added text (side-by-side diff rendering); verify against the
// commit before editing.
// Added version: DoIsStringASCII(characters, length) ORs every character of
// the input into all_char_bits and tests the accumulated bits against the
// non-ASCII mask once, after the whole input has been consumed. It therefore
// has no early return for non-ASCII input.
template<class STR> template <class Char>
static bool DoIsStringASCII(const STR& str) { inline bool DoIsStringASCII(const Char* characters, size_t length) {
for (size_t i = 0; i < str.length(); i++) { MachineWord all_char_bits = 0;
typename ToUnsigned<typename STR::value_type>::Unsigned c = str[i]; const Char* end = characters + length;
if (c > 0x7F)
return false; // Prologue: align the input.
// Consume single characters until |characters| is machine-word aligned or
// reaches |end|, whichever happens first.
while (!IsAlignedToMachineWord(characters) && characters != end) {
all_char_bits |= *characters;
++characters;
} }
return true;
// Compare the values of CPU word size.
// |word_end| is |end| rounded down to a word boundary; each iteration loads
// one MachineWord, i.e. sizeof(MachineWord) / sizeof(Char) characters.
const Char* word_end = AlignToMachineWord(end);
const size_t loop_increment = sizeof(MachineWord) / sizeof(Char);
while (characters < word_end) {
all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters));
characters += loop_increment;
}
// Process the remaining bytes.
while (characters != end) {
all_char_bits |= *characters;
++characters;
}
// The mask is selected by the machine word size and the character type; the
// input is ASCII only if none of the masked bits were ever set.
MachineWord non_ascii_bit_mask =
NonASCIIMask<sizeof(MachineWord), Char>::value();
return !(all_char_bits & non_ascii_bit_mask);
} }
// IsStringASCII overloads for StringPiece, StringPiece16 and string16.
// NOTE(review): lines here appear to carry removed text followed by added
// text (side-by-side diff rendering). In the added text every overload
// forwards str.data() and str.length() to DoIsStringASCII; the StringPiece16
// overload exists only in the added text.
bool IsStringASCII(const StringPiece& str) { bool IsStringASCII(const StringPiece& str) {
return DoIsStringASCII(str); return DoIsStringASCII(str.data(), str.length());
}
bool IsStringASCII(const StringPiece16& str) {
return DoIsStringASCII(str.data(), str.length());
} }
bool IsStringASCII(const string16& str) { bool IsStringASCII(const string16& str) {
return DoIsStringASCII(str); return DoIsStringASCII(str.data(), str.length());
} }
bool IsStringUTF8(const std::string& str) { bool IsStringUTF8(const std::string& str) {
......
...@@ -245,8 +245,14 @@ BASE_EXPORT bool ContainsOnlyChars(const StringPiece16& input, ...@@ -245,8 +245,14 @@ BASE_EXPORT bool ContainsOnlyChars(const StringPiece16& input,
// to have the maximum 'discriminating' power from other encodings. If // to have the maximum 'discriminating' power from other encodings. If
// there's a use case for just checking the structural validity, we have to // there's a use case for just checking the structural validity, we have to
// add a new function for that. // add a new function for that.
//
// IsStringASCII assumes the input is likely all ASCII, and does not leave early
// if it is not the case.
BASE_EXPORT bool IsStringUTF8(const std::string& str); BASE_EXPORT bool IsStringUTF8(const std::string& str);
BASE_EXPORT bool IsStringASCII(const StringPiece& str); BASE_EXPORT bool IsStringASCII(const StringPiece& str);
BASE_EXPORT bool IsStringASCII(const StringPiece16& str);
// A convenience adaptor for WebStrings, as they don't convert into
// StringPieces directly.
BASE_EXPORT bool IsStringASCII(const string16& str); BASE_EXPORT bool IsStringASCII(const string16& str);
// Converts the elements of the given string. This version uses a pointer to // Converts the elements of the given string. This version uses a pointer to
......
...@@ -386,6 +386,55 @@ TEST(StringUtilTest, IsStringUTF8) { ...@@ -386,6 +386,55 @@ TEST(StringUtilTest, IsStringUTF8) {
EXPECT_FALSE(IsStringUTF8("embedded\xc0\x80U+0000")); EXPECT_FALSE(IsStringUTF8("embedded\xc0\x80U+0000"));
} }
// Checks IsStringASCII on 8-bit and 16-bit input. Fragments are taken at a
// range of start offsets and lengths so that differently aligned and sized
// inputs are covered, and every character of every fragment is made non-ASCII
// in turn to confirm that it is detected wherever it sits.
TEST(StringUtilTest, IsStringASCII) {
  static char char_ascii[] =
      "0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF";
  static char16 char16_ascii[] = {
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'A',
      'B', 'C', 'D', 'E', 'F', '0', '1', '2', '3', '4', '5', '6',
      '7', '8', '9', '0', 'A', 'B', 'C', 'D', 'E', 'F', 0 };

  // Test a variety of the fragment start positions and lengths in order to make
  // sure that bit masking in IsStringASCII works correctly.
  // Also, test that a non-ASCII character will be detected regardless of its
  // position inside the string.
  {
    const size_t string_length = arraysize(char_ascii) - 1;
    for (size_t offset = 0; offset < 8; ++offset) {
      for (size_t len = 0, max_len = string_length - offset; len < max_len;
           ++len) {
        EXPECT_TRUE(IsStringASCII(StringPiece(char_ascii + offset, len)));
        // |char_pos| is an index into the backing array, so the fragment under
        // test occupies [offset, offset + len). Bounding the loop by |len|
        // alone would leave the last |offset| characters of the fragment
        // unchecked.
        for (size_t char_pos = offset; char_pos < offset + len; ++char_pos) {
          char_ascii[char_pos] |= '\x80';
          EXPECT_FALSE(IsStringASCII(StringPiece(char_ascii + offset, len)));
          // Restore the character so later iterations see pure ASCII again.
          char_ascii[char_pos] &= ~'\x80';
        }
      }
    }
  }

  {
    const size_t string_length = arraysize(char16_ascii) - 1;
    for (size_t offset = 0; offset < 4; ++offset) {
      for (size_t len = 0, max_len = string_length - offset; len < max_len;
           ++len) {
        EXPECT_TRUE(IsStringASCII(StringPiece16(char16_ascii + offset, len)));
        // Same indexing as above: cover the whole fragment
        // [offset, offset + len).
        for (size_t char_pos = offset; char_pos < offset + len; ++char_pos) {
          char16_ascii[char_pos] |= 0x80;
          EXPECT_FALSE(
              IsStringASCII(StringPiece16(char16_ascii + offset, len)));
          char16_ascii[char_pos] &= ~0x80;
          // Also test when the upper half is non-zero.
          char16_ascii[char_pos] |= 0x100;
          EXPECT_FALSE(
              IsStringASCII(StringPiece16(char16_ascii + offset, len)));
          char16_ascii[char_pos] &= ~0x100;
        }
      }
    }
  }
}
TEST(StringUtilTest, ConvertASCII) { TEST(StringUtilTest, ConvertASCII) {
static const char* char_cases[] = { static const char* char_cases[] = {
"Google Video", "Google Video",
......
...@@ -56,13 +56,23 @@ std::string WideToUTF8(const std::wstring& wide) { ...@@ -56,13 +56,23 @@ std::string WideToUTF8(const std::wstring& wide) {
} }
// Converts |src_len| bytes at |src| into |output| and returns a success flag.
// NOTE(review): lines here appear to carry removed text followed by added
// text (side-by-side diff rendering). The added text inserts a fast path:
// when IsStringASCII accepts the input it is copied into |output| with
// assign() and true is returned without calling ConvertUnicode.
bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
if (IsStringASCII(StringPiece(src, src_len))) {
output->assign(src, src + src_len);
return true;
} else {
PrepareForUTF16Or32Output(src, src_len, output); PrepareForUTF16Or32Output(src, src_len, output);
return ConvertUnicode(src, src_len, output); return ConvertUnicode(src, src_len, output);
}
} }
// Returns |utf8| converted to a std::wstring.
// NOTE(review): lines here appear to carry removed text followed by added
// text (side-by-side diff rendering). In the added text, input accepted by
// IsStringASCII is returned as a wstring built directly from the byte range;
// otherwise PrepareForUTF16Or32Output and ConvertUnicode are called directly
// (the removed text delegated to the pointer overload of UTF8ToWide instead).
// The result of ConvertUnicode is not inspected here.
std::wstring UTF8ToWide(const StringPiece& utf8) { std::wstring UTF8ToWide(const StringPiece& utf8) {
if (IsStringASCII(utf8)) {
return std::wstring(utf8.begin(), utf8.end());
}
std::wstring ret; std::wstring ret;
UTF8ToWide(utf8.data(), utf8.length(), &ret); PrepareForUTF16Or32Output(utf8.data(), utf8.length(), &ret);
ConvertUnicode(utf8.data(), utf8.length(), &ret);
return ret; return ret;
} }
...@@ -126,15 +136,25 @@ std::wstring UTF16ToWide(const string16& utf16) { ...@@ -126,15 +136,25 @@ std::wstring UTF16ToWide(const string16& utf16) {
#if defined(WCHAR_T_IS_UTF32) #if defined(WCHAR_T_IS_UTF32)
// Converts |src_len| bytes at |src| into |output| and returns a success flag.
// NOTE(review): lines here appear to carry removed text followed by added
// text (side-by-side diff rendering). The added text inserts a fast path:
// when IsStringASCII accepts the input it is copied into |output| with
// assign() and true is returned without calling ConvertUnicode.
bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
if (IsStringASCII(StringPiece(src, src_len))) {
output->assign(src, src + src_len);
return true;
} else {
PrepareForUTF16Or32Output(src, src_len, output); PrepareForUTF16Or32Output(src, src_len, output);
return ConvertUnicode(src, src_len, output); return ConvertUnicode(src, src_len, output);
}
} }
// Returns |utf8| converted to a string16.
// NOTE(review): lines here appear to carry removed text followed by added
// text (side-by-side diff rendering). In the added text, input accepted by
// IsStringASCII is returned as a string16 built directly from the byte range;
// otherwise PrepareForUTF16Or32Output and ConvertUnicode are called directly
// (the removed text delegated to the pointer overload of UTF8ToUTF16 instead).
string16 UTF8ToUTF16(const StringPiece& utf8) { string16 UTF8ToUTF16(const StringPiece& utf8) {
if (IsStringASCII(utf8)) {
return string16(utf8.begin(), utf8.end());
}
string16 ret; string16 ret;
PrepareForUTF16Or32Output(utf8.data(), utf8.length(), &ret);
// Ignore the success flag of this call, it will do the best it can for // Ignore the success flag of this call, it will do the best it can for
// invalid input, which is what we want here. // invalid input, which is what we want here.
UTF8ToUTF16(utf8.data(), utf8.length(), &ret); ConvertUnicode(utf8.data(), utf8.length(), &ret);
return ret; return ret;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment