Implement fast path in UTF8ToUTF16 for pure ASCII strings

Blink's WebString::fromUTF8 is marginally faster than UTF8ToUTF16 on pure ASCII input because it has a fast path for this case. Implement the same thing for base::UTF8ToUTF16. DoIsStringASCII is now a port of charactersAreAllASCII from Blink. BUG=391492 R=brettw@chromium.org, torne@chromium.org Review URL: https://codereview.chromium.org/543043002 Cr-Commit-Position: refs/heads/master@{#297158}

Implement fast path in UTF8ToUTF16 for pure ASCII strings
Blink's WebString::fromUTF8 is marginally faster than UTF8ToUTF16 on pure ASCII input because it has a fast path for this case. Implement the same thing for base::UTF8ToUTF16. DoIsStringASCII is now a port of charactersAreAllASCII from Blink. BUG=391492 R=brettw@chromium.org, torne@chromium.org Review URL: https://codereview.chromium.org/543043002 Cr-Commit-Position: refs/heads/master@{#297158}
6f500092 · Mikhail Naganov · d0fa7a78 · 6f500092 · 6f500092 · 6f500092
Commit 6f500092 authored Sep 29, 2014 by Mikhail Naganov
4 changed files
--- a/base/strings/string_util.cc
+++ b/base/strings/string_util.cc
@@ -64,6 +64,34 @@ static bool CompareParameter(const ReplacementOffset& elem1,
  return elem1.parameter < elem2.parameter;
 }
+// Assuming that a pointer is the size of a "machine word", then
+// uintptr_t is an integer type that is also a machine word.
+typedef uintptr_t MachineWord;
+const uintptr_t kMachineWordAlignmentMask = sizeof(MachineWord) - 1;
+inline bool IsAlignedToMachineWord(const void* pointer) {
+  return !(reinterpret_cast<MachineWord>(pointer) & kMachineWordAlignmentMask);
+}
+template<typename T> inline T* AlignToMachineWord(T* pointer) {
+  return reinterpret_cast<T*>(reinterpret_cast<MachineWord>(pointer) &
+                              ~kMachineWordAlignmentMask);
+}
+template<size_t size, typename CharacterType> struct NonASCIIMask;
+template<> struct NonASCIIMask<4, base::char16> {
+    static inline uint32_t value() { return 0xFF80FF80U; }
+};
+template<> struct NonASCIIMask<4, char> {
+    static inline uint32_t value() { return 0x80808080U; }
+};
+template<> struct NonASCIIMask<8, base::char16> {
+    static inline uint64_t value() { return 0xFF80FF80FF80FF80ULL; }
+};
+template<> struct NonASCIIMask<8, char> {
+    static inline uint64_t value() { return 0x8080808080808080ULL; }
+};
 }  // namespace
 namespace base {
@@ -322,22 +350,46 @@ bool ContainsOnlyChars(const StringPiece16& input,
  return input.find_first_not_of(characters) == StringPiece16::npos;
 }
-template<class STR>
+template <class Char>
-static bool DoIsStringASCII(const STR& str) {
+inline bool DoIsStringASCII(const Char* characters, size_t length) {
-  for (size_t i = 0; i < str.length(); i++) {
+  MachineWord all_char_bits = 0;
-    typename ToUnsigned<typename STR::value_type>::Unsigned c = str[i];
+  const Char* end = characters + length;
-    if (c > 0x7F)
-      return false;
+  // Prologue: align the input.
+  while (!IsAlignedToMachineWord(characters) && characters != end) {
+    all_char_bits |= *characters;
+    ++characters;
  }
-  return true;
+  // Main loop: OR in the aligned middle section one machine word at a time.
+  const Char* word_end = AlignToMachineWord(end);
+  const size_t loop_increment = sizeof(MachineWord) / sizeof(Char);
+  while (characters < word_end) {
+    all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters));
+    characters += loop_increment;
+  }
+  // Epilogue: process the remaining characters that do not fill a whole word.
+  while (characters != end) {
+    all_char_bits |= *characters;
+    ++characters;
+  }
+  MachineWord non_ascii_bit_mask =
+      NonASCIIMask<sizeof(MachineWord), Char>::value();
+  return !(all_char_bits & non_ascii_bit_mask);
 }
 bool IsStringASCII(const StringPiece& str) {
-  return DoIsStringASCII(str);
+  return DoIsStringASCII(str.data(), str.length());
+}
+bool IsStringASCII(const StringPiece16& str) {
+  return DoIsStringASCII(str.data(), str.length());
 }
 bool IsStringASCII(const string16& str) {
-  return DoIsStringASCII(str);
+  return DoIsStringASCII(str.data(), str.length());
 }
 bool IsStringUTF8(const std::string& str) {

--- a/base/strings/string_util.h
+++ b/base/strings/string_util.h
@@ -245,8 +245,14 @@ BASE_EXPORT bool ContainsOnlyChars(const StringPiece16& input,
 // to have the maximum 'discriminating' power from other encodings. If
 // there's a use case for just checking the structural validity, we have to
 // add a new function for that.
+//
+// IsStringASCII assumes the input is likely all ASCII, so it always scans the
+// whole string and does not return early when it finds a non-ASCII character.
 BASE_EXPORT bool IsStringUTF8(const std::string& str);
 BASE_EXPORT bool IsStringASCII(const StringPiece& str);
+BASE_EXPORT bool IsStringASCII(const StringPiece16& str);
+// A convenience adaptor for WebStrings, as they don't convert into
+// StringPieces directly.
 BASE_EXPORT bool IsStringASCII(const string16& str);
 // Converts the elements of the given string.  This version uses a pointer to

--- a/base/strings/string_util_unittest.cc
+++ b/base/strings/string_util_unittest.cc
@@ -386,6 +386,55 @@ TEST(StringUtilTest, IsStringUTF8) {
  EXPECT_FALSE(IsStringUTF8("embedded\xc0\x80U+0000"));
 }
+TEST(StringUtilTest, IsStringASCII) {
+  static char char_ascii[] =
+      "0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF";
+  static char16 char16_ascii[] = {
+      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'A',
+      'B', 'C', 'D', 'E', 'F', '0', '1', '2', '3', '4', '5', '6',
+      '7', '8', '9', '0', 'A', 'B', 'C', 'D', 'E', 'F', 0 };
+  // Test a variety of fragment start positions and lengths in order to make
+  // sure that bit masking in IsStringASCII works correctly.
+  // Also, test that a non-ASCII character is detected regardless of its
+  // position inside the string.
+  {
+    const size_t string_length = arraysize(char_ascii) - 1;
+    for (size_t offset = 0; offset < 8; ++offset) {
+      for (size_t len = 0, max_len = string_length - offset; len < max_len;
+           ++len) {
+        EXPECT_TRUE(IsStringASCII(StringPiece(char_ascii + offset, len)));
+        for (size_t char_pos = offset; char_pos < len; ++char_pos) {
+          char_ascii[char_pos] |= '\x80';
+          EXPECT_FALSE(IsStringASCII(StringPiece(char_ascii + offset, len)));
+          char_ascii[char_pos] &= ~'\x80';
+        }
+      }
+    }
+  }
+  {
+    const size_t string_length = arraysize(char16_ascii) - 1;
+    for (size_t offset = 0; offset < 4; ++offset) {
+      for (size_t len = 0, max_len = string_length - offset; len < max_len;
+           ++len) {
+        EXPECT_TRUE(IsStringASCII(StringPiece16(char16_ascii + offset, len)));
+        for (size_t char_pos = offset; char_pos < len; ++char_pos) {
+          char16_ascii[char_pos] |= 0x80;
+          EXPECT_FALSE(
+              IsStringASCII(StringPiece16(char16_ascii + offset, len)));
+          char16_ascii[char_pos] &= ~0x80;
+          // Also test when the upper half is non-zero.
+          char16_ascii[char_pos] |= 0x100;
+          EXPECT_FALSE(
+              IsStringASCII(StringPiece16(char16_ascii + offset, len)));
+          char16_ascii[char_pos] &= ~0x100;
+        }
+      }
+    }
+  }
+}
 TEST(StringUtilTest, ConvertASCII) {
  static const char* char_cases[] = {
    "Google Video",

--- a/base/strings/utf_string_conversions.cc
+++ b/base/strings/utf_string_conversions.cc
@@ -56,13 +56,23 @@ std::string WideToUTF8(const std::wstring& wide) {
 }
 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
+  if (IsStringASCII(StringPiece(src, src_len))) {
+    output->assign(src, src + src_len);
+    return true;
+  } else {
    PrepareForUTF16Or32Output(src, src_len, output);
    return ConvertUnicode(src, src_len, output);
+  }
 }
 std::wstring UTF8ToWide(const StringPiece& utf8) {
+  if (IsStringASCII(utf8)) {
+    return std::wstring(utf8.begin(), utf8.end());
+  }
  std::wstring ret;
-  UTF8ToWide(utf8.data(), utf8.length(), &ret);
+  PrepareForUTF16Or32Output(utf8.data(), utf8.length(), &ret);
+  ConvertUnicode(utf8.data(), utf8.length(), &ret);
  return ret;
 }
@@ -126,15 +136,25 @@ std::wstring UTF16ToWide(const string16& utf16) {
 #if defined(WCHAR_T_IS_UTF32)
 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
+  if (IsStringASCII(StringPiece(src, src_len))) {
+    output->assign(src, src + src_len);
+    return true;
+  } else {
    PrepareForUTF16Or32Output(src, src_len, output);
    return ConvertUnicode(src, src_len, output);
+  }
 }
 string16 UTF8ToUTF16(const StringPiece& utf8) {
+  if (IsStringASCII(utf8)) {
+    return string16(utf8.begin(), utf8.end());
+  }
  string16 ret;
+  PrepareForUTF16Or32Output(utf8.data(), utf8.length(), &ret);
  // Ignore the success flag of this call, it will do the best it can for
  // invalid input, which is what we want here.
-  UTF8ToUTF16(utf8.data(), utf8.length(), &ret);
+  ConvertUnicode(utf8.data(), utf8.length(), &ret);
  return ret;
 }