Move To{Lower,Upper}Unicode to CaseMap

This patch moves |StringImpl::To{Lower,Upper}Unicode| to |CaseMap|, so that we can share the code when implementing a fast code path for |CaseMap|. No behavior changes other than moving and renaming. Bug: 985201 Change-Id: Id4675625f1fc9a5195479ea6dee9ba44bdbb1438 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1710240 Reviewed-by: Emil A Eklund <eae@chromium.org> Commit-Queue: Emil A Eklund <eae@chromium.org> Cr-Commit-Position: refs/heads/master@{#681368}

Move To{Lower,Upper}Unicode to CaseMap
This patch moves |StringImpl::To{Lower,Upper}Unicode| to |CaseMap|, so that we can share the code when implementing a fast code path for |CaseMap|. No behavior changes other than moving and renaming. Bug: 985201 Change-Id: Id4675625f1fc9a5195479ea6dee9ba44bdbb1438 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1710240 Reviewed-by: Emil A Eklund <eae@chromium.org> Commit-Queue: Emil A Eklund <eae@chromium.org> Cr-Commit-Position: refs/heads/master@{#681368}
5a0025d0 · Koji Ishii · Commit Bot · e392e29a · 5a0025d0 · 5a0025d0
Commit 5a0025d0 authored Jul 26, 2019 by Koji Ishii Committed by Commit Bot Jul 26, 2019
7 changed files
--- a/third_party/blink/renderer/platform/wtf/text/atomic_string.cc
+++ b/third_party/blink/renderer/platform/wtf/text/atomic_string.cc
@@ -25,6 +25,7 @@

 #include "third_party/blink/renderer/platform/wtf/dtoa/dtoa.h"
 #include "third_party/blink/renderer/platform/wtf/text/atomic_string_table.h"
+#include "third_party/blink/renderer/platform/wtf/text/case_map.h"
 #include "third_party/blink/renderer/platform/wtf/text/string_impl.h"

 namespace WTF {
@@ -75,7 +76,7 @@ AtomicString AtomicString::DeprecatedLower() const {
  StringImpl* impl = this->Impl();
  if (UNLIKELY(!impl))
    return *this;
-  scoped_refptr<StringImpl> new_impl = impl->LowerUnicode();
+  scoped_refptr<StringImpl> new_impl = CaseMap::FastToLowerInvariant(impl);
  if (LIKELY(new_impl == impl))
    return *this;
  return AtomicString(String(std::move(new_impl)));

--- a/third_party/blink/renderer/platform/wtf/text/case_map.cc
+++ b/third_party/blink/renderer/platform/wtf/text/case_map.cc
@@ -132,16 +132,200 @@ CaseMap::Locale::Locale(const AtomicString& locale) {
    case_map_locale_ = nullptr;
 }

+scoped_refptr<StringImpl> CaseMap::FastToLowerInvariant(StringImpl* source) {
+  DCHECK(source);  // A null |source| is a caller error.
+
+  // Lower-cases |source| for the root locale only. This is a hot function in
+  // the Dromaeo benchmark, specifically the no-op path up to the first return.
+
+  // 8-bit strings: find the first ASCII-uppercase or non-ASCII character.
+  if (source->Is8Bit()) {
+    wtf_size_t first_index_to_be_lowered = source->length();
+    for (wtf_size_t i = 0; i < source->length(); ++i) {
+      LChar ch = source->Characters8()[i];
+      if (UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
+        first_index_to_be_lowered = i;
+        break;
+      }
+    }
+
+    // Nothing to do if the string is all ASCII with no uppercase; hand back
+    if (first_index_to_be_lowered == source->length())
+      return source;  // |source| itself rather than a copy.
+
+    LChar* data8;  // Pointed at the new string's buffer by the call below.
+    scoped_refptr<StringImpl> new_impl =
+        StringImpl::CreateUninitialized(source->length(), data8);
+    memcpy(data8, source->Characters8(), first_index_to_be_lowered);
+
+    for (wtf_size_t i = first_index_to_be_lowered; i < source->length(); ++i) {
+      LChar ch = source->Characters8()[i];
+      data8[i] = UNLIKELY(ch & ~0x7F) ? static_cast<LChar>(unicode::ToLower(ch))
+                                      : ToASCIILower(ch);  // ASCII fast path.
+    }
+
+    return new_impl;
+  }
+
+  bool no_upper = true;  // No ASCII uppercase letter seen so far.
+  UChar ored = 0;        // OR of every code unit; high bits mean non-ASCII.
+
+  const UChar* end = source->Characters16() + source->length();
+  for (const UChar* chp = source->Characters16(); chp != end; ++chp) {
+    if (UNLIKELY(IsASCIIUpper(*chp)))
+      no_upper = false;
+    ored |= *chp;
+  }
+  // Nothing to do if the string is all ASCII with no uppercase.
+  if (no_upper && !(ored & ~0x7F))
+    return source;
+
+  CHECK_LE(source->length(),
+           static_cast<wtf_size_t>(std::numeric_limits<int32_t>::max()));
+  int32_t length = source->length();
+
+  if (!(ored & ~0x7F)) {  // All ASCII, but at least one uppercase letter.
+    UChar* data16;
+    scoped_refptr<StringImpl> new_impl =
+        StringImpl::CreateUninitialized(source->length(), data16);
+
+    for (int32_t i = 0; i < length; ++i) {
+      UChar c = source->Characters16()[i];
+      data16[i] = ToASCIILower(c);
+    }
+    return new_impl;
+  }
+
+  // Slower path for strings that include non-ASCII characters: try a buffer
+  UChar* data16;  // of the same length first, then retry with |real_length|.
+  scoped_refptr<StringImpl> new_impl =
+      StringImpl::CreateUninitialized(source->length(), data16);
+
+  bool error;
+  int32_t real_length = unicode::ToLower(data16, length, source->Characters16(),
+                                         source->length(), &error);
+  if (!error && real_length == length)
+    return new_impl;  // The result fit in the same-length buffer.
+
+  new_impl = StringImpl::CreateUninitialized(real_length, data16);
+  unicode::ToLower(data16, real_length, source->Characters16(),
+                   source->length(), &error);
+  if (error)
+    return source;  // Conversion failed; return the input unchanged.
+  return new_impl;
+}
+
+scoped_refptr<StringImpl> CaseMap::FastToUpperInvariant(StringImpl* source) {
+  DCHECK(source);  // A null |source| is a caller error.
+
+  // This function could be optimized for no-op cases the way
+  // FastToLowerInvariant() is, but in empirical testing, few actual calls to
+  // upper-casing are no-ops, so it wouldn't be worth the extra pre-scanning.
+
+  CHECK_LE(source->length(),
+           static_cast<wtf_size_t>(std::numeric_limits<int32_t>::max()));
+  int32_t length = source->length();
+
+  if (source->Is8Bit()) {
+    LChar* data8;
+    scoped_refptr<StringImpl> new_impl =
+        StringImpl::CreateUninitialized(source->length(), data8);
+
+    // Do a faster loop for the case where all the characters are ASCII.
+    LChar ored = 0;  // OR of every character; the high bit means non-ASCII.
+    for (int i = 0; i < length; ++i) {
+      LChar c = source->Characters8()[i];
+      ored |= c;
+      data8[i] = ToASCIIUpper(c);
+    }
+    if (!(ored & ~0x7F))
+      return new_impl;
+
+    // Do a slower implementation for cases that include non-ASCII Latin-1
+    // characters.
+    int number_sharp_s_characters = 0;
+
+    // There are two special cases.
+    //  1. Latin-1 characters when converted to upper case are 16 bit
+    //     characters.
+    //  2. Lower case sharp-S converts to "SS" (two characters)
+    for (int32_t i = 0; i < length; ++i) {
+      LChar c = source->Characters8()[i];
+      if (UNLIKELY(c == kSmallLetterSharpSCharacter))
+        ++number_sharp_s_characters;
+      UChar upper = static_cast<UChar>(unicode::ToUpper(c));
+      if (UNLIKELY(upper > 0xff)) {
+        // Since this upper-cased character does not fit in an 8-bit string, we
+        // need to take the 16-bit path, which redoes the conversion from scratch.
+        goto upconvert;
+      }
+      data8[i] = static_cast<LChar>(upper);
+    }
+
+    if (!number_sharp_s_characters)
+      return new_impl;
+
+    // We have number_sharp_s_characters sharp-s characters, but none of the
+    // other special characters, so the result is still 8-bit, just longer.
+    new_impl = StringImpl::CreateUninitialized(
+        source->length() + number_sharp_s_characters, data8);
+
+    LChar* dest = data8;
+
+    for (int32_t i = 0; i < length; ++i) {
+      LChar c = source->Characters8()[i];
+      if (c == kSmallLetterSharpSCharacter) {
+        *dest++ = 'S';
+        *dest++ = 'S';
+      } else {
+        *dest++ = static_cast<LChar>(unicode::ToUpper(c));
+      }
+    }
+
+    return new_impl;
+  }
+
+upconvert:
+  scoped_refptr<StringImpl> upconverted = source->UpconvertedString();
+  const UChar* source16 = upconverted->Characters16();
+
+  UChar* data16;
+  scoped_refptr<StringImpl> new_impl =
+      StringImpl::CreateUninitialized(source->length(), data16);
+
+  // Do a faster loop for the case where all the characters are ASCII.
+  UChar ored = 0;
+  for (int i = 0; i < length; ++i) {
+    UChar c = source16[i];
+    ored |= c;
+    data16[i] = ToASCIIUpper(c);
+  }
+  if (!(ored & ~0x7F))
+    return new_impl;
+
+  // Slower path for strings that include non-ASCII characters: try a buffer
+  bool error;  // of the same length first, then retry with |real_length|.
+  int32_t real_length =
+      unicode::ToUpper(data16, length, source16, source->length(), &error);
+  if (!error && real_length == length)
+    return new_impl;  // The result fit in the same-length buffer.
+  new_impl = StringImpl::CreateUninitialized(real_length, data16);
+  unicode::ToUpper(data16, real_length, source16, source->length(), &error);
+  if (error)
+    return source;  // Conversion failed; return the input unchanged.
+  return new_impl;
+}
+
 scoped_refptr<StringImpl> CaseMap::ToLower(StringImpl* source) const {
  if (!case_map_locale_)
-    return source->LowerUnicode();
+    return FastToLowerInvariant(source);

  return CaseConvert(CaseMapType::kLowerLegacy, source, case_map_locale_);
 }

 scoped_refptr<StringImpl> CaseMap::ToUpper(StringImpl* source) const {
  if (!case_map_locale_)
-    return source->UpperUnicode();
+    return FastToUpperInvariant(source);

  return CaseConvert(CaseMapType::kUpperLegacy, source, case_map_locale_);
 }

--- a/third_party/blink/renderer/platform/wtf/text/case_map.h
+++ b/third_party/blink/renderer/platform/wtf/text/case_map.h
@@ -52,7 +52,15 @@ class WTF_EXPORT CaseMap {
  String ToLower(const String& source, TextOffsetMap* offset_map) const;
  String ToUpper(const String& source, TextOffsetMap* offset_map) const;

+  // Fast code path for simple cases, only for root locale.
+  // TODO(crbug.com/627682): This should move to private, once
+  // |DeprecatedLower()| is deprecated.
+  static scoped_refptr<StringImpl> FastToLowerInvariant(StringImpl* source);
+
 private:
+  // Fast code path for simple cases, only for root locale.
+  static scoped_refptr<StringImpl> FastToUpperInvariant(StringImpl* source);
+
  const char* case_map_locale_;
 };


--- a/third_party/blink/renderer/platform/wtf/text/string_impl.cc
+++ b/third_party/blink/renderer/platform/wtf/text/string_impl.cc
@@ -391,177 +391,6 @@ scoped_refptr<StringImpl> StringImpl::LowerASCII() {
  return new_impl;
 }

-scoped_refptr<StringImpl> StringImpl::LowerUnicode() {
-  // Note: This is a hot function in the Dromaeo benchmark, specifically the
-  // no-op code path up through the first 'return' statement.
-
-  // First scan the string for uppercase and non-ASCII characters:
-  if (Is8Bit()) {
-    wtf_size_t first_index_to_be_lowered = length_;
-    for (wtf_size_t i = 0; i < length_; ++i) {
-      LChar ch = Characters8()[i];
-      if (UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
-        first_index_to_be_lowered = i;
-        break;
-      }
-    }
-
-    // Nothing to do if the string is all ASCII with no uppercase.
-    if (first_index_to_be_lowered == length_)
-      return this;
-
-    LChar* data8;
-    scoped_refptr<StringImpl> new_impl = CreateUninitialized(length_, data8);
-    memcpy(data8, Characters8(), first_index_to_be_lowered);
-
-    for (wtf_size_t i = first_index_to_be_lowered; i < length_; ++i) {
-      LChar ch = Characters8()[i];
-      data8[i] = UNLIKELY(ch & ~0x7F) ? static_cast<LChar>(unicode::ToLower(ch))
-                                      : ToASCIILower(ch);
-    }
-
-    return new_impl;
-  }
-
-  bool no_upper = true;
-  UChar ored = 0;
-
-  const UChar* end = Characters16() + length_;
-  for (const UChar* chp = Characters16(); chp != end; ++chp) {
-    if (UNLIKELY(IsASCIIUpper(*chp)))
-      no_upper = false;
-    ored |= *chp;
-  }
-  // Nothing to do if the string is all ASCII with no uppercase.
-  if (no_upper && !(ored & ~0x7F))
-    return this;
-
-  CHECK_LE(length_, static_cast<wtf_size_t>(numeric_limits<int32_t>::max()));
-  int32_t length = length_;
-
-  if (!(ored & ~0x7F)) {
-    UChar* data16;
-    scoped_refptr<StringImpl> new_impl = CreateUninitialized(length_, data16);
-
-    for (int32_t i = 0; i < length; ++i) {
-      UChar c = Characters16()[i];
-      data16[i] = ToASCIILower(c);
-    }
-    return new_impl;
-  }
-
-  // Do a slower implementation for cases that include non-ASCII characters.
-  UChar* data16;
-  scoped_refptr<StringImpl> new_impl = CreateUninitialized(length_, data16);
-
-  bool error;
-  int32_t real_length =
-      unicode::ToLower(data16, length, Characters16(), length_, &error);
-  if (!error && real_length == length)
-    return new_impl;
-
-  new_impl = CreateUninitialized(real_length, data16);
-  unicode::ToLower(data16, real_length, Characters16(), length_, &error);
-  if (error)
-    return this;
-  return new_impl;
-}
-
-scoped_refptr<StringImpl> StringImpl::UpperUnicode() {
-  // This function could be optimized for no-op cases the way LowerUnicode() is,
-  // but in empirical testing, few actual calls to UpperUnicode() are no-ops, so
-  // it wouldn't be worth the extra time for pre-scanning.
-
-  CHECK_LE(length_, static_cast<wtf_size_t>(numeric_limits<int32_t>::max()));
-  int32_t length = length_;
-
-  if (Is8Bit()) {
-    LChar* data8;
-    scoped_refptr<StringImpl> new_impl = CreateUninitialized(length_, data8);
-
-    // Do a faster loop for the case where all the characters are ASCII.
-    LChar ored = 0;
-    for (int i = 0; i < length; ++i) {
-      LChar c = Characters8()[i];
-      ored |= c;
-      data8[i] = ToASCIIUpper(c);
-    }
-    if (!(ored & ~0x7F))
-      return new_impl;
-
-    // Do a slower implementation for cases that include non-ASCII Latin-1
-    // characters.
-    int number_sharp_s_characters = 0;
-
-    // There are two special cases.
-    //  1. latin-1 characters when converted to upper case are 16 bit
-    //     characters.
-    //  2. Lower case sharp-S converts to "SS" (two characters)
-    for (int32_t i = 0; i < length; ++i) {
-      LChar c = Characters8()[i];
-      if (UNLIKELY(c == kSmallLetterSharpSCharacter))
-        ++number_sharp_s_characters;
-      UChar upper = static_cast<UChar>(unicode::ToUpper(c));
-      if (UNLIKELY(upper > 0xff)) {
-        // Since this upper-cased character does not fit in an 8-bit string, we
-        // need to take the 16-bit path.
-        goto upconvert;
-      }
-      data8[i] = static_cast<LChar>(upper);
-    }
-
-    if (!number_sharp_s_characters)
-      return new_impl;
-
-    // We have numberSSCharacters sharp-s characters, but none of the other
-    // special characters.
-    new_impl = CreateUninitialized(length_ + number_sharp_s_characters, data8);
-
-    LChar* dest = data8;
-
-    for (int32_t i = 0; i < length; ++i) {
-      LChar c = Characters8()[i];
-      if (c == kSmallLetterSharpSCharacter) {
-        *dest++ = 'S';
-        *dest++ = 'S';
-      } else {
-        *dest++ = static_cast<LChar>(unicode::ToUpper(c));
-      }
-    }
-
-    return new_impl;
-  }
-
-upconvert:
-  scoped_refptr<StringImpl> upconverted = UpconvertedString();
-  const UChar* source16 = upconverted->Characters16();
-
-  UChar* data16;
-  scoped_refptr<StringImpl> new_impl = CreateUninitialized(length_, data16);
-
-  // Do a faster loop for the case where all the characters are ASCII.
-  UChar ored = 0;
-  for (int i = 0; i < length; ++i) {
-    UChar c = source16[i];
-    ored |= c;
-    data16[i] = ToASCIIUpper(c);
-  }
-  if (!(ored & ~0x7F))
-    return new_impl;
-
-  // Do a slower implementation for cases that include non-ASCII characters.
-  bool error;
-  int32_t real_length =
-      unicode::ToUpper(data16, length, source16, length_, &error);
-  if (!error && real_length == length)
-    return new_impl;
-  new_impl = CreateUninitialized(real_length, data16);
-  unicode::ToUpper(data16, real_length, source16, length_, &error);
-  if (error)
-    return this;
-  return new_impl;
-}
-
 scoped_refptr<StringImpl> StringImpl::UpperASCII() {
  if (Is8Bit()) {
    LChar* data8;

--- a/third_party/blink/renderer/platform/wtf/text/string_impl.h
+++ b/third_party/blink/renderer/platform/wtf/text/string_impl.h
@@ -353,9 +353,7 @@ class WTF_EXPORT StringImpl {
  double ToDouble(bool* ok = nullptr);
  float ToFloat(bool* ok = nullptr);

-  scoped_refptr<StringImpl> LowerUnicode();
  scoped_refptr<StringImpl> LowerASCII();
-  scoped_refptr<StringImpl> UpperUnicode();
  scoped_refptr<StringImpl> UpperASCII();

  scoped_refptr<StringImpl> Fill(UChar);

--- a/third_party/blink/renderer/platform/wtf/text/string_impl_test.cc
+++ b/third_party/blink/renderer/platform/wtf/text/string_impl_test.cc
@@ -26,6 +26,7 @@
 #include "third_party/blink/renderer/platform/wtf/text/string_impl.h"

 #include "testing/gtest/include/gtest/gtest.h"
+#include "third_party/blink/renderer/platform/wtf/text/case_map.h"
 #include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"

 namespace WTF {
@@ -55,9 +56,10 @@ TEST(StringImplTest, LowerASCII) {
  EXPECT_TRUE(Equal(test_string_impl.get(),
                    StringImpl::Create("lInk")->LowerASCII().get()));

-  EXPECT_TRUE(Equal(StringImpl::Create("LINK")->LowerUnicode().get(),
+  CaseMap case_map("");
+  EXPECT_TRUE(Equal(case_map.ToLower(StringImpl::Create("LINK")).Impl(),
                    StringImpl::Create("LINK")->LowerASCII().get()));
-  EXPECT_TRUE(Equal(StringImpl::Create("lInk")->LowerUnicode().get(),
+  EXPECT_TRUE(Equal(case_map.ToLower(StringImpl::Create("lInk")).Impl(),
                    StringImpl::Create("lInk")->LowerASCII().get()));

  EXPECT_TRUE(Equal(StringImpl::Create("a\xE1").get(),
@@ -112,9 +114,10 @@ TEST(StringImplTest, UpperASCII) {
  EXPECT_TRUE(Equal(test_string_impl.get(),
                    StringImpl::Create("lInk")->UpperASCII().get()));

-  EXPECT_TRUE(Equal(StringImpl::Create("LINK")->UpperUnicode().get(),
+  CaseMap case_map("");
+  EXPECT_TRUE(Equal(case_map.ToUpper(StringImpl::Create("LINK")).Impl(),
                    StringImpl::Create("LINK")->UpperASCII().get()));
-  EXPECT_TRUE(Equal(StringImpl::Create("lInk")->UpperUnicode().get(),
+  EXPECT_TRUE(Equal(case_map.ToUpper(StringImpl::Create("lInk")).Impl(),
                    StringImpl::Create("lInk")->UpperASCII().get()));

  EXPECT_TRUE(Equal(StringImpl::Create("A\xE1").get(),

--- a/third_party/blink/renderer/platform/wtf/text/wtf_string.cc
+++ b/third_party/blink/renderer/platform/wtf/text/wtf_string.cc
@@ -30,6 +30,7 @@
 #include "third_party/blink/renderer/platform/wtf/dtoa/dtoa.h"
 #include "third_party/blink/renderer/platform/wtf/math_extras.h"
 #include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h"
+#include "third_party/blink/renderer/platform/wtf/text/case_map.h"
 #include "third_party/blink/renderer/platform/wtf/text/character_names.h"
 #include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
 #include "third_party/blink/renderer/platform/wtf/text/unicode.h"
@@ -109,7 +110,7 @@ String String::Substring(unsigned pos, unsigned len) const {
 String String::DeprecatedLower() const {
  if (!impl_)
    return String();
-  return impl_->LowerUnicode();
+  return CaseMap::FastToLowerInvariant(impl_.get());
 }

 String String::LowerASCII() const {