Commit 5a0025d0, authored by Koji Ishii and committed by Commit Bot

Move To{Lower,Upper}Unicode to CaseMap

This patch moves |StringImpl::{Lower,Upper}Unicode| to |CaseMap|,
renaming them to |CaseMap::FastTo{Lower,Upper}Invariant|, so that we
can share the code when implementing a fast code path for |CaseMap|.

No behavior changes other than moving and renaming.

Bug: 985201
Change-Id: Id4675625f1fc9a5195479ea6dee9ba44bdbb1438
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1710240
Reviewed-by: Emil A Eklund <eae@chromium.org>
Commit-Queue: Emil A Eklund <eae@chromium.org>
Cr-Commit-Position: refs/heads/master@{#681368}
parent e392e29a
......@@ -25,6 +25,7 @@
#include "third_party/blink/renderer/platform/wtf/dtoa/dtoa.h"
#include "third_party/blink/renderer/platform/wtf/text/atomic_string_table.h"
#include "third_party/blink/renderer/platform/wtf/text/case_map.h"
#include "third_party/blink/renderer/platform/wtf/text/string_impl.h"
namespace WTF {
......@@ -75,7 +76,7 @@ AtomicString AtomicString::DeprecatedLower() const {
StringImpl* impl = this->Impl();
if (UNLIKELY(!impl))
return *this;
scoped_refptr<StringImpl> new_impl = impl->LowerUnicode();
scoped_refptr<StringImpl> new_impl = CaseMap::FastToLowerInvariant(impl);
if (LIKELY(new_impl == impl))
return *this;
return AtomicString(String(std::move(new_impl)));
......
......@@ -132,16 +132,200 @@ CaseMap::Locale::Locale(const AtomicString& locale) {
case_map_locale_ = nullptr;
}
// Lower-cases |source| on the fast path declared "only for root locale".
//
// Returns |source| itself when it is pure ASCII with no uppercase letters, or
// when unicode::ToLower() still reports an error after the retry below.
// Otherwise returns a newly created StringImpl holding the lowered text.
// |source| must not be null.
scoped_refptr<StringImpl> CaseMap::FastToLowerInvariant(StringImpl* source) {
  DCHECK(source);
  const wtf_size_t source_length = source->length();

  // Note: This is a hot function in the Dromaeo benchmark, specifically the
  // no-op code path up through the first 'return' statement.
  if (source->Is8Bit()) {
    const LChar* const chars8 = source->Characters8();

    // Measure the leading run that is already lowercase ASCII; it can be
    // copied verbatim.
    wtf_size_t prefix_length = 0;
    while (prefix_length < source_length) {
      const LChar ch = chars8[prefix_length];
      if (UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F))
        break;
      ++prefix_length;
    }

    // The whole string is lowercase ASCII: nothing to do.
    if (prefix_length == source_length)
      return source;

    LChar* lowered8;
    scoped_refptr<StringImpl> result =
        StringImpl::CreateUninitialized(source_length, lowered8);
    memcpy(lowered8, chars8, prefix_length);
    for (wtf_size_t i = prefix_length; i < source_length; ++i) {
      const LChar ch = chars8[i];
      if (UNLIKELY(ch & ~0x7F))
        lowered8[i] = static_cast<LChar>(unicode::ToLower(ch));
      else
        lowered8[i] = ToASCIILower(ch);
    }
    return result;
  }

  // 16-bit string: one full pass records whether any ASCII uppercase letter
  // is present and ORs all code units together to detect non-ASCII content.
  const UChar* const chars16 = source->Characters16();
  bool has_ascii_upper = false;
  UChar all_bits = 0;
  for (wtf_size_t i = 0; i < source_length; ++i) {
    const UChar ch = chars16[i];
    if (UNLIKELY(IsASCIIUpper(ch)))
      has_ascii_upper = true;
    all_bits |= ch;
  }
  const bool is_all_ascii = !(all_bits & ~0x7F);

  // Nothing to do if the string is all ASCII with no uppercase.
  if (is_all_ascii && !has_ascii_upper)
    return source;

  // unicode::ToLower() is called with int32_t lengths below.
  CHECK_LE(source_length,
           static_cast<wtf_size_t>(std::numeric_limits<int32_t>::max()));
  const int32_t length = source_length;

  UChar* lowered16;
  scoped_refptr<StringImpl> result =
      StringImpl::CreateUninitialized(source_length, lowered16);

  if (is_all_ascii) {
    // ASCII only: lower each code unit independently.
    for (int32_t i = 0; i < length; ++i)
      lowered16[i] = ToASCIILower(chars16[i]);
    return result;
  }

  // Do a slower implementation for cases that include non-ASCII characters.
  bool error;
  const int32_t real_length =
      unicode::ToLower(lowered16, length, chars16, source_length, &error);
  if (!error && real_length == length)
    return result;

  // The first attempt failed or produced a different length; retry with a
  // buffer of the length it reported.
  result = StringImpl::CreateUninitialized(real_length, lowered16);
  unicode::ToLower(lowered16, real_length, chars16, source_length, &error);
  if (error)
    return source;
  return result;
}
// Upper-cases |source| on the fast path declared "only for root locale".
//
// Always allocates a result buffer up front (there is no no-op pre-scan).
// Returns a newly created StringImpl with the upper-cased text, or |source|
// itself if unicode::ToUpper() still reports an error after the retry at the
// end of the 16-bit path. |source| must not be null.
scoped_refptr<StringImpl> CaseMap::FastToUpperInvariant(StringImpl* source) {
DCHECK(source);
// This function could be optimized for no-op cases the way
// FastToLowerInvariant() is, but in empirical testing, few actual calls to
// this function are no-ops, so it wouldn't be worth the extra time for
// pre-scanning.
// The unicode::ToUpper() calls below take int32_t lengths.
CHECK_LE(source->length(),
static_cast<wtf_size_t>(std::numeric_limits<int32_t>::max()));
int32_t length = source->length();
if (source->Is8Bit()) {
LChar* data8;
scoped_refptr<StringImpl> new_impl =
StringImpl::CreateUninitialized(source->length(), data8);
// Do a faster loop for the case where all the characters are ASCII.
// |ored| accumulates every character so a single test afterwards tells
// whether any non-ASCII character was seen.
LChar ored = 0;
for (int i = 0; i < length; ++i) {
LChar c = source->Characters8()[i];
ored |= c;
data8[i] = ToASCIIUpper(c);
}
if (!(ored & ~0x7F))
return new_impl;
// Do a slower implementation for cases that include non-ASCII Latin-1
// characters.
int number_sharp_s_characters = 0;
// There are two special cases.
// 1. latin-1 characters when converted to upper case are 16 bit
// characters.
// 2. Lower case sharp-S converts to "SS" (two characters)
for (int32_t i = 0; i < length; ++i) {
LChar c = source->Characters8()[i];
if (UNLIKELY(c == kSmallLetterSharpSCharacter))
++number_sharp_s_characters;
UChar upper = static_cast<UChar>(unicode::ToUpper(c));
if (UNLIKELY(upper > 0xff)) {
// Since this upper-cased character does not fit in an 8-bit string, we
// need to take the 16-bit path.
// The partially filled 8-bit |new_impl| is abandoned here.
goto upconvert;
}
data8[i] = static_cast<LChar>(upper);
}
if (!number_sharp_s_characters)
return new_impl;
// We have |number_sharp_s_characters| sharp-s characters, but none of the
// other special characters. Each one grows the result by one character, so
// reallocate at the larger size and convert again from |source|.
new_impl = StringImpl::CreateUninitialized(
source->length() + number_sharp_s_characters, data8);
LChar* dest = data8;
for (int32_t i = 0; i < length; ++i) {
LChar c = source->Characters8()[i];
if (c == kSmallLetterSharpSCharacter) {
*dest++ = 'S';
*dest++ = 'S';
} else {
*dest++ = static_cast<LChar>(unicode::ToUpper(c));
}
}
return new_impl;
}
// Reached either by falling through (|source| is not 8-bit) or via the goto
// above (an 8-bit character whose upper-case form does not fit in 8 bits).
upconvert:
scoped_refptr<StringImpl> upconverted = source->UpconvertedString();
const UChar* source16 = upconverted->Characters16();
UChar* data16;
scoped_refptr<StringImpl> new_impl =
StringImpl::CreateUninitialized(source->length(), data16);
// Do a faster loop for the case where all the characters are ASCII.
UChar ored = 0;
for (int i = 0; i < length; ++i) {
UChar c = source16[i];
ored |= c;
data16[i] = ToASCIIUpper(c);
}
if (!(ored & ~0x7F))
return new_impl;
// Do a slower implementation for cases that include non-ASCII characters.
bool error;
int32_t real_length =
unicode::ToUpper(data16, length, source16, source->length(), &error);
if (!error && real_length == length)
return new_impl;
// The first attempt failed or produced a different length; retry with a
// buffer of the length it reported.
new_impl = StringImpl::CreateUninitialized(real_length, data16);
unicode::ToUpper(data16, real_length, source16, source->length(), &error);
// If the retry also fails, return the input unchanged.
if (error)
return source;
return new_impl;
}
// Lower-cases |source| according to this CaseMap's locale.
//
// When no locale-specific case map is set (|case_map_locale_| is null), the
// locale-independent fast path is used. Otherwise the conversion goes through
// CaseConvert() with |case_map_locale_|.
//
// The stale call to the removed StringImpl::LowerUnicode() is dropped: with it
// in place the fast path ran for every locale and the CaseConvert() call
// below was unreachable.
scoped_refptr<StringImpl> CaseMap::ToLower(StringImpl* source) const {
  if (!case_map_locale_)
    return FastToLowerInvariant(source);
  return CaseConvert(CaseMapType::kLowerLegacy, source, case_map_locale_);
}
// Upper-cases |source| according to this CaseMap's locale.
//
// When no locale-specific case map is set (|case_map_locale_| is null), the
// locale-independent fast path is used. Otherwise the conversion goes through
// CaseConvert() with |case_map_locale_|.
//
// The stale call to the removed StringImpl::UpperUnicode() is dropped: with it
// in place the fast path ran for every locale and the CaseConvert() call
// below was unreachable.
scoped_refptr<StringImpl> CaseMap::ToUpper(StringImpl* source) const {
  if (!case_map_locale_)
    return FastToUpperInvariant(source);
  return CaseConvert(CaseMapType::kUpperLegacy, source, case_map_locale_);
}
......
......@@ -52,7 +52,15 @@ class WTF_EXPORT CaseMap {
String ToLower(const String& source, TextOffsetMap* offset_map) const;
String ToUpper(const String& source, TextOffsetMap* offset_map) const;
// Fast code path for simple cases, only for root locale.
// TODO(crbug.com/627682): This should move to private, once
// |DeprecatedLower()| is deprecated.
static scoped_refptr<StringImpl> FastToLowerInvariant(StringImpl* source);
private:
// Fast code path for simple cases, only for root locale.
static scoped_refptr<StringImpl> FastToUpperInvariant(StringImpl* source);
const char* case_map_locale_;
};
......
......@@ -391,177 +391,6 @@ scoped_refptr<StringImpl> StringImpl::LowerASCII() {
return new_impl;
}
// Pre-move implementation: per the commit description, this body is relocated
// to CaseMap with no behavior change other than moving and renaming.
//
// Returns |this| when the string is pure ASCII with no uppercase letters, or
// when unicode::ToLower() still reports an error after the retry below.
// Otherwise returns a newly created StringImpl holding the lowered text.
scoped_refptr<StringImpl> StringImpl::LowerUnicode() {
// Note: This is a hot function in the Dromaeo benchmark, specifically the
// no-op code path up through the first 'return' statement.
// First scan the string for uppercase and non-ASCII characters:
if (Is8Bit()) {
// Index of the first character that is ASCII uppercase or non-ASCII;
// stays at |length_| when there is none.
wtf_size_t first_index_to_be_lowered = length_;
for (wtf_size_t i = 0; i < length_; ++i) {
LChar ch = Characters8()[i];
if (UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
first_index_to_be_lowered = i;
break;
}
}
// Nothing to do if the string is all ASCII with no uppercase.
if (first_index_to_be_lowered == length_)
return this;
LChar* data8;
scoped_refptr<StringImpl> new_impl = CreateUninitialized(length_, data8);
// The prefix before the first offending character is copied verbatim.
memcpy(data8, Characters8(), first_index_to_be_lowered);
for (wtf_size_t i = first_index_to_be_lowered; i < length_; ++i) {
LChar ch = Characters8()[i];
data8[i] = UNLIKELY(ch & ~0x7F) ? static_cast<LChar>(unicode::ToLower(ch))
: ToASCIILower(ch);
}
return new_impl;
}
// 16-bit string: |ored| accumulates every code unit so one test afterwards
// tells whether any non-ASCII code unit was seen.
bool no_upper = true;
UChar ored = 0;
const UChar* end = Characters16() + length_;
for (const UChar* chp = Characters16(); chp != end; ++chp) {
if (UNLIKELY(IsASCIIUpper(*chp)))
no_upper = false;
ored |= *chp;
}
// Nothing to do if the string is all ASCII with no uppercase.
if (no_upper && !(ored & ~0x7F))
return this;
// The unicode::ToLower() calls below take int32_t lengths.
CHECK_LE(length_, static_cast<wtf_size_t>(numeric_limits<int32_t>::max()));
int32_t length = length_;
// ASCII-only content with at least one uppercase letter.
if (!(ored & ~0x7F)) {
UChar* data16;
scoped_refptr<StringImpl> new_impl = CreateUninitialized(length_, data16);
for (int32_t i = 0; i < length; ++i) {
UChar c = Characters16()[i];
data16[i] = ToASCIILower(c);
}
return new_impl;
}
// Do a slower implementation for cases that include non-ASCII characters.
UChar* data16;
scoped_refptr<StringImpl> new_impl = CreateUninitialized(length_, data16);
bool error;
int32_t real_length =
unicode::ToLower(data16, length, Characters16(), length_, &error);
if (!error && real_length == length)
return new_impl;
// The first attempt failed or produced a different length; retry with a
// buffer of the length it reported.
new_impl = CreateUninitialized(real_length, data16);
unicode::ToLower(data16, real_length, Characters16(), length_, &error);
// If the retry also fails, return the string unchanged.
if (error)
return this;
return new_impl;
}
// Pre-move implementation: per the commit description, this body is relocated
// to CaseMap with no behavior change other than moving and renaming.
//
// Always allocates a result buffer up front (there is no no-op pre-scan).
// Returns a newly created StringImpl with the upper-cased text, or |this| if
// unicode::ToUpper() still reports an error after the retry at the end of the
// 16-bit path.
scoped_refptr<StringImpl> StringImpl::UpperUnicode() {
// This function could be optimized for no-op cases the way LowerUnicode() is,
// but in empirical testing, few actual calls to UpperUnicode() are no-ops, so
// it wouldn't be worth the extra time for pre-scanning.
// The unicode::ToUpper() calls below take int32_t lengths.
CHECK_LE(length_, static_cast<wtf_size_t>(numeric_limits<int32_t>::max()));
int32_t length = length_;
if (Is8Bit()) {
LChar* data8;
scoped_refptr<StringImpl> new_impl = CreateUninitialized(length_, data8);
// Do a faster loop for the case where all the characters are ASCII.
// |ored| accumulates every character so a single test afterwards tells
// whether any non-ASCII character was seen.
LChar ored = 0;
for (int i = 0; i < length; ++i) {
LChar c = Characters8()[i];
ored |= c;
data8[i] = ToASCIIUpper(c);
}
if (!(ored & ~0x7F))
return new_impl;
// Do a slower implementation for cases that include non-ASCII Latin-1
// characters.
int number_sharp_s_characters = 0;
// There are two special cases.
// 1. latin-1 characters when converted to upper case are 16 bit
// characters.
// 2. Lower case sharp-S converts to "SS" (two characters)
for (int32_t i = 0; i < length; ++i) {
LChar c = Characters8()[i];
if (UNLIKELY(c == kSmallLetterSharpSCharacter))
++number_sharp_s_characters;
UChar upper = static_cast<UChar>(unicode::ToUpper(c));
if (UNLIKELY(upper > 0xff)) {
// Since this upper-cased character does not fit in an 8-bit string, we
// need to take the 16-bit path.
// The partially filled 8-bit |new_impl| is abandoned here.
goto upconvert;
}
data8[i] = static_cast<LChar>(upper);
}
if (!number_sharp_s_characters)
return new_impl;
// We have |number_sharp_s_characters| sharp-s characters, but none of the
// other special characters. Each one grows the result by one character, so
// reallocate at the larger size and convert again.
new_impl = CreateUninitialized(length_ + number_sharp_s_characters, data8);
LChar* dest = data8;
for (int32_t i = 0; i < length; ++i) {
LChar c = Characters8()[i];
if (c == kSmallLetterSharpSCharacter) {
*dest++ = 'S';
*dest++ = 'S';
} else {
*dest++ = static_cast<LChar>(unicode::ToUpper(c));
}
}
return new_impl;
}
// Reached either by falling through (the string is not 8-bit) or via the goto
// above (an 8-bit character whose upper-case form does not fit in 8 bits).
upconvert:
scoped_refptr<StringImpl> upconverted = UpconvertedString();
const UChar* source16 = upconverted->Characters16();
UChar* data16;
scoped_refptr<StringImpl> new_impl = CreateUninitialized(length_, data16);
// Do a faster loop for the case where all the characters are ASCII.
UChar ored = 0;
for (int i = 0; i < length; ++i) {
UChar c = source16[i];
ored |= c;
data16[i] = ToASCIIUpper(c);
}
if (!(ored & ~0x7F))
return new_impl;
// Do a slower implementation for cases that include non-ASCII characters.
bool error;
int32_t real_length =
unicode::ToUpper(data16, length, source16, length_, &error);
if (!error && real_length == length)
return new_impl;
// The first attempt failed or produced a different length; retry with a
// buffer of the length it reported.
new_impl = CreateUninitialized(real_length, data16);
unicode::ToUpper(data16, real_length, source16, length_, &error);
// If the retry also fails, return the string unchanged.
if (error)
return this;
return new_impl;
}
scoped_refptr<StringImpl> StringImpl::UpperASCII() {
if (Is8Bit()) {
LChar* data8;
......
......@@ -353,9 +353,7 @@ class WTF_EXPORT StringImpl {
double ToDouble(bool* ok = nullptr);
float ToFloat(bool* ok = nullptr);
scoped_refptr<StringImpl> LowerUnicode();
scoped_refptr<StringImpl> LowerASCII();
scoped_refptr<StringImpl> UpperUnicode();
scoped_refptr<StringImpl> UpperASCII();
scoped_refptr<StringImpl> Fill(UChar);
......
......@@ -26,6 +26,7 @@
#include "third_party/blink/renderer/platform/wtf/text/string_impl.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "third_party/blink/renderer/platform/wtf/text/case_map.h"
#include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
namespace WTF {
......@@ -55,9 +56,10 @@ TEST(StringImplTest, LowerASCII) {
EXPECT_TRUE(Equal(test_string_impl.get(),
StringImpl::Create("lInk")->LowerASCII().get()));
EXPECT_TRUE(Equal(StringImpl::Create("LINK")->LowerUnicode().get(),
CaseMap case_map("");
EXPECT_TRUE(Equal(case_map.ToLower(StringImpl::Create("LINK")).Impl(),
StringImpl::Create("LINK")->LowerASCII().get()));
EXPECT_TRUE(Equal(StringImpl::Create("lInk")->LowerUnicode().get(),
EXPECT_TRUE(Equal(case_map.ToLower(StringImpl::Create("lInk")).Impl(),
StringImpl::Create("lInk")->LowerASCII().get()));
EXPECT_TRUE(Equal(StringImpl::Create("a\xE1").get(),
......@@ -112,9 +114,10 @@ TEST(StringImplTest, UpperASCII) {
EXPECT_TRUE(Equal(test_string_impl.get(),
StringImpl::Create("lInk")->UpperASCII().get()));
EXPECT_TRUE(Equal(StringImpl::Create("LINK")->UpperUnicode().get(),
CaseMap case_map("");
EXPECT_TRUE(Equal(case_map.ToUpper(StringImpl::Create("LINK")).Impl(),
StringImpl::Create("LINK")->UpperASCII().get()));
EXPECT_TRUE(Equal(StringImpl::Create("lInk")->UpperUnicode().get(),
EXPECT_TRUE(Equal(case_map.ToUpper(StringImpl::Create("lInk")).Impl(),
StringImpl::Create("lInk")->UpperASCII().get()));
EXPECT_TRUE(Equal(StringImpl::Create("A\xE1").get(),
......
......@@ -30,6 +30,7 @@
#include "third_party/blink/renderer/platform/wtf/dtoa/dtoa.h"
#include "third_party/blink/renderer/platform/wtf/math_extras.h"
#include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h"
#include "third_party/blink/renderer/platform/wtf/text/case_map.h"
#include "third_party/blink/renderer/platform/wtf/text/character_names.h"
#include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
#include "third_party/blink/renderer/platform/wtf/text/unicode.h"
......@@ -109,7 +110,7 @@ String String::Substring(unsigned pos, unsigned len) const {
// Returns a lower-cased copy of this string via the locale-independent fast
// path in CaseMap. A null string yields a null String.
//
// The stale call to the removed StringImpl::LowerUnicode() is dropped; it
// returned before the CaseMap call could ever be reached.
String String::DeprecatedLower() const {
  if (!impl_)
    return String();
  return CaseMap::FastToLowerInvariant(impl_.get());
}
String String::LowerASCII() const {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment