Commit cff00e53 authored by Hidehiko Abe, committed by Commit Bot

Use base/third_party/icu for char_iterator.

This CL replaces third_party/icu uses with base/third_party/icu.
Along with the change, base/third_party/icu/* is updated to v67.
- Extracted several macros used for char_iterator implementation.
- Removed utf8_nextCharSafeBody, as it is no longer used.

This CL is preparation to move char_iterator to base/strings
so that it can be used in new functions in base/.

Bug: 2492481
Test: Ran base_unittests.
Change-Id: I779c7384b15701dcac3feac9d258be5793ca209d
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2494402
Commit-Queue: Hidehiko Abe <hidehiko@chromium.org>
Reviewed-by: Daniel Cheng <dcheng@chromium.org>
Cr-Commit-Position: refs/heads/master@{#821661}
parent 782db57a
...@@ -702,7 +702,6 @@ component("base") { ...@@ -702,7 +702,6 @@ component("base") {
"third_party/cityhash/city.h", "third_party/cityhash/city.h",
"third_party/cityhash_v103/src/city_v103.cc", "third_party/cityhash_v103/src/city_v103.cc",
"third_party/cityhash_v103/src/city_v103.h", "third_party/cityhash_v103/src/city_v103.h",
"third_party/icu/icu_utf.cc",
"third_party/icu/icu_utf.h", "third_party/icu/icu_utf.h",
"third_party/nspr/prtime.cc", "third_party/nspr/prtime.cc",
"third_party/nspr/prtime.h", "third_party/nspr/prtime.h",
......
...@@ -5,8 +5,7 @@ ...@@ -5,8 +5,7 @@
#include "base/i18n/char_iterator.h" #include "base/i18n/char_iterator.h"
#include "base/check_op.h" #include "base/check_op.h"
#include "third_party/icu/source/common/unicode/utf16.h" #include "base/third_party/icu/icu_utf.h"
#include "third_party/icu/source/common/unicode/utf8.h"
namespace base { namespace base {
namespace i18n { namespace i18n {
...@@ -16,7 +15,7 @@ namespace i18n { ...@@ -16,7 +15,7 @@ namespace i18n {
UTF8CharIterator::UTF8CharIterator(base::StringPiece str) UTF8CharIterator::UTF8CharIterator(base::StringPiece str)
: str_(str), array_pos_(0), next_pos_(0), char_pos_(0), char_(0) { : str_(str), array_pos_(0), next_pos_(0), char_pos_(0), char_(0) {
if (!str_.empty()) if (!str_.empty())
U8_NEXT(str_.data(), next_pos_, str_.length(), char_); CBU8_NEXT(str_.data(), next_pos_, str_.length(), char_);
} }
UTF8CharIterator::~UTF8CharIterator() = default; UTF8CharIterator::~UTF8CharIterator() = default;
...@@ -28,7 +27,7 @@ bool UTF8CharIterator::Advance() { ...@@ -28,7 +27,7 @@ bool UTF8CharIterator::Advance() {
array_pos_ = next_pos_; array_pos_ = next_pos_;
char_pos_++; char_pos_++;
if (next_pos_ < str_.length()) if (next_pos_ < str_.length())
U8_NEXT(str_.data(), next_pos_, str_.length(), char_); CBU8_NEXT(str_.data(), next_pos_, str_.length(), char_);
return true; return true;
} }
...@@ -49,7 +48,7 @@ UTF16CharIterator& UTF16CharIterator::operator=(UTF16CharIterator&& to_move) = ...@@ -49,7 +48,7 @@ UTF16CharIterator& UTF16CharIterator::operator=(UTF16CharIterator&& to_move) =
UTF16CharIterator UTF16CharIterator::LowerBound(StringPiece16 str, UTF16CharIterator UTF16CharIterator::LowerBound(StringPiece16 str,
size_t array_index) { size_t array_index) {
DCHECK_LE(array_index, str.length()); DCHECK_LE(array_index, str.length());
U16_SET_CP_START(str.data(), 0, array_index); CBU16_SET_CP_START(str.data(), 0, array_index);
return UTF16CharIterator(str, array_index); return UTF16CharIterator(str, array_index);
} }
...@@ -57,7 +56,7 @@ UTF16CharIterator UTF16CharIterator::LowerBound(StringPiece16 str, ...@@ -57,7 +56,7 @@ UTF16CharIterator UTF16CharIterator::LowerBound(StringPiece16 str,
UTF16CharIterator UTF16CharIterator::UpperBound(StringPiece16 str, UTF16CharIterator UTF16CharIterator::UpperBound(StringPiece16 str,
size_t array_index) { size_t array_index) {
DCHECK_LE(array_index, str.length()); DCHECK_LE(array_index, str.length());
U16_SET_CP_LIMIT(str.data(), 0, array_index, str.length()); CBU16_SET_CP_LIMIT(str.data(), 0, array_index, str.length());
return UTF16CharIterator(str, array_index); return UTF16CharIterator(str, array_index);
} }
...@@ -65,8 +64,8 @@ int32_t UTF16CharIterator::NextCodePoint() const { ...@@ -65,8 +64,8 @@ int32_t UTF16CharIterator::NextCodePoint() const {
if (next_pos_ >= str_.length()) if (next_pos_ >= str_.length())
return 0; return 0;
UChar32 c; base_icu::UChar32 c;
U16_GET(str_.data(), 0, next_pos_, str_.length(), c); CBU16_GET(str_.data(), 0, next_pos_, str_.length(), c);
return c; return c;
} }
...@@ -75,8 +74,8 @@ int32_t UTF16CharIterator::PreviousCodePoint() const { ...@@ -75,8 +74,8 @@ int32_t UTF16CharIterator::PreviousCodePoint() const {
return 0; return 0;
uint32_t pos = array_pos_; uint32_t pos = array_pos_;
UChar32 c; base_icu::UChar32 c;
U16_PREV(str_.data(), 0, pos, c); CBU16_PREV(str_.data(), 0, pos, c);
return c; return c;
} }
...@@ -98,7 +97,7 @@ bool UTF16CharIterator::Rewind() { ...@@ -98,7 +97,7 @@ bool UTF16CharIterator::Rewind() {
next_pos_ = array_pos_; next_pos_ = array_pos_;
char_offset_--; char_offset_--;
U16_PREV(str_.data(), 0, array_pos_, char_); CBU16_PREV(str_.data(), 0, array_pos_, char_);
return true; return true;
} }
...@@ -115,7 +114,7 @@ UTF16CharIterator::UTF16CharIterator(StringPiece16 str, size_t initial_pos) ...@@ -115,7 +114,7 @@ UTF16CharIterator::UTF16CharIterator(StringPiece16 str, size_t initial_pos)
void UTF16CharIterator::ReadChar() { void UTF16CharIterator::ReadChar() {
// This is actually a huge macro, so is worth having in a separate function. // This is actually a huge macro, so is worth having in a separate function.
U16_NEXT(str_.data(), next_pos_, str_.length(), char_); CBU16_NEXT(str_.data(), next_pos_, str_.length(), char_);
} }
} // namespace i18n } // namespace i18n
......
...@@ -6,12 +6,12 @@ License File: NOT_SHIPPED ...@@ -6,12 +6,12 @@ License File: NOT_SHIPPED
This file has the relevant components from ICU copied to handle basic UTF8/16/32 This file has the relevant components from ICU copied to handle basic UTF8/16/32
conversions. Components are copied from umachine.h, utf.h, utf8.h, and utf16.h conversions. Components are copied from umachine.h, utf.h, utf8.h, and utf16.h
into icu_utf.h, and from utf_impl.cpp into icu_utf.cc. into icu_utf.h.
The main change is that U_/U8_/U16_ prefixes have been replaced with The main change is that U_/U8_/U16_/UPRV_ prefixes have been replaced with
CBU_/CBU8_/CBU16_ (for "Chrome Base") to avoid confusion with the "real" ICU CBU_/CBU8_/CBU16_/CBUPRV_ (for "Chrome Base") to avoid confusion with the "real"
macros should ICU be in use on the system. For the same reason, the functions ICU macros should ICU be in use on the system. For the same reason, the
and types have been put in the "base_icu" namespace. functions and types have been put in the "base_icu" namespace.
Note that this license file is marked as NOT_SHIPPED, since a more complete Note that this license file is marked as NOT_SHIPPED, since a more complete
ICU license is included from //third_party/icu/README.chromium ICU license is included from //third_party/icu/README.chromium
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1999-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: utf_impl.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep13
* created by: Markus W. Scherer
*
* This file provides implementation functions for macros in the utfXX.h
* that would otherwise be too long as macros.
*/
#include "base/third_party/icu/icu_utf.h"
namespace base_icu {
// source/common/utf_impl.cpp
// Positive replacement code points for the obsolete (strict>=0) error modes.
// errorValue() indexes this table with the number of trail bytes consumed
// before the failure, so each entry is a value whose UTF-8 form has that many
// bytes plus one. The values equal UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2 and
// UTF_ERROR_VALUE, spelled out here so that the obsolete unicode/utf_old.h is
// not needed.
static const UChar32 utf8_errorValue[6] = {
    0x15,      // index 0
    0x9f,      // index 1
    0xffff,    // index 2
    0x10ffff,  // index 3
    0,         // index 4: zero padding (was implicit)
    0          // index 5: zero padding (was implicit)
};
// Chooses the value reported for an ill-formed UTF-8 sequence.
//   count  - index into utf8_errorValue; only consulted when strict>=0.
//   strict - error mode: -3 yields U+FFFD, any other negative value yields
//            CBU_SENTINEL, and zero or positive values use the table.
static UChar32
errorValue(int32_t count, int8_t strict) {
    if(strict==-3) {
        return 0xfffd;
    }
    if(strict<0) {
        return CBU_SENTINEL;
    }
    return utf8_errorValue[count];
}
/*
* Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
* and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
*
* U8_NEXT() supports NUL-terminated strings indicated via length<0.
*
* The "strict" parameter controls the error behavior:
* <0 "Safe" behavior of U8_NEXT():
* -1: All illegal byte sequences yield U_SENTINEL=-1.
* -2: Same as -1, except for lenient treatment of surrogate code points as legal.
* Some implementations use this for roundtripping of
* Unicode 16-bit strings that are not well-formed UTF-16, that is, they
* contain unpaired surrogates.
* -3: All illegal byte sequences yield U+FFFD.
* 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
* All illegal byte sequences yield a positive code point such that this
* result code point would be encoded with the same number of bytes as
* the illegal sequence.
* >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
* Same as the obsolete "safe" behavior, but non-characters are also treated
* like illegal sequences.
*
* Note that a UBool is the same as an int8_t.
*
* Parameters:
*   s      - the UTF-8 bytes being decoded.
*   pi     - in/out index into s. On entry it is one past the lead byte c;
*            on return it is the index of the first byte not consumed.
*   length - index limit for s; negative for NUL-terminated input.
*   c      - the lead byte, already read by the caller.
*   strict - error-behavior selector, see above.
*
* Returns the decoded code point on success; otherwise the result of
* errorValue() for the number of bytes consumed after the lead byte.
*/
UChar32
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
// *pi is one after byte c.
int32_t i=*pi;
// length can be negative for NUL-terminated strings: Read and validate one byte at a time.
// In the trail-byte tests below, the value s[i]-0x80 is stored into a uint8_t,
// so it wraps for bytes below 0x80 and the <=0x3f comparison accepts only
// 0x80..0xbf. A NUL byte therefore fails those tests and ends the sequence.
if(i==length || c>0xf4) {
// end of string, or not a lead byte
} else if(c>=0xf0) {
// Test for 4-byte sequences first because
// U8_NEXT() handles shorter valid sequences inline.
uint8_t t1=s[i], t2, t3;
// Keep only the low three bits of the lead byte.
c&=7;
// Each ++i runs only if the preceding test passed, so on failure i stops at
// the first byte that was not accepted.
if(CBU8_IS_VALID_LEAD4_AND_T1(c, t1) &&
++i!=length && (t2=s[i]-0x80)<=0x3f &&
++i!=length && (t3=s[i]-0x80)<=0x3f) {
++i;
c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
// strict: forbid non-characters like U+fffe
if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
*pi=i;
return c;
}
}
} else if(c>=0xe0) {
// Keep only the low four bits of the lead byte.
c&=0xf;
if(strict!=-2) {
uint8_t t1=s[i], t2;
if(CBU8_IS_VALID_LEAD3_AND_T1(c, t1) &&
++i!=length && (t2=s[i]-0x80)<=0x3f) {
++i;
c=(c<<12)|((t1&0x3f)<<6)|t2;
// strict: forbid non-characters like U+fffe
if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
*pi=i;
return c;
}
}
} else {
// strict=-2 -> lenient: allow surrogates
uint8_t t1=s[i]-0x80, t2;
// (c>0 || t1>=0x20) rejects the combinations that would produce a value
// below 0x800: with c==0 the first trail byte must be at least 0xa0.
if(t1<=0x3f && (c>0 || t1>=0x20) &&
++i!=length && (t2=s[i]-0x80)<=0x3f) {
*pi=i+1;
return (c<<12)|(t1<<6)|t2;
}
}
} else if(c>=0xc2) {
// Two-byte sequence: one trail byte completes the code point.
uint8_t t1=s[i]-0x80;
if(t1<=0x3f) {
*pi=i+1;
return ((c-0xc0)<<6)|t1;
}
} // else 0x80<=c<0xc2 is not a lead byte
/* error handling */
// i-*pi is how many bytes after the lead byte were consumed before failing.
c=errorValue(i-*pi, strict);
*pi=i;
return c;
}
} // namespace base_icu
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment