Commit cff00e53 authored by Hidehiko Abe, committed by Commit Bot

Use base/third_party/icu for char_iterator.

This CL replaces third_party/icu uses with base/third_party/icu.
Along with the change, base/third_party/icu/* is updated to v67.
- Extracted several macros used for char_iterator implementation.
- Removed utf8_nextCharSafeBody, as it is no longer used.

This CL is preparation to move char_iterator to base/strings
so that it can be used in new functions in base/.

Bug: 2492481
Test: Ran base_unittests.
Change-Id: I779c7384b15701dcac3feac9d258be5793ca209d
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2494402
Commit-Queue: Hidehiko Abe <hidehiko@chromium.org>
Reviewed-by: Daniel Cheng <dcheng@chromium.org>
Cr-Commit-Position: refs/heads/master@{#821661}
parent 782db57a
...@@ -702,7 +702,6 @@ component("base") { ...@@ -702,7 +702,6 @@ component("base") {
"third_party/cityhash/city.h", "third_party/cityhash/city.h",
"third_party/cityhash_v103/src/city_v103.cc", "third_party/cityhash_v103/src/city_v103.cc",
"third_party/cityhash_v103/src/city_v103.h", "third_party/cityhash_v103/src/city_v103.h",
"third_party/icu/icu_utf.cc",
"third_party/icu/icu_utf.h", "third_party/icu/icu_utf.h",
"third_party/nspr/prtime.cc", "third_party/nspr/prtime.cc",
"third_party/nspr/prtime.h", "third_party/nspr/prtime.h",
......
...@@ -5,8 +5,7 @@ ...@@ -5,8 +5,7 @@
#include "base/i18n/char_iterator.h" #include "base/i18n/char_iterator.h"
#include "base/check_op.h" #include "base/check_op.h"
#include "third_party/icu/source/common/unicode/utf16.h" #include "base/third_party/icu/icu_utf.h"
#include "third_party/icu/source/common/unicode/utf8.h"
namespace base { namespace base {
namespace i18n { namespace i18n {
...@@ -16,7 +15,7 @@ namespace i18n { ...@@ -16,7 +15,7 @@ namespace i18n {
UTF8CharIterator::UTF8CharIterator(base::StringPiece str) UTF8CharIterator::UTF8CharIterator(base::StringPiece str)
: str_(str), array_pos_(0), next_pos_(0), char_pos_(0), char_(0) { : str_(str), array_pos_(0), next_pos_(0), char_pos_(0), char_(0) {
if (!str_.empty()) if (!str_.empty())
U8_NEXT(str_.data(), next_pos_, str_.length(), char_); CBU8_NEXT(str_.data(), next_pos_, str_.length(), char_);
} }
UTF8CharIterator::~UTF8CharIterator() = default; UTF8CharIterator::~UTF8CharIterator() = default;
...@@ -28,7 +27,7 @@ bool UTF8CharIterator::Advance() { ...@@ -28,7 +27,7 @@ bool UTF8CharIterator::Advance() {
array_pos_ = next_pos_; array_pos_ = next_pos_;
char_pos_++; char_pos_++;
if (next_pos_ < str_.length()) if (next_pos_ < str_.length())
U8_NEXT(str_.data(), next_pos_, str_.length(), char_); CBU8_NEXT(str_.data(), next_pos_, str_.length(), char_);
return true; return true;
} }
...@@ -49,7 +48,7 @@ UTF16CharIterator& UTF16CharIterator::operator=(UTF16CharIterator&& to_move) = ...@@ -49,7 +48,7 @@ UTF16CharIterator& UTF16CharIterator::operator=(UTF16CharIterator&& to_move) =
UTF16CharIterator UTF16CharIterator::LowerBound(StringPiece16 str, UTF16CharIterator UTF16CharIterator::LowerBound(StringPiece16 str,
size_t array_index) { size_t array_index) {
DCHECK_LE(array_index, str.length()); DCHECK_LE(array_index, str.length());
U16_SET_CP_START(str.data(), 0, array_index); CBU16_SET_CP_START(str.data(), 0, array_index);
return UTF16CharIterator(str, array_index); return UTF16CharIterator(str, array_index);
} }
...@@ -57,7 +56,7 @@ UTF16CharIterator UTF16CharIterator::LowerBound(StringPiece16 str, ...@@ -57,7 +56,7 @@ UTF16CharIterator UTF16CharIterator::LowerBound(StringPiece16 str,
UTF16CharIterator UTF16CharIterator::UpperBound(StringPiece16 str, UTF16CharIterator UTF16CharIterator::UpperBound(StringPiece16 str,
size_t array_index) { size_t array_index) {
DCHECK_LE(array_index, str.length()); DCHECK_LE(array_index, str.length());
U16_SET_CP_LIMIT(str.data(), 0, array_index, str.length()); CBU16_SET_CP_LIMIT(str.data(), 0, array_index, str.length());
return UTF16CharIterator(str, array_index); return UTF16CharIterator(str, array_index);
} }
...@@ -65,8 +64,8 @@ int32_t UTF16CharIterator::NextCodePoint() const { ...@@ -65,8 +64,8 @@ int32_t UTF16CharIterator::NextCodePoint() const {
if (next_pos_ >= str_.length()) if (next_pos_ >= str_.length())
return 0; return 0;
UChar32 c; base_icu::UChar32 c;
U16_GET(str_.data(), 0, next_pos_, str_.length(), c); CBU16_GET(str_.data(), 0, next_pos_, str_.length(), c);
return c; return c;
} }
...@@ -75,8 +74,8 @@ int32_t UTF16CharIterator::PreviousCodePoint() const { ...@@ -75,8 +74,8 @@ int32_t UTF16CharIterator::PreviousCodePoint() const {
return 0; return 0;
uint32_t pos = array_pos_; uint32_t pos = array_pos_;
UChar32 c; base_icu::UChar32 c;
U16_PREV(str_.data(), 0, pos, c); CBU16_PREV(str_.data(), 0, pos, c);
return c; return c;
} }
...@@ -98,7 +97,7 @@ bool UTF16CharIterator::Rewind() { ...@@ -98,7 +97,7 @@ bool UTF16CharIterator::Rewind() {
next_pos_ = array_pos_; next_pos_ = array_pos_;
char_offset_--; char_offset_--;
U16_PREV(str_.data(), 0, array_pos_, char_); CBU16_PREV(str_.data(), 0, array_pos_, char_);
return true; return true;
} }
...@@ -115,7 +114,7 @@ UTF16CharIterator::UTF16CharIterator(StringPiece16 str, size_t initial_pos) ...@@ -115,7 +114,7 @@ UTF16CharIterator::UTF16CharIterator(StringPiece16 str, size_t initial_pos)
void UTF16CharIterator::ReadChar() { void UTF16CharIterator::ReadChar() {
// This is actually a huge macro, so is worth having in a separate function. // This is actually a huge macro, so is worth having in a separate function.
U16_NEXT(str_.data(), next_pos_, str_.length(), char_); CBU16_NEXT(str_.data(), next_pos_, str_.length(), char_);
} }
} // namespace i18n } // namespace i18n
......
...@@ -6,12 +6,12 @@ License File: NOT_SHIPPED ...@@ -6,12 +6,12 @@ License File: NOT_SHIPPED
This file has the relevant components from ICU copied to handle basic UTF8/16/32 This file has the relevant components from ICU copied to handle basic UTF8/16/32
conversions. Components are copied from umachine.h, utf.h, utf8.h, and utf16.h conversions. Components are copied from umachine.h, utf.h, utf8.h, and utf16.h
into icu_utf.h, and from utf_impl.cpp into icu_utf.cc. into icu_utf.h.
The main change is that U_/U8_/U16_ prefixes have been replaced with The main change is that U_/U8_/U16_/UPRV_ prefixes have been replaced with
CBU_/CBU8_/CBU16_ (for "Chrome Base") to avoid confusion with the "real" ICU CBU_/CBU8_/CBU16_/CBUPRV_ (for "Chrome Base") to avoid confusion with the "real"
macros should ICU be in use on the system. For the same reason, the functions ICU macros should ICU be in use on the system. For the same reason, the
and types have been put in the "base_icu" namespace. functions and types have been put in the "base_icu" namespace.
Note that this license file is marked as NOT_SHIPPED, since a more complete Note that this license file is marked as NOT_SHIPPED, since a more complete
ICU license is included from //third_party/icu/README.chromium ICU license is included from //third_party/icu/README.chromium
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1999-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: utf_impl.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep13
* created by: Markus W. Scherer
*
* This file provides implementation functions for macros in the utfXX.h
* that would otherwise be too long as macros.
*/
#include "base/third_party/icu/icu_utf.h"
namespace base_icu {
// source/common/utf_impl.cpp
// Substitute code points handed back for ill-formed input in the obsolete
// non-negative "strict" modes. errorValue() indexes this table with the count
// it is given. Only the first four slots are spelled out; the remaining two
// are zero-initialized.
static const UChar32 utf8_errorValue[6] = {
    // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
    // but without relying on the obsolete unicode/utf_old.h.
    0x15, 0x9f, 0xffff,
    0x10ffff
};

// Chooses the value reported for an ill-formed sequence:
//   strict == -3      -> U+FFFD
//   any other strict<0 -> CBU_SENTINEL
//   strict >= 0       -> utf8_errorValue[count]
static UChar32 errorValue(int32_t count, int8_t strict) {
  if (strict == -3) {
    return 0xfffd;
  }
  if (strict < 0) {
    return CBU_SENTINEL;
  }
  return utf8_errorValue[count];
}
/*
* Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
* and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
*
* U8_NEXT() supports NUL-terminated strings indicated via length<0.
*
* The "strict" parameter controls the error behavior:
* <0 "Safe" behavior of U8_NEXT():
* -1: All illegal byte sequences yield U_SENTINEL=-1.
* -2: Same as -1, except for lenient treatment of surrogate code points as legal.
* Some implementations use this for roundtripping of
* Unicode 16-bit strings that are not well-formed UTF-16, that is, they
* contain unpaired surrogates.
* -3: All illegal byte sequences yield U+FFFD.
* 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
* All illegal byte sequences yield a positive code point such that this
* result code point would be encoded with the same number of bytes as
* the illegal sequence.
* >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
* Same as the obsolete "safe" behavior, but non-characters are also treated
* like illegal sequences.
*
* Note that a UBool is the same as an int8_t.
*
* Parameters:
* s       Byte string being decoded.
* pi      In/out offset into s. On entry it is one past the lead byte c.
*         On success it is moved past the last trail byte of the sequence.
*         On failure it is left at the first byte that was not accepted as
*         part of the sequence (or past the whole sequence when a well-formed
*         non-character is rejected because strict>0).
* length  Offset at which reading must stop; a negative value never matches
*         the running offset, so bytes are read and validated one at a time.
* c       The lead byte that the caller has already read.
* strict  Error-handling mode, see the table above.
*
* Returns the decoded code point, or the result of errorValue() for the
* number of trail bytes consumed and the given strict mode.
*/
UChar32
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
// *pi is one after byte c.
int32_t i=*pi;
// length can be negative for NUL-terminated strings: Read and validate one byte at a time.
if(i==length || c>0xf4) {
// end of string, or not a lead byte
// Falls through to the error handling below with no trail byte consumed.
} else if(c>=0xf0) {
// Test for 4-byte sequences first because
// U8_NEXT() handles shorter valid sequences inline.
uint8_t t1=s[i], t2, t3;
// Keep only the three payload bits of the 4-byte lead.
c&=7;
// Each "++i!=length" steps past the trail byte just accepted and stops
// before reading at the limit; the order of these tests must not change.
if(CBU8_IS_VALID_LEAD4_AND_T1(c, t1) &&
++i!=length && (t2=s[i]-0x80)<=0x3f &&
++i!=length && (t3=s[i]-0x80)<=0x3f) {
++i;
c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
// strict: forbid non-characters like U+fffe
if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
*pi=i;
return c;
}
}
} else if(c>=0xe0) {
// Keep only the four payload bits of the 3-byte lead.
c&=0xf;
if(strict!=-2) {
uint8_t t1=s[i], t2;
if(CBU8_IS_VALID_LEAD3_AND_T1(c, t1) &&
++i!=length && (t2=s[i]-0x80)<=0x3f) {
++i;
c=(c<<12)|((t1&0x3f)<<6)|t2;
// strict: forbid non-characters like U+fffe
if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
*pi=i;
return c;
}
}
} else {
// strict=-2 -> lenient: allow surrogates
uint8_t t1=s[i]-0x80, t2;
// With a zero lead payload (c==0) the first trail payload must be
// at least 0x20 for the sequence to be accepted.
if(t1<=0x3f && (c>0 || t1>=0x20) &&
++i!=length && (t2=s[i]-0x80)<=0x3f) {
*pi=i+1;
return (c<<12)|(t1<<6)|t2;
}
}
} else if(c>=0xc2) {
// 2-byte sequence: a single trail byte follows the lead.
uint8_t t1=s[i]-0x80;
if(t1<=0x3f) {
*pi=i+1;
return ((c-0xc0)<<6)|t1;
}
} // else 0x80<=c<0xc2 is not a lead byte
/* error handling */
// i-*pi is the number of trail bytes that were consumed before the failure.
c=errorValue(i-*pi, strict);
*pi=i;
return c;
}
} // namespace base_icu
...@@ -60,6 +60,25 @@ typedef int32_t UChar32; ...@@ -60,6 +60,25 @@ typedef int32_t UChar32;
*/ */
#define CBU_SENTINEL (-1) #define CBU_SENTINEL (-1)
/**
* \def UPRV_BLOCK_MACRO_BEGIN
* Defined as the "do" keyword by default.
* @internal
*/
#ifndef CBUPRV_BLOCK_MACRO_BEGIN
#define CBUPRV_BLOCK_MACRO_BEGIN do
#endif
/**
* \def UPRV_BLOCK_MACRO_END
* Defined as "while (0)" by default.
* @internal
*/
#ifndef CBUPRV_BLOCK_MACRO_END
#define CBUPRV_BLOCK_MACRO_END while (0)
#endif
// source/common/unicode/utf.h // source/common/unicode/utf.h
/** /**
...@@ -146,21 +165,6 @@ typedef int32_t UChar32; ...@@ -146,21 +165,6 @@ typedef int32_t UChar32;
*/ */
#define CBU8_IS_VALID_LEAD4_AND_T1(lead, t1) (CBU8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7))) #define CBU8_IS_VALID_LEAD4_AND_T1(lead, t1) (CBU8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
/**
* Function for handling "next code point" with error-checking.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
* file and thus must remain stable, and should not be hidden when other internal
* functions are hidden (otherwise public macros would fail to compile).
* @internal
*/
UChar32
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu::UChar32 c, ::base_icu::UBool strict);
/** /**
* Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
* @param c 8-bit code unit (byte) * @param c 8-bit code unit (byte)
...@@ -230,29 +234,36 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu: ...@@ -230,29 +234,36 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu:
* @see U8_NEXT_UNSAFE * @see U8_NEXT_UNSAFE
* @stable ICU 2.4 * @stable ICU 2.4
*/ */
#define CBU8_NEXT(s, i, length, c) { \ #define CBU8_NEXT(s, i, length, c) CBU8_INTERNAL_NEXT_OR_SUB(s, i, length, c, CBU_SENTINEL)
/** @internal */
#define CBU8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) CBUPRV_BLOCK_MACRO_BEGIN { \
(c)=(uint8_t)(s)[(i)++]; \ (c)=(uint8_t)(s)[(i)++]; \
if(!CBU8_IS_SINGLE(c)) { \ if(!CBU8_IS_SINGLE(c)) { \
uint8_t __t1, __t2; \ uint8_t __t = 0; \
if( /* handle U+0800..U+FFFF inline */ \ if((i)!=(length) && \
(0xe0<=(c) && (c)<0xf0) && \ /* fetch/validate/assemble all but last trail byte */ \
(((i)+1)<(length) || (length)<0) && \ ((c)>=0xe0 ? \
CBU8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \ ((c)<0xf0 ? /* U+0800..U+FFFF except surrogates */ \
(__t2=(s)[(i)+1]-0x80)<=0x3f) { \ CBU8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \
(c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \ (__t&=0x3f, 1) \
(i)+=2; \ : /* U+10000..U+10FFFF */ \
} else if( /* handle U+0080..U+07FF inline */ \ ((c)-=0xf0)<=4 && \
((c)<0xe0 && (c)>=0xc2) && \ CBU8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \
((i)!=(length)) && \ ((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \
(__t1=(s)[i]-0x80)<=0x3f) { \ (__t=(s)[i]-0x80)<=0x3f) && \
(c)=(((c)&0x1f)<<6)|__t1; \ /* valid second-to-last trail byte */ \
++(i); \ ((c)=((c)<<6)|__t, ++(i)!=(length)) \
: /* U+0080..U+07FF */ \
(c)>=0xc2 && ((c)&=0x1f, 1)) && \
/* last trail byte */ \
(__t=(s)[i]-0x80)<=0x3f && \
((c)=((c)<<6)|__t, ++(i), 1)) { \
} else { \ } else { \
/* function call for "complicated" and error cases */ \ (c)=(sub); /* ill-formed*/ \
(c)=::base_icu::utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1); \
} \ } \
} \ } \
} } CBUPRV_BLOCK_MACRO_END
/** /**
* Append a code point to a string, overwriting 1 to 4 bytes. * Append a code point to a string, overwriting 1 to 4 bytes.
...@@ -267,24 +278,25 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu: ...@@ -267,24 +278,25 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu:
* @see U8_APPEND * @see U8_APPEND
* @stable ICU 2.4 * @stable ICU 2.4
*/ */
#define CBU8_APPEND_UNSAFE(s, i, c) { \ #define CBU8_APPEND_UNSAFE(s, i, c) CBUPRV_BLOCK_MACRO_BEGIN { \
if((uint32_t)(c)<=0x7f) { \ uint32_t __uc=(c); \
(s)[(i)++]=(uint8_t)(c); \ if(__uc<=0x7f) { \
(s)[(i)++]=(uint8_t)__uc; \
} else { \ } else { \
if((uint32_t)(c)<=0x7ff) { \ if(__uc<=0x7ff) { \
(s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
} else { \ } else { \
if((uint32_t)(c)<=0xffff) { \ if(__uc<=0xffff) { \
(s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
} else { \ } else { \
(s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \ (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
(s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \ (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
} \ } \
(s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
} \ } \
(s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
} \ } \
} } CBUPRV_BLOCK_MACRO_END
// source/common/unicode/utf16.h // source/common/unicode/utf16.h
...@@ -383,6 +395,45 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu: ...@@ -383,6 +395,45 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu:
*/ */
#define CBU16_MAX_LENGTH 2 #define CBU16_MAX_LENGTH 2
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The offset may point to either the lead or trail surrogate unit
* for a supplementary code point, in which case the macro will read
* the adjacent matching surrogate as well.
*
* The length can be negative for a NUL-terminated string.
*
* If the offset points to a single, unpaired surrogate, then
* c is set to that unpaired surrogate.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<=i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_GET_UNSAFE
* @stable ICU 2.4
*/
#define CBU16_GET(s, start, i, length, c) CBUPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[i]; \
if(CBU16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(CBU16_IS_SURROGATE_LEAD(c)) { \
if((i)+1!=(length) && CBU16_IS_TRAIL(__c2=(s)[(i)+1])) { \
(c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \
} \
} else { \
if((i)>(start) && CBU16_IS_LEAD(__c2=(s)[(i)-1])) { \
(c)=CBU16_GET_SUPPLEMENTARY(__c2, (c)); \
} \
} \
} \
} CBUPRV_BLOCK_MACRO_END
/** /**
* Get a code point from a string at a code point boundary offset, * Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary. * and advance the offset to the next code point boundary.
...@@ -404,7 +455,7 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu: ...@@ -404,7 +455,7 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu:
* @see U16_NEXT_UNSAFE * @see U16_NEXT_UNSAFE
* @stable ICU 2.4 * @stable ICU 2.4
*/ */
#define CBU16_NEXT(s, i, length, c) { \ #define CBU16_NEXT(s, i, length, c) CBUPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[(i)++]; \ (c)=(s)[(i)++]; \
if(CBU16_IS_LEAD(c)) { \ if(CBU16_IS_LEAD(c)) { \
uint16_t __c2; \ uint16_t __c2; \
...@@ -413,7 +464,7 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu: ...@@ -413,7 +464,7 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu:
(c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \ (c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \
} \ } \
} \ } \
} } CBUPRV_BLOCK_MACRO_END
/** /**
* Append a code point to a string, overwriting 1 or 2 code units. * Append a code point to a string, overwriting 1 or 2 code units.
...@@ -428,14 +479,88 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu: ...@@ -428,14 +479,88 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu:
* @see U16_APPEND * @see U16_APPEND
* @stable ICU 2.4 * @stable ICU 2.4
*/ */
#define CBU16_APPEND_UNSAFE(s, i, c) { \ #define CBU16_APPEND_UNSAFE(s, i, c) CBUPRV_BLOCK_MACRO_BEGIN { \
if((uint32_t)(c)<=0xffff) { \ if((uint32_t)(c)<=0xffff) { \
(s)[(i)++]=(uint16_t)(c); \ (s)[(i)++]=(uint16_t)(c); \
} else { \ } else { \
(s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \ (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
(s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
} \ } \
} } CBUPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary
* at the start of a code point.
* If the offset points to the trail surrogate of a surrogate pair,
* then the offset is decremented.
* Otherwise, it is not modified.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<=i
* @see U16_SET_CP_START_UNSAFE
* @stable ICU 2.4
*/
#define CBU16_SET_CP_START(s, start, i) CBUPRV_BLOCK_MACRO_BEGIN { \
if(CBU16_IS_TRAIL((s)[i]) && (i)>(start) && CBU16_IS_LEAD((s)[(i)-1])) { \
--(i); \
} \
} CBUPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The input offset may be the same as the string length.
* If the offset is behind a trail surrogate unit
* for a supplementary code point, then the macro will read
* the preceding lead surrogate as well.
* If the offset is behind a lead surrogate or behind a single, unpaired
* trail surrogate, then c is set to that unpaired surrogate.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<i
* @param c output UChar32 variable
* @see U16_PREV_UNSAFE
* @stable ICU 2.4
*/
#define CBU16_PREV(s, start, i, c) CBUPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[--(i)]; \
if(CBU16_IS_TRAIL(c)) { \
uint16_t __c2; \
if((i)>(start) && CBU16_IS_LEAD(__c2=(s)[(i)-1])) { \
--(i); \
(c)=CBU16_GET_SUPPLEMENTARY(__c2, (c)); \
} \
} \
} CBUPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind the lead surrogate of a surrogate pair,
* then the offset is incremented.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const UChar * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, start<=i<=length
* @param length int32_t string length
* @see U16_SET_CP_LIMIT_UNSAFE
* @stable ICU 2.4
*/
#define CBU16_SET_CP_LIMIT(s, start, i, length) CBUPRV_BLOCK_MACRO_BEGIN { \
if((start)<(i) && ((i)<(length) || (length)<0) && CBU16_IS_LEAD((s)[(i)-1]) && CBU16_IS_TRAIL((s)[i])) { \
++(i); \
} \
} CBUPRV_BLOCK_MACRO_END
} // namespace base_icu } // namespace base_icu
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment