Use base/third_party/icu for char_iterator.

This CL replaces third_party/icu uses with base/third_party/icu. Along with the change, base/third_party/icu/* is updated to v67. - Extracted several macros used for char_iterator implementation. - Removed utf8_nextCharSafeBody, as it is no longer used. This CL is preparation to move char_iterator to base/strings so that it can be used in new functions in base/. Bug: 2492481 Test: Ran base_unittests. Change-Id: I779c7384b15701dcac3feac9d258be5793ca209d Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2494402 Commit-Queue: Hidehiko Abe <hidehiko@chromium.org> Reviewed-by: Daniel Cheng <dcheng@chromium.org> Cr-Commit-Position: refs/heads/master@{#821661}

Use base/third_party/icu for char_iterator.
This CL replaces third_party/icu uses with base/third_party/icu. Along with the change, base/third_party/icu/* is updated to v67. - Extracted several macros used for char_iterator implementation. - Removed utf8_nextCharSafeBody, as it is no longer used. This CL is preparation to move char_iterator to base/strings so that it can be used in new functions in base/. Bug: 2492481 Test: Ran base_unittests. Change-Id: I779c7384b15701dcac3feac9d258be5793ca209d Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2494402 Commit-Queue: Hidehiko Abe <hidehiko@chromium.org> Reviewed-by: Daniel Cheng <dcheng@chromium.org> Cr-Commit-Position: refs/heads/master@{#821661}
cff00e53 · Hidehiko Abe · Commit Bot · 782db57a · cff00e53 · cff00e53
Commit cff00e53 authored Oct 28, 2020 by Hidehiko Abe Committed by Commit Bot Oct 28, 2020
5 changed files
--- a/base/BUILD.gn
+++ b/base/BUILD.gn
@@ -702,7 +702,6 @@ component("base") {
    "third_party/cityhash/city.h",
    "third_party/cityhash_v103/src/city_v103.cc",
    "third_party/cityhash_v103/src/city_v103.h",
-    "third_party/icu/icu_utf.cc",
    "third_party/icu/icu_utf.h",
    "third_party/nspr/prtime.cc",
    "third_party/nspr/prtime.h",

--- a/base/i18n/char_iterator.cc
+++ b/base/i18n/char_iterator.cc
@@ -5,8 +5,7 @@
 #include "base/i18n/char_iterator.h"
 #include "base/check_op.h"
-#include "third_party/icu/source/common/unicode/utf16.h"
+#include "base/third_party/icu/icu_utf.h"
-#include "third_party/icu/source/common/unicode/utf8.h"
 namespace base {
 namespace i18n {
@@ -16,7 +15,7 @@ namespace i18n {
 UTF8CharIterator::UTF8CharIterator(base::StringPiece str)
    : str_(str), array_pos_(0), next_pos_(0), char_pos_(0), char_(0) {
  if (!str_.empty())
-    U8_NEXT(str_.data(), next_pos_, str_.length(), char_);
+    CBU8_NEXT(str_.data(), next_pos_, str_.length(), char_);
 }
 UTF8CharIterator::~UTF8CharIterator() = default;
@@ -28,7 +27,7 @@ bool UTF8CharIterator::Advance() {
  array_pos_ = next_pos_;
  char_pos_++;
  if (next_pos_ < str_.length())
-    U8_NEXT(str_.data(), next_pos_, str_.length(), char_);
+    CBU8_NEXT(str_.data(), next_pos_, str_.length(), char_);
  return true;
 }
@@ -49,7 +48,7 @@ UTF16CharIterator& UTF16CharIterator::operator=(UTF16CharIterator&& to_move) =
 UTF16CharIterator UTF16CharIterator::LowerBound(StringPiece16 str,
                                                size_t array_index) {
  DCHECK_LE(array_index, str.length());
-  U16_SET_CP_START(str.data(), 0, array_index);
+  CBU16_SET_CP_START(str.data(), 0, array_index);
  return UTF16CharIterator(str, array_index);
 }
@@ -57,7 +56,7 @@ UTF16CharIterator UTF16CharIterator::LowerBound(StringPiece16 str,
 UTF16CharIterator UTF16CharIterator::UpperBound(StringPiece16 str,
                                                size_t array_index) {
  DCHECK_LE(array_index, str.length());
-  U16_SET_CP_LIMIT(str.data(), 0, array_index, str.length());
+  CBU16_SET_CP_LIMIT(str.data(), 0, array_index, str.length());
  return UTF16CharIterator(str, array_index);
 }
@@ -65,8 +64,8 @@ int32_t UTF16CharIterator::NextCodePoint() const {
  if (next_pos_ >= str_.length())
    return 0;
-  UChar32 c;
+  base_icu::UChar32 c;
-  U16_GET(str_.data(), 0, next_pos_, str_.length(), c);
+  CBU16_GET(str_.data(), 0, next_pos_, str_.length(), c);
  return c;
 }
@@ -75,8 +74,8 @@ int32_t UTF16CharIterator::PreviousCodePoint() const {
    return 0;
  uint32_t pos = array_pos_;
-  UChar32 c;
+  base_icu::UChar32 c;
-  U16_PREV(str_.data(), 0, pos, c);
+  CBU16_PREV(str_.data(), 0, pos, c);
  return c;
 }
@@ -98,7 +97,7 @@ bool UTF16CharIterator::Rewind() {
  next_pos_ = array_pos_;
  char_offset_--;
-  U16_PREV(str_.data(), 0, array_pos_, char_);
+  CBU16_PREV(str_.data(), 0, array_pos_, char_);
  return true;
 }
@@ -115,7 +114,7 @@ UTF16CharIterator::UTF16CharIterator(StringPiece16 str, size_t initial_pos)
 void UTF16CharIterator::ReadChar() {
  // This is actually a huge macro, so is worth having in a separate function.
-  U16_NEXT(str_.data(), next_pos_, str_.length(), char_);
+  CBU16_NEXT(str_.data(), next_pos_, str_.length(), char_);
 }
 }  // namespace i18n

--- a/base/third_party/icu/README.chromium
+++ b/base/third_party/icu/README.chromium
@@ -6,12 +6,12 @@ License File: NOT_SHIPPED
 This file has the relevant components from ICU copied to handle basic UTF8/16/32
 conversions. Components are copied from umachine.h, utf.h, utf8.h, and utf16.h
-into icu_utf.h, and from utf_impl.cpp into icu_utf.cc.
+into icu_utf.h.
-The main change is that U_/U8_/U16_ prefixes have been replaced with
+The main change is that U_/U8_/U16_/UPRV_ prefixes have been replaced with
-CBU_/CBU8_/CBU16_ (for "Chrome Base") to avoid confusion with the "real" ICU
+CBU_/CBU8_/CBU16_/CBUPRV_ (for "Chrome Base") to avoid confusion with the "real"
-macros should ICU be in use on the system. For the same reason, the functions
+ICU macros should ICU be in use on the system. For the same reason, the
-and types have been put in the "base_icu" namespace.
+functions and types have been put in the "base_icu" namespace.
 Note that this license file is marked as NOT_SHIPPED, since a more complete
 ICU license is included from //third_party/icu/README.chromium
--- a/base/third_party/icu/icu_utf.cc
+++ b/base/third_party/icu/icu_utf.cc
-// © 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html
-/*
-******************************************************************************
-*
-*   Copyright (C) 1999-2012, International Business Machines
-*   Corporation and others.  All Rights Reserved.
-*
-******************************************************************************
-*   file name:  utf_impl.cpp
-*   encoding:   UTF-8
-*   tab size:   8 (not used)
-*   indentation:4
-*
-*   created on: 1999sep13
-*   created by: Markus W. Scherer
-*
-*   This file provides implementation functions for macros in the utfXX.h
-*   that would otherwise be too long as macros.
-*/
-#include "base/third_party/icu/icu_utf.h"
-namespace base_icu {
-// source/common/utf_impl.cpp
-static const UChar32
-utf8_errorValue[6]={
-    // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
-    // but without relying on the obsolete unicode/utf_old.h.
-    0x15, 0x9f, 0xffff,
-    0x10ffff
-};
-static UChar32
-errorValue(int32_t count, int8_t strict) {
-    if(strict>=0) {
-        return utf8_errorValue[count];
-    } else if(strict==-3) {
-        return 0xfffd;
-    } else {
-        return CBU_SENTINEL;
-    }
-}
-/*
- * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
- * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
- *
- * U8_NEXT() supports NUL-terminated strings indicated via length<0.
- *
- * The "strict" parameter controls the error behavior:
- * <0  "Safe" behavior of U8_NEXT():
- *     -1: All illegal byte sequences yield U_SENTINEL=-1.
- *     -2: Same as -1, except for lenient treatment of surrogate code points as legal.
- *         Some implementations use this for roundtripping of
- *         Unicode 16-bit strings that are not well-formed UTF-16, that is, they
- *         contain unpaired surrogates.
- *     -3: All illegal byte sequences yield U+FFFD.
- *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
- *     All illegal byte sequences yield a positive code point such that this
- *     result code point would be encoded with the same number of bytes as
- *     the illegal sequence.
- * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
- *     Same as the obsolete "safe" behavior, but non-characters are also treated
- *     like illegal sequences.
- *
- * Note that a UBool is the same as an int8_t.
- */
-UChar32
-utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
-    // *pi is one after byte c.
-    int32_t i=*pi;
-    // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
-    if(i==length || c>0xf4) {
-        // end of string, or not a lead byte
-    } else if(c>=0xf0) {
-        // Test for 4-byte sequences first because
-        // U8_NEXT() handles shorter valid sequences inline.
-        uint8_t t1=s[i], t2, t3;
-        c&=7;
-        if(CBU8_IS_VALID_LEAD4_AND_T1(c, t1) &&
-                ++i!=length && (t2=s[i]-0x80)<=0x3f &&
-                ++i!=length && (t3=s[i]-0x80)<=0x3f) {
-            ++i;
-            c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
-            // strict: forbid non-characters like U+fffe
-            if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
-                *pi=i;
-                return c;
-            }
-        }
-    } else if(c>=0xe0) {
-        c&=0xf;
-        if(strict!=-2) {
-            uint8_t t1=s[i], t2;
-            if(CBU8_IS_VALID_LEAD3_AND_T1(c, t1) &&
-                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
-                ++i;
-                c=(c<<12)|((t1&0x3f)<<6)|t2;
-                // strict: forbid non-characters like U+fffe
-                if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
-                    *pi=i;
-                    return c;
-                }
-            }
-        } else {
-            // strict=-2 -> lenient: allow surrogates
-            uint8_t t1=s[i]-0x80, t2;
-            if(t1<=0x3f && (c>0 || t1>=0x20) &&
-                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
-                *pi=i+1;
-                return (c<<12)|(t1<<6)|t2;
-            }
-        }
-    } else if(c>=0xc2) {
-        uint8_t t1=s[i]-0x80;
-        if(t1<=0x3f) {
-            *pi=i+1;
-            return ((c-0xc0)<<6)|t1;
-        }
-    }  // else 0x80<=c<0xc2 is not a lead byte
-    /* error handling */
-    c=errorValue(i-*pi, strict);
-    *pi=i;
-    return c;
-}
-}  // namespace base_icu
--- a/base/third_party/icu/icu_utf.h
+++ b/base/third_party/icu/icu_utf.h
@@ -60,6 +60,25 @@ typedef int32_t UChar32;
 */
 #define CBU_SENTINEL (-1)
+/**
+ * \def CBUPRV_BLOCK_MACRO_BEGIN
+ * Defined as the "do" keyword by default.
+ * @internal
+ */
+#ifndef CBUPRV_BLOCK_MACRO_BEGIN
+#define CBUPRV_BLOCK_MACRO_BEGIN do
+#endif
+/**
+ * \def CBUPRV_BLOCK_MACRO_END
+ * Defined as "while (0)" by default.
+ * @internal
+ */
+#ifndef CBUPRV_BLOCK_MACRO_END
+#define CBUPRV_BLOCK_MACRO_END while (0)
+#endif
 // source/common/unicode/utf.h
 /**
@@ -146,21 +165,6 @@ typedef int32_t UChar32;
 */
 #define CBU8_IS_VALID_LEAD4_AND_T1(lead, t1) (CBU8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
-/**
- * Function for handling "next code point" with error-checking.
- *
- * This is internal since it is not meant to be called directly by external clients;
- * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
- * file and thus must remain stable, and should not be hidden when other internal
- * functions are hidden (otherwise public macros would fail to compile).
- * @internal
- */
-UChar32
-utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu::UChar32 c, ::base_icu::UBool strict);
 /**
 * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
 * @param c 8-bit code unit (byte)
@@ -230,29 +234,36 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu:
 * @see U8_NEXT_UNSAFE
 * @stable ICU 2.4
 */
-#define CBU8_NEXT(s, i, length, c) { \
+#define CBU8_NEXT(s, i, length, c) CBU8_INTERNAL_NEXT_OR_SUB(s, i, length, c, CBU_SENTINEL)
+/** @internal */
+#define CBU8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) CBUPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[(i)++]; \
    if(!CBU8_IS_SINGLE(c)) { \
-        uint8_t __t1, __t2; \
+        uint8_t __t = 0; \
-        if( /* handle U+0800..U+FFFF inline */ \
+        if((i)!=(length) && \
-                (0xe0<=(c) && (c)<0xf0) && \
+            /* fetch/validate/assemble all but last trail byte */ \
-                (((i)+1)<(length) || (length)<0) && \
+            ((c)>=0xe0 ? \
-                CBU8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
+                ((c)<0xf0 ?  /* U+0800..U+FFFF except surrogates */ \
-                (__t2=(s)[(i)+1]-0x80)<=0x3f) { \
+                    CBU8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \
-            (c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
+                    (__t&=0x3f, 1) \
-            (i)+=2; \
+                :  /* U+10000..U+10FFFF */ \
-        } else if( /* handle U+0080..U+07FF inline */ \
+                    ((c)-=0xf0)<=4 && \
-                ((c)<0xe0 && (c)>=0xc2) && \
+                    CBU8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \
-                ((i)!=(length)) && \
+                    ((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \
-                (__t1=(s)[i]-0x80)<=0x3f) { \
+                    (__t=(s)[i]-0x80)<=0x3f) && \
-            (c)=(((c)&0x1f)<<6)|__t1; \
+                /* valid second-to-last trail byte */ \
-            ++(i); \
+                ((c)=((c)<<6)|__t, ++(i)!=(length)) \
+            :  /* U+0080..U+07FF */ \
+                (c)>=0xc2 && ((c)&=0x1f, 1)) && \
+            /* last trail byte */ \
+            (__t=(s)[i]-0x80)<=0x3f && \
+            ((c)=((c)<<6)|__t, ++(i), 1)) { \
        } else { \
-            /* function call for "complicated" and error cases */ \
+            (c)=(sub);  /* ill-formed*/ \
-            (c)=::base_icu::utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1); \
        } \
    } \
-}
+} CBUPRV_BLOCK_MACRO_END
 /**
 * Append a code point to a string, overwriting 1 to 4 bytes.
@@ -267,24 +278,25 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu:
 * @see U8_APPEND
 * @stable ICU 2.4
 */
-#define CBU8_APPEND_UNSAFE(s, i, c) { \
+#define CBU8_APPEND_UNSAFE(s, i, c) CBUPRV_BLOCK_MACRO_BEGIN { \
-    if((uint32_t)(c)<=0x7f) { \
+    uint32_t __uc=(c); \
-        (s)[(i)++]=(uint8_t)(c); \
+    if(__uc<=0x7f) { \
+        (s)[(i)++]=(uint8_t)__uc; \
    } else { \
-        if((uint32_t)(c)<=0x7ff) { \
+        if(__uc<=0x7ff) { \
-            (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
+            (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
        } else { \
-            if((uint32_t)(c)<=0xffff) { \
+            if(__uc<=0xffff) { \
-                (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
+                (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
            } else { \
-                (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \
+                (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
-                (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \
+                (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
            } \
-            (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
+            (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
        } \
-        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
+        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } \
-}
+} CBUPRV_BLOCK_MACRO_END
 // source/common/unicode/utf16.h
@@ -383,6 +395,45 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu:
 */
 #define CBU16_MAX_LENGTH 2
+/**
+ * Get a code point from a string at a random-access offset,
+ * without changing the offset.
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The offset may point to either the lead or trail surrogate unit
+ * for a supplementary code point, in which case the macro will read
+ * the adjacent matching surrogate as well.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * If the offset points to a single, unpaired surrogate, then
+ * c is set to that unpaired surrogate.
+ * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
+ *
+ * @param s const UChar * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, must be start<=i<length
+ * @param length string length
+ * @param c output UChar32 variable
+ * @see U16_GET_UNSAFE
+ * @stable ICU 2.4
+ */
+#define CBU16_GET(s, start, i, length, c) CBUPRV_BLOCK_MACRO_BEGIN { \
+    (c)=(s)[i]; \
+    if(CBU16_IS_SURROGATE(c)) { \
+        uint16_t __c2; \
+        if(CBU16_IS_SURROGATE_LEAD(c)) { \
+            if((i)+1!=(length) && CBU16_IS_TRAIL(__c2=(s)[(i)+1])) { \
+                (c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \
+            } \
+        } else { \
+            if((i)>(start) && CBU16_IS_LEAD(__c2=(s)[(i)-1])) { \
+                (c)=CBU16_GET_SUPPLEMENTARY(__c2, (c)); \
+            } \
+        } \
+    } \
+} CBUPRV_BLOCK_MACRO_END
 /**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
@@ -404,7 +455,7 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu:
 * @see U16_NEXT_UNSAFE
 * @stable ICU 2.4
 */
-#define CBU16_NEXT(s, i, length, c) { \
+#define CBU16_NEXT(s, i, length, c) CBUPRV_BLOCK_MACRO_BEGIN { \
    (c)=(s)[(i)++]; \
    if(CBU16_IS_LEAD(c)) { \
        uint16_t __c2; \
@@ -413,7 +464,7 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu:
            (c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \
        } \
    } \
-}
+} CBUPRV_BLOCK_MACRO_END
 /**
 * Append a code point to a string, overwriting 1 or 2 code units.
@@ -428,14 +479,88 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu:
 * @see U16_APPEND
 * @stable ICU 2.4
 */
-#define CBU16_APPEND_UNSAFE(s, i, c) { \
+#define CBU16_APPEND_UNSAFE(s, i, c) CBUPRV_BLOCK_MACRO_BEGIN { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else { \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } \
-}
+} CBUPRV_BLOCK_MACRO_END
+/**
+ * Adjust a random-access offset to a code point boundary
+ * at the start of a code point.
+ * If the offset points to the trail surrogate of a surrogate pair,
+ * then the offset is decremented.
+ * Otherwise, it is not modified.
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * @param s const UChar * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, must be start<=i
+ * @see U16_SET_CP_START_UNSAFE
+ * @stable ICU 2.4
+ */
+#define CBU16_SET_CP_START(s, start, i) CBUPRV_BLOCK_MACRO_BEGIN { \
+    if(CBU16_IS_TRAIL((s)[i]) && (i)>(start) && CBU16_IS_LEAD((s)[(i)-1])) { \
+        --(i); \
+    } \
+} CBUPRV_BLOCK_MACRO_END
+/**
+ * Move the string offset from one code point boundary to the previous one
+ * and get the code point between them.
+ * (Pre-decrementing backward iteration.)
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The input offset may be the same as the string length.
+ * If the offset is behind a trail surrogate unit
+ * for a supplementary code point, then the macro will read
+ * the preceding lead surrogate as well.
+ * If the offset is behind a lead surrogate or behind a single, unpaired
+ * trail surrogate, then c is set to that unpaired surrogate.
+ *
+ * @param s const UChar * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, must be start<i
+ * @param c output UChar32 variable
+ * @see U16_PREV_UNSAFE
+ * @stable ICU 2.4
+ */
+#define CBU16_PREV(s, start, i, c) CBUPRV_BLOCK_MACRO_BEGIN { \
+    (c)=(s)[--(i)]; \
+    if(CBU16_IS_TRAIL(c)) { \
+        uint16_t __c2; \
+        if((i)>(start) && CBU16_IS_LEAD(__c2=(s)[(i)-1])) { \
+            --(i); \
+            (c)=CBU16_GET_SUPPLEMENTARY(__c2, (c)); \
+        } \
+    } \
+} CBUPRV_BLOCK_MACRO_END
+/**
+ * Adjust a random-access offset to a code point boundary after a code point.
+ * If the offset is behind the lead surrogate of a surrogate pair,
+ * then the offset is incremented.
+ * Otherwise, it is not modified.
+ * The input offset may be the same as the string length.
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * @param s const UChar * string
+ * @param start int32_t starting string offset (usually 0)
+ * @param i int32_t string offset, start<=i<=length
+ * @param length int32_t string length
+ * @see U16_SET_CP_LIMIT_UNSAFE
+ * @stable ICU 2.4
+ */
+#define CBU16_SET_CP_LIMIT(s, start, i, length) CBUPRV_BLOCK_MACRO_BEGIN { \
+    if((start)<(i) && ((i)<(length) || (length)<0) && CBU16_IS_LEAD((s)[(i)-1]) && CBU16_IS_TRAIL((s)[i])) { \
+        ++(i); \
+    } \
+} CBUPRV_BLOCK_MACRO_END
 }  // namespace base_icu