UTF-16 Decoder: Convert unpaired surrogates to replacement characters

The decoder blithely passed any old 16-bit code unit through, in violation of the Encoding standard. Surrogate pairs should go through unscathed: [ ... 0xD800 0xDC00 ... ] => [ ... U+D800 U+DC00 ... ] But cases like these should result in replacement characters: [ ... 0xD800 ... ] => [ ... U+FFFD ... ] [ ... 0xDC00 ... ] => [ ... U+FFFD ... ] [ ... 0xDC00 0xD800 ... ] => [ ... U+FFFD U+FFFD ... ] This aligns Chrome's behavior with Firefox and Edge. Note that flushing at the end of a stream remains a special case. Streams terminating in the above sequences will not get replacements emitted (current behavior). In addition, a lead surrogate appearing at the end of a stream will now not be emitted, matching other browsers. BUG=368904 R=jshin@chromium.org,foolip@chromium.org Review-Url: https://codereview.chromium.org/2379333003 Cr-Commit-Position: refs/heads/master@{#422929}

UTF-16 Decoder: Convert unpaired surrogates to replacement characters
The decoder blithely passed any old 16-bit code unit through, in violation of the Encoding standard. Surrogate pairs should go through unscathed: [ ... 0xD800 0xDC00 ... ] => [ ... U+D800 U+DC00 ... ] But cases like these should result in replacement characters: [ ... 0xD800 ... ] => [ ... U+FFFD ... ] [ ... 0xDC00 ... ] => [ ... U+FFFD ... ] [ ... 0xDC00 0xD800 ... ] => [ ... U+FFFD U+FFFD ... ] This aligns Chrome's behavior with Firefox and Edge. Note that flushing at the end of a stream remains a special case. Streams terminating in the above sequences will not get replacements emitted (current behavior). In addition, a lead surrogate appearing at the end of a stream will now not be emitted, matching other browsers. BUG=368904 R=jshin@chromium.org,foolip@chromium.org Review-Url: https://codereview.chromium.org/2379333003 Cr-Commit-Position: refs/heads/master@{#422929}
9158f6d5 · jsbell · Commit bot · 34b6f744 · 9158f6d5 · 9158f6d5
Commit 9158f6d5 authored Oct 04, 2016 by jsbell Committed by Commit bot Oct 04, 2016
5 changed files
--- a/third_party/WebKit/LayoutTests/fast/encoding/char-decoding-truncated.html
+++ b/third_party/WebKit/LayoutTests/fast/encoding/char-decoding-truncated.html
@@ -12,12 +12,12 @@ testDecode('utf-8', '%E2', 'U+FFFD');

 // UTF-16 codec does not emit replacement characters
 testDecode('utf-16', '%69%D8%D6%DE', 'U+D869/U+DED6');
-testDecode('utf-16', '%69%D8%D6', 'U+D869');
-testDecode('utf-16', '%69%D8', 'U+D869');
+testDecode('utf-16', '%69%D8%D6', '');
+testDecode('utf-16', '%69%D8', '');
 testDecode('utf-16', '%69', '');
 testDecode('utf-16be', '%D8%69%DE%D6', 'U+D869/U+DED6');
-testDecode('utf-16be', '%D8%69%DE', 'U+D869');
-testDecode('utf-16be', '%D8%69', 'U+D869');
+testDecode('utf-16be', '%D8%69%DE', '');
+testDecode('utf-16be', '%D8%69', '');
 testDecode('utf-16be', '%D8', '');

 // Other codecs emit replacement characters

--- a/third_party/WebKit/LayoutTests/fast/encoding/utf-16-lone-surrogates.html
+++ b/third_party/WebKit/LayoutTests/fast/encoding/utf-16-lone-surrogates.html
+<!DOCTYPE html>
+<title>UTF-16 Decoding: Lone Surrogates</title>
+<script src="../../resources/testharness.js"></script>
+<script src="../../resources/testharnessreport.js"></script>
+<script src="resources/char-decoding-utils.js"></script>
+<script>
+
+// Trailing U+0000 is used to force the encoder to process a buffered
+// lead surrogate since flushing does not occur at the end of streams.
+
+testDecode('utf-16', '%69%D8%D6%DE%00%00', 'U+D869/U+DED6/U+0000');
+testDecode('utf-16', '%D6%DE%69%D8%00%00', 'U+FFFD/U+FFFD/U+0000');
+testDecode('utf-16', '%69%D8%00%00', 'U+FFFD/U+0000');
+testDecode('utf-16', '%D6%DE%00%00', 'U+FFFD/U+0000');
+
+testDecode('utf-16be', '%D8%69%DE%D6%00%00', 'U+D869/U+DED6/U+0000');
+testDecode('utf-16be', '%DE%D6%D8%69%00%00', 'U+FFFD/U+FFFD/U+0000');
+testDecode('utf-16be', '%D8%69%00%00', 'U+FFFD/U+0000');
+testDecode('utf-16be', '%DE%D6%00%00', 'U+FFFD/U+0000');
+
+</script>
--- a/third_party/WebKit/LayoutTests/imported/wpt/encoding/textdecoder-utf16-surrogates-expected.txt
+++ b/third_party/WebKit/LayoutTests/imported/wpt/encoding/textdecoder-utf16-surrogates-expected.txt
-This is a testharness.js-based test.
-FAIL utf-16le - lone surrogate lead assert_equals: expected "\ufffd" but got ""
-FAIL utf-16le - lone surrogate lead (fatal flag set) assert_throws: function "function () {
-            new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
-        }" did not throw
-FAIL utf-16le - lone surrogate trail assert_equals: expected "\ufffd" but got ""
-FAIL utf-16le - lone surrogate trail (fatal flag set) assert_throws: function "function () {
-            new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
-        }" did not throw
-FAIL utf-16le - unmatched surrogate lead assert_equals: expected "\ufffd\0" but got "\0"
-FAIL utf-16le - unmatched surrogate lead (fatal flag set) assert_throws: function "function () {
-            new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
-        }" did not throw
-FAIL utf-16le - unmatched surrogate trail assert_equals: expected "\ufffd\0" but got "\0"
-FAIL utf-16le - unmatched surrogate trail (fatal flag set) assert_throws: function "function () {
-            new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
-        }" did not throw
-FAIL utf-16le - swapped surrogate pair assert_equals: expected "\ufffd\ufffd" but got ""
-FAIL utf-16le - swapped surrogate pair (fatal flag set) assert_throws: function "function () {
-            new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
-        }" did not throw
-Harness: the test ran to completion.
-
--- a/third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp
+++ b/third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp
@@ -76,59 +76,74 @@ String TextCodecUTF16::decode(const char* bytes,
  const bool reallyFlush = flush != DoNotFlush && flush != FetchEOF;

  if (!length) {
-    if (!reallyFlush || !m_haveBufferedByte)
-      return String();
+    if (reallyFlush && (m_haveLeadByte || m_haveLeadSurrogate)) {
+      m_haveLeadByte = m_haveLeadSurrogate = false;
      sawError = true;
      return String(&replacementCharacter, 1);
    }
-
-  // FIXME: This should generate an error if there is an unpaired surrogate.
+    return String();
+  }

  const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes);
-  size_t numBytes = length + m_haveBufferedByte;
-  size_t numCharsIn = numBytes / 2;
-  size_t numCharsOut =
-      ((numBytes & 1) && reallyFlush) ? numCharsIn + 1 : numCharsIn;
+  const size_t numBytes = length + m_haveLeadByte;
+  const bool willHaveExtraByte = numBytes & 1;
+  const size_t numCharsIn = numBytes / 2;
+  const size_t maxCharsOut = numCharsIn + (m_haveLeadSurrogate ? 1 : 0) +
+                             (reallyFlush && willHaveExtraByte ? 1 : 0);

-  StringBuffer<UChar> buffer(numCharsOut);
+  StringBuffer<UChar> buffer(maxCharsOut);
  UChar* q = buffer.characters();

-  if (m_haveBufferedByte) {
+  for (size_t i = 0; i < numCharsIn; ++i) {
    UChar c;
-    if (m_littleEndian)
-      c = m_bufferedByte | (p[0] << 8);
-    else
-      c = (m_bufferedByte << 8) | p[0];
-    *q++ = c;
-    m_haveBufferedByte = false;
-    p += 1;
-    numCharsIn -= 1;
+    if (m_haveLeadByte) {
+      c = m_littleEndian ? (m_leadByte | (p[0] << 8))
+                         : ((m_leadByte << 8) | p[0]);
+      m_haveLeadByte = false;
+      ++p;
+    } else {
+      c = m_littleEndian ? (p[0] | (p[1] << 8)) : ((p[0] << 8) | p[1]);
+      p += 2;
    }

-  if (m_littleEndian) {
-    for (size_t i = 0; i < numCharsIn; ++i) {
-      UChar c = p[0] | (p[1] << 8);
-      p += 2;
+    // TODO(jsbell): If necessary for performance, m_haveLeadByte handling
+    // can be pulled out and this loop split into distinct cases for
+    // big/little endian. The logic from here to the end of the loop is
+    // constant with respect to m_haveLeadByte and m_littleEndian.
+
+    if (m_haveLeadSurrogate && U_IS_TRAIL(c)) {
+      *q++ = m_leadSurrogate;
+      m_haveLeadSurrogate = false;
      *q++ = c;
+    } else {
+      if (m_haveLeadSurrogate) {
+        m_haveLeadSurrogate = false;
+        sawError = true;
+        *q++ = replacementCharacter;
      }
+
+      if (U_IS_LEAD(c)) {
+        m_haveLeadSurrogate = true;
+        m_leadSurrogate = c;
+      } else if (U_IS_TRAIL(c)) {
+        sawError = true;
+        *q++ = replacementCharacter;
      } else {
-    for (size_t i = 0; i < numCharsIn; ++i) {
-      UChar c = (p[0] << 8) | p[1];
-      p += 2;
        *q++ = c;
      }
    }
+  }

-  if (numBytes & 1) {
-    ASSERT(!m_haveBufferedByte);
+  DCHECK(!m_haveLeadByte);
+  if (willHaveExtraByte) {
+    m_haveLeadByte = true;
+    m_leadByte = p[0];
+  }

-    if (reallyFlush) {
+  if (reallyFlush && (m_haveLeadByte || m_haveLeadSurrogate)) {
+    m_haveLeadByte = m_haveLeadSurrogate = false;
    sawError = true;
    *q++ = replacementCharacter;
-    } else {
-      m_haveBufferedByte = true;
-      m_bufferedByte = p[0];
-    }
  }

  buffer.shrink(q - buffer.characters());

--- a/third_party/WebKit/Source/wtf/text/TextCodecUTF16.h
+++ b/third_party/WebKit/Source/wtf/text/TextCodecUTF16.h
@@ -35,8 +35,7 @@ class TextCodecUTF16 final : public TextCodec {
  static void registerEncodingNames(EncodingNameRegistrar);
  static void registerCodecs(TextCodecRegistrar);

-  TextCodecUTF16(bool littleEndian)
-      : m_littleEndian(littleEndian), m_haveBufferedByte(false) {}
+  TextCodecUTF16(bool littleEndian) : m_littleEndian(littleEndian) {}

  String decode(const char*,
                size_t length,
@@ -48,8 +47,10 @@ class TextCodecUTF16 final : public TextCodec {

 private:
  bool m_littleEndian;
-  bool m_haveBufferedByte;
-  unsigned char m_bufferedByte;
+  bool m_haveLeadByte = false;
+  unsigned char m_leadByte;
+  bool m_haveLeadSurrogate = false;
+  UChar m_leadSurrogate;
 };

 }  // namespace WTF