Commit 9158f6d5 authored by jsbell's avatar jsbell Committed by Commit bot

UTF-16 Decoder: Convert unpaired surrogates to replacement characters

The decoder blithely passed any old 16-bit code unit through, in
violation of the Encoding standard. Surrogate pairs should go through
unscathed:

  [ ... 0xD800 0xDC00 ... ] => [ ... U+D800 U+DC00 ... ]

But cases like these should result in replacement characters:

  [ ... 0xD800 ... ] => [ ... U+FFFD ... ]
  [ ... 0xDC00 ... ] => [ ... U+FFFD ... ]
  [ ... 0xDC00 0xD800 ... ] => [ ... U+FFFD U+FFFD ... ]

This aligns Chrome's behavior with Firefox and Edge.

Note that flushing at the end of a stream remains a special case.
Streams terminating in the above sequences will not get replacements
emitted (current behavior). In addition, a lead surrogate appearing at
the end of a stream will now not be emitted, matching other browsers.

BUG=368904
R=jshin@chromium.org,foolip@chromium.org

Review-Url: https://codereview.chromium.org/2379333003
Cr-Commit-Position: refs/heads/master@{#422929}
parent 34b6f744
......@@ -12,12 +12,12 @@ testDecode('utf-8', '%E2', 'U+FFFD');
// UTF-16 codec does not emit replacement characters
testDecode('utf-16', '%69%D8%D6%DE', 'U+D869/U+DED6');
testDecode('utf-16', '%69%D8%D6', 'U+D869');
testDecode('utf-16', '%69%D8', 'U+D869');
testDecode('utf-16', '%69%D8%D6', '');
testDecode('utf-16', '%69%D8', '');
testDecode('utf-16', '%69', '');
testDecode('utf-16be', '%D8%69%DE%D6', 'U+D869/U+DED6');
testDecode('utf-16be', '%D8%69%DE', 'U+D869');
testDecode('utf-16be', '%D8%69', 'U+D869');
testDecode('utf-16be', '%D8%69%DE', '');
testDecode('utf-16be', '%D8%69', '');
testDecode('utf-16be', '%D8', '');
// Other codecs emit replacement characters
......
<!DOCTYPE html>
<title>UTF-16 Decoding: Lone Surrogates</title>
<script src="../../resources/testharness.js"></script>
<script src="../../resources/testharnessreport.js"></script>
<script src="resources/char-decoding-utils.js"></script>
<script>
// Trailing U+0000 is used to force the encoder to process a buffered
// lead surrogate since flushing does not occur at the end of streams.
testDecode('utf-16', '%69%D8%D6%DE%00%00', 'U+D869/U+DED6/U+0000');
testDecode('utf-16', '%D6%DE%69%D8%00%00', 'U+FFFD/U+FFFD/U+0000');
testDecode('utf-16', '%69%D8%00%00', 'U+FFFD/U+0000');
testDecode('utf-16', '%D6%DE%00%00', 'U+FFFD/U+0000');
testDecode('utf-16be', '%D8%69%DE%D6%00%00', 'U+D869/U+DED6/U+0000');
testDecode('utf-16be', '%DE%D6%D8%69%00%00', 'U+FFFD/U+FFFD/U+0000');
testDecode('utf-16be', '%D8%69%00%00', 'U+FFFD/U+0000');
testDecode('utf-16be', '%DE%D6%00%00', 'U+FFFD/U+0000');
</script>
This is a testharness.js-based test.
FAIL utf-16le - lone surrogate lead assert_equals: expected "\ufffd" but got ""
FAIL utf-16le - lone surrogate lead (fatal flag set) assert_throws: function "function () {
new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
}" did not throw
FAIL utf-16le - lone surrogate trail assert_equals: expected "\ufffd" but got ""
FAIL utf-16le - lone surrogate trail (fatal flag set) assert_throws: function "function () {
new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
}" did not throw
FAIL utf-16le - unmatched surrogate lead assert_equals: expected "\ufffd\0" but got "\0"
FAIL utf-16le - unmatched surrogate lead (fatal flag set) assert_throws: function "function () {
new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
}" did not throw
FAIL utf-16le - unmatched surrogate trail assert_equals: expected "\ufffd\0" but got "\0"
FAIL utf-16le - unmatched surrogate trail (fatal flag set) assert_throws: function "function () {
new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
}" did not throw
FAIL utf-16le - swapped surrogate pair assert_equals: expected "\ufffd\ufffd" but got ""
FAIL utf-16le - swapped surrogate pair (fatal flag set) assert_throws: function "function () {
new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
}" did not throw
Harness: the test ran to completion.
......@@ -76,59 +76,74 @@ String TextCodecUTF16::decode(const char* bytes,
const bool reallyFlush = flush != DoNotFlush && flush != FetchEOF;
if (!length) {
if (!reallyFlush || !m_haveBufferedByte)
return String();
if (reallyFlush && (m_haveLeadByte || m_haveLeadSurrogate)) {
m_haveLeadByte = m_haveLeadSurrogate = false;
sawError = true;
return String(&replacementCharacter, 1);
}
// FIXME: This should generate an error if there is an unpaired surrogate.
return String();
}
const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes);
size_t numBytes = length + m_haveBufferedByte;
size_t numCharsIn = numBytes / 2;
size_t numCharsOut =
((numBytes & 1) && reallyFlush) ? numCharsIn + 1 : numCharsIn;
const size_t numBytes = length + m_haveLeadByte;
const bool willHaveExtraByte = numBytes & 1;
const size_t numCharsIn = numBytes / 2;
const size_t maxCharsOut = numCharsIn + (m_haveLeadSurrogate ? 1 : 0) +
(reallyFlush && willHaveExtraByte ? 1 : 0);
StringBuffer<UChar> buffer(numCharsOut);
StringBuffer<UChar> buffer(maxCharsOut);
UChar* q = buffer.characters();
if (m_haveBufferedByte) {
for (size_t i = 0; i < numCharsIn; ++i) {
UChar c;
if (m_littleEndian)
c = m_bufferedByte | (p[0] << 8);
else
c = (m_bufferedByte << 8) | p[0];
*q++ = c;
m_haveBufferedByte = false;
p += 1;
numCharsIn -= 1;
if (m_haveLeadByte) {
c = m_littleEndian ? (m_leadByte | (p[0] << 8))
: ((m_leadByte << 8) | p[0]);
m_haveLeadByte = false;
++p;
} else {
c = m_littleEndian ? (p[0] | (p[1] << 8)) : ((p[0] << 8) | p[1]);
p += 2;
}
if (m_littleEndian) {
for (size_t i = 0; i < numCharsIn; ++i) {
UChar c = p[0] | (p[1] << 8);
p += 2;
// TODO(jsbell): If necessary for performance, m_haveLeadByte handling
// can be pulled out and this loop split into distinct cases for
// big/little endian. The logic from here to the end of the loop is
// constant with respect to m_haveLeadByte and m_littleEndian.
if (m_haveLeadSurrogate && U_IS_TRAIL(c)) {
*q++ = m_leadSurrogate;
m_haveLeadSurrogate = false;
*q++ = c;
} else {
if (m_haveLeadSurrogate) {
m_haveLeadSurrogate = false;
sawError = true;
*q++ = replacementCharacter;
}
if (U_IS_LEAD(c)) {
m_haveLeadSurrogate = true;
m_leadSurrogate = c;
} else if (U_IS_TRAIL(c)) {
sawError = true;
*q++ = replacementCharacter;
} else {
for (size_t i = 0; i < numCharsIn; ++i) {
UChar c = (p[0] << 8) | p[1];
p += 2;
*q++ = c;
}
}
}
if (numBytes & 1) {
ASSERT(!m_haveBufferedByte);
DCHECK(!m_haveLeadByte);
if (willHaveExtraByte) {
m_haveLeadByte = true;
m_leadByte = p[0];
}
if (reallyFlush) {
if (reallyFlush && (m_haveLeadByte || m_haveLeadSurrogate)) {
m_haveLeadByte = m_haveLeadSurrogate = false;
sawError = true;
*q++ = replacementCharacter;
} else {
m_haveBufferedByte = true;
m_bufferedByte = p[0];
}
}
buffer.shrink(q - buffer.characters());
......
......@@ -35,8 +35,7 @@ class TextCodecUTF16 final : public TextCodec {
static void registerEncodingNames(EncodingNameRegistrar);
static void registerCodecs(TextCodecRegistrar);
TextCodecUTF16(bool littleEndian)
: m_littleEndian(littleEndian), m_haveBufferedByte(false) {}
TextCodecUTF16(bool littleEndian) : m_littleEndian(littleEndian) {}
String decode(const char*,
size_t length,
......@@ -48,8 +47,10 @@ class TextCodecUTF16 final : public TextCodec {
private:
bool m_littleEndian;
bool m_haveBufferedByte;
unsigned char m_bufferedByte;
bool m_haveLeadByte = false;
unsigned char m_leadByte;
bool m_haveLeadSurrogate = false;
UChar m_leadSurrogate;
};
} // namespace WTF
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment