From 180af17b522f531eb15b917f4fde9570b6aa95ae Mon Sep 17 00:00:00 2001 From: Anna Henningsen Date: Thu, 1 Feb 2018 02:28:39 +0100 Subject: [PATCH] string_decoder: reimplement in C++ Implement string decoder in C++. The perks are a decent speed boost (for decoding, whereas creation shows some performance degradation), that this can now be used more easily to add native decoding support to C++ streams and (arguably) more readable variable names. PR-URL: https://github.com/nodejs/node/pull/18537 Reviewed-By: James M Snell Reviewed-By: Ben Noordhuis --- lib/string_decoder.js | 285 ++++------------------- node.gyp | 4 + src/node_internals.h | 1 + src/string_decoder-inl.h | 38 +++ src/string_decoder.cc | 334 +++++++++++++++++++++++++++ src/string_decoder.h | 50 ++++ test/parallel/test-string-decoder.js | 4 + 7 files changed, 478 insertions(+), 238 deletions(-) create mode 100644 src/string_decoder-inl.h create mode 100644 src/string_decoder.cc create mode 100644 src/string_decoder.h diff --git a/lib/string_decoder.js b/lib/string_decoder.js index 1e569ba6b26a4c..d955a663307de9 100644 --- a/lib/string_decoder.js +++ b/lib/string_decoder.js @@ -22,10 +22,23 @@ 'use strict'; const { Buffer } = require('buffer'); +const { + kIncompleteCharactersStart, + kIncompleteCharactersEnd, + kMissingBytes, + kBufferedBytes, + kEncodingField, + kSize, + decode, + flush, + encodings +} = internalBinding('string_decoder'); const internalUtil = require('internal/util'); const errors = require('internal/errors'); const isEncoding = Buffer[internalUtil.kIsEncodingSymbol]; +const kNativeDecoder = Symbol('kNativeDecoder'); + // Do not cache `Buffer.isEncoding` when checking encoding names as some // modules monkey-patch it to support additional encodings function normalizeEncoding(enc) { @@ -36,258 +49,54 @@ function normalizeEncoding(enc) { return nenc || enc; } +const encodingsMap = {}; +for (var i = 0; i < encodings.length; ++i) + encodingsMap[encodings[i]] = i; + // StringDecoder provides 
an interface for efficiently splitting a series of // buffers into a series of JS strings without breaking apart multi-byte // characters. -exports.StringDecoder = StringDecoder; -function StringDecoder(encoding) { - this.encoding = normalizeEncoding(encoding); - var nb; - switch (this.encoding) { - case 'utf16le': - this.text = utf16Text; - this.end = utf16End; - nb = 4; - break; - case 'utf8': - this.fillLast = utf8FillLast; - nb = 4; - break; - case 'base64': - this.text = base64Text; - this.end = base64End; - nb = 3; - break; - default: - this.write = simpleWrite; - this.end = simpleEnd; - return; - } - this.lastNeed = 0; - this.lastTotal = 0; - this.lastChar = Buffer.allocUnsafe(nb); -} - -StringDecoder.prototype.write = function(buf) { - if (buf.length === 0) - return ''; - var r; - var i; - if (this.lastNeed) { - r = this.fillLast(buf); - if (r === undefined) - return ''; - i = this.lastNeed; - this.lastNeed = 0; - } else { - i = 0; - } - if (i < buf.length) - return (r ? r + this.text(buf, i) : this.text(buf, i)); - return r || ''; -}; - -StringDecoder.prototype.end = utf8End; - -// Returns only complete characters in a Buffer -StringDecoder.prototype.text = utf8Text; - -// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer -StringDecoder.prototype.fillLast = function(buf) { - if (this.lastNeed <= buf.length) { - buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed); - return this.lastChar.toString(this.encoding, 0, this.lastTotal); - } - buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length); - this.lastNeed -= buf.length; -}; - -// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a -// continuation byte. If an invalid byte is detected, -2 is returned. -function utf8CheckByte(byte) { - if (byte <= 0x7F) - return 0; - else if (byte >> 5 === 0x06) - return 2; - else if (byte >> 4 === 0x0E) - return 3; - else if (byte >> 3 === 0x1E) - return 4; - return (byte >> 6 === 0x02 ? 
-1 : -2); -} - -// Checks at most 3 bytes at the end of a Buffer in order to detect an -// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4) -// needed to complete the UTF-8 character (if applicable) are returned. -function utf8CheckIncomplete(self, buf, i) { - var j = buf.length - 1; - if (j < i) - return 0; - var nb = utf8CheckByte(buf[j]); - if (nb >= 0) { - if (nb > 0) - self.lastNeed = nb - 1; - return nb; - } - if (--j < i || nb === -2) - return 0; - nb = utf8CheckByte(buf[j]); - if (nb >= 0) { - if (nb > 0) - self.lastNeed = nb - 2; - return nb; - } - if (--j < i || nb === -2) - return 0; - nb = utf8CheckByte(buf[j]); - if (nb >= 0) { - if (nb > 0) { - if (nb === 2) - nb = 0; - else - self.lastNeed = nb - 3; - } - return nb; - } - return 0; -} - -// Validates as many continuation bytes for a multi-byte UTF-8 character as -// needed or are available. If we see a non-continuation byte where we expect -// one, we "replace" the validated continuation bytes we've seen so far with -// a single UTF-8 replacement character ('\ufffd'), to match v8's UTF-8 decoding -// behavior. The continuation byte check is included three times in the case -// where all of the continuation bytes for a character exist in the same buffer. -// It is also done this way as a slight performance increase instead of using a -// loop. 
-function utf8CheckExtraBytes(self, buf, p) { - if ((buf[0] & 0xC0) !== 0x80) { - self.lastNeed = 0; - return '\ufffd'; - } - if (self.lastNeed > 1 && buf.length > 1) { - if ((buf[1] & 0xC0) !== 0x80) { - self.lastNeed = 1; - return '\ufffd'; - } - if (self.lastNeed > 2 && buf.length > 2) { - if ((buf[2] & 0xC0) !== 0x80) { - self.lastNeed = 2; - return '\ufffd'; - } - } +class StringDecoder { + constructor(encoding) { + this.encoding = normalizeEncoding(encoding); + this[kNativeDecoder] = Buffer.alloc(kSize); + this[kNativeDecoder][kEncodingField] = encodingsMap[this.encoding]; } -} -// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer. -function utf8FillLast(buf) { - const p = this.lastTotal - this.lastNeed; - var r = utf8CheckExtraBytes(this, buf, p); - if (r !== undefined) - return r; - if (this.lastNeed <= buf.length) { - buf.copy(this.lastChar, p, 0, this.lastNeed); - return this.lastChar.toString(this.encoding, 0, this.lastTotal); + write(buf) { + if (typeof buf === 'string') + return buf; + if (!ArrayBuffer.isView(buf)) + throw new errors.TypeError('ERR_INVALID_ARG_TYPE', 'buf', + ['Buffer', 'Uint8Array', 'ArrayBufferView']); + return decode(this[kNativeDecoder], buf); } - buf.copy(this.lastChar, p, 0, buf.length); - this.lastNeed -= buf.length; -} -// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a -// partial character, the character's bytes are buffered until the required -// number of bytes are available. -function utf8Text(buf, i) { - const total = utf8CheckIncomplete(this, buf, i); - if (!this.lastNeed) - return buf.toString('utf8', i); - this.lastTotal = total; - const end = buf.length - (total - this.lastNeed); - buf.copy(this.lastChar, 0, end); - return buf.toString('utf8', i, end); -} - -// For UTF-8, a replacement character is added when ending on a partial -// character. -function utf8End(buf) { - const r = (buf && buf.length ? 
this.write(buf) : ''); - if (this.lastNeed) { - this.lastNeed = 0; - this.lastTotal = 0; - return r + '\ufffd'; + end(buf) { + let ret = ''; + if (buf !== undefined) + ret = this.write(buf); + if (this[kNativeDecoder][kBufferedBytes] > 0) + ret += flush(this[kNativeDecoder]); + return ret; } - return r; -} -// UTF-16LE typically needs two bytes per character, but even if we have an even -// number of bytes available, we need to check if we end on a leading/high -// surrogate. In that case, we need to wait for the next two bytes in order to -// decode the last character properly. -function utf16Text(buf, i) { - if ((buf.length - i) % 2 === 0) { - const r = buf.toString('utf16le', i); - if (r) { - const c = r.charCodeAt(r.length - 1); - if (c >= 0xD800 && c <= 0xDBFF) { - this.lastNeed = 2; - this.lastTotal = 4; - this.lastChar[0] = buf[buf.length - 2]; - this.lastChar[1] = buf[buf.length - 1]; - return r.slice(0, -1); - } - } - return r; - } - this.lastNeed = 1; - this.lastTotal = 2; - this.lastChar[0] = buf[buf.length - 1]; - return buf.toString('utf16le', i, buf.length - 1); -} + /* Everything below this line is undocumented legacy stuff. */ -// For UTF-16LE we do not explicitly append special replacement characters if we -// end on a partial character, we simply let v8 handle that. -function utf16End(buf) { - const r = (buf && buf.length ? 
this.write(buf) : ''); - if (this.lastNeed) { - const end = this.lastTotal - this.lastNeed; - this.lastNeed = 0; - this.lastTotal = 0; - return r + this.lastChar.toString('utf16le', 0, end); + text(buf, offset) { + this[kNativeDecoder][kMissingBytes] = 0; + this[kNativeDecoder][kBufferedBytes] = 0; + return this.write(buf.slice(offset)); } - return r; -} -function base64Text(buf, i) { - const n = (buf.length - i) % 3; - if (n === 0) - return buf.toString('base64', i); - this.lastNeed = 3 - n; - this.lastTotal = 3; - if (n === 1) { - this.lastChar[0] = buf[buf.length - 1]; - } else { - this.lastChar[0] = buf[buf.length - 2]; - this.lastChar[1] = buf[buf.length - 1]; + get lastTotal() { + return this[kNativeDecoder][kBufferedBytes] + this.lastNeed; } - return buf.toString('base64', i, buf.length - n); -} - -function base64End(buf) { - const r = (buf && buf.length ? this.write(buf) : ''); - if (this.lastNeed) { - const end = 3 - this.lastNeed; - this.lastNeed = 0; - this.lastTotal = 0; - return r + this.lastChar.toString('base64', 0, end); + get lastChar() { + return this[kNativeDecoder].subarray(kIncompleteCharactersStart, + kIncompleteCharactersEnd); } - return r; } -// Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex) -function simpleWrite(buf) { - return buf.toString(this.encoding); -} - -function simpleEnd(buf) { - return (buf && buf.length ? 
this.write(buf) : ''); -} +exports.StringDecoder = StringDecoder; diff --git a/node.gyp b/node.gyp index 9c398284939b50..e2b17cd2b5fae6 100644 --- a/node.gyp +++ b/node.gyp @@ -326,6 +326,7 @@ 'src/signal_wrap.cc', 'src/spawn_sync.cc', 'src/string_bytes.cc', + 'src/string_decoder.cc', 'src/string_search.cc', 'src/stream_base.cc', 'src/stream_wrap.cc', @@ -379,6 +380,8 @@ 'src/req_wrap.h', 'src/req_wrap-inl.h', 'src/string_bytes.h', + 'src/string_decoder.h', + 'src/string_decoder-inl.h', 'src/stream_base.h', 'src/stream_base-inl.h', 'src/stream_wrap.h', @@ -989,6 +992,7 @@ '<(obj_path)<(obj_separator)node_url.<(obj_suffix)', '<(obj_path)<(obj_separator)util.<(obj_suffix)', '<(obj_path)<(obj_separator)string_bytes.<(obj_suffix)', + '<(obj_path)<(obj_separator)string_decoder.<(obj_suffix)', '<(obj_path)<(obj_separator)string_search.<(obj_suffix)', '<(obj_path)<(obj_separator)stream_base.<(obj_suffix)', '<(obj_path)<(obj_separator)node_constants.<(obj_suffix)', diff --git a/src/node_internals.h b/src/node_internals.h index b3e1f5cd9f270c..094fcc2d839d5f 100644 --- a/src/node_internals.h +++ b/src/node_internals.h @@ -120,6 +120,7 @@ struct sockaddr; V(signal_wrap) \ V(spawn_sync) \ V(stream_wrap) \ + V(string_decoder) \ V(tcp_wrap) \ V(timer_wrap) \ V(trace_events) \ diff --git a/src/string_decoder-inl.h b/src/string_decoder-inl.h new file mode 100644 index 00000000000000..8a04211906f759 --- /dev/null +++ b/src/string_decoder-inl.h @@ -0,0 +1,38 @@ +#ifndef SRC_STRING_DECODER_INL_H_ +#define SRC_STRING_DECODER_INL_H_ + +#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS + +#include "string_decoder.h" +#include "util.h" + +namespace node { + +void StringDecoder::SetEncoding(enum encoding encoding) { + state_[kBufferedBytes] = 0; + state_[kMissingBytes] = 0; + state_[kEncodingField] = encoding; +} + +enum encoding StringDecoder::Encoding() const { + return static_cast(state_[kEncodingField]); +} + +unsigned StringDecoder::BufferedBytes() const { + return 
state_[kBufferedBytes]; +} + +unsigned StringDecoder::MissingBytes() const { + return state_[kMissingBytes]; +} + +char* StringDecoder::IncompleteCharacterBuffer() { + return reinterpret_cast(state_ + kIncompleteCharactersStart); +} + + +} // namespace node + +#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS + +#endif // SRC_STRING_DECODER_INL_H_ diff --git a/src/string_decoder.cc b/src/string_decoder.cc new file mode 100644 index 00000000000000..ad1bace918c678 --- /dev/null +++ b/src/string_decoder.cc @@ -0,0 +1,334 @@ +#include "string_decoder-inl.h" +#include "string_bytes.h" +#include "node_internals.h" +#include "node_buffer.h" + +using v8::Array; +using v8::Context; +using v8::FunctionCallbackInfo; +using v8::Integer; +using v8::Isolate; +using v8::Local; +using v8::MaybeLocal; +using v8::Object; +using v8::String; +using v8::Value; + +namespace node { + +namespace { + +MaybeLocal MakeString(Isolate* isolate, + const char* data, + size_t length, + enum encoding encoding) { + Local error; + MaybeLocal ret; + if (encoding == UTF8) { + return String::NewFromUtf8( + isolate, + data, + v8::NewStringType::kNormal, + length); + } else if (encoding == UCS2) { +#ifdef DEBUG + CHECK_EQ(reinterpret_cast(data) % 2, 0); + CHECK_EQ(length % 2, 0); +#endif + ret = StringBytes::Encode( + isolate, + reinterpret_cast(data), + length / 2, + &error); + } else { + ret = StringBytes::Encode( + isolate, + data, + length, + encoding, + &error); + } + + if (ret.IsEmpty()) { + CHECK(!error.IsEmpty()); + isolate->ThrowException(error); + } + +#ifdef DEBUG + CHECK(ret.IsEmpty() || ret.ToLocalChecked()->IsString()); +#endif + return ret.FromMaybe(Local()).As(); +} + +} // anonymous namespace + + +MaybeLocal StringDecoder::DecodeData(Isolate* isolate, + const char* data, + size_t* nread_ptr) { + Local prepend, body; + + size_t nread = *nread_ptr; + + if (Encoding() == UTF8 || Encoding() == UCS2 || Encoding() == BASE64) { + // See if we want bytes to finish a character from 
the previous + // chunk; if so, copy the new bytes to the missing bytes buffer + // and create a small string from it that is to be prepended to the + // main body. + if (MissingBytes() > 0) { + // There are never more bytes missing than the pre-calculated maximum. + CHECK_LE(MissingBytes() + BufferedBytes(), + kIncompleteCharactersEnd); + if (Encoding() == UTF8) { + // For UTF-8, we need special treatment to align with the V8 decoder: + // If an incomplete character is found at a chunk boundary, we turn + // that character into a single invalid one. + for (size_t i = 0; i < nread && i < MissingBytes(); ++i) { + if ((data[i] & 0xC0) != 0x80) { + // This byte is not a continuation byte even though it should have + // been one. + // Act as if there was a 1-byte incomplete character, which does + // not make sense but works here because we know it's invalid. + state_[kMissingBytes] = 0; + state_[kBufferedBytes] = 1; + data += i; + nread -= i; + break; + } + } + } + + size_t found_bytes = + std::min(nread, static_cast(MissingBytes())); + memcpy(IncompleteCharacterBuffer() + BufferedBytes(), + data, + found_bytes); + // Adjust the two buffers. + data += found_bytes; + nread -= found_bytes; + + state_[kMissingBytes] -= found_bytes; + state_[kBufferedBytes] += found_bytes; + + if (LIKELY(MissingBytes() == 0)) { + // If no more bytes are missing, create a small string that we + // will later prepend. + if (!MakeString(isolate, + IncompleteCharacterBuffer(), + BufferedBytes(), + Encoding()).ToLocal(&prepend)) { + return MaybeLocal(); + } + + *nread_ptr += BufferedBytes(); + // No more buffered bytes. + state_[kBufferedBytes] = 0; + } + } + + // It could be that trying to finish the previous chunk already + // consumed all data that we received in this chunk. + if (UNLIKELY(nread == 0)) { + body = !prepend.IsEmpty() ? prepend : String::Empty(isolate); + prepend = Local(); + } else { +#ifdef DEBUG + // If not, that means there is no character left to finish at this point. 
+ CHECK_EQ(MissingBytes(), 0); + CHECK_EQ(BufferedBytes(), 0); +#endif + + // See whether there is a character that we may have to cut off and + // finish when receiving the next chunk. + if (Encoding() == UTF8 && data[nread - 1] & 0x80) { + // This is UTF-8 encoded data and we ended on a non-ASCII UTF-8 byte. + // This means we'll need to figure out where the character to which + // the byte belongs begins. + for (size_t i = nread - 1; ; --i) { +#ifdef DEBUG + CHECK_LT(i, nread); +#endif + state_[kBufferedBytes]++; + if ((data[i] & 0xC0) == 0x80) { + // This byte does not start a character (a "trailing" byte). + if (state_[kBufferedBytes] >= 4 || i == 0) { + // We either have 4 or more trailing bytes (which means + // the current character would not be inside the range for + // valid Unicode, and in particular cannot be represented + // through JavaScript's UTF-16-based approach to strings), or the + // current buffer does not contain the start of a UTF-8 character + // at all. Either way, this is invalid UTF8 and we can just + // let the engine's decoder handle it. + state_[kBufferedBytes] = 0; + break; + } + } else { + // Found the first byte of a UTF-8 character. By looking at the + // upper bits we can tell how long the character *should* be. + if ((data[i] & 0xE0) == 0xC0) { + state_[kMissingBytes] = 2; + } else if ((data[i] & 0xF0) == 0xE0) { + state_[kMissingBytes] = 3; + } else if ((data[i] & 0xF8) == 0xF0) { + state_[kMissingBytes] = 4; + } else { + // This lead byte would indicate a character outside of the + // representable range. + state_[kBufferedBytes] = 0; + break; + } + + if (BufferedBytes() >= MissingBytes()) { + // Received more or exactly as many trailing bytes than the lead + // character would indicate. In the "==" case, we have valid + // data and don't need to slice anything off; + // in the ">" case, this is invalid UTF-8 anyway. 
+ state_[kMissingBytes] = 0; + state_[kBufferedBytes] = 0; + } + + state_[kMissingBytes] -= state_[kBufferedBytes]; + break; + } + } + } else if (Encoding() == UCS2) { + if ((nread % 2) == 1) { + // We got half a codepoint, and need the second byte of it. + state_[kBufferedBytes] = 1; + state_[kMissingBytes] = 1; + } else if ((data[nread - 1] & 0xFC) == 0xD8) { + // Half a split UTF-16 character. + state_[kBufferedBytes] = 2; + state_[kMissingBytes] = 2; + } + } else if (Encoding() == BASE64) { + state_[kBufferedBytes] = nread % 3; + if (state_[kBufferedBytes] > 0) + state_[kMissingBytes] = 3 - BufferedBytes(); + } + + if (BufferedBytes() > 0) { + // Copy the requested number of buffered bytes from the end of the + // input into the incomplete character buffer. + nread -= BufferedBytes(); + *nread_ptr -= BufferedBytes(); + memcpy(IncompleteCharacterBuffer(), data + nread, BufferedBytes()); + } + + if (nread > 0) { + if (!MakeString(isolate, data, nread, Encoding()).ToLocal(&body)) + return MaybeLocal(); + } else { + body = String::Empty(isolate); + } + } + + if (prepend.IsEmpty()) { + return body; + } else { + return String::Concat(prepend, body); + } + } else { + CHECK(Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1); + return MakeString(isolate, data, nread, Encoding()); + } +} + +MaybeLocal StringDecoder::FlushData(Isolate* isolate) { + if (Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1) { + CHECK_EQ(MissingBytes(), 0); + CHECK_EQ(BufferedBytes(), 0); + } + + if (Encoding() == UCS2 && BufferedBytes() % 2 == 1) { + // Ignore a single trailing byte, like the JS decoder does. 
+ state_[kMissingBytes]--; + state_[kBufferedBytes]--; + } + + if (BufferedBytes() == 0) + return String::Empty(isolate); + + MaybeLocal ret = + MakeString(isolate, + IncompleteCharacterBuffer(), + BufferedBytes(), + Encoding()); + + state_[kMissingBytes] = 0; + state_[kBufferedBytes] = 0; + + return ret; +} + +namespace { + +void DecodeData(const FunctionCallbackInfo& args) { + StringDecoder* decoder = + reinterpret_cast(Buffer::Data(args[0])); + CHECK_NE(decoder, nullptr); + size_t nread = Buffer::Length(args[1]); + MaybeLocal ret = + decoder->DecodeData(args.GetIsolate(), Buffer::Data(args[1]), &nread); + if (!ret.IsEmpty()) + args.GetReturnValue().Set(ret.ToLocalChecked()); +} + +void FlushData(const FunctionCallbackInfo& args) { + StringDecoder* decoder = + reinterpret_cast(Buffer::Data(args[0])); + CHECK_NE(decoder, nullptr); + MaybeLocal ret = decoder->FlushData(args.GetIsolate()); + if (!ret.IsEmpty()) + args.GetReturnValue().Set(ret.ToLocalChecked()); +} + +void InitializeStringDecoder(Local target, + Local unused, + Local context) { + Environment* env = Environment::GetCurrent(context); + Isolate* isolate = env->isolate(); + +#define SET_DECODER_CONSTANT(name) \ + target->Set(context, \ + FIXED_ONE_BYTE_STRING(isolate, #name), \ + Integer::New(isolate, StringDecoder::name)).FromJust() + + SET_DECODER_CONSTANT(kIncompleteCharactersStart); + SET_DECODER_CONSTANT(kIncompleteCharactersEnd); + SET_DECODER_CONSTANT(kMissingBytes); + SET_DECODER_CONSTANT(kBufferedBytes); + SET_DECODER_CONSTANT(kEncodingField); + SET_DECODER_CONSTANT(kNumFields); + + Local encodings = Array::New(isolate); +#define ADD_TO_ENCODINGS_ARRAY(cname, jsname) \ + encodings->Set(context, \ + static_cast(cname), \ + FIXED_ONE_BYTE_STRING(isolate, jsname)).FromJust() + ADD_TO_ENCODINGS_ARRAY(ASCII, "ascii"); + ADD_TO_ENCODINGS_ARRAY(UTF8, "utf8"); + ADD_TO_ENCODINGS_ARRAY(BASE64, "base64"); + ADD_TO_ENCODINGS_ARRAY(UCS2, "utf16le"); + ADD_TO_ENCODINGS_ARRAY(HEX, "hex"); + 
ADD_TO_ENCODINGS_ARRAY(BUFFER, "buffer"); + ADD_TO_ENCODINGS_ARRAY(LATIN1, "latin1"); + + target->Set(context, + FIXED_ONE_BYTE_STRING(isolate, "encodings"), + encodings).FromJust(); + + target->Set(context, + FIXED_ONE_BYTE_STRING(isolate, "kSize"), + Integer::New(isolate, sizeof(StringDecoder))).FromJust(); + + env->SetMethod(target, "decode", DecodeData); + env->SetMethod(target, "flush", FlushData); +} + +} // anonymous namespace + +} // namespace node + +NODE_MODULE_CONTEXT_AWARE_INTERNAL(string_decoder, + node::InitializeStringDecoder) diff --git a/src/string_decoder.h b/src/string_decoder.h new file mode 100644 index 00000000000000..9059eeaa9d2eb7 --- /dev/null +++ b/src/string_decoder.h @@ -0,0 +1,50 @@ +#ifndef SRC_STRING_DECODER_H_ +#define SRC_STRING_DECODER_H_ + +#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS + +#include "node.h" + +namespace node { + +class StringDecoder { + public: + StringDecoder() { state_[kEncodingField] = BUFFER; } + inline void SetEncoding(enum encoding encoding); + inline enum encoding Encoding() const; + + inline char* IncompleteCharacterBuffer(); + inline unsigned MissingBytes() const; + inline unsigned BufferedBytes() const; + + // Decode a string from the specified encoding. + // The value pointed to by `nread` will be modified to reflect that + // less data may have been read because it ended on an incomplete character + // and more data may have been read because a previously incomplete character + // was finished. + v8::MaybeLocal DecodeData(v8::Isolate* isolate, + const char* data, + size_t* nread); + // Flush an incomplete character. For character encodings like UTF8 this + // means printing replacement characters, but for e.g. Base64 the returned + // string contains more data. 
+ v8::MaybeLocal FlushData(v8::Isolate* isolate); + + enum Fields { + kIncompleteCharactersStart = 0, + kIncompleteCharactersEnd = 4, + kMissingBytes = 4, + kBufferedBytes = 5, + kEncodingField = 6, + kNumFields = 7 + }; + + private: + uint8_t state_[kNumFields] = {}; +}; + +} // namespace node + +#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS + +#endif // SRC_STRING_DECODER_H_ diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js index 9d1fe69a25df73..21a0b6c3e38539 100644 --- a/test/parallel/test-string-decoder.js +++ b/test/parallel/test-string-decoder.js @@ -128,6 +128,10 @@ assert.strictEqual(decoder.write(Buffer.from('3DD8', 'hex')), ''); assert.strictEqual(decoder.write(Buffer.from('4D', 'hex')), ''); assert.strictEqual(decoder.end(), '\ud83d'); +decoder = new StringDecoder('utf16le'); +assert.strictEqual(decoder.write(Buffer.from('3DD84D', 'hex')), '\ud83d'); +assert.strictEqual(decoder.end(), ''); + common.expectsError( () => new StringDecoder(1), {