Delayed setEncoding on Readable causes multibyte characters to break #27932

verdaster · 2019-05-28T07:16:31Z

Version: v10.15.3
Platform: 4.15.0-50-generic #54~16.04.1-Ubuntu x86_64 GNU/Linux
Subsystem: stream/http

Server

const http = require('http');

const server = http.createServer(async (req, res) => {
    const data = [];
    // simulate some preprocessing - eg. authentication in middleware
    // if setEncoding is called before async operation all works correctly
    await sleep(2000);
    req.setEncoding('utf8');
    req.on('data', (d) => {
        // already buffered chunks are read as buffer, although encoding is set to utf8
        data.push(d);
        console.log(d);
    });
    req.on('end', () => {
        const body = data.join('');
        // expected {"řeřicha":"hello čača"} but got {"��eřicha":"hello čača"}
        console.log(body);
        console.log(body.toString());
        res.statusCode = 204;
        res.end();
    });
});

server.listen(3000, 'localhost', () => {
    console.log('LISTENING');
});

async function sleep(ms = 1000) {
    return new Promise((res) => setTimeout(res, ms));
}

Client

const http = require('http');

const data = Buffer.from(JSON.stringify({ řeřicha: 'hello čača' }));

async function main() {
    const req = http.request(
        {
            hostname: 'localhost',
            port: 3000,
            method: 'POST',
            path: '/',
            headers: {
                'content-type': 'application/json',
                'content-length': data.length
            }
        },
        (res) => {
            console.log(res.statusCode);
            console.log(res.headers);
        }
    );

    const offset = 3;
    // write some bytes
    req.write(data.slice(0, offset));
    for (let i = offset; i < data.length; i++) {
        // simulate very slow connection
        await sleep();
        req.write(data.slice(i, i + 1));
    }
    req.end();
}

main();

async function sleep(ms = 1000) {
    return new Promise((res) => setTimeout(res, ms));
}

When setEncoding('utf8') is used on Readable stream (eg. IncomingMessage) with some delay, although no data are read, then already buffered chunks are read as buffer which breaks multibyte characters.

If encoding is set immediately, eg. on http 'request' event, than it works properly, however http frameworks (Express, Fastify, Koa) allows including asynchronous middlewares in processing pipeline before body parsing occurs, which delays setEncoding call and thus breaks multibyte characters.

Included code simulates this delay as two first emitted 'data' chunks are buffers which in standard body processing breaks characters if chunk contains only some bytes of unicode character. In case of http body parsing middlewares this also breaks content-length check.

Example body processing middleware from Fastify v1.14.6

function rawBody (request, reply, options, parser, done) {
  var asString = parser.asString
  var limit = options.limit === null ? parser.bodyLimit : options.limit
  var contentLength = request.headers['content-length'] === undefined
    ? NaN
    : Number.parseInt(request.headers['content-length'], 10)

  if (contentLength > limit) {
    const err = new Error('Request body is too large')
    err.statusCode = 413
    reply.code(err.statusCode).send(err)
    return
  }

  var receivedLength = 0
  var body = asString === true ? '' : []
  var req = request.raw

  if (asString === true) {
    req.setEncoding('utf8')
  }

  req.on('data', onData)
  req.on('end', onEnd)
  req.on('error', onEnd)

  function onData (chunk) {
    receivedLength += chunk.length

    if (receivedLength > limit) {
      req.removeListener('data', onData)
      req.removeListener('end', onEnd)
      req.removeListener('error', onEnd)
      const err = new Error('Request body is too large')
      err.statusCode = 413
      reply.code(err.statusCode).send(err)
      return
    }

    if (asString === true) {
      // first chunks might be buffers automatically coerced to strings
      // with broken multibytes characters
      body += chunk
    } else {
      body.push(chunk)
    }
  }

  function onEnd (err) {
    req.removeListener('data', onData)
    req.removeListener('end', onEnd)
    req.removeListener('error', onEnd)

    if (err !== undefined) {
      err.statusCode = 400
      reply.code(err.statusCode).send(err)
      return
    }

    if (asString === true) {
      receivedLength = Buffer.byteLength(body)
    }
    if (!Number.isNaN(contentLength) && receivedLength !== contentLength) {
      const err = new Error('Request body size did not match Content-Length')
      err.statusCode = 400
      reply.code(err.statusCode).send(err)
      return
    }

    if (asString === false) {
      body = Buffer.concat(body)
    }

    var result = parser.fn(req, body, done)
    if (result && typeof result.then === 'function') {
      result.then(body => done(null, body), done)
    }
  }
}

The text was updated successfully, but these errors were encountered:

Convert already-stored chunks when `.setEncoding()` is called so that subsequent `data` events will receive decoded strings, as they expect. Fixes: nodejs#27932

verdaster · 2019-05-28T18:00:35Z

Hi, thanks for quick reaction. I've noticed that pull request has label for Node v10.x, but it is possible that this problem exists on other versions too - on one server we have Node v8.16.0 which has this issue too, so it might exists even on v12.x.

Convert already-stored chunks when `.setEncoding()` is called so that subsequent `data` events will receive decoded strings, as they expect. Fixes: #27932 PR-URL: #27936 Reviewed-By: Matteo Collina <[email protected]> Reviewed-By: Luigi Pinca <[email protected]> Reviewed-By: Ruben Bridgewater <[email protected]> Reviewed-By: Rich Trott <[email protected]>

lpinca added the stream Issues and PRs related to the stream subsystem. label May 28, 2019

addaleax added the confirmed-bug Issues with confirmed bugs. label May 28, 2019

addaleax mentioned this issue May 28, 2019

stream: convert existing buffer when calling .setEncoding #27936

Closed

3 tasks

Trott closed this as completed in afb8474 May 30, 2019

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Delayed setEncoding on Readable causes multibyte characters to break #27932

Delayed setEncoding on Readable causes multibyte characters to break #27932

verdaster commented May 28, 2019

verdaster commented May 28, 2019

Delayed setEncoding on Readable causes multibyte characters to break #27932

Delayed setEncoding on Readable causes multibyte characters to break #27932

Comments

verdaster commented May 28, 2019

Server

Client

Example body processing middleware from Fastify v1.14.6

verdaster commented May 28, 2019