fix: speedup header parsing & bug fixes & nyc coverage

nodejs · Jan 11, 2023 · b7b9645 · b7b9645
1 parent cb631ec
commit b7b9645
Show file tree

Hide file tree

Showing 6 changed files with 151 additions and 77 deletions.
diff --git a/lib/formdata/constants.js b/lib/formdata/constants.js
@@ -27,12 +27,15 @@ const chars = {
 
 const emptyBuffer = Buffer.alloc(0)
 
+const crlfBuffer = Buffer.from('\r\n')
+
 const maxHeaderLength = 16 * 1024
 
 module.exports = {
   states,
   chars,
   headerStates,
   emptyBuffer,
-  maxHeaderLength
+  maxHeaderLength,
+  crlfBuffer
 }
diff --git a/lib/formdata/parser.js b/lib/formdata/parser.js
@@ -2,12 +2,15 @@
 
 const { Writable } = require('stream')
 const { win32: { basename } } = require('path')
-const { states, chars, headerStates, emptyBuffer, maxHeaderLength } = require('./constants')
+const { states, chars, headerStates, emptyBuffer, maxHeaderLength, crlfBuffer } = require('./constants')
 const { FileStream } = require('./filestream')
 const {
   collectHTTPQuotedStringLenient,
-  findCharacterType
+  findCharacterType,
+  hasHeaderValue,
+  isWhitespace
 } = require('./util')
+const { TextSearch } = require('./textsearch')
 const {
   parseMIMEType,
   collectASequenceOfCodePoints,
@@ -43,13 +46,19 @@ class FormDataParser extends Writable {
     },
     headers: {
       attributes: {}
-    }
+    },
+    headerEndSearch: new TextSearch(Buffer.from('\r\n\r\n')),
+    headerBuffersChecked: 0
   }
 
   #opts = {}
 
   constructor (opts) {
-    super({ highWaterMark: opts?.highWaterMark })
+    super({
+      highWaterMark: opts?.highWaterMark,
+      autoDestroy: true,
+      emitClose: true
+    })
 
     const contentType = opts.headers.get?.('content-type') ?? opts.headers['content-type']
     const mimeType = contentType ? parseMIMEType(contentType) : null
@@ -114,6 +123,10 @@ class FormDataParser extends Writable {
       this.#info.stream = null
     }
 
+    if (this.#info.headerEndSearch.lookedAt) {
+      this.emit('error', new Error('Malformed part header'))
+    }
+
     cb(err)
   }
 
@@ -178,76 +191,60 @@ class FormDataParser extends Writable {
         // arbitrary length. Therefore it's easier to read the headers, and
         // then parse once we receive 2 CRLFs which marks the body's start.
 
-        if (this.#byteOffset < 4) {
+        if (this.#byteOffset < crlfBuffer.length * 2) {
           return callback()
         }
 
-        // number of bytes to remove from the last buffer
-        let bytesToRemove = 0
-
-        done: { // eslint-disable-line no-labels
-          this.#info.headerState = headerStates.DEFAULT
-
-          for (let j = 0; j < this.#buffers.length; j++) {
-            const buffer = this.#buffers[j]
-
-            for (let i = 0; i < buffer.length; i++) {
-              const byte = buffer[i]
-              const state = this.#info.headerState
-
-              if (byte === chars.cr && state === headerStates.DEFAULT) {
-                this.#info.headerState = headerStates.FIRST
-              } else if (byte === chars.lf && state === headerStates.FIRST) {
-                this.#info.headerState = headerStates.SECOND
-              } else if (byte === chars.cr && state === headerStates.SECOND) {
-                this.#info.headerState = headerStates.THIRD
-              } else if (byte === chars.lf && state === headerStates.THIRD) {
-                // Got \r\n\r\n which marks the end of the headers.
-                this.#state = states.READ_BODY
-                bytesToRemove += i
-
-                const removed = this.consume(bytesToRemove - 3) // remove \r\n\r
-                const headers = this.#info.headers
-
-                // This is a very lazy implementation.
-                if ('content-type' in headers && headers['content-type'].length === 0) {
-                  const mimeUnparsed = removed.toString(this.#opts.defCharset)
-                  const mimeType = parseMIMEType(mimeUnparsed)
-
-                  if (mimeType !== 'failure') {
-                    headers['content-type'] = mimeType.essence
-                  }
-                }
-
-                this.#parseRawHeaders(removed)
-                this.consume(4)
-
-                break done // eslint-disable-line no-labels
-              } else {
-                this.#info.headerState = headerStates.DEFAULT
-
-                if (state === headerStates.SECOND) {
-                  const removed = this.consume(bytesToRemove + i - 2)
-
-                  if (this.#info.skipPart) {
-                    // If we are meant to skip this part, we shouldn't
-                    // parse the headers. We just need to consume the bytes.
-                    continue
-                  }
-
-                  this.#parseRawHeaders(removed)
-                  this.consume(2)
-                  bytesToRemove -= removed.length + 2
-                }
-              }
-            }
+        let i = this.#info.headerBuffersChecked
+
+        while (!this.#info.headerEndSearch.finished && i < this.#buffers.length) {
+          this.#info.headerEndSearch.write(this.#buffers[i++])
+          this.#info.headerBuffersChecked++
 
-            bytesToRemove += buffer.length
+          if (this.#info.headerEndSearch.lookedAt > maxHeaderLength) {
+            this.#info.skipPart = 'Malformed part header'
+            break
           }
+        }
 
+        if (!this.#info.headerEndSearch.finished) {
           return callback()
         }
 
+        const consumed = this.consume(this.#info.headerEndSearch.lookedAt)
+
+        this.#info.headerBuffersChecked = 0
+        this.#info.headerEndSearch.reset()
+
+        let index = consumed.indexOf(crlfBuffer)
+        let start = 0
+
+        while (index !== -1 && start < index) {
+          const buffer = consumed.subarray(start, index)
+          const valueIndex = buffer.indexOf(':')
+
+          if (valueIndex === -1) {
+            index = consumed.indexOf(crlfBuffer, index + buffer.length)
+            continue
+          } else if (!hasHeaderValue(buffer, valueIndex + 1)) {
+            // HTTP/1.1 header field values can be folded onto multiple lines if the
+            // continuation line begins with a space or horizontal tab.
+            index = consumed.indexOf(crlfBuffer, index + buffer.length)
+            continue
+          }
+
+          this.#parseRawHeaders(buffer)
+
+          start = index + 2
+          index = consumed.indexOf(crlfBuffer, start)
+
+          if (this.#info.skipPart) {
+            break
+          }
+        }
+
+        this.#state = states.READ_BODY
+
         const { attributes, ...headers } = this.#info.headers
 
         if (
@@ -257,6 +254,7 @@ class FormDataParser extends Writable {
             !attributes.name
           )
         ) {
+          this.#info.headers = { attributes: {} }
           // https://www.rfc-editor.org/rfc/rfc7578#section-4.2
           this.#info.skipPart = 'invalid content-disposition'
         } else if (
@@ -266,13 +264,18 @@ class FormDataParser extends Writable {
         ) {
           if (this.#info.limits.files >= this.#opts.limits.files) {
             this.#info.skipPart = 'file'
+            this.#info.headers = { attributes: {} }
             continue
           }
 
           // End the active stream if there is one
           this.#info.stream?.push(null)
-          this.#info.stream = new FileStream({ highWaterMark: this.#opts.fileHwm })
-          this.#info.stream.truncated = false // TODO
+          this.#info.stream = new FileStream({
+            highWaterMark: this.#opts.fileHwm,
+            autoDestroy: true,
+            emitClose: true
+          })
+          this.#info.stream.truncated = false
 
           this.emit(
             'file',
@@ -284,6 +287,7 @@ class FormDataParser extends Writable {
               mimeType: headers['content-type'] || 'application/octet-stream'
             }
           )
+          this.#info.headers = { attributes: {} }
         }
       } else if (this.#state === states.READ_BODY) {
         // A part's body can contain CRLFs so they cannot be used to
@@ -432,6 +436,7 @@ class FormDataParser extends Writable {
               }
             )
 
+            this.#info.headers = { attributes: {} }
             this.#info.limits.fields++
             body.chunks.length = 0
             body.length = 0
@@ -556,11 +561,15 @@ class FormDataParser extends Writable {
         continue
       }
 
-      if (buffer[position.position + 1] === chars[' ']) {
-        position.position += 2 // skip "; "
-      } else {
-        position.position++ // skip ":"
-      }
+      position.position++ // skip semicolon
+
+      // Header values can contain leading whitespace, make sure
+      // we remove it all.
+      collectASequenceOfCodePoints(
+        (char) => isWhitespace(char),
+        buffer,
+        position
+      )
 
       const value = collectASequenceOfCodePoints(
         (char) => {

diff --git a/lib/formdata/textsearch.js b/lib/formdata/textsearch.js
@@ -0,0 +1,47 @@
+'use strict'
+
+class TextSearch {
+  /** @param {Buffer} pattern */
+  constructor (pattern) {
+    this.pattern = pattern
+
+    this.back = 0
+    this.lookedAt = 0
+  }
+
+  /**
+   * @param {Buffer} chunk
+   */
+  write (chunk) {
+    if (this.finished) {
+      return true
+    }
+
+    for (const byte of chunk) {
+      this.lookedAt++
+
+      if (byte !== this.pattern[this.back]) {
+        this.back = 0
+      } else {
+        if (++this.back === this.pattern.length) {
+          return true
+        }
+      }
+    }
+
+    return this.back === this.pattern.length
+  }
+
+  reset () {
+    this.back = 0
+    this.lookedAt = 0
+  }
+
+  get finished () {
+    return this.back === this.pattern.length
+  }
+}
+
+module.exports = {
+  TextSearch
+}
diff --git a/lib/formdata/util.js b/lib/formdata/util.js
@@ -33,7 +33,23 @@ function findCharacterType (type) {
   }
 }
 
+function isWhitespace (char) {
+  return char === 0x09 || char === 0x20 || char === 0x0D || char === 0x0A
+}
+
+function hasHeaderValue (chunk, start) {
+  for (let i = start; i < chunk.length; i++) {
+    if (!isWhitespace(chunk[i])) {
+      return true
+    }
+  }
+
+  return false
+}
+
 module.exports = {
   collectHTTPQuotedStringLenient,
-  findCharacterType
+  findCharacterType,
+  isWhitespace,
+  hasHeaderValue
 }
diff --git a/package.json b/package.json
@@ -46,7 +46,8 @@
     "build:wasm": "node build/wasm.js --docker",
     "lint": "standard | snazzy",
     "lint:fix": "standard --fix | snazzy",
-    "test": "npm run test:tap && npm run test:node-fetch && npm run test:fetch && npm run test:wpt && npm run test:websocket && npm run test:jest && tsd",
+    "test": "npm run test:tap && npm run test:node-fetch && npm run test:fetch && npm run test:wpt && npm run test:websocket && npm run test:busboy && npm run test:jest && tsd",
+    "test:busboy": "node test/busboy/test.js",
     "test:node-fetch": "node scripts/verifyVersion.js 16 || mocha test/node-fetch",
     "test:fetch": "node scripts/verifyVersion.js 16 || (npm run build:node && tap test/fetch/*.js && tap test/webidl/*.js)",
     "test:jest": "node scripts/verifyVersion.js 14 || jest",

diff --git a/test/busboy/test-types-multipart.js b/test/busboy/test-types-multipart.js
@@ -781,8 +781,6 @@ const tests = [
     what: 'Files limit'
   },
   {
-    // Note: this test is very slow because we need to write > 64 KiB
-    // of data one byte at a time.
     source: [
       ['-----------------------------paZqsnEHRufoShdX6fh0lUhXBP4k',
         'Content-Disposition: form-data; ' +