From fb119c767e9c43e71ea823311b0d53f566d86b73 Mon Sep 17 00:00:00 2001
From: Ben Darnell
Date: Wed, 5 Jun 2024 16:50:11 -0400
Subject: [PATCH 1/2] http1connection: Stricter handling of transfer-encoding

Unexpected transfer-encoding values were previously ignored and treated
as the HTTP/1.0 default of read-until-close. This can lead to framing
issues with certain proxies. We now treat any unexpected value as an
error.
---
 tornado/http1connection.py             | 57 ++++++++++++++-------
 tornado/test/httpserver_test.py        | 70 ++++++++++++++++++++++++++
 tornado/test/simple_httpclient_test.py |  2 +-
 3 files changed, 109 insertions(+), 20 deletions(-)

diff --git a/tornado/http1connection.py b/tornado/http1connection.py
index ca50e8ff55..30f079ae45 100644
--- a/tornado/http1connection.py
+++ b/tornado/http1connection.py
@@ -389,14 +389,11 @@ def write_headers(
             self._request_start_line = start_line
             lines.append(utf8("%s %s HTTP/1.1" % (start_line[0], start_line[1])))
             # Client requests with a non-empty body must have either a
-            # Content-Length or a Transfer-Encoding.
+            # Content-Length or a Transfer-Encoding. If Content-Length is not
+            # present we'll add our Transfer-Encoding below.
             self._chunking_output = (
                 start_line.method in ("POST", "PUT", "PATCH")
                 and "Content-Length" not in headers
-                and (
-                    "Transfer-Encoding" not in headers
-                    or headers["Transfer-Encoding"] == "chunked"
-                )
             )
         else:
             assert isinstance(start_line, httputil.ResponseStartLine)
@@ -418,9 +415,6 @@ def write_headers(
                 and (start_line.code < 100 or start_line.code >= 200)
                 # No need to chunk the output if a Content-Length is specified.
                 and "Content-Length" not in headers
-                # Applications are discouraged from touching Transfer-Encoding,
-                # but if they do, leave it alone.
-                and "Transfer-Encoding" not in headers
             )
             # If connection to a 1.1 client will be closed, inform client
             if (
@@ -560,7 +554,7 @@ def _can_keep_alive(
             return connection_header != "close"
         elif (
             "Content-Length" in headers
-            or headers.get("Transfer-Encoding", "").lower() == "chunked"
+            or is_transfer_encoding_chunked(headers)
             or getattr(start_line, "method", None) in ("HEAD", "GET")
         ):
             # start_line may be a request or response start line; only
@@ -598,13 +592,6 @@ def _read_body(
         delegate: httputil.HTTPMessageDelegate,
     ) -> Optional[Awaitable[None]]:
         if "Content-Length" in headers:
-            if "Transfer-Encoding" in headers:
-                # Response cannot contain both Content-Length and
-                # Transfer-Encoding headers.
-                # http://tools.ietf.org/html/rfc7230#section-3.3.3
-                raise httputil.HTTPInputError(
-                    "Response with both Transfer-Encoding and Content-Length"
-                )
             if "," in headers["Content-Length"]:
                 # Proxies sometimes cause Content-Length headers to get
                 # duplicated. If all the values are identical then we can
@@ -631,20 +618,22 @@ def _read_body(
         else:
             content_length = None
 
+        is_chunked = is_transfer_encoding_chunked(headers)
+
         if code == 204:
             # This response code is not allowed to have a non-empty body,
             # and has an implicit length of zero instead of read-until-close.
             # http://www.w3.org/Protocols/rfc2616/rfc2616-sec4.html#sec4.3
-            if "Transfer-Encoding" in headers or content_length not in (None, 0):
+            if is_chunked or content_length not in (None, 0):
                 raise httputil.HTTPInputError(
                     "Response with code %d should not have body" % code
                 )
             content_length = 0
 
+        if is_chunked:
+            return self._read_chunked_body(delegate)
         if content_length is not None:
             return self._read_fixed_body(content_length, delegate)
-        if headers.get("Transfer-Encoding", "").lower() == "chunked":
-            return self._read_chunked_body(delegate)
         if self.is_client:
             return self._read_body_until_close(delegate)
         return None
@@ -863,3 +852,33 @@ def parse_hex_int(s: str) -> int:
     if HEXDIGITS.fullmatch(s) is None:
         raise ValueError("not a hexadecimal integer: %r" % s)
     return int(s, 16)
+
+
+def is_transfer_encoding_chunked(headers: httputil.HTTPHeaders) -> bool:
+    """Returns true if the headers specify Transfer-Encoding: chunked.
+
+    Raise httputil.HTTPInputError if any other transfer encoding is used.
+    """
+    # Note that transfer-encoding is an area in which postel's law can lead
+    # us astray. If a proxy and a backend server are liberal in what they accept,
+    # but accept slightly different things, this can lead to mismatched framing
+    # and request smuggling issues. Therefore we are as strict as possible here
+    # (even technically going beyond the requirements of the RFCs: a value of
+    # ",chunked" is legal but doesn't appear in practice for legitimate traffic)
+    if "Transfer-Encoding" not in headers:
+        return False
+    if "Content-Length" in headers:
+        # Message cannot contain both Content-Length and
+        # Transfer-Encoding headers.
+        # http://tools.ietf.org/html/rfc7230#section-3.3.3
+        raise httputil.HTTPInputError(
+            "Message with both Transfer-Encoding and Content-Length"
+        )
+    if headers["Transfer-Encoding"].lower() == "chunked":
+        return True
+    # We do not support any transfer-encodings other than chunked, and we do not
+    # expect to add any support because the concept of transfer-encoding has
+    # been removed in HTTP/2.
+    raise httputil.HTTPInputError(
+        "Unsupported Transfer-Encoding %s" % headers["Transfer-Encoding"]
+    )
diff --git a/tornado/test/httpserver_test.py b/tornado/test/httpserver_test.py
index 1faf63fabf..0b29a39ccf 100644
--- a/tornado/test/httpserver_test.py
+++ b/tornado/test/httpserver_test.py
@@ -581,6 +581,76 @@ def test_chunked_request_body_invalid_size(self):
         )
         self.assertEqual(400, start_line.code)
 
+    def test_chunked_request_body_duplicate_header(self):
+        # Repeated Transfer-Encoding headers should be an error (and not confuse
+        # the chunked-encoding detection to mess up framing).
+        self.stream.write(
+            b"""\
+POST /echo HTTP/1.1
+Transfer-Encoding: chunked
+Transfer-encoding: chunked
+
+2
+ok
+0
+
+"""
+        )
+        with ExpectLog(
+            gen_log,
+            ".*Unsupported Transfer-Encoding chunked,chunked",
+            level=logging.INFO,
+        ):
+            start_line, headers, response = self.io_loop.run_sync(
+                lambda: read_stream_body(self.stream)
+            )
+        self.assertEqual(400, start_line.code)
+
+    def test_chunked_request_body_unsupported_transfer_encoding(self):
+        # We don't support transfer-encodings other than chunked.
+        self.stream.write(
+            b"""\
+POST /echo HTTP/1.1
+Transfer-Encoding: gzip, chunked
+
+2
+ok
+0
+
+"""
+        )
+        with ExpectLog(
+            gen_log, ".*Unsupported Transfer-Encoding gzip, chunked", level=logging.INFO
+        ):
+            start_line, headers, response = self.io_loop.run_sync(
+                lambda: read_stream_body(self.stream)
+            )
+        self.assertEqual(400, start_line.code)
+
+    def test_chunked_request_body_transfer_encoding_and_content_length(self):
+        # Transfer-encoding and content-length are mutually exclusive
+        self.stream.write(
+            b"""\
+POST /echo HTTP/1.1
+Transfer-Encoding: chunked
+Content-Length: 2
+
+2
+ok
+0
+
+"""
+        )
+        with ExpectLog(
+            gen_log,
+            ".*Message with both Transfer-Encoding and Content-Length",
+            level=logging.INFO,
+        ):
+            start_line, headers, response = self.io_loop.run_sync(
+                lambda: read_stream_body(self.stream)
+            )
+        self.assertEqual(400, start_line.code)
+
     @gen_test
     def test_invalid_content_length(self):
         # HTTP only allows decimal digits in content-length. Make sure we don't
diff --git a/tornado/test/simple_httpclient_test.py b/tornado/test/simple_httpclient_test.py
index 62bd4830c8..593f81fd34 100644
--- a/tornado/test/simple_httpclient_test.py
+++ b/tornado/test/simple_httpclient_test.py
@@ -828,7 +828,7 @@ def test_chunked_with_content_length(self):
         with ExpectLog(
             gen_log,
             (
-                "Malformed HTTP message from None: Response "
+                "Malformed HTTP message from None: Message "
                 "with both Transfer-Encoding and Content-Length"
             ),
             level=logging.INFO,

From 8d721a877dd5c2bc0693d9c4d3954eb11fbd404b Mon Sep 17 00:00:00 2001
From: Ben Darnell
Date: Wed, 5 Jun 2024 16:50:37 -0400
Subject: [PATCH 2/2] httputil: Only strip tabs and spaces from header values

The RFC specifies that only tabs and spaces should be stripped.
Removing additional whitespace characters can lead to framing errors
with certain proxies.
---
 tornado/httputil.py           |  7 +++++--
 tornado/test/httputil_test.py | 19 +++++++++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/tornado/httputil.py b/tornado/httputil.py
index b21d8046c4..9ce992d82b 100644
--- a/tornado/httputil.py
+++ b/tornado/httputil.py
@@ -62,6 +62,9 @@
     from asyncio import Future  # noqa: F401
     import unittest  # noqa: F401
 
+# To be used with str.strip() and related methods.
+HTTP_WHITESPACE = " \t"
+
 
 @lru_cache(1000)
 def _normalize_header(name: str) -> str:
@@ -171,7 +174,7 @@ def parse_line(self, line: str) -> None:
             # continuation of a multi-line header
             if self._last_key is None:
                 raise HTTPInputError("first header line cannot start with whitespace")
-            new_part = " " + line.lstrip()
+            new_part = " " + line.lstrip(HTTP_WHITESPACE)
             self._as_list[self._last_key][-1] += new_part
             self._dict[self._last_key] += new_part
         else:
@@ -179,7 +182,7 @@ def parse_line(self, line: str) -> None:
                 name, value = line.split(":", 1)
             except ValueError:
                 raise HTTPInputError("no colon in header line")
-            self.add(name, value.strip())
+            self.add(name, value.strip(HTTP_WHITESPACE))
 
     @classmethod
     def parse(cls, headers: str) -> "HTTPHeaders":
diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py
index aa9b6ee253..6d618839e0 100644
--- a/tornado/test/httputil_test.py
+++ b/tornado/test/httputil_test.py
@@ -334,6 +334,25 @@ def test_unicode_newlines(self):
                     gen_log.warning("failed while trying %r in %s", newline, encoding)
                     raise
 
+    def test_unicode_whitespace(self):
+        # Only tabs and spaces are to be stripped according to the HTTP standard.
+        # Other unicode whitespace is to be left as-is. In the context of headers,
+        # this specifically means the whitespace characters falling within the
+        # latin1 charset.
+        whitespace = [
+            (" ", True),  # SPACE
+            ("\t", True),  # TAB
+            ("\u00a0", False),  # NON-BREAKING SPACE
+            ("\u0085", False),  # NEXT LINE
+        ]
+        for c, stripped in whitespace:
+            headers = HTTPHeaders.parse("Transfer-Encoding: %schunked" % c)
+            if stripped:
+                expected = [("Transfer-Encoding", "chunked")]
+            else:
+                expected = [("Transfer-Encoding", "%schunked" % c)]
+            self.assertEqual(expected, list(headers.get_all()))
+
     def test_optional_cr(self):
         # Both CRLF and LF should be accepted as separators. CR should not be
         # part of the data when followed by LF, but it is a normal char
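
The combined effect of the two patches is easiest to see in a short sketch. This is illustrative only and not part of the patch series: it assumes both patches are applied and pokes at the internal helper added in tornado/http1connection.py above (an implementation detail, not public API).

    # Sketch: expected behavior after the patches are applied.
    from tornado.httputil import HTTPHeaders, HTTPInputError
    from tornado.http1connection import is_transfer_encoding_chunked

    # A plain chunked message is still recognized.
    assert is_transfer_encoding_chunked(HTTPHeaders({"Transfer-Encoding": "chunked"}))

    # No Transfer-Encoding header at all: not chunked, and no error.
    assert not is_transfer_encoding_chunked(HTTPHeaders())

    # Values that were previously ignored (falling back to read-until-close) now
    # raise, as does combining Transfer-Encoding with Content-Length; the server
    # side surfaces these as 400 responses, as exercised in the tests above.
    for bad in (
        HTTPHeaders({"Transfer-Encoding": "gzip, chunked"}),
        HTTPHeaders({"Transfer-Encoding": "chunked", "Content-Length": "2"}),
    ):
        try:
            is_transfer_encoding_chunked(bad)
        except HTTPInputError:
            pass  # expected
        else:
            raise AssertionError("expected HTTPInputError")

    # Patch 2: only space and tab are stripped from header values; other
    # latin1 whitespace (e.g. U+00A0) is preserved verbatim.
    h = HTTPHeaders.parse("Transfer-Encoding: \u00a0chunked")
    assert h["Transfer-Encoding"] == "\u00a0chunked"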