From 2e7e5c199ef1f0e579c4e1039f55506cf0b0dddd Mon Sep 17 00:00:00 2001
From: "Paul J. Dorn" <pajod@users.noreply.github.com>
Date: Thu, 27 Mar 2025 14:51:14 +0100
Subject: [PATCH 1/3] unescaped latin-1 permitted in URI, but no ctrls

* reject ascii controls
* reject ascii <>{}`^|\
* permit latin-1, including nbsp+shy
* permit " (quotation mark)
---
 gunicorn/http/message.py                | 33 +++++++++++++++++++++++++
 tests/requests/invalid/nonascii_05.http |  4 +++
 tests/requests/invalid/nonascii_05.py   |  3 +++
 tests/requests/valid/041.http           |  5 ++++
 tests/requests/valid/041.py             | 10 ++++++++
 tests/requests/valid/042.http           |  3 +++
 tests/requests/valid/042.py             |  9 +++++++
 7 files changed, 67 insertions(+)
 create mode 100644 tests/requests/invalid/nonascii_05.http
 create mode 100644 tests/requests/invalid/nonascii_05.py
 create mode 100644 tests/requests/valid/041.http
 create mode 100644 tests/requests/valid/041.py
 create mode 100644 tests/requests/valid/042.http
 create mode 100644 tests/requests/valid/042.py

diff --git a/gunicorn/http/message.py b/gunicorn/http/message.py
index 59ce0bf4b..aa047dd2f 100644
--- a/gunicorn/http/message.py
+++ b/gunicorn/http/message.py
@@ -29,6 +29,35 @@
 VERSION_RE = re.compile(r"HTTP/(\d)\.(\d)")
 RFC9110_5_5_INVALID_AND_DANGEROUS = re.compile(r"[\0\r\n]")
 
+RFC3986_2_URI_SPECIALS = (
+    # gen-delims
+    ":/?#[]@"
+    # sub-delims
+    "!$&'()*+,;="
+    # for unreserved
+    "-._~"
+    # for pct-encoded
+    "%"
+    # notably absent from this list (must be pct-encoded):
+    #   \N{SPACE}
+    #   <> and {}
+    #   ` a.k.a \N{GRAVE ACCENT}
+    #   ^ a.k.a \N{CIRCUMFLEX ACCENT}
+    #   | a.k.a \N{VERTICAL LINE}
+    #   backslash a.k.a \N{REVERSE SOLIDUS}
+)
+GUNICORN_NONSTANDARD_URI_CHARACTERS = (
+    "\N{QUOTATION MARK}"
+    # used in tests/requests/valid/027.http (utf8 decoded as latin-1)
+    #   "\N{LATIN CAPITAL LETTER A WITH TILDE}"
+    #   "\N{NO-BREAK SPACE}"
+    # includes the above - all latin-1 upper bits
+    #   also includes "\N{SOFT HYPHEN}"
+    + bytes(range(0xA0, 0xff + 1)).decode("latin-1")
+)
+GUNICORN_URI_SPECIALS = RFC3986_2_URI_SPECIALS + GUNICORN_NONSTANDARD_URI_CHARACTERS
+URI_CHARACTERS_RE = re.compile(r"[%s0-9a-zA-Z]+" % (re.escape(GUNICORN_URI_SPECIALS)))
+
 
 class Message:
     def __init__(self, cfg, unreader, peer_addr):
@@ -425,6 +454,7 @@ def parse_request_line(self, line_bytes):
         if self.cfg.casefold_http_method:
             self.method = self.method.upper()
 
+        # https://datatracker.ietf.org/doc/html/rfc9112#section-3.2
         # URI
         self.uri = bits[1]
 
@@ -438,6 +468,9 @@ def parse_request_line(self, line_bytes):
         # => manually reject one always invalid URI: empty
         if len(self.uri) == 0:
             raise InvalidRequestLine(bytes_to_str(line_bytes))
+        # => reject URI containing characters outside the permitted set
+        if not URI_CHARACTERS_RE.fullmatch(self.uri):
+            raise InvalidRequestLine(bytes_to_str(line_bytes))
 
         try:
             parts = split_request_uri(self.uri)
diff --git a/tests/requests/invalid/nonascii_05.http b/tests/requests/invalid/nonascii_05.http
new file mode 100644
index 000000000..ee2dca3b1
--- /dev/null
+++ b/tests/requests/invalid/nonascii_05.http
@@ -0,0 +1,4 @@
+GET /one\0/two HTTP/1.1\r\n
+Content-Length: 3\r\n
+\r\n
+WOW
diff --git a/tests/requests/invalid/nonascii_05.py b/tests/requests/invalid/nonascii_05.py
new file mode 100644
index 000000000..98f43e009
--- /dev/null
+++ b/tests/requests/invalid/nonascii_05.py
@@ -0,0 +1,3 @@
+from gunicorn.http.errors import InvalidRequestLine
+
+request = InvalidRequestLine
diff --git a/tests/requests/valid/041.http b/tests/requests/valid/041.http
new file mode 100644
index 000000000..b2b6c0bd1
--- /dev/null
+++ b/tests/requests/valid/041.http
@@ -0,0 +1,5 @@
+GET scheme+ext://user+ext:password!@[::1]:8000/path?query#frag HTTP/1.1\r\n
+Host: localhost\r\n
+CONTENT-LENGTH: 3\r\n
+\r\n
+odd
diff --git a/tests/requests/valid/041.py b/tests/requests/valid/041.py
new file mode 100644
index 000000000..27b2d09e6
--- /dev/null
+++ b/tests/requests/valid/041.py
@@ -0,0 +1,10 @@
+request = {
+    "method": "GET",
+    "uri": uri("scheme+ext://user+ext:password!@[::1]:8000/path?query#frag"),
+    "version": (1, 1),
+    "headers": [
+        ("HOST", "localhost"),
+        ("CONTENT-LENGTH", "3"),
+    ],
+    "body": b'odd'
+}
diff --git a/tests/requests/valid/042.http b/tests/requests/valid/042.http
new file mode 100644
index 000000000..501efb78d
--- /dev/null
+++ b/tests/requests/valid/042.http
@@ -0,0 +1,3 @@
+OPTIONS * HTTP/1.1\r\n
+Content-Length: 0\r\n
+\r\n
diff --git a/tests/requests/valid/042.py b/tests/requests/valid/042.py
new file mode 100644
index 000000000..8850e1036
--- /dev/null
+++ b/tests/requests/valid/042.py
@@ -0,0 +1,9 @@
+request = {
+    "method": "OPTIONS",
+    "uri": uri("*"),
+    "version": (1, 1),
+    "headers": [
+        ("CONTENT-LENGTH", "0"),
+    ],
+    "body": b''
+}

From f1c72c8758a9d99903e7ed1b3dce3aadc0b46614 Mon Sep 17 00:00:00 2001
From: "Paul J. Dorn" <pajod@users.noreply.github.com>
Date: Thu, 27 Mar 2025 18:53:21 +0100
Subject: [PATCH 2/3] relax URI character set: pipe

---
 gunicorn/http/message.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gunicorn/http/message.py b/gunicorn/http/message.py
index aa047dd2f..25c69a2ed 100644
--- a/gunicorn/http/message.py
+++ b/gunicorn/http/message.py
@@ -48,6 +48,8 @@
 )
 GUNICORN_NONSTANDARD_URI_CHARACTERS = (
     "\N{QUOTATION MARK}"
+    # firefox and curl do not consider pipe escape-worthy
+    "\N{VERTICAL LINE}"
     # used in tests/requests/valid/027.http (utf8 decoded as latin-1)
     #   "\N{LATIN CAPITAL LETTER A WITH TILDE}"
     #   "\N{NO-BREAK SPACE}"

From 44491df0211c231d4724c57e3468dcf87d0995b5 Mon Sep 17 00:00:00 2001
From: "Paul J. Dorn" <pajod@users.noreply.github.com>
Date: Thu, 27 Mar 2025 22:41:56 +0100
Subject: [PATCH 3/3] relax URI character set: 3+ wide utf-8

---
 gunicorn/http/message.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gunicorn/http/message.py b/gunicorn/http/message.py
index 25c69a2ed..8b8502cc5 100644
--- a/gunicorn/http/message.py
+++ b/gunicorn/http/message.py
@@ -53,9 +53,10 @@
     # used in tests/requests/valid/027.http (utf8 decoded as latin-1)
     #   "\N{LATIN CAPITAL LETTER A WITH TILDE}"
     #   "\N{NO-BREAK SPACE}"
-    # includes the above - all latin-1 upper bits
+    # any byte with the most significant bit set - includes the above
     #   also includes "\N{SOFT HYPHEN}"
-    + bytes(range(0xA0, 0xff + 1)).decode("latin-1")
+    # simplify this once util.bytes_to_str is deleted
+    + bytes(range(0x80, 0xff + 1)).decode("latin-1")
 )
 GUNICORN_URI_SPECIALS = RFC3986_2_URI_SPECIALS + GUNICORN_NONSTANDARD_URI_CHARACTERS
 URI_CHARACTERS_RE = re.compile(r"[%s0-9a-zA-Z]+" % (re.escape(GUNICORN_URI_SPECIALS)))