From 2e7e5c199ef1f0e579c4e1039f55506cf0b0dddd Mon Sep 17 00:00:00 2001 From: "Paul J. Dorn" Date: Thu, 27 Mar 2025 14:51:14 +0100 Subject: [PATCH 1/3] unescaped latin-1 permitted in URI, but no ctrls * reject ascii controls * reject ascii <>{}`^|\ * permit latin-1, including nbsp+shy * permit " (quotation mark) --- gunicorn/http/message.py | 33 +++++++++++++++++++++++++ tests/requests/invalid/nonascii_05.http | 4 +++ tests/requests/invalid/nonascii_05.py | 3 +++ tests/requests/valid/041.http | 5 ++++ tests/requests/valid/041.py | 10 ++++++++ tests/requests/valid/042.http | 3 +++ tests/requests/valid/042.py | 9 +++++++ 7 files changed, 67 insertions(+) create mode 100644 tests/requests/invalid/nonascii_05.http create mode 100644 tests/requests/invalid/nonascii_05.py create mode 100644 tests/requests/valid/041.http create mode 100644 tests/requests/valid/041.py create mode 100644 tests/requests/valid/042.http create mode 100644 tests/requests/valid/042.py diff --git a/gunicorn/http/message.py b/gunicorn/http/message.py index 59ce0bf4b..aa047dd2f 100644 --- a/gunicorn/http/message.py +++ b/gunicorn/http/message.py @@ -29,6 +29,35 @@ VERSION_RE = re.compile(r"HTTP/(\d)\.(\d)") RFC9110_5_5_INVALID_AND_DANGEROUS = re.compile(r"[\0\r\n]") +RFC3986_2_URI_SPECIALS = ( + # gen-delims + ":/?#[]@" + # sub-delims + "!$&'()*+,;=" + # for unreserved + "-._~" + # for pct-encoded + "%" + # notably absent from this list (must be pct-encoded): + # \N{SPACE} + # <> and {} + # ` a.k.a \N{GRAVE ACCENT} + # ^ a.k.a \N{CIRCUMFLEX ACCENT} + # | a.k.a \N{VERTICAL LINE} + # backslash a.k.a \N{REVERSE SOLIDUS} +) +GUNICORN_NONSTANDARD_URI_CHARACTERS = ( + "\N{QUOTATION MARK}" + # used in tests/requests/valid/027.http (utf8 decoded as latin-1) + # "\N{LATIN CAPITAL LETTER A WITH TILDE}" + # "\N{NO-BREAK SPACE}" + # includes the above - all latin-1 upper bits + # also includes "\N{SOFT HYPHEN}" + + bytes(range(0xA0, 0xff + 1)).decode("latin-1") +) +GUNICORN_URI_SPECIALS = RFC3986_2_URI_SPECIALS + GUNICORN_NONSTANDARD_URI_CHARACTERS +URI_CHARACTERS_RE = re.compile(r"[%s0-9a-zA-Z]+" % (re.escape(GUNICORN_URI_SPECIALS))) + class Message: def __init__(self, cfg, unreader, peer_addr): @@ -425,6 +454,7 @@ def parse_request_line(self, line_bytes): if self.cfg.casefold_http_method: self.method = self.method.upper() + # https://datatracker.ietf.org/doc/html/rfc9112#section-3.2 # URI self.uri = bits[1] @@ -438,6 +468,9 @@ def parse_request_line(self, line_bytes): # => manually reject one always invalid URI: empty if len(self.uri) == 0: raise InvalidRequestLine(bytes_to_str(line_bytes)) + # => reject URI exceeding characters listed in RFC 3986 + if not URI_CHARACTERS_RE.fullmatch(self.uri): + raise InvalidRequestLine(bytes_to_str(line_bytes)) try: parts = split_request_uri(self.uri) diff --git a/tests/requests/invalid/nonascii_05.http b/tests/requests/invalid/nonascii_05.http new file mode 100644 index 000000000..ee2dca3b1 --- /dev/null +++ b/tests/requests/invalid/nonascii_05.http @@ -0,0 +1,4 @@ +GET /one\0/two HTTP/1.1\r\n +Content-Length: 3\r\n +\r\n +WOW diff --git a/tests/requests/invalid/nonascii_05.py b/tests/requests/invalid/nonascii_05.py new file mode 100644 index 000000000..98f43e009 --- /dev/null +++ b/tests/requests/invalid/nonascii_05.py @@ -0,0 +1,3 @@ +from gunicorn.http.errors import InvalidRequestLine + +request = InvalidRequestLine diff --git a/tests/requests/valid/041.http b/tests/requests/valid/041.http new file mode 100644 index 000000000..b2b6c0bd1 --- /dev/null +++ b/tests/requests/valid/041.http @@ -0,0 +1,5 @@ +GET scheme+ext://user+ext:password!@[::1]:8000/path?query#frag HTTP/1.1\r\n +Host: localhost\r\n +CONTENT-LENGTH: 3\r\n +\r\n +odd diff --git a/tests/requests/valid/041.py b/tests/requests/valid/041.py new file mode 100644 index 000000000..27b2d09e6 --- /dev/null +++ b/tests/requests/valid/041.py @@ -0,0 +1,10 @@ +request = { + "method": "GET", + "uri": uri("scheme+ext://user+ext:password!@[::1]:8000/path?query#frag"), + "version": (1, 1), + "headers": [ + ("HOST", "localhost"), + ("CONTENT-LENGTH", "3"), + ], + "body": b'odd' +} diff --git a/tests/requests/valid/042.http b/tests/requests/valid/042.http new file mode 100644 index 000000000..501efb78d --- /dev/null +++ b/tests/requests/valid/042.http @@ -0,0 +1,3 @@ +OPTIONS * HTTP/1.1\r\n +Content-Length: 0\r\n +\r\n diff --git a/tests/requests/valid/042.py b/tests/requests/valid/042.py new file mode 100644 index 000000000..8850e1036 --- /dev/null +++ b/tests/requests/valid/042.py @@ -0,0 +1,9 @@ +request = { + "method": "OPTIONS", + "uri": uri("*"), + "version": (1, 1), + "headers": [ + ("CONTENT-LENGTH", "0"), + ], + "body": b'' +} From f1c72c8758a9d99903e7ed1b3dce3aadc0b46614 Mon Sep 17 00:00:00 2001 From: "Paul J. Dorn" Date: Thu, 27 Mar 2025 18:53:21 +0100 Subject: [PATCH 2/3] relax URI character set: pipe --- gunicorn/http/message.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gunicorn/http/message.py b/gunicorn/http/message.py index aa047dd2f..25c69a2ed 100644 --- a/gunicorn/http/message.py +++ b/gunicorn/http/message.py @@ -48,6 +48,8 @@ ) GUNICORN_NONSTANDARD_URI_CHARACTERS = ( "\N{QUOTATION MARK}" + # firefox and curl do not consider pipe escape-worthy + "\N{VERTICAL LINE}" # used in tests/requests/valid/027.http (utf8 decoded as latin-1) # "\N{LATIN CAPITAL LETTER A WITH TILDE}" # "\N{NO-BREAK SPACE}" From 44491df0211c231d4724c57e3468dcf87d0995b5 Mon Sep 17 00:00:00 2001 From: "Paul J. Dorn" Date: Thu, 27 Mar 2025 22:41:56 +0100 Subject: [PATCH 3/3] relax URI character set: 3+ wide utf-8 --- gunicorn/http/message.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gunicorn/http/message.py b/gunicorn/http/message.py index 25c69a2ed..8b8502cc5 100644 --- a/gunicorn/http/message.py +++ b/gunicorn/http/message.py @@ -53,9 +53,10 @@ # used in tests/requests/valid/027.http (utf8 decoded as latin-1) # "\N{LATIN CAPITAL LETTER A WITH TILDE}" # "\N{NO-BREAK SPACE}" - # includes the above - all latin-1 upper bits + # any with significant bit set - includes the above # also includes "\N{SOFT HYPHEN}" - + bytes(range(0xA0, 0xff + 1)).decode("latin-1") + # simplify this once util.bytes_to_str is deleted + + bytes(range(0x80, 0xff + 1)).decode("latin-1") ) GUNICORN_URI_SPECIALS = RFC3986_2_URI_SPECIALS + GUNICORN_NONSTANDARD_URI_CHARACTERS URI_CHARACTERS_RE = re.compile(r"[%s0-9a-zA-Z]+" % (re.escape(GUNICORN_URI_SPECIALS)))