http: parser: straighten out %00 legality

https://github.com/warmcat/libwebsockets/issues/2262 This adds a README explaining what can be expected if your URLs contain %00, and adds a safe helper for urlargs-by-name that is length-based. Contains fix for extra NUL on some headers https://github.com/warmcat/libwebsockets/issues/2267
2025-03-09 00:00:04 +01:00 · 2021-04-08 15:26:23 +01:00 · 2021-04-08 15:26:23 +01:00 · 24abd699f6
commit 24abd699f6
parent 354b29c747
7 changed files with 162 additions and 26 deletions
--- a/READMEs/README.http_parser.md
+++ b/READMEs/README.http_parser.md
@ -0,0 +1,33 @@
+# Notes on http parser corner cases
+
+## Dealing with %00
+
+%00 is considered illegal in
+
+ - the path part of the URL.  A lot of user code handles it as a NUL terminated string,
+   even though the header get apis are based around length.  So it is disallowed to
+   avoid ambiguity.
+
+ - the name part of a urlarg, like ?name=value
+
+%00 is valid in
+
+ - the value part of a urlarg, like ?name=value
+
+When the parser sees %00 where it is not allowed, it simply drops the connection.
+
+## Note on proper urlarg handling
+
+urlargs are allowed to contain non-NUL terminated binary.  So it is important to
+use the length-based urlarg apis
+
+ - `lws_hdr_copy_fragment()`
+ - `lws_get_urlarg_by_name_safe()`
+
+The non-length based urlarg api
+
+ - `lws_get_urlarg_by_name()`
+
+...is soft-deprecated, it's still allowed but it will be fooled by the first %00
+seen in the argument into truncating the argument.  Use `lws_get_urlarg_by_name_safe()`
+instead.
--- a/include/libwebsockets/lws-http.h
+++ b/include/libwebsockets/lws-http.h
@ -494,8 +494,35 @@ LWS_VISIBLE LWS_EXTERN int
 lws_hdr_custom_copy(struct lws *wsi, char *dst, int len, const char *name,
 		    int nlen);

+/**
+ * lws_get_urlarg_by_name_safe() - get copy and return length of y for x=y urlargs
+ *
+ * \param wsi: the connection to check
+ * \param name: the arg name, like "token" or "token="
+ * \param buf: the buffer to receive the urlarg (including the name= part)
+ * \param len: the length of the buffer to receive the urlarg
+ *
+ * Returns -1 if not present, else the length of y in the urlarg name=y.  If
+ * zero or greater, then buf contains a copy of the string y.  Any = after the
+ * name match is trimmed off if the name does not end with = itself.
+ *
+ * This returns the explicit length and so can deal with binary blobs that are
+ * percent-encoded.  It also makes sure buf has a NUL just after the valid
+ * length so it can work with NUL-based apis if you don't care about truncation.
+ *
+ * buf may have been written even when -1 is returned indicating no match.
+ *
+ * Use this in place of lws_get_urlarg_by_name() that does not return an
+ * explicit length.
+ *
+ * Use lws_get_urlarg_by_name_safe() instead of this, which returns the length.
+ */
+LWS_VISIBLE LWS_EXTERN int
+lws_get_urlarg_by_name_safe(struct lws *wsi, const char *name, char *buf, int len);
+
 /**
 * lws_get_urlarg_by_name() - return pointer to arg value if present
+ *
 * \param wsi: the connection to check
 * \param name: the arg name, like "token="
 * \param buf: the buffer to receive the urlarg (including the name= part)
@ -503,9 +530,16 @@ lws_hdr_custom_copy(struct lws *wsi, char *dst, int len, const char *name,
 *
 *     Returns NULL if not found or a pointer inside buf to just after the
 *     name= part.
+ *
+ * This assumed the argument can be represented with a NUL-terminated string.
+ * It can't correctly deal with binary values encoded with %XX, eg. %00 will
+ * be understood to terminate the string.
+ *
+ * Use lws_get_urlarg_by_name_safe() instead of this, which returns the length.
 */
 LWS_VISIBLE LWS_EXTERN const char *
-lws_get_urlarg_by_name(struct lws *wsi, const char *name, char *buf, int len);
+lws_get_urlarg_by_name(struct lws *wsi, const char *name, char *buf, int len)
+/* LWS_WARN_DEPRECATED */;
 ///@}

 /*! \defgroup HTTP-headers-create HTTP headers: create
--- a/lib/core-net/wsi.c
+++ b/lib/core-net/wsi.c
@ -636,22 +636,49 @@ lws_parse_uri(char *p, const char **prot, const char **ads, int *port,

 /* ... */

+int
+lws_get_urlarg_by_name_safe(struct lws *wsi, const char *name, char *buf, int len)
+{
+	int n = 0, fraglen, sl = (int)strlen(name);
+
+	do {
+		fraglen = lws_hdr_copy_fragment(wsi, buf, len,
+						WSI_TOKEN_HTTP_URI_ARGS, n);
+
+		if (fraglen < 0)
+			break;
+
+		if (fraglen + 1 < len &&
+		    fraglen >= sl &&
+		    !strncmp(buf, name, (size_t)sl)) {
+			/*
+			 * If he left off the trailing =, trim it from the
+			 * result
+			 */
+
+			if (name[sl - 1] != '=' &&
+			    sl < fraglen &&
+			    buf[sl] == '=')
+				sl++;
+
+			memmove(buf, buf + sl, (size_t)(fraglen - sl));
+			buf[fraglen - sl] = '\0';
+
+			return fraglen - sl;
+		}
+
+		n++;
+	} while (1);
+
+	return -1;
+}
+
 const char *
 lws_get_urlarg_by_name(struct lws *wsi, const char *name, char *buf, int len)
 {
-	int n = 0;
-	size_t sl = strlen(name);
+	int n = lws_get_urlarg_by_name_safe(wsi, name, buf, len);

-	while (lws_hdr_copy_fragment(wsi, buf, len,
-			  WSI_TOKEN_HTTP_URI_ARGS, n) >= 0) {
-
-		if (!strncmp(buf, name, sl))
-			return buf + sl;
-
-		n++;
-	}
-
-	return NULL;
+	return n < 0 ? NULL : buf;
 }


--- a/lib/roles/http/parsers.c
+++ b/lib/roles/http/parsers.c
@ -742,8 +742,7 @@ issue_char(struct lws *wsi, unsigned char c)
 	if (!wsi->http.ah->current_token_limit ||
 	    frag_len < wsi->http.ah->current_token_limit) {
 		wsi->http.ah->data[wsi->http.ah->pos++] = (char)c;
-		if (c)
-			wsi->http.ah->frags[wsi->http.ah->nfrag].len++;
+		wsi->http.ah->frags[wsi->http.ah->nfrag].len++;
 		return 0;
 	}

@ -812,14 +811,29 @@ lws_parse_urldecode(struct lws *wsi, uint8_t *_c)
 	 *  leave /.dir or whatever alone
 	 */

+	if (!c && (!ah->frag_index[WSI_TOKEN_HTTP_URI_ARGS] ||
+		   !ah->post_literal_equal)) {
+		/*
+		 * Since user code is typically going to parse the path using
+		 * NUL-terminated apis, it's too dangerous to allow NUL
+		 * injection here.
+		 *
+		 * It's allowed in the urlargs, because the apis to access
+		 * those only allow retreival with explicit length.
+		 */
+		lwsl_warn("%s: saw NUL outside of uri args\n", __func__);
+		return -1;
+	}
+
 	switch (ah->ups) {
 	case URIPS_IDLE:
-		if (!c)
-			return -1;
+
 		/* genuine delimiter */
 		if ((c == '&' || c == ';') && !enc) {
 			if (issue_char(wsi, '\0') < 0)
 				return -1;
+			/* don't account for it */
+			wsi->http.ah->frags[wsi->http.ah->nfrag].len--;
 			/* link to next fragment */
 			ah->frags[ah->nfrag].nfrag = (uint8_t)(ah->nfrag + 1);
 			ah->nfrag++;
@ -924,6 +938,9 @@ lws_parse_urldecode(struct lws *wsi, uint8_t *_c)
 		if (issue_char(wsi, '\0') < 0)
 			return -1;

+		/* don't account for it */
+		wsi->http.ah->frags[wsi->http.ah->nfrag].len--;
+
 		/* move to using WSI_TOKEN_HTTP_URI_ARGS */
 		ah->nfrag++;
 		if (ah->nfrag >= LWS_ARRAY_SIZE(ah->frags))
@ -1055,6 +1072,8 @@ lws_parse(struct lws *wsi, unsigned char *buf, int *len)
 				/* begin parsing HTTP version: */
 				if (issue_char(wsi, '\0') < 0)
 					return LPR_FAIL;
+				/* don't account for it */
+				wsi->http.ah->frags[wsi->http.ah->nfrag].len--;
 				ah->parser_state = WSI_TOKEN_HTTP;
 				goto start_fragment;
 			}
@ -1096,6 +1115,17 @@ check_eol:
 				return LPR_FAIL;
 			if (n > 0)
 				ah->parser_state = WSI_TOKEN_SKIPPING;
+			else {
+				/*
+				 * Explicit zeroes are legal in URI ARGS.
+				 * They can only exist as a safety terminator
+				 * after the valid part of the token contents
+				 * for other types.
+				 */
+				if (!c && ah->parser_state != WSI_TOKEN_HTTP_URI_ARGS)
+					/* don't account for safety terminator */
+					wsi->http.ah->frags[wsi->http.ah->nfrag].len--;
+			}

 swallow:
 			/* per-protocol end of headers management */
--- a/minimal-examples/http-server/minimal-http-server-dynamic/minimal-http-server-dynamic.c
+++ b/minimal-examples/http-server/minimal-http-server-dynamic/minimal-http-server-dynamic.c
@ -74,6 +74,19 @@ callback_dynamic_http(struct lws *wsi, enum lws_callback_reasons reason,
 		lwsl_notice("%s: HTTP: connection %s, path %s\n", __func__,
 				(const char *)buf, pss->path);

+		/*
+		 * Demonstrates how to retreive a urlarg x=value
+		 */
+
+		{
+			char value[100];
+			int z = lws_get_urlarg_by_name_safe(wsi, "x", value,
+					   sizeof(value) - 1);
+
+			if (z >= 0)
+				lwsl_hexdump_notice(value, (size_t)z);
+		}
+
 		/*
 		 * prepare and write http headers... with regards to content-
 		 * length, there are three approaches:
--- a/minimal-examples/http-server/minimal-http-server-form-get/minimal-http-server-form-get.c
+++ b/minimal-examples/http-server/minimal-http-server-form-get/minimal-http-server-form-get.c
@ -29,7 +29,6 @@ callback_http(struct lws *wsi, enum lws_callback_reasons reason, void *user,
 	uint8_t buf[LWS_PRE + LWS_RECOMMENDED_MIN_HEADER_SPACE],
 		*start = &buf[LWS_PRE], *p = start,
 		*end = &buf[sizeof(buf) - 1];
-	const char *val;
 	int n;

 	switch (reason) {
@ -46,13 +45,13 @@ callback_http(struct lws *wsi, enum lws_callback_reasons reason, void *user,
 		/* we just dump the decoded things to the log */

 		for (n = 0; n < (int)LWS_ARRAY_SIZE(param_names); n++) {
-			val = lws_get_urlarg_by_name(wsi, param_names[n],
+			int rv = lws_get_urlarg_by_name_safe(wsi, param_names[n],
 					(char *)buf, sizeof(buf));
-			if (!val)
+			if (rv < 0)
 				lwsl_user("%s: undefined\n", param_names[n]);
 			else
 				lwsl_user("%s: (len %d) '%s'\n", param_names[n],
-					  (int)strlen((const char *)buf),buf);
+					  (int)rv, buf);
 		}

 		/*
--- a/plugins/protocol_lws_mirror.c
+++ b/plugins/protocol_lws_mirror.c
@ -224,12 +224,12 @@ callback_lws_mirror(struct lws *wsi, enum lws_callback_reasons reason,
 		 * mirror instance name... defaults to "", but if URL includes
 		 * "?mirror=xxx", will be "xxx"
 		 */
-		name[0] = '\0';
-		if (!lws_get_urlarg_by_name(wsi, "mirror", name,
-					   sizeof(name) - 1))
+
+		if (lws_get_urlarg_by_name_safe(wsi, "mirror", name,
+					        sizeof(name) - 1) < 0) {
 			lwsl_debug("get urlarg failed\n");
-		if (strchr(name, '='))
-			pn = strchr(name, '=') + 1;
+			name[0] = '\0';
+		}

 		//lwsl_notice("%s: mirror name '%s'\n", __func__, pn);