1
0
Fork 0
mirror of https://github.com/warmcat/libwebsockets.git synced 2025-03-09 00:00:04 +01:00

lws_tokenize

This commit is contained in:
Andy Green 2018-10-09 10:29:42 +08:00
parent 47e10ab200
commit 6cd80f9fc7
9 changed files with 896 additions and 27 deletions

View file

@ -408,6 +408,7 @@ struct lws;
#include <libwebsockets/lws-lejp.h>
#include <libwebsockets/lws-stats.h>
#include <libwebsockets/lws-threadpool.h>
#include <libwebsockets/lws-tokenize.h>
#if defined(LWS_WITH_TLS)

View file

@ -0,0 +1,130 @@
/*
* libwebsockets - small server side websockets and web server implementation
*
* Copyright (C) 2010-2018 Andy Green <andy@warmcat.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation:
* version 2.1 of the License.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301 USA
*
* included from libwebsockets.h
*/
/* Do not treat - as a terminal character */
#define LWS_TOKENIZE_F_MINUS_NONTERM (1 << 0)
/* Separately report aggregate colon-delimited tokens */
#define LWS_TOKENIZE_F_AGG_COLON (1 << 1)
/* Enforce sequencing for a simple token , token , token ... list */
#define LWS_TOKENIZE_F_COMMA_SEP_LIST (1 << 2)
/* Allow more characters in the tokens and fewer delimiters... default is
 * only alphanumeric + underscore in tokens */
#define LWS_TOKENIZE_F_RFC7230_DELIMS (1 << 3)
/*
 * Result codes returned by lws_tokenize(): errors are < 0, end-of-text is 0,
 * and successful results have ordinal 1+.
 */
typedef enum {
	LWS_TOKZE_ERRS = 5, /* the number of errors defined */

	LWS_TOKZE_ERR_BROKEN_UTF8 = -5,		/* malformed or partial utf8 */
	LWS_TOKZE_ERR_UNTERM_STRING = -4,	/* ended while we were in "" */
	LWS_TOKZE_ERR_MALFORMED_FLOAT = -3,	/* like 0..1 or 0.1.1 */
	LWS_TOKZE_ERR_NUM_ON_LHS = -2,		/* like 123= or 0.1= */
	LWS_TOKZE_ERR_COMMA_LIST = -1,		/* like ",tok", or, "tok,," */

	LWS_TOKZE_ENDED = 0,		/* no more content */

	/* Note: results have ordinal 1+, EOT is 0 and errors are < 0 */

	LWS_TOKZE_DELIMITER,		/* a delimiter appeared */
	LWS_TOKZE_TOKEN,		/* a token appeared */
	LWS_TOKZE_INTEGER,		/* an integer appeared */
	LWS_TOKZE_FLOAT,		/* a float appeared */
	LWS_TOKZE_TOKEN_NAME_EQUALS,	/* token [whitespace] = */
	LWS_TOKZE_TOKEN_NAME_COLON,	/* token [whitespace] : (only with
					 * LWS_TOKENIZE_F_AGG_COLON flag) */
	LWS_TOKZE_QUOTED_STRING,	/* "*", where * may have any char */
} lws_tokenize_elem;
/*
 * Delimiter-sequencing states used by lws_tokenize() when the
 * LWS_TOKENIZE_F_COMMA_SEP_LIST flag is given, eg, to disallow
 * "token,,token", "token,", and ",token"
 */
enum lws_tokenize_delimiter_tracking {
	LWSTZ_DT_NEED_FIRST_CONTENT,	/* nothing seen yet: content must come first */
	LWSTZ_DT_NEED_DELIM,		/* just saw content: a delimiter must follow */
	LWSTZ_DT_NEED_NEXT_CONTENT,	/* just saw a delimiter: content must follow */
};
struct lws_tokenize {
	const char *start;	/**< set to the start of the string to tokenize */
	const char *token;	/**< the start of an identified token or delimiter */
	int len;		/**< set to the length of the string to tokenize */
	int token_len;		/**< the length of the identified token or delimiter */

	int flags;		/**< optional LWS_TOKENIZE_F_ flags, or 0 */

	int delim;		/**< lws_tokenize_delimiter_tracking state, used
				 *   internally for LWS_TOKENIZE_F_COMMA_SEP_LIST;
				 *   set by lws_tokenize_init() */
};
/**
 * lws_tokenize_init() - prepares an lws_tokenize struct for tokenizing a string
*
* \param ts: the lws_tokenize struct to init
* \param start: the string to tokenize
* \param flags: LWS_TOKENIZE_F_ option flags
*
* This initializes the tokenize struct to point to the given string, and
* sets the length to 2GiB - 1 (so there must be a terminating NUL)... you can
* override this requirement by setting ts.len yourself before using it.
*
* .delim is also initialized to LWSTZ_DT_NEED_FIRST_CONTENT.
*/
LWS_VISIBLE LWS_EXTERN void
lws_tokenize_init(struct lws_tokenize *ts, const char *start, int flags);
/**
* lws_tokenize() - breaks down a string into tokens and delimiters in-place
*
* \param ts: the lws_tokenize struct with information and state on what to do
*
* The \p ts struct should have its start, len and flags members initialized to
* reflect the string to be tokenized and any options.
*
* Then `lws_tokenize()` may be called repeatedly on the struct, returning one
* of `lws_tokenize_elem` each time, and with the struct's `token` and
* `token_len` members set to describe the content of the delimiter or token
* payload each time.
*
* There are no allocations during the process.
*
* returns lws_tokenize_elem that was identified (LWS_TOKZE_ENDED means reached
* the end of the string).
*/
LWS_VISIBLE LWS_EXTERN lws_tokenize_elem
lws_tokenize(struct lws_tokenize *ts);
/**
* lws_tokenize_cstr() - copy token string to NUL-terminated buffer
*
* \param ts: pointer to lws_tokenize struct to operate on
* \param str: destination buffer
 * \param max: bytes in destination buffer
*
* returns 0 if OK or nonzero if the string + NUL won't fit.
*/
LWS_VISIBLE LWS_EXTERN int
lws_tokenize_cstr(struct lws_tokenize *ts, char *str, int max);

View file

@ -2358,36 +2358,60 @@ __lws_rx_flow_control(struct lws *wsi)
return 0;
}
/*
 * Classification table for the utf-8 byte validator in lws_check_byte_utf8().
 *
 * Entries 0..20 are indexed by (leading byte - 0xe0) for multibyte leading
 * bytes 0xe0..0xf4; entries 21..23 are the continuation states s0/s2/s3.
 *
 * Each entry packs three fields used by lws_check_byte_utf8():
 *  - mask 0xf0: lowest acceptable value for the next continuation byte
 *  - mask 0x0c: ((width - 1) << 2); scales the size of the acceptable
 *    continuation range (checker allows 0x10 + ((s << 2) & 0x30) values)
 *  - mask 0x03: index (offset 21 in this table) of the state to move to
 *    after the continuation byte is accepted; state 0 means "complete"
 */
static const unsigned char e0f4[] = {
	0xa0 | ((2 - 1) << 2) | 1, /* e0 */
	0x80 | ((4 - 1) << 2) | 1, /* e1 */
	0x80 | ((4 - 1) << 2) | 1, /* e2 */
	0x80 | ((4 - 1) << 2) | 1, /* e3 */
	0x80 | ((4 - 1) << 2) | 1, /* e4 */
	0x80 | ((4 - 1) << 2) | 1, /* e5 */
	0x80 | ((4 - 1) << 2) | 1, /* e6 */
	0x80 | ((4 - 1) << 2) | 1, /* e7 */
	0x80 | ((4 - 1) << 2) | 1, /* e8 */
	0x80 | ((4 - 1) << 2) | 1, /* e9 */
	0x80 | ((4 - 1) << 2) | 1, /* ea */
	0x80 | ((4 - 1) << 2) | 1, /* eb */
	0x80 | ((4 - 1) << 2) | 1, /* ec */
	0x80 | ((2 - 1) << 2) | 1, /* ed */
	0x80 | ((4 - 1) << 2) | 1, /* ee */
	0x80 | ((4 - 1) << 2) | 1, /* ef */
	0x90 | ((3 - 1) << 2) | 2, /* f0 */
	0x80 | ((4 - 1) << 2) | 2, /* f1 */
	0x80 | ((4 - 1) << 2) | 2, /* f2 */
	0x80 | ((4 - 1) << 2) | 2, /* f3 */
	0x80 | ((1 - 1) << 2) | 2, /* f4 */

	0,			   /* s0 */
	0x80 | ((4 - 1) << 2) | 0, /* s2 */
	0x80 | ((4 - 1) << 2) | 1, /* s3 */
};
/*
 * lws_check_byte_utf8() - validate one byte of a utf-8 sequence
 *
 * \param state: 0 to start a new character, else the value returned by the
 *		 previous call for this character
 * \param c: the next byte to validate
 *
 * Returns -1 if the byte is illegal at this point, 0 if the character is
 * complete and valid, or a positive state to pass in with the next byte.
 */
LWS_EXTERN int
lws_check_byte_utf8(unsigned char state, unsigned char c)
{
	unsigned char s = state;

	if (!s) {
		/* start of a new character */
		if (c >= 0x80) {
			/* multibyte: leading byte must be 0xc2 .. 0xf4 */
			if (c < 0xc2 || c > 0xf4)
				return -1;
			if (c < 0xe0)
				/* two-byte char: one continuation byte left */
				return 0x80 | ((4 - 1) << 2);
			else
				/* three / four-byte: look up the constraints */
				return e0f4[c - 0xe0];
		}

		/* single-byte ASCII: already complete (s is 0) */
		return s;
	}

	/* continuation byte must lie in the range the state encodes */
	if (c < (s & 0xf0) || c >= (s & 0xf0) + 0x10 + ((s << 2) & 0x30))
		return -1;

	/* advance to the next state; 0 means the character completed */
	return e0f4[21 + (s & 3)];
}
LWS_EXTERN int
lws_check_utf8(unsigned char *state, unsigned char *buf, size_t len)
{
static const unsigned char e0f4[] = {
0xa0 | ((2 - 1) << 2) | 1, /* e0 */
0x80 | ((4 - 1) << 2) | 1, /* e1 */
0x80 | ((4 - 1) << 2) | 1, /* e2 */
0x80 | ((4 - 1) << 2) | 1, /* e3 */
0x80 | ((4 - 1) << 2) | 1, /* e4 */
0x80 | ((4 - 1) << 2) | 1, /* e5 */
0x80 | ((4 - 1) << 2) | 1, /* e6 */
0x80 | ((4 - 1) << 2) | 1, /* e7 */
0x80 | ((4 - 1) << 2) | 1, /* e8 */
0x80 | ((4 - 1) << 2) | 1, /* e9 */
0x80 | ((4 - 1) << 2) | 1, /* ea */
0x80 | ((4 - 1) << 2) | 1, /* eb */
0x80 | ((4 - 1) << 2) | 1, /* ec */
0x80 | ((2 - 1) << 2) | 1, /* ed */
0x80 | ((4 - 1) << 2) | 1, /* ee */
0x80 | ((4 - 1) << 2) | 1, /* ef */
0x90 | ((3 - 1) << 2) | 2, /* f0 */
0x80 | ((4 - 1) << 2) | 2, /* f1 */
0x80 | ((4 - 1) << 2) | 2, /* f2 */
0x80 | ((4 - 1) << 2) | 2, /* f3 */
0x80 | ((1 - 1) << 2) | 2, /* f4 */
0, /* s0 */
0x80 | ((4 - 1) << 2) | 0, /* s2 */
0x80 | ((4 - 1) << 2) | 1, /* s3 */
};
unsigned char s = *state;
while (len--) {
@ -2985,6 +3009,245 @@ lws_strncpy(char *dest, const char *src, size_t size)
return dest;
}
typedef enum {
LWS_TOKZS_LEADING_WHITESPACE,
LWS_TOKZS_QUOTED_STRING,
LWS_TOKZS_TOKEN,
LWS_TOKZS_TOKEN_POST_TERMINAL
} lws_tokenize_state;
/*
 * lws_tokenize() - report the next token, delimiter, end or error from ts
 *
 * Consumes bytes from ts->start (up to ts->len), validating utf-8 as it
 * goes, and returns one lws_tokenize_elem describing what was found;
 * ts->token / ts->token_len describe the payload where one exists.
 *
 * NOTE(review): defined here returning int, but the header declares it
 * returning lws_tokenize_elem - confirm the two should be aligned.
 */
int
lws_tokenize(struct lws_tokenize *ts)
{
	const char *rfc7230_delims = "(),/:;<=>?@[\\]{}";
	lws_tokenize_state state = LWS_TOKZS_LEADING_WHITESPACE;
	char c, num = -1, flo = 0;	/* num: -1 undecided, 1 numeric, 0 not */
	int utf8 = 0;			/* utf-8 validator state, 0 = complete */

	ts->token = NULL;
	ts->token_len = 0;

	while (ts->len) {
		c = *ts->start++;
		ts->len--;

		/* each byte must remain legal utf-8 */
		utf8 = lws_check_byte_utf8((unsigned char)utf8, c);
		if (utf8 < 0)
			return LWS_TOKZE_ERR_BROKEN_UTF8;

		lwsl_debug("%s: %c (%d) %d\n", __func__, c, state, (int)ts->len);

		if (!c)
			break;

		/* whitespace */

		if (c == ' ' || c == '\t' || c == '\n' || c == '\r' ||
		    c == '\f') {
			switch (state) {
			case LWS_TOKZS_LEADING_WHITESPACE:
			case LWS_TOKZS_TOKEN_POST_TERMINAL:
				continue;
			case LWS_TOKZS_QUOTED_STRING:
				/* whitespace is literal inside quotes */
				ts->token_len++;
				continue;
			case LWS_TOKZS_TOKEN:
				/* we want to scan forward to look for = */
				state = LWS_TOKZS_TOKEN_POST_TERMINAL;
				continue;
			}
		}

		/* quoted string */

		if (c == '\"') {
			if (state == LWS_TOKZS_QUOTED_STRING)
				/* closing quote: report the contents */
				return LWS_TOKZE_QUOTED_STRING;

			/* starting a quoted string */

			if (ts->flags & LWS_TOKENIZE_F_COMMA_SEP_LIST) {
				if (ts->delim == LWSTZ_DT_NEED_DELIM)
					return LWS_TOKZE_ERR_COMMA_LIST;
				ts->delim = LWSTZ_DT_NEED_DELIM;
			}

			state = LWS_TOKZS_QUOTED_STRING;
			ts->token = ts->start;
			ts->token_len = 0;

			continue;
		}

		/* token= aggregation */

		if (c == '=' && (state == LWS_TOKZS_TOKEN_POST_TERMINAL ||
				 state == LWS_TOKZS_TOKEN)) {
			if (num == 1)
				/* a pure number is not a legal lvalue name */
				return LWS_TOKZE_ERR_NUM_ON_LHS;
			/* swallow the = */
			return LWS_TOKZE_TOKEN_NAME_EQUALS;
		}

		/* optional token: aggregation */

		if ((ts->flags & LWS_TOKENIZE_F_AGG_COLON) && c == ':' &&
		    (state == LWS_TOKZS_TOKEN_POST_TERMINAL ||
		     state == LWS_TOKZS_TOKEN))
			/* swallow the : */
			return LWS_TOKZE_TOKEN_NAME_COLON;

		/* aggregate . in a number as a float */

		if (c == '.' && state == LWS_TOKZS_TOKEN && num == 1) {
			if (flo)
				/* second . in one number, eg, 0.1.1 */
				return LWS_TOKZE_ERR_MALFORMED_FLOAT;
			flo = 1;
			ts->token_len++;
			continue;
		}

		/*
		 * Delimiter... by default anything that:
		 *
		 *  - isn't matched earlier, or
		 *  - isn't [A-Z, a-z, 0-9, _], and
		 *  - is not a partial utf8 char
		 *
		 * is a "delimiter", it marks the end of a token and is itself
		 * reported as a single LWS_TOKZE_DELIMITER each time.
		 *
		 * However with LWS_TOKENIZE_F_RFC7230_DELIMS flag, tokens may
		 * contain any noncontrol character that isn't defined in
		 * rfc7230_delims, and only characters listed there are treated
		 * as delimiters.
		 */

		if (!utf8 &&
		    ((ts->flags & LWS_TOKENIZE_F_RFC7230_DELIMS &&
		     strchr(rfc7230_delims, c) && c > 32) ||
		    ((!(ts->flags & LWS_TOKENIZE_F_RFC7230_DELIMS) &&
		     (c < '0' || c > '9') && (c < 'A' || c > 'Z') &&
		     (c < 'a' || c > 'z') && c != '_') && !(c == '-' &&
		     (ts->flags & LWS_TOKENIZE_F_MINUS_NONTERM))) ||
		    (c == '-' && !(ts->flags & LWS_TOKENIZE_F_MINUS_NONTERM))
		    )) {
			switch (state) {
			case LWS_TOKZS_LEADING_WHITESPACE:
				if (ts->flags & LWS_TOKENIZE_F_COMMA_SEP_LIST) {
					/* only , is legal between list items */
					if (c != ',' ||
					    ts->delim != LWSTZ_DT_NEED_DELIM)
						return LWS_TOKZE_ERR_COMMA_LIST;
					ts->delim = LWSTZ_DT_NEED_NEXT_CONTENT;
				}

				ts->token = ts->start - 1;
				ts->token_len = 1;
				return LWS_TOKZE_DELIMITER;

			case LWS_TOKZS_QUOTED_STRING:
				/* delimiters are literal inside quotes */
				ts->token_len++;
				continue;

			case LWS_TOKZS_TOKEN_POST_TERMINAL:
			case LWS_TOKZS_TOKEN:
				/* report the delimiter next time */
				ts->start--;
				ts->len++;
				goto token_or_numeric;
			}
		}

		/* anything that's not whitespace or delimiter is payload */

		switch (state) {
		case LWS_TOKZS_LEADING_WHITESPACE:

			if (ts->flags & LWS_TOKENIZE_F_COMMA_SEP_LIST) {
				if (ts->delim == LWSTZ_DT_NEED_DELIM)
					return LWS_TOKZE_ERR_COMMA_LIST;
				ts->delim = LWSTZ_DT_NEED_DELIM;
			}

			state = LWS_TOKZS_TOKEN;
			ts->token = ts->start - 1;
			ts->token_len = 1;
			/* first char decides whether this may be numeric */
			if (c < '0' || c > '9')
				num = 0;
			else
				if (num < 0)
					num = 1;
			continue;
		case LWS_TOKZS_QUOTED_STRING:
		case LWS_TOKZS_TOKEN:
			/* any non-digit disqualifies the token as numeric */
			if (c < '0' || c > '9')
				num = 0;
			else
				if (num < 0)
					num = 1;
			ts->token_len++;
			continue;
		case LWS_TOKZS_TOKEN_POST_TERMINAL:
			/* report the new token next time */
			ts->start--;
			ts->len++;
			goto token_or_numeric;
		}
	}

	/* we ran out of content */

	if (utf8) /* ended partway through a multibyte char */
		return LWS_TOKZE_ERR_BROKEN_UTF8;

	if (state == LWS_TOKZS_QUOTED_STRING)
		return LWS_TOKZE_ERR_UNTERM_STRING;

	if (state != LWS_TOKZS_TOKEN_POST_TERMINAL &&
	    state != LWS_TOKZS_TOKEN) {
		if ((ts->flags & LWS_TOKENIZE_F_COMMA_SEP_LIST) &&
		    ts->delim == LWSTZ_DT_NEED_NEXT_CONTENT)
			/* trailing comma with nothing after it */
			return LWS_TOKZE_ERR_COMMA_LIST;

		return LWS_TOKZE_ENDED;
	}

	/* report the pending token */

token_or_numeric:

	if (num != 1)
		return LWS_TOKZE_TOKEN;
	if (flo)
		return LWS_TOKZE_FLOAT;

	return LWS_TOKZE_INTEGER;
}
/*
 * lws_tokenize_cstr() - copy the current token to a NUL-terminated buffer
 *
 * \param ts: tokenizer whose ->token / ->token_len describe the last result
 * \param str: destination buffer
 * \param max: usable bytes in the destination buffer
 *
 * Returns 0 if OK, or nonzero if the token plus its terminating NUL will
 * not fit in \p max bytes (the buffer is left untouched in that case).
 */
LWS_VISIBLE LWS_EXTERN int
lws_tokenize_cstr(struct lws_tokenize *ts, char *str, int max)
{
	/*
	 * token_len bytes plus the NUL need exactly token_len + 1 bytes;
	 * the previous ">=" test wrongly rejected the exact-fit case
	 * (token_len + 1 == max) even though it satisfies the contract
	 */
	if (ts->token_len + 1 > max)
		return 1;

	memcpy(str, ts->token, ts->token_len);
	str[ts->token_len] = '\0';

	return 0;
}
LWS_VISIBLE LWS_EXTERN void
lws_tokenize_init(struct lws_tokenize *ts, const char *start, int flags)
{
ts->start = start;
ts->len = 0x7fffffff;
ts->flags = flags;
ts->delim = LWSTZ_DT_NEED_FIRST_CONTENT;
}
#if LWS_MAX_SMP > 1
void

View file

@ -1449,6 +1449,8 @@ lws_plat_inet_ntop(int af, const void *src, char *dst, int cnt);
LWS_EXTERN int LWS_WARN_UNUSED_RESULT
lws_plat_inet_pton(int af, const char *src, void *dst);
LWS_EXTERN int
lws_check_byte_utf8(unsigned char state, unsigned char c);
LWS_EXTERN int LWS_WARN_UNUSED_RESULT
lws_check_utf8(unsigned char *state, unsigned char *buf, size_t len);
LWS_EXTERN int alloc_file(struct lws_context *context, const char *filename, uint8_t **buf,

View file

@ -501,6 +501,10 @@ LWS_VISIBLE int lws_hdr_copy(struct lws *wsi, char *dst, int len,
int toklen = lws_hdr_total_length(wsi, h);
int n;
*dst = '\0';
if (!toklen)
return 0;
if (toklen >= len)
return -1;

View file

@ -0,0 +1,73 @@
cmake_minimum_required(VERSION 2.8)
include(CheckCSourceCompiles)
set(SAMP lws-api-test-lws_tokenize)
set(SRCS main.c)
# If we are being built as part of lws, confirm current build config supports
# reqconfig, else skip building ourselves.
#
# If we are being built externally, confirm installed lws was configured to
# support reqconfig, else error out with a helpful message about the problem.
#
#
# require_lws_config(reqconfig, _val, result): when building in-tree with
# LWS_WITH_MINIMAL_EXAMPLES, sets ${result} to 0 and skips this sample if
# the lws build option ${reqconfig} does not have the value ${_val}; when
# building standalone, compile-tests the installed lws headers instead and
# errors out with an explanation if they are incompatible.
#
MACRO(require_lws_config reqconfig _val result)

	# normalize the requested config option to rq = 0 / 1
	if (DEFINED ${reqconfig})
		if (${reqconfig})
			set (rq 1)
		else()
			set (rq 0)
		endif()
	else()
		set(rq 0)
	endif()

	# SAME = 1 when the current build config matches what we require
	if (${_val} EQUAL ${rq})
		set(SAME 1)
	else()
		set(SAME 0)
	endif()

	if (LWS_WITH_MINIMAL_EXAMPLES AND NOT ${SAME})
		# in-tree build with an incompatible config: skip quietly
		if (${_val})
			message("${SAMP}: skipping as lws being built without ${reqconfig}")
		else()
			message("${SAMP}: skipping as lws built with ${reqconfig}")
		endif()
		set(${result} 0)
	else()
		if (LWS_WITH_MINIMAL_EXAMPLES)
			set(MET ${SAME})
		else()
			# standalone build: probe the installed lws headers
			CHECK_C_SOURCE_COMPILES("#include <libwebsockets.h>\nint main(void) {\n#if defined(${reqconfig})\n return 0;\n#else\n fail;\n#endif\n return 0;\n}\n" HAS_${reqconfig})
			if (NOT DEFINED HAS_${reqconfig} OR NOT HAS_${reqconfig})
				set(HAS_${reqconfig} 0)
			else()
				set(HAS_${reqconfig} 1)
			endif()
			if ((HAS_${reqconfig} AND ${_val}) OR (NOT HAS_${reqconfig} AND NOT ${_val}))
				set(MET 1)
			else()
				set(MET 0)
			endif()
		endif()
		if (NOT MET)
			# standalone and incompatible: this is a hard error
			if (${_val})
				message(FATAL_ERROR "This project requires lws must have been configured with ${reqconfig}")
			else()
				message(FATAL_ERROR "Lws configuration of ${reqconfig} is incompatible with this project")
			endif()
		endif()
	endif()
ENDMACRO()
# build the test app, linking against whichever flavour of the lws
# library (shared or static) this build produces
add_executable(${SAMP} ${SRCS})

if (websockets_shared)
	target_link_libraries(${SAMP} websockets_shared)
	add_dependencies(${SAMP} websockets_shared)
else()
	target_link_libraries(${SAMP} websockets)
endif()

View file

@ -0,0 +1,37 @@
# lws api test lws_tokenize
Performs selftests for lws_tokenize
## build
```
$ cmake . && make
```
## usage
Commandline option|Meaning
---|---
-d <loglevel>|Debug verbosity in decimal, eg, -d15
-s "input string"|String to tokenize
-f 15|LWS_TOKENIZE_F_ flag values to apply to processing of -s
```
$ ./lws-api-test-lws_tokenize
[2018/10/09 09:14:17:4834] USER: LWS API selftest: lws_tokenize
[2018/10/09 09:14:17:4835] USER: Completed: PASS: 6, FAIL: 0
```
If the `-s string` option is given, the string is tokenized on stdout in
the format used to produce the tests in the sources
```
$ ./lws-api-test-lws_tokenize -s "hello: 1234,256"
[2018/10/09 09:14:17:4834] USER: LWS API selftest: lws_tokenize
{ LWS_TOKZE_TOKEN_NAME_COLON, "hello", 5 }
{ LWS_TOKZE_INTEGER, "1234", 4 }
{ LWS_TOKZE_DELIMITER, ",", 1 }
{ LWS_TOKZE_INTEGER, "256", 3 }
{ LWS_TOKZE_ENDED, "", 0 }
```

View file

@ -0,0 +1,335 @@
/*
* lws-api-test-lws_tokenize
*
* Copyright (C) 2018 Andy Green <andy@warmcat.com>
*
* This file is made available under the Creative Commons CC0 1.0
* Universal Public Domain Dedication.
*
* This demonstrates the most minimal http server you can make with lws.
*
* To keep it simple, it serves stuff from the subdirectory
* "./mount-origin" of the directory it was started in.
* You can change that by changing mount.origin below.
*/
#include <libwebsockets.h>
#include <string.h>
/* one expected result from a single lws_tokenize() call */
struct expected {
	lws_tokenize_elem e;	/* expected return code */
	const char *value;	/* expected token contents */
	int len;		/* expected token length in bytes */
};

/* one selftest: an input string and the exact sequence it must produce */
struct tests {
	const char *string;	/* the string to tokenize */
	struct expected *exp;	/* expected result sequence */
	int count;		/* number of entries in exp */
	int flags;		/* LWS_TOKENIZE_F_ flags to tokenize with */
};
/*
 * Expected tokenizer output sequences, one table per entry in tests[]
 * below.  Lengths are in bytes (the Greek tokens are multibyte utf-8).
 */
struct expected expected1[] = {
	{ LWS_TOKZE_TOKEN, "protocol-1", 10 },
	{ LWS_TOKZE_DELIMITER, ",", 1},
	{ LWS_TOKZE_TOKEN, "protocol_2", 10 },
	{ LWS_TOKZE_DELIMITER, ",", 1},
	{ LWS_TOKZE_TOKEN, "protocol3", 9 },
	{ LWS_TOKZE_ENDED, NULL, 0 },
},
expected2[] = {
	{ LWS_TOKZE_TOKEN_NAME_COLON, "Accept-Language", 15 },
	{ LWS_TOKZE_TOKEN, "fr-CH", 5 },
	{ LWS_TOKZE_DELIMITER, ",", 1 },
	{ LWS_TOKZE_TOKEN, "fr", 2 },
	{ LWS_TOKZE_DELIMITER, ";", 1},
	{ LWS_TOKZE_TOKEN_NAME_EQUALS, "q", 1 },
	{ LWS_TOKZE_FLOAT, "0.9", 3 },
	{ LWS_TOKZE_DELIMITER, ",", 1 },
	{ LWS_TOKZE_TOKEN, "en", 2 },
	{ LWS_TOKZE_DELIMITER, ";", 1},
	{ LWS_TOKZE_TOKEN_NAME_EQUALS, "q", 1 },
	{ LWS_TOKZE_FLOAT, "0.8", 3 },
	{ LWS_TOKZE_DELIMITER, ",", 1 },
	{ LWS_TOKZE_TOKEN, "de", 2 },
	{ LWS_TOKZE_DELIMITER, ";", 1},
	{ LWS_TOKZE_TOKEN_NAME_EQUALS, "q", 1 },
	{ LWS_TOKZE_FLOAT, "0.7", 3 },
	{ LWS_TOKZE_DELIMITER, ",", 1 },
	{ LWS_TOKZE_DELIMITER, "*", 1 },
	{ LWS_TOKZE_DELIMITER, ";", 1 },
	{ LWS_TOKZE_TOKEN_NAME_EQUALS, "q", 1 },
	{ LWS_TOKZE_FLOAT, "0.5", 3 },
	{ LWS_TOKZE_ENDED, NULL, 0 },
},
expected3[] = {
	{ LWS_TOKZE_TOKEN_NAME_EQUALS, "quoted", 6 },
	{ LWS_TOKZE_QUOTED_STRING, "things:", 7 },
	{ LWS_TOKZE_DELIMITER, ",", 1 },
	{ LWS_TOKZE_INTEGER, "1234", 4 },
	{ LWS_TOKZE_ENDED, NULL, 0 },
},
expected4[] = {
	/* a leading comma in a comma-separated list is illegal */
	{ LWS_TOKZE_ERR_COMMA_LIST, ",", 1 },
},
expected5[] = {
	{ LWS_TOKZE_TOKEN, "brokenlist2", 11 },
	{ LWS_TOKZE_DELIMITER, ",", 1 },
	{ LWS_TOKZE_ERR_COMMA_LIST, ",", 1 },
},
expected6[] = {
	{ LWS_TOKZE_TOKEN, "brokenlist3", 11 },
	{ LWS_TOKZE_DELIMITER, ",", 1 },
	{ LWS_TOKZE_ERR_COMMA_LIST, ",", 1 },
},
expected7[] = {
	{ LWS_TOKZE_TOKEN, "fr", 2 },
	{ LWS_TOKZE_DELIMITER, "-", 1 },
	{ LWS_TOKZE_TOKEN, "CH", 2 },
	{ LWS_TOKZE_DELIMITER, ",", 1 },
	{ LWS_TOKZE_TOKEN, "fr", 2 },
	{ LWS_TOKZE_DELIMITER, ";", 1 },
	{ LWS_TOKZE_TOKEN_NAME_EQUALS, "q", 1 },
	{ LWS_TOKZE_FLOAT, "0.9", 3 },
	{ LWS_TOKZE_DELIMITER, ",", 1 },
	{ LWS_TOKZE_TOKEN, "en", 2 },
	{ LWS_TOKZE_DELIMITER, ";", 1 },
	{ LWS_TOKZE_TOKEN_NAME_EQUALS, "q", 1 },
	{ LWS_TOKZE_FLOAT, "0.8", 3 },
	{ LWS_TOKZE_DELIMITER, ",", 1 },
	{ LWS_TOKZE_TOKEN, "de", 2 },
	{ LWS_TOKZE_DELIMITER, ";", 1 },
	{ LWS_TOKZE_TOKEN_NAME_EQUALS, "q", 1 },
	{ LWS_TOKZE_FLOAT, "0.7", 3 },
	{ LWS_TOKZE_DELIMITER, ",", 1 },
	{ LWS_TOKZE_TOKEN, "*", 1 },
	{ LWS_TOKZE_DELIMITER, ";", 1 },
	{ LWS_TOKZE_TOKEN_NAME_EQUALS, "q", 1 },
	{ LWS_TOKZE_FLOAT, "0.5", 3 },
	{ LWS_TOKZE_ENDED, "", 0 },
},
expected8[] = {
	{ LWS_TOKZE_TOKEN, "Οὐχὶ", 10 },
	{ LWS_TOKZE_TOKEN, "ταὐτὰ", 12 },
	{ LWS_TOKZE_TOKEN, "παρίσταταί", 22 },
	{ LWS_TOKZE_TOKEN, "μοι", 6 },
	{ LWS_TOKZE_TOKEN, "γιγνώσκειν", 21 },
	{ LWS_TOKZE_DELIMITER, ",", 1 },
	/* NOTE(review): token text appears lost in transcription; the
	 * 3-byte length matches "ὦ" from the input string - confirm
	 * against the upstream sources */
	{ LWS_TOKZE_TOKEN, "", 3 },
	{ LWS_TOKZE_TOKEN, "ἄνδρες", 13 },
	{ LWS_TOKZE_TOKEN, "᾿Αθηναῖοι", 20 },
	{ LWS_TOKZE_DELIMITER, ",", 1 },
	{ LWS_TOKZE_TOKEN, "greek", 5 },
	{ LWS_TOKZE_ENDED, "", 0 },
},
expected9[] = {
	/*
	 * because the tokenizer scans ahead for = aggregation,
	 * it finds the broken utf8 before reporting the token
	 */
	{ LWS_TOKZE_ERR_BROKEN_UTF8, "", 0 },
},
expected10[] = {
	{ LWS_TOKZE_TOKEN, "badutf8-2", 9 },
	/* NOTE(review): token text appears lost in transcription; the
	 * 3-byte length matches the valid "\xed\x9f\xbf" utf-8 char in
	 * the input string - confirm against the upstream sources */
	{ LWS_TOKZE_TOKEN, "", 3 },
	{ LWS_TOKZE_DELIMITER, ",", 1 },
	{ LWS_TOKZE_ERR_BROKEN_UTF8, "", 0 },
}
;
/* the selftest inputs, their flags and their expected output tables */
struct tests tests[] = {
	{
		" protocol-1, protocol_2\t,\tprotocol3\n",
		expected1, LWS_ARRAY_SIZE(expected1),
		LWS_TOKENIZE_F_MINUS_NONTERM | LWS_TOKENIZE_F_AGG_COLON
	}, {
		"Accept-Language: fr-CH, fr;q=0.9, en;q=0.8, de;q=0.7, *;q=0.5",
		expected2, LWS_ARRAY_SIZE(expected2),
		LWS_TOKENIZE_F_MINUS_NONTERM | LWS_TOKENIZE_F_AGG_COLON
	}, {
		"quoted = \"things:\", 1234",
		expected3, LWS_ARRAY_SIZE(expected3),
		LWS_TOKENIZE_F_MINUS_NONTERM | LWS_TOKENIZE_F_AGG_COLON
	}, {
		", brokenlist1",
		expected4, LWS_ARRAY_SIZE(expected4),
		LWS_TOKENIZE_F_COMMA_SEP_LIST
	}, {
		"brokenlist2,,",
		expected5, LWS_ARRAY_SIZE(expected5),
		LWS_TOKENIZE_F_COMMA_SEP_LIST
	}, {
		"brokenlist3,",
		expected6, LWS_ARRAY_SIZE(expected6),
		LWS_TOKENIZE_F_COMMA_SEP_LIST
	}, {
		"fr-CH, fr;q=0.9, en;q=0.8, de;q=0.7, *;q=0.5",
		expected7, LWS_ARRAY_SIZE(expected7),
		LWS_TOKENIZE_F_RFC7230_DELIMS
	},
	{
		/* multibyte utf-8 tokens */
		" Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι, greek",
		expected8, LWS_ARRAY_SIZE(expected8),
		LWS_TOKENIZE_F_RFC7230_DELIMS
	},
	{
		/* \x80 is never a legal utf-8 leading byte */
		"badutf8-1 \x80...",
		expected9, LWS_ARRAY_SIZE(expected9),
		LWS_TOKENIZE_F_MINUS_NONTERM | LWS_TOKENIZE_F_RFC7230_DELIMS
	},
	{
		/* a valid multibyte char followed by a broken one */
		"badutf8-2 \xed\x9f\xbf,\x80...",
		expected10, LWS_ARRAY_SIZE(expected10),
		LWS_TOKENIZE_F_MINUS_NONTERM | LWS_TOKENIZE_F_RFC7230_DELIMS
	},
};
/*
 * Human-readable names for lws_tokenize_elem values; add LWS_TOKZE_ERRS to
 * the element value (which may be negative by up to that amount) to index
 * this array.
 */
static const char *element_names[] = {
	"LWS_TOKZE_ERR_BROKEN_UTF8",
	"LWS_TOKZE_ERR_UNTERM_STRING",
	"LWS_TOKZE_ERR_MALFORMED_FLOAT",
	"LWS_TOKZE_ERR_NUM_ON_LHS",
	"LWS_TOKZE_ERR_COMMA_LIST",
	"LWS_TOKZE_ENDED",
	"LWS_TOKZE_DELIMITER",
	"LWS_TOKZE_TOKEN",
	"LWS_TOKZE_INTEGER",
	"LWS_TOKZE_FLOAT",
	"LWS_TOKZE_TOKEN_NAME_EQUALS",
	"LWS_TOKZE_TOKEN_NAME_COLON",
	"LWS_TOKZE_QUOTED_STRING",
};
/*
 * Runs the lws_tokenize selftests in tests[] against their expected result
 * tables and reports PASS/FAIL totals.  With -s "string" it additionally
 * tokenizes the given string (with the -f flags) and prints the results in
 * the same format used to author the tests[] tables above.
 */
int main(int argc, const char **argv)
{
	struct lws_tokenize ts;
	lws_tokenize_elem e;
	const char *p;
	int n, logs = LLL_USER | LLL_ERR | LLL_WARN | LLL_NOTICE
			/* for LLL_ verbosity above NOTICE to be built into lws,
			 * lws must have been configured and built with
			 * -DCMAKE_BUILD_TYPE=DEBUG instead of =RELEASE */
			/* | LLL_INFO */ /* | LLL_PARSER */ /* | LLL_HEADER */
			/* | LLL_EXT */ /* | LLL_CLIENT */ /* | LLL_LATENCY */
			/* | LLL_DEBUG */;
	int fail = 0, ok = 0, flags = 0;

	if ((p = lws_cmdline_option(argc, argv, "-d")))
		logs = atoi(p);

	lws_set_log_level(logs, NULL);
	lwsl_user("LWS API selftest: lws_tokenize\n");

	if ((p = lws_cmdline_option(argc, argv, "-f")))
		flags = atoi(p);

	p = lws_cmdline_option(argc, argv, "-s");

	/* run every table-driven selftest */

	for (n = 0; n < (int)LWS_ARRAY_SIZE(tests); n++) {
		/* in_fail: failure count before this test started */
		int m = 0, in_fail = fail;
		struct expected *exp = tests[n].exp;

		ts.start = tests[n].string;
		ts.len = strlen(ts.start);
		ts.flags = tests[n].flags;

		do {
			e = lws_tokenize(&ts);

			lwsl_info("{ %s, \"%.*s\", %d }\n",
				  element_names[e + LWS_TOKZE_ERRS],
				  (int)ts.token_len, ts.token,
				  (int)ts.token_len);

			/* more results than the table expects? */
			if (m == (int)tests[n].count) {
				lwsl_notice("fail: expected end earlier\n");
				fail++;
				break;
			}

			if (e != exp->e) {
				lwsl_notice("fail... tok %s vs expected %s\n",
					element_names[e + LWS_TOKZE_ERRS],
					element_names[exp->e + LWS_TOKZE_ERRS]);
				fail++;
				break;
			}

			/* only successful results carry a token payload */
			if (e > 0 &&
			    (ts.token_len != exp->len ||
			     memcmp(exp->value, ts.token, exp->len))) {
				lwsl_notice("fail token mismatch\n");
				fail++;
				break;
			}

			m++;
			exp++;

		} while (e > 0);

		if (fail == in_fail)
			ok++;
	}

	if (p) {
		/*
		 * -s given: tokenize the commandline string, emitting it
		 * formatted as a new tests[] entry plus its expected table
		 */
		ts.start = p;
		ts.len = strlen(p);
		ts.flags = flags;

		printf("\t{\n\t\t\"%s\",\n"
		       "\t\texpected%d, LWS_ARRAY_SIZE(expected%d),\n\t\t",
		       p, (int)LWS_ARRAY_SIZE(tests) + 1,
		       (int)LWS_ARRAY_SIZE(tests) + 1);

		if (!flags)
			printf("0\n\t},\n");
		else {
			/*
			 * reconstruct the numeric flags as symbolic names;
			 * the (flags & 1) / & 3 / & 7 masks test whether any
			 * lower-ordinal flag was already printed, to decide
			 * if a " | " separator is needed
			 */
			if (flags & LWS_TOKENIZE_F_MINUS_NONTERM)
				printf("LWS_TOKENIZE_F_MINUS_NONTERM");
			if (flags & LWS_TOKENIZE_F_AGG_COLON) {
				if (flags & 1)
					printf(" | ");
				printf("LWS_TOKENIZE_F_AGG_COLON");
			}
			if (flags & LWS_TOKENIZE_F_COMMA_SEP_LIST) {
				if (flags & 3)
					printf(" | ");
				printf("LWS_TOKENIZE_F_COMMA_SEP_LIST");
			}
			if (flags & LWS_TOKENIZE_F_RFC7230_DELIMS) {
				if (flags & 7)
					printf(" | ");
				printf("LWS_TOKENIZE_F_RFC7230_DELIMS");
			}
			printf("\n\t},\n");
		}

		printf("\texpected%d[] = {\n", (int)LWS_ARRAY_SIZE(tests) + 1);

		do {
			e = lws_tokenize(&ts);

			printf("\t\t{ %s, \"%.*s\", %d },\n",
			       element_names[e + LWS_TOKZE_ERRS],
			       (int)ts.token_len,
			       ts.token, (int)ts.token_len);

		} while (e > 0);

		printf("\t}\n");
	}

	lwsl_user("Completed: PASS: %d, FAIL: %d\n", ok, fail);

	return !(ok && !fail);
}

View file

@ -0,0 +1,24 @@
#!/bin/bash
#
# $1: path to minimal example binaries...
#     if lws is built with -DLWS_WITH_MINIMAL_EXAMPLES=1
#     that will be ./bin from your build dir
#
# $2: path for logs and results.  The results will go
#     in a subdir named after the directory this script
#     is in
#
# $3: offset for test index count
#
# $4: total test count
#
# $5: path to ./minimal-examples dir in lws
#
# Test return code 0: OK, 254: timed out, other: error indication

# pull in the shared dotest() helper from the lws selftest library
. $5/selftests-library.sh

# this directory contributes a single test
COUNT_TESTS=1

# run the api selftest binary; dotest accumulates into FAILS
dotest $1 $2 apiselftest

exit $FAILS