diff --git a/include/libwebsockets.h b/include/libwebsockets.h index ebea94506..c4fc84ad8 100644 --- a/include/libwebsockets.h +++ b/include/libwebsockets.h @@ -408,6 +408,7 @@ struct lws; #include #include #include +#include #if defined(LWS_WITH_TLS) diff --git a/include/libwebsockets/lws-tokenize.h b/include/libwebsockets/lws-tokenize.h new file mode 100644 index 000000000..88f3a62a8 --- /dev/null +++ b/include/libwebsockets/lws-tokenize.h @@ -0,0 +1,130 @@ +/* + * libwebsockets - small server side websockets and web server implementation + * + * Copyright (C) 2010-2018 Andy Green + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation: + * version 2.1 of the License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + * + * included from libwebsockets.h + */ + +/* Do not treat - as a terminal character */ +#define LWS_TOKENIZE_F_MINUS_NONTERM (1 << 0) +/* Separately report aggregate colon-delimited tokens */ +#define LWS_TOKENIZE_F_AGG_COLON (1 << 1) +/* Enforce sequencing for a simple token , token , token ... list */ +#define LWS_TOKENIZE_F_COMMA_SEP_LIST (1 << 2) +/* Allow more characters in the tokens and less delimiters... 
default is + * only alphanumeric + underscore in tokens */ +#define LWS_TOKENIZE_F_RFC7230_DELIMS (1 << 3) + +typedef enum { + + LWS_TOKZE_ERRS = 5, /* the number of errors defined */ + + LWS_TOKZE_ERR_BROKEN_UTF8 = -5, /* malformed or partial utf8 */ + LWS_TOKZE_ERR_UNTERM_STRING = -4, /* ended while we were in "" */ + LWS_TOKZE_ERR_MALFORMED_FLOAT = -3, /* like 0..1 or 0.1.1 */ + LWS_TOKZE_ERR_NUM_ON_LHS = -2, /* like 123= or 0.1= */ + LWS_TOKZE_ERR_COMMA_LIST = -1, /* like ",tok", or, "tok,," */ + + LWS_TOKZE_ENDED = 0, /* no more content */ + + /* Note: results have ordinal 1+, EOT is 0 and errors are < 0 */ + + LWS_TOKZE_DELIMITER, /* a delimiter appeared */ + LWS_TOKZE_TOKEN, /* a token appeared */ + LWS_TOKZE_INTEGER, /* an integer appeared */ + LWS_TOKZE_FLOAT, /* a float appeared */ + LWS_TOKZE_TOKEN_NAME_EQUALS, /* token [whitespace] = */ + LWS_TOKZE_TOKEN_NAME_COLON, /* token [whitespace] : (only with + LWS_TOKENIZE_F_AGG_COLON flag) */ + LWS_TOKZE_QUOTED_STRING, /* "*", where * may have any char */ + +} lws_tokenize_elem; + +/* + * helper enums to allow caller to enforce legal delimiter sequencing, eg + * disallow "token,,token", "token,", and ",token" + */ + +enum lws_tokenize_delimiter_tracking { + LWSTZ_DT_NEED_FIRST_CONTENT, + LWSTZ_DT_NEED_DELIM, + LWSTZ_DT_NEED_NEXT_CONTENT, +}; + +struct lws_tokenize { + const char *start; /**< set to the start of the string to tokenize */ + const char *token; /**< the start of an identified token or delimiter */ + int len; /**< set to the length of the string to tokenize */ + int token_len; /**< the length of the identified token or delimiter */ + + int flags; /**< optional LWS_TOKENIZE_F_ flags, or 0 */ + int delim; +}; + +/** + * lws_tokenize_init() - initialize a tokenize struct ready for use + * + * \param ts: the lws_tokenize struct to init + * \param start: the string to tokenize + * \param flags: LWS_TOKENIZE_F_ option flags + * + * This initializes the tokenize struct to point to the given string,
and + * sets the length to 2GiB - 1 (so there must be a terminating NUL)... you can + * override this requirement by setting ts.len yourself before using it. + * + * .delim is also initialized to LWSTZ_DT_NEED_FIRST_CONTENT. + */ + +LWS_VISIBLE LWS_EXTERN void +lws_tokenize_init(struct lws_tokenize *ts, const char *start, int flags); + +/** + * lws_tokenize() - breaks down a string into tokens and delimiters in-place + * + * \param ts: the lws_tokenize struct with information and state on what to do + * + * The \p ts struct should have its start, len and flags members initialized to + * reflect the string to be tokenized and any options. + * + * Then `lws_tokenize()` may be called repeatedly on the struct, returning one + * of `lws_tokenize_elem` each time, and with the struct's `token` and + * `token_len` members set to describe the content of the delimiter or token + * payload each time. + * + * There are no allocations during the process. + * + * returns lws_tokenize_elem that was identified (LWS_TOKZE_ENDED means reached + * the end of the string). + */ + +LWS_VISIBLE LWS_EXTERN lws_tokenize_elem +lws_tokenize(struct lws_tokenize *ts); + +/** + * lws_tokenize_cstr() - copy token string to NUL-terminated buffer + * + * \param ts: pointer to lws_tokenize struct to operate on + * \param str: destination buffer + * \param max: bytes in destination buffer + * + * returns 0 if OK or nonzero if the string + NUL won't fit.
+ */ + +LWS_VISIBLE LWS_EXTERN int +lws_tokenize_cstr(struct lws_tokenize *ts, char *str, int max); diff --git a/lib/core/libwebsockets.c b/lib/core/libwebsockets.c index 2d62259b9..8666733f3 100644 --- a/lib/core/libwebsockets.c +++ b/lib/core/libwebsockets.c @@ -2358,36 +2358,60 @@ __lws_rx_flow_control(struct lws *wsi) return 0; } +static const unsigned char e0f4[] = { + 0xa0 | ((2 - 1) << 2) | 1, /* e0 */ + 0x80 | ((4 - 1) << 2) | 1, /* e1 */ + 0x80 | ((4 - 1) << 2) | 1, /* e2 */ + 0x80 | ((4 - 1) << 2) | 1, /* e3 */ + 0x80 | ((4 - 1) << 2) | 1, /* e4 */ + 0x80 | ((4 - 1) << 2) | 1, /* e5 */ + 0x80 | ((4 - 1) << 2) | 1, /* e6 */ + 0x80 | ((4 - 1) << 2) | 1, /* e7 */ + 0x80 | ((4 - 1) << 2) | 1, /* e8 */ + 0x80 | ((4 - 1) << 2) | 1, /* e9 */ + 0x80 | ((4 - 1) << 2) | 1, /* ea */ + 0x80 | ((4 - 1) << 2) | 1, /* eb */ + 0x80 | ((4 - 1) << 2) | 1, /* ec */ + 0x80 | ((2 - 1) << 2) | 1, /* ed */ + 0x80 | ((4 - 1) << 2) | 1, /* ee */ + 0x80 | ((4 - 1) << 2) | 1, /* ef */ + 0x90 | ((3 - 1) << 2) | 2, /* f0 */ + 0x80 | ((4 - 1) << 2) | 2, /* f1 */ + 0x80 | ((4 - 1) << 2) | 2, /* f2 */ + 0x80 | ((4 - 1) << 2) | 2, /* f3 */ + 0x80 | ((1 - 1) << 2) | 2, /* f4 */ + + 0, /* s0 */ + 0x80 | ((4 - 1) << 2) | 0, /* s2 */ + 0x80 | ((4 - 1) << 2) | 1, /* s3 */ +}; + +LWS_EXTERN int +lws_check_byte_utf8(unsigned char state, unsigned char c) +{ + unsigned char s = state; + + if (!s) { + if (c >= 0x80) { + if (c < 0xc2 || c > 0xf4) + return -1; + if (c < 0xe0) + return 0x80 | ((4 - 1) << 2); + else + return e0f4[c - 0xe0]; + } + + return s; + } + if (c < (s & 0xf0) || c >= (s & 0xf0) + 0x10 + ((s << 2) & 0x30)) + return -1; + + return e0f4[21 + (s & 3)]; +} + LWS_EXTERN int lws_check_utf8(unsigned char *state, unsigned char *buf, size_t len) { - static const unsigned char e0f4[] = { - 0xa0 | ((2 - 1) << 2) | 1, /* e0 */ - 0x80 | ((4 - 1) << 2) | 1, /* e1 */ - 0x80 | ((4 - 1) << 2) | 1, /* e2 */ - 0x80 | ((4 - 1) << 2) | 1, /* e3 */ - 0x80 | ((4 - 1) << 2) | 1, /* e4 */ - 0x80 | ((4 - 
1) << 2) | 1, /* e5 */ - 0x80 | ((4 - 1) << 2) | 1, /* e6 */ - 0x80 | ((4 - 1) << 2) | 1, /* e7 */ - 0x80 | ((4 - 1) << 2) | 1, /* e8 */ - 0x80 | ((4 - 1) << 2) | 1, /* e9 */ - 0x80 | ((4 - 1) << 2) | 1, /* ea */ - 0x80 | ((4 - 1) << 2) | 1, /* eb */ - 0x80 | ((4 - 1) << 2) | 1, /* ec */ - 0x80 | ((2 - 1) << 2) | 1, /* ed */ - 0x80 | ((4 - 1) << 2) | 1, /* ee */ - 0x80 | ((4 - 1) << 2) | 1, /* ef */ - 0x90 | ((3 - 1) << 2) | 2, /* f0 */ - 0x80 | ((4 - 1) << 2) | 2, /* f1 */ - 0x80 | ((4 - 1) << 2) | 2, /* f2 */ - 0x80 | ((4 - 1) << 2) | 2, /* f3 */ - 0x80 | ((1 - 1) << 2) | 2, /* f4 */ - - 0, /* s0 */ - 0x80 | ((4 - 1) << 2) | 0, /* s2 */ - 0x80 | ((4 - 1) << 2) | 1, /* s3 */ - }; unsigned char s = *state; while (len--) { @@ -2985,6 +3009,245 @@ lws_strncpy(char *dest, const char *src, size_t size) return dest; } + +typedef enum { + LWS_TOKZS_LEADING_WHITESPACE, + LWS_TOKZS_QUOTED_STRING, + LWS_TOKZS_TOKEN, + LWS_TOKZS_TOKEN_POST_TERMINAL +} lws_tokenize_state; + +int +lws_tokenize(struct lws_tokenize *ts) +{ + const char *rfc7230_delims = "(),/:;<=>?@[\\]{}"; + lws_tokenize_state state = LWS_TOKZS_LEADING_WHITESPACE; + char c, num = -1, flo = 0; + int utf8 = 0; + + ts->token = NULL; + ts->token_len = 0; + + while (ts->len) { + c = *ts->start++; + ts->len--; + + utf8 = lws_check_byte_utf8((unsigned char)utf8, c); + if (utf8 < 0) + return LWS_TOKZE_ERR_BROKEN_UTF8; + + lwsl_debug("%s: %c (%d) %d\n", __func__, c, state, (int)ts->len); + + if (!c) + break; + + /* whitespace */ + + if (c == ' ' || c == '\t' || c == '\n' || c == '\r' || + c == '\f') { + switch (state) { + case LWS_TOKZS_LEADING_WHITESPACE: + case LWS_TOKZS_TOKEN_POST_TERMINAL: + continue; + case LWS_TOKZS_QUOTED_STRING: + ts->token_len++; + continue; + case LWS_TOKZS_TOKEN: + /* we want to scan forward to look for = */ + + state = LWS_TOKZS_TOKEN_POST_TERMINAL; + continue; + } + } + + /* quoted string */ + + if (c == '\"') { + if (state == LWS_TOKZS_QUOTED_STRING) + return LWS_TOKZE_QUOTED_STRING; + + 
/* starting a quoted string */ + + if (ts->flags & LWS_TOKENIZE_F_COMMA_SEP_LIST) { + if (ts->delim == LWSTZ_DT_NEED_DELIM) + return LWS_TOKZE_ERR_COMMA_LIST; + ts->delim = LWSTZ_DT_NEED_DELIM; + } + + state = LWS_TOKZS_QUOTED_STRING; + ts->token = ts->start; + ts->token_len = 0; + + continue; + } + + /* token= aggregation */ + + if (c == '=' && (state == LWS_TOKZS_TOKEN_POST_TERMINAL || + state == LWS_TOKZS_TOKEN)) { + if (num == 1) + return LWS_TOKZE_ERR_NUM_ON_LHS; + /* swallow the = */ + return LWS_TOKZE_TOKEN_NAME_EQUALS; + } + + /* optional token: aggregation */ + + if ((ts->flags & LWS_TOKENIZE_F_AGG_COLON) && c == ':' && + (state == LWS_TOKZS_TOKEN_POST_TERMINAL || + state == LWS_TOKZS_TOKEN)) + /* swallow the : */ + return LWS_TOKZE_TOKEN_NAME_COLON; + + /* aggregate . in a number as a float */ + + if (c == '.' && state == LWS_TOKZS_TOKEN && num == 1) { + if (flo) + return LWS_TOKZE_ERR_MALFORMED_FLOAT; + flo = 1; + ts->token_len++; + continue; + } + + /* + * Delimiter... by default anything that: + * + * - isn't matched earlier, or + * - is [A-Z, a-z, 0-9, _], and + * - is not a partial utf8 char + * + * is a "delimiter", it marks the end of a token and is itself + * reported as a single LWS_TOKZE_DELIMITER each time. + * + * However with LWS_TOKENIZE_F_RFC7230_DELIMS flag, tokens may + * contain any noncontrol character that isn't defined in + * rfc7230_delims, and only characters listed there are treated + * as delimiters. 
+ */ + + if (!utf8 && + ((ts->flags & LWS_TOKENIZE_F_RFC7230_DELIMS && + strchr(rfc7230_delims, c) && c > 32) || + ((!(ts->flags & LWS_TOKENIZE_F_RFC7230_DELIMS) && + (c < '0' || c > '9') && (c < 'A' || c > 'Z') && + (c < 'a' || c > 'z') && c != '_') && !(c == '-' && + (ts->flags & LWS_TOKENIZE_F_MINUS_NONTERM))) || + (c == '-' && !(ts->flags & LWS_TOKENIZE_F_MINUS_NONTERM)) + )) { + switch (state) { + case LWS_TOKZS_LEADING_WHITESPACE: + if (ts->flags & LWS_TOKENIZE_F_COMMA_SEP_LIST) { + if (c != ',' || + ts->delim != LWSTZ_DT_NEED_DELIM) + return LWS_TOKZE_ERR_COMMA_LIST; + ts->delim = LWSTZ_DT_NEED_NEXT_CONTENT; + } + + ts->token = ts->start - 1; + ts->token_len = 1; + return LWS_TOKZE_DELIMITER; + + case LWS_TOKZS_QUOTED_STRING: + ts->token_len++; + continue; + + case LWS_TOKZS_TOKEN_POST_TERMINAL: + case LWS_TOKZS_TOKEN: + /* report the delimiter next time */ + ts->start--; + ts->len++; + goto token_or_numeric; + } + } + + /* anything that's not whitespace or delimiter is payload */ + + switch (state) { + case LWS_TOKZS_LEADING_WHITESPACE: + + if (ts->flags & LWS_TOKENIZE_F_COMMA_SEP_LIST) { + if (ts->delim == LWSTZ_DT_NEED_DELIM) + return LWS_TOKZE_ERR_COMMA_LIST; + ts->delim = LWSTZ_DT_NEED_DELIM; + } + + state = LWS_TOKZS_TOKEN; + ts->token = ts->start - 1; + ts->token_len = 1; + if (c < '0' || c > '9') + num = 0; + else + if (num < 0) + num = 1; + continue; + case LWS_TOKZS_QUOTED_STRING: + case LWS_TOKZS_TOKEN: + if (c < '0' || c > '9') + num = 0; + else + if (num < 0) + num = 1; + ts->token_len++; + continue; + case LWS_TOKZS_TOKEN_POST_TERMINAL: + /* report the new token next time */ + ts->start--; + ts->len++; + goto token_or_numeric; + } + } + + /* we ran out of content */ + + if (utf8) /* ended partway through a multibyte char */ + return LWS_TOKZE_ERR_BROKEN_UTF8; + + if (state == LWS_TOKZS_QUOTED_STRING) + return LWS_TOKZE_ERR_UNTERM_STRING; + + if (state != LWS_TOKZS_TOKEN_POST_TERMINAL && + state != LWS_TOKZS_TOKEN) { + if ((ts->flags & 
LWS_TOKENIZE_F_COMMA_SEP_LIST) && + ts->delim == LWSTZ_DT_NEED_NEXT_CONTENT) + return LWS_TOKZE_ERR_COMMA_LIST; + + return LWS_TOKZE_ENDED; + } + + /* report the pending token */ + +token_or_numeric: + + if (num != 1) + return LWS_TOKZE_TOKEN; + if (flo) + return LWS_TOKZE_FLOAT; + + return LWS_TOKZE_INTEGER; +} + + +LWS_VISIBLE LWS_EXTERN int +lws_tokenize_cstr(struct lws_tokenize *ts, char *str, int max) +{ + if (ts->token_len + 1 >= max) + return 1; + + memcpy(str, ts->token, ts->token_len); + str[ts->token_len] = '\0'; + + return 0; +} + +LWS_VISIBLE LWS_EXTERN void +lws_tokenize_init(struct lws_tokenize *ts, const char *start, int flags) +{ + ts->start = start; + ts->len = 0x7fffffff; + ts->flags = flags; + ts->delim = LWSTZ_DT_NEED_FIRST_CONTENT; +} + #if LWS_MAX_SMP > 1 void diff --git a/lib/core/private.h b/lib/core/private.h index c623c24da..a075880a2 100644 --- a/lib/core/private.h +++ b/lib/core/private.h @@ -1449,6 +1449,8 @@ lws_plat_inet_ntop(int af, const void *src, char *dst, int cnt); LWS_EXTERN int LWS_WARN_UNUSED_RESULT lws_plat_inet_pton(int af, const char *src, void *dst); +LWS_EXTERN int +lws_check_byte_utf8(unsigned char state, unsigned char c); LWS_EXTERN int LWS_WARN_UNUSED_RESULT lws_check_utf8(unsigned char *state, unsigned char *buf, size_t len); LWS_EXTERN int alloc_file(struct lws_context *context, const char *filename, uint8_t **buf, diff --git a/lib/roles/http/server/parsers.c b/lib/roles/http/server/parsers.c index b7dcdb49b..96a779158 100644 --- a/lib/roles/http/server/parsers.c +++ b/lib/roles/http/server/parsers.c @@ -501,6 +501,10 @@ LWS_VISIBLE int lws_hdr_copy(struct lws *wsi, char *dst, int len, int toklen = lws_hdr_total_length(wsi, h); int n; + *dst = '\0'; + if (!toklen) + return 0; + if (toklen >= len) return -1; diff --git a/minimal-examples/api-tests/api-test-lws_tokenize/CMakeLists.txt b/minimal-examples/api-tests/api-test-lws_tokenize/CMakeLists.txt new file mode 100644 index 000000000..7bfc6f651 --- /dev/null +++ 
b/minimal-examples/api-tests/api-test-lws_tokenize/CMakeLists.txt @@ -0,0 +1,73 @@ +cmake_minimum_required(VERSION 2.8) +include(CheckCSourceCompiles) + +set(SAMP lws-api-test-lws_tokenize) +set(SRCS main.c) + +# If we are being built as part of lws, confirm current build config supports +# reqconfig, else skip building ourselves. +# +# If we are being built externally, confirm installed lws was configured to +# support reqconfig, else error out with a helpful message about the problem. +# +MACRO(require_lws_config reqconfig _val result) + + if (DEFINED ${reqconfig}) + if (${reqconfig}) + set (rq 1) + else() + set (rq 0) + endif() + else() + set(rq 0) + endif() + + if (${_val} EQUAL ${rq}) + set(SAME 1) + else() + set(SAME 0) + endif() + + if (LWS_WITH_MINIMAL_EXAMPLES AND NOT ${SAME}) + if (${_val}) + message("${SAMP}: skipping as lws being built without ${reqconfig}") + else() + message("${SAMP}: skipping as lws built with ${reqconfig}") + endif() + set(${result} 0) + else() + if (LWS_WITH_MINIMAL_EXAMPLES) + set(MET ${SAME}) + else() + CHECK_C_SOURCE_COMPILES("#include \nint main(void) {\n#if defined(${reqconfig})\n return 0;\n#else\n fail;\n#endif\n return 0;\n}\n" HAS_${reqconfig}) + if (NOT DEFINED HAS_${reqconfig} OR NOT HAS_${reqconfig}) + set(HAS_${reqconfig} 0) + else() + set(HAS_${reqconfig} 1) + endif() + if ((HAS_${reqconfig} AND ${_val}) OR (NOT HAS_${reqconfig} AND NOT ${_val})) + set(MET 1) + else() + set(MET 0) + endif() + endif() + if (NOT MET) + if (${_val}) + message(FATAL_ERROR "This project requires lws must have been configured with ${reqconfig}") + else() + message(FATAL_ERROR "Lws configuration of ${reqconfig} is incompatible with this project") + endif() + endif() + endif() +ENDMACRO() + + + + add_executable(${SAMP} ${SRCS}) + + if (websockets_shared) + target_link_libraries(${SAMP} websockets_shared) + add_dependencies(${SAMP} websockets_shared) + else() + target_link_libraries(${SAMP} websockets) + endif() diff --git 
a/minimal-examples/api-tests/api-test-lws_tokenize/README.md b/minimal-examples/api-tests/api-test-lws_tokenize/README.md new file mode 100644 index 000000000..a6b75ec83 --- /dev/null +++ b/minimal-examples/api-tests/api-test-lws_tokenize/README.md @@ -0,0 +1,37 @@ +# lws api test lws_tokenize + +Performs selftests for lws_tokenize + +## build + +``` + $ cmake . && make +``` + +## usage + +Commandline option|Meaning +---|--- +-d |Debug verbosity in decimal, eg, -d15 +-s "input string"|String to tokenize +-f 15|LWS_TOKENIZE_F_ flag values to apply to processing of -s + +``` + $ ./lws-api-test-lws_tokenize +[2018/10/09 09:14:17:4834] USER: LWS API selftest: lws_tokenize +[2018/10/09 09:14:17:4835] USER: Completed: PASS: 6, FAIL: 0 +``` + +If the `-s string` option is given, the string is tokenized on stdout in +the format used to produce the tests in the sources + +``` + $ ./lws-api-test-lws_tokenize -s "hello: 1234,256" +[2018/10/09 09:14:17:4834] USER: LWS API selftest: lws_tokenize +{ LWS_TOKZE_TOKEN_NAME_COLON, "hello", 5 } +{ LWS_TOKZE_INTEGER, "1234", 4 } +{ LWS_TOKZE_DELIMITER, ",", 1 } +{ LWS_TOKZE_INTEGER, "256", 3 } +{ LWS_TOKZE_ENDED, "", 0 } +``` + diff --git a/minimal-examples/api-tests/api-test-lws_tokenize/main.c b/minimal-examples/api-tests/api-test-lws_tokenize/main.c new file mode 100644 index 000000000..bd15e72b4 --- /dev/null +++ b/minimal-examples/api-tests/api-test-lws_tokenize/main.c @@ -0,0 +1,335 @@ +/* + * lws-api-test-lws_tokenize + * + * Copyright (C) 2018 Andy Green + * + * This file is made available under the Creative Commons CC0 1.0 + * Universal Public Domain Dedication. + * + * This demonstrates the most minimal http server you can make with lws. + * + * To keep it simple, it serves stuff from the subdirectory + * "./mount-origin" of the directory it was started in. + * You can change that by changing mount.origin below. 
+ */ + +#include +#include + +struct expected { + lws_tokenize_elem e; + const char *value; + int len; +}; + +struct tests { + const char *string; + struct expected *exp; + int count; + int flags; +}; + +struct expected expected1[] = { + { LWS_TOKZE_TOKEN, "protocol-1", 10 }, + { LWS_TOKZE_DELIMITER, ",", 1}, + { LWS_TOKZE_TOKEN, "protocol_2", 10 }, + { LWS_TOKZE_DELIMITER, ",", 1}, + { LWS_TOKZE_TOKEN, "protocol3", 9 }, + { LWS_TOKZE_ENDED, NULL, 0 }, + }, + expected2[] = { + { LWS_TOKZE_TOKEN_NAME_COLON, "Accept-Language", 15 }, + { LWS_TOKZE_TOKEN, "fr-CH", 5 }, + { LWS_TOKZE_DELIMITER, ",", 1 }, + { LWS_TOKZE_TOKEN, "fr", 2 }, + { LWS_TOKZE_DELIMITER, ";", 1}, + { LWS_TOKZE_TOKEN_NAME_EQUALS, "q", 1 }, + { LWS_TOKZE_FLOAT, "0.9", 3 }, + { LWS_TOKZE_DELIMITER, ",", 1 }, + { LWS_TOKZE_TOKEN, "en", 2 }, + { LWS_TOKZE_DELIMITER, ";", 1}, + { LWS_TOKZE_TOKEN_NAME_EQUALS, "q", 1 }, + { LWS_TOKZE_FLOAT, "0.8", 3 }, + { LWS_TOKZE_DELIMITER, ",", 1 }, + { LWS_TOKZE_TOKEN, "de", 2 }, + { LWS_TOKZE_DELIMITER, ";", 1}, + { LWS_TOKZE_TOKEN_NAME_EQUALS, "q", 1 }, + { LWS_TOKZE_FLOAT, "0.7", 3 }, + { LWS_TOKZE_DELIMITER, ",", 1 }, + { LWS_TOKZE_DELIMITER, "*", 1 }, + { LWS_TOKZE_DELIMITER, ";", 1 }, + { LWS_TOKZE_TOKEN_NAME_EQUALS, "q", 1 }, + { LWS_TOKZE_FLOAT, "0.5", 3 }, + { LWS_TOKZE_ENDED, NULL, 0 }, + }, + expected3[] = { + { LWS_TOKZE_TOKEN_NAME_EQUALS, "quoted", 6 }, + { LWS_TOKZE_QUOTED_STRING, "things:", 7 }, + { LWS_TOKZE_DELIMITER, ",", 1 }, + { LWS_TOKZE_INTEGER, "1234", 4 }, + { LWS_TOKZE_ENDED, NULL, 0 }, + }, + expected4[] = { + { LWS_TOKZE_ERR_COMMA_LIST, ",", 1 }, + }, + expected5[] = { + { LWS_TOKZE_TOKEN, "brokenlist2", 11 }, + { LWS_TOKZE_DELIMITER, ",", 1 }, + { LWS_TOKZE_ERR_COMMA_LIST, ",", 1 }, + }, + expected6[] = { + { LWS_TOKZE_TOKEN, "brokenlist3", 11 }, + { LWS_TOKZE_DELIMITER, ",", 1 }, + { LWS_TOKZE_ERR_COMMA_LIST, ",", 1 }, + + }, + expected7[] = { + { LWS_TOKZE_TOKEN, "fr", 2 }, + { LWS_TOKZE_DELIMITER, "-", 1 }, + { LWS_TOKZE_TOKEN, "CH", 2 
}, + { LWS_TOKZE_DELIMITER, ",", 1 }, + { LWS_TOKZE_TOKEN, "fr", 2 }, + { LWS_TOKZE_DELIMITER, ";", 1 }, + { LWS_TOKZE_TOKEN_NAME_EQUALS, "q", 1 }, + { LWS_TOKZE_FLOAT, "0.9", 3 }, + { LWS_TOKZE_DELIMITER, ",", 1 }, + { LWS_TOKZE_TOKEN, "en", 2 }, + { LWS_TOKZE_DELIMITER, ";", 1 }, + { LWS_TOKZE_TOKEN_NAME_EQUALS, "q", 1 }, + { LWS_TOKZE_FLOAT, "0.8", 3 }, + { LWS_TOKZE_DELIMITER, ",", 1 }, + { LWS_TOKZE_TOKEN, "de", 2 }, + { LWS_TOKZE_DELIMITER, ";", 1 }, + { LWS_TOKZE_TOKEN_NAME_EQUALS, "q", 1 }, + { LWS_TOKZE_FLOAT, "0.7", 3 }, + { LWS_TOKZE_DELIMITER, ",", 1 }, + { LWS_TOKZE_TOKEN, "*", 1 }, + { LWS_TOKZE_DELIMITER, ";", 1 }, + { LWS_TOKZE_TOKEN_NAME_EQUALS, "q", 1 }, + { LWS_TOKZE_FLOAT, "0.5", 3 }, + { LWS_TOKZE_ENDED, "", 0 }, + }, + expected8[] = { + { LWS_TOKZE_TOKEN, "Οὐχὶ", 10 }, + { LWS_TOKZE_TOKEN, "ταὐτὰ", 12 }, + { LWS_TOKZE_TOKEN, "παρίσταταί", 22 }, + { LWS_TOKZE_TOKEN, "μοι", 6 }, + { LWS_TOKZE_TOKEN, "γιγνώσκειν", 21 }, + { LWS_TOKZE_DELIMITER, ",", 1 }, + { LWS_TOKZE_TOKEN, "ὦ", 3 }, + { LWS_TOKZE_TOKEN, "ἄνδρες", 13 }, + { LWS_TOKZE_TOKEN, "᾿Αθηναῖοι", 20 }, + { LWS_TOKZE_DELIMITER, ",", 1 }, + { LWS_TOKZE_TOKEN, "greek", 5 }, + { LWS_TOKZE_ENDED, "", 0 }, + }, + expected9[] = { + /* + * because the tokenizer scans ahead for = aggregation, + * it finds the broken utf8 before reporting the token + */ + { LWS_TOKZE_ERR_BROKEN_UTF8, "", 0 }, + }, + expected10[] = { + { LWS_TOKZE_TOKEN, "badutf8-2", 9 }, + { LWS_TOKZE_TOKEN, "퟿", 3 }, + { LWS_TOKZE_DELIMITER, ",", 1 }, + { LWS_TOKZE_ERR_BROKEN_UTF8, "", 0 }, + } + +; + +struct tests tests[] = { + { + " protocol-1, protocol_2\t,\tprotocol3\n", + expected1, LWS_ARRAY_SIZE(expected1), + LWS_TOKENIZE_F_MINUS_NONTERM | LWS_TOKENIZE_F_AGG_COLON + }, { + "Accept-Language: fr-CH, fr;q=0.9, en;q=0.8, de;q=0.7, *;q=0.5", + expected2, LWS_ARRAY_SIZE(expected2), + LWS_TOKENIZE_F_MINUS_NONTERM | LWS_TOKENIZE_F_AGG_COLON + }, { + "quoted = \"things:\", 1234", + expected3, LWS_ARRAY_SIZE(expected3), + 
LWS_TOKENIZE_F_MINUS_NONTERM | LWS_TOKENIZE_F_AGG_COLON + }, { + ", brokenlist1", + expected4, LWS_ARRAY_SIZE(expected4), + LWS_TOKENIZE_F_COMMA_SEP_LIST + }, { + "brokenlist2,,", + expected5, LWS_ARRAY_SIZE(expected5), + LWS_TOKENIZE_F_COMMA_SEP_LIST + }, { + "brokenlist3,", + expected6, LWS_ARRAY_SIZE(expected6), + LWS_TOKENIZE_F_COMMA_SEP_LIST + }, { + "fr-CH, fr;q=0.9, en;q=0.8, de;q=0.7, *;q=0.5", + expected7, LWS_ARRAY_SIZE(expected7), + LWS_TOKENIZE_F_RFC7230_DELIMS + }, + { + " Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι, greek", + expected8, LWS_ARRAY_SIZE(expected8), + LWS_TOKENIZE_F_RFC7230_DELIMS + }, + { + "badutf8-1 \x80...", + expected9, LWS_ARRAY_SIZE(expected9), + LWS_TOKENIZE_F_MINUS_NONTERM | LWS_TOKENIZE_F_RFC7230_DELIMS + }, + { + "badutf8-2 \xed\x9f\xbf,\x80...", + expected10, LWS_ARRAY_SIZE(expected10), + LWS_TOKENIZE_F_MINUS_NONTERM | LWS_TOKENIZE_F_RFC7230_DELIMS + }, +}; + +/* + * add LWS_TOKZE_ERRS to the element index (which may be negative by that + * amount) to index this array + */ + +static const char *element_names[] = { + "LWS_TOKZE_ERR_BROKEN_UTF8", + "LWS_TOKZE_ERR_UNTERM_STRING", + "LWS_TOKZE_ERR_MALFORMED_FLOAT", + "LWS_TOKZE_ERR_NUM_ON_LHS", + "LWS_TOKZE_ERR_COMMA_LIST", + "LWS_TOKZE_ENDED", + "LWS_TOKZE_DELIMITER", + "LWS_TOKZE_TOKEN", + "LWS_TOKZE_INTEGER", + "LWS_TOKZE_FLOAT", + "LWS_TOKZE_TOKEN_NAME_EQUALS", + "LWS_TOKZE_TOKEN_NAME_COLON", + "LWS_TOKZE_QUOTED_STRING", +}; + +int main(int argc, const char **argv) +{ + struct lws_tokenize ts; + lws_tokenize_elem e; + const char *p; + int n, logs = LLL_USER | LLL_ERR | LLL_WARN | LLL_NOTICE + /* for LLL_ verbosity above NOTICE to be built into lws, + * lws must have been configured and built with + * -DCMAKE_BUILD_TYPE=DEBUG instead of =RELEASE */ + /* | LLL_INFO */ /* | LLL_PARSER */ /* | LLL_HEADER */ + /* | LLL_EXT */ /* | LLL_CLIENT */ /* | LLL_LATENCY */ + /* | LLL_DEBUG */; + int fail = 0, ok = 0, flags = 0; + + if ((p = lws_cmdline_option(argc, argv, 
"-d"))) + logs = atoi(p); + + lws_set_log_level(logs, NULL); + lwsl_user("LWS API selftest: lws_tokenize\n"); + + if ((p = lws_cmdline_option(argc, argv, "-f"))) + flags = atoi(p); + + p = lws_cmdline_option(argc, argv, "-s"); + + for (n = 0; n < (int)LWS_ARRAY_SIZE(tests); n++) { + int m = 0, in_fail = fail; + struct expected *exp = tests[n].exp; + + ts.start = tests[n].string; + ts.len = strlen(ts.start); + ts.flags = tests[n].flags; + + do { + e = lws_tokenize(&ts); + + lwsl_info("{ %s, \"%.*s\", %d }\n", + element_names[e + LWS_TOKZE_ERRS], + (int)ts.token_len, ts.token, + (int)ts.token_len); + + if (m == (int)tests[n].count) { + lwsl_notice("fail: expected end earlier\n"); + fail++; + break; + } + + if (e != exp->e) { + lwsl_notice("fail... tok %s vs expected %s\n", + element_names[e + LWS_TOKZE_ERRS], + element_names[exp->e + LWS_TOKZE_ERRS]); + fail++; + break; + } + + if (e > 0 && + (ts.token_len != exp->len || + memcmp(exp->value, ts.token, exp->len))) { + lwsl_notice("fail token mismatch\n"); + fail++; + break; + } + + m++; + exp++; + + } while (e > 0); + + if (fail == in_fail) + ok++; + } + + if (p) { + ts.start = p; + ts.len = strlen(p); + ts.flags = flags; + + printf("\t{\n\t\t\"%s\",\n" + "\t\texpected%d, LWS_ARRAY_SIZE(expected%d),\n\t\t", + p, (int)LWS_ARRAY_SIZE(tests) + 1, + (int)LWS_ARRAY_SIZE(tests) + 1); + + if (!flags) + printf("0\n\t},\n"); + else { + if (flags & LWS_TOKENIZE_F_MINUS_NONTERM) + printf("LWS_TOKENIZE_F_MINUS_NONTERM"); + if (flags & LWS_TOKENIZE_F_AGG_COLON) { + if (flags & 1) + printf(" | "); + printf("LWS_TOKENIZE_F_AGG_COLON"); + } + if (flags & LWS_TOKENIZE_F_COMMA_SEP_LIST) { + if (flags & 3) + printf(" | "); + printf("LWS_TOKENIZE_F_COMMA_SEP_LIST"); + } + if (flags & LWS_TOKENIZE_F_RFC7230_DELIMS) { + if (flags & 7) + printf(" | "); + printf("LWS_TOKENIZE_F_RFC7230_DELIMS"); + } + + printf("\n\t},\n"); + } + + printf("\texpected%d[] = {\n", (int)LWS_ARRAY_SIZE(tests) + 1); + + do { + e = lws_tokenize(&ts); + + 
printf("\t\t{ %s, \"%.*s\", %d },\n", + element_names[e + LWS_TOKZE_ERRS], + (int)ts.token_len, + ts.token, (int)ts.token_len); + + } while (e > 0); + + printf("\t}\n"); + } + + + lwsl_user("Completed: PASS: %d, FAIL: %d\n", ok, fail); + + return !(ok && !fail); +} diff --git a/minimal-examples/api-tests/api-test-lws_tokenize/selftest.sh b/minimal-examples/api-tests/api-test-lws_tokenize/selftest.sh new file mode 100755 index 000000000..16d1e2e8e --- /dev/null +++ b/minimal-examples/api-tests/api-test-lws_tokenize/selftest.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# +# $1: path to minimal example binaries... +# if lws is built with -DLWS_WITH_MINIMAL_EXAMPLES=1 +# that will be ./bin from your build dir +# +# $2: path for logs and results. The results will go +# in a subdir named after the directory this script +# is in +# +# $3: offset for test index count +# +# $4: total test count +# +# $5: path to ./minimal-examples dir in lws +# +# Test return code 0: OK, 254: timed out, other: error indication + +. $5/selftests-library.sh + +COUNT_TESTS=1 + +dotest $1 $2 apiselftest +exit $FAILS