mirror of
https://github.com/warmcat/libwebsockets.git
synced 2025-03-09 00:00:04 +01:00
minilex: add improved generic version in misc
Leave the http minilex as it is, and add an improved version in lib/misc: - get a list of strings from stdin and emit a C header to stdout - support ambiguous terminals (xxx and xxx-something) correctly regardless of introduction order - add a generic parser in lib/misc. minilex doesn't build as part of lws since it's only needed by developers; there's a one-line build documented in a comment at the top of lib/misc/minilex.c.
This commit is contained in:
parent
236512687e
commit
8b16aa18c8
3 changed files with 381 additions and 0 deletions
|
@ -1127,3 +1127,37 @@ lws_fsmount_mount(struct lws_fsmount *fsm);
|
|||
*/
|
||||
LWS_VISIBLE LWS_EXTERN int
|
||||
lws_fsmount_unmount(struct lws_fsmount *fsm);
|
||||
|
||||
#define LWS_MINILEX_FAIL -1
|
||||
#define LWS_MINILEX_CONTINUE 0
|
||||
#define LWS_MINILEX_MATCH 1
|
||||
|
||||
/**
|
||||
* lws_minilex_parse() - stateful matching vs lws minilex tables
|
||||
*
|
||||
* \p lex: the start of the precomputed minilex table
|
||||
* \p ps: pointer to the int16_t that holds the parsing state (init to 0)
|
||||
* \p c: the next incoming character to parse
|
||||
* \p match: pointer to take the match
|
||||
*
|
||||
* Returns either
|
||||
*
|
||||
* - LWS_MINILEX_FAIL if there is no way to match the characters seen,
|
||||
* this is sticky for additional characters until the *ps is reset to 0.
|
||||
*
|
||||
* - LWS_MINILEX_CONTINUE if the character could be part of a match but more
|
||||
* are required to see if it can match
|
||||
*
|
||||
* - LWS_MINILEX_MATCH and *match is set to the match index if there is a
|
||||
* valid match.
|
||||
*
|
||||
* In cases where the match is ambiguous, eg, we saw "right" and the possible
|
||||
* matches are "right" or "right-on", LWS_MINILEX_CONTINUE is returned. To
|
||||
* allow it to match on the complete-but-ambiguous token, if the caller sees
|
||||
* a delimiter it can call lws_minilex_parse() again with c == 0. This will
|
||||
* either return LWS_MINILEX_MATCH and set *match to the smaller ambiguous
|
||||
* match, or return LWS_MINILEX_FAIL.
|
||||
*/
|
||||
LWS_VISIBLE LWS_EXTERN int
|
||||
lws_minilex_parse(const uint8_t *lex, int16_t *ps, const uint8_t c,
|
||||
int *match);
|
||||
|
|
|
@ -1645,3 +1645,62 @@ lws_humanize(char *p, size_t len, uint64_t v, const lws_humanize_unit_t *schema)
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* -1 = fail
|
||||
* 0 = continue
|
||||
* 1 = hit
|
||||
*/
|
||||
|
||||
#define LWS_MINILEX_FAIL_CODING 8
|
||||
|
||||
int
|
||||
lws_minilex_parse(const uint8_t *lex, int16_t *ps, const uint8_t c, int *match)
|
||||
{
|
||||
if (*ps == (int16_t)-1)
|
||||
return LWS_MINILEX_FAIL;
|
||||
|
||||
while (1) {
|
||||
if (lex[*ps] & (1 << 7)) {
|
||||
/* 1-byte, fail on mismatch */
|
||||
if ((lex[*ps] & 0x7f) != c)
|
||||
goto nope;
|
||||
|
||||
/* go forward */
|
||||
if (lex[++(*ps)] == LWS_MINILEX_FAIL_CODING)
|
||||
goto nope;
|
||||
|
||||
if (lex[*ps] < LWS_MINILEX_FAIL_CODING) {
|
||||
/* this is a terminal marker */
|
||||
*match = (int)lex[++(*ps)];
|
||||
return LWS_MINILEX_MATCH;
|
||||
}
|
||||
|
||||
return LWS_MINILEX_CONTINUE;
|
||||
}
|
||||
|
||||
if (lex[*ps] == LWS_MINILEX_FAIL_CODING)
|
||||
goto nope;
|
||||
|
||||
/* b7 = 0, end or 3-byte */
|
||||
if (lex[*ps] < LWS_MINILEX_FAIL_CODING) {
|
||||
/* this is a terminal marker */
|
||||
*match = (int)lex[++(*ps)];
|
||||
return LWS_MINILEX_MATCH;
|
||||
}
|
||||
|
||||
if (lex[*ps] == c) { /* goto-on-match */
|
||||
*ps = (int16_t)(*ps + (lex[(*ps) + 1]) +
|
||||
(lex[(*ps) + 2] << 8));
|
||||
return LWS_MINILEX_CONTINUE;
|
||||
}
|
||||
|
||||
/* fall thru to next */
|
||||
*ps = (int16_t)((*ps) + 3);
|
||||
}
|
||||
|
||||
nope:
|
||||
*ps = (int16_t)-1;
|
||||
|
||||
return LWS_MINILEX_FAIL;
|
||||
}
|
||||
|
|
288
lib/misc/minilex.c
Normal file
288
lib/misc/minilex.c
Normal file
|
@ -0,0 +1,288 @@
|
|||
/*
|
||||
* minilex.c
|
||||
*
|
||||
 * High efficiency lexical state parser
|
||||
*
|
||||
* Copyright (C)2011-2022 Andy Green <andy@warmcat.com>
|
||||
*
|
||||
* Licensed under MIT
|
||||
*
|
||||
* This is a version of the original lws http minilex that can handle ambiguous
|
||||
* terminals and accepts the terminal list from stdin, producing a parsing
|
||||
* table on stdout.
|
||||
*
|
||||
* Usage: gcc minilex.c -o minilex && \
|
||||
* cat css-lextable-strings.txt | ./minilex > css-lextable.h
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
* b7 = 0 = 1-byte seq
|
||||
* 0x08 = fail
|
||||
* 2-byte seq
|
||||
* 0x00 - 0x07, then terminal as given in 2nd byte
|
||||
 * 3-byte seq
|
||||
* no match: go fwd 3 byte, match: jump fwd by amt in +1/+2 bytes
|
||||
* = 1 = 1-byte seq
|
||||
* no match: die, match go fwd 1 byte
|
||||
*/
|
||||
|
||||
/* maximum number of entries (alternatives plus terminals) in one state */
#define PARALLEL 30

/*
 * One state of the match tree that main() builds from the input strings
 * before serializing it as the byte table.
 */
struct s {
	/*
	 * Per entry: the character to match, or, for a terminal entry, the
	 * index of the string that ends here.  This is an int rather than a
	 * char so string indexes above 127 are not sign-mangled on targets
	 * where plain char is signed.
	 */
	int c[PARALLEL];
	/* per entry: index of the state to go to on a match, 0 = terminal */
	int s[PARALLEL];
	/* number of entries of c[] / s[] in use */
	int count;
	/* provisional position, counting 2 units for every entry */
	int bytepos;

	/* byte offset of this state in the emitted table */
	int real_pos;
};
|
||||
|
||||
struct s s[1000];
|
||||
int next = 1;
|
||||
|
||||
#define FAIL_CHAR 0x08
|
||||
|
||||
int
|
||||
main(void)
|
||||
{
|
||||
const char *rset[200];
|
||||
int n = 0;
|
||||
int m;
|
||||
int prev;
|
||||
int walk;
|
||||
int y;
|
||||
int j;
|
||||
int pos = 0;
|
||||
size_t sl = 0;
|
||||
char *line = NULL;
|
||||
ssize_t r;
|
||||
int setmembers = 0;
|
||||
|
||||
memset(rset, 0, sizeof(rset));
|
||||
|
||||
/* Step 1: collect the strings from stdin and list in a comment */
|
||||
|
||||
do {
|
||||
r = getline(&line, &sl, stdin);
|
||||
if (r <= 0 || setmembers == sizeof(rset) / sizeof(rset[0]))
|
||||
break;
|
||||
if (line[r - 1] == '\n')
|
||||
line[r - 1] = '\0';
|
||||
printf("\t/* %d: %s */\n", setmembers, line);
|
||||
rset[setmembers++] = strdup(line);
|
||||
} while (1);
|
||||
|
||||
free(line);
|
||||
|
||||
/* Step 2: produce an enum template for the strings in a comment */
|
||||
|
||||
printf("/* enum {\n");
|
||||
|
||||
n = 0;
|
||||
while (n < setmembers) {
|
||||
char def[100];
|
||||
|
||||
strncpy(def, rset[n], sizeof(def));
|
||||
j = 0;
|
||||
while (def[j]) {
|
||||
if (def[j] == '-')
|
||||
def[j] = '_';
|
||||
if (def[j] == ':' && !def[j + 1])
|
||||
def[j] = '\0';
|
||||
else
|
||||
if (def[j] >= 'a' && def[j] <= 'z')
|
||||
def[j] = def[j] - ('a' - 'A');
|
||||
|
||||
j++;
|
||||
}
|
||||
printf("\tXXXX_%s,\n", def);
|
||||
n++;
|
||||
}
|
||||
|
||||
printf("}; */\n\n");
|
||||
|
||||
/*
|
||||
* Step 3: issue each character of each string into the tree, reusing
|
||||
* any existing common substring subtrees
|
||||
*/
|
||||
|
||||
n = 0;
|
||||
while (n < setmembers) {
|
||||
m = 0;
|
||||
walk = 0;
|
||||
prev = 0;
|
||||
|
||||
while (rset[n][m]) {
|
||||
|
||||
int saw = 0;
|
||||
for (y = 0; y < s[walk].count; y++)
|
||||
if (s[walk].c[y] == rset[n][m]) {
|
||||
/* exists -- go forward */
|
||||
walk = s[walk].s[y];
|
||||
saw = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
if (saw) {
|
||||
m++;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* If something we didn't see before, insert a
|
||||
* conditional goto for it... however if there
|
||||
* is already a terminal, we must insert the
|
||||
* conditional before it. This handles
|
||||
* matches on "xx" and "xxy" where "xx" is
|
||||
* listed first */
|
||||
|
||||
s[walk].count++;
|
||||
|
||||
if (s[walk].count > 1 &&
|
||||
!s[walk].s[s[walk].count - 2]) {
|
||||
/*
|
||||
* This s currently has a terminal
|
||||
* at the end... insert a conditional
|
||||
* behind it
|
||||
*/
|
||||
s[walk].c[s[walk].count - 1] =
|
||||
s[walk].c[s[walk].count - 2];
|
||||
s[walk].s[s[walk].count - 1] =
|
||||
s[walk].s[s[walk].count - 2];
|
||||
|
||||
s[walk].c[s[walk].count - 2] = rset[n][m];
|
||||
s[walk].s[s[walk].count - 2] = next;
|
||||
} else {
|
||||
/* just append a conditional */
|
||||
s[walk].c[s[walk].count - 1] = rset[n][m];
|
||||
s[walk].s[s[walk].count - 1] = next;
|
||||
}
|
||||
|
||||
walk = next++;
|
||||
|
||||
m++;
|
||||
}
|
||||
|
||||
/* reached the end of rset[n] */
|
||||
|
||||
s[walk].c[s[walk].count] = n++;
|
||||
s[walk].s[s[walk].count++] = 0; /* terminal marker */
|
||||
}
|
||||
|
||||
walk = 0;
|
||||
for (n = 0; n < next; n++) {
|
||||
s[n].bytepos = walk;
|
||||
walk += (2 * s[n].count);
|
||||
}
|
||||
|
||||
/* compute everyone's position first */
|
||||
|
||||
pos = 0;
|
||||
walk = 0;
|
||||
for (n = 0; n < next; n++) {
|
||||
|
||||
s[n].real_pos = pos;
|
||||
|
||||
for (m = 0; m < s[n].count; m++) {
|
||||
|
||||
if (s[n].s[m] == 0)
|
||||
pos += 2; /* terminal marker */
|
||||
else { /* c is a character */
|
||||
if ((s[s[n].s[m]].bytepos -
|
||||
walk) == 2)
|
||||
pos++;
|
||||
else {
|
||||
pos += 3;
|
||||
if (m == s[n].count - 1)
|
||||
pos++; /* fail */
|
||||
}
|
||||
}
|
||||
walk += 2;
|
||||
}
|
||||
}
|
||||
|
||||
walk = 0;
|
||||
pos = 0;
|
||||
for (n = 0; n < next; n++) {
|
||||
for (m = 0; m < s[n].count; m++) {
|
||||
|
||||
int saw = s[n].s[m];
|
||||
|
||||
if (!m)
|
||||
fprintf(stdout, "/* pos %04x: %3d */ ",
|
||||
s[n].real_pos, n);
|
||||
else
|
||||
fprintf(stdout, " ");
|
||||
|
||||
y = s[n].c[m];
|
||||
|
||||
if (saw == 0) { // c is a terminal then
|
||||
|
||||
if (y > 0x7ff) {
|
||||
fprintf(stderr, "terminal too big\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
fprintf(stdout, " 0x%02X, 0x%02X "
|
||||
" "
|
||||
"/* - terminal marker %2d - */,\n",
|
||||
y >> 8, y & 0xff, y & 0x7f);
|
||||
pos += 2;
|
||||
walk += 2;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* c is a character */
|
||||
|
||||
prev = y &0x7f;
|
||||
if (prev < 32 || prev > 126)
|
||||
prev = '.';
|
||||
|
||||
|
||||
if ((s[saw].bytepos - walk) == 2) {
|
||||
fprintf(stdout, " 0x%02X /* '%c' -> */,\n",
|
||||
y | 0x80, prev);
|
||||
pos++;
|
||||
walk += 2;
|
||||
continue;
|
||||
}
|
||||
|
||||
j = s[saw].real_pos - pos;
|
||||
|
||||
if (j > 0xffff) {
|
||||
fprintf(stderr,
|
||||
"Jump > 64K bytes ahead (%d to %d)\n",
|
||||
s[n].real_pos, s[saw].real_pos);
|
||||
return 1;
|
||||
}
|
||||
fprintf(stdout, " 0x%02X /* '%c' */, 0x%02X, 0x%02X "
|
||||
"/* (to 0x%04X s %3d) */,\n",
|
||||
y, prev,
|
||||
j & 0xff, j >> 8,
|
||||
s[saw].real_pos, saw);
|
||||
pos += 3;
|
||||
|
||||
if (m == s[n].count - 1) {
|
||||
fprintf(stdout,
|
||||
" 0x%02X, /* fail */\n",
|
||||
FAIL_CHAR);
|
||||
pos++; /* fail */
|
||||
}
|
||||
|
||||
walk += 2;
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stdout, "/* total size %d bytes */\n", pos);
|
||||
|
||||
for (n = 0;n < setmembers; n++) {
|
||||
free((void *)rset[n]);
|
||||
rset[n] = NULL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Add table
Reference in a new issue