From b6fba247a348013d67f5c1a5bb8ff9e7b44dbd3d Mon Sep 17 00:00:00 2001
From: Richard Aas
Date: Thu, 15 Oct 2015 06:49:07 +0000
Subject: [PATCH] fmt: added utf8 encode/decode functions

---
 include/re_fmt.h  |   5 ++
 src/fmt/mod.mk    |   1 +
 src/fmt/unicode.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 187 insertions(+)
 create mode 100644 src/fmt/unicode.c

diff --git a/include/re_fmt.h b/include/re_fmt.h
index 2f942ff..440bd58 100644
--- a/include/re_fmt.h
+++ b/include/re_fmt.h
@@ -139,3 +139,8 @@ typedef void (fmt_param_h)(const struct pl *name, const struct pl *val,
 bool fmt_param_exists(const struct pl *pl, const char *pname);
 bool fmt_param_get(const struct pl *pl, const char *pname, struct pl *val);
 void fmt_param_apply(const struct pl *pl, fmt_param_h *ph, void *arg);
+
+
+/* unicode */
+int utf8_encode(struct re_printf *pf, const char *str);
+int utf8_decode(struct re_printf *pf, const struct pl *pl);
diff --git a/src/fmt/mod.mk b/src/fmt/mod.mk
index 92b5650..04d1736 100644
--- a/src/fmt/mod.mk
+++ b/src/fmt/mod.mk
@@ -13,3 +13,4 @@ SRCS += fmt/regex.c
 SRCS += fmt/str.c
 SRCS += fmt/str_error.c
 SRCS += fmt/time.c
+SRCS += fmt/unicode.c
diff --git a/src/fmt/unicode.c b/src/fmt/unicode.c
new file mode 100644
index 0000000..5edb39c
--- /dev/null
+++ b/src/fmt/unicode.c
@@ -0,0 +1,181 @@
+/**
+ * @file unicode.c  Unicode character coding
+ *
+ * Copyright (C) 2010 Creytiv.com
+ */
+#include <ctype.h>
+#include <re_types.h>
+#include <re_fmt.h>
+
+
+static const char *hex_chars = "0123456789ABCDEF";
+
+
+/**
+ * UTF-8 encode
+ *
+ * Writes the input string to the print function with backslash escapes.
+ * Double quote, backslash and slash get a backslash prefix; backspace,
+ * formfeed, newline, carriage return and tab become \b \f \n \r \t; any
+ * other byte below 0x20 becomes \u00XX (uppercase hex).  All remaining
+ * bytes, including 0x80-0xff, are written unchanged.
+ *
+ * @param pf  Print function for output
+ * @param str Input string to encode
+ *
+ * @return 0 if success, otherwise errorcode
+ */
+int utf8_encode(struct re_printf *pf, const char *str)
+{
+	char ubuf[6] = "\\u00", ebuf[2] = "\\";
+
+	if (!pf)
+		return EINVAL;
+
+	if (!str)
+		return 0;
+
+	while (*str) {
+		const uint8_t c = *str++;  /* NOTE: must be unsigned 8-bit */
+		bool unicode = false;
+		char ec = 0;
+		int err;
+
+		switch (c) {
+
+		case '"':  ec = '"';  break;
+		case '\\': ec = '\\'; break;
+		case '/':  ec = '/';  break;
+		case '\b': ec = 'b';  break;
+		case '\f': ec = 'f';  break;
+		case '\n': ec = 'n';  break;
+		case '\r': ec = 'r';  break;
+		case '\t': ec = 't';  break;
+		default:
+			if (c < ' ') {
+				unicode = true;
+			}
+			/* chars in range 0x80-0xff are not escaped */
+			break;
+		}
+
+		if (unicode) {
+			ubuf[4] = hex_chars[(c>>4) & 0xf];
+			ubuf[5] = hex_chars[c & 0xf];
+
+			err = pf->vph(ubuf, sizeof(ubuf), pf->arg);
+		}
+		else if (ec) {
+			ebuf[1] = ec;
+
+			err = pf->vph(ebuf, sizeof(ebuf), pf->arg);
+		}
+		else {
+			err = pf->vph((char *)&c, 1, pf->arg);
+		}
+
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+
+/**
+ * UTF-8 decode
+ *
+ * Reverses the backslash escapes in the input buffer.  The escapes \b \f
+ * \n \r \t map to their control characters and any other escaped
+ * character is written as-is.  A \uXXXX escape is written as one byte
+ * when its value is at most 255, otherwise as two raw bytes with the
+ * high byte first (it is not converted to a UTF-8 sequence).
+ *
+ * @param pf Print function for output
+ * @param pl Input buffer to decode
+ *
+ * @return 0 if success, otherwise errorcode
+ */
+int utf8_decode(struct re_printf *pf, const struct pl *pl)
+{
+	size_t i;
+
+	if (!pf)
+		return EINVAL;
+
+	if (!pl)
+		return 0;
+
+	for (i=0; i<pl->l; i++) {
+
+		char ch = pl->p[i];
+		int err;
+
+		if (ch == '\\') {
+
+			uint16_t u = 0;
+
+			++i;
+
+			if (i >= pl->l)
+				return EBADMSG;
+
+			ch = pl->p[i];
+
+			switch (ch) {
+
+			case 'b':
+				ch = '\b';
+				break;
+
+			case 'f':
+				ch = '\f';
+				break;
+
+			case 'n':
+				ch = '\n';
+				break;
+
+			case 'r':
+				ch = '\r';
+				break;
+
+			case 't':
+				ch = '\t';
+				break;
+
+			case 'u':
+				if (i+4 >= pl->l)
+					return EBADMSG;
+
+				/* ctype needs unsigned char values */
+				if (!isxdigit((unsigned char)pl->p[i+1]) ||
+				    !isxdigit((unsigned char)pl->p[i+2]) ||
+				    !isxdigit((unsigned char)pl->p[i+3]) ||
+				    !isxdigit((unsigned char)pl->p[i+4]))
+					return EBADMSG;
+
+				u |= ((uint16_t)ch_hex(pl->p[++i])) << 12;
+				u |= ((uint16_t)ch_hex(pl->p[++i])) << 8;
+				u |= ((uint16_t)ch_hex(pl->p[++i])) << 4;
+				u |= ((uint16_t)ch_hex(pl->p[++i])) << 0;
+
+				if (u > 255) {
+					ch = u>>8;
+					err = pf->vph(&ch, 1, pf->arg);
+					if (err)
+						return err;
+				}
+
+				ch = u & 0xff;
+				break;
+			}
+		}
+
+		err = pf->vph(&ch, 1, pf->arg);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}