/* * Functions converting HTSMSGs to/from XML * Copyright (C) 2008 Andreas �man * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /** * XML parser, written according to this spec: * * http://www.w3.org/TR/2006/REC-xml-20060816/ * * Parses of UTF-8 and ISO-8859-1 (Latin 1) encoded XML and output as * htsmsg's with UTF-8 encoded payloads * * Supports: Example: * * Comments * Processing Instructions * CDATA ]]> * Label references & * Character references A * Empty tags * * * Not supported: * * UTF-16 (mandatory by standard) * Intelligent parsing of * Entity declarations * */ #include #include #include #include #include #include #include "tvheadend.h" #include "htsmsg_xml.h" #include "htsbuf.h" TAILQ_HEAD(cdata_content_queue, cdata_content); LIST_HEAD(xmlns_list, xmlns); typedef struct xmlns { LIST_ENTRY(xmlns) xmlns_global_link; LIST_ENTRY(xmlns) xmlns_scope_link; char *xmlns_prefix; unsigned int xmlns_prefix_len; char *xmlns_norm; unsigned int xmlns_norm_len; } xmlns_t; typedef struct xmlparser { enum { XML_ENCODING_UTF8, XML_ENCODING_8859_1, } xp_encoding; char xp_errmsg[128]; int xp_srcdataused; struct xmlns_list xp_namespaces; } xmlparser_t; #define xmlerr(xp, fmt...) \ snprintf((xp)->xp_errmsg, sizeof((xp)->xp_errmsg), fmt) typedef struct cdata_content { TAILQ_ENTRY(cdata_content) cc_link; char *cc_start, *cc_end; /* end points to byte AFTER last char */ int cc_encoding; char cc_buf[0]; } cdata_content_t; static char *htsmsg_xml_parse_cd(xmlparser_t *xp, htsmsg_t *parent, char *src); /** * */ static void add_unicode(struct cdata_content_queue *ccq, int c) { cdata_content_t *cc; int r; cc = malloc(sizeof(cdata_content_t) + 6); r = put_utf8(cc->cc_buf, c); if(r == 0) { free(cc); return; } cc->cc_encoding = XML_ENCODING_UTF8; TAILQ_INSERT_TAIL(ccq, cc, cc_link); cc->cc_start = cc->cc_buf; cc->cc_end = cc->cc_buf + r; } /** * */ static int decode_character_reference(char **src) { int v = 0; char c; if(**src == 'x') { /* hexadecimal */ (*src)++; /* decimal */ while(1) { c = **src; switch(c) { case '0' ... '9': v = v * 0x10 + c - '0'; break; case 'a' ... 'f': v = v * 0x10 + c - 'a' + 10; break; case 'A' ... 'F': v = v * 0x10 + c - 'A' + 10; break; case ';': (*src)++; return v; default: return 0; } (*src)++; } } else { /* decimal */ while(1) { c = **src; switch(c) { case '0' ... '9': v = v * 10 + c - '0'; (*src)++; break; case ';': (*src)++; return v; default: return 0; } } } } /** * */ static inline int is_xmlws(char c) { return c > 0 && c <= 32; // return c == 32 || c == 9 || c = 10 || c = 13; } /** * */ static void xmlns_destroy(xmlns_t *ns) { LIST_REMOVE(ns, xmlns_global_link); LIST_REMOVE(ns, xmlns_scope_link); free(ns->xmlns_prefix); free(ns->xmlns_norm); free(ns); } /** * */ static char * htsmsg_xml_parse_attrib(xmlparser_t *xp, htsmsg_t *msg, char *src, struct xmlns_list *xmlns_scope_list) { char *attribname, *payload; int attriblen, payloadlen; char quote; htsmsg_field_t *f; xmlns_t *ns; attribname = src; /* Parse attribute name */ while(1) { if(*src == 0) { xmlerr(xp, "Unexpected end of file during attribute name parsing"); return NULL; } if(is_xmlws(*src) || *src == '=') break; src++; } attriblen = src - attribname; if(attriblen < 1 || attriblen > 65535) { xmlerr(xp, "Invalid attribute name"); return NULL; } while(is_xmlws(*src)) src++; if(*src != '=') { xmlerr(xp, "Expected '=' in attribute parsing"); return NULL; } src++; while(is_xmlws(*src)) src++; /* Parse attribute payload */ quote = *src++; if(quote != '"' && quote != '\'') { xmlerr(xp, "Expected ' or \" before attribute value"); return NULL; } payload = src; while(1) { if(*src == 0) { xmlerr(xp, "Unexpected end of file during attribute value parsing"); return NULL; } if(*src == quote) break; src++; } payloadlen = src - payload; if(payloadlen < 0 || payloadlen > 65535) { xmlerr(xp, "Invalid attribute value"); return NULL; } src++; while(is_xmlws(*src)) src++; if(xmlns_scope_list != NULL && attriblen > 6 && !memcmp(attribname, "xmlns:", 6)) { attribname += 6; attriblen -= 6; ns = malloc(sizeof(xmlns_t)); ns->xmlns_prefix = malloc(attriblen + 1); memcpy(ns->xmlns_prefix, attribname, attriblen); ns->xmlns_prefix[attriblen] = 0; ns->xmlns_prefix_len = attriblen; ns->xmlns_norm = malloc(payloadlen + 1); memcpy(ns->xmlns_norm, payload, payloadlen); ns->xmlns_norm[payloadlen] = 0; ns->xmlns_norm_len = payloadlen; LIST_INSERT_HEAD(&xp->xp_namespaces, ns, xmlns_global_link); LIST_INSERT_HEAD(xmlns_scope_list, ns, xmlns_scope_link); return src; } xp->xp_srcdataused = 1; attribname[attriblen] = 0; payload[payloadlen] = 0; f = htsmsg_field_add(msg, attribname, HMF_STR, 0); f->hmf_str = payload; return src; } /** * */ static char * htsmsg_xml_parse_tag(xmlparser_t *xp, htsmsg_t *parent, char *src) { htsmsg_t *m, *attrs; struct xmlns_list nslist; char *tagname; int taglen, empty = 0, i; xmlns_t *ns; tagname = src; LIST_INIT(&nslist); while(1) { if(*src == 0) { xmlerr(xp, "Unexpected end of file during tag name parsing"); return NULL; } if(is_xmlws(*src) || *src == '>' || *src == '/') break; src++; } taglen = src - tagname; if(taglen < 1 || taglen > 65535) { xmlerr(xp, "Invalid tag name"); return NULL; } attrs = htsmsg_create_map(); while(1) { while(is_xmlws(*src)) src++; if(*src == 0) { htsmsg_destroy(attrs); xmlerr(xp, "Unexpected end of file in tag"); return NULL; } if(src[0] == '/' && src[1] == '>') { empty = 1; src += 2; break; } if(*src == '>') { src++; break; } if((src = htsmsg_xml_parse_attrib(xp, attrs, src, &nslist)) == NULL) { htsmsg_destroy(attrs); return NULL; } } m = htsmsg_create_map(); if(TAILQ_FIRST(&attrs->hm_fields) != NULL) { htsmsg_add_msg_extname(m, "attrib", attrs); } else { htsmsg_destroy(attrs); } if(!empty) src = htsmsg_xml_parse_cd(xp, m, src); for(i = 0; i < taglen - 1; i++) { if(tagname[i] == ':') { LIST_FOREACH(ns, &xp->xp_namespaces, xmlns_global_link) { if(ns->xmlns_prefix_len == i && !memcmp(ns->xmlns_prefix, tagname, ns->xmlns_prefix_len)) { int llen = taglen - i - 1; char *n = malloc(ns->xmlns_norm_len + llen + 1); n[ns->xmlns_norm_len + llen] = 0; memcpy(n, ns->xmlns_norm, ns->xmlns_norm_len); memcpy(n + ns->xmlns_norm_len, tagname + i + 1, llen); htsmsg_add_msg(parent, n, m); free(n); goto done; } } } } xp->xp_srcdataused = 1; tagname[taglen] = 0; htsmsg_add_msg_extname(parent, tagname, m); done: while((ns = LIST_FIRST(&nslist)) != NULL) xmlns_destroy(ns); return src; } /** * */ static char * htsmsg_xml_parse_pi(xmlparser_t *xp, htsmsg_t *parent, char *src) { htsmsg_t *attrs; char *s = src; char *piname; int l; while(1) { if(*src == 0) { xmlerr(xp, "Unexpected end of file during parsing of " "Processing instructions"); return NULL; } if(is_xmlws(*src) || *src == '?') break; src++; } l = src - s; if(l < 1 || l > 65536) { xmlerr(xp, "Invalid 'Processing instructions' name"); return NULL; } piname = alloca(l + 1); memcpy(piname, s, l); piname[l] = 0; attrs = htsmsg_create_map(); while(1) { while(is_xmlws(*src)) src++; if(*src == 0) { htsmsg_destroy(attrs); xmlerr(xp, "Unexpected end of file during parsing of " "Processing instructions"); return NULL; } if(src[0] == '?' && src[1] == '>') { src += 2; break; } if((src = htsmsg_xml_parse_attrib(xp, attrs, src, NULL)) == NULL) { htsmsg_destroy(attrs); return NULL; } } if(TAILQ_FIRST(&attrs->hm_fields) != NULL && parent != NULL) { htsmsg_add_msg(parent, piname, attrs); } else { htsmsg_destroy(attrs); } return src; } /** * */ static char * xml_parse_comment(xmlparser_t *xp, char *src) { /* comment */ while(1) { if(*src == 0) { /* EOF inside comment is invalid */ xmlerr(xp, "Unexpected end of file inside a comment"); return NULL; } if(src[0] == '-' && src[1] == '-' && src[2] == '>') return src + 3; src++; } } /** * */ static char * decode_label_reference(xmlparser_t *xp, struct cdata_content_queue *ccq, char *src) { char *s = src; int l; char *label; while(*src != 0 && *src != ';') src++; if(*src == 0) { xmlerr(xp, "Unexpected end of file during parsing of label reference"); return NULL; } l = src - s; if(l < 1 || l > 65535) return NULL; label = alloca(l + 1); memcpy(label, s, l); label[l] = 0; src++; if(!strcmp(label, "amp")) add_unicode(ccq, '&'); else if(!strcmp(label, "gt")) add_unicode(ccq, '>'); else if(!strcmp(label, "lt")) add_unicode(ccq, '<'); else if(!strcmp(label, "apos")) add_unicode(ccq, '\''); else if(!strcmp(label, "quot")) add_unicode(ccq, '"'); else { xmlerr(xp, "Unknown label referense: \"&%s;\"\n", label); return NULL; } return src; } /** * */ static char * htsmsg_xml_parse_cd0(xmlparser_t *xp, struct cdata_content_queue *ccq, htsmsg_t *tags, htsmsg_t *pis, char *src, int raw) { cdata_content_t *cc = NULL; int c; while(src != NULL && *src != 0) { if(raw && src[0] == ']' && src[1] == ']' && src[2] == '>') { if(cc != NULL) cc->cc_end = src; cc = NULL; src += 3; break; } if(*src == '<' && !raw) { if(cc != NULL) cc->cc_end = src; cc = NULL; src++; if(*src == '?') { src++; src = htsmsg_xml_parse_pi(xp, pis, src); continue; } if(src[0] == '!') { src++; if(src[0] == '-' && src[1] == '-') { src = xml_parse_comment(xp, src + 2); continue; } if(!strncmp(src, "[CDATA[", 7)) { src += 7; src = htsmsg_xml_parse_cd0(xp, ccq, tags, pis, src, 1); continue; } xmlerr(xp, "Unknown syntatic element: ') { if(*src == 0) { /* EOF inside endtag */ xmlerr(xp, "Unexpected end of file inside close tag"); return NULL; } src++; } src++; break; } src = htsmsg_xml_parse_tag(xp, tags, src); continue; } if(*src == '&' && !raw) { if(cc != NULL) cc->cc_end = src; cc = NULL; src++; if(*src == '#') { src++; /* Character reference */ if((c = decode_character_reference(&src)) != 0) add_unicode(ccq, c); else { xmlerr(xp, "Invalid character reference"); return NULL; } } else { /* Label references */ src = decode_label_reference(xp, ccq, src); } continue; } if(cc == NULL) { if(*src < 32) { src++; continue; } cc = malloc(sizeof(cdata_content_t)); cc->cc_encoding = xp->xp_encoding; TAILQ_INSERT_TAIL(ccq, cc, cc_link); cc->cc_start = src; } src++; } if(cc != NULL) { assert(src != NULL); cc->cc_end = src; } return src; } /** * */ static char * htsmsg_xml_parse_cd(xmlparser_t *xp, htsmsg_t *parent, char *src) { struct cdata_content_queue ccq; htsmsg_field_t *f; cdata_content_t *cc; int c = 0, l, y = 0; char *x, *body; htsmsg_t *tags = htsmsg_create_map(); TAILQ_INIT(&ccq); src = htsmsg_xml_parse_cd0(xp, &ccq, tags, NULL, src, 0); /* Assemble body */ TAILQ_FOREACH(cc, &ccq, cc_link) { switch(cc->cc_encoding) { case XML_ENCODING_UTF8: c += cc->cc_end - cc->cc_start; y++; break; case XML_ENCODING_8859_1: l = 0; for(x = cc->cc_start; x < cc->cc_end; x++) l += 1 + (*x >= 0x80); c += l; y += 1 + (l != cc->cc_end - cc->cc_start); break; } } if(y == 1 && c > 0) { /* One segment UTF-8 (or 7bit ASCII), use data directly from source */ cc = TAILQ_FIRST(&ccq); assert(cc != NULL); assert(TAILQ_NEXT(cc, cc_link) == NULL); f = htsmsg_field_add(parent, "cdata", HMF_STR, 0); f->hmf_str = cc->cc_start; *cc->cc_end = 0; free(cc); } else if(c > 0) { body = malloc(c + 1); c = 0; while((cc = TAILQ_FIRST(&ccq)) != NULL) { switch(cc->cc_encoding) { case XML_ENCODING_UTF8: l = cc->cc_end - cc->cc_start; memcpy(body + c, cc->cc_start, l); c += l; break; case XML_ENCODING_8859_1: for(x = cc->cc_start; x < cc->cc_end; x++) c += put_utf8(body + c, *x); break; } TAILQ_REMOVE(&ccq, cc, cc_link); free(cc); } body[c] = 0; f = htsmsg_field_add(parent, "cdata", HMF_STR, HMF_ALLOCED); f->hmf_str = body; } else { while((cc = TAILQ_FIRST(&ccq)) != NULL) { TAILQ_REMOVE(&ccq, cc, cc_link); free(cc); } } if(src == NULL) { htsmsg_destroy(tags); return NULL; } if(TAILQ_FIRST(&tags->hm_fields) != NULL) { htsmsg_add_msg_extname(parent, "tags", tags); } else { htsmsg_destroy(tags); } return src; } /** * */ static char * htsmsg_parse_prolog(xmlparser_t *xp, char *src) { htsmsg_t *pis = htsmsg_create_map(); htsmsg_t *xmlpi; const char *encoding; while(1) { if(*src == 0) break; while(is_xmlws(*src)) src++; if(!strncmp(src, "') { src++; break; } src++; } continue; } break; } if((xmlpi = htsmsg_get_map(pis, "xml")) != NULL) { if((encoding = htsmsg_get_str(xmlpi, "encoding")) != NULL) { if(!strcasecmp(encoding, "iso-8859-1") || !strcasecmp(encoding, "iso-8859_1") || !strcasecmp(encoding, "iso_8859-1") || !strcasecmp(encoding, "iso_8859_1")) { xp->xp_encoding = XML_ENCODING_8859_1; } } } htsmsg_destroy(pis); return src; } /** * */ htsmsg_t * htsmsg_xml_deserialize(char *src, char *errbuf, size_t errbufsize) { htsmsg_t *m; xmlparser_t xp; char *src0 = src; int i; xp.xp_errmsg[0] = 0; xp.xp_encoding = XML_ENCODING_UTF8; LIST_INIT(&xp.xp_namespaces); if((src = htsmsg_parse_prolog(&xp, src)) == NULL) goto err; m = htsmsg_create_map(); if(htsmsg_xml_parse_cd(&xp, m, src) == NULL) { htsmsg_destroy(m); goto err; } if(xp.xp_srcdataused) { m->hm_data = src0; } else { free(src0); } return m; err: free(src0); snprintf(errbuf, errbufsize, "%s", xp.xp_errmsg); /* Remove any odd chars inside of errmsg */ for(i = 0; i < errbufsize; i++) { if(errbuf[i] < 32) { errbuf[i] = 0; break; } } return NULL; } /* * Get cdata string field */ const char * htsmsg_xml_get_cdata_str(htsmsg_t *tags, const char *name) { htsmsg_t *sub; if((sub = htsmsg_get_map(tags, name)) == NULL) return NULL; return htsmsg_get_str(sub, "cdata"); } /* * Get cdata u32 field */ int htsmsg_xml_get_cdata_u32(htsmsg_t *tags, const char *name, uint32_t *u32) { htsmsg_t *sub; if((sub = htsmsg_get_map(tags, name)) == NULL) return HTSMSG_ERR_FIELD_NOT_FOUND; return htsmsg_get_u32(sub, "cdata", u32); } /* * Get tag attribute */ const char * htsmsg_xml_get_attr_str ( htsmsg_t *tag, const char *name ) { htsmsg_t *attr = htsmsg_get_map(tag, "attrib"); if (attr) return htsmsg_get_str(tag, name); return NULL; } int htsmsg_xml_get_attr_u32 ( htsmsg_t *tag, const char *name, uint32_t *ret ) { htsmsg_t *attr = htsmsg_get_map(tag, "attrib"); if (attr) return htsmsg_get_u32(attr, name, ret); return HTSMSG_ERR_FIELD_NOT_FOUND; }