From 582131202a479c1e678cffa11318022258be445c Mon Sep 17 00:00:00 2001 From: Hiltjo Posthuma Date: Fri, 14 Aug 2015 13:47:19 +0200 Subject: xml: separate reader context from parser also: - rename xmlparser_ prefix to xml_. - make xml_parse public, this allows a custom reader like a direct mmap, see: XMLParser.getnext and (optionally) XMLParser.getnext_data. - improve the README text. --- README.xml | 29 +++++++++++--- sfeed.c | 2 +- sfeed_opml_import.c | 2 +- sfeed_web.c | 2 +- sfeed_xmlenc.c | 2 +- xml.c | 110 ++++++++++++++++++++++++++++++++-------------------- xml.h | 21 ++-------- 7 files changed, 100 insertions(+), 68 deletions(-) diff --git a/README.xml b/README.xml index 809360a..dea2e02 100644 --- a/README.xml +++ b/README.xml @@ -5,7 +5,7 @@ XML parser Dependencies ------------ -- C compiler (C99) +- C compiler (C99). Features @@ -25,19 +25,21 @@ Supports - Short attributes without an explicity set value (). - Comments - CDATA sections. +- Helper function (xml_entitytostr) to convert XML 1.0 / HTML 2.0 named entities + and numeric entities to UTF-8. +- Reading XML from a fd, string buffer or implement a custom reader: + see: XMLParser.getnext and XMLParser.getnext_data. Caveats ------- -- Internally static buffers are used, callbacks like XMLParser.xmldata are +- Internally fixed-size buffers are used, callbacks like XMLParser.xmldata are called multiple times for the same tag if the data size is bigger than the internal buffer size (sizeof(XMLParser.data)). To differentiate between new calls for data you can use the xml*start and xml*end handlers. -- There is no table of (HTML / XML) named entities you should handle this with - the XMLParser.xmldataentity handler yourself. -- The XML is not checked for errors so it will continue parsing invalid XML - data, this is by design. +- The XML is not checked for errors so it will continue parsing XML data, this + is by design. 
Files used @@ -51,6 +53,20 @@ Interface / API Should be trivial, see xml.c and xml.h and the examples below. +The most minimal implementation to read and parse from fd 0 (stdin) is: + + #include "xml.h" + + static XMLParser x; + + int + main(void) + { + xml_parse_fd(&x, 0); /* xml_parse_string(&x, ""); */ + + return 0; + } + Examples -------- @@ -60,5 +76,6 @@ sfeed_opml_import.c or sfeed_web.c or sfeed_xmlenc.c License ------- + See LICENSE file. diff --git a/sfeed.c b/sfeed.c index b09d595..8526c23 100644 --- a/sfeed.c +++ b/sfeed.c @@ -737,7 +737,7 @@ main(int argc, char *argv[]) parser.xmltagstart = xml_handler_start_el; parser.xmltagstartparsed = xml_handler_start_el_parsed; - xmlparser_parse_fd(&parser, 0); + xml_parse_fd(&parser, 0); return 0; } diff --git a/sfeed_opml_import.c b/sfeed_opml_import.c index 383cbe8..6c3f505 100644 --- a/sfeed_opml_import.c +++ b/sfeed_opml_import.c @@ -87,7 +87,7 @@ main(void) "# list of feeds to fetch:\n" "feeds() {\n" " # feed [basesiteurl] [encoding]\n", stdout); - xmlparser_parse_fd(&parser, 0); + xml_parse_fd(&parser, 0); fputs("}\n", stdout); return 0; diff --git a/sfeed_web.c b/sfeed_web.c index 2d3c344..1a496ba 100644 --- a/sfeed_web.c +++ b/sfeed_web.c @@ -94,7 +94,7 @@ main(int argc, char *argv[]) parser.xmltagstart = xmltagstart; parser.xmltagstartparsed = xmltagstartparsed; - xmlparser_parse_fd(&parser, 0); + xml_parse_fd(&parser, 0); return found > 0 ? 
0: 1; } diff --git a/sfeed_xmlenc.c b/sfeed_xmlenc.c index 223951a..222c31a 100644 --- a/sfeed_xmlenc.c +++ b/sfeed_xmlenc.c @@ -60,7 +60,7 @@ main(void) parser.xmltagstart = xmltagstart; parser.xmltagend = xmltagend; - xmlparser_parse_fd(&parser, 0); + xml_parse_fd(&parser, 0); return 1; } diff --git a/xml.c b/xml.c index ad191de..8c54048 100644 --- a/xml.c +++ b/xml.c @@ -8,54 +8,75 @@ #include "xml.h" +struct xml_context_fd { + char buf[BUFSIZ]; + int readerrno; + int fd; + size_t nread; + size_t offset; +}; + +struct xml_context_string { + const char *str; +}; + +static int +xml_getnext_stdin(XMLParser *x) +{ + return getchar(); +} + static int -xmlparser_string_getnext(XMLParser *x) +xml_getnext_string(XMLParser *x) { - if (!*(x->str)) + struct xml_context_string *d = (struct xml_context_string *)x->getnext_data; + + if (!*(d->str)) return EOF; - return (int)*(x->str++); + return (int)*(d->str++); } static int /* like getc(), but do some smart buffering */ -xmlparser_fd_getnext(XMLParser *x) +xml_getnext_fd(XMLParser *x) { + struct xml_context_fd *d = (struct xml_context_fd *)x->getnext_data; ssize_t r; /* previous read error was set */ - if (x->readerrno) + if (d->readerrno) return EOF; - if (x->readoffset >= x->readlastbytes) { - x->readoffset = 0; + if (d->offset >= d->nread) { + d->offset = 0; again: - r = read(x->fd, x->readbuf, sizeof(x->readbuf)); + r = read(d->fd, d->buf, sizeof(d->buf)); if (r == -1) { if (errno == EINTR) goto again; - x->readerrno = errno; - x->readlastbytes = 0; + d->readerrno = errno; + d->nread = 0; return EOF; } else if (!r) { return EOF; } - x->readlastbytes = r; + d->nread = r; } - return (int)x->readbuf[x->readoffset++]; + return (int)d->buf[d->offset++]; } static int -xmlparser_getnext(XMLParser *x) +xml_getnext(XMLParser *x) { return x->getnext(x); } static void -xmlparser_parseattrs(XMLParser *x) +xml_parseattrs(XMLParser *x) { size_t namelen = 0, valuelen; int c, endsep, endname = 0; - while ((c = xmlparser_getnext(x)) != 
EOF) { + while ((c = xml_getnext(x)) != EOF) { if (isspace(c)) { /* TODO: simplify endname ? */ if (namelen) endname = 1; @@ -82,7 +103,7 @@ xmlparser_parseattrs(XMLParser *x) endsep = c; /* c is end separator */ if (x->xmlattrstart) x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); - for (valuelen = 0; (c = xmlparser_getnext(x)) != EOF;) { + for (valuelen = 0; (c = xml_getnext(x)) != EOF;) { if (c == '&') { /* entities */ x->data[valuelen] = '\0'; /* call data function with data before entity if there is data */ @@ -90,7 +111,7 @@ xmlparser_parseattrs(XMLParser *x) x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); x->data[0] = c; valuelen = 1; - while ((c = xmlparser_getnext(x)) != EOF) { + while ((c = xml_getnext(x)) != EOF) { if (c == endsep) break; if (valuelen < sizeof(x->data) - 1) @@ -147,7 +168,7 @@ xmlparser_parseattrs(XMLParser *x) } static void -xmlparser_parsecomment(XMLParser *x) +xml_parsecomment(XMLParser *x) { static const char *end = "-->"; size_t datalen = 0, i = 0; @@ -156,7 +177,7 @@ xmlparser_parsecomment(XMLParser *x) if (x->xmlcommentstart) x->xmlcommentstart(x); - while ((c = xmlparser_getnext(x)) != EOF) { + while ((c = xml_getnext(x)) != EOF) { if (c == end[i]) { if (end[++i] == '\0') { /* end */ x->data[datalen] = '\0'; @@ -191,7 +212,7 @@ xmlparser_parsecomment(XMLParser *x) } static void -xmlparser_parsecdata(XMLParser *x) +xml_parsecdata(XMLParser *x) { static const char *end = "]]>"; size_t datalen = 0, i = 0; @@ -200,7 +221,7 @@ xmlparser_parsecdata(XMLParser *x) if (x->xmlcdatastart) x->xmlcdatastart(x); - while ((c = xmlparser_getnext(x)) != EOF) { + while ((c = xml_getnext(x)) != EOF) { if (c == end[i]) { if (end[++i] == '\0') { /* end */ x->data[datalen] = '\0'; @@ -351,44 +372,44 @@ xml_entitytostr(const char *e, char *buf, size_t bufsiz) return xml_numericentitytostr(e, buf, bufsiz); } -static void -xmlparser_parse(XMLParser *x) +void +xml_parse(XMLParser *x) { int c, ispi; size_t datalen, 
tagdatalen, taglen; - while ((c = xmlparser_getnext(x)) != EOF && c != '<') + while ((c = xml_getnext(x)) != EOF && c != '<') ; /* skip until < */ while (c != EOF) { if (c == '<') { /* parse tag */ - if ((c = xmlparser_getnext(x)) == EOF) + if ((c = xml_getnext(x)) == EOF) return; x->tag[0] = '\0'; x->taglen = 0; if (c == '!') { /* cdata and comments */ - for (tagdatalen = 0; (c = xmlparser_getnext(x)) != EOF;) { + for (tagdatalen = 0; (c = xml_getnext(x)) != EOF;) { if (tagdatalen <= sizeof("[CDATA[") - 1) /* if (d < sizeof(x->data)) */ x->data[tagdatalen++] = c; /* TODO: prevent overflow */ if (c == '>') break; else if (c == '-' && tagdatalen == sizeof("--") - 1 && (x->data[0] == '-')) { /* comment */ - xmlparser_parsecomment(x); + xml_parsecomment(x); break; } else if (c == '[') { if (tagdatalen == sizeof("[CDATA[") - 1 && x->data[1] == 'C' && x->data[2] == 'D' && x->data[3] == 'A' && x->data[4] == 'T' && x->data[5] == 'A' && x->data[6] == '[') { /* CDATA */ - xmlparser_parsecdata(x); + xml_parsecdata(x); break; } } } } else { /* normal tag (open, short open, close), processing instruction. */ if (isspace(c)) - while ((c = xmlparser_getnext(x)) != EOF && isspace(c)) + while ((c = xml_getnext(x)) != EOF && isspace(c)) ; if (c == EOF) return; @@ -396,7 +417,7 @@ xmlparser_parse(XMLParser *x) ispi = (c == '?') ? 1 : 0; x->isshorttag = ispi; taglen = 1; - while ((c = xmlparser_getnext(x)) != EOF) { + while ((c = xml_getnext(x)) != EOF) { if (c == '/') /* TODO: simplify short tag? 
*/ x->isshorttag = 1; /* short tag */ else if (c == '>' || isspace(c)) { @@ -411,7 +432,7 @@ xmlparser_parse(XMLParser *x) if (x->xmltagstart) x->xmltagstart(x, x->tag, x->taglen); if (isspace(c)) - xmlparser_parseattrs(x); + xml_parseattrs(x); if (x->xmltagstartparsed) x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag); } @@ -428,7 +449,7 @@ xmlparser_parse(XMLParser *x) datalen = 0; if (x->xmldatastart) x->xmldatastart(x); - while ((c = xmlparser_getnext(x)) != EOF) { + while ((c = xml_getnext(x)) != EOF) { if (c == '&') { if (datalen) { x->data[datalen] = '\0'; @@ -437,7 +458,7 @@ xmlparser_parse(XMLParser *x) } x->data[0] = c; datalen = 1; - while ((c = xmlparser_getnext(x)) != EOF) { + while ((c = xml_getnext(x)) != EOF) { if (c == '<') break; if (datalen < sizeof(x->data) - 1) @@ -477,17 +498,24 @@ xmlparser_parse(XMLParser *x) } void -xmlparser_parse_string(XMLParser *x, const char *s) +xml_parse_string(XMLParser *x, const char *s) { - x->str = s; - x->getnext = xmlparser_string_getnext; - xmlparser_parse(x); + struct xml_context_string ctx = { .str = s }; + + x->getnext = xml_getnext_string; + x->getnext_data = (void *)&ctx; + xml_parse(x); } void -xmlparser_parse_fd(XMLParser *x, int fd) +xml_parse_fd(XMLParser *x, int fd) { - x->fd = fd; - x->getnext = xmlparser_fd_getnext; - xmlparser_parse(x); + struct xml_context_fd ctx; + + memset(&ctx, 0, sizeof(ctx)); + ctx.fd = fd; + + x->getnext = xml_getnext_fd; + x->getnext_data = (void *)&ctx; + xml_parse(x); } diff --git a/xml.h b/xml.h index 7604569..df63e23 100644 --- a/xml.h +++ b/xml.h @@ -24,16 +24,7 @@ typedef struct xmlparser { size_t, int); int (*getnext)(struct xmlparser *); - - /* for use with xmlparser_parse_fd */ - /* errno set from read(). 
*/ - int readerrno; - int fd; - - /* for use with "read" from string: xmlparser_parse_string */ - const char *str; - - /* private; internal state */ + void *getnext_data; /* custom data for getnext */ /* current tag */ char tag[1024]; @@ -44,11 +35,6 @@ typedef struct xmlparser { char name[256]; /* data buffer used for tag data, cdata and attribute data */ char data[BUFSIZ]; - - size_t readoffset; - size_t readlastbytes; - /* read buffer used by xmlparser_parse_fd */ - unsigned char readbuf[BUFSIZ]; } XMLParser; int xml_codepointtoutf8(uint32_t, uint32_t *); @@ -56,5 +42,6 @@ ssize_t xml_entitytostr(const char *, char *, size_t); ssize_t xml_namedentitytostr(const char *, char *, size_t); ssize_t xml_numericetitytostr(const char *, char *, size_t); -void xmlparser_parse_fd(XMLParser *, int); -void xmlparser_parse_string(XMLParser *, const char *); +void xml_parse(XMLParser *); +void xml_parse_fd(XMLParser *, int); +void xml_parse_string(XMLParser *, const char *); -- cgit v1.2.3