diff options
author | Hiltjo Posthuma <hiltjo@codemadness.org> | 2014-06-27 15:40:21 +0200 |
---|---|---|
committer | Hiltjo Posthuma <hiltjo@codemadness.org> | 2014-06-27 15:40:21 +0200 |
commit | d0cb258e59ae59ddae3fb2a9de2420de9491849e (patch) | |
tree | 41aa4b52ef5624a88591c157d164a1b48be552a9 | |
parent | e0da2d775ac34a06949c554c61d015d076890be6 (diff) |
add initial README for xml parser
Signed-off-by: Hiltjo Posthuma <hiltjo@codemadness.org>
-rw-r--r-- | README.xml | 130 |
1 files changed, 130 insertions, 0 deletions
diff --git a/README.xml b/README.xml new file mode 100644 index 0000000..2589f07 --- /dev/null +++ b/README.xml @@ -0,0 +1,130 @@ +XML parser +========== + + +Dependencies +------------ + +- C compiler (C99) + + +Features +-------- + +- Relatively small parser suitable for embedded systems. +- Pretty simple API comparable to libexpat. + + +Supports +-------- + +- Tags in short-form (<img src="lolcat.jpg" title="Meow" />). +- Attributes +- Short attributes without an explicitly set value (<input type="checkbox" checked />). + - Attribute entities. +- Comments +- CDATA sections. + + +Caveats +------- + +- Internally static buffers are used, callbacks like XMLParser.xmldata can be + called multiple times for the same tag if the data size is bigger than the + internal buffer size (sizeof(XMLParser.data)). To differentiate between these + calls you can use xml*start and xml*end. +- There is no table of (HTML / XML) named entities; you should handle these + yourself with the XMLParser.xmldataentity callback. +- The XML is not checked for errors so it will happily continue parsing invalid + XML data; this is by design. 
+ + +Interface / API +--------------- + +Should be straightforward, see xml.h + + +Files +----- +xml.c, xml.h + + +Example (get RSS/Atom links from a webpage) +------------------------------------------- + +#include <stdio.h> +#include <string.h> +#include <strings.h> +#include <stdlib.h> +#include <ctype.h> + +#include "util.h" +#include "xml.h" + +static unsigned int isbase = 0, islink = 0, isfeedlink = 0, found = 0; +static char feedlink[4096] = "", basehref[4096] = "", feedtype[256] = ""; + +static void +xmltagstart(XMLParser *p, const char *tag, size_t taglen) { + isbase = islink = isfeedlink = 0; + if(taglen == 4) { /* optimization */ + if(!strncasecmp(tag, "base", taglen)) + isbase = 1; + else if(!strncasecmp(tag, "link", taglen)) + islink = 1; + } +} + +static void +xmltagstartparsed(XMLParser *p, const char *tag, size_t taglen, int isshort) { + if(isfeedlink) { + if(*feedtype) { + fputs(feedtype, stdout); + putchar(' '); + } + printlink(feedlink, basehref, stdout); /* this is in util.h (program-specific) */ + putchar('\n'); + found++; + } +} + +static void +xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name, + size_t namelen, const char *value, size_t valuelen) { + + if(namelen != 4) /* optimization */ + return; + if(isbase) { + if(!strncasecmp(name, "href", namelen)) + strlcpy(basehref, value, sizeof(basehref)); + } else if(islink) { + if(!strncasecmp(name, "type", namelen)) { + if(!strncasecmp(value, "application/atom", strlen("application/atom")) || + !strncasecmp(value, "application/rss", strlen("application/rss"))) { + isfeedlink = 1; + strlcpy(feedtype, value, sizeof(feedtype)); + } + } else if(!strncasecmp(name, "href", namelen)) + strlcpy(feedlink, value, sizeof(feedlink)); + } +} + +int +main(int argc, char **argv) { + XMLParser x; + + /* base href */ + if(argc > 1) + strlcpy(basehref, argv[1], sizeof(basehref)); + + xmlparser_init(&x, stdin); + x.xmltagstart = xmltagstart; + x.xmlattr = xmlattr; + x.xmltagstartparsed = 
xmltagstartparsed; + + xmlparser_parse(&x); + + return found > 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} + |