XML parser ========== Dependencies ------------ - C compiler (C99) Features -------- - Relatively small parser. - Pretty simple API comparable with libexpat. Supports -------- - Tags in short-form (<img src="image.png"/>). - Attributes. - Short attributes without an explicitly set value (<input type="checkbox" checked/>). - Attribute entities. - Comments. - CDATA sections. Caveats ------- - Internally, static buffers are used. Callbacks like XMLParser.xmldata are called multiple times for the same tag if the data size is bigger than the internal buffer size (sizeof(XMLParser.data)). To tell these calls apart, use the xml*start and xml*end callbacks. - If xmlattrentity or xmldataentity is NULL, the data is passed to xmlattr and xmldata respectively. - There is no table of (HTML / XML) named entities; handle these yourself with the XMLParser.xmldataentity callback. - The XML is not checked for errors, so the parser will happily continue parsing invalid XML data; this is by design. Interface / API --------------- Should be straightforward; see xml.{c,h} and the example below. 
Files ----- xml.c, xml.h Example (get RSS/Atom links from a webpage) ------------------------------------------- #include #include #include #include #include #include "util.h" #include "xml.h" static unsigned int isbase = 0, islink = 0, isfeedlink = 0, found = 0; static char feedlink[4096] = "", basehref[4096] = "", feedtype[256] = ""; static void xmltagstart(XMLParser *p, const char *tag, size_t taglen) { isbase = islink = isfeedlink = 0; if(taglen == 4) { /* optimization */ if(!strncasecmp(tag, "base", taglen)) isbase = 1; else if(!strncasecmp(tag, "link", taglen)) islink = 1; } } static void xmltagstartparsed(XMLParser *p, const char *tag, size_t taglen, int isshort) { if(isfeedlink) { if(*feedtype) { fputs(feedtype, stdout); putchar(' '); } printlink(feedlink, basehref, stdout); /* this is in util.h (program-specific) */ putchar('\n'); found++; } } static void xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name, size_t namelen, const char *value, size_t valuelen) { if(namelen != 4) /* optimization */ return; if(isbase) { if(!strncasecmp(name, "href", namelen)) strlcpy(basehref, value, sizeof(basehref)); } else if(islink) { if(!strncasecmp(name, "type", namelen)) { if(!strncasecmp(value, "application/atom", strlen("application/atom")) || !strncasecmp(value, "application/rss", strlen("application/rss"))) { isfeedlink = 1; strlcpy(feedtype, value, sizeof(feedtype)); } } else if(!strncasecmp(name, "href", namelen)) strlcpy(feedlink, value, sizeof(feedlink)); } } int main(int argc, char **argv) { XMLParser x; /* base href */ if(argc > 1) strlcpy(basehref, argv[1], sizeof(basehref)); xmlparser_init(&x, stdin); x.xmltagstart = xmltagstart; x.xmlattr = xmlattr; x.xmltagstartparsed = xmltagstartparsed; xmlparser_parse(&x); return found > 0 ? 0 : 1; }