add initial README for xml parser

Signed-off-by: Hiltjo Posthuma <hiltjo@codemadness.org>
author: Hiltjo Posthuma <hiltjo@codemadness.org> 2014-06-27 15:40:21 +0200
committer: Hiltjo Posthuma <hiltjo@codemadness.org> 2014-06-27 15:40:21 +0200
commit: d0cb258e59ae59ddae3fb2a9de2420de9491849e (patch)
tree: 41aa4b52ef5624a88591c157d164a1b48be552a9 /README.xml
parent: e0da2d775ac34a06949c554c61d015d076890be6 (diff)
1 files changed, 130 insertions, 0 deletions
diff --git a/README.xml b/README.xml
new file mode 100644
index 0000000..2589f07
--- /dev/null
+++ b/README.xml
@@ -0,0 +1,130 @@
+XML parser
+==========
+
+
+Dependencies
+------------
+
+- C compiler (C99)
+
+
+Features
+--------
+
+- Relatively small parser suitable for embedded systems.
+- Pretty simple API comparable to libexpat.
+
+
+Supports
+--------
+
+- Tags in short-form (<img src="lolcat.jpg" title="Meow" />).
+- Attributes
+- Short attributes without an explicity set value (<input type="checkbox" checked />).
+  - Attribute entities.
+- Comments
+- CDATA sections.
+
+
+Caveats
+-------
+
+- Internally static buffers are used, callbacks like XMLParser.xmldata can be
+  called multiple times for the same tag if the data size is bigger than the
+  internal buffer size (sizeof(XMLParser.data)). To differentiate between this
+  you can use xml*start and xml*end.
+- No table of (HTML / XML) named entities you should handle this with the
+  XMLParser.xmldataentity callback yourself.
+- The XML is not checked for errors so it will happily continue parsing invalid
+  XML data, this is by design.
+
+
+Interface / API
+---------------
+
+Should be straightforward, see xml.h
+
+
+Files
+-----
+xml.c, xml.h
+
+
+Example (get RSS/Atom links from a webpage)
+-------------------------------------------
+
+#include <stdio.h>
+#include <string.h>
+#include <strings.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+#include "util.h"
+#include "xml.h"
+
+static unsigned int isbase = 0, islink = 0, isfeedlink = 0, found = 0;
+static char feedlink[4096] = "", basehref[4096] = "", feedtype[256] = "";
+
+static void
+xmltagstart(XMLParser *p, const char *tag, size_t taglen) {
+	isbase = islink = isfeedlink = 0;
+	if(taglen == 4) { /* optimization */
+		if(!strncasecmp(tag, "base", taglen))
+			isbase = 1;
+		else if(!strncasecmp(tag, "link", taglen))
+			islink = 1;
+	}
+}
+
+static void
+xmltagstartparsed(XMLParser *p, const char *tag, size_t taglen, int isshort) {
+	if(isfeedlink) {
+		if(*feedtype) {
+			fputs(feedtype, stdout);
+			putchar(' ');
+		}
+		printlink(feedlink, basehref, stdout); /* this is in util.h (program-specific) */
+		putchar('\n');
+		found++;
+	}
+}
+
+static void
+xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
+        size_t namelen, const char *value, size_t valuelen) {
+
+	if(namelen != 4) /* optimization */
+		return;
+	if(isbase) {
+		if(!strncasecmp(name, "href", namelen))
+			strlcpy(basehref, value, sizeof(basehref));
+	} else if(islink) {
+		if(!strncasecmp(name, "type", namelen)) {
+			if(!strncasecmp(value, "application/atom", strlen("application/atom")) ||
+			   !strncasecmp(value, "application/rss", strlen("application/rss"))) {
+				isfeedlink = 1;
+				strlcpy(feedtype, value, sizeof(feedtype));
+			}
+		} else if(!strncasecmp(name, "href", namelen))
+			strlcpy(feedlink, value, sizeof(feedlink));
+	}
+}
+
+int
+main(int argc, char **argv) {
+	XMLParser x;
+
+	/* base href */
+	if(argc > 1)
+		strlcpy(basehref, argv[1], sizeof(basehref));
+
+	xmlparser_init(&x, stdin);
+	x.xmltagstart = xmltagstart;
+	x.xmlattr = xmlattr;
+	x.xmltagstartparsed = xmltagstartparsed;
+
+	xmlparser_parse(&x);
+
+	return found > 0 ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
author	Hiltjo Posthuma <hiltjo@codemadness.org>	2014-06-27 15:40:21 +0200
committer	Hiltjo Posthuma <hiltjo@codemadness.org>	2014-06-27 15:40:21 +0200
commit	d0cb258e59ae59ddae3fb2a9de2420de9491849e (patch)
tree	41aa4b52ef5624a88591c157d164a1b48be552a9 /README.xml
parent	e0da2d775ac34a06949c554c61d015d076890be6 (diff)