sfeed.c: lots of changes, see CHANGELOG, will separate commits better in the future after next version release

Signed-off-by: Hiltjo Posthuma <hiltjo@codemadness.org>
author: Hiltjo Posthuma <hiltjo@codemadness.org> 2013-05-20 19:34:07 +0200
committer: Hiltjo Posthuma <hiltjo@codemadness.org> 2013-05-20 19:34:07 +0200
commit: 97d0866d07ce25b157d1cfb79940e2a6e1a57b33 (patch)
tree: ea4c43abe4b9b87d21a47f00d845bbd979a50a66
parent: 56c0c0dc73096bb5d08bd93f8f2644225a551524 (diff)
1 files changed, 546 insertions, 276 deletions
diff --git a/sfeed.c b/sfeed.c
index e3f77a6..b156b30 100644
--- a/sfeed.c
+++ b/sfeed.c
@@ -1,15 +1,15 @@
 #include <stdio.h>
 #include <stdlib.h>
-#include <strings.h>
 #include <string.h>
 #include <time.h>
 #include <ctype.h>
-#include <expat.h> /* libexpat */
+#include "xml.h"
+#include "compat.h"
 
-enum { FeedTypeNone = 0, FeedTypeRSS = 1, FeedTypeAtom = 2, FeedTypeLast = 3 };
+enum { FeedTypeNone = 0, FeedTypeRSS = 1, FeedTypeAtom = 2 };
 const char *feedtypes[] = {	"", "rss", "atom" };
 
-enum { ContentTypeNone = 0, ContentTypePlain = 1, ContentTypeHTML = 2, ContentTypeLast = 3 };
+enum { ContentTypeNone = 0, ContentTypePlain = 1, ContentTypeHTML = 2 };
 const char *contenttypes[] = { "", "plain", "html" };
 
 typedef struct string { /* String data / pool */
@@ -32,13 +32,112 @@ typedef struct feeditem { /* Feed item */
 void die(const char *s);
 void cleanup(void);
 
+String *currentfield = NULL; /* TODO */
 const int FieldSeparator = '\t';
 FeedItem feeditem; /* data for current feed item */
-char tag[1024]; /* current XML tag being parsed. */
-char feeditemtag[1024]; /* current tag _inside_ a feeditem */
-XML_Parser parser; /* expat XML parser state */
-int incdata = 0;
+char feeditemtag[256] = ""; /* current tag _inside_ a feeditem */
+size_t feeditemtaglen = 0;
+int feeditemtagid = 0;
+int iscontent = 0;
+int iscontenttag = 0;
+size_t attrcount = 0;
 char *standardtz = NULL; /* TZ variable at start of program */
+XMLParser parser; /* XML parser state */
+
+enum {
+	TagUnknown = 0,
+	/* RSS */
+	RSSTagDcdate, RSSTagPubdate, RSSTagTitle,
+	RSSTagLink, RSSTagDescription, RSSTagContentencoded,
+	RSSTagGuid, RSSTagAuthor, RSSTagDccreator,
+	/* Atom */
+	AtomTagPublished, AtomTagUpdated, AtomTagTitle,
+	AtomTagSummary, AtomTagContent,
+	AtomTagId, AtomTagLink, AtomTagAuthor
+};
+
+typedef struct feedtag { /* entry of a tag name lookup table, see gettag() */
+	char *name; /* tag name, NULL marks the end of a table */
+	size_t namelen; /* length of name in bytes */
+	int id; /* Tag* enum value for this name */
+} FeedTag;
+
+/* Look up the Tag* id of a tag name for the given feed type. The name has to
+ * match a table entry completely (compared with xstrncasecmp()), a known name
+ * followed by more characters is no match. Returns TagUnknown if not found. */
+int
+gettag(int feedtype, const char *name, size_t namelen) {
+	/* RSS, alphabetical order */
+	static FeedTag rsstag[] = {
+		{ "author", 6, RSSTagAuthor },
+		{ "content:encoded", 15, RSSTagContentencoded },
+		{ "dc:creator", 10, RSSTagDccreator },
+		{ "dc:date", 7, RSSTagDcdate },
+		{ "description", 11, RSSTagDescription },
+		{ "guid", 4, RSSTagGuid },
+		{ "link", 4, RSSTagLink },
+		{ "pubdate", 7, RSSTagPubdate },
+		{ "title", 5, RSSTagTitle },
+		{ NULL, 0, -1 }
+	};
+	/* Atom, alphabetical order */
+	static FeedTag atomtag[] = {
+		{ "author", 6, AtomTagAuthor }, /* assume this is: <author><name></name></author> */
+		{ "content", 7, AtomTagContent },
+		{ "id", 2, AtomTagId },
+		{ "link", 4, AtomTagLink },
+		{ "published", 9, AtomTagPublished },
+		{ "summary", 7, AtomTagSummary },
+		{ "title", 5, AtomTagTitle },
+		{ "updated", 7, AtomTagUpdated },
+		{ NULL, 0, -1 }
+	};
+	FeedTag *tags = NULL;
+	int i, n;
+
+	if(feedtype == FeedTypeRSS)
+		tags = rsstag;
+	else if(feedtype == FeedTypeAtom)
+		tags = atomtag;
+	/* all known names are 2 to 15 bytes long */
+	if(!tags || namelen < 2 || namelen > 15)
+		return TagUnknown;
+	for(i = 0; tags[i].name; i++) {
+		n = xstrncasecmp(tags[i].name, name, tags[i].namelen);
+		/* a prefix-only match (e.g. "linkfoo" for "link") is not the tag */
+		if(!n && tags[i].namelen == namelen)
+			return tags[i].id;
+		/* optimization: it's sorted so nothing after it matches. */
+		if(n > 0)
+			return TagUnknown;
+	}
+	return TagUnknown;
+}
+
+int
+entitytostr(const char *e, char *buffer, size_t bufsiz) {
+	/* store the character for entity e (such as "&amp;") in buffer. TODO: optimize lookup? */
+	char *entities[6][2] = {
+		{ "&lt;", "<" },
+		{ "&gt;", ">" },
+		{ "&apos;", "'" },
+		{ "&amp;", "&" },
+		{ "&quot;", "\"" },
+		{ NULL, NULL }
+	};
+	size_t i;
+	if(*e != '&' || bufsiz < 2) /* doesnt start with & or no room for a character and '\0' */
+		return 0;
+	for(i = 0; entities[i][0]; i++) {
+		/* NOTE: compares at most 6 chars, the length of the longest entity name */
+		if(!xstrncasecmp(e, entities[i][0], 6)) {
+			buffer[0] = *(entities[i][1]);
+			buffer[1] = '\0';
+			return 1; /* converted, buffer holds one character */
+		}
+	}
+	return 0; /* unknown entity, buffer is left untouched */
+}
 
 void
 string_clear(String *s) {
@@ -68,23 +167,24 @@ int
 string_buffer_expand(String *s, size_t newlen) {
 	char *p;
 	size_t alloclen;
-	/* check if allocation is necesary, dont shrink buffer */
-	if(!s->data || (newlen > s->bufsiz)) {
-		/* should be more than bufsiz ofcourse */
-		for(alloclen = 16; alloclen <= newlen; alloclen *= 2);
-		if(!(p = realloc(s->data, alloclen))) {
-			string_free(s); /* free previous allocation */
-			die("can't allocate enough memory");
-		}
-		s->bufsiz = alloclen;
-		s->data = p;
+	/* always reallocates: the new size is the first of 16, 32, 64, ...
+	   that is bigger than newlen, the caller decides if growing is needed */
+	for(alloclen = 16; alloclen <= newlen; alloclen *= 2);
+	if(!(p = realloc(s->data, alloclen))) {
+		string_free(s); /* free previous allocation */
+		die("can't allocate enough memory");
 	}
+	s->bufsiz = alloclen;
+	s->data = p;
 	return s->bufsiz;
 }
 
 void
 string_append(String *s, const char *data, size_t len) {
-	string_buffer_expand(s, s->len + len);
+	if(!len || *data == '\0') /* nothing to append */
+		return;
+	if(s->len + len >= s->bufsiz) /* grow, keep 1 byte free for the '\0' below */
+		string_buffer_expand(s, s->len + len);
 	memcpy(s->data + s->len, data, len);
 	s->len += len;
 	s->data[s->len] = '\0';
@@ -92,7 +192,6 @@ string_append(String *s, const char *data, size_t len) {
 
 void /* cleanup parser, free allocated memory, etc */
 cleanup(void) {
-	XML_ParserFree(parser);
 	string_free(&feeditem.timestamp);
 	string_free(&feeditem.title);
 	string_free(&feeditem.link);
@@ -106,114 +205,194 @@ die(const char *s) {
 	fputs("sfeed: ", stderr);
 	fputs(s, stderr);
 	fputc('\n', stderr);
-	cleanup();
 	exit(EXIT_FAILURE);
 }
 
-void
+/* parse the timezone in s: write its name to buf (e.g. "GMT+0200") and return
+ * the offset to GMT in seconds (negative for '-'), 0 if none or on error */
+int
 gettimetz(const char *s, char *buf, size_t bufsiz) {
 	const char *p = s;
+	char tzname[16] = "", *t = NULL;
 	int tzhour = 0, tzmin = 0;
-	char tzname[128] = "", *t = NULL;
 	unsigned int i;
+	char c;
 
 	buf[0] = '\0';
-	for(; *p && isspace(*p); p++); /* skip whitespace */
-	/* detect time offset, assume time offset isn't specified in the first 18 characters */
-	for(i = 0; *p && ((*p != '+' && *p != '-') || i <= 18); p++, i++);
-
-	if(isalpha(*p)) {
+	if(bufsiz < sizeof(tzname) + 7)
+		return 0;
+	for(; *p && isspace((unsigned char)*p); p++); /* skip whitespace */
+	/* loop until some common timezone delimiters are found */
+	for(;*p && (*p != '+' && *p != '-' && *p != 'Z' && *p != 'z'); p++);
+
+	/* TODO: cleanup / simplify */
+	if(isalpha((unsigned char)*p)) {
 		if(*p == 'Z' || *p == 'z') {
-			strncpy(buf, "GMT+00:00", bufsiz);
-			return;
+			memcpy(buf, "GMT+00:00", strlen("GMT+00:00") + 1);
+			return 0;
 		} else {
-			for(i = 0, t = &tzname[0]; i < (sizeof(tzname) - 1) && (*p && isalpha(*p)); i++)
+			for(i = 0, t = &tzname[0]; i < (sizeof(tzname) - 1) &&
+				(*p && isalpha((unsigned char)*p)); i++)
 				*(t++) = *(p++);
 			*t = '\0';
 		}
 	} else
-		strncpy(tzname, "GMT", sizeof(tzname) - 1);
-	if(!(*p)) {
+		memcpy(tzname, "GMT", strlen("GMT") + 1);
+	if(!(*p)) {
 		strncpy(buf, tzname, bufsiz);
-		return;
+		return 0;
 	}
-	/* NOTE: reverses time offsets for TZ */
-	if((sscanf(p, "+%02d:%02d", &tzhour, &tzmin)) > 0)
-		snprintf(buf, bufsiz, "%s-%02d:%02d", tzname, tzhour, tzmin);
-	else if((sscanf(p, "-%02d:%02d", &tzhour, &tzmin)) > 0)
-		snprintf(buf, bufsiz, "%s+%02d:%02d", tzname, tzhour, tzmin);
-	else if((sscanf(p, "+%02d%02d", &tzhour, &tzmin)) > 0)
-		snprintf(buf, bufsiz, "%s-%02d:%02d", tzname, tzhour, tzmin);
-	else if((sscanf(p, "-%02d%02d", &tzhour, &tzmin)) > 0)
-		snprintf(buf, bufsiz, "%s+%02d:%02d", tzname, tzhour, tzmin);
-	else if(sscanf(p, "+%d", &tzhour) > 0)
-		snprintf(buf, bufsiz, "%s-%02d:00", tzname, tzhour);
-	else if(sscanf(p, "-%d", &tzhour) > 0)
-		snprintf(buf, bufsiz, "%s+%02d:00", tzname, tzhour);
+	if(sscanf(p, "%c%02d:%02d", &c, &tzhour, &tzmin) == 3); /* "+hh:mm" */
+	else if(sscanf(p, "%c%02d%02d", &c, &tzhour, &tzmin) == 3); /* "+hhmm" */
+	else if(sscanf(p, "%c%d", &c, &tzhour) > 0) /* hours only, or just a sign */
+		tzmin = 0;
+	snprintf(buf, bufsiz, "%s%c%02d%02d", tzname, c, tzhour, tzmin);
+	/* the sign applies to the whole offset, not only to the minutes */
+	return ((tzhour * 3600) + (tzmin * 60)) * (c == '-' ? -1 : 1);
 }
 
-time_t
-parsetime(const char *s, char *buf, size_t bufsiz) {
-	struct tm tm = { 0 };
-	time_t t = 0;
-	char timebuf[64], tz[256], *p;
+/* parses everything in a format similar to:
+ * "%a, %d %b %Y %H:%M:%S" or "%Y-%m-%d %H:%M:%S" */
+/* TODO: calculate time offset (GMT only) from gettimetz ? */
+int
+parsetimeformat(const char *s, struct tm *t, const char **end) {
+	static const char *months[] = {
+		"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
+		"Nov", "Dec"
+	};
+	const char *p = s;
+	unsigned int i, fm;
+	unsigned long l;
 
-	if(buf)
-		buf[0] = '\0';
-	gettimetz(s, tz, sizeof(tz) - 1);
-	if(!standardtz || strcmp(standardtz, tz)) {
-		if(!strcmp(tz, "")) { /* restore TZ */
-			if(standardtz)
-				setenv("TZ", standardtz, 1);
-			else
-				unsetenv("TZ");
+	memset(t, 0, sizeof(struct tm));
+	if((l = strtoul(p, (void *)&p, 10))) {
+		t->tm_year = abs(l) - 1900;
+		if(!(l = strtoul(p, (void *)&p, 10)))
+			return 0;
+		t->tm_mon = abs(l) - 1;
+		if(!(t->tm_mday = abs(strtoul(p, (void *)&p, 10))))
+			return 0;
+	} else {
+		for(; *p && !isdigit((int)*p); p++);
+		if(!(t->tm_mday = abs(strtoul(p, (void *)&p, 10))))
+			return 0;
+		for(; *p && !isalpha((int)*p); p++); /* skip non-alpha */
+		for(fm = 0, i = 0; i < 12; i++) { /* parse month names */
+			if(!xstrncasecmp(p, months[i], 3)) {
+				t->tm_mon = i;
+				fm = 1;
+				break;
+			}
 		}
-		else
-			setenv("TZ", tz, 1);
-		tzset();
+		if(!fm) /* can't find month */
+			return 0;
+		for(; *p && !isdigit((int)*p); p++); /* skip non-digit */
+		if(!(l = strtoul(p, (void *)&p, 10)))
+			return 0;
+		t->tm_year = abs(l) - 1900;
 	}
-	if((strptime(s, "%Y-%m-%dT%H:%M:%SZ", &tm)) ||
-	   (strptime(s, "%Y-%m-%d %H:%M:%S", &tm)) ||
-	   (strptime(s, "%a, %d %b %Y %H:%M:%S", &tm)) ||
-	   (strptime(s, "%Y-%m-%dT%H:%M:%S", &tm))) {
-		tm.tm_isdst = -1; /* detect Daylight Saving Time */
-		if((t = mktime(&tm)) == -1)
-			t = 0;
-		if(buf && (strftime(timebuf, sizeof(timebuf) - 1,
-		           "%Y-%m-%d %H:%M:%S", &tm))) {
-			for(p = tz; *p; p++) /* print time offset reverse */
-				*p = ((*p == '-') ? '+' : (*p == '+' ? '-' : *p));
-			snprintf(buf, bufsiz, "%s %s", timebuf, tz);
+	for(; *p && !isdigit((int)*p); p++); /* skip non-digit */
+	if((t->tm_hour = abs(strtoul(p, (void *)&p, 10))) > 23)
+		return 0;
+	for(; *p && !isdigit((int)*p); p++); /* skip non-digit */
+	if((t->tm_min = abs(strtoul(p, (void *)&p, 10))) > 59)
+		return 0;
+	for(; *p && !isdigit((int)*p); p++); /* skip non-digit */
+	if((t->tm_sec = abs(strtoul(p, (void *)&p, 10))) > 60)
+		return 0;
+	if(end)
+		*end = p;
+	return 1;
+}
+
+/* C defines the rounding for division in a nonsensical way */
+#define Q(a,b) ((a)>0 ? (a)/(b) : -(((b)-(a)-1)/(b)))
+
+/* copied from Musl C awesome small implementation, see LICENSE. */
+time_t
+tm_to_time(struct tm *tm) {
+	time_t year = tm->tm_year - 100;
+	int month = tm->tm_mon;
+	int day = tm->tm_mday;
+	int daysbeforemon[] = { 0,31,59,90,120,151,181,212,243,273,304,334 };
+	int z4, z100, z400;
+
+	/* normalize month */
+	if(month >= 12) {
+		year += month / 12;
+		month %= 12;
+	} else if(month < 0) {
+		year += month / 12;
+		month %= 12;
+		if(month) {
+			month += 12;
+			year--;
 		}
 	}
-	return t;
+	z4 = Q(year - (month < 2), 4); /* is leap? */
+	z100 = Q(z4, 25);
+	z400 = Q(z100, 4);
+	day += year * 365 + z4 - z100 + z400 + daysbeforemon[month];
+	return (time_t)day * 86400 +
+	       tm->tm_hour * 3600 + tm->tm_min * 60 + tm->tm_sec +
+	       946684800; /* the dawn of time, aka 1970 (30 years of seconds) :) */
+}
+
+time_t
+parsetime(const char *s, char *buf) {
+	struct tm tm;
+	char tz[64];
+	const char *end;
+	int offset;
+
+	if(buf)
+		buf[0] = '\0';
+	if(parsetimeformat(s, &tm, &end)) {
+		offset = gettimetz(end, tz, sizeof(tz) - 1);
+		/* TODO: make sure snprintf cant overflow */
+		if(buf)
+		   sprintf(buf, "%04d-%02d-%02d %02d:%02d:%02d %-.16s",
+					tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+					tm.tm_hour, tm.tm_min, tm.tm_sec, tz);
+		/* return UNIX time, reverse offset to GMT+0 */
+		return tm_to_time(&tm) - offset;
+	}
+	return -1; /* can't parse */
 }
 
 /* print text, ignore tabs, newline and carriage return etc
-1 * print some HTML 2.0 / XML 1.0 as normal text */
+ * print some HTML 2.0 / XML 1.0 as normal text */
 void
 string_print_trimmed(String *s) {
-	const char *entities[] = {
-		"&amp;", "&", "&lt;", "<", "&gt;", ">",	"&apos;", "'", "&quot;", "\"",
+/*	const char *entities[] = {
+		"&amp;", "&", "&lt;", "<", "&gt;", ">",	"&apos;", "'",
+		"&quot;", "\"",
 		NULL, NULL
 	};
-	const char *p, *n, **e;
-	unsigned int len, found;
-	if(!s->data)
+	unsigned char entlen[] = { 5, 4, 4, 6, 6 };*/
+	/*unsigned int len, found, i;*/
+	const char *p, *n/*, **e*/;
+	char buffer[BUFSIZ + 4];
+	size_t buflen = 0;
+
+	if(!s->len)
 		return;
-	for(p = s->data; isspace(*p); p++); /* strip leading whitespace */
-	for(; *p; ) { /* ignore tabs, newline and carriage return etc */
-		if(!isspace(*p) || *p == ' ') {
+	for(p = s->data; isspace((int)*p); p++); /* strip leading whitespace */
+	for(; *p; ) { /* ignore tabs, newline and carriage return etc, except space */
+		/*if(!isspace((int)*p) || *p == ' ') {*/
+		if(!((unsigned)*p - '\t' < 5)) {
 			if(*p == '<') { /* skip tags */
-				if((n = strchr(p, '>')))
-					p = n;
-				else
-					putchar('<');
-			} else if(*p == '&') {
-				for(e = entities, found = 0; *e; e += 2) {
-					len = strlen(*e);
-					if(!strncmp(*e, p, len)) { /* compare entities and "replace" */
-						fputs(*(e + 1), stdout);
+				if((n = strchr(p, '>'))) {
+					p = n + 1;
+					continue;
+				}
+			}
+			/* TODO: not necesary anymore because xml_handler_data_entity is used with entitytostr ? */
+			/* else if(*p == '&') { 
+				for(e = entities, i = 0, found = 0; *e; e += 2, i++) {
+					len = entlen[i];
+					if(!strncmp(*e, p, len)) {
+						buffer[buflen++] = *(e + 1)[0];
 						p += len;
 						found = 1;
 						break;
@@ -221,122 +400,298 @@ string_print_trimmed(String *s) {
 				}
 				if(found)
 					continue;
-				else
-					putchar('&');
-			} else
-				fputc(*p, stdout);
+			}*/
+			buffer[buflen++] = *p;
+		}
+		if(buflen >= BUFSIZ) {
+			fwrite(buffer, 1, buflen, stdout);
+			buflen = 0;
 		}
 		p++;
 	}
+/*	printf("%d |", buflen);*/
+	if(buflen)
+		fwrite(buffer, 1, buflen, stdout);
+/*	printf("|\n");*/
 }
 
 void /* print text, escape tabs, newline and carriage return etc */
 string_print_textblock(String *s) {
 	const char *p;
-	if(!s->data)
-		return;
-	for(p = s->data; *p && isspace(*p); p++); /* strip leading whitespace */
-	for(; *p; p++) {
-		if(*p == '\n') /* escape newline */
-			fputs("\\n", stdout);
-		else if(*p == '\\') /* escape \ */
-			fputs("\\\\", stdout);
-		else if(*p == '\t') /* tab */
-			fputs("\\t", stdout);
-		else if(!isspace(*p) || *p == ' ') /* ignore other whitespace chars */
-			fputc(*p, stdout);
+	char buffer[BUFSIZ + 4]; /* + 4: an escape written at the end still fits */
+	size_t i;
+
+	if(!s->len)
+		return;
+	/* skip leading whitespace */
+	for(p = s->data; *p && isspace((unsigned char)*p); p++);
+	for(i = 0; *p; p++) {
+		if(*p == '\\') { /* escape \, it is outside of the '\t'..'\r' range */
+			buffer[i++] = '\\';
+			buffer[i++] = '\\';
+		} else if(((unsigned)*p - '\t') < 5) { /* one of \t \n \v \f \r */
+			if(*p == '\n') { /* escape newline */
+				buffer[i++] = '\\';
+				buffer[i++] = 'n';
+			} else if(*p == '\t') { /* tab */
+				buffer[i++] = '\\';
+				buffer[i++] = 't';
+			}
+			/* ignore other whitespace chars, except space */
+		} else {
+			buffer[i++] = *p;
+		}
+		if(i >= BUFSIZ) { /* TODO: align */
+			fwrite(buffer, 1, i, stdout);
+			i = 0;
+		}
 	}
+	if(i)
+		fwrite(buffer, 1, i, stdout);
 }
 
 int
-istag(const char *name, const char *name2) {
-	return (!strcasecmp(name, name2));
+istag(const char *name, size_t len, const char *name2, size_t len2) {
+	return (len == len2 && !xstrcasecmp(name, name2));
 }
 
 int
-isattr(const char *name, const char *name2) {
-	return (!strcasecmp(name, name2));
+isattr(const char *name, size_t len, const char *name2, size_t len2) {
+	return (len == len2 && !xstrcasecmp(name, name2));
 }
 
-char * /* search for attr value by attr name in attributes list */
-getattrvalue(const char **atts, const char *name) {
-	const char **attr = NULL, *key, *value;
-	if(!atts || !(*atts))
-		return NULL;
-	for(attr = atts; *attr; ) {
-		key = *(attr++);
-		value = *(attr++);
-		if(key && value && isattr(key, name))
-			return (char *)value;
+/* NOTE: this handler can be called multiple times if the data in this
+ * block is bigger than the buffer */
+void
+xml_handler_data(XMLParser *p, const char *s, size_t len) {
+	if(currentfield) {
+		if(feeditemtagid != AtomTagAuthor || !strcmp(p->tag, "name")) /* author>name */
+			string_append(currentfield, s, len);
 	}
-	return NULL;
 }
 
-void XMLCALL
-xml_handler_start_element(void *data, const char *name, const char **atts) {
-	const char *value;
+void
+xml_handler_cdata(XMLParser *p, const char *s, size_t len) {
+	if(currentfield)
+		string_append(currentfield, s, len);
+}
 
-	strncpy(tag, name, sizeof(tag) - 1); /* set tag */
-	if(feeditem.feedtype != FeedTypeNone) { /* in item */
-		if(feeditem.feedtype == FeedTypeAtom) {
-			if(istag(feeditemtag, "content") || istag(feeditemtag, "summary")) {
-				XML_DefaultCurrent(parser); /* pass to default handler to process inline HTML etc */
-			} else if(istag(name, "link")) { /* link href attribute */
-				if((value = getattrvalue(atts, "href")))
-					string_append(&feeditem.link, value, strlen(value));
-			} else if(istag(name, "content") || istag(name, "summary")) {
-				if((value = getattrvalue(atts, "type"))) {  /* content type is HTML or plain text */
-					if(!strcasecmp(value, "xhtml") || !strcasecmp(value, "text/xhtml") ||
-					   !strcasecmp(value, "html") || !strcasecmp(value, "text/html"))
-						feeditem.contenttype = ContentTypeHTML;
-				}
+void
+xml_handler_attr_start(struct xmlparser *p, const char *tag, size_t taglen, const char *name, size_t namelen) {
+	if(iscontent && !iscontenttag) {
+		if(!attrcount)
+			xml_handler_data(p, " ", 1);
+		attrcount++;
+		xml_handler_data(p, name, namelen);
+		xml_handler_data(p, "=\"", 2);
+		return;
+	}
+}
+
+void
+xml_handler_attr_end(struct xmlparser *p, const char *tag, size_t taglen, const char *name, size_t namelen) {
+	if(iscontent && !iscontenttag) {
+		xml_handler_data(p, "\"", 1);
+		attrcount = 0;
+	}
+}
+
+void
+xml_handler_start_element_parsed(XMLParser *p, const char *tag, size_t taglen, int isshort) {
+	if(iscontent && !iscontenttag) {
+		if(isshort)
+			xml_handler_data(p, "/>", 2);
+		else
+			xml_handler_data(p, ">", 1);
+	}
+}
+
+void
+xml_handler_attr(XMLParser *p, const char *tag, size_t taglen,
+                 const char *name, size_t namelen, const char *value,
+                 size_t valuelen) {
+	if(iscontent && !iscontenttag) {
+		xml_handler_data(p, value, valuelen);
+		return;
+	}
+	if(feeditem.feedtype == FeedTypeAtom) {
+		/*if(feeditemtagid == AtomTagContent || feeditemtagid == AtomTagSummary) {*/
+		if(iscontenttag) {
+			if(isattr(name, namelen, "type", strlen("type")) &&
+			   (isattr(value, valuelen, "xhtml", strlen("xhtml")) || isattr(value, valuelen, "text/xhtml", strlen("text/xhtml")) ||
+			    isattr(value, valuelen, "html", strlen("html")) || isattr(value, valuelen, "text/html", strlen("text/html")))) {
+				feeditem.contenttype = ContentTypeHTML;
+				iscontent = 1;
+/*				p->xmldataentity = NULL;*/
+				p->xmlattrstart = xml_handler_attr_start;
+				p->xmlattrend = xml_handler_attr_end;
+				p->xmltagstartparsed = xml_handler_start_element_parsed;
 			}
-		} else if(feeditem.feedtype == FeedTypeRSS) {
-			if((istag(feeditemtag, "description") && !feeditem.content.len) || istag(feeditemtag, "content:encoded")) {
-				string_clear(&feeditem.content);
-				XML_DefaultCurrent(parser); /* pass to default handler to process inline HTML etc */
+		} else if(feeditemtagid == AtomTagLink && isattr(name, namelen, "href", strlen("href"))) /* link href attribute */
+			string_append(&feeditem.link, value, valuelen);
+	}
+}
+
+void
+xml_handler_start_element(XMLParser *p, const char *name, size_t namelen) {
+	if(iscontenttag) {
+		/* starts with div, handle as XML, dont convert entities */
+		/* TODO: test properly and do printf() to debug */
+		if(feeditem.feedtype == FeedTypeAtom && !strncmp(name, "div", strlen("div")))
+			p->xmldataentity = NULL;
+	}
+	if(iscontent) {
+		attrcount = 0;
+		iscontenttag = 0;
+		xml_handler_data(p, "<", 1);
+		xml_handler_data(p, name, namelen);
+		return;
+	}
+
+	/* TODO: cleanup, merge with code below ?, return function if FeedTypeNone */
+/*	iscontenttag = 0;*/
+	if(feeditem.feedtype != FeedTypeNone) { /* in item */
+		if(feeditemtag[0] == '\0') { /* set tag if not already set. */
+/*			strncpy(feeditemtag, name, sizeof(feeditemtag) - 1);*/
+			if(namelen >= sizeof(feeditemtag) - 2)
+				return;
+			memcpy(feeditemtag, name, namelen + 1);
+			feeditemtaglen = namelen; /* XXX: assumes feeditemtag had enough space */
+			feeditemtagid = gettag(feeditem.feedtype, feeditemtag, feeditemtaglen);
+
+			if(feeditem.feedtype == FeedTypeRSS) {
+				if(feeditemtagid == TagUnknown)
+					currentfield = NULL;
+				else if(feeditemtagid == RSSTagPubdate || feeditemtagid == RSSTagDcdate)
+					currentfield = &feeditem.timestamp;
+				else if(feeditemtagid == RSSTagTitle)
+					currentfield = &feeditem.title;
+				else if(feeditemtagid == RSSTagLink)
+					currentfield = &feeditem.link;
+				else if(feeditemtagid == RSSTagDescription || feeditemtagid == RSSTagContentencoded) {
+					/* clear previous summary, assumes previous content was not a summary text */
+					if(feeditemtagid == RSSTagContentencoded && feeditem.content.len)
+						string_clear(&feeditem.content);
+					/* ignore, prefer content:encoded over description */
+					if(!(feeditemtagid == RSSTagDescription && feeditem.content.len)) {
+						iscontenttag = 1;
+						currentfield = &feeditem.content;
+					}
+				} else if(feeditemtagid == RSSTagGuid)
+					currentfield = &feeditem.id;
+				else if(feeditemtagid == RSSTagAuthor || feeditemtagid == RSSTagDccreator)
+					currentfield = &feeditem.author;
+			} else if(feeditem.feedtype == FeedTypeAtom) {
+				if(feeditemtagid == TagUnknown)
+					currentfield = NULL;
+				else if(feeditemtagid == AtomTagPublished || feeditemtagid == AtomTagUpdated)
+					currentfield = &feeditem.timestamp;
+				else if(feeditemtagid == AtomTagTitle)
+					currentfield = &feeditem.title;
+				else if(feeditemtagid == AtomTagSummary || feeditemtagid == AtomTagContent) {
+					/* clear previous summary, assumes previous content was not a summary text */
+					if(feeditemtagid == AtomTagContent && feeditem.content.len)
+						string_clear(&feeditem.content);
+					/* ignore, prefer content:encoded over description */
+					if(!(feeditemtagid == AtomTagSummary && feeditem.content.len)) {
+						iscontenttag = 1;
+						currentfield = &feeditem.content;
+					}
+				} else if(feeditemtagid == AtomTagId)
+					currentfield = &feeditem.id;
+				else if(feeditemtagid == AtomTagLink)
+					currentfield = &feeditem.link;
+				else if(feeditemtagid == AtomTagAuthor)
+					currentfield = &feeditem.author;
 			}
+			/* TODO: prefer content encoded over content? */
 		}
-		if(feeditemtag[0] == '\0') /* set tag if not already set. */
-			strncpy(feeditemtag, name, sizeof(feeditemtag) - 1);
-	} else { /* start of RSS or Atom entry / item */
-		if(istag(name, "entry")) { /* Atom */
+	} else { /* start of RSS or Atom item / entry */
+		if(istag(name, namelen, "entry", strlen("entry"))) { /* Atom */
 			feeditem.feedtype = FeedTypeAtom;
 			feeditem.contenttype = ContentTypePlain; /* Default content type */
-		} else if(istag(name, "item")) { /* RSS */
+			currentfield = NULL; /* XXX: optimization */
+		} else if(istag(name, namelen, "item", strlen("item"))) { /* RSS */
 			feeditem.feedtype = FeedTypeRSS;
 			feeditem.contenttype = ContentTypeHTML; /* Default content type */
+			currentfield = NULL; /* XXX: optimization */
 		}
 	}
 }
 
-void XMLCALL
-xml_handler_end_element(void *data, const char *name) {
+void
+xml_handler_data_entity(XMLParser *p, const char *data, size_t datalen) {
+	char buffer[16]; /* receives the converted entity */
+	size_t len;
+
+#if 0
+	if(iscontent) {
+		xml_handler_data(p, data, datalen); /* TODO: for now, dont convert entities */
+		return;
+	}
+#endif
+	/* pass a known entity on as its character. TODO: for content HTML data entities, convert &amp; to &? */
+	if((len = entitytostr(data, buffer, sizeof(buffer))))
+		xml_handler_data(p, buffer, len);
+	else
+		xml_handler_data(p, data, datalen); /* can't convert entity, just use its data */
+}
+
+void
+xml_handler_end_element(XMLParser *p, const char *name, size_t namelen, int isshort) {
 	char timebuf[64];
+	int tagid;
+
+/*	printf("%d end tag: </%s>\n", iscontent, name);*/
+	if(iscontent) {
+		attrcount = 0;
+		/* TODO: optimize */
+		tagid = gettag(feeditem.feedtype, name, namelen);
+		if(feeditemtagid == tagid) { /* close content */
+			iscontent = 0;
+			iscontenttag = 0;
+
+			p->xmldataentity = xml_handler_data_entity;
+			p->xmlattrstart = NULL;
+			p->xmlattrend = NULL;
+			p->xmltagstartparsed = NULL;
+
+			feeditemtag[0] = '\0'; /* unset tag */
+			feeditemtaglen = 0;
+			feeditemtagid = TagUnknown;
 
+			return; /* TODO: not sure if !isshort check below should be skipped */
+		}
+		if(!isshort) {
+			xml_handler_data(p, "</", 2);
+			xml_handler_data(p, name, namelen);
+			xml_handler_data(p, ">", 1);
+		}
+		return;
+	}
 	if(feeditem.feedtype != FeedTypeNone) {
 		/* end of RSS or Atom entry / item */
-		if((istag(name, "entry") && (feeditem.feedtype == FeedTypeAtom)) || /* Atom */
-		  (istag(name, "item") && (feeditem.feedtype == FeedTypeRSS))) { /* RSS */
-			printf("%ld", (long)parsetime((&feeditem.timestamp)->data, timebuf,
-			       sizeof(timebuf) - 1));
-			fputc(FieldSeparator, stdout);
+		/* TODO: optimize, use gettag() ? to tagid? */
+		if((feeditem.feedtype == FeedTypeAtom && istag(name, namelen, "entry", strlen("entry"))) || /* Atom */
+		  (feeditem.feedtype == FeedTypeRSS && istag(name, namelen, "item", strlen("item")))) { /* RSS */
+			printf("%ld", (long)parsetime((&feeditem.timestamp)->data, timebuf));
+			putchar(FieldSeparator);
 			fputs(timebuf, stdout);
-			fputc(FieldSeparator, stdout);
+			putchar(FieldSeparator);
 			string_print_trimmed(&feeditem.title);
-			fputc(FieldSeparator, stdout);
+			putchar(FieldSeparator);
 			string_print_trimmed(&feeditem.link);
-			fputc(FieldSeparator, stdout);
+			putchar(FieldSeparator);
 			string_print_textblock(&feeditem.content);
-			fputc(FieldSeparator, stdout);
+			putchar(FieldSeparator);
 			fputs(contenttypes[feeditem.contenttype], stdout);
-			fputc(FieldSeparator, stdout);
+			putchar(FieldSeparator);
 			string_print_trimmed(&feeditem.id);
-			fputc(FieldSeparator, stdout);
+			putchar(FieldSeparator);
 			string_print_trimmed(&feeditem.author);
-			fputc(FieldSeparator, stdout);
+			putchar(FieldSeparator);
 			fputs(feedtypes[feeditem.feedtype], stdout);
-			fputc('\n', stdout);
+			putchar('\n');
 
 			/* clear strings */
 			string_clear(&feeditem.timestamp);
@@ -347,109 +702,29 @@ xml_handler_end_element(void *data, const char *name) {
 			string_clear(&feeditem.author);
 			feeditem.feedtype = FeedTypeNone;
 			feeditem.contenttype = ContentTypePlain;
-			incdata = 0;
 			feeditemtag[0] = '\0'; /* unset tag */
-		} else if(!strcmp(feeditemtag, name)) { /* clear */
+			feeditemtaglen = 0;
+			feeditemtagid = TagUnknown;
+			
+			/* not sure if needed */
+			iscontenttag = 0;
+			iscontent = 0;
+		} else if(!strcmp(feeditemtag, name)) { /* clear */ /* XXX: optimize ? */
+			currentfield = NULL;
 			feeditemtag[0] = '\0'; /* unset tag */
-		} else {
-			if(feeditem.feedtype == FeedTypeAtom) {
-				if(istag(feeditemtag, "content") || istag(feeditemtag, "summary")) {
-					/* pass to default handler to process inline HTML etc */
-					XML_DefaultCurrent(parser);
-					return;
-				}
-			}
+			feeditemtaglen = 0;
+			feeditemtagid = TagUnknown;
+			
+			/* not sure if needed */
+			iscontenttag = 0;
+			iscontent = 0;
 		}
 	}
-	tag[0] = '\0'; /* unset tag */
-}
-
-/* NOTE: this handler can be called multiple times if the data in this block
- * is bigger than the buffer */
-void XMLCALL
-xml_handler_data(void *data, const XML_Char *s, int len) {
-	if(feeditem.feedtype == FeedTypeRSS) {
-		if(istag(feeditemtag, "pubdate") || istag(feeditemtag, "dc:date"))
-			string_append(&feeditem.timestamp, s, len);
-		else if(istag(feeditemtag, "title"))
-			string_append(&feeditem.title, s, len);
-		else if(istag(feeditemtag, "link"))
-			string_append(&feeditem.link, s, len);
-		else if(istag(feeditemtag, "description") || istag(feeditemtag, "content:encoded")) {
-			if(incdata)
-				XML_DefaultCurrent(parser); /* pass to default handler to process inline HTML etc */
-			else
-				string_append(&feeditem.content, s, len);
-		} else if(istag(feeditemtag, "guid"))
-			string_append(&feeditem.id, s, len);
-		else if(istag(feeditemtag, "author") || istag(feeditemtag, "dc:creator"))
-			string_append(&feeditem.author, s, len);
-	} else if(feeditem.feedtype == FeedTypeAtom) {
-		if(istag(feeditemtag, "published") || istag(feeditemtag, "updated"))
-			string_append(&feeditem.timestamp, s, len);
-		else if(istag(feeditemtag, "title")) {
-			string_append(&feeditem.title, s, len);
-		} else if(istag(feeditemtag, "summary") || istag(feeditemtag, "content")) {
-			if(feeditem.contenttype == ContentTypeHTML) {
-				if(incdata)
-					XML_DefaultCurrent(parser); /* pass to default handler to process inline HTML etc */
-				else
-					string_append(&feeditem.content, s, len);
-			} else
-				XML_DefaultCurrent(parser); /* pass to default handler to process inline HTML etc */
-		} else if(istag(feeditemtag, "id"))
-			string_append(&feeditem.id, s, len);
-		else if(istag(feeditemtag, "name")) /* assume this is: <author><name></name></author> */
-			string_append(&feeditem.author, s, len);
-	}
-}
-
-int /* parse XML from stream using setup parser, return 1 on success, 0 on failure. */
-xml_parse_stream(XML_Parser parser, FILE *fp) {
-	char buffer[BUFSIZ];
-	int done = 0, len = 0;
-
-	while(!feof(fp)) {
-		len = fread(buffer, 1, sizeof(buffer), fp);
-		done = (feof(fp) || ferror(fp));
-		if(XML_Parse(parser, buffer, len, done) == XML_STATUS_ERROR && (len > 0)) {
-			if(XML_GetErrorCode(parser) == XML_ERROR_NO_ELEMENTS)
-				return 1; /* Ignore "no elements found" / empty document as an error */
-			fprintf(stderr, "sfeed: error parsing xml %s at line %lu column %lu\n",
-			        XML_ErrorString(XML_GetErrorCode(parser)), (unsigned long)XML_GetCurrentLineNumber(parser),
-			        (unsigned long)XML_GetCurrentColumnNumber(parser));
-			return 0;
-		}
-	} while(!done);
-	return 1;
-}
-
-void
-xml_handler_default(void *data, const XML_Char *s, int len) {
-	if((feeditem.feedtype == FeedTypeAtom && (istag(feeditemtag, "summary") || istag(feeditemtag, "content"))) ||
-	   (feeditem.feedtype == FeedTypeRSS && (istag(feeditemtag, "description") || istag(feeditemtag, "content:encoded"))))
-		/*if(!istag(tag, "script") && !istag(tag, "style"))*/ /* ignore data in inline script and style */
-			string_append(&feeditem.content, s, len);
-}
-
-void /* NOTE: data is null terminated. */
-xml_handler_comment(void *data, const XML_Char *s) {
-}
-
-void
-xml_cdata_section_handler_start(void *userdata) {
-	incdata = 1;
-}
-
-void
-xml_cdata_section_handler_end(void *userdata) {
-	incdata = 0;
 }
 
 int
 main(void) {
-	int status;
-	standardtz = getenv("TZ");
+	atexit(cleanup);
 
 	/* init strings and initial memory pool size */
 	string_buffer_init(&feeditem.timestamp, 64);
@@ -460,20 +735,15 @@ main(void) {
 	string_buffer_init(&feeditem.author, 256);
 	feeditem.contenttype = ContentTypePlain;
 	feeditem.feedtype = FeedTypeNone;
-	feeditemtag[0] = '\0'; /* unset tag */
-	tag[0] = '\0'; /* unset tag */
-
-	if(!(parser = XML_ParserCreate("UTF-8")))
-		die("can't create parser");
-
-	XML_SetElementHandler(parser, xml_handler_start_element, xml_handler_end_element);
-	XML_SetCharacterDataHandler(parser, xml_handler_data);
-	XML_SetCommentHandler(parser, xml_handler_comment);
-	XML_SetCdataSectionHandler(parser, xml_cdata_section_handler_start, xml_cdata_section_handler_end);
-	XML_SetDefaultHandler(parser, xml_handler_default);
 
-	status = xml_parse_stream(parser, stdin);
-	cleanup();
+	xmlparser_init(&parser);
+	parser.xmltagstart = xml_handler_start_element;
+	parser.xmltagend = xml_handler_end_element;
+	parser.xmldata = xml_handler_data;
+	parser.xmldataentity = xml_handler_data_entity;
+	parser.xmlattr = xml_handler_attr;
+	parser.xmlcdata = xml_handler_cdata;
+	xmlparser_parse(&parser);
 
-	return status ? EXIT_SUCCESS : EXIT_FAILURE;
+	return EXIT_SUCCESS;
 }
author	Hiltjo Posthuma <hiltjo@codemadness.org>	2013-05-20 19:34:07 +0200
committer	Hiltjo Posthuma <hiltjo@codemadness.org>	2013-05-20 19:34:07 +0200
commit	97d0866d07ce25b157d1cfb79940e2a6e1a57b33 (patch)
tree	ea4c43abe4b9b87d21a47f00d845bbd979a50a66
parent	56c0c0dc73096bb5d08bd93f8f2644225a551524 (diff)