diff options
author | Hiltjo Posthuma <hiltjo@codemadness.org> | 2013-05-20 19:34:07 +0200 |
---|---|---|
committer | Hiltjo Posthuma <hiltjo@codemadness.org> | 2013-05-20 19:34:07 +0200 |
commit | 97d0866d07ce25b157d1cfb79940e2a6e1a57b33 (patch) | |
tree | ea4c43abe4b9b87d21a47f00d845bbd979a50a66 | |
parent | 56c0c0dc73096bb5d08bd93f8f2644225a551524 (diff) |
sfeed.c: lots of changes, see CHANGELOG, will separate commits better in the future after next version release
Signed-off-by: Hiltjo Posthuma <hiltjo@codemadness.org>
-rw-r--r-- | sfeed.c | 822 |
1 files changed, 546 insertions, 276 deletions
@@ -1,15 +1,15 @@ #include <stdio.h> #include <stdlib.h> -#include <strings.h> #include <string.h> #include <time.h> #include <ctype.h> -#include <expat.h> /* libexpat */ +#include "xml.h" +#include "compat.h" -enum { FeedTypeNone = 0, FeedTypeRSS = 1, FeedTypeAtom = 2, FeedTypeLast = 3 }; +enum { FeedTypeNone = 0, FeedTypeRSS = 1, FeedTypeAtom = 2 }; const char *feedtypes[] = { "", "rss", "atom" }; -enum { ContentTypeNone = 0, ContentTypePlain = 1, ContentTypeHTML = 2, ContentTypeLast = 3 }; +enum { ContentTypeNone = 0, ContentTypePlain = 1, ContentTypeHTML = 2 }; const char *contenttypes[] = { "", "plain", "html" }; typedef struct string { /* String data / pool */ @@ -32,13 +32,112 @@ typedef struct feeditem { /* Feed item */ void die(const char *s); void cleanup(void); +String *currentfield = NULL; /* TODO */ const int FieldSeparator = '\t'; FeedItem feeditem; /* data for current feed item */ -char tag[1024]; /* current XML tag being parsed. */ -char feeditemtag[1024]; /* current tag _inside_ a feeditem */ -XML_Parser parser; /* expat XML parser state */ -int incdata = 0; +char feeditemtag[256] = ""; /* current tag _inside_ a feeditem */ +size_t feeditemtaglen = 0; +int feeditemtagid = 0; +int iscontent = 0; +int iscontenttag = 0; +size_t attrcount = 0; char *standardtz = NULL; /* TZ variable at start of program */ +XMLParser parser; /* XML parser state */ + +enum { + TagUnknown = 0, + /* RSS */ + RSSTagDcdate, RSSTagPubdate, RSSTagTitle, + RSSTagLink, RSSTagDescription, RSSTagContentencoded, + RSSTagGuid, RSSTagAuthor, RSSTagDccreator, + /* Atom */ + AtomTagPublished, AtomTagUpdated, AtomTagTitle, + AtomTagSummary, AtomTagContent, + AtomTagId, AtomTagLink, AtomTagAuthor +}; + +typedef struct feedtag { + char *name; + size_t namelen; + int id; +} FeedTag; + +/* TODO: optimize lookup */ +int +gettag(int feedtype, const char *name, size_t namelen) { + /* RSS, alphabetical order */ + static FeedTag rsstag[] = { + { "author", 6, RSSTagAuthor }, + { "content:encoded", 15, RSSTagContentencoded }, + { "dc:creator", 10, RSSTagDccreator }, + { "dc:date", 7, RSSTagDcdate }, + { "description", 11, RSSTagDescription }, + { "guid", 4, RSSTagGuid }, + { "link", 4, RSSTagLink }, + { "pubdate", 7, RSSTagPubdate }, + { "title", 5, RSSTagTitle }, + { NULL, 0, -1 } + }; + /* Atom, alphabetical order */ + static FeedTag atomtag[] = { + { "author", 6, AtomTagAuthor }, /* assume this is: <author><name></name></author> */ + { "content", 7, AtomTagContent }, + { "id", 2, AtomTagId }, + { "link", 4, AtomTagLink }, + { "published", 9, AtomTagPublished }, + { "summary", 7, AtomTagSummary }, + { "title", 5, AtomTagTitle }, + { "updated", 7, AtomTagUpdated }, + { NULL, 0, -1 } + }; + int i, n; + + if(namelen >= 2 && namelen <= 15) { + if(feedtype == FeedTypeRSS) { + for(i = 0; rsstag[i].name; i++) { + if(!(n = xstrncasecmp(rsstag[i].name, name, rsstag[i].namelen))) + return rsstag[i].id; + /* optimization: it's sorted so nothing after it matches. */ + if(n > 0) + return TagUnknown; + } + } else if(feedtype == FeedTypeAtom) { + for(i = 0; atomtag[i].name; i++) { + if(!(n = xstrncasecmp(atomtag[i].name, name, atomtag[i].namelen))) + return atomtag[i].id; + /* optimization: it's sorted so nothing after it matches. */ + if(n > 0) + return TagUnknown; + } + } + } + return TagUnknown; +} + +int +entitytostr(const char *e, char *buffer, size_t bufsiz) { + /* TODO: optimize lookup? */ + char *entities[6][2] = { + { "<", "<" }, + { ">", ">" }, + { "'", "'" }, + { "&", "&" }, + { """, "\"" }, + { NULL, NULL } + }; + size_t i; + if(*e != '&' || bufsiz < 2) /* doesnt start with & */ + return 0; + for(i = 0; entities[i][0]; i++) { + /* NOTE: compares max 7 chars */ + if(!xstrncasecmp(e, entities[i][0], 6)) { + buffer[0] = *(entities[i][1]); + buffer[1] = '\0'; + return 1; + } + } + return 0; +} void string_clear(String *s) { @@ -68,23 +167,24 @@ int string_buffer_expand(String *s, size_t newlen) { char *p; size_t alloclen; - /* check if allocation is necesary, dont shrink buffer */ - if(!s->data || (newlen > s->bufsiz)) { - /* should be more than bufsiz ofcourse */ - for(alloclen = 16; alloclen <= newlen; alloclen *= 2); - if(!(p = realloc(s->data, alloclen))) { - string_free(s); /* free previous allocation */ - die("can't allocate enough memory"); - } - s->bufsiz = alloclen; - s->data = p; + /* check if allocation is necesary, dont shrink buffer + should be more than bufsiz ofcourse */ + for(alloclen = 16; alloclen <= newlen; alloclen *= 2); + if(!(p = realloc(s->data, alloclen))) { + string_free(s); /* free previous allocation */ + die("can't allocate enough memory"); } + s->bufsiz = alloclen; + s->data = p; return s->bufsiz; } void string_append(String *s, const char *data, size_t len) { - string_buffer_expand(s, s->len + len); + if(!len || *data == '\0') + return; + if(s->len + len > s->bufsiz) + string_buffer_expand(s, s->len + len); memcpy(s->data + s->len, data, len); s->len += len; s->data[s->len] = '\0'; @@ -92,7 +192,6 @@ string_append(String *s, const char *data, size_t len) { void /* cleanup parser, free allocated memory, etc */ cleanup(void) { - XML_ParserFree(parser); string_free(&feeditem.timestamp); string_free(&feeditem.title); string_free(&feeditem.link); @@ -106,114 +205,194 @@ die(const char *s) { fputs("sfeed: ", stderr); fputs(s, stderr); fputc('\n', stderr); - cleanup(); exit(EXIT_FAILURE); } -void +/* get timezone from string, return as formatted string and time offset, + * for the offset it assumes GMT */ +int gettimetz(const char *s, char *buf, size_t bufsiz) { const char *p = s; + char tzname[16] = "", *t = NULL; int tzhour = 0, tzmin = 0; - char tzname[128] = "", *t = NULL; unsigned int i; + char c; buf[0] = '\0'; - for(; *p && isspace(*p); p++); /* skip whitespace */ - /* detect time offset, assume time offset isn't specified in the first 18 characters */ - for(i = 0; *p && ((*p != '+' && *p != '-') || i <= 18); p++, i++); - - if(isalpha(*p)) { + if(bufsiz < sizeof(tzname) + 7) + return 0; + for(; *p && isspace((int)*p); p++); /* skip whitespace */ + /* loop until some common timezone delimiters are found */ + for(;*p && (*p != '+' && *p != '-' && *p != 'Z' && *p != 'z'); p++); + + /* TODO: cleanup / simplify */ + if(isalpha((int)*p)) { if(*p == 'Z' || *p == 'z') { - strncpy(buf, "GMT+00:00", bufsiz); - return; + memcpy(buf, "GMT+00:00", strlen("GMT+00:00") + 1); + return 0; } else { - for(i = 0, t = &tzname[0]; i < (sizeof(tzname) - 1) && (*p && isalpha(*p)); i++) + for(i = 0, t = &tzname[0]; i < (sizeof(tzname) - 1) && + (*p && isalpha((int)*p)); i++) *(t++) = *(p++); *t = '\0'; } } else - strncpy(tzname, "GMT", sizeof(tzname) - 1); - if(!(*p)) { + memcpy(tzname, "GMT", strlen("GMT") + 1); + if(!(*p)) { strncpy(buf, tzname, bufsiz); - return; + return 0; } - /* NOTE: reverses time offsets for TZ */ - if((sscanf(p, "+%02d:%02d", &tzhour, &tzmin)) > 0) - snprintf(buf, bufsiz, "%s-%02d:%02d", tzname, tzhour, tzmin); - else if((sscanf(p, "-%02d:%02d", &tzhour, &tzmin)) > 0) - snprintf(buf, bufsiz, "%s+%02d:%02d", tzname, tzhour, tzmin); - else if((sscanf(p, "+%02d%02d", &tzhour, &tzmin)) > 0) - snprintf(buf, bufsiz, "%s-%02d:%02d", tzname, tzhour, tzmin); - else if((sscanf(p, "-%02d%02d", &tzhour, &tzmin)) > 0) - snprintf(buf, bufsiz, "%s+%02d:%02d", tzname, tzhour, tzmin); - else if(sscanf(p, "+%d", &tzhour) > 0) - snprintf(buf, bufsiz, "%s-%02d:00", tzname, tzhour); - else if(sscanf(p, "-%d", &tzhour) > 0) - snprintf(buf, bufsiz, "%s+%02d:00", tzname, tzhour); + if((sscanf(p, "%c%02d:%02d", &c, &tzhour, &tzmin)) > 0); + else if(sscanf(p, "%c%02d%02d", &c, &tzhour, &tzmin) > 0); + else if(sscanf(p, "%c%d", &c, &tzhour) > 0) + tzmin = 0; + sprintf(buf, "%s%c%02d%02d", tzname, c, tzhour, tzmin); + /* TODO: test + or - offset */ + return (tzhour * 3600) + (tzmin * 60) * (c == '-' ? -1 : 1); } -time_t -parsetime(const char *s, char *buf, size_t bufsiz) { - struct tm tm = { 0 }; - time_t t = 0; - char timebuf[64], tz[256], *p; +/* parses everything in a format similar to: + * "%a, %d %b %Y %H:%M:%S" or "%Y-%m-%d %H:%M:%S" */ +/* TODO: calculate time offset (GMT only) from gettimetz ? */ +int +parsetimeformat(const char *s, struct tm *t, const char **end) { + static const char *months[] = { + "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", + "Nov", "Dec" + }; + const char *p = s; + unsigned int i, fm; + unsigned long l; - if(buf) - buf[0] = '\0'; - gettimetz(s, tz, sizeof(tz) - 1); - if(!standardtz || strcmp(standardtz, tz)) { - if(!strcmp(tz, "")) { /* restore TZ */ - if(standardtz) - setenv("TZ", standardtz, 1); - else - unsetenv("TZ"); + memset(t, 0, sizeof(struct tm)); + if((l = strtoul(p, (void *)&p, 10))) { + t->tm_year = abs(l) - 1900; + if(!(l = strtoul(p, (void *)&p, 10))) + return 0; + t->tm_mon = abs(l) - 1; + if(!(t->tm_mday = abs(strtoul(p, (void *)&p, 10)))) + return 0; + } else { + for(; *p && !isdigit((int)*p); p++); + if(!(t->tm_mday = abs(strtoul(p, (void *)&p, 10)))) + return 0; + for(; *p && !isalpha((int)*p); p++); /* skip non-alpha */ + for(fm = 0, i = 0; i < 12; i++) { /* parse month names */ + if(!xstrncasecmp(p, months[i], 3)) { + t->tm_mon = i; + fm = 1; + break; + } } - else - setenv("TZ", tz, 1); - tzset(); + if(!fm) /* can't find month */ + return 0; + for(; *p && !isdigit((int)*p); p++); /* skip non-digit */ + if(!(l = strtoul(p, (void *)&p, 10))) + return 0; + t->tm_year = abs(l) - 1900; } - if((strptime(s, "%Y-%m-%dT%H:%M:%SZ", &tm)) || - (strptime(s, "%Y-%m-%d %H:%M:%S", &tm)) || - (strptime(s, "%a, %d %b %Y %H:%M:%S", &tm)) || - (strptime(s, "%Y-%m-%dT%H:%M:%S", &tm))) { - tm.tm_isdst = -1; /* detect Daylight Saving Time */ - if((t = mktime(&tm)) == -1) - t = 0; - if(buf && (strftime(timebuf, sizeof(timebuf) - 1, - "%Y-%m-%d %H:%M:%S", &tm))) { - for(p = tz; *p; p++) /* print time offset reverse */ - *p = ((*p == '-') ? '+' : (*p == '+' ? '-' : *p)); - snprintf(buf, bufsiz, "%s %s", timebuf, tz); + for(; *p && !isdigit((int)*p); p++); /* skip non-digit */ + if((t->tm_hour = abs(strtoul(p, (void *)&p, 10))) > 23) + return 0; + for(; *p && !isdigit((int)*p); p++); /* skip non-digit */ + if((t->tm_min = abs(strtoul(p, (void *)&p, 10))) > 59) + return 0; + for(; *p && !isdigit((int)*p); p++); /* skip non-digit */ + if((t->tm_sec = abs(strtoul(p, (void *)&p, 10))) > 60) + return 0; + if(end) + *end = p; + return 1; +} + +/* C defines the rounding for division in a nonsensical way */ +#define Q(a,b) ((a)>0 ? (a)/(b) : -(((b)-(a)-1)/(b))) + +/* copied from Musl C awesome small implementation, see LICENSE. */ +time_t +tm_to_time(struct tm *tm) { + time_t year = tm->tm_year - 100; + int month = tm->tm_mon; + int day = tm->tm_mday; + int daysbeforemon[] = { 0,31,59,90,120,151,181,212,243,273,304,334 }; + int z4, z100, z400; + + /* normalize month */ + if(month >= 12) { + year += month / 12; + month %= 12; + } else if(month < 0) { + year += month / 12; + month %= 12; + if(month) { + month += 12; + year--; } } - return t; + z4 = Q(year - (month < 2), 4); /* is leap? */ + z100 = Q(z4, 25); + z400 = Q(z100, 4); + day += year * 365 + z4 - z100 + z400 + daysbeforemon[month]; + return (time_t)day * 86400 + + tm->tm_hour * 3600 + tm->tm_min * 60 + tm->tm_sec + + 946684800; /* the dawn of time, aka 1970 (30 years of seconds) :) */ +} + +time_t +parsetime(const char *s, char *buf) { + struct tm tm; + char tz[64]; + const char *end; + int offset; + + if(buf) + buf[0] = '\0'; + if(parsetimeformat(s, &tm, &end)) { + offset = gettimetz(end, tz, sizeof(tz) - 1); + /* TODO: make sure snprintf cant overflow */ + if(buf) + sprintf(buf, "%04d-%02d-%02d %02d:%02d:%02d %-.16s", + tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec, tz); + /* return UNIX time, reverse offset to GMT+0 */ + return tm_to_time(&tm) - offset; + } + return -1; /* can't parse */ } /* print text, ignore tabs, newline and carriage return etc -1 * print some HTML 2.0 / XML 1.0 as normal text */ + * print some HTML 2.0 / XML 1.0 as normal text */ void string_print_trimmed(String *s) { - const char *entities[] = { - "&", "&", "<", "<", ">", ">", "'", "'", """, "\"", +/* const char *entities[] = { + "&", "&", "<", "<", ">", ">", "'", "'", + """, "\"", NULL, NULL }; - const char *p, *n, **e; - unsigned int len, found; - if(!s->data) + unsigned char entlen[] = { 5, 4, 4, 6, 6 };*/ + /*unsigned int len, found, i;*/ + const char *p, *n/*, **e*/; + char buffer[BUFSIZ + 4]; + size_t buflen = 0; + + if(!s->len) return; - for(p = s->data; isspace(*p); p++); /* strip leading whitespace */ - for(; *p; ) { /* ignore tabs, newline and carriage return etc */ - if(!isspace(*p) || *p == ' ') { + for(p = s->data; isspace((int)*p); p++); /* strip leading whitespace */ + for(; *p; ) { /* ignore tabs, newline and carriage return etc, except space */ + /*if(!isspace((int)*p) || *p == ' ') {*/ + if(!((unsigned)*p - '\t' < 5)) { if(*p == '<') { /* skip tags */ - if((n = strchr(p, '>'))) - p = n; - else - putchar('<'); - } else if(*p == '&') { - for(e = entities, found = 0; *e; e += 2) { - len = strlen(*e); - if(!strncmp(*e, p, len)) { /* compare entities and "replace" */ - fputs(*(e + 1), stdout); + if((n = strchr(p, '>'))) { + p = n + 1; + continue; + } + } + /* TODO: not necesary anymore because xml_handler_data_entity is used with entitytostr ? */ + /* else if(*p == '&') { + for(e = entities, i = 0, found = 0; *e; e += 2, i++) { + len = entlen[i]; + if(!strncmp(*e, p, len)) { + buffer[buflen++] = *(e + 1)[0]; p += len; found = 1; break; @@ -221,122 +400,298 @@ string_print_trimmed(String *s) { } if(found) continue; - else - putchar('&'); - } else - fputc(*p, stdout); + }*/ + buffer[buflen++] = *p; + } + if(buflen >= BUFSIZ) { + fwrite(buffer, 1, buflen, stdout); + buflen = 0; } p++; } +/* printf("%d |", buflen);*/ + if(buflen) + fwrite(buffer, 1, buflen, stdout); +/* printf("|\n");*/ } void /* print text, escape tabs, newline and carriage return etc */ string_print_textblock(String *s) { const char *p; - if(!s->data) - return; - for(p = s->data; *p && isspace(*p); p++); /* strip leading whitespace */ - for(; *p; p++) { - if(*p == '\n') /* escape newline */ - fputs("\\n", stdout); - else if(*p == '\\') /* escape \ */ - fputs("\\\\", stdout); - else if(*p == '\t') /* tab */ - fputs("\\t", stdout); - else if(!isspace(*p) || *p == ' ') /* ignore other whitespace chars */ - fputc(*p, stdout); + char buffer[BUFSIZ + 4]; + size_t i; + + if(!s->len) + return; + /* skip leading whitespace */ + for(p = s->data; *p && isspace((int)*p); p++); + for(i = 0; *p; p++) { + if(((unsigned)*p - '\t') < 5) { + if(*p == '\n') { /* escape newline */ + buffer[i++] = '\\'; + buffer[i++] = 'n'; + } else if(*p == '\\') { /* escape \ */ + buffer[i++] = '\\'; + buffer[i++] = '\\'; + } else if(*p == '\t') { /* tab */ + buffer[i++] = '\\'; + buffer[i++] = 't'; + } + /* ignore other whitespace chars, except space */ + } else { + buffer[i++] = *p; + } + if(i >= BUFSIZ) { /* TODO: align */ + fwrite(buffer, 1, i, stdout); + i = 0; + } } + if(i) + fwrite(buffer, 1, i, stdout); } int -istag(const char *name, const char *name2) { - return (!strcasecmp(name, name2)); +istag(const char *name, size_t len, const char *name2, size_t len2) { + return (len == len2 && !xstrcasecmp(name, name2)); } int -isattr(const char *name, const char *name2) { - return (!strcasecmp(name, name2)); +isattr(const char *name, size_t len, const char *name2, size_t len2) { + return (len == len2 && !xstrcasecmp(name, name2)); } -char * /* search for attr value by attr name in attributes list */ -getattrvalue(const char **atts, const char *name) { - const char **attr = NULL, *key, *value; - if(!atts || !(*atts)) - return NULL; - for(attr = atts; *attr; ) { - key = *(attr++); - value = *(attr++); - if(key && value && isattr(key, name)) - return (char *)value; +/* NOTE: this handler can be called multiple times if the data in this + * block is bigger than the buffer */ +void +xml_handler_data(XMLParser *p, const char *s, size_t len) { + if(currentfield) { + if(feeditemtagid != AtomTagAuthor || !strcmp(p->tag, "name")) /* author>name */ + string_append(currentfield, s, len); } - return NULL; } -void XMLCALL -xml_handler_start_element(void *data, const char *name, const char **atts) { - const char *value; +void +xml_handler_cdata(XMLParser *p, const char *s, size_t len) { + if(currentfield) + string_append(currentfield, s, len); +} - strncpy(tag, name, sizeof(tag) - 1); /* set tag */ - if(feeditem.feedtype != FeedTypeNone) { /* in item */ - if(feeditem.feedtype == FeedTypeAtom) { - if(istag(feeditemtag, "content") || istag(feeditemtag, "summary")) { - XML_DefaultCurrent(parser); /* pass to default handler to process inline HTML etc */ - } else if(istag(name, "link")) { /* link href attribute */ - if((value = getattrvalue(atts, "href"))) - string_append(&feeditem.link, value, strlen(value)); - } else if(istag(name, "content") || istag(name, "summary")) { - if((value = getattrvalue(atts, "type"))) { /* content type is HTML or plain text */ - if(!strcasecmp(value, "xhtml") || !strcasecmp(value, "text/xhtml") || - !strcasecmp(value, "html") || !strcasecmp(value, "text/html")) - feeditem.contenttype = ContentTypeHTML; - } +void +xml_handler_attr_start(struct xmlparser *p, const char *tag, size_t taglen, const char *name, size_t namelen) { + if(iscontent && !iscontenttag) { + if(!attrcount) + xml_handler_data(p, " ", 1); + attrcount++; + xml_handler_data(p, name, namelen); + xml_handler_data(p, "=\"", 2); + return; + } +} + +void +xml_handler_attr_end(struct xmlparser *p, const char *tag, size_t taglen, const char *name, size_t namelen) { + if(iscontent && !iscontenttag) { + xml_handler_data(p, "\"", 1); + attrcount = 0; + } +} + +void +xml_handler_start_element_parsed(XMLParser *p, const char *tag, size_t taglen, int isshort) { + if(iscontent && !iscontenttag) { + if(isshort) + xml_handler_data(p, "/>", 2); + else + xml_handler_data(p, ">", 1); + } +} + +void +xml_handler_attr(XMLParser *p, const char *tag, size_t taglen, + const char *name, size_t namelen, const char *value, + size_t valuelen) { + if(iscontent && !iscontenttag) { + xml_handler_data(p, value, valuelen); + return; + } + if(feeditem.feedtype == FeedTypeAtom) { + /*if(feeditemtagid == AtomTagContent || feeditemtagid == AtomTagSummary) {*/ + if(iscontenttag) { + if(isattr(name, namelen, "type", strlen("type")) && + (isattr(value, valuelen, "xhtml", strlen("xhtml")) || isattr(value, valuelen, "text/xhtml", strlen("text/xhtml")) || + isattr(value, valuelen, "html", strlen("html")) || isattr(value, valuelen, "text/html", strlen("text/html")))) { + feeditem.contenttype = ContentTypeHTML; + iscontent = 1; +/* p->xmldataentity = NULL;*/ + p->xmlattrstart = xml_handler_attr_start; + p->xmlattrend = xml_handler_attr_end; + p->xmltagstartparsed = xml_handler_start_element_parsed; } - } else if(feeditem.feedtype == FeedTypeRSS) { - if((istag(feeditemtag, "description") && !feeditem.content.len) || istag(feeditemtag, "content:encoded")) { - string_clear(&feeditem.content); - XML_DefaultCurrent(parser); /* pass to default handler to process inline HTML etc */ + } else if(feeditemtagid == AtomTagLink && isattr(name, namelen, "href", strlen("href"))) /* link href attribute */ + string_append(&feeditem.link, value, valuelen); + } +} + +void +xml_handler_start_element(XMLParser *p, const char *name, size_t namelen) { + if(iscontenttag) { + /* starts with div, handle as XML, dont convert entities */ + /* TODO: test properly and do printf() to debug */ + if(feeditem.feedtype == FeedTypeAtom && !strncmp(name, "div", strlen("div"))) + p->xmldataentity = NULL; + } + if(iscontent) { + attrcount = 0; + iscontenttag = 0; + xml_handler_data(p, "<", 1); + xml_handler_data(p, name, namelen); + return; + } + + /* TODO: cleanup, merge with code below ?, return function if FeedTypeNone */ +/* iscontenttag = 0;*/ + if(feeditem.feedtype != FeedTypeNone) { /* in item */ + if(feeditemtag[0] == '\0') { /* set tag if not already set. */ +/* strncpy(feeditemtag, name, sizeof(feeditemtag) - 1);*/ + if(namelen >= sizeof(feeditemtag) - 2) + return; + memcpy(feeditemtag, name, namelen + 1); + feeditemtaglen = namelen; /* XXX: assumes feeditemtag had enough space */ + feeditemtagid = gettag(feeditem.feedtype, feeditemtag, feeditemtaglen); + + if(feeditem.feedtype == FeedTypeRSS) { + if(feeditemtagid == TagUnknown) + currentfield = NULL; + else if(feeditemtagid == RSSTagPubdate || feeditemtagid == RSSTagDcdate) + currentfield = &feeditem.timestamp; + else if(feeditemtagid == RSSTagTitle) + currentfield = &feeditem.title; + else if(feeditemtagid == RSSTagLink) + currentfield = &feeditem.link; + else if(feeditemtagid == RSSTagDescription || feeditemtagid == RSSTagContentencoded) { + /* clear previous summary, assumes previous content was not a summary text */ + if(feeditemtagid == RSSTagContentencoded && feeditem.content.len) + string_clear(&feeditem.content); + /* ignore, prefer content:encoded over description */ + if(!(feeditemtagid == RSSTagDescription && feeditem.content.len)) { + iscontenttag = 1; + currentfield = &feeditem.content; + } + } else if(feeditemtagid == RSSTagGuid) + currentfield = &feeditem.id; + else if(feeditemtagid == RSSTagAuthor || feeditemtagid == RSSTagDccreator) + currentfield = &feeditem.author; + } else if(feeditem.feedtype == FeedTypeAtom) { + if(feeditemtagid == TagUnknown) + currentfield = NULL; + else if(feeditemtagid == AtomTagPublished || feeditemtagid == AtomTagUpdated) + currentfield = &feeditem.timestamp; + else if(feeditemtagid == AtomTagTitle) + currentfield = &feeditem.title; + else if(feeditemtagid == AtomTagSummary || feeditemtagid == AtomTagContent) { + /* clear previous summary, assumes previous content was not a summary text */ + if(feeditemtagid == AtomTagContent && feeditem.content.len) + string_clear(&feeditem.content); + /* ignore, prefer content:encoded over description */ + if(!(feeditemtagid == AtomTagSummary && feeditem.content.len)) { + iscontenttag = 1; + currentfield = &feeditem.content; + } + } else if(feeditemtagid == AtomTagId) + currentfield = &feeditem.id; + else if(feeditemtagid == AtomTagLink) + currentfield = &feeditem.link; + else if(feeditemtagid == AtomTagAuthor) + currentfield = &feeditem.author; } + /* TODO: prefer content encoded over content? */ } - if(feeditemtag[0] == '\0') /* set tag if not already set. */ - strncpy(feeditemtag, name, sizeof(feeditemtag) - 1); - } else { /* start of RSS or Atom entry / item */ - if(istag(name, "entry")) { /* Atom */ + } else { /* start of RSS or Atom item / entry */ + if(istag(name, namelen, "entry", strlen("entry"))) { /* Atom */ feeditem.feedtype = FeedTypeAtom; feeditem.contenttype = ContentTypePlain; /* Default content type */ - } else if(istag(name, "item")) { /* RSS */ + currentfield = NULL; /* XXX: optimization */ + } else if(istag(name, namelen, "item", strlen("item"))) { /* RSS */ feeditem.feedtype = FeedTypeRSS; feeditem.contenttype = ContentTypeHTML; /* Default content type */ + currentfield = NULL; /* XXX: optimization */ } } } -void XMLCALL -xml_handler_end_element(void *data, const char *name) { +void +xml_handler_data_entity(XMLParser *p, const char *data, size_t datalen) { + char buffer[16]; + size_t len; + +#if 0 + if(iscontent) { + xml_handler_data(p, data, datalen); /* TODO: for now, dont convert entities */ + return; + } +#endif + /* TODO: for content HTML data entities, convert & to &? */ + if((len = entitytostr(data, buffer, sizeof(buffer)))) + xml_handler_data(p, buffer, len); + else + xml_handler_data(p, data, datalen); /* can't convert entity, just use it's data */ +} + +void +xml_handler_end_element(XMLParser *p, const char *name, size_t namelen, int isshort) { char timebuf[64]; + int tagid; + +/* printf("%d end tag: </%s>\n", iscontent, name);*/ + if(iscontent) { + attrcount = 0; + /* TODO: optimize */ + tagid = gettag(feeditem.feedtype, name, namelen); + if(feeditemtagid == tagid) { /* close content */ + iscontent = 0; + iscontenttag = 0; + + p->xmldataentity = xml_handler_data_entity; + p->xmlattrstart = NULL; + p->xmlattrend = NULL; + p->xmltagstartparsed = NULL; + + feeditemtag[0] = '\0'; /* unset tag */ + feeditemtaglen = 0; + feeditemtagid = TagUnknown; + return; /* TODO: not sure if !isshort check below should be skipped */ + } + if(!isshort) { + xml_handler_data(p, "</", 2); + xml_handler_data(p, name, namelen); + xml_handler_data(p, ">", 1); + } + return; + } if(feeditem.feedtype != FeedTypeNone) { /* end of RSS or Atom entry / item */ - if((istag(name, "entry") && (feeditem.feedtype == FeedTypeAtom)) || /* Atom */ - (istag(name, "item") && (feeditem.feedtype == FeedTypeRSS))) { /* RSS */ - printf("%ld", (long)parsetime((&feeditem.timestamp)->data, timebuf, - sizeof(timebuf) - 1)); - fputc(FieldSeparator, stdout); + /* TODO: optimize, use gettag() ? to tagid? */ + if((feeditem.feedtype == FeedTypeAtom && istag(name, namelen, "entry", strlen("entry"))) || /* Atom */ + (feeditem.feedtype == FeedTypeRSS && istag(name, namelen, "item", strlen("item")))) { /* RSS */ + printf("%ld", (long)parsetime((&feeditem.timestamp)->data, timebuf)); + putchar(FieldSeparator); fputs(timebuf, stdout); - fputc(FieldSeparator, stdout); + putchar(FieldSeparator); string_print_trimmed(&feeditem.title); - fputc(FieldSeparator, stdout); + putchar(FieldSeparator); string_print_trimmed(&feeditem.link); - fputc(FieldSeparator, stdout); + putchar(FieldSeparator); string_print_textblock(&feeditem.content); - fputc(FieldSeparator, stdout); + putchar(FieldSeparator); fputs(contenttypes[feeditem.contenttype], stdout); - fputc(FieldSeparator, stdout); + putchar(FieldSeparator); string_print_trimmed(&feeditem.id); - fputc(FieldSeparator, stdout); + putchar(FieldSeparator); string_print_trimmed(&feeditem.author); - fputc(FieldSeparator, stdout); + putchar(FieldSeparator); fputs(feedtypes[feeditem.feedtype], stdout); - fputc('\n', stdout); + putchar('\n'); /* clear strings */ string_clear(&feeditem.timestamp); @@ -347,109 +702,29 @@ xml_handler_end_element(void *data, const char *name) { string_clear(&feeditem.author); feeditem.feedtype = FeedTypeNone; feeditem.contenttype = ContentTypePlain; - incdata = 0; feeditemtag[0] = '\0'; /* unset tag */ - } else if(!strcmp(feeditemtag, name)) { /* clear */ + feeditemtaglen = 0; + feeditemtagid = TagUnknown; + + /* not sure if needed */ + iscontenttag = 0; + iscontent = 0; + } else if(!strcmp(feeditemtag, name)) { /* clear */ /* XXX: optimize ? */ + currentfield = NULL; feeditemtag[0] = '\0'; /* unset tag */ - } else { - if(feeditem.feedtype == FeedTypeAtom) { - if(istag(feeditemtag, "content") || istag(feeditemtag, "summary")) { - /* pass to default handler to process inline HTML etc */ - XML_DefaultCurrent(parser); - return; - } - } + feeditemtaglen = 0; + feeditemtagid = TagUnknown; + + /* not sure if needed */ + iscontenttag = 0; + iscontent = 0; } } - tag[0] = '\0'; /* unset tag */ -} - -/* NOTE: this handler can be called multiple times if the data in this block - * is bigger than the buffer */ -void XMLCALL -xml_handler_data(void *data, const XML_Char *s, int len) { - if(feeditem.feedtype == FeedTypeRSS) { - if(istag(feeditemtag, "pubdate") || istag(feeditemtag, "dc:date")) - string_append(&feeditem.timestamp, s, len); - else if(istag(feeditemtag, "title")) - string_append(&feeditem.title, s, len); - else if(istag(feeditemtag, "link")) - string_append(&feeditem.link, s, len); - else if(istag(feeditemtag, "description") || istag(feeditemtag, "content:encoded")) { - if(incdata) - XML_DefaultCurrent(parser); /* pass to default handler to process inline HTML etc */ - else - string_append(&feeditem.content, s, len); - } else if(istag(feeditemtag, "guid")) - string_append(&feeditem.id, s, len); - else if(istag(feeditemtag, "author") || istag(feeditemtag, "dc:creator")) - string_append(&feeditem.author, s, len); - } else if(feeditem.feedtype == FeedTypeAtom) { - if(istag(feeditemtag, "published") || istag(feeditemtag, "updated")) - string_append(&feeditem.timestamp, s, len); - else if(istag(feeditemtag, "title")) { - string_append(&feeditem.title, s, len); - } else if(istag(feeditemtag, "summary") || istag(feeditemtag, "content")) { - if(feeditem.contenttype == ContentTypeHTML) { - if(incdata) - XML_DefaultCurrent(parser); /* pass to default handler to process inline HTML etc */ - else - string_append(&feeditem.content, s, len); - } else - XML_DefaultCurrent(parser); /* pass to default handler to process inline HTML etc */ - } else if(istag(feeditemtag, "id")) - string_append(&feeditem.id, s, len); - else if(istag(feeditemtag, "name")) /* assume this is: <author><name></name></author> */ - string_append(&feeditem.author, s, len); - } -} - -int /* parse XML from stream using setup parser, return 1 on success, 0 on failure. */ -xml_parse_stream(XML_Parser parser, FILE *fp) { - char buffer[BUFSIZ]; - int done = 0, len = 0; - - while(!feof(fp)) { - len = fread(buffer, 1, sizeof(buffer), fp); - done = (feof(fp) || ferror(fp)); - if(XML_Parse(parser, buffer, len, done) == XML_STATUS_ERROR && (len > 0)) { - if(XML_GetErrorCode(parser) == XML_ERROR_NO_ELEMENTS) - return 1; /* Ignore "no elements found" / empty document as an error */ - fprintf(stderr, "sfeed: error parsing xml %s at line %lu column %lu\n", - XML_ErrorString(XML_GetErrorCode(parser)), (unsigned long)XML_GetCurrentLineNumber(parser), - (unsigned long)XML_GetCurrentColumnNumber(parser)); - return 0; - } - } while(!done); - return 1; -} - -void -xml_handler_default(void *data, const XML_Char *s, int len) { - if((feeditem.feedtype == FeedTypeAtom && (istag(feeditemtag, "summary") || istag(feeditemtag, "content"))) || - (feeditem.feedtype == FeedTypeRSS && (istag(feeditemtag, "description") || istag(feeditemtag, "content:encoded")))) - /*if(!istag(tag, "script") && !istag(tag, "style"))*/ /* ignore data in inline script and style */ - string_append(&feeditem.content, s, len); -} - -void /* NOTE: data is null terminated. */ -xml_handler_comment(void *data, const XML_Char *s) { -} - -void -xml_cdata_section_handler_start(void *userdata) { - incdata = 1; -} - -void -xml_cdata_section_handler_end(void *userdata) { - incdata = 0; } int main(void) { - int status; - standardtz = getenv("TZ"); + atexit(cleanup); /* init strings and initial memory pool size */ string_buffer_init(&feeditem.timestamp, 64); @@ -460,20 +735,15 @@ main(void) { string_buffer_init(&feeditem.author, 256); feeditem.contenttype = ContentTypePlain; feeditem.feedtype = FeedTypeNone; - feeditemtag[0] = '\0'; /* unset tag */ - tag[0] = '\0'; /* unset tag */ - - if(!(parser = XML_ParserCreate("UTF-8"))) - die("can't create parser"); - - XML_SetElementHandler(parser, xml_handler_start_element, xml_handler_end_element); - XML_SetCharacterDataHandler(parser, xml_handler_data); - XML_SetCommentHandler(parser, xml_handler_comment); - XML_SetCdataSectionHandler(parser, xml_cdata_section_handler_start, xml_cdata_section_handler_end); - XML_SetDefaultHandler(parser, xml_handler_default); - status = xml_parse_stream(parser, stdin); - cleanup(); + xmlparser_init(&parser); + parser.xmltagstart = xml_handler_start_element; + parser.xmltagend = xml_handler_end_element; + parser.xmldata = xml_handler_data; + parser.xmldataentity = xml_handler_data_entity; + parser.xmlattr = xml_handler_attr; + parser.xmlcdata = xml_handler_cdata; + xmlparser_parse(&parser); - return status ? EXIT_SUCCESS : EXIT_FAILURE; + return EXIT_SUCCESS; } |