#include #include #include #include #include #include #include #include #include "util.h" #include "xml.h" #define ISWSNOSPACE(c) (((unsigned)c - '\t') < 5) /* isspace(c) && c != ' ' */ enum { FeedTypeNone = 0, FeedTypeRSS = 1, FeedTypeAtom = 2 }; static const char *feedtypes[] = { "", "rss", "atom" }; enum { ContentTypeNone = 0, ContentTypePlain = 1, ContentTypeHTML = 2 }; static const char *contenttypes[] = { "", "plain", "html" }; static const int FieldSeparator = '\t'; /* output field seperator character */ enum { TagUnknown = 0, /* RSS */ RSSTagDcdate, RSSTagPubdate, RSSTagTitle, RSSTagLink, RSSTagDescription, RSSTagContentencoded, RSSTagGuid, RSSTagAuthor, RSSTagDccreator, /* Atom */ AtomTagPublished, AtomTagUpdated, AtomTagTitle, AtomTagSummary, AtomTagContent, AtomTagId, AtomTagLink, AtomTagAuthor }; /* String data / memory pool */ typedef struct string { char *data; /* data */ size_t len; /* string length */ size_t bufsiz; /* allocated size */ } String; /* Feed item */ typedef struct feeditem { String timestamp; String title; String link; String content; int contenttype; /* ContentTypePlain or ContentTypeHTML */ String id; String author; int feedtype; /* FeedTypeRSS or FeedTypeAtom */ } FeedItem; typedef struct feedtag { char *name; size_t namelen; int id; } FeedTag; typedef struct feedcontext { String *field; /* pointer to current FeedItem field String */ FeedItem item; /* data for current feed item */ char tag[256]; /* current tag _inside_ a feeditem */ int tagid; /* unique number for parsed tag (faster comparison) */ size_t taglen; int iscontent; int iscontenttag; int attrcount; } FeedContext; static void die(const char *s); static void cleanup(void); static FeedContext ctx; static XMLParser parser; /* XML parser state */ static char *append = NULL; /* append string after each output line */ /* unique number for parsed tag (faster comparison) */ static int gettag(int feedtype, const char *name, size_t namelen) { /* RSS, alphabetical order */ static FeedTag rsstag[] = { { "author", 6, RSSTagAuthor }, { "content:encoded", 15, RSSTagContentencoded }, { "dc:creator", 10, RSSTagDccreator }, { "dc:date", 7, RSSTagDcdate }, { "description", 11, RSSTagDescription }, { "guid", 4, RSSTagGuid }, { "link", 4, RSSTagLink }, { "pubdate", 7, RSSTagPubdate }, { "title", 5, RSSTagTitle }, { NULL, 0, -1 } }; /* Atom, alphabetical order */ static FeedTag atomtag[] = { { "author", 6, AtomTagAuthor }, { "content", 7, AtomTagContent }, { "id", 2, AtomTagId }, { "link", 4, AtomTagLink }, { "published", 9, AtomTagPublished }, { "summary", 7, AtomTagSummary }, { "title", 5, AtomTagTitle }, { "updated", 7, AtomTagUpdated }, { NULL, 0, -1 } }; int i, n; if(namelen < 2 || namelen > 15) /* optimization */ return TagUnknown; if(feedtype == FeedTypeRSS) { for(i = 0; rsstag[i].name; i++) { if(!(n = strncasecmp(rsstag[i].name, name, rsstag[i].namelen))) return rsstag[i].id; /* optimization: it's sorted so nothing after it matches. */ if(n > 0) return TagUnknown; } } else if(feedtype == FeedTypeAtom) { for(i = 0; atomtag[i].name; i++) { if(!(n = strncasecmp(atomtag[i].name, name, atomtag[i].namelen))) return atomtag[i].id; /* optimization: it's sorted so nothing after it matches. */ if(n > 0) return TagUnknown; } } return TagUnknown; } static size_t codepointtoutf8(uint32_t cp, uint32_t *utf) { if(cp >= 0x10000) { /* 4 bytes */ *utf = 0xf0808080 | ((cp & 0xfc0000) << 6) | ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) | (cp & 0x3f); return 4; } else if(cp >= 0x00800) { /* 3 bytes */ *utf = 0xe08080 | ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) | (cp & 0x3f); return 3; } else if(cp >= 0x80) { /* 2 bytes */ *utf = 0xc080 | ((cp & 0xfc0) << 2) | (cp & 0x3f); return 2; } *utf = cp & 0xff; return *utf ? 1 : 0; /* 1 byte */ } static size_t namedentitytostr(const char *e, char *buffer, size_t bufsiz) { char *entities[6][2] = { { "<", "<" }, { ">", ">" }, { "'", "'" }, { "&", "&" }, { """, "\"" }, { NULL, NULL } }; size_t i; if(*e != '&' || bufsiz < 2) /* doesn't start with & */ return 0; for(i = 0; entities[i][0]; i++) { /* NOTE: compares max 7 chars */ if(!strncasecmp(e, entities[i][0], 6)) { buffer[0] = *(entities[i][1]); buffer[1] = '\0'; return 1; } } return 0; } /* convert named- or numeric entity string to buffer string * returns byte-length of string. */ static size_t entitytostr(const char *e, char *buffer, size_t bufsiz) { uint32_t l = 0, cp = 0; size_t len = 0, b; int c; if(*e != '&' || bufsiz < 5) /* doesn't start with & */ return 0; if(e[1] == '#') { e += 2; /* skip &# */ errno = 0; if(*e == 'x') l = strtoul(e + 1, NULL, 16); /* hex */ else l = strtoul(e, NULL, 10); /* decimal */ if(errno != 0) return 0; /* invalid value */ if(!(len = codepointtoutf8(l, &cp))) return 0; /* make string */ for(b = 0; b < len; b++) buffer[b] = (cp >> (8 * (len - 1 - b))) & 0xff; buffer[len] = '\0'; /* escape whitespace */ if(ISWSNOSPACE(buffer[0])) { /* isspace(c) && c != ' ' */ switch(buffer[0]) { case '\n': c = 'n'; break; case '\\': c = '\\'; break; case '\t': c = 't'; break; default: c = '\0'; break; } if(c != '\0') { buffer[0] = '\\'; buffer[1] = c; buffer[2] = '\0'; len = 2; } } } else /* named entity */ len = namedentitytostr(e, buffer, bufsiz); return len; } /* clear string only; don't free, prevents unnecessary reallocation */ static void string_clear(String *s) { if(s->data) s->data[0] = '\0'; s->len = 0; } static void string_buffer_init(String *s, size_t len) { if(!(s->data = malloc(len))) die("can't allocate enough memory"); s->bufsiz = len; string_clear(s); } static void string_free(String *s) { free(s->data); s->data = NULL; s->bufsiz = 0; s->len = 0; } static int string_buffer_realloc(String *s, size_t newlen) { char *p; size_t alloclen; for(alloclen = 16; alloclen <= newlen; alloclen *= 2); if(!(p = realloc(s->data, alloclen))) { string_free(s); /* free previous allocation */ die("can't allocate enough memory"); } s->bufsiz = alloclen; s->data = p; return s->bufsiz; } static void string_append(String *s, const char *data, size_t len) { if(!len || *data == '\0') return; /* check if allocation is necesary, don't shrink buffer should be more than bufsiz ofcourse */ if(s->len + len > s->bufsiz) string_buffer_realloc(s, s->len + len); memcpy(s->data + s->len, data, len); s->len += len; s->data[s->len] = '\0'; } /* cleanup, free allocated memory, etc */ static void cleanup(void) { string_free(&ctx.item.timestamp); string_free(&ctx.item.title); string_free(&ctx.item.link); string_free(&ctx.item.content); string_free(&ctx.item.id); string_free(&ctx.item.author); } /* print error message to stderr */ static void die(const char *s) { fprintf(stderr, "sfeed: %s\n", s); exit(EXIT_FAILURE); } /* get timezone from string, return as formatted string and time offset, * for the offset it assumes GMT */ static int gettimetz(const char *s, char *buf, size_t bufsiz) { const char *p = s; char tzname[16] = "", *t = NULL; int tzhour = 0, tzmin = 0; unsigned int i; char c; buf[0] = '\0'; if(bufsiz < sizeof(tzname) + strlen(" -00:00")) return 0; for(; *p && isspace((int)*p); p++); /* skip whitespace */ /* loop until some common timezone delimiters are found */ for(; *p && (*p != '+' && *p != '-' && *p != 'Z' && *p != 'z'); p++); /* TODO: cleanup / simplify */ if(isalpha((int)*p)) { if(*p == 'Z' || *p == 'z') { strlcpy(buf, "GMT+00:00", sizeof(buf)); return 0; } else { for(i = 0, t = &tzname[0]; i < (sizeof(tzname) - 1) && (*p && isalpha((int)*p)); i++) *(t++) = *(p++); *t = '\0'; } } else strlcpy(tzname, "GMT", sizeof(tzname)); if(!(*p)) { strlcpy(buf, tzname, bufsiz); return 0; } if((sscanf(p, "%c%02d:%02d", &c, &tzhour, &tzmin)) > 0); else if(sscanf(p, "%c%02d%02d", &c, &tzhour, &tzmin) > 0); else if(sscanf(p, "%c%d", &c, &tzhour) > 0) tzmin = 0; snprintf(buf, bufsiz, "%s%c%02d%02d", tzname, c, tzhour, tzmin); /* TODO: test + or - offset */ return (tzhour * 3600) + (tzmin * 60) * (c == '-' ? -1 : 1); } static time_t parsetime(const char *s, char *buf, size_t bufsiz) { time_t t = -1; /* can't parse */ char tz[64] = ""; struct tm tm; const char *formats[] = { "%a, %d %b %Y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", NULL }; char *p; unsigned int i; if(buf && bufsiz > 0) buf[0] = '\0'; memset(&tm, 0, sizeof(tm)); for(i = 0; formats[i]; i++) { if((p = strptime(s, formats[i], &tm))) { tm.tm_isdst = -1; /* don't use DST */ if((t = mktime(&tm)) == -1) /* error */ return t; t -= gettimetz(p, tz, sizeof(tz)); if(buf) snprintf(buf, bufsiz, "%04d-%02d-%02d %02d:%02d:%02d %-.16s", tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, tz); break; } } return t; } /* print text, escape tabs, newline and carriage return etc */ static void string_print(String *s) { const char *p; /* skip leading whitespace */ for(p = s->data; *p && isspace((int)*p); p++); for(; *p; p++) { if(ISWSNOSPACE(*p)) { switch(*p) { case '\n': fputs("\\n", stdout); break; case '\\': fputs("\\\\", stdout); break; case '\t': fputs("\\t", stdout); break; default: break; /* ignore other whitespace chars */ } } else putchar(*p); } } static int istag(const char *name, size_t len, const char *name2, size_t len2) { return (len == len2 && !strcasecmp(name, name2)); } static int isattr(const char *name, size_t len, const char *name2, size_t len2) { return (len == len2 && !strcasecmp(name, name2)); } /* NOTE: this handler can be called multiple times if the data in this * block is bigger than the buffer */ static void xml_handler_data(XMLParser *p, const char *s, size_t len) { if(ctx.field) { /* add only data from inside tag * or any other non- tag */ if(ctx.tagid != AtomTagAuthor || !strcmp(p->tag, "name")) string_append(ctx.field, s, len); } } static void xml_handler_cdata(XMLParser *p, const char *s, size_t len) { (void)p; if(ctx.field) string_append(ctx.field, s, len); } static void xml_handler_attr_start(struct xmlparser *p, const char *tag, size_t taglen, const char *name, size_t namelen) { (void)tag; (void)taglen; if(ctx.iscontent && !ctx.iscontenttag) { if(!ctx.attrcount) xml_handler_data(p, " ", 1); ctx.attrcount++; xml_handler_data(p, name, namelen); xml_handler_data(p, "=\"", 2); return; } } static void xml_handler_attr_end(struct xmlparser *p, const char *tag, size_t taglen, const char *name, size_t namelen) { (void)tag; (void)taglen; (void)name; (void)namelen; if(ctx.iscontent && !ctx.iscontenttag) { xml_handler_data(p, "\"", 1); ctx.attrcount = 0; } } static void xml_handler_start_element_parsed(XMLParser *p, const char *tag, size_t taglen, int isshort) { (void)tag; (void)taglen; if(ctx.iscontent && !ctx.iscontenttag) { if(isshort) xml_handler_data(p, "/>", 2); else xml_handler_data(p, ">", 1); } } static void xml_handler_attr(XMLParser *p, const char *tag, size_t taglen, const char *name, size_t namelen, const char *value, size_t valuelen) { (void)tag; (void)taglen; if(ctx.iscontent && !ctx.iscontenttag) { xml_handler_data(p, value, valuelen); return; } if(ctx.item.feedtype == FeedTypeAtom) { /*if(ctx.tagid == AtomTagContent || ctx.tagid == AtomTagSummary) {*/ if(ctx.iscontenttag) { if(isattr(name, namelen, "type", strlen("type")) && (isattr(value, valuelen, "xhtml", strlen("xhtml")) || isattr(value, valuelen, "text/xhtml", strlen("text/xhtml")) || isattr(value, valuelen, "html", strlen("html")) || isattr(value, valuelen, "text/html", strlen("text/html")))) { ctx.item.contenttype = ContentTypeHTML; ctx.iscontent = 1; /* p->xmldataentity = NULL;*/ p->xmlattrstart = xml_handler_attr_start; p->xmlattrend = xml_handler_attr_end; p->xmltagstartparsed = xml_handler_start_element_parsed; } } else if(ctx.tagid == AtomTagLink && isattr(name, namelen, "href", strlen("href"))) { /* link href attribute */ string_append(&ctx.item.link, value, valuelen); } } } static void xml_handler_start_element(XMLParser *p, const char *name, size_t namelen) { if(ctx.iscontenttag) { /* starts with div, handle as XML, don't convert entities (set handle to NULL) */ if(ctx.item.feedtype == FeedTypeAtom && namelen == strlen("div") && !strncmp(name, "div", strlen("div"))) { p->xmldataentity = NULL; } } if(ctx.iscontent) { ctx.attrcount = 0; ctx.iscontenttag = 0; xml_handler_data(p, "<", 1); xml_handler_data(p, name, namelen); return; } /* TODO: cleanup, merge with code below ?, return function if FeedTypeNone */ /* ctx.iscontenttag = 0;*/ /* start of RSS or Atom item / entry */ if(ctx.item.feedtype == FeedTypeNone) { if(istag(name, namelen, "entry", strlen("entry"))) { /* Atom */ ctx.item.feedtype = FeedTypeAtom; ctx.item.contenttype = ContentTypePlain; /* Default content type */ ctx.field = NULL; /* XXX: optimization */ } else if(istag(name, namelen, "item", strlen("item"))) { /* RSS */ ctx.item.feedtype = FeedTypeRSS; ctx.item.contenttype = ContentTypeHTML; /* Default content type */ ctx.field = NULL; /* XXX: optimization */ } return; } /* tag already set: return */ if(ctx.tag[0] != '\0') return; /* in item */ strlcpy(ctx.tag, name, sizeof(ctx.tag)); ctx.taglen = namelen; ctx.tagid = gettag(ctx.item.feedtype, ctx.tag, ctx.taglen); if(ctx.tagid == TagUnknown) ctx.field = NULL; if(ctx.item.feedtype == FeedTypeRSS) { if(ctx.tagid == RSSTagPubdate || ctx.tagid == RSSTagDcdate) ctx.field = &ctx.item.timestamp; else if(ctx.tagid == RSSTagTitle) ctx.field = &ctx.item.title; else if(ctx.tagid == RSSTagLink) ctx.field = &ctx.item.link; else if(ctx.tagid == RSSTagDescription || ctx.tagid == RSSTagContentencoded) { /* clear content, assumes previous content was not a summary text */ if(ctx.tagid == RSSTagContentencoded && ctx.item.content.len) string_clear(&ctx.item.content); /* ignore, prefer content:encoded over description */ if(!(ctx.tagid == RSSTagDescription && ctx.item.content.len)) { ctx.iscontenttag = 1; ctx.field = &ctx.item.content; } } else if(ctx.tagid == RSSTagGuid) ctx.field = &ctx.item.id; else if(ctx.tagid == RSSTagAuthor || ctx.tagid == RSSTagDccreator) ctx.field = &ctx.item.author; } else if(ctx.item.feedtype == FeedTypeAtom) { if(ctx.tagid == AtomTagPublished || ctx.tagid == AtomTagUpdated) ctx.field = &ctx.item.timestamp; else if(ctx.tagid == AtomTagTitle) ctx.field = &ctx.item.title; else if(ctx.tagid == AtomTagSummary || ctx.tagid == AtomTagContent) { /* clear content, assumes previous content was not a summary text */ if(ctx.tagid == AtomTagContent && ctx.item.content.len) string_clear(&ctx.item.content); /* ignore, prefer content:encoded over description */ if(!(ctx.tagid == AtomTagSummary && ctx.item.content.len)) { ctx.iscontenttag = 1; ctx.field = &ctx.item.content; } } else if(ctx.tagid == AtomTagId) ctx.field = &ctx.item.id; else if(ctx.tagid == AtomTagLink) ctx.field = &ctx.item.link; else if(ctx.tagid == AtomTagAuthor) ctx.field = &ctx.item.author; } } static void xml_handler_data_entity(XMLParser *p, const char *data, size_t datalen) { char buffer[16]; size_t len; /* try to translate entity, else just pass as data */ if((len = entitytostr(data, buffer, sizeof(buffer))) > 0) xml_handler_data(p, buffer, len); else xml_handler_data(p, data, datalen); } static void xml_handler_end_element(XMLParser *p, const char *name, size_t namelen, int isshort) { char timebuf[64]; int tagid; if(ctx.iscontent) { ctx.attrcount = 0; /* TODO: optimize */ tagid = gettag(ctx.item.feedtype, name, namelen); if(ctx.tagid == tagid) { /* close content */ ctx.iscontent = 0; ctx.iscontenttag = 0; p->xmldataentity = xml_handler_data_entity; p->xmlattrstart = NULL; p->xmlattrend = NULL; p->xmltagstartparsed = NULL; ctx.tag[0] = '\0'; /* unset tag */ ctx.taglen = 0; ctx.tagid = TagUnknown; return; /* TODO: not sure if !isshort check below should be skipped */ } if(!isshort) { xml_handler_data(p, "", 1); } return; } if(ctx.item.feedtype == FeedTypeNone) return; /* end of RSS or Atom entry / item */ /* TODO: optimize, use gettag() ? to tagid? */ if((ctx.item.feedtype == FeedTypeAtom && istag(name, namelen, "entry", strlen("entry"))) || /* Atom */ (ctx.item.feedtype == FeedTypeRSS && istag(name, namelen, "item", strlen("item")))) /* RSS */ { printf("%ld", (long)parsetime((&ctx.item.timestamp)->data, timebuf, sizeof(timebuf))); putchar(FieldSeparator); fputs(timebuf, stdout); putchar(FieldSeparator); string_print(&ctx.item.title); putchar(FieldSeparator); string_print(&ctx.item.link); putchar(FieldSeparator); string_print(&ctx.item.content); putchar(FieldSeparator); fputs(contenttypes[ctx.item.contenttype], stdout); putchar(FieldSeparator); string_print(&ctx.item.id); putchar(FieldSeparator); string_print(&ctx.item.author); putchar(FieldSeparator); fputs(feedtypes[ctx.item.feedtype], stdout); if(append) { putchar(FieldSeparator); fputs(append, stdout); } putchar('\n'); /* clear strings */ string_clear(&ctx.item.timestamp); string_clear(&ctx.item.title); string_clear(&ctx.item.link); string_clear(&ctx.item.content); string_clear(&ctx.item.id); string_clear(&ctx.item.author); ctx.item.feedtype = FeedTypeNone; ctx.item.contenttype = ContentTypePlain; ctx.tag[0] = '\0'; /* unset tag */ ctx.taglen = 0; ctx.tagid = TagUnknown; /* not sure if needed */ ctx.iscontenttag = 0; ctx.iscontent = 0; } else if(ctx.taglen == namelen && !strcmp(ctx.tag, name)) { /* clear */ /* XXX: optimize ? */ ctx.field = NULL; ctx.tag[0] = '\0'; /* unset tag */ ctx.taglen = 0; ctx.tagid = TagUnknown; /* not sure if needed */ ctx.iscontenttag = 0; ctx.iscontent = 0; } } int main(int argc, char **argv) { atexit(cleanup); if(argc > 1) append = argv[1]; memset(&ctx, 0, sizeof(ctx)); /* init strings and initial memory pool size */ string_buffer_init(&ctx.item.timestamp, 64); string_buffer_init(&ctx.item.title, 256); string_buffer_init(&ctx.item.link, 1024); string_buffer_init(&ctx.item.content, 4096); string_buffer_init(&ctx.item.id, 1024); string_buffer_init(&ctx.item.author, 256); ctx.item.contenttype = ContentTypePlain; ctx.item.feedtype = FeedTypeNone; xmlparser_init(&parser, stdin); parser.xmltagstart = xml_handler_start_element; parser.xmltagend = xml_handler_end_element; parser.xmldata = xml_handler_data; parser.xmldataentity = xml_handler_data_entity; parser.xmlattr = xml_handler_attr; parser.xmlcdata = xml_handler_cdata; xmlparser_parse(&parser); return EXIT_SUCCESS; }