diff options
-rw-r--r-- | sfeed.c | 140 | ||||
-rw-r--r-- | xml.c | 136 | ||||
-rw-r--r-- | xml.h | 27 |
3 files changed, 169 insertions, 134 deletions
@@ -11,8 +11,6 @@ #include "util.h" #include "xml.h" -/* fast isspace(c) && c != ' ' check. */ -#define ISWSNOSPACE(c) (((unsigned)c - '\t') < 5) #define ISINCONTENT(ctx) ((ctx).iscontent && !((ctx).iscontenttag)) #define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag) /* string and size */ @@ -27,6 +25,7 @@ enum { ContentTypeNone = 0, ContentTypePlain = 1, ContentTypeHTML = 2 }; static const char *contenttypes[] = { "", "plain", "html" }; static const int FieldSeparator = '\t'; /* output field seperator character */ +static const char *baseurl = ""; enum { TagUnknown = 0, @@ -60,8 +59,8 @@ typedef struct feeditem { } FeedItem; typedef struct feedtag { - char *name; - size_t namelen; + char *name; + size_t namelen; int id; } FeedTag; @@ -76,13 +75,10 @@ typedef struct feedcontext { int attrcount; } FeedContext; -static size_t codepointtoutf8(uint32_t, uint32_t *); -static size_t entitytostr(const char *, char *, size_t); static int gettag(int, const char *, size_t); static int gettimetz(const char *, char *, size_t, int *); static int isattr(const char *, size_t, const char *, size_t); static int istag(const char *, size_t, const char *, size_t); -static size_t namedentitytostr(const char *, char *, size_t); static int parsetime(const char *, char *, size_t, time_t *); static void printfields(void); static void string_append(String *, const char *, size_t); @@ -106,7 +102,6 @@ static void xml_handler_start_element_parsed(XMLParser *, const char *, static FeedContext ctx; static XMLParser parser; /* XML parser state */ -static char *append = NULL; /* append string after each output line */ /* unique number for parsed tag (faster comparison) */ static int @@ -163,109 +158,6 @@ gettag(int feedtype, const char *name, size_t namelen) return TagUnknown; } -static size_t -codepointtoutf8(uint32_t cp, uint32_t *utf) -{ - if(cp >= 0x10000) { - /* 4 bytes */ - *utf = 0xf0808080 | ((cp & 0xfc0000) << 6) | - ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) | - (cp & 
0x3f); - return 4; - } else if(cp >= 0x00800) { - /* 3 bytes */ - *utf = 0xe08080 | - ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) | - (cp & 0x3f); - return 3; - } else if(cp >= 0x80) { - /* 2 bytes */ - *utf = 0xc080 | - ((cp & 0xfc0) << 2) | (cp & 0x3f); - return 2; - } - *utf = cp & 0xff; - return *utf ? 1 : 0; /* 1 byte */ -} - -static size_t -namedentitytostr(const char *e, char *buffer, size_t bufsiz) -{ - char *entities[6][2] = { - { "&lt;", "<" }, - { "&gt;", ">" }, - { "&apos;", "'" }, - { "&amp;", "&" }, - { "&quot;", "\"" }, - { NULL, NULL } - }; - size_t i; - - if(*e != '&' || bufsiz < 2) /* doesn't start with & */ - return 0; - for(i = 0; entities[i][0]; i++) { - /* NOTE: compares max 7 chars */ - if(!strncasecmp(e, entities[i][0], 6)) { - buffer[0] = *(entities[i][1]); - buffer[1] = '\0'; - return 1; - } - } - return 0; -} - -/* convert named- or numeric entity string to buffer string - * returns byte-length of string. */ -static size_t -entitytostr(const char *e, char *buffer, size_t bufsiz) -{ - uint32_t l = 0, cp = 0; - size_t len = 0, b; - int c; - char *end; - - /* doesn't start with & or insufficient buffer size */ - if(e[0] != '&' || bufsiz < 5) - return 0; - /* named entity */ - if(e[1] != '#') - return namedentitytostr(e, buffer, bufsiz); - - /* e[1] == '#', numeric / hexadecimal entity */ - e += 2; /* skip "&#" */ - errno = 0; - /* hex (16) or decimal (10) */ - if(*e == 'x') - l = strtoul(e + 1, &end, 16); - else - l = strtoul(e, &end, 10); - /* invalid value or not a well-formed entity */ - if(errno != 0 || (*end != '\0' && *end != ';')) - return 0; - if(!(len = codepointtoutf8(l, &cp))) - return 0; - /* make string */ - for(b = 0; b < len; b++) - buffer[b] = (cp >> (8 * (len - 1 - b))) & 0xff; - buffer[len] = '\0'; - /* escape whitespace */ - if(ISWSNOSPACE(buffer[0])) { - switch(buffer[0]) { - case '\n': c = 'n'; break; - case '\\': c = '\\'; break; - case '\t': c = 't'; break; - default: c = '\0'; break; - } - if(c != '\0') { - buffer[0] = '\\'; - buffer[1]
= c; - buffer[2] = '\0'; - len = 2; - } - } - return len; -} - /* clear string only; don't free, prevents unnecessary reallocation */ static void string_clear(String *s) @@ -479,10 +371,6 @@ printfields(void) string_print(&ctx.item.author); putchar(FieldSeparator); fputs(feedtypes[ctx.item.feedtype], stdout); - if(append) { - putchar(FieldSeparator); - fputs(append, stdout); - } putchar('\n'); } @@ -703,12 +591,17 @@ static void xml_handler_data_entity(XMLParser *p, const char *data, size_t datalen) { char buffer[16]; - size_t len; + int len; /* try to translate entity, else just pass as data to - * xml_data_handler */ - if((len = entitytostr(data, buffer, sizeof(buffer))) > 0) - xml_handler_data(p, buffer, len); + * xml_data_handler */ + len = xml_entitytostr(data, buffer, sizeof(buffer)); + /* this should never happen (buffer too small) */ + if(len < 0) + return; + + if(len > 0) + xml_handler_data(p, buffer, (size_t)len); else xml_handler_data(p, data, datalen); } @@ -786,13 +679,8 @@ xml_handler_end_element(XMLParser *p, const char *name, size_t namelen, int issh int main(int argc, char *argv[]) { - if(argc > 1) { - append = argv[1]; - if(!strcmp(argv[1], "-v")) { - printf("%s\n", VERSION); - return 0; - } - } + if(argc > 1) + baseurl = argv[1]; /* init strings and initial memory pool size */ string_buffer_init(&ctx.item.timestamp, 64); @@ -7,6 +7,18 @@ #include "xml.h" +static const struct { + char *entity; + size_t len; + int c; +} entities[] = { + { .entity = "&lt;", .len = 4, .c = '<' }, + { .entity = "&gt;", .len = 4, .c = '>' }, + { .entity = "&apos;", .len = 6, .c = '\'' }, + { .entity = "&amp;", .len = 5, .c = '&' }, + { .entity = "&quot;", .len = 6, .c = '"' } +}; + static int xmlparser_string_getnext(XMLParser *x) { @@ -185,7 +197,7 @@ xmlparser_parsecomment(XMLParser *x) } /* TODO: - * <test><![CDATA[1234567dddd8]]]> + * <test><![CDATA[1234567dddd8]]> * * with x->data of sizeof(15) gives 2 ] at end of cdata, should be 1 * test comment function too for similar bug?
@@ -194,12 +206,31 @@ xmlparser_parsecomment(XMLParser *x) static __inline__ void xmlparser_parsecdata(XMLParser *x) { + static const char *end = "]]>"; + static const size_t endsiz = sizeof(end); size_t datalen = 0, i = 0; int c; if(x->xmlcdatastart) x->xmlcdatastart(x); while((c = xmlparser_getnext(x)) != EOF) { + if(c == end[i++]) { + if(!end[i]) { /* end of match */ + if(datalen >= endsiz) { + datalen -= endsiz; + x->data[datalen] = '\0'; + } + if(x->xmlcdata) + x->xmlcdata(x, x->data, datalen); + if(x->xmlcdataend) + x->xmlcdataend(x); + break; + } + continue; + } else { + i = 0; + } +#if 0 if(c == ']' && i < 2) { i++; } else if(c == '>') { @@ -216,6 +247,7 @@ xmlparser_parsecdata(XMLParser *x) } i = 0; } +#endif /* TODO: what if the end has ]>, and it's cut on the boundary */ if(datalen < sizeof(x->data) - 1) { x->data[datalen++] = c; @@ -229,6 +261,108 @@ xmlparser_parsecdata(XMLParser *x) } } +int +xml_codepointtoutf8(uint32_t cp, uint32_t *utf) +{ + if(cp >= 0x10000) { + /* 4 bytes */ + *utf = 0xf0808080 | ((cp & 0xfc0000) << 6) | + ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) | + (cp & 0x3f); + return 4; + } else if(cp >= 0x00800) { + /* 3 bytes */ + *utf = 0xe08080 | + ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) | + (cp & 0x3f); + return 3; + } else if(cp >= 0x80) { + /* 2 bytes */ + *utf = 0xc080 | + ((cp & 0xfc0) << 2) | (cp & 0x3f); + return 2; + } + *utf = cp & 0xff; + return *utf ? 
1 : 0; /* 1 byte */ +} + +ssize_t +xml_namedentitytostr(const char *e, char *buf, size_t bufsiz) +{ + size_t i; + + /* buffer is too small */ + if(bufsiz < 2) + return -1; + + /* doesn't start with &: can't match */ + if(*e != '&') + return 0; + + for(i = 0; sizeof(entities) / sizeof(*entities); i++) { + /* NOTE: compares max 6 chars */ + if(!strncasecmp(e, entities[i].entity, 6)) { + buf[0] = entities[i].c; + buf[1] = '\0'; + return 1; + } + } + return 0; +} + +ssize_t +xml_numericentitytostr(const char *e, char *buf, size_t bufsiz) +{ + uint32_t l = 0, cp = 0; + size_t b, len; + char *end; + + /* buffer is too small */ + if(bufsiz < 5) + return -1; + + /* not a numeric entity */ + if(!(e[0] == '&' && e[1] == '#')) + return 0; + + /* e[1] == '#', numeric / hexadecimal entity */ + e += 2; /* skip "&#" */ + errno = 0; + /* hex (16) or decimal (10) */ + if(*e == 'x') + l = strtoul(e + 1, &end, 16); + else + l = strtoul(e, &end, 10); + /* invalid value or not a well-formed entity */ + if(errno != 0 || (*end != '\0' && *end != ';')) + return 0; + if(!(len = xml_codepointtoutf8(l, &cp))) + return 0; + /* make string */ + for(b = 0; b < len; b++) + buf[b] = (cp >> (8 * (len - 1 - b))) & 0xff; + buf[len] = '\0'; + return (ssize_t)len; +} + +/* convert named- or numeric entity string to buffer string + * returns byte-length of string. */ +ssize_t +xml_entitytostr(const char *e, char *buf, size_t bufsiz) +{ + /* buffer is too small */ + if(bufsiz < 5) + return -1; + /* doesn't start with & */ + if(e[0] != '&') + return 0; + /* named entity */ + if(e[1] != '#') + return xml_namedentitytostr(e, buf, bufsiz); + else /* numeric entity */ + return xml_numericentitytostr(e, buf, bufsiz); +} + static void xmlparser_parse(XMLParser *x) { @@ -29,22 +29,35 @@ typedef struct xmlparser { int (*getnext)(struct xmlparser *); - int readerrno; /* errno set from read(). */ - int fd; /* fd to read from */ + /* for use with xmlparser_parse_fd */ + /* errno set from read(). 
*/ + int readerrno; + int fd; - const char *str; /* "read" from string */ + /* for use with "read" from string: xmlparser_parse_string */ + const char *str; /* private; internal state */ - char tag[1024]; /* current tag */ - int isshorttag; /* current tag is in short form ? */ + + /* current tag */ + char tag[1024]; + /* current tag is in short form ? */ + int isshorttag; size_t taglen; - char name[256]; /* current attribute name */ - char data[BUFSIZ]; /* data buffer used for tag and attribute data */ + /* current attribute name */ + char name[256]; + /* data buffer used for tag data, cdata and attribute data */ + char data[BUFSIZ]; size_t readoffset; size_t readlastbytes; /* read buffer used by xmlparser_getnext */ unsigned char readbuf[BUFSIZ]; } XMLParser; +int xml_codepointtoutf8(uint32_t, uint32_t *); +ssize_t xml_entitytostr(const char *, char *, size_t); +ssize_t xml_namedentitytostr(const char *, char *, size_t); +ssize_t xml_numericetitytostr(const char *, char *, size_t); + void xmlparser_parse_fd(XMLParser *, int); void xmlparser_parse_string(XMLParser *, const char *); |