summaryrefslogtreecommitdiff
path: root/sfeed.c
diff options
context:
space:
mode:
authorHiltjo Posthuma <hiltjo@codemadness.org>2015-07-31 21:06:52 +0200
committerHiltjo Posthuma <hiltjo@codemadness.org>2015-07-31 21:12:07 +0200
commit356e7d79925f91b9b703ee63e3680694c53a59a4 (patch)
treebc06b59ee637c2695055b62221abad696d66eb7c /sfeed.c
parenteb586eda26967183de91c314a57d323b124110bb (diff)
Various improvements
- Only escape characters in "content" field, these can contain newlines. - Trim newlines and tabs, etc from the title, id and author fields. - Make decodefield, xmlencode functions easier to "chain" without allocatting new buffers. - Move printutf8pad from util (only used by sfeed_plain) to sfeed_plain. - Update README, still need to update the man-page and improve the documentation in general. - Code cleanup.
Diffstat (limited to 'sfeed.c')
-rw-r--r--sfeed.c172
1 files changed, 105 insertions, 67 deletions
diff --git a/sfeed.c b/sfeed.c
index 744e7a5..6c6e172 100644
--- a/sfeed.c
+++ b/sfeed.c
@@ -18,16 +18,16 @@
/* length of string */
#define STRSIZ(s) (sizeof(s)-1)
-enum { FeedTypeNone = 0, FeedTypeRSS = 1, FeedTypeAtom = 2 };
+enum FeedType { FeedTypeNone = 0, FeedTypeRSS = 1, FeedTypeAtom = 2 };
static const char *feedtypes[] = { "", "rss", "atom" };
-enum { ContentTypeNone = 0, ContentTypePlain = 1, ContentTypeHTML = 2 };
+enum ContentType { ContentTypeNone = 0, ContentTypePlain = 1, ContentTypeHTML = 2 };
static const char *contenttypes[] = { "", "plain", "html" };
static const int FieldSeparator = '\t'; /* output field seperator character */
static const char *baseurl = "";
-enum {
+enum TagId {
TagUnknown = 0,
/* RSS */
RSSTagDcdate, RSSTagPubdate, RSSTagTitle,
@@ -59,9 +59,9 @@ typedef struct feeditem {
} FeedItem;
typedef struct feedtag {
- char *name;
- size_t namelen;
- int id;
+ char *name;
+ size_t namelen;
+ enum TagId id;
} FeedTag;
typedef struct feedcontext {
@@ -75,7 +75,7 @@ typedef struct feedcontext {
int attrcount;
} FeedContext;
-static int gettag(int, const char *, size_t);
+static enum TagId gettag(enum FeedType, const char *, size_t);
static int gettimetz(const char *, char *, size_t, int *);
static int isattr(const char *, size_t, const char *, size_t);
static int istag(const char *, size_t, const char *, size_t);
@@ -85,7 +85,8 @@ static void string_append(String *, const char *, size_t);
static void string_buffer_init(String *, size_t);
static void string_buffer_realloc(String *, size_t);
static void string_clear(String *);
-static void string_print(String *);
+static void string_print_encoded(String *);
+static void string_print_trimmed(String *);
static void xml_handler_attr(XMLParser *, const char *, size_t,
const char *, size_t, const char *, size_t);
static void xml_handler_attr_start(XMLParser *, const char *, size_t,
@@ -104,8 +105,8 @@ static FeedContext ctx;
static XMLParser parser; /* XML parser state */
/* unique number for parsed tag (faster comparison) */
-static int
-gettag(int feedtype, const char *name, size_t namelen)
+static enum TagId
+gettag(enum FeedType feedtype, const char *name, size_t namelen)
{
/* RSS, alphabetical order */
static FeedTag rsstag[] = {
@@ -138,24 +139,29 @@ gettag(int feedtype, const char *name, size_t namelen)
if (namelen < 2 || namelen > 15)
return TagUnknown;
- if (feedtype == FeedTypeRSS) {
+ switch (feedtype) {
+ case FeedTypeRSS:
for (i = 0; rsstag[i].name; i++) {
if (!(n = strncasecmp(rsstag[i].name, name, rsstag[i].namelen)))
- return rsstag[i].id;
+ return rsstag[i].id; /* found */
/* optimization: it's sorted so nothing after it matches. */
if (n > 0)
return TagUnknown;
}
- } else if (feedtype == FeedTypeAtom) {
+ break;
+ case FeedTypeAtom:
for (i = 0; atomtag[i].name; i++) {
if (!(n = strncasecmp(atomtag[i].name, name, atomtag[i].namelen)))
- return atomtag[i].id;
+ return atomtag[i].id; /* found */
/* optimization: it's sorted so nothing after it matches. */
if (n > 0)
return TagUnknown;
}
+ break;
+ default:
+ return TagUnknown;
}
- return TagUnknown;
+ return TagUnknown; /* NOTREACHED */
}
/* clear string only; don't free, prevents unnecessary reallocation */
@@ -334,23 +340,26 @@ parsetime(const char *s, char *buf, size_t bufsiz, time_t *tp)
return -1;
}
-/* print text, escape tabs, newline and carriage return etc */
+/* Print text, encode TABs, newlines and '\', remove other whitespace.
+ * Remove leading and trailing whitespace. */
static void
-string_print(String *s)
+string_print_encoded(String *s)
{
const char *p, *e;
/* skip leading whitespace */
- p = trimstart(s->data);
- e = trimend(p);
+ for (p = s->data; *p && isspace((int)*p); p++)
+ ;
+ /* seek offset of trailing whitespace */
+ for (e = p + strlen(p); e > p && isspace((int)*(e - 1)); e--)
+ ;
for (; *p && p != e; p++) {
- /* isspace(c) && c != ' '. */
- if (((unsigned)*p - '\t') < 5) {
- switch(*p) {
- case '\n': fputs("\\n", stdout); break;
+ if (isspace((int)*p) && *p != ' ') {
+ switch (*p) {
+ case '\n': fputs("\\n", stdout); break;
case '\\': fputs("\\\\", stdout); break;
- case '\t': fputs("\\t", stdout); break;
+ case '\t': fputs("\\t", stdout); break;
default: break; /* ignore other whitespace chars */
}
} else if (!iscntrl((int)*p)) { /* ignore control chars */
@@ -359,6 +368,29 @@ string_print(String *s)
}
}
+/* Print text, replace TABs, carriage return and other whitespace with ' '.
+ * Other control chars are removed. Remove leading and trailing whitespace. */
+static void
+string_print_trimmed(String *s)
+{
+ const char *p, *e;
+
+ /* skip leading whitespace */
+ for (p = s->data; *p && isspace((int)*p); p++)
+ ;
+ /* seek offset of trailing whitespace */
+ for (e = p + strlen(p); e > p && isspace((int)*(e - 1)); e--)
+ ;
+
+ for (; *p && p != e; p++) {
+ if (isspace((int)*p))
+ putchar(' ');
+ else if (!iscntrl((int)*p))
+ /* ignore other control chars */
+ putchar((int)*p);
+ }
+}
+
static void
printfields(void)
{
@@ -376,19 +408,19 @@ printfields(void)
if (r != -1)
fputs(timebuf, stdout);
putchar(FieldSeparator);
- string_print(&ctx.item.title);
+ string_print_trimmed(&ctx.item.title);
putchar(FieldSeparator);
/* always print absolute urls */
if (absuri(ctx.item.link.data, baseurl, link, sizeof(link)) != -1)
fputs(link, stdout);
putchar(FieldSeparator);
- string_print(&ctx.item.content);
+ string_print_encoded(&ctx.item.content);
putchar(FieldSeparator);
fputs(contenttypes[ctx.item.contenttype], stdout);
putchar(FieldSeparator);
- string_print(&ctx.item.id);
+ string_print_trimmed(&ctx.item.id);
putchar(FieldSeparator);
- string_print(&ctx.item.author);
+ string_print_trimmed(&ctx.item.author);
putchar(FieldSeparator);
fputs(feedtypes[ctx.item.feedtype], stdout);
putchar('\n');
@@ -555,53 +587,59 @@ xml_handler_start_element(XMLParser *p, const char *name, size_t namelen)
/* tag already set: return */
if (ctx.tag[0] != '\0')
return;
+
/* in item */
strlcpy(ctx.tag, name, sizeof(ctx.tag)); /* NOTE: truncation ignored */
ctx.taglen = namelen;
ctx.tagid = gettag(ctx.item.feedtype, ctx.tag, ctx.taglen);
- if (ctx.tagid == TagUnknown)
- ctx.field = NULL;
- if (ctx.item.feedtype == FeedTypeRSS) {
- if (ctx.tagid == RSSTagPubdate || ctx.tagid == RSSTagDcdate)
+ switch (ctx.tagid) {
+ case RSSTagPubdate:
+ case RSSTagDcdate:
+ ctx.field = &ctx.item.timestamp;
+ break;
+ case AtomTagPublished:
+ case AtomTagUpdated:
+ /* prefer published over updated if set */
+ if (ctx.tagid != AtomTagUpdated || !ctx.item.timestamp.len) {
ctx.field = &ctx.item.timestamp;
- else if (ctx.tagid == RSSTagTitle)
- ctx.field = &ctx.item.title;
- else if (ctx.tagid == RSSTagLink)
- ctx.field = &ctx.item.link;
- else if (ctx.tagid == RSSTagDescription ||
- ctx.tagid == RSSTagContentencoded) {
- /* ignore, prefer content:encoded over description */
- if (ctx.tagid != RSSTagDescription || !ctx.item.content.len) {
- ctx.iscontenttag = 1;
- ctx.field = &ctx.item.content;
- }
- } else if (ctx.tagid == RSSTagGuid) {
- ctx.field = &ctx.item.id;
- } else if (ctx.tagid == RSSTagAuthor || ctx.tagid == RSSTagDccreator) {
- ctx.field = &ctx.item.author;
}
- } else if (ctx.item.feedtype == FeedTypeAtom) {
- if (ctx.tagid == AtomTagPublished || ctx.tagid == AtomTagUpdated) {
- /* ignore, prefer published over updated */
- if (ctx.tagid != AtomTagUpdated || !ctx.item.timestamp.len) {
- ctx.field = &ctx.item.timestamp;
- }
- } else if (ctx.tagid == AtomTagTitle) {
- ctx.field = &ctx.item.title;
- } else if (ctx.tagid == AtomTagSummary || ctx.tagid == AtomTagContent) {
- /* ignore, prefer content:encoded over description */
- if (ctx.tagid != AtomTagSummary || !ctx.item.content.len) {
- ctx.iscontenttag = 1;
- ctx.field = &ctx.item.content;
- }
- } else if (ctx.tagid == AtomTagId) {
- ctx.field = &ctx.item.id;
- } else if (ctx.tagid == AtomTagLink) {
- ctx.field = &ctx.item.link;
- } else if (ctx.tagid == AtomTagAuthor) {
- ctx.field = &ctx.item.author;
+ break;
+ case RSSTagTitle:
+ case AtomTagTitle:
+ ctx.field = &ctx.item.title;
+ break;
+ case RSSTagLink:
+ case AtomTagLink:
+ ctx.field = &ctx.item.link;
+ break;
+ case RSSTagDescription:
+ case RSSTagContentencoded:
+ /* prefer content:encoded over description if set */
+ if (ctx.tagid != RSSTagDescription || !ctx.item.content.len) {
+ ctx.iscontenttag = 1;
+ ctx.field = &ctx.item.content;
}
+ break;
+ case AtomTagSummary:
+ case AtomTagContent:
+ /* prefer content over summary if set */
+ if (ctx.tagid != AtomTagSummary || !ctx.item.content.len) {
+ ctx.iscontenttag = 1;
+ ctx.field = &ctx.item.content;
+ }
+ break;
+ case RSSTagGuid:
+ case AtomTagId:
+ ctx.field = &ctx.item.id;
+ break;
+ case RSSTagAuthor:
+ case RSSTagDccreator:
+ case AtomTagAuthor:
+ ctx.field = &ctx.item.author;
+ break;
+ default:
+ ctx.field = NULL;
}
/* clear field */
if (ctx.field)