sfeed.c (29123B) - raw


      1 #include <ctype.h>
      2 #include <errno.h>
      3 #include <stdint.h>
      4 #include <stdio.h>
      5 #include <stdlib.h>
      6 #include <string.h>
      7 #include <strings.h>
      8 
      9 #include "util.h"
     10 #include "xml.h"
     11 
/* Predicates on a FeedContext value `ctx` (passed by value, not pointer).
 * ISINCONTENT:  `iscontent` is set and `iscontenttag` is not.
 * ISCONTENTTAG: `iscontenttag` is set and `iscontent` is not. */
#define ISINCONTENT(ctx)  ((ctx).iscontent && !((ctx).iscontenttag))
#define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag)

/* these feed fields support multiple separated values */
#define ISFEEDFIELDMULTI(t) ((t) == FeedFieldCategory)

/* string and byte-length: expands to TWO comma-separated expressions, `s`
 * and sizeof(s)-1, so it fills two consecutive arguments or initializers.
 * The length is only correct when `s` is a string literal or char array. */
#define STRP(s)           s,sizeof(s)-1
     20 
/* Feed format. xmltagstart() switches from FeedTypeNone to FeedTypeAtom on
 * an <entry> tag and to FeedTypeRSS on an <item> tag. */
enum FeedType {
	FeedTypeNone = 0,
	FeedTypeRSS  = 1,
	FeedTypeAtom = 2
};
     26 
/* Content-type of the item content. The values are used as an index into
 * contenttypes[] when printing, so both must stay in the same order. */
enum ContentType {
	ContentTypeNone  = 0,
	ContentTypePlain = 1,
	ContentTypeHTML  = 2
};
/* printed name for each enum ContentType value */
static const char *contenttypes[] = { "", "plain", "html" };
     33 
/* String data / memory pool.
 * `data` may be NULL before the first allocation; string_append() keeps it
 * NUL-terminated and grows it through string_buffer_realloc(). */
typedef struct string {
	char   *data;   /* data */
	size_t  len;    /* string length */
	size_t  bufsiz; /* allocated size */
} String;
     40 
/* NOTE: the order of these fields (content, date, author) indicate the
 *       priority to use them, from least important to high.
 *       xmltagstartparsed() compares the numeric values: a tag only
 *       replaces a field's value when its id is greater than the id that
 *       filled the field before. */
enum TagId {
	TagUnknown = 0,
	/* RSS */
	RSSTagDcdate, RSSTagPubdate, /* creation date has higher priority */
	RSSTagTitle,
	RSSTagMediaDescription, RSSTagDescription, RSSTagContentEncoded,
	RSSTagGuid,
	RSSTagGuidPermalinkFalse,
	RSSTagGuidPermalinkTrue,
	/* must be defined after GUID, because it can be a link (isPermaLink) */
	RSSTagLink,
	RSSTagEnclosure,
	RSSTagAuthor, RSSTagDccreator,
	RSSTagCategory,
	/* Atom */
	/* creation date has higher priority */
	AtomTagModified, AtomTagUpdated, AtomTagIssued, AtomTagPublished,
	AtomTagTitle,
	AtomTagMediaDescription, AtomTagSummary, AtomTagContent,
	AtomTagId,
	AtomTagLink,
	AtomTagLinkAlternate,
	AtomTagLinkEnclosure,
	AtomTagAuthor, AtomTagAuthorName,
	AtomTagCategory,
	TagLast /* number of ids; used as the size of fieldmap[] */
};
     70 
/* One entry of a tag lookup table (rsstags, atomtags); also the type of the
 * currently parsed tag stored in FeedContext. */
typedef struct feedtag {
	char       *name; /* name of tag to match */
	size_t      len;  /* len of `name` */
	enum TagId  id;   /* unique ID */
} FeedTag;
     76 
/* Collected value of one output field of the current item. */
typedef struct field {
	String     str;   /* collected text of the field */
	enum TagId tagid; /* tagid set previously, used for tag priority */
} FeedField;
     81 
/* Indices into FeedContext.fields[]; these are also the values stored in
 * fieldmap[]. FeedFieldLast is the number of fields. */
enum {
	FeedFieldTime = 0, FeedFieldTitle, FeedFieldLink, FeedFieldContent,
	FeedFieldId, FeedFieldAuthor, FeedFieldEnclosure, FeedFieldCategory,
	FeedFieldLast
};
     87 
/* Parser state shared by the XML callbacks for the item being parsed. */
typedef struct feedcontext {
	String          *field;        /* current FeedItem field String;
	                                  xmldata() ignores data while NULL */
	FeedField        fields[FeedFieldLast]; /* data for current item */
	FeedTag          tag;          /* unique current parsed tag */
	int              iscontent;    /* in content data */
	int              iscontenttag; /* in content tag */
	enum ContentType contenttype;  /* content-type for item */
	enum FeedType    feedtype;     /* FeedTypeNone until xmltagstart() sets it */
	int              attrcount;    /* count item HTML element attributes */
} FeedContext;
     98 
/* Forward declarations of file-local functions. The xml* functions are the
 * handlers for XML parser events (attributes, data, entities, tags). */
static long long datetounix(long long, int, int, int, int, int);
static FeedTag * gettag(enum FeedType, const char *, size_t);
static long gettzoffset(const char *);
static int  isattr(const char *, size_t, const char *, size_t);
static int  istag(const char *, size_t, const char *, size_t);
static int  parsetime(const char *, long long *);
static void printfields(void);
static void string_append(String *, const char *, size_t);
static void string_buffer_realloc(String *, size_t);
static void string_clear(String *);
static void string_print_encoded(String *);
static void string_print_timestamp(String *);
static void string_print_trimmed(String *);
static void string_print_trimmed_multi(String *);
static void string_print_uri(String *);
static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t,
                    const char *, size_t);
static void xmlattrentity(XMLParser *, const char *, size_t, const char *,
                          size_t, const char *, size_t);
static void xmlattrend(XMLParser *, const char *, size_t, const char *,
                       size_t);
static void xmlattrstart(XMLParser *, const char *, size_t, const char *,
                         size_t);
static void xmldata(XMLParser *, const char *, size_t);
static void xmldataentity(XMLParser *, const char *, size_t);
static void xmltagend(XMLParser *, const char *, size_t, int);
static void xmltagstart(XMLParser *, const char *, size_t);
static void xmltagstartparsed(XMLParser *, const char *, size_t, int);
    127 
/* map tag name to TagId type */
/* RSS, must be alphabetical order: gettag() searches this table with
 * bsearch() and a case-insensitive comparison (tagcmp), so the entries must
 * be sorted in that order. */
static FeedTag rsstags[] = {
	{ STRP("author"),            RSSTagAuthor            },
	{ STRP("category"),          RSSTagCategory          },
	{ STRP("content:encoded"),   RSSTagContentEncoded    },
	{ STRP("dc:creator"),        RSSTagDccreator         },
	{ STRP("dc:date"),           RSSTagDcdate            },
	{ STRP("description"),       RSSTagDescription       },
	/* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> */
	{ STRP("enclosure"),         RSSTagEnclosure         },
	{ STRP("guid"),              RSSTagGuid              },
	{ STRP("link"),              RSSTagLink              },
	{ STRP("media:description"), RSSTagMediaDescription  },
	{ STRP("pubdate"),           RSSTagPubdate           },
	{ STRP("title"),             RSSTagTitle             }
};
    145 
/* Atom, must be alphabetical order: gettag() searches this table with
 * bsearch() and a case-insensitive comparison (tagcmp). */
static FeedTag atomtags[] = {
	{ STRP("author"),            AtomTagAuthor           },
	{ STRP("category"),          AtomTagCategory         },
	{ STRP("content"),           AtomTagContent          },
	{ STRP("id"),                AtomTagId               },
	{ STRP("issued"),            AtomTagIssued           }, /* Atom 0.3 */
	/* Atom: <link href="" />, RSS has <link></link> */
	{ STRP("link"),              AtomTagLink             },
	{ STRP("media:description"), AtomTagMediaDescription },
	{ STRP("modified"),          AtomTagModified         }, /* Atom 0.3 */
	{ STRP("published"),         AtomTagPublished        },
	{ STRP("summary"),           AtomTagSummary          },
	{ STRP("title"),             AtomTagTitle            },
	{ STRP("updated"),           AtomTagUpdated          }
};
    162 
/* special case: nested <author><name>. xmltagstart() copies
 * atomtagauthorname into the context when a <name> tag starts while the
 * current tag is AtomTagAuthor. */
static FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor };
static FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName };

/* reference to no / unknown tag; used when gettag() finds no match */
static FeedTag notag = { STRP(""), TagUnknown };
    169 
/* map TagId type to RSS/Atom field, all tags must be defined.
 * The value is a FeedField* index, or -1 when the tag id does not fill a
 * field by itself: TagUnknown, the ids that xmltagstartparsed() refines by
 * attribute (RSSTagGuid, AtomTagLink) and the <author> container
 * (AtomTagAuthor), whose nested <name> is mapped instead. */
static int fieldmap[TagLast] = {
	[TagUnknown]               = -1,
	/* RSS */
	[RSSTagDcdate]             = FeedFieldTime,
	[RSSTagPubdate]            = FeedFieldTime,
	[RSSTagTitle]              = FeedFieldTitle,
	[RSSTagMediaDescription]   = FeedFieldContent,
	[RSSTagDescription]        = FeedFieldContent,
	[RSSTagContentEncoded]     = FeedFieldContent,
	[RSSTagGuid]               = -1,
	[RSSTagGuidPermalinkFalse] = FeedFieldId,
	[RSSTagGuidPermalinkTrue]  = FeedFieldId, /* special-case: both a link and an id */
	[RSSTagLink]               = FeedFieldLink,
	[RSSTagEnclosure]          = FeedFieldEnclosure,
	[RSSTagAuthor]             = FeedFieldAuthor,
	[RSSTagDccreator]          = FeedFieldAuthor,
	[RSSTagCategory]           = FeedFieldCategory,
	/* Atom */
	[AtomTagModified]          = FeedFieldTime,
	[AtomTagUpdated]           = FeedFieldTime,
	[AtomTagIssued]            = FeedFieldTime,
	[AtomTagPublished]         = FeedFieldTime,
	[AtomTagTitle]             = FeedFieldTitle,
	[AtomTagMediaDescription]  = FeedFieldContent,
	[AtomTagSummary]           = FeedFieldContent,
	[AtomTagContent]           = FeedFieldContent,
	[AtomTagId]                = FeedFieldId,
	[AtomTagLink]              = -1,
	[AtomTagLinkAlternate]     = FeedFieldLink,
	[AtomTagLinkEnclosure]     = FeedFieldEnclosure,
	[AtomTagAuthor]            = -1,
	[AtomTagAuthorName]        = FeedFieldAuthor,
	[AtomTagCategory]          = FeedFieldCategory
};
    205 
/* separator written between the output columns by printfields() */
static const int FieldSeparator = '\t';
/* separator for multiple values in a field, separator should be 1 byte */
static const char *FieldMultiSeparator = "|";
/* base for relative URLs: printuri() only resolves when `baseurl` is
 * non-NULL and then passes `baseuri` to uri_makeabs() */
static struct uri baseuri;
static const char *baseurl;

static FeedContext ctx; /* state shared by the XML callbacks */
static XMLParser parser; /* XML parser state */
/* buffers filled by xmlattr() with attribute values of the current tag */
static String attrispermalink, attrrel, attrtype, tmpstr;
    215 
    216 static int
    217 tagcmp(const void *v1, const void *v2)
    218 {
    219 	return strcasecmp(((FeedTag *)v1)->name, ((FeedTag *)v2)->name);
    220 }
    221 
    222 /* Unique tagid for parsed tag name. */
    223 static FeedTag *
    224 gettag(enum FeedType feedtype, const char *name, size_t namelen)
    225 {
    226 	FeedTag f, *r = NULL;
    227 
    228 	f.name = (char *)name;
    229 
    230 	switch (feedtype) {
    231 	case FeedTypeRSS:
    232 		r = bsearch(&f, rsstags, sizeof(rsstags) / sizeof(rsstags[0]),
    233 		        sizeof(rsstags[0]), tagcmp);
    234 		break;
    235 	case FeedTypeAtom:
    236 		r = bsearch(&f, atomtags, sizeof(atomtags) / sizeof(atomtags[0]),
    237 		        sizeof(atomtags[0]), tagcmp);
    238 		break;
    239 	default:
    240 		break;
    241 	}
    242 
    243 	return r;
    244 }
    245 
    246 static char *
    247 ltrim(const char *s)
    248 {
    249 	for (; isspace((unsigned char)*s); s++)
    250 		;
    251 	return (char *)s;
    252 }
    253 
    254 static char *
    255 rtrim(const char *s)
    256 {
    257 	const char *e;
    258 
    259 	for (e = s + strlen(s); e > s && isspace((unsigned char)*(e - 1)); e--)
    260 		;
    261 	return (char *)e;
    262 }
    263 
    264 /* Clear string only; don't free, prevents unnecessary reallocation. */
    265 static void
    266 string_clear(String *s)
    267 {
    268 	if (s->data)
    269 		s->data[0] = '\0';
    270 	s->len = 0;
    271 }
    272 
    273 static void
    274 string_buffer_realloc(String *s, size_t newlen)
    275 {
    276 	size_t alloclen;
    277 
    278 	if (newlen > SIZE_MAX / 2) {
    279 		alloclen = SIZE_MAX;
    280 	} else {
    281 		for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
    282 			;
    283 	}
    284 	if (!(s->data = realloc(s->data, alloclen)))
    285 		err(1, "realloc");
    286 	s->bufsiz = alloclen;
    287 }
    288 
    289 /* Append data to String, s->data and data may not overlap. */
    290 static void
    291 string_append(String *s, const char *data, size_t len)
    292 {
    293 	if (!len)
    294 		return;
    295 
    296 	if (s->len >= SIZE_MAX - len) {
    297 		errno = EOVERFLOW;
    298 		err(1, "realloc");
    299 	}
    300 
    301 	/* check if allocation is necessary, never shrink the buffer. */
    302 	if (s->len + len >= s->bufsiz)
    303 		string_buffer_realloc(s, s->len + len + 1);
    304 	memcpy(s->data + s->len, data, len);
    305 	s->len += len;
    306 	s->data[s->len] = '\0';
    307 }
    308 
    309 /* Print text, encode TABs, newlines and '\', remove other whitespace.
    310  * Remove leading and trailing whitespace. */
    311 static void
    312 string_print_encoded(String *s)
    313 {
    314 	const char *p, *e;
    315 
    316 	if (!s->data || !s->len)
    317 		return;
    318 
    319 	p = ltrim(s->data);
    320 	e = rtrim(p);
    321 
    322 	for (; *p && p != e; p++) {
    323 		switch (*p) {
    324 		case '\n': putchar('\\'); putchar('n'); break;
    325 		case '\\': putchar('\\'); putchar('\\'); break;
    326 		case '\t': putchar('\\'); putchar('t'); break;
    327 		default:
    328 			/* ignore control chars */
    329 			if (!iscntrl((unsigned char)*p))
    330 				putchar(*p);
    331 			break;
    332 		}
    333 	}
    334 }
    335 
    336 static void
    337 printtrimmed(const char *s)
    338 {
    339 	char *p, *e;
    340 
    341 	p = ltrim(s);
    342 	e = rtrim(p);
    343 	for (; *p && p != e; p++) {
    344 		if (isspace((unsigned char)*p))
    345 			putchar(' '); /* any whitespace to space */
    346 		else if (!iscntrl((unsigned char)*p))
    347 			/* ignore other control chars */
    348 			putchar(*p);
    349 	}
    350 }
    351 
    352 /* Print text, replace TABs, carriage return and other whitespace with ' '.
    353  * Other control chars are removed. Remove leading and trailing whitespace. */
    354 static void
    355 string_print_trimmed(String *s)
    356 {
    357 	if (!s->data || !s->len)
    358 		return;
    359 
    360 	printtrimmed(s->data);
    361 }
    362 
    363 /* Print each field with trimmed whitespace, separated by '|'. */
    364 static void
    365 string_print_trimmed_multi(String *s)
    366 {
    367 	char *p, *e;
    368 	int c;
    369 
    370 	if (!s->data || !s->len)
    371 		return;
    372 
    373 	for (p = s->data; ; p = e + 1) {
    374 		if ((e = strstr(p, FieldMultiSeparator))) {
    375 			c = *e;
    376 			*e = '\0';
    377 			printtrimmed(p);
    378 			*e = c; /* restore NUL byte to original character */
    379 			fputs(FieldMultiSeparator, stdout);
    380 		} else {
    381 			printtrimmed(p);
    382 			break;
    383 		}
    384 	}
    385 }
    386 
    387 /* Print URL, if it's a relative URL then it uses the global `baseurl`. */
    388 static void
    389 printuri(char *s)
    390 {
    391 	char link[4096], *p, *e;
    392 	struct uri newuri, olduri;
    393 	int c, r = -1;
    394 
    395 	p = ltrim(s);
    396 	e = rtrim(p);
    397 	c = *e;
    398 	*e = '\0';
    399 
    400 	if (baseurl && !uri_hasscheme(p) &&
    401 	    uri_parse(p, &olduri) != -1 && !olduri.proto[0] &&
    402 	    uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0])
    403 		r = uri_format(link, sizeof(link), &newuri);
    404 
    405 	if (r >= 0 && (size_t)r < sizeof(link))
    406 		printtrimmed(link);
    407 	else
    408 		printtrimmed(p);
    409 
    410 	*e = c; /* restore NUL byte to original character */
    411 }
    412 
    413 /* Print URL, if it's a relative URL then it uses the global `baseurl`. */
    414 static void
    415 string_print_uri(String *s)
    416 {
    417 	if (!s->data || !s->len)
    418 		return;
    419 
    420 	printuri(s->data);
    421 }
    422 
    423 /* Print as UNIX timestamp, print nothing if the time is empty or invalid. */
    424 static void
    425 string_print_timestamp(String *s)
    426 {
    427 	long long t;
    428 
    429 	if (!s->data || !s->len)
    430 		return;
    431 
    432 	if (parsetime(s->data, &t) != -1)
    433 		printf("%lld", t);
    434 }
    435 
    436 /* Convert time fields. Returns a UNIX timestamp. */
    437 static long long
    438 datetounix(long long year, int mon, int day, int hour, int min, int sec)
    439 {
    440 	static const int secs_through_month[] = {
    441 		0, 31 * 86400, 59 * 86400, 90 * 86400,
    442 		120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
    443 		243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
    444 	int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
    445 	long long t;
    446 
    447 	if (year - 2ULL <= 136) {
    448 		leaps = (year - 68) >> 2;
    449 		if (!((year - 68) & 3)) {
    450 			leaps--;
    451 			is_leap = 1;
    452 		} else {
    453 			is_leap = 0;
    454 		}
    455 		t = 31536000 * (year - 70) + 86400 * leaps;
    456 	} else {
    457 		cycles = (year - 100) / 400;
    458 		rem = (year - 100) % 400;
    459 		if (rem < 0) {
    460 			cycles--;
    461 			rem += 400;
    462 		}
    463 		if (!rem) {
    464 			is_leap = 1;
    465 		} else {
    466 			if (rem >= 300)
    467 				centuries = 3, rem -= 300;
    468 			else if (rem >= 200)
    469 				centuries = 2, rem -= 200;
    470 			else if (rem >= 100)
    471 				centuries = 1, rem -= 100;
    472 			if (rem) {
    473 				leaps = rem / 4U;
    474 				rem %= 4U;
    475 				is_leap = !rem;
    476 			}
    477 		}
    478 		leaps += 97 * cycles + 24 * centuries - is_leap;
    479 		t = (year - 100) * 31536000LL + leaps * 86400LL + 946684800 + 86400;
    480 	}
    481 	t += secs_through_month[mon];
    482 	if (is_leap && mon >= 2)
    483 		t += 86400;
    484 	t += 86400LL * (day - 1);
    485 	t += 3600LL * hour;
    486 	t += 60LL * min;
    487 	t += sec;
    488 
    489 	return t;
    490 }
    491 
    492 /* Get timezone from string, return time offset in seconds from UTC.
    493  * NOTE: only parses timezones in RFC-822, many other timezone names are
    494  * ambiguous anyway.
    495  * ANSI and military zones are defined wrong in RFC822 and are unsupported,
    496  * see note on RFC2822 4.3 page 32. */
    497 static long
    498 gettzoffset(const char *s)
    499 {
    500 	static struct {
    501 		char *name;
    502 		const int offhour;
    503 	} tzones[] = {
    504 		{ "CDT", -5 * 3600 },
    505 		{ "CST", -6 * 3600 },
    506 		{ "EDT", -4 * 3600 },
    507 		{ "EST", -5 * 3600 },
    508 		{ "MDT", -6 * 3600 },
    509 		{ "MST", -7 * 3600 },
    510 		{ "PDT", -7 * 3600 },
    511 		{ "PST", -8 * 3600 },
    512 	};
    513 	const char *p;
    514 	long tzhour = 0, tzmin = 0;
    515 	size_t i;
    516 
    517 	for (; isspace((unsigned char)*s); s++)
    518 		;
    519 	switch (*s) {
    520 	case '-': /* offset */
    521 	case '+':
    522 		for (i = 0, p = s + 1; i < 2 && isdigit((unsigned char)*p); i++, p++)
    523 			tzhour = (tzhour * 10) + (*p - '0');
    524 		if (*p == ':')
    525 			p++;
    526 		for (i = 0; i < 2 && isdigit((unsigned char)*p); i++, p++)
    527 			tzmin = (tzmin * 10) + (*p - '0');
    528 		return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1);
    529 	default: /* timezone name */
    530 		for (i = 0; isalpha((unsigned char)s[i]); i++)
    531 			;
    532 		if (i != 3)
    533 			return 0;
    534 		/* compare tz and adjust offset relative to UTC */
    535 		for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) {
    536 			if (!memcmp(s, tzones[i].name, 3))
    537 				return tzones[i].offhour;
    538 		}
    539 	}
    540 	return 0;
    541 }
    542 
/* Parse time string `s` into the UNIX timestamp `tp`.
   Returns 0 on success or -1 on failure.
   Two layouts are recognised after leading whitespace:
   - starting with 4 digits: "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or
     "%Y%m%d%H%M%S";
   - otherwise: "[%a, ]%d %b %Y %H:%M:%S" with an English month name.
   The values are collected in va[]: year, month (1-12), day, hour, minute,
   second; parts that are missing keep the initial value 0. The text that
   follows the time is passed to gettzoffset() as the timezone. */
static int
parsetime(const char *s, long long *tp)
{
	/* English month names with their length; index + 1 is the month */
	static struct {
		char *name;
		int len;
	} mons[] = {
		{ STRP("January"),   },
		{ STRP("February"),  },
		{ STRP("March"),     },
		{ STRP("April"),     },
		{ STRP("May"),       },
		{ STRP("June"),      },
		{ STRP("July"),      },
		{ STRP("August"),    },
		{ STRP("September"), },
		{ STRP("October"),   },
		{ STRP("November"),  },
		{ STRP("December"),  },
	};
	int va[6] = { 0 }, i, j, v, vi;
	size_t m;

	for (; isspace((unsigned char)*s); s++)
		;
	if (!isdigit((unsigned char)*s) && !isalpha((unsigned char)*s))
		return -1;

	if (isdigit((unsigned char)s[0]) &&
	    isdigit((unsigned char)s[1]) &&
	    isdigit((unsigned char)s[2]) &&
	    isdigit((unsigned char)s[3])) {
		/* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */
		vi = 0;
	} else {
		/* format: "[%a, ]%d %b %Y %H:%M:%S" */
		/* parse "[%a, ]%d %b %Y " part, then use time parsing as above */
		/* the weekday name is skipped, it is not checked */
		for (; isalpha((unsigned char)*s); s++)
			;
		for (; isspace((unsigned char)*s); s++)
			;
		if (*s == ',')
			s++;
		for (; isspace((unsigned char)*s); s++)
			;
		for (v = 0, i = 0; i < 2 && isdigit((unsigned char)*s); s++, i++)
			v = (v * 10) + (*s - '0');
		va[2] = v; /* day */
		for (; isspace((unsigned char)*s); s++)
			;
		/* end of word month */
		for (j = 0; isalpha((unsigned char)s[j]); j++)
			;
		/* check month name */
		if (j < 3 || j > 9)
			return -1; /* month cannot match */
		for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) {
			/* abbreviation (3 length) or long name */
			if ((j == 3 || j == mons[m].len) &&
			    !strncasecmp(mons[m].name, s, j)) {
				va[1] = m + 1;
				s += j;
				break;
			}
		}
		if (m >= 12)
			return -1; /* no month found */
		for (; isspace((unsigned char)*s); s++)
			;
		for (v = 0, i = 0; i < 4 && isdigit((unsigned char)*s); s++, i++)
			v = (v * 10) + (*s - '0');
		/* obsolete short year: RFC2822 4.3 */
		/* fewer than 4 digits: 0..49 becomes 20xx, anything else + 1900 */
		if (i <= 3)
			v += (v >= 0 && v <= 49) ? 2000 : 1900;
		va[0] = v; /* year */
		for (; isspace((unsigned char)*s); s++)
			;
		/* parse only regular time part, see below */
		vi = 3;
	}

	/* parse time parts (and possibly remaining date parts) */
	/* each part is up to 2 digits (4 for the year at index 0), followed by
	   an optional separator: '-' after year and month, 'T' or whitespace
	   after the day, ':' after the time parts */
	for (; *s && vi < 6; vi++) {
		for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) &&
		                   isdigit((unsigned char)*s); s++, i++) {
			v = (v * 10) + (*s - '0');
		}
		va[vi] = v;

		if ((vi < 2 && *s == '-') ||
		    (vi == 2 && (*s == 'T' || isspace((unsigned char)*s))) ||
		    (vi > 2 && *s == ':'))
			s++;
	}

	/* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
	if (*s == '.') {
		for (s++; isdigit((unsigned char)*s); s++)
			;
	}

	/* invalid range */
	/* NOTE: the day is only checked against 1..31, not the month length */
	if (va[0] < 0 || va[0] > 9999 ||
	    va[1] < 1 || va[1] > 12 ||
	    va[2] < 1 || va[2] > 31 ||
	    va[3] < 0 || va[3] > 23 ||
	    va[4] < 0 || va[4] > 59 ||
	    va[5] < 0 || va[5] > 60) /* allow leap second */
		return -1;

	/* datetounix() takes the year relative to 1900 and a 0-based month;
	   the timezone offset is subtracted to get UTC */
	*tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) -
	      gettzoffset(s);

	return 0;
}
    660 
    661 static void
    662 printfields(void)
    663 {
    664 	string_print_timestamp(&ctx.fields[FeedFieldTime].str);
    665 	putchar(FieldSeparator);
    666 	string_print_trimmed(&ctx.fields[FeedFieldTitle].str);
    667 	putchar(FieldSeparator);
    668 	string_print_uri(&ctx.fields[FeedFieldLink].str);
    669 	putchar(FieldSeparator);
    670 	string_print_encoded(&ctx.fields[FeedFieldContent].str);
    671 	putchar(FieldSeparator);
    672 	fputs(contenttypes[ctx.contenttype], stdout);
    673 	putchar(FieldSeparator);
    674 	string_print_trimmed(&ctx.fields[FeedFieldId].str);
    675 	putchar(FieldSeparator);
    676 	string_print_trimmed(&ctx.fields[FeedFieldAuthor].str);
    677 	putchar(FieldSeparator);
    678 	string_print_uri(&ctx.fields[FeedFieldEnclosure].str);
    679 	putchar(FieldSeparator);
    680 	string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str);
    681 	putchar('\n');
    682 }
    683 
    684 static int
    685 istag(const char *name, size_t len, const char *name2, size_t len2)
    686 {
    687 	return (len == len2 && !strcasecmp(name, name2));
    688 }
    689 
    690 static int
    691 isattr(const char *name, size_t len, const char *name2, size_t len2)
    692 {
    693 	return (len == len2 && !strcasecmp(name, name2));
    694 }
    695 
    696 static void
    697 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
    698 	const char *v, size_t vl)
    699 {
    700 	/* handles transforming inline XML to data */
    701 	if (ISINCONTENT(ctx)) {
    702 		if (ctx.contenttype == ContentTypeHTML)
    703 			xmldata(p, v, vl);
    704 		return;
    705 	}
    706 
    707 	if (!ctx.tag.id)
    708 		return;
    709 
    710 	/* content-type may be: Atom: text, xhtml, html or mime-type.
    711 	   MRSS (media:description): plain, html. */
    712 	if (ISCONTENTTAG(ctx)) {
    713 		if (isattr(n, nl, STRP("type")))
    714 			string_append(&attrtype, v, vl);
    715 		return;
    716 	}
    717 
    718 	if (ctx.feedtype == FeedTypeRSS) {
    719 		if (ctx.tag.id == RSSTagEnclosure &&
    720 		    isattr(n, nl, STRP("url"))) {
    721 			string_append(&tmpstr, v, vl);
    722 		} else if (ctx.tag.id == RSSTagGuid &&
    723 		           isattr(n, nl, STRP("ispermalink"))) {
    724 			string_append(&attrispermalink, v, vl);
    725 		}
    726 	} else if (ctx.feedtype == FeedTypeAtom) {
    727 		if (ctx.tag.id == AtomTagLink) {
    728 			if (isattr(n, nl, STRP("rel"))) {
    729 				string_append(&attrrel, v, vl);
    730 			} else if (isattr(n, nl, STRP("href"))) {
    731 				string_append(&tmpstr, v, vl);
    732 			}
    733 		} else if (ctx.tag.id == AtomTagCategory &&
    734 			   isattr(n, nl, STRP("term"))) {
    735 			string_append(&tmpstr, v, vl);
    736 		}
    737 	}
    738 }
    739 
    740 static void
    741 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
    742               const char *data, size_t datalen)
    743 {
    744 	char buf[16];
    745 	int len;
    746 
    747 	/* handles transforming inline XML to data */
    748 	if (ISINCONTENT(ctx)) {
    749 		if (ctx.contenttype == ContentTypeHTML)
    750 			xmldata(p, data, datalen);
    751 		return;
    752 	}
    753 
    754 	if (!ctx.tag.id)
    755 		return;
    756 
    757 	/* try to translate entity, else just pass as data to
    758 	 * xmldata handler. */
    759 	if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
    760 		xmlattr(p, t, tl, n, nl, buf, (size_t)len);
    761 	else
    762 		xmlattr(p, t, tl, n, nl, data, datalen);
    763 }
    764 
    765 static void
    766 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
    767 {
    768 	if (ISINCONTENT(ctx)) {
    769 		if (ctx.contenttype == ContentTypeHTML) {
    770 			/* handles transforming inline XML to data */
    771 			xmldata(p, "\"", 1);
    772 			ctx.attrcount = 0;
    773 		}
    774 		return;
    775 	}
    776 }
    777 
    778 static void
    779 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
    780 {
    781 	if (ISINCONTENT(ctx)) {
    782 		if (ctx.contenttype == ContentTypeHTML) {
    783 			/* handles transforming inline XML to data */
    784 			if (!ctx.attrcount)
    785 				xmldata(p, " ", 1);
    786 			ctx.attrcount++;
    787 			xmldata(p, n, nl);
    788 			xmldata(p, "=\"", 2);
    789 		}
    790 		return;
    791 	}
    792 
    793 	if (attrispermalink.len && isattr(n, nl, STRP("ispermalink")))
    794 		string_clear(&attrispermalink);
    795 	else if (attrrel.len && isattr(n, nl, STRP("rel")))
    796 		string_clear(&attrrel);
    797 	else if (attrtype.len && isattr(n, nl, STRP("type")))
    798 		string_clear(&attrtype);
    799 	else if (tmpstr.len &&
    800 	    (isattr(n, nl, STRP("href")) ||
    801 	     isattr(n, nl, STRP("term")) ||
    802 	     isattr(n, nl, STRP("url"))))
    803 		string_clear(&tmpstr); /* use the last value for multiple attribute values */
    804 }
    805 
    806 static void
    807 xmldata(XMLParser *p, const char *s, size_t len)
    808 {
    809 	if (!ctx.field)
    810 		return;
    811 
    812 	if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
    813 		string_append(&tmpstr, s, len);
    814 	else
    815 		string_append(ctx.field, s, len);
    816 }
    817 
    818 static void
    819 xmldataentity(XMLParser *p, const char *data, size_t datalen)
    820 {
    821 	char buf[16];
    822 	int len;
    823 
    824 	if (!ctx.field)
    825 		return;
    826 
    827 	/* try to translate entity, else just pass as data to
    828 	 * xmldata handler. */
    829 	if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
    830 		xmldata(p, buf, (size_t)len);
    831 	else
    832 		xmldata(p, data, datalen);
    833 }
    834 
    835 static void
    836 xmltagstart(XMLParser *p, const char *t, size_t tl)
    837 {
    838 	FeedTag *f;
    839 
    840 	if (ISINCONTENT(ctx)) {
    841 		if (ctx.contenttype == ContentTypeHTML) {
    842 			ctx.attrcount = 0;
    843 			xmldata(p, "<", 1);
    844 			xmldata(p, t, tl);
    845 		}
    846 		return;
    847 	}
    848 
    849 	/* start of RSS or Atom item / entry */
    850 	if (ctx.feedtype == FeedTypeNone) {
    851 		if (istag(t, tl, STRP("entry")))
    852 			ctx.feedtype = FeedTypeAtom;
    853 		else if (istag(t, tl, STRP("item")))
    854 			ctx.feedtype = FeedTypeRSS;
    855 		return;
    856 	}
    857 
    858 	/* field tagid already set or nested tags. */
    859 	if (ctx.tag.id) {
    860 		/* nested <author><name> for Atom */
    861 		if (ctx.tag.id == AtomTagAuthor &&
    862 		    istag(t, tl, STRP("name"))) {
    863 			memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ctx.tag));
    864 		} else {
    865 			return; /* other nested tags are not allowed: return */
    866 		}
    867 	}
    868 
    869 	/* in item */
    870 	if (ctx.tag.id == TagUnknown) {
    871 		if (!(f = gettag(ctx.feedtype, t, tl)))
    872 			f = &notag;
    873 		memcpy(&(ctx.tag), f, sizeof(ctx.tag));
    874 	}
    875 
    876 	ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent);
    877 	string_clear(&attrispermalink);
    878 	string_clear(&attrrel);
    879 	string_clear(&attrtype);
    880 }
    881 
    882 static void
    883 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
    884 {
    885 	enum TagId tagid;
    886 
    887 	if (ISINCONTENT(ctx)) {
    888 		if (ctx.contenttype == ContentTypeHTML) {
    889 			if (isshort)
    890 				xmldata(p, "/>", 2);
    891 			else
    892 				xmldata(p, ">", 1);
    893 		}
    894 		return;
    895 	}
    896 
    897 	/* set tag type based on it's attribute value */
    898 	if (ctx.tag.id == RSSTagGuid) {
    899 		/* if empty the default is "true" */
    900 		if (!attrispermalink.len ||
    901 		    isattr(attrispermalink.data, attrispermalink.len, STRP("true")))
    902 			ctx.tag.id = RSSTagGuidPermalinkTrue;
    903 		else
    904 			ctx.tag.id = RSSTagGuidPermalinkFalse;
    905 	} else if (ctx.tag.id == AtomTagLink) {
    906 		/* empty or "alternate": other types could be
    907 		   "enclosure", "related", "self" or "via" */
    908 		if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate")))
    909 			ctx.tag.id = AtomTagLinkAlternate;
    910 		else if (isattr(attrrel.data, attrrel.len, STRP("enclosure")))
    911 			ctx.tag.id = AtomTagLinkEnclosure;
    912 		else
    913 			ctx.tag.id = AtomTagLink; /* unknown */
    914 	}
    915 
    916 	tagid = ctx.tag.id;
    917 
    918 	/* map tag type to field: unknown or lesser priority is ignored,
    919 	   when tags of the same type are repeated only the first is used. */
    920 	if (fieldmap[tagid] == -1 ||
    921 	    (!ISFEEDFIELDMULTI(fieldmap[tagid]) &&
    922 	     tagid <= ctx.fields[fieldmap[tagid]].tagid)) {
    923 		return;
    924 	}
    925 
    926 	if (ctx.iscontenttag) {
    927 		ctx.iscontent = 1;
    928 		ctx.iscontenttag = 0;
    929 
    930 		/* detect content-type based on type attribute */
    931 		if (attrtype.len) {
    932 			if (isattr(attrtype.data, attrtype.len, STRP("html")) ||
    933 			    isattr(attrtype.data, attrtype.len, STRP("xhtml")) ||
    934 			    isattr(attrtype.data, attrtype.len, STRP("text/html")) ||
    935 			    isattr(attrtype.data, attrtype.len, STRP("text/xhtml")) ||
    936 			    isattr(attrtype.data, attrtype.len, STRP("application/xhtml+xml")))
    937 				ctx.contenttype = ContentTypeHTML;
    938 			else /* unknown: handle as base64 text data */
    939 				ctx.contenttype = ContentTypePlain;
    940 		} else {
    941 			/* default content-type */
    942 			if (tagid == RSSTagContentEncoded || tagid == RSSTagDescription)
    943 				ctx.contenttype = ContentTypeHTML;
    944 			else
    945 				ctx.contenttype = ContentTypePlain;
    946 		}
    947 	}
    948 
    949 	ctx.field = &(ctx.fields[fieldmap[tagid]].str);
    950 	ctx.fields[fieldmap[tagid]].tagid = tagid;
    951 
    952 	/* clear field if it is overwritten (with a priority order) for the new
    953 	   value, if the field can have multiple values then do not clear it. */
    954 	if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
    955 		string_clear(ctx.field);
    956 }
    957 
    958 static void
    959 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
    960 {
    961 	size_t i;
    962 
    963 	if (ctx.feedtype == FeedTypeNone)
    964 		return;
    965 
    966 	if (ISINCONTENT(ctx)) {
    967 		/* not close content field */
    968 		if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) {
    969 			if (!isshort && ctx.contenttype == ContentTypeHTML) {
    970 				xmldata(p, "</", 2);
    971 				xmldata(p, t, tl);
    972 				xmldata(p, ">", 1);
    973 			}
    974 			return;
    975 		}
    976 	} else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) {
    977 		/* matched tag end: close it */
    978 		/* copy also to the link field if the attribute isPermaLink="true"
    979 		   and it is not set by a tag with higher prio. */
    980 		if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field &&
    981 		    ctx.tag.id > ctx.fields[FeedFieldLink].tagid) {
    982 			string_clear(&ctx.fields[FeedFieldLink].str);
    983 			string_append(&ctx.fields[FeedFieldLink].str,
    984 			              ctx.field->data, ctx.field->len);
    985 			ctx.fields[FeedFieldLink].tagid = ctx.tag.id;
    986 		}
    987 	} else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom &&
    988 	   istag(t, tl, STRP("entry"))) || /* Atom */
    989 	   (ctx.feedtype == FeedTypeRSS &&
    990 	   istag(t, tl, STRP("item"))))) /* RSS */
    991 	{
    992 		/* end of RSS or Atom entry / item */
    993 		printfields();
    994 
    995 		/* clear strings */
    996 		for (i = 0; i < FeedFieldLast; i++) {
    997 			string_clear(&ctx.fields[i].str);
    998 			ctx.fields[i].tagid = TagUnknown;
    999 		}
   1000 		ctx.contenttype = ContentTypeNone;
   1001 		/* allow parsing of Atom and RSS concatenated in one XML stream. */
   1002 		ctx.feedtype = FeedTypeNone;
   1003 	} else {
   1004 		return; /* not end of field */
   1005 	}
   1006 
   1007 	/* temporary string: for fields that cannot be processed
   1008 	   directly and need more context, for example by it's tag
   1009 	   attributes, like the Atom link rel="alternate|enclosure". */
   1010 	if (tmpstr.len && ctx.field) {
   1011 		if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) {
   1012 			if (ctx.field->len)
   1013 				string_append(ctx.field, FieldMultiSeparator, 1);
   1014 			string_append(ctx.field, tmpstr.data, tmpstr.len);
   1015 		} else {
   1016 			string_clear(ctx.field);
   1017 			string_append(ctx.field, tmpstr.data, tmpstr.len);
   1018 		}
   1019 	}
   1020 
   1021 	/* close field */
   1022 	string_clear(&tmpstr); /* reuse and clear temporary string */
   1023 
   1024 	if (ctx.tag.id == AtomTagAuthorName)
   1025 		memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */
   1026 	else
   1027 		memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
   1028 
   1029 	ctx.iscontent = 0;
   1030 	ctx.field = NULL;
   1031 }
   1032 
   1033 int
   1034 main(int argc, char *argv[])
   1035 {
   1036 	if (pledge("stdio", NULL) == -1)
   1037 		err(1, "pledge");
   1038 
   1039 	if (argc > 1) {
   1040 		if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0])
   1041 			baseurl = argv[1];
   1042 		else
   1043 			errx(1, "baseurl incorrect or too long");
   1044 	}
   1045 
   1046 	memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
   1047 
   1048 	parser.xmlattr = xmlattr;
   1049 	parser.xmlattrentity = xmlattrentity;
   1050 	parser.xmlattrend = xmlattrend;
   1051 	parser.xmlattrstart = xmlattrstart;
   1052 	parser.xmlcdata = xmldata;
   1053 	parser.xmldata = xmldata;
   1054 	parser.xmldataentity = xmldataentity;
   1055 	parser.xmltagend = xmltagend;
   1056 	parser.xmltagstart = xmltagstart;
   1057 	parser.xmltagstartparsed = xmltagstartparsed;
   1058 
   1059 	/* NOTE: getnext is defined in xml.h for inline optimization */
   1060 	xml_parse(&parser);
   1061 
   1062 	return 0;
   1063 }