Merge remote-tracking branch 'upstream/master' (HEAD, master)

author: Benjamin Chausse <benjamin@chausse.xyz> 2024-08-09 14:11:50 -0400
committer: Benjamin Chausse <benjamin@chausse.xyz> 2024-08-09 14:11:50 -0400
commit: 5857d82e8e596d6fda406a0c4d8d68ca7a03c124 (patch)
tree: 553916894dee907825360580c5d9a05c82c5af16 /sfeed.c
parent: 3574e3cbf9d99546e868aeb995ce2c171cdc36a6 (diff)
parent: 19957bc272e745af7b56b79fa648e8b6b77113b1 (diff)
1 files changed, 90 insertions, 70 deletions
diff --git a/sfeed.c b/sfeed.c
index 4dd89c1..16141cf 100644
--- a/sfeed.c
+++ b/sfeed.c
@@ -1,4 +1,3 @@
-#include <ctype.h>
 #include <errno.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -127,7 +126,7 @@ static void xmltagstartparsed(XMLParser *, const char *, size_t, int);
 
 /* map tag name to TagId type */
 /* RSS, must be alphabetical order */
-static FeedTag rsstags[] = {
+static const FeedTag rsstags[] = {
 	{ STRP("author"),            RSSTagAuthor            },
 	{ STRP("category"),          RSSTagCategory          },
 	{ STRP("content:encoded"),   RSSTagContentEncoded    },
@@ -144,7 +143,7 @@ static FeedTag rsstags[] = {
 };
 
 /* Atom, must be alphabetical order */
-static FeedTag atomtags[] = {
+static const FeedTag atomtags[] = {
 	{ STRP("author"),            AtomTagAuthor           },
 	{ STRP("category"),          AtomTagCategory         },
 	{ STRP("content"),           AtomTagContent          },
@@ -161,14 +160,14 @@ static FeedTag atomtags[] = {
 };
 
 /* special case: nested <author><name> */
-static FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor };
-static FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName };
+static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor };
+static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName };
 
 /* reference to no / unknown tag */
-static FeedTag notag = { STRP(""), TagUnknown };
+static const FeedTag notag = { STRP(""), TagUnknown };
 
 /* map TagId type to RSS/Atom field, all tags must be defined */
-static int fieldmap[TagLast] = {
+static const int fieldmap[TagLast] = {
 	[TagUnknown]               = -1,
 	/* RSS */
 	[RSSTagDcdate]             = FeedFieldTime,
@@ -205,7 +204,7 @@ static int fieldmap[TagLast] = {
 
 static const int FieldSeparator = '\t';
 /* separator for multiple values in a field, separator should be 1 byte */
-static const char *FieldMultiSeparator = "|";
+static const char FieldMultiSeparator[] = "|";
 static struct uri baseuri;
 static const char *baseurl;
 
@@ -246,7 +245,7 @@ gettag(enum FeedType feedtype, const char *name, size_t namelen)
 static char *
 ltrim(const char *s)
 {
-	for (; isspace((unsigned char)*s); s++)
+	for (; ISSPACE((unsigned char)*s); s++)
 		;
 	return (char *)s;
 }
@@ -256,7 +255,7 @@ rtrim(const char *s)
 {
 	const char *e;
 
-	for (e = s + strlen(s); e > s && isspace((unsigned char)*(e - 1)); e--)
+	for (e = s + strlen(s); e > s && ISSPACE((unsigned char)*(e - 1)); e--)
 		;
 	return (char *)e;
 }
@@ -294,7 +293,7 @@ string_append(String *s, const char *data, size_t len)
 		return;
 
 	if (s->len >= SIZE_MAX - len) {
-		errno = EOVERFLOW;
+		errno = ENOMEM;
 		err(1, "realloc");
 	}
 
@@ -326,7 +325,7 @@ string_print_encoded(String *s)
 		case '\t': putchar('\\'); putchar('t'); break;
 		default:
 			/* ignore control chars */
-			if (!iscntrl((unsigned char)*p))
+			if (!ISCNTRL((unsigned char)*p))
 				putchar(*p);
 			break;
 		}
@@ -341,9 +340,9 @@ printtrimmed(const char *s)
 	p = ltrim(s);
 	e = rtrim(p);
 	for (; *p && p != e; p++) {
-		if (isspace((unsigned char)*p))
+		if (ISSPACE((unsigned char)*p))
 			putchar(' '); /* any whitespace to space */
-		else if (!iscntrl((unsigned char)*p))
+		else if (!ISCNTRL((unsigned char)*p))
 			/* ignore other control chars */
 			putchar(*p);
 	}
@@ -384,7 +383,7 @@ string_print_trimmed_multi(String *s)
 	}
 }
 
-/* Print URL, if it's a relative URL then it uses the global `baseurl`. */
+/* Print URL, if it is a relative URL then it uses the global `baseurl`. */
 static void
 printuri(char *s)
 {
@@ -410,7 +409,7 @@ printuri(char *s)
 	*e = c; /* restore NUL byte to original character */
 }
 
-/* Print URL, if it's a relative URL then it uses the global `baseurl`. */
+/* Print URL, if it is a relative URL then it uses the global `baseurl`. */
 static void
 string_print_uri(String *s)
 {
@@ -433,18 +432,23 @@ string_print_timestamp(String *s)
 		printf("%lld", t);
 }
 
-/* Convert time fields. Returns a UNIX timestamp. */
+/* Convert time fields. Returns a signed (at least) 64-bit UNIX timestamp.
+   Parameters should be passed as they are in a struct tm:
+   that is: year = year - 1900, month = month - 1. */
 static long long
 datetounix(long long year, int mon, int day, int hour, int min, int sec)
 {
-	static const int secs_through_month[] = {
+	/* seconds in a month in a regular (non-leap) year */
+	static const long secs_through_month[] = {
 		0, 31 * 86400, 59 * 86400, 90 * 86400,
 		120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
 		243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
 	int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
 	long long t;
 
+	/* optimization: handle common range year 1902 up to and including 2038 */
 	if (year - 2ULL <= 136) {
+		/* amount of leap days relative to 1970: every 4 years */
 		leaps = (year - 68) >> 2;
 		if (!((year - 68) & 3)) {
 			leaps--;
@@ -452,8 +456,11 @@ datetounix(long long year, int mon, int day, int hour, int min, int sec)
 		} else {
 			is_leap = 0;
 		}
-		t = 31536000 * (year - 70) + 86400 * leaps;
+		t = 31536000 * (year - 70) + (86400 * leaps); /* 365 * 86400 = 31536000 */
 	} else {
+		/* general leap year calculation:
+		   leap years occur mostly every 4 years but every 100 years
+		   a leap year is skipped unless the year is divisible by 400 */
 		cycles = (year - 100) / 400;
 		rem = (year - 100) % 400;
 		if (rem < 0) {
@@ -463,20 +470,27 @@ datetounix(long long year, int mon, int day, int hour, int min, int sec)
 		if (!rem) {
 			is_leap = 1;
 		} else {
-			if (rem >= 300)
-				centuries = 3, rem -= 300;
-			else if (rem >= 200)
-				centuries = 2, rem -= 200;
-			else if (rem >= 100)
-				centuries = 1, rem -= 100;
+			if (rem >= 300) {
+				centuries = 3;
+				rem -= 300;
+			} else if (rem >= 200) {
+				centuries = 2;
+				rem -= 200;
+			} else if (rem >= 100) {
+				centuries = 1;
+				rem -= 100;
+			}
 			if (rem) {
 				leaps = rem / 4U;
 				rem %= 4U;
 				is_leap = !rem;
 			}
 		}
-		leaps += 97 * cycles + 24 * centuries - is_leap;
-		t = (year - 100) * 31536000LL + leaps * 86400LL + 946684800 + 86400;
+		leaps += (97 * cycles) + (24 * centuries) - is_leap;
+
+		/* adjust 8 leap days from 1970 up to and including 2000:
+		   ((30 * 365) + 8) * 86400 = 946771200 */
+		t = ((year - 100) * 31536000LL) + (leaps * 86400LL) + 946771200LL;
 	}
 	t += secs_through_month[mon];
 	if (is_leap && mon >= 2)
@@ -490,16 +504,16 @@ datetounix(long long year, int mon, int day, int hour, int min, int sec)
 }
 
 /* Get timezone from string, return time offset in seconds from UTC.
- * NOTE: only parses timezones in RFC-822, many other timezone names are
+ * NOTE: only parses timezones in RFC 822, many other timezone names are
  * ambiguous anyway.
- * ANSI and military zones are defined wrong in RFC822 and are unsupported,
- * see note on RFC2822 4.3 page 32. */
+ * ANSI and military zones are defined wrong in RFC 822 and are unsupported,
+ * see note on RFC 2822 4.3 page 32. */
 static long
 gettzoffset(const char *s)
 {
-	static struct {
+	static const struct {
 		char *name;
-		const int offhour;
+		int offhour;
 	} tzones[] = {
 		{ "CDT", -5 * 3600 },
 		{ "CST", -6 * 3600 },
@@ -514,24 +528,24 @@ gettzoffset(const char *s)
 	long tzhour = 0, tzmin = 0;
 	size_t i;
 
-	for (; isspace((unsigned char)*s); s++)
+	for (; ISSPACE((unsigned char)*s); s++)
 		;
 	switch (*s) {
 	case '-': /* offset */
 	case '+':
-		for (i = 0, p = s + 1; i < 2 && isdigit((unsigned char)*p); i++, p++)
+		for (i = 0, p = s + 1; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
 			tzhour = (tzhour * 10) + (*p - '0');
 		if (*p == ':')
 			p++;
-		for (i = 0; i < 2 && isdigit((unsigned char)*p); i++, p++)
+		for (i = 0; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
 			tzmin = (tzmin * 10) + (*p - '0');
 		return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1);
 	default: /* timezone name */
-		for (i = 0; isalpha((unsigned char)s[i]); i++)
+		for (i = 0; ISALPHA((unsigned char)s[i]); i++)
 			;
 		if (i != 3)
 			return 0;
-		/* compare tz and adjust offset relative to UTC */
+		/* compare timezone and adjust offset relative to UTC */
 		for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) {
 			if (!memcmp(s, tzones[i].name, 3))
 				return tzones[i].offhour;
@@ -545,7 +559,7 @@ gettzoffset(const char *s)
 static int
 parsetime(const char *s, long long *tp)
 {
-	static struct {
+	static const struct {
 		char *name;
 		int len;
 	} mons[] = {
@@ -565,35 +579,35 @@ parsetime(const char *s, long long *tp)
 	int va[6] = { 0 }, i, j, v, vi;
 	size_t m;
 
-	for (; isspace((unsigned char)*s); s++)
+	for (; ISSPACE((unsigned char)*s); s++)
 		;
-	if (!isdigit((unsigned char)*s) && !isalpha((unsigned char)*s))
+	if (!ISDIGIT((unsigned char)*s) && !ISALPHA((unsigned char)*s))
 		return -1;
 
-	if (isdigit((unsigned char)s[0]) &&
-	    isdigit((unsigned char)s[1]) &&
-	    isdigit((unsigned char)s[2]) &&
-	    isdigit((unsigned char)s[3])) {
+	if (ISDIGIT((unsigned char)s[0]) &&
+	    ISDIGIT((unsigned char)s[1]) &&
+	    ISDIGIT((unsigned char)s[2]) &&
+	    ISDIGIT((unsigned char)s[3])) {
 		/* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */
 		vi = 0;
 	} else {
 		/* format: "[%a, ]%d %b %Y %H:%M:%S" */
 		/* parse "[%a, ]%d %b %Y " part, then use time parsing as above */
-		for (; isalpha((unsigned char)*s); s++)
+		for (; ISALPHA((unsigned char)*s); s++)
 			;
-		for (; isspace((unsigned char)*s); s++)
+		for (; ISSPACE((unsigned char)*s); s++)
 			;
 		if (*s == ',')
 			s++;
-		for (; isspace((unsigned char)*s); s++)
+		for (; ISSPACE((unsigned char)*s); s++)
 			;
-		for (v = 0, i = 0; i < 2 && isdigit((unsigned char)*s); s++, i++)
+		for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); s++, i++)
 			v = (v * 10) + (*s - '0');
 		va[2] = v; /* day */
-		for (; isspace((unsigned char)*s); s++)
+		for (; ISSPACE((unsigned char)*s); s++)
 			;
 		/* end of word month */
-		for (j = 0; isalpha((unsigned char)s[j]); j++)
+		for (j = 0; ISALPHA((unsigned char)s[j]); j++)
 			;
 		/* check month name */
 		if (j < 3 || j > 9)
@@ -609,15 +623,15 @@ parsetime(const char *s, long long *tp)
 		}
 		if (m >= 12)
 			return -1; /* no month found */
-		for (; isspace((unsigned char)*s); s++)
+		for (; ISSPACE((unsigned char)*s); s++)
 			;
-		for (v = 0, i = 0; i < 4 && isdigit((unsigned char)*s); s++, i++)
+		for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); s++, i++)
 			v = (v * 10) + (*s - '0');
-		/* obsolete short year: RFC2822 4.3 */
-		if (i <= 3)
-			v += (v >= 0 && v <= 49) ? 2000 : 1900;
+		/* obsolete short year: RFC 2822 4.3 */
+		if (i == 2 || i == 3)
+			v += (i == 2 && v >= 0 && v <= 49) ? 2000 : 1900;
 		va[0] = v; /* year */
-		for (; isspace((unsigned char)*s); s++)
+		for (; ISSPACE((unsigned char)*s); s++)
 			;
 		/* parse only regular time part, see below */
 		vi = 3;
@@ -626,20 +640,20 @@ parsetime(const char *s, long long *tp)
 	/* parse time parts (and possibly remaining date parts) */
 	for (; *s && vi < 6; vi++) {
 		for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) &&
-		                   isdigit((unsigned char)*s); s++, i++) {
+		                   ISDIGIT((unsigned char)*s); s++, i++) {
 			v = (v * 10) + (*s - '0');
 		}
 		va[vi] = v;
 
 		if ((vi < 2 && *s == '-') ||
-		    (vi == 2 && (*s == 'T' || isspace((unsigned char)*s))) ||
+		    (vi == 2 && (*s == 'T' || *s == 't' || ISSPACE((unsigned char)*s))) ||
 		    (vi > 2 && *s == ':'))
 			s++;
 	}
 
 	/* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
 	if (*s == '.') {
-		for (s++; isdigit((unsigned char)*s); s++)
+		for (s++; ISDIGIT((unsigned char)*s); s++)
 			;
 	}
 
@@ -679,6 +693,9 @@ printfields(void)
 	putchar(FieldSeparator);
 	string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str);
 	putchar('\n');
+
+	if (ferror(stdout)) /* check for errors but do not flush */
+		checkfileerror(stdout, "<stdout>", 'w');
 }
 
 static int
@@ -707,8 +724,8 @@ xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
 	if (!ctx.tag.id)
 		return;
 
-	/* content-type may be: Atom: text, xhtml, html or mime-type.
-	   MRSS (media:description): plain, html. */
+	/* content-type may be for Atom: text, xhtml, html or a mime-type.
+	   for MRSS (media:description): plain, html. */
 	if (ISCONTENTTAG(ctx)) {
 		if (isattr(n, nl, STRP("type")))
 			string_append(&attrtype, v, vl);
@@ -741,7 +758,7 @@ static void
 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
               const char *data, size_t datalen)
 {
-	char buf[16];
+	char buf[8];
 	int len;
 
 	/* handles transforming inline XML to data */
@@ -755,7 +772,7 @@ xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
 		return;
 
 	/* try to translate entity, else just pass as data to
-	 * xmldata handler. */
+	 * xmlattr handler. */
 	if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
 		xmlattr(p, t, tl, n, nl, buf, (size_t)len);
 	else
@@ -818,7 +835,7 @@ xmldata(XMLParser *p, const char *s, size_t len)
 static void
 xmldataentity(XMLParser *p, const char *data, size_t datalen)
 {
-	char buf[16];
+	char buf[8];
 	int len;
 
 	if (!ctx.field)
@@ -835,7 +852,7 @@ xmldataentity(XMLParser *p, const char *data, size_t datalen)
 static void
 xmltagstart(XMLParser *p, const char *t, size_t tl)
 {
-	FeedTag *f;
+	const FeedTag *f;
 
 	if (ISINCONTENT(ctx)) {
 		if (ctx.contenttype == ContentTypeHTML) {
@@ -894,7 +911,7 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
 		return;
 	}
 
-	/* set tag type based on it's attribute value */
+	/* set tag type based on its attribute value */
 	if (ctx.tag.id == RSSTagGuid) {
 		/* if empty the default is "true" */
 		if (!attrispermalink.len ||
@@ -964,7 +981,7 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
 		return;
 
 	if (ISINCONTENT(ctx)) {
-		/* not close content field */
+		/* not a closed content field */
 		if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) {
 			if (!isshort && ctx.contenttype == ContentTypeHTML) {
 				xmldata(p, "</", 2);
@@ -976,7 +993,7 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
 	} else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) {
 		/* matched tag end: close it */
 		/* copy also to the link field if the attribute isPermaLink="true"
-		   and it is not set by a tag with higher prio. */
+		   and it is not set by a tag with higher priority. */
 		if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field &&
 		    ctx.tag.id > ctx.fields[FeedFieldLink].tagid) {
 			string_clear(&ctx.fields[FeedFieldLink].str);
@@ -1005,7 +1022,7 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
 	}
 
 	/* temporary string: for fields that cannot be processed
-	   directly and need more context, for example by it's tag
+	   directly and need more context, for example by its tag
 	   attributes, like the Atom link rel="alternate|enclosure". */
 	if (tmpstr.len && ctx.field) {
 		if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) {
@@ -1056,8 +1073,11 @@ main(int argc, char *argv[])
 	parser.xmltagstart = xmltagstart;
 	parser.xmltagstartparsed = xmltagstartparsed;
 
-	/* NOTE: getnext is defined in xml.h for inline optimization */
+	/* NOTE: GETNEXT is defined in xml.h for inline optimization */
 	xml_parse(&parser);
 
+	checkfileerror(stdin, "<stdin>", 'r');
+	checkfileerror(stdout, "<stdout>", 'w');
+
 	return 0;
 }
author	Benjamin Chausse <benjamin@chausse.xyz>	2024-08-09 14:11:50 -0400
committer	Benjamin Chausse <benjamin@chausse.xyz>	2024-08-09 14:11:50 -0400
commit	5857d82e8e596d6fda406a0c4d8d68ca7a03c124 (patch)
tree	553916894dee907825360580c5d9a05c82c5af16 /sfeed.c
parent	3574e3cbf9d99546e868aeb995ce2c171cdc36a6 (diff)
parent	19957bc272e745af7b56b79fa648e8b6b77113b1 (diff)