7 files changed, 234 insertions, 149 deletions
diff --git a/sfeed.1 b/sfeed.1
index 0045716..9dafa85 100644
--- a/sfeed.1
+++ b/sfeed.1
@@ -1,4 +1,4 @@
-.Dd January 26, 2021
+.Dd February 19, 2021
 .Dt SFEED 1
 .Os
 .Sh NAME
@@ -13,8 +13,11 @@ reads RSS or Atom feed data (XML) from stdin.
 It writes the feed data in a TAB-separated format to stdout.
 A
 .Ar baseurl
-can be specified if the links in the feed are relative URLs.
-It is recommended to always have absolute URLs in your feeds.
+can be specified if the links or enclosures in the feed are relative URLs.
+If the
+.Ar baseurl
+is a valid absolute URL then the relative links or enclosures will be
+made absolute.
 .Sh TAB-SEPARATED FORMAT FIELDS
 The items are output per line in a TSV-like format.
 .Pp
@@ -35,7 +38,7 @@ UNIX timestamp in UTC+0, empty if missing or on parse failure.
 .It title
 Title text, HTML code in titles is ignored and is treated as plain-text.
 .It link
-Absolute URL, unsafe characters are encoded.
+Link URL, a relative URL is made absolute if a valid baseurl is specified.
 .It content
 Content, can have plain-text or HTML code depending on the content-type field.
 .It content-type
diff --git a/sfeed.5 b/sfeed.5
index b0bbd4c..bd5d6fe 100644
--- a/sfeed.5
+++ b/sfeed.5
@@ -1,4 +1,4 @@
-.Dd September 19, 2020
+.Dd February 19, 2021
 .Dt SFEED 5
 .Os
 .Sh NAME
@@ -29,7 +29,7 @@ UNIX timestamp in UTC+0, empty if missing or on parse failure.
 .It title
 Title text, HTML code in titles is ignored and is treated as plain-text.
 .It link
-Absolute URL, unsafe characters are encoded.
+Link URL.
 .It content
 Content, can have plain-text or HTML code depending on the content-type field.
 .It content-type
diff --git a/sfeed.c b/sfeed.c
index 79768f3..91b2d65 100644
--- a/sfeed.c
+++ b/sfeed.c
@@ -204,7 +204,8 @@ static int fieldmap[TagLast] = {
 static const int FieldSeparator = '\t';
 /* separator for multiple values in a field, separator should be 1 byte */
 static const char *FieldMultiSeparator = "|";
-static const char *baseurl = "";
+static struct uri baseuri;
+static const char *baseurl;
 
 static FeedContext ctx;
 static XMLParser parser; /* XML parser state */
@@ -381,23 +382,33 @@ string_print_trimmed_multi(String *s)
 	}
 }
 
-/* always print absolute urls (using global baseurl) */
+/* print URL, relative URLs are made absolute if possible (using global baseurl) */
 void
 printuri(char *s)
 {
 	char link[4096], *p, *e;
-	int c;
+	struct uri newuri, olduri;
+	int c, r = -1; /* r stays -1 unless uri_format() is called */
 
 	p = ltrim(s);
 	e = rtrim(p);
 	c = *e;
 	*e = '\0';
-	if (absuri(link, sizeof(link), p, baseurl) != -1)
-		fputs(link, stdout);
+
+	if (baseurl && !uri_hasscheme(p) &&
+	    uri_parse(p, &olduri) != -1 && !olduri.proto[0] &&
+	    uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0])
+		r = uri_format(link, sizeof(link), &newuri);
+
+	if (r >= 0 && (size_t)r < sizeof(link)) /* no error or truncation */
+		printtrimmed(link);
+	else
+		printtrimmed(p); /* fallback: print the URL as it is */
+
 	*e = c; /* restore NUL byte to original character */
 }
 
-/* always print absolute urls (using global baseurl) */
+/* always print absolute URLs (using global baseurl) */
 void
 string_print_uri(String *s)
 {
@@ -1015,8 +1026,12 @@ main(int argc, char *argv[])
 	if (pledge("stdio", NULL) == -1)
 		err(1, "pledge");
 
-	if (argc > 1)
-		baseurl = argv[1];
+	if (argc > 1) {
+		if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0])
+			baseurl = argv[1];
+		else
+			errx(1, "baseurl incorrect or too long");
+	}
 
 	memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
 
diff --git a/sfeed_gopher.c b/sfeed_gopher.c
index b4e3a8c..28dcb9d 100644
--- a/sfeed_gopher.c
+++ b/sfeed_gopher.c
@@ -38,7 +38,8 @@ static void
 printfeed(FILE *fpitems, FILE *fpin, struct feed *f)
 {
 	struct uri u;
-	char *fields[FieldLast], *itemhost, *itemport, *itempath;
+	char *fields[FieldLast];
+	char *itemhost, *itemport, *itempath, *itemquery, *itemfragment;
 	ssize_t linelen;
 	unsigned int isnew;
 	struct tm rtm, *tm;
@@ -59,15 +60,20 @@ printfeed(FILE *fpitems, FILE *fpin, struct feed *f)
 		itemport = port;
 		itemtype = 'i';
 		itempath = fields[FieldLink];
+		itemquery = "";
+		itemfragment = "";
 
 		if (fields[FieldLink][0]) {
 			itemtype = 'h';
+			/* if it's a gopher URL then change it into a direntry */
 			if (!strncmp(fields[FieldLink], "gopher://", 9) &&
-			    parseuri(fields[FieldLink], &u, 0) != -1) {
+			    uri_parse(fields[FieldLink], &u) != -1) {
 				itemhost = u.host;
 				itemport = u.port[0] ? u.port : "70";
 				itemtype = '1';
 				itempath = u.path;
+				itemquery = u.query;
+				itemfragment = u.fragment;
 
 				if (itempath[0] == '/') {
 					itempath++;
@@ -100,6 +106,14 @@ printfeed(FILE *fpitems, FILE *fpin, struct feed *f)
 		if (itemtype == 'h' && fields[FieldLink] == itempath)
 			fputs("URL:", fpitems);
 		gophertext(fpitems, itempath);
+		if (itemquery[0]) {
+			fputs("?", fpitems);
+			gophertext(fpitems, itemquery);
+		}
+		if (itemfragment[0]) {
+			fputs("#", fpitems);
+			gophertext(fpitems, itemfragment);
+		}
 		fprintf(fpitems, "\t%s\t%s\r\n", itemhost, itemport);
 	}
 	fputs(".\r\n", fpitems);
diff --git a/sfeed_web.c b/sfeed_web.c
index 6d547a7..a715731 100644
--- a/sfeed_web.c
+++ b/sfeed_web.c
@@ -12,7 +12,6 @@
 static XMLParser parser;
 static int isbasetag, islinktag, ishrefattr, istypeattr;
 static char linkhref[4096], linktype[256], basehref[4096];
-static char abslink[4096];
 
 static void
 printvalue(const char *s)
@@ -39,6 +38,10 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
 static void
 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
 {
+	struct uri baseuri, linkuri, u;
+	char buf[4096];
+	int r = -1;
+
 	if (!islinktag)
 		return;
 
@@ -47,10 +50,18 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
 	    strncasecmp(linktype, STRP("application/rss")))
 		return;
 
-	if (absuri(abslink, sizeof(abslink), linkhref, basehref) != -1)
-		printvalue(abslink);
+	/* parse base URI each time: it can change. */
+	if (basehref[0] &&
+	    uri_parse(linkhref, &linkuri) != -1 && !linkuri.proto[0] &&
+	    uri_parse(basehref, &baseuri) != -1 &&
+	    uri_makeabs(&u, &linkuri, &baseuri) != -1 && u.proto[0])
+		r = uri_format(buf, sizeof(buf), &u);
+
+	if (r >= 0 && (size_t)r < sizeof(buf))
+		printvalue(buf);
 	else
 		printvalue(linkhref);
+
 	putchar('\t');
 	printvalue(linktype);
 	putchar('\n');
diff --git a/util.c b/util.c
index 362fdd9..6aef1f3 100644
--- a/util.c
+++ b/util.c
@@ -7,167 +7,203 @@
 
 #include "util.h"
 
+/* check if string has a non-empty scheme / protocol part: returns 1 if so, else 0 */
 int
-parseuri(const char *s, struct uri *u, int rel)
+uri_hasscheme(const char *s)
 {
-	const char *p = s, *b;
-	char *endptr = NULL;
+	const char *p = s;
+
+	for (; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
+		       *p == '+' || *p == '-' || *p == '.'; p++)
+		; /* skip alphanumeric, '+', '-' and '.' characters */
+	/* scheme, except if empty and starts with ":" then it is a path */
+	return (*p == ':' && p != s);
+}
+
+int
+uri_parse(const char *s, struct uri *u)
+{
+	const char *p = s;
+	char *endptr;
 	size_t i;
-	unsigned long l;
+	long l;
 
-	u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0';
-	if (!*s)
-		return 0;
+	u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0';
+	u->path[0] = u->query[0] = u->fragment[0] = '\0';
 
-	/* prefix is "//", don't read protocol, skip to domain parsing */
-	if (!strncmp(p, "//", 2)) {
+	/* protocol-relative: starts with "//", no scheme, continue at the authority */
+	if (*p == '/' && *(p + 1) == '/') {
 		p += 2; /* skip "//" */
-	} else {
-		/* protocol part */
-		for (p = s; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
-			       *p == '+' || *p == '-' || *p == '.'; p++)
-			;
-		if (!strncmp(p, "://", 3)) {
-			if ((size_t)(p - s) >= sizeof(u->proto))
-				return -1; /* protocol too long */
-			memcpy(u->proto, s, p - s);
-			u->proto[p - s] = '\0';
+		goto parseauth;
+	}
+
+	/* scheme / protocol part */
+	for (; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
+		       *p == '+' || *p == '-' || *p == '.'; p++)
+		;
+	/* scheme, except if empty and starts with ":" then it is a path */
+	if (*p == ':' && p != s) {
+		if (*(p + 1) == '/' && *(p + 2) == '/')
 			p += 3; /* skip "://" */
-		} else {
-			p = s; /* no protocol format, set to start */
-			/* relative url: read rest as path, else as domain */
-			if (rel)
-				goto readpath;
-		}
+		else
+			p++; /* skip ":" */
+
+		if ((size_t)(p - s) >= sizeof(u->proto))
+			return -1; /* protocol too long */
+		memcpy(u->proto, s, p - s);
+		u->proto[p - s] = '\0';
+
+		if (*(p - 1) != '/') /* scheme without "//": no authority part */
+			goto parsepath;
+	} else {
+		p = s; /* no scheme format, reset to start */
+		goto parsepath;
+	}
+
+parseauth:
+	/* userinfo (username:password): only when "@" occurs before "/", "?" or "#" */
+	i = strcspn(p, "@/?#");
+	if (p[i] == '@') {
+		if (i >= sizeof(u->userinfo))
+			return -1; /* userinfo too long */
+		memcpy(u->userinfo, p, i);
+		u->userinfo[i] = '\0';
+		p += i + 1; /* skip userinfo and "@" */
 	}
+
 	/* IPv6 address */
 	if (*p == '[') {
-		/* bracket not found or host too long */
-		if (!(b = strchr(p, ']')) || (size_t)(b - p) < 3 ||
-		    (size_t)(b - p) >= sizeof(u->host))
+		/* bracket not found, host too short or too long */
+		i = strcspn(p, "]");
+		if (p[i] != ']' || i < 3)
 			return -1;
-		memcpy(u->host, p, b - p + 1);
-		u->host[b - p + 1] = '\0';
-		p = b + 1;
+		i++; /* including "]" */
 	} else {
 		/* domain / host part, skip until port, path or end. */
-		if ((i = strcspn(p, ":/")) >= sizeof(u->host))
-			return -1; /* host too long */
-		memcpy(u->host, p, i);
-		u->host[i] = '\0';
-		p = &p[i];
+		i = strcspn(p, ":/?#");
 	}
+	if (i >= sizeof(u->host))
+		return -1; /* host too long */
+	memcpy(u->host, p, i);
+	u->host[i] = '\0';
+	p += i;
+
 	/* port */
 	if (*p == ':') {
-		if ((i = strcspn(++p, "/")) >= sizeof(u->port))
+		p++;
+		if ((i = strcspn(p, "/?#")) >= sizeof(u->port))
 			return -1; /* port too long */
 		memcpy(u->port, p, i);
 		u->port[i] = '\0';
-		/* check for valid port: range 1 - 65535 */
+		/* check for valid port: range 1 - 65535, may be empty */
 		errno = 0;
-		l = strtoul(u->port, &endptr, 10);
-		if (errno || u->port[0] == '\0' || *endptr ||
-		    !l || l > 65535)
+		l = strtol(u->port, &endptr, 10);
+		if (i && (errno || *endptr || l <= 0 || l > 65535))
 			return -1;
-		p = &p[i];
+		p += i;
 	}
-readpath:
-	if (u->host[0]) {
-		p = &p[strspn(p, "/")];
-		strlcpy(u->path, "/", sizeof(u->path));
-	} else {
-		/* absolute uri must have a host specified */
-		if (!rel)
-			return -1;
-	}
-	/* treat truncation as an error */
-	if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path))
-		return -1;
-	return 0;
-}
 
-static int
-encodeuri(char *buf, size_t bufsiz, const char *s)
-{
-	static const char *table = "0123456789ABCDEF";
-	size_t i, b;
+parsepath:
+	/* path: up to the query or fragment */
+	if ((i = strcspn(p, "?#")) >= sizeof(u->path))
+		return -1; /* path too long */
+	memcpy(u->path, p, i);
+	u->path[i] = '\0';
+	p += i;
 
-	for (i = 0, b = 0; s[i]; i++) {
-		if ((unsigned char)s[i] <= ' ' ||
-		    (unsigned char)s[i] >= 127) {
-			if (b + 3 >= bufsiz)
-				return -1;
-			buf[b++] = '%';
-			buf[b++] = table[((unsigned char)s[i] >> 4) & 15];
-			buf[b++] = table[(unsigned char)s[i] & 15];
-		} else if (b < bufsiz) {
-			buf[b++] = s[i];
-		} else {
-			return -1;
-		}
+	/* query: stored without the leading "?" */
+	if (*p == '?') {
+		p++;
+		if ((i = strcspn(p, "#")) >= sizeof(u->query))
+			return -1; /* query too long */
+		memcpy(u->query, p, i);
+		u->query[i] = '\0';
+		p += i;
+	}
+
+	/* fragment: stored without the leading "#" */
+	if (*p == '#') {
+		p++;
+		if ((i = strlen(p)) >= sizeof(u->fragment))
+			return -1; /* fragment too long */
+		memcpy(u->fragment, p, i);
+		u->fragment[i] = '\0';
 	}
-	if (b >= bufsiz)
-		return -1;
-	buf[b] = '\0';
 
 	return 0;
 }
 
-/* Get absolute uri; if `link` is relative use `base` to make it absolute.
- * the returned string in `buf` is uri encoded, see: encodeuri(). */
+/* Transform and try to make the URI `u` absolute using base URI `b` into `a`.
+   Follows some of the logic from "RFC 3986 - 5.2.2. Transform References".
+   Returns 0 on success, -1 on error or truncation. `b` is unchanged on return. */
 int
-absuri(char *buf, size_t bufsiz, const char *link, const char *base)
+uri_makeabs(struct uri *a, struct uri *u, struct uri *b)
 {
-	struct uri ulink, ubase;
-	char tmp[4096], *host, *p, *port;
-	int c, r;
-	size_t i;
+	char *p;
+	int c, r;
 
-	buf[0] = '\0';
-	if (parseuri(base, &ubase, 0) == -1 ||
-	    parseuri(link, &ulink, 1) == -1 ||
-	    (!ulink.host[0] && !ubase.host[0]))
-		return -1;
+	strlcpy(a->fragment, u->fragment, sizeof(a->fragment));
 
-	if (!strncmp(link, "//", 2)) {
-		host = ulink.host;
-		port = ulink.port;
-	} else {
-		host = ulink.host[0] ? ulink.host : ubase.host;
-		port = ulink.port[0] ? ulink.port : ubase.port;
+	if (u->proto[0] || u->host[0]) {
+		/* `u` has a scheme or host: only a missing scheme is taken from `b` */
+		strlcpy(a->proto, u->proto[0] ? u->proto : b->proto, sizeof(a->proto));
+		strlcpy(a->userinfo, u->userinfo, sizeof(a->userinfo));
+		strlcpy(a->host, u->host, sizeof(a->host));
+		strlcpy(a->port, u->port, sizeof(a->port));
+		strlcpy(a->path, u->path, sizeof(a->path));
+		strlcpy(a->query, u->query, sizeof(a->query));
+		return 0;
 	}
-	r = snprintf(tmp, sizeof(tmp), "%s://%s%s%s",
-		ulink.proto[0] ?
-			ulink.proto :
-			(ubase.proto[0] ? ubase.proto : "http"),
-		host,
-		port[0] ? ":" : "",
-		port);
-	if (r < 0 || (size_t)r >= sizeof(tmp))
-		return -1; /* error or truncation */
-
-	/* relative to root */
-	if (!ulink.host[0] && ulink.path[0] != '/') {
-		/* relative to base url path */
-		if (ulink.path[0]) {
-			if ((p = strrchr(ubase.path, '/'))) {
-				/* temporary null-terminate */
-				c = *(++p);
-				*p = '\0';
-				i = strlcat(tmp, ubase.path, sizeof(tmp));
-				*p = c; /* restore */
-				if (i >= sizeof(tmp))
-					return -1;
-			}
-		} else if (strlcat(tmp, ubase.path, sizeof(tmp)) >=
-		           sizeof(tmp)) {
-			return -1;
+
+	strlcpy(a->proto, b->proto, sizeof(a->proto));
+	strlcpy(a->userinfo, b->userinfo, sizeof(a->userinfo));
+	strlcpy(a->host, b->host, sizeof(a->host));
+	strlcpy(a->port, b->port, sizeof(a->port));
+
+	if (!u->path[0]) {
+		strlcpy(a->path, b->path, sizeof(a->path));
+	} else if (u->path[0] == '/') {
+		strlcpy(a->path, u->path, sizeof(a->path));
+	} else {
+		a->path[0] = (a->host[0] && b->path[0] != '/') ? '/' : '\0';
+		a->path[1] = '\0';
+
+		if ((p = strrchr(b->path, '/'))) {
+			c = *(++p);
+			*p = '\0'; /* temporary NUL-terminate */
+			r = strlcat(a->path, b->path, sizeof(a->path)) >= sizeof(a->path);
+			*p = c; /* always restore: `b` may be reused by the caller */
+			if (r)
+				return -1; /* truncation */
 		}
+		if (strlcat(a->path, u->path, sizeof(a->path)) >= sizeof(a->path))
+			return -1;
 	}
-	if (strlcat(tmp, ulink.path, sizeof(tmp)) >= sizeof(tmp))
-		return -1;
 
-	return encodeuri(buf, bufsiz, tmp);
+	if (u->path[0] || u->query[0])
+		strlcpy(a->query, u->query, sizeof(a->query));
+	else
+		strlcpy(a->query, b->query, sizeof(a->query));
+
+	return 0;
+}
+
+int
+uri_format(char *buf, size_t bufsiz, struct uri *u)
+{
+	return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s",
+		u->proto, /* scheme including ":" or "://" */
+		u->userinfo[0] ? u->userinfo : "",
+		u->userinfo[0] ? "@" : "",
+		u->host,
+		u->port[0] ? ":" : "",
+		u->port,
+		u->host[0] && u->path[0] && u->path[0] != '/' ? "/" : "", /* separate host and path */
+		u->path,
+		u->query[0] ? "?" : "",
+		u->query, /* stored without "?" */
+		u->fragment[0] ? "#" : "",
+		u->fragment); /* stored without "#" */
 }
 
 /* Splits fields in the line buffer by replacing TAB separators with NUL ('\0')
diff --git a/util.h b/util.h
index bc6e52f..15d6702 100644
--- a/util.h
+++ b/util.h
@@ -21,12 +21,15 @@ struct feed {
 	unsigned long total;    /* total items */
 };
 
-/* uri */
+/* URI, split into its parts (filled in by uri_parse()) */
 struct uri {
-	char proto[48];
+	char proto[48];     /* scheme including ":" or "://" */
+	char userinfo[256]; /* username [:password] */
 	char host[256];
-	char path[2048];
-	char port[6];     /* numeric port */
+	char port[6];       /* numeric port, can be empty */
+	char path[1024];    /* path, without query and fragment */
+	char query[1024];   /* query, without the leading "?" */
+	char fragment[1024]; /* fragment, without the leading "#" */
 };
 
 enum {
@@ -35,9 +38,12 @@ enum {
 	FieldLast
 };
 
-int  absuri(char *, size_t, const char *, const char *);
+int uri_format(char *, size_t, struct uri *);
+int uri_hasscheme(const char *);
+int uri_makeabs(struct uri *, struct uri *, struct uri *);
+int uri_parse(const char *, struct uri *);
+
 void parseline(char *, char *[FieldLast]);
-int  parseuri(const char *, struct uri *, int);
 void printutf8pad(FILE *, const char *, size_t, int);
 int  strtotime(const char *, time_t *);
 void xmlencode(const char *, FILE *);