diff options
-rw-r--r-- | sfeed.1 | 11 | ||||
-rw-r--r-- | sfeed.5 | 4 | ||||
-rw-r--r-- | sfeed.c | 31 | ||||
-rw-r--r-- | sfeed_gopher.c | 18 | ||||
-rw-r--r-- | sfeed_web.c | 17 | ||||
-rw-r--r-- | util.c | 284 | ||||
-rw-r--r-- | util.h | 18 |
7 files changed, 234 insertions, 149 deletions
@@ -1,4 +1,4 @@ -.Dd January 26, 2021 +.Dd February 19, 2021 .Dt SFEED 1 .Os .Sh NAME @@ -13,8 +13,11 @@ reads RSS or Atom feed data (XML) from stdin. It writes the feed data in a TAB-separated format to stdout. A .Ar baseurl -can be specified if the links in the feed are relative URLs. -It is recommended to always have absolute URLs in your feeds. +can be specified if the links or enclosures in the feed are relative URLs. +If the +.Ar baseurl +is a valid absolute URL then the relative links or enclosures will be +made absolute. .Sh TAB-SEPARATED FORMAT FIELDS The items are output per line in a TSV-like format. .Pp @@ -35,7 +38,7 @@ UNIX timestamp in UTC+0, empty if missing or on parse failure. .It title Title text, HTML code in titles is ignored and is treated as plain-text. .It link -Absolute URL, unsafe characters are encoded. +Link .It content Content, can have plain-text or HTML code depending on the content-type field. .It content-type @@ -1,4 +1,4 @@ -.Dd September 19, 2020 +.Dd February 19, 2021 .Dt SFEED 5 .Os .Sh NAME @@ -29,7 +29,7 @@ UNIX timestamp in UTC+0, empty if missing or on parse failure. .It title Title text, HTML code in titles is ignored and is treated as plain-text. .It link -Absolute URL, unsafe characters are encoded. +Link .It content Content, can have plain-text or HTML code depending on the content-type field. 
.It content-type @@ -204,7 +204,8 @@ static int fieldmap[TagLast] = { static const int FieldSeparator = '\t'; /* separator for multiple values in a field, separator should be 1 byte */ static const char *FieldMultiSeparator = "|"; -static const char *baseurl = ""; +static struct uri baseuri; +static const char *baseurl; static FeedContext ctx; static XMLParser parser; /* XML parser state */ @@ -381,23 +382,33 @@ string_print_trimmed_multi(String *s) } } -/* always print absolute urls (using global baseurl) */ +/* always print absolute URLs (using global baseurl) */ void printuri(char *s) { char link[4096], *p, *e; - int c; + struct uri newuri, olduri; + int c, r = -1; p = ltrim(s); e = rtrim(p); c = *e; *e = '\0'; - if (absuri(link, sizeof(link), p, baseurl) != -1) - fputs(link, stdout); + + if (baseurl && !uri_hasscheme(p) && + uri_parse(p, &olduri) != -1 && !olduri.proto[0] && + uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0]) + r = uri_format(link, sizeof(link), &newuri); + + if (r >= 0 && (size_t)r < sizeof(link)) + printtrimmed(link); + else + printtrimmed(p); + *e = c; /* restore NUL byte to original character */ } -/* always print absolute urls (using global baseurl) */ +/* always print absolute URLs (using global baseurl) */ void string_print_uri(String *s) { @@ -1015,8 +1026,12 @@ main(int argc, char *argv[]) if (pledge("stdio", NULL) == -1) err(1, "pledge"); - if (argc > 1) - baseurl = argv[1]; + if (argc > 1) { + if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0]) + baseurl = argv[1]; + else + errx(1, "baseurl incorrect or too long"); + } memcpy(&(ctx.tag), &notag, sizeof(ctx.tag)); diff --git a/sfeed_gopher.c b/sfeed_gopher.c index b4e3a8c..28dcb9d 100644 --- a/sfeed_gopher.c +++ b/sfeed_gopher.c @@ -38,7 +38,8 @@ static void printfeed(FILE *fpitems, FILE *fpin, struct feed *f) { struct uri u; - char *fields[FieldLast], *itemhost, *itemport, *itempath; + char *fields[FieldLast]; + char *itemhost, *itemport, *itempath, *itemquery, 
*itemfragment; ssize_t linelen; unsigned int isnew; struct tm rtm, *tm; @@ -59,15 +60,20 @@ printfeed(FILE *fpitems, FILE *fpin, struct feed *f) itemport = port; itemtype = 'i'; itempath = fields[FieldLink]; + itemquery = ""; + itemfragment = ""; if (fields[FieldLink][0]) { itemtype = 'h'; + /* if it's a gopher URL then change it into a direntry */ if (!strncmp(fields[FieldLink], "gopher://", 9) && - parseuri(fields[FieldLink], &u, 0) != -1) { + uri_parse(fields[FieldLink], &u) != -1) { itemhost = u.host; itemport = u.port[0] ? u.port : "70"; itemtype = '1'; itempath = u.path; + itemquery = u.query; + itemfragment = u.fragment; if (itempath[0] == '/') { itempath++; @@ -100,6 +106,14 @@ printfeed(FILE *fpitems, FILE *fpin, struct feed *f) if (itemtype == 'h' && fields[FieldLink] == itempath) fputs("URL:", fpitems); gophertext(fpitems, itempath); + if (itemquery[0]) { + fputs("?", fpitems); + gophertext(fpitems, itemquery); + } + if (itemfragment[0]) { + fputs("#", fpitems); + gophertext(fpitems, itemfragment); + } fprintf(fpitems, "\t%s\t%s\r\n", itemhost, itemport); } fputs(".\r\n", fpitems); diff --git a/sfeed_web.c b/sfeed_web.c index 6d547a7..a715731 100644 --- a/sfeed_web.c +++ b/sfeed_web.c @@ -12,7 +12,6 @@ static XMLParser parser; static int isbasetag, islinktag, ishrefattr, istypeattr; static char linkhref[4096], linktype[256], basehref[4096]; -static char abslink[4096]; static void printvalue(const char *s) @@ -39,6 +38,10 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) static void xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) { + struct uri baseuri, linkuri, u; + char buf[4096]; + int r = -1; + if (!islinktag) return; @@ -47,10 +50,18 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) strncasecmp(linktype, STRP("application/rss"))) return; - if (absuri(abslink, sizeof(abslink), linkhref, basehref) != -1) - printvalue(abslink); + /* parse base URI each time: it can change. 
*/ + if (basehref[0] && + uri_parse(linkhref, &linkuri) != -1 && !linkuri.proto[0] && + uri_parse(basehref, &baseuri) != -1 && + uri_makeabs(&u, &linkuri, &baseuri) != -1 && u.proto[0]) + r = uri_format(buf, sizeof(buf), &u); + + if (r >= 0 && (size_t)r < sizeof(buf)) + printvalue(buf); else printvalue(linkhref); + putchar('\t'); printvalue(linktype); putchar('\n'); @@ -7,167 +7,203 @@ #include "util.h" +/* check if string has a non-empty scheme / protocol part */ int -parseuri(const char *s, struct uri *u, int rel) +uri_hasscheme(const char *s) { - const char *p = s, *b; - char *endptr = NULL; + const char *p = s; + + for (; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) || + *p == '+' || *p == '-' || *p == '.'; p++) + ; + /* scheme, except if empty and starts with ":" then it is a path */ + return (*p == ':' && p != s); +} + +int +uri_parse(const char *s, struct uri *u) +{ + const char *p = s; + char *endptr; size_t i; - unsigned long l; + long l; - u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0'; - if (!*s) - return 0; + u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0'; + u->path[0] = u->query[0] = u->fragment[0] = '\0'; - /* prefix is "//", don't read protocol, skip to domain parsing */ - if (!strncmp(p, "//", 2)) { + /* protocol-relative */ + if (*p == '/' && *(p + 1) == '/') { p += 2; /* skip "//" */ - } else { - /* protocol part */ - for (p = s; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) || - *p == '+' || *p == '-' || *p == '.'; p++) - ; - if (!strncmp(p, "://", 3)) { - if ((size_t)(p - s) >= sizeof(u->proto)) - return -1; /* protocol too long */ - memcpy(u->proto, s, p - s); - u->proto[p - s] = '\0'; + goto parseauth; + } + + /* scheme / protocol part */ + for (; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) || + *p == '+' || *p == '-' || *p == '.'; p++) + ; + /* scheme, except if empty and starts with ":" then it is a path */ + if (*p == ':' && p != s) { + if (*(p + 1) == '/' && *(p + 2) == '/') p 
+= 3; /* skip "://" */ - } else { - p = s; /* no protocol format, set to start */ - /* relative url: read rest as path, else as domain */ - if (rel) - goto readpath; - } + else + p++; /* skip ":" */ + + if ((size_t)(p - s) >= sizeof(u->proto)) + return -1; /* protocol too long */ + memcpy(u->proto, s, p - s); + u->proto[p - s] = '\0'; + + if (*(p - 1) != '/') + goto parsepath; + } else { + p = s; /* no scheme format, reset to start */ + goto parsepath; + } + +parseauth: + /* userinfo (username:password) */ + i = strcspn(p, "@/?#"); + if (p[i] == '@') { + if (i >= sizeof(u->userinfo)) + return -1; /* userinfo too long */ + memcpy(u->userinfo, p, i); + u->userinfo[i] = '\0'; + p += i + 1; } + /* IPv6 address */ if (*p == '[') { - /* bracket not found or host too long */ - if (!(b = strchr(p, ']')) || (size_t)(b - p) < 3 || - (size_t)(b - p) >= sizeof(u->host)) + /* bracket not found, host too short or too long */ + i = strcspn(p, "]"); + if (p[i] != ']' || i < 3) return -1; - memcpy(u->host, p, b - p + 1); - u->host[b - p + 1] = '\0'; - p = b + 1; + i++; /* including "]" */ } else { /* domain / host part, skip until port, path or end. 
*/ - if ((i = strcspn(p, ":/")) >= sizeof(u->host)) - return -1; /* host too long */ - memcpy(u->host, p, i); - u->host[i] = '\0'; - p = &p[i]; + i = strcspn(p, ":/?#"); } + if (i >= sizeof(u->host)) + return -1; /* host too long */ + memcpy(u->host, p, i); + u->host[i] = '\0'; + p += i; + /* port */ if (*p == ':') { - if ((i = strcspn(++p, "/")) >= sizeof(u->port)) + p++; + if ((i = strcspn(p, "/?#")) >= sizeof(u->port)) return -1; /* port too long */ memcpy(u->port, p, i); u->port[i] = '\0'; - /* check for valid port: range 1 - 65535 */ + /* check for valid port: range 1 - 65535, may be empty */ errno = 0; - l = strtoul(u->port, &endptr, 10); - if (errno || u->port[0] == '\0' || *endptr || - !l || l > 65535) + l = strtol(u->port, &endptr, 10); + if (i && (errno || *endptr || l <= 0 || l > 65535)) return -1; - p = &p[i]; + p += i; } -readpath: - if (u->host[0]) { - p = &p[strspn(p, "/")]; - strlcpy(u->path, "/", sizeof(u->path)); - } else { - /* absolute uri must have a host specified */ - if (!rel) - return -1; - } - /* treat truncation as an error */ - if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path)) - return -1; - return 0; -} -static int -encodeuri(char *buf, size_t bufsiz, const char *s) -{ - static const char *table = "0123456789ABCDEF"; - size_t i, b; +parsepath: + /* path */ + if ((i = strcspn(p, "?#")) >= sizeof(u->path)) + return -1; /* path too long */ + memcpy(u->path, p, i); + u->path[i] = '\0'; + p += i; - for (i = 0, b = 0; s[i]; i++) { - if ((unsigned char)s[i] <= ' ' || - (unsigned char)s[i] >= 127) { - if (b + 3 >= bufsiz) - return -1; - buf[b++] = '%'; - buf[b++] = table[((unsigned char)s[i] >> 4) & 15]; - buf[b++] = table[(unsigned char)s[i] & 15]; - } else if (b < bufsiz) { - buf[b++] = s[i]; - } else { - return -1; - } + /* query */ + if (*p == '?') { + p++; + if ((i = strcspn(p, "#")) >= sizeof(u->query)) + return -1; /* query too long */ + memcpy(u->query, p, i); + u->query[i] = '\0'; + p += i; + } + + /* fragment */ + if (*p == 
'#') { + p++; + if ((i = strlen(p)) >= sizeof(u->fragment)) + return -1; /* fragment too long */ + memcpy(u->fragment, p, i); + u->fragment[i] = '\0'; } - if (b >= bufsiz) - return -1; - buf[b] = '\0'; return 0; } -/* Get absolute uri; if `link` is relative use `base` to make it absolute. - * the returned string in `buf` is uri encoded, see: encodeuri(). */ +/* Transform and try to make the URI `u` absolute using base URI `b` into `a`. + Follows some of the logic from "RFC 3986 - 5.2.2. Transform References". + Returns 0 on success, -1 on error or truncation. */ int -absuri(char *buf, size_t bufsiz, const char *link, const char *base) +uri_makeabs(struct uri *a, struct uri *u, struct uri *b) { - struct uri ulink, ubase; - char tmp[4096], *host, *p, *port; - int c, r; - size_t i; + char *p; + int c; - buf[0] = '\0'; - if (parseuri(base, &ubase, 0) == -1 || - parseuri(link, &ulink, 1) == -1 || - (!ulink.host[0] && !ubase.host[0])) - return -1; + strlcpy(a->fragment, u->fragment, sizeof(a->fragment)); - if (!strncmp(link, "//", 2)) { - host = ulink.host; - port = ulink.port; - } else { - host = ulink.host[0] ? ulink.host : ubase.host; - port = ulink.port[0] ? ulink.port : ubase.port; + if (u->proto[0] || u->host[0]) { + strlcpy(a->proto, u->proto[0] ? u->proto : b->proto, sizeof(a->proto)); + strlcpy(a->host, u->host, sizeof(a->host)); + strlcpy(a->userinfo, u->userinfo, sizeof(a->userinfo)); + strlcpy(a->host, u->host, sizeof(a->host)); + strlcpy(a->port, u->port, sizeof(a->port)); + strlcpy(a->path, u->path, sizeof(a->path)); + strlcpy(a->query, u->query, sizeof(a->query)); + return 0; } - r = snprintf(tmp, sizeof(tmp), "%s://%s%s%s", - ulink.proto[0] ? - ulink.proto : - (ubase.proto[0] ? ubase.proto : "http"), - host, - port[0] ? 
":" : "", - port); - if (r < 0 || (size_t)r >= sizeof(tmp)) - return -1; /* error or truncation */ - - /* relative to root */ - if (!ulink.host[0] && ulink.path[0] != '/') { - /* relative to base url path */ - if (ulink.path[0]) { - if ((p = strrchr(ubase.path, '/'))) { - /* temporary null-terminate */ - c = *(++p); - *p = '\0'; - i = strlcat(tmp, ubase.path, sizeof(tmp)); - *p = c; /* restore */ - if (i >= sizeof(tmp)) - return -1; - } - } else if (strlcat(tmp, ubase.path, sizeof(tmp)) >= - sizeof(tmp)) { - return -1; + + strlcpy(a->proto, b->proto, sizeof(a->proto)); + strlcpy(a->host, b->host, sizeof(a->host)); + strlcpy(a->userinfo, b->userinfo, sizeof(a->userinfo)); + strlcpy(a->host, b->host, sizeof(a->host)); + strlcpy(a->port, b->port, sizeof(a->port)); + + if (!u->path[0]) { + strlcpy(a->path, b->path, sizeof(a->path)); + } else if (u->path[0] == '/') { + strlcpy(a->path, u->path, sizeof(a->path)); + } else { + a->path[0] = (a->host[0] && b->path[0] != '/') ? '/' : '\0'; + a->path[1] = '\0'; + + if ((p = strrchr(b->path, '/'))) { + c = *(++p); + *p = '\0'; /* temporary NUL-terminate */ + if (strlcat(a->path, b->path, sizeof(a->path)) >= sizeof(a->path)) + return -1; + *p = c; /* restore */ } + if (strlcat(a->path, u->path, sizeof(a->path)) >= sizeof(a->path)) + return -1; } - if (strlcat(tmp, ulink.path, sizeof(tmp)) >= sizeof(tmp)) - return -1; - return encodeuri(buf, bufsiz, tmp); + if (u->path[0] || u->query[0]) + strlcpy(a->query, u->query, sizeof(a->query)); + else + strlcpy(a->query, b->query, sizeof(a->query)); + + return 0; +} + +int +uri_format(char *buf, size_t bufsiz, struct uri *u) +{ + return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s", + u->proto, + u->userinfo[0] ? u->userinfo : "", + u->userinfo[0] ? "@" : "", + u->host, + u->port[0] ? ":" : "", + u->port, + u->host[0] && u->path[0] && u->path[0] != '/' ? "/" : "", + u->path, + u->query[0] ? "?" : "", + u->query, + u->fragment[0] ? 
"#" : "", + u->fragment); } /* Splits fields in the line buffer by replacing TAB separators with NUL ('\0') @@ -21,12 +21,15 @@ struct feed { unsigned long total; /* total items */ }; -/* uri */ +/* URI */ struct uri { - char proto[48]; + char proto[48]; /* scheme including ":" or "://" */ + char userinfo[256]; /* username [:password] */ char host[256]; - char path[2048]; - char port[6]; /* numeric port */ + char port[6]; /* numeric port */ + char path[1024]; + char query[1024]; + char fragment[1024]; }; enum { @@ -35,9 +38,12 @@ enum { FieldLast }; -int absuri(char *, size_t, const char *, const char *); +int uri_format(char *, size_t, struct uri *); +int uri_hasscheme(const char *); +int uri_makeabs(struct uri *, struct uri *, struct uri *); +int uri_parse(const char *, struct uri *); + void parseline(char *, char *[FieldLast]); -int parseuri(const char *, struct uri *, int); void printutf8pad(FILE *, const char *, size_t, int); int strtotime(const char *, time_t *); void xmlencode(const char *, FILE *); |