README: add sfeed_download example, parallel downloader/extractor

author: Hiltjo Posthuma <hiltjo@codemadness.org> 2022-01-06 12:30:45 +0100
committer: Hiltjo Posthuma <hiltjo@codemadness.org> 2022-01-06 12:30:45 +0100
commit: f25a3e56e4b8d760469c120a22297f74961c22f3 (patch)
tree: 999a7d50a7398c4f5335d5327400fb51ef1f212c
parent: e158bec62365732c0c5d54fa785e34470d56c9f6 (diff)
1 files changed, 118 insertions, 0 deletions
diff --git a/README b/README
index fea2da8..bdfbc1f 100644
--- a/README
+++ b/README
@@ -735,6 +735,124 @@ sfeed_update_xargs shellscript:
 
 - - -
 
+Shellscript to handle URLs and enclosures in parallel using xargs -P.
+
+This can be used to download podcasts and webcomics, download and convert
+webpages, mirror videos, etc. It uses a plain-text cache file to remember
+processed URLs. The match patterns are defined in the fetch() function and in
+the awk script and can be modified to handle items differently depending on
+their context. The script reads sfeed(5) formatted data from the files given
+as arguments, or from stdin when no files are given.
+
+	#!/bin/sh
+	# sfeed_download: Downloader for URLs and enclosures in feed files.
+	# Dependencies: awk, curl, flock, mktemp, readlink -f, xargs -0 -P, youtube-dl.
+	# Environment: SFEED_CACHEFILE = cache file path, SFEED_JOBS = xargs -P jobs.
+	cachefile="${SFEED_CACHEFILE:-$HOME/.sfeed/downloaded_urls}"
+	jobs="${SFEED_JOBS:-4}"
+	lockfile="${HOME}/.sfeed/sfeed_download.lock"
+	
+	# log(feedname, s, status): write a timestamped status line to stderr.
+	log() {
+		# prefix the message with "[feedname]" unless the feed name is "-".
+		case "$1" in
+		-)	line="$2";;
+		*)	line="[$1] $2";;
+		esac
+		printf '[%s]: %s: %s\n' "$(date +'%H:%M:%S')" "${line}" "$3" >&2
+	}
+	
+	# fetch(url, feedname): the URL is matched lowercased, like in the awk filter.
+	fetch() {
+		case "$(printf '%s' "$1" | LC_ALL=C tr '[:upper:]' '[:lower:]')" in
+		*youtube.com*)
+			youtube-dl "$1";;
+		*.flac|*.ogg|*.m3u|*.m3u8|*.m4a|*.mkv|*.mp3|*.mp4|*.wav|*.webm)
+			# allow 2 redirects, hide User-Agent, connect timeout is 15 seconds.
+			curl -O -L --max-redirs 2 -H "User-Agent:" -f -s --connect-timeout 15 "$1";;
+		esac
+	}
+	
+	# downloader(url, title, feedname): download one URL into a directory named
+	# after the feed file and add the URL to the cachefile on success.
+	downloader() {
+		url="$1"
+		title="$2"
+		feedname="${3##*/}"
+	
+		msg="${title}: ${url}"
+	
+		# download directory, "--" guards against names starting with "-".
+		if [ "${feedname}" != "-" ]; then
+			mkdir -p -- "${feedname}"
+			if ! cd -- "${feedname}"; then
+				log "${feedname}" "${msg}: ${feedname}" "DIR FAIL"
+				exit 1
+			fi
+		fi
+	
+		log "${feedname}" "${msg}" "START"
+		if fetch "${url}" "${feedname}"; then
+			log "${feedname}" "${msg}" "OK"
+	
+			# append it safely in parallel to the cachefile on a
+			# successful download.
+			(flock 9 || exit 1
+			printf '%s\n' "${url}" >> "${cachefile}"
+			) 9>"${lockfile}"
+		else
+			log "${feedname}" "${msg}" "FAIL"
+		fi
+	}
+	
+	# Child mode: xargs re-runs this script once per URL with this variable set.
+	# Arguments: $1 = URL, $2 = title, $3 = feed filename or "-".
+	# downloader() writes the URL to the cachefile if the download is successful.
+	case "${SFEED_DOWNLOAD_CHILD}" in
+	1)	downloader "$1" "$2" "$3"
+		exit;;
+	esac
+	
+	# ...else parent mode: print the URLs to download and hand them to xargs.
+	
+	# snapshot the cachefile; children append to the original while awk runs.
+	tmp=$(mktemp) || exit 1
+	trap 'rm -f "${tmp}"' EXIT
+	
+	[ -f "${cachefile}" ] || touch "${cachefile}"
+	cat "${cachefile}" > "${tmp}"
+	echo >> "${tmp}" # force it to have one line for awk.
+	
+	LC_ALL=C awk -F '\t' '
+	# fast prefilter: returns 1 if the lowercased URL matches a pattern, else 0.
+	function filter(url, field, feedname) {
+		u = tolower(url);
+		return (match(u, "youtube\\.com") ||
+		        match(u, "\\.(flac|ogg|m3u|m3u8|m4a|mkv|mp3|mp4|wav|webm)$"));
+	}
+	function download(url, field, title, filename) {
+		if (!length(url) || urls[url] || !filter(url, field, filename))
+			return;
+		# NUL-separated for xargs -0.
+		printf("%s%c%s%c%s%c", url, 0, title, 0, filename, 0);
+		urls[url] = 1; # print once
+	}
+	# count the input files: FNR is 1 on the first line of each file.
+	{ FILENR += (FNR == 1); }
+	# lookup table from cachefile which contains downloaded URLs.
+	FILENR == 1 {
+		urls[$0] = 1;
+	}
+	# feed file(s): field 2 is the title, 3 the link and 8 the enclosure.
+	FILENR != 1 {
+		download($3, 3, $2, FILENAME); # link
+		download($8, 8, $2, FILENAME); # enclosure
+	}
+	' "${tmp}" "${@:--}" | \
+	SFEED_DOWNLOAD_CHILD="1" xargs -r -0 -L 3 -P "${jobs}" "$(readlink -f "$0")"
+
+- - -
+
 Shellscript to export existing newsboat cached items from sqlite3 to the sfeed
 TSV format.
author	Hiltjo Posthuma <hiltjo@codemadness.org>	2022-01-06 12:30:45 +0100
committer	Hiltjo Posthuma <hiltjo@codemadness.org>	2022-01-06 12:30:45 +0100
commit	f25a3e56e4b8d760469c120a22297f74961c22f3 (patch)
tree	999a7d50a7398c4f5335d5327400fb51ef1f212c
parent	e158bec62365732c0c5d54fa785e34470d56c9f6 (diff)