author    Benjamin Chausse <benjamin@chausse.xyz>  2024-08-09 14:11:50 -0400
committer Benjamin Chausse <benjamin@chausse.xyz>  2024-08-09 14:11:50 -0400
commit    5857d82e8e596d6fda406a0c4d8d68ca7a03c124 (patch)
tree      553916894dee907825360580c5d9a05c82c5af16 /sfeed_update
parent    3574e3cbf9d99546e868aeb995ce2c171cdc36a6 (diff)
parent    19957bc272e745af7b56b79fa648e8b6b77113b1 (diff)
Merge remote-tracking branch 'upstream/master' (HEAD, master)
Diffstat (limited to 'sfeed_update')
-rwxr-xr-x  sfeed_update  139
1 file changed, 89 insertions(+), 50 deletions(-)
diff --git a/sfeed_update b/sfeed_update
index ba9e242..fd468a5 100755
--- a/sfeed_update
+++ b/sfeed_update
@@ -7,7 +7,7 @@ sfeedpath="$HOME/.sfeed/feeds"
# used for processing feeds concurrently: wait until ${maxjobs}
# feeds have finished at a time.
-maxjobs=8
+maxjobs=16
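
Because the sfeedrc config file is sourced after this default is assigned, ${maxjobs} can also be tuned per machine from the config instead of patching the script. A minimal sketch of such an override (the value 4 is only an example):

    # in $HOME/.sfeed/sfeedrc: limit concurrent fetches on a slow link.
    maxjobs=4
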
# load config (evaluate shellscript).
# loadconfig(configfile)
@@ -16,26 +16,33 @@ loadconfig() {
if [ "$1" != "" ]; then
# get absolute path of config file required for including.
config="$1"
- path=$(readlink -f "${config}" 2>/dev/null)
+ configpath=$(readlink -f "${config}" 2>/dev/null)
else
# default config location.
config="$HOME/.sfeed/sfeedrc"
- path="${config}"
+ configpath="${config}"
fi
# config is loaded here to be able to override $sfeedpath or functions.
- if [ -r "${path}" ]; then
- . "${path}"
+ if [ -r "${configpath}" ] && [ -f "${configpath}" ]; then
+ . "${configpath}"
else
- echo "Configuration file \"${config}\" cannot be read." >&2
- echo "See sfeedrc.example for an example." >&2
- exit 1
+ printf "Configuration file \"%s\" cannot be read.\n" "${config}" >&2
+ echo "See the sfeedrc.example file or the sfeedrc(5) man page for an example." >&2
+ die
fi
}
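
For context: the file that loadconfig() sources is a plain shell script which must define a feeds() function listing the feeds to fetch. A minimal sketch of an sfeedrc (the feed name and URL are placeholders):

    # $HOME/.sfeed/sfeedrc (sketch)
    feeds() {
    	# feed <name> <feedurl> [basesiteurl] [encoding]
    	feed "example" "https://example.org/atom.xml"
    }
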
# log(name, s)
log() {
+ printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2"
+}
+
+# log_error(name, s)
+log_error() {
printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
+ # set error exit status indicator for parallel jobs.
+ rm -f "${sfeedtmpdir}/ok"
}
# fetch a feed via HTTP/HTTPS etc.
@@ -64,7 +71,7 @@ parse() {
}
# filter fields.
-# filter(name)
+# filter(name, url)
filter() {
cat
}
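
The added url parameter lets a filter() override in sfeedrc branch on the feed URL as well as the feed name. A sketch of such an override (the feed name and keyword are hypothetical); sfeed lines are TAB-separated with the item title in field 2:

    filter() {
    	case "$1" in
    	"example") # hypothetical feed name
    		# drop items whose title contains a keyword.
    		awk -F '\t' 'tolower($2) !~ /sponsored/';;
    	*)
    		cat;;
    	esac
    }
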
@@ -76,9 +83,9 @@ merge() {
}
# order by timestamp (descending).
-# order(name)
+# order(name, url)
order() {
- sort -t '	' -k1rn,1
+ sort -t '	' -k1rn,1 2>/dev/null
}
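
order() sorts on the first TAB-separated field, the UNIX timestamp, in descending order (newest first). An sfeedrc override now receives the same (name, url) pair; a sketch that sorts one hypothetical feed oldest-first instead:

    order() {
    	tab="$(printf '\t')"
    	case "$1" in
    	"example") # hypothetical feed name
    		sort -t "${tab}" -k1n,1 2>/dev/null;;
    	*)
    		sort -t "${tab}" -k1rn,1 2>/dev/null;;
    	esac
    }
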
# internal handler to fetch and process a feed.
@@ -91,101 +98,111 @@ _feed() {
filename="$(printf '%s' "${name}" | tr '/' '_')"
sfeedfile="${sfeedpath}/${filename}"
- tmpfeedfile="${sfeedtmpdir}/${filename}"
+ tmpfeedfile="${sfeedtmpdir}/feeds/${filename}"
# if file does not exist yet create it.
[ -e "${sfeedfile}" ] || touch "${sfeedfile}" 2>/dev/null
if ! fetch "${name}" "${feedurl}" "${sfeedfile}" > "${tmpfeedfile}.fetch"; then
- log "${name}" "FAIL (FETCH)"
- return
+ log_error "${name}" "FAIL (FETCH)"
+ return 1
fi
# try to detect encoding (if not specified). if detecting the encoding fails assume utf-8.
[ "${encoding}" = "" ] && encoding=$(sfeed_xmlenc < "${tmpfeedfile}.fetch")
if ! convertencoding "${name}" "${encoding}" "utf-8" < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.utf8"; then
- log "${name}" "FAIL (ENCODING)"
- return
+ log_error "${name}" "FAIL (ENCODING)"
+ return 1
fi
rm -f "${tmpfeedfile}.fetch"
# if baseurl is empty then use feedurl.
if ! parse "${name}" "${feedurl}" "${basesiteurl:-${feedurl}}" < "${tmpfeedfile}.utf8" > "${tmpfeedfile}.tsv"; then
- log "${name}" "FAIL (PARSE)"
- return
+ log_error "${name}" "FAIL (PARSE)"
+ return 1
fi
rm -f "${tmpfeedfile}.utf8"
- if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
- log "${name}" "FAIL (FILTER)"
- return
+ if ! filter "${name}" "${feedurl}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
+ log_error "${name}" "FAIL (FILTER)"
+ return 1
fi
rm -f "${tmpfeedfile}.tsv"
# new feed data is empty: no need for below stages.
if [ ! -s "${tmpfeedfile}.filter" ]; then
log "${name}" "OK"
- return
+ return 0
fi
if ! merge "${name}" "${sfeedfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
- log "${name}" "FAIL (MERGE)"
- return
+ log_error "${name}" "FAIL (MERGE)"
+ return 1
fi
rm -f "${tmpfeedfile}.filter"
- if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
- log "${name}" "FAIL (ORDER)"
- return
+ if ! order "${name}" "${feedurl}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
+ log_error "${name}" "FAIL (ORDER)"
+ return 1
fi
rm -f "${tmpfeedfile}.merge"
# copy
if ! cp "${tmpfeedfile}.order" "${sfeedfile}"; then
- log "${name}" "FAIL (COPY)"
- return
+ log_error "${name}" "FAIL (COPY)"
+ return 1
fi
rm -f "${tmpfeedfile}.order"
# OK
log "${name}" "OK"
+ return 0
}
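
Each stage of _feed() reads the previous stage's temporary file and writes the next one, removing its input on success; only after every stage succeeds is the result copied over the real feed file. The flow, as a sketch:

    file.fetch -> (convertencoding) -> file.utf8 -> (parse) -> file.tsv
      -> (filter) -> file.filter -> (merge) -> file.merge
      -> (order) -> file.order -> (cp) -> sfeedfile
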
# fetch and process a feed in parallel.
# feed(name, feedurl, [basesiteurl], [encoding])
feed() {
- # wait until ${maxjobs} are finished: will stall the queue if an item
- # is slow, but it is portable.
- [ ${signo} -ne 0 ] && return
- [ $((curjobs % maxjobs)) -eq 0 ] && wait
- [ ${signo} -ne 0 ] && return
- curjobs=$((curjobs + 1))
-
- _feed "$@" &
+ # Output job parameters for xargs.
+ # Specify fields as a single parameter separated by a NUL byte.
+ # The parameter is split into fields later by the child process; this
+ # allows using xargs with empty fields across many implementations.
+ printf '%s\037%s\037%s\037%s\037%s\037%s\0' \
+ "${config}" "${sfeedtmpdir}" "$1" "$2" "$3" "$4"
}
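
feed() no longer forks a job itself: it serializes each feed as one record, fields joined by the ASCII unit separator (\037) and terminated by a NUL byte, so xargs -0 later hands each record to a worker as a single argument even when optional fields such as basesiteurl are empty. A standalone sketch of the round trip (worker.sh and the values are hypothetical):

    # emit two job records...
    printf '%s\037%s\0' "name1" "https://one.example" > jobs
    printf '%s\037%s\0' "name2" "https://two.example" >> jobs
    # ...then run at most 2 workers, one record per invocation.
    xargs -0 -P 2 -n 1 ./worker.sh < jobs
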
+# cleanup()
cleanup() {
# remove temporary directory with feed files.
rm -rf "${sfeedtmpdir}"
}
+# die(statuscode)
+die() {
+ statuscode="${1:-1}" # default: exit 1
+ # cleanup temporary files etc.
+ cleanup
+ exit "${statuscode}"
+}
+
+# sighandler(signo)
sighandler() {
signo="$1"
# ignore TERM signal for myself.
trap -- "" TERM
- # kill all running childs >:D
+ # kill all running children >:D
kill -TERM -$$
}
+# feeds()
feeds() {
- echo "Configuration file \"${config}\" is invalid or does not contain a \"feeds\" function." >&2
+ printf "Configuration file \"%s\" is invalid or does not contain a \"feeds\" function.\n" "${config}" >&2
echo "See sfeedrc.example for an example." >&2
+ die
}
+# main(args...)
main() {
- # job counter.
- curjobs=0
# signal number received for parent.
signo=0
# SIGINT: signal to interrupt parent.
@@ -195,18 +212,40 @@ main() {
# load config file.
loadconfig "$1"
# fetch feeds and store in temporary directory.
- sfeedtmpdir="$(mktemp -d '/tmp/sfeed_XXXXXX')"
+ sfeedtmpdir="$(mktemp -d "${TMPDIR:-/tmp}/sfeed_XXXXXX")" || die
+ mkdir -p "${sfeedtmpdir}/feeds"
+ touch "${sfeedtmpdir}/ok" || die
# make sure path exists.
mkdir -p "${sfeedpath}"
- # fetch feeds specified in config file.
- feeds
- # wait till all feeds are fetched (concurrently).
- [ ${signo} -eq 0 ] && wait
- # cleanup temporary files etc.
- cleanup
+
+ # print feeds for parallel processing with xargs.
+ feeds > "${sfeedtmpdir}/jobs" || die
+ SFEED_UPDATE_CHILD="1" xargs -x -0 -P "${maxjobs}" -n 1 \
+ "$(readlink -f "${argv0}")" < "${sfeedtmpdir}/jobs"
+ statuscode=$?
+
+ # check error exit status indicator for parallel jobs.
+ [ -f "${sfeedtmpdir}/ok" ] || statuscode=1
# on signal SIGINT and SIGTERM exit with signal number + 128.
- [ ${signo} -ne 0 ] && exit $((signo+128))
- return 0
+ [ ${signo} -ne 0 ] && die $((signo+128))
+ die ${statuscode}
}
+# process a single feed.
+# parameters are: config, tmpdir, name, feedurl, basesiteurl, encoding
+if [ "${SFEED_UPDATE_CHILD}" = "1" ]; then
+ IFS="$(printf '\037')" # "\037": the field separator is the unit separator byte
+ [ "$1" = "" ] && exit 0 # must have an argument set
+ printf '%s\n' "$1" | \
+ while read -r _config _tmpdir _name _feedurl _basesiteurl _encoding; do
+ loadconfig "${_config}"
+ sfeedtmpdir="${_tmpdir}"
+ _feed "${_name}" "${_feedurl}" "${_basesiteurl}" "${_encoding}"
+ exit "$?"
+ done
+ exit 0
+fi
+
+# ...else parent mode:
+argv0="$0" # store $0; in the zsh shell $0 is the name of the function.
[ "${SFEED_UPDATE_INCLUDE}" = "1" ] || main "$@"