diff options
author | Benjamin Chausse <benjamin@chausse.xyz> | 2024-08-09 14:11:50 -0400 |
---|---|---|
committer | Benjamin Chausse <benjamin@chausse.xyz> | 2024-08-09 14:11:50 -0400 |
commit | 5857d82e8e596d6fda406a0c4d8d68ca7a03c124 (patch) | |
tree | 553916894dee907825360580c5d9a05c82c5af16 /sfeed_update | |
parent | 3574e3cbf9d99546e868aeb995ce2c171cdc36a6 (diff) | |
parent | 19957bc272e745af7b56b79fa648e8b6b77113b1 (diff) |
Diffstat (limited to 'sfeed_update')
-rwxr-xr-x | sfeed_update | 139 |
1 files changed, 89 insertions, 50 deletions
diff --git a/sfeed_update b/sfeed_update index ba9e242..fd468a5 100755 --- a/sfeed_update +++ b/sfeed_update @@ -7,7 +7,7 @@ sfeedpath="$HOME/.sfeed/feeds" # used for processing feeds concurrently: wait until ${maxjobs} amount of # feeds are finished at a time. -maxjobs=8 +maxjobs=16 # load config (evaluate shellscript). # loadconfig(configfile) @@ -16,26 +16,33 @@ loadconfig() { if [ "$1" != "" ]; then # get absolute path of config file required for including. config="$1" - path=$(readlink -f "${config}" 2>/dev/null) + configpath=$(readlink -f "${config}" 2>/dev/null) else # default config location. config="$HOME/.sfeed/sfeedrc" - path="${config}" + configpath="${config}" fi # config is loaded here to be able to override $sfeedpath or functions. - if [ -r "${path}" ]; then - . "${path}" + if [ -r "${configpath}" ] && [ -f "${configpath}" ]; then + . "${configpath}" else - echo "Configuration file \"${config}\" cannot be read." >&2 - echo "See sfeedrc.example for an example." >&2 - exit 1 + printf "Configuration file \"%s\" cannot be read.\n" "${config}" >&2 + echo "See the sfeedrc.example file or the sfeedrc(5) man page for an example." >&2 + die fi } # log(name, s) log() { + printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" +} + +# log_error(name, s) +log_error() { printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2 + # set error exit status indicator for parallel jobs. + rm -f "${sfeedtmpdir}/ok" } # fetch a feed via HTTP/HTTPS etc. @@ -64,7 +71,7 @@ parse() { } # filter fields. -# filter(name) +# filter(name, url) filter() { cat } @@ -76,9 +83,9 @@ merge() { } # order by timestamp (descending). -# order(name) +# order(name, url) order() { - sort -t ' ' -k1rn,1 + sort -t ' ' -k1rn,1 2>/dev/null } # internal handler to fetch and process a feed. @@ -91,101 +98,111 @@ _feed() { filename="$(printf '%s' "${name}" | tr '/' '_')" sfeedfile="${sfeedpath}/${filename}" - tmpfeedfile="${sfeedtmpdir}/${filename}" + tmpfeedfile="${sfeedtmpdir}/feeds/${filename}" # if file does not exist yet create it. [ -e "${sfeedfile}" ] || touch "${sfeedfile}" 2>/dev/null if ! fetch "${name}" "${feedurl}" "${sfeedfile}" > "${tmpfeedfile}.fetch"; then - log "${name}" "FAIL (FETCH)" - return + log_error "${name}" "FAIL (FETCH)" + return 1 fi # try to detect encoding (if not specified). if detecting the encoding fails assume utf-8. [ "${encoding}" = "" ] && encoding=$(sfeed_xmlenc < "${tmpfeedfile}.fetch") if ! convertencoding "${name}" "${encoding}" "utf-8" < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.utf8"; then - log "${name}" "FAIL (ENCODING)" - return + log_error "${name}" "FAIL (ENCODING)" + return 1 fi rm -f "${tmpfeedfile}.fetch" # if baseurl is empty then use feedurl. if ! parse "${name}" "${feedurl}" "${basesiteurl:-${feedurl}}" < "${tmpfeedfile}.utf8" > "${tmpfeedfile}.tsv"; then - log "${name}" "FAIL (PARSE)" - return + log_error "${name}" "FAIL (PARSE)" + return 1 fi rm -f "${tmpfeedfile}.utf8" - if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then - log "${name}" "FAIL (FILTER)" - return + if ! filter "${name}" "${feedurl}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then + log_error "${name}" "FAIL (FILTER)" + return 1 fi rm -f "${tmpfeedfile}.tsv" # new feed data is empty: no need for below stages. if [ ! -s "${tmpfeedfile}.filter" ]; then log "${name}" "OK" - return + return 0 fi if ! merge "${name}" "${sfeedfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then - log "${name}" "FAIL (MERGE)" - return + log_error "${name}" "FAIL (MERGE)" + return 1 fi rm -f "${tmpfeedfile}.filter" - if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then - log "${name}" "FAIL (ORDER)" - return + if ! order "${name}" "${feedurl}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then + log_error "${name}" "FAIL (ORDER)" + return 1 fi rm -f "${tmpfeedfile}.merge" # copy if ! cp "${tmpfeedfile}.order" "${sfeedfile}"; then - log "${name}" "FAIL (COPY)" - return + log_error "${name}" "FAIL (COPY)" + return 1 fi rm -f "${tmpfeedfile}.order" # OK log "${name}" "OK" + return 0 } # fetch and process a feed in parallel. # feed(name, feedurl, [basesiteurl], [encoding]) feed() { - # wait until ${maxjobs} are finished: will stall the queue if an item - # is slow, but it is portable. - [ ${signo} -ne 0 ] && return - [ $((curjobs % maxjobs)) -eq 0 ] && wait - [ ${signo} -ne 0 ] && return - curjobs=$((curjobs + 1)) - - _feed "$@" & + # Output job parameters for xargs. + # Specify fields as a single parameter separated by a NUL byte. + # The parameter is split into fields later by the child process, this + # allows using xargs with empty fields across many implementations. + printf '%s\037%s\037%s\037%s\037%s\037%s\0' \ + "${config}" "${sfeedtmpdir}" "$1" "$2" "$3" "$4" } +# cleanup() cleanup() { # remove temporary directory with feed files. rm -rf "${sfeedtmpdir}" } +# die(statuscode) +die() { + statuscode="${1:-1}" # default: exit 1 + # cleanup temporary files etc. + cleanup + exit "${statuscode}" +} + +# sighandler(signo) sighandler() { signo="$1" # ignore TERM signal for myself. trap -- "" TERM - # kill all running childs >:D + # kill all running children >:D kill -TERM -$$ } +# feeds() feeds() { - echo "Configuration file \"${config}\" is invalid or does not contain a \"feeds\" function." >&2 + printf "Configuration file \"%s\" is invalid or does not contain a \"feeds\" function.\n" "${config}" >&2 echo "See sfeedrc.example for an example." >&2 + die } +# main(args...) main() { - # job counter. - curjobs=0 # signal number received for parent. signo=0 # SIGINT: signal to interrupt parent. @@ -195,18 +212,40 @@ main() { # load config file. loadconfig "$1" # fetch feeds and store in temporary directory. - sfeedtmpdir="$(mktemp -d '/tmp/sfeed_XXXXXX')" + sfeedtmpdir="$(mktemp -d "${TMPDIR:-/tmp}/sfeed_XXXXXX")" || die + mkdir -p "${sfeedtmpdir}/feeds" + touch "${sfeedtmpdir}/ok" || die # make sure path exists. mkdir -p "${sfeedpath}" - # fetch feeds specified in config file. - feeds - # wait till all feeds are fetched (concurrently). - [ ${signo} -eq 0 ] && wait - # cleanup temporary files etc. - cleanup + + # print feeds for parallel processing with xargs. + feeds > "${sfeedtmpdir}/jobs" || die + SFEED_UPDATE_CHILD="1" xargs -x -0 -P "${maxjobs}" -n 1 \ + "$(readlink -f "${argv0}")" < "${sfeedtmpdir}/jobs" + statuscode=$? + + # check error exit status indicator for parallel jobs. + [ -f "${sfeedtmpdir}/ok" ] || statuscode=1 # on signal SIGINT and SIGTERM exit with signal number + 128. - [ ${signo} -ne 0 ] && exit $((signo+128)) - return 0 + [ ${signo} -ne 0 ] && die $((signo+128)) + die ${statuscode} } +# process a single feed. +# parameters are: config, tmpdir, name, feedurl, basesiteurl, encoding +if [ "${SFEED_UPDATE_CHILD}" = "1" ]; then + IFS="" # "\037" + [ "$1" = "" ] && exit 0 # must have an argument set + printf '%s\n' "$1" | \ + while read -r _config _tmpdir _name _feedurl _basesiteurl _encoding; do + loadconfig "${_config}" + sfeedtmpdir="${_tmpdir}" + _feed "${_name}" "${_feedurl}" "${_basesiteurl}" "${_encoding}" + exit "$?" + done + exit 0 +fi + +# ...else parent mode: +argv0="$0" # store $0, in the zsh shell $0 is the name of the function. [ "${SFEED_UPDATE_INCLUDE}" = "1" ] || main "$@" |