From cc9f0d5549b21bb6254aede2ff479698183ea5e3 Mon Sep 17 00:00:00 2001 From: Hiltjo Posthuma Date: Fri, 28 Sep 2018 17:11:56 +0200 Subject: sfeed_update: add filter(), order() support per feed + improvements Pass the name parameter to the functions and add these to the pipeline. They can be overridden in the config. - add the ability to change the merge logic per feed. - add the ability to filter lines and fields per feed. - add the ability to order lines differently per feed. - add filter example to README. - code-style: - fetchfeed consistency in parameter order. - change [ x"" = x"" ] to [ "" = "" ]. Simplify some if statements. - wrap long line in fetchfeed(). - use signal names for trap. --- README | 60 ++++++++++++++++++++++++++++++++++++++++++----- sfeed_update | 73 +++++++++++++++++++++++++++++++++------------------------- sfeed_update.1 | 29 ++++++++++++++++------- 3 files changed, 117 insertions(+), 45 deletions(-) diff --git a/README b/README index a98eb0b..5c7dab1 100644 --- a/README +++ b/README @@ -127,12 +127,18 @@ Files read at runtime by sfeed_update(1) ---------------------------------------- sfeedrc - Config file. This file is evaluated as a shellscript in - sfeed_update(1). You can for example override the fetchfeed() - function to use wget(1), OpenBSD ftp(1) an other download program or - you can override the merge() function to change the merge logic. The - function feeds() is called to fetch the feeds. The function feed() - can safely be executed concurrently as a background job in your - sfeedrc(5) config file to make updating faster. + sfeed_update(1). + +At least the following functions can be overridden per feed: + +- fetchfeed: to use wget(1), OpenBSD ftp(1) or another download program. +- merge: to change the merge logic. +- filter: to filter on fields. +- order: to change the sort order. + +The function feeds() is called to fetch the feeds. 
The function feed() can +safely be executed concurrently as a background job in your sfeedrc(5) config +file to make updating faster. Files written at runtime by sfeed_update(1) @@ -212,6 +218,48 @@ argument is optional): - - - +# filter fields. +# filter(name) +filter() { + case "$1" in + "tweakers") + LC_LOCALE=C awk -F ' ' 'BEGIN { + OFS = " "; + } + # skip ads. + $2 ~ /^ADV:/ { + next; + } + # shorten link. + { + if (match($3, /^https:\/\/tweakers\.net\/(nieuws|downloads|reviews|geek)\/[0-9]+\//)) { + $3 = substr($3, RSTART, RLENGTH); + } + print $0; + }';; + "yt BSDNow") + # filter only BSD Now from channel. + LC_LOCALE=C awk -F ' ' '$2 ~ / \| BSD Now/';; + *) + cat;; + esac | \ + # replace youtube links with embed links. + sed 's@www.youtube.com/watch?v=@www.youtube.com/embed/@g' | \ + # try to strip utm_ tracking parameters. + LC_LOCALE=C awk -F ' ' 'BEGIN { + OFS = " "; + } + { + gsub(/\?utm_([^&]+)/, "?", $3); + gsub(/&utm_([^&]+)/, "", $3); + gsub(/\?&/, "?", $3); + gsub(/[\?&]+$/, "", $3); + print $0; + }' +} + +- - - + Over time your feeds file might become quite big. You can archive items from a specific date by doing for example: diff --git a/sfeed_update b/sfeed_update index 8ac5b32..2b23d3d 100755 --- a/sfeed_update +++ b/sfeed_update @@ -9,7 +9,7 @@ sfeedpath="$HOME/.sfeed/feeds" # loadconfig(configfile) loadconfig() { # allow to specify config via argv[1]. - if [ ! x"$1" = x"" ]; then + if [ "$1" != "" ]; then # get absolute path of config file. config=$(readlink -f "$1") else @@ -17,8 +17,7 @@ loadconfig() { config="$HOME/.sfeed/sfeedrc" fi - # load config: config is loaded here to be able to override $sfeedpath - # or functions. + # config is loaded here to be able to override $sfeedpath or functions. if [ -r "${config}" ]; then . "${config}" else @@ -28,30 +27,11 @@ loadconfig() { fi } -# merge raw files. -# merge(oldfile, newfile) -merge() { - # unique sort by id, title, link. - # order by timestamp (desc). 
- (sort -t ' ' -u -k6,6 -k2,2 -k3,3 "$1" "$2" 2>/dev/null) | - sort -t ' ' -k1rn,1 -} - -# fetch a feed via HTTP/HTTPS etc. -# fetchfeed(url, name, feedfile) -fetchfeed() { - if curl -L --max-redirs 0 -H 'User-Agent:' -f -s -S -m 15 -z "$3" "$1" 2>/dev/null; then - printf " OK %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2 - else - printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2 - fi -} - # convert encoding from one encoding to another. # convertencoding(from, to) convertencoding() { # if from != to - if [ ! "$1" = "" ] && [ ! "$2" = "" ] && [ ! "$1" = "$2" ]; then + if [ "$1" != "" ] && [ "$2" != "" ] && [ "$1" != "$2" ]; then iconv -cs -f "$1" -t "$2" 2> /dev/null else # else no convert, just output @@ -59,6 +39,35 @@ convertencoding() { fi } +# merge raw files: unique sort by id, title, link. +# merge(name, oldfile, newfile) +merge() { + sort -t ' ' -u -k6,6 -k2,2 -k3,3 "$2" "$3" 2>/dev/null +} + +# filter fields. +# filter(name) +filter() { + cat +} + +# order by timestamp (descending). +# order(name) +order() { + sort -t ' ' -k1rn,1 +} + +# fetch a feed via HTTP/HTTPS etc. +# fetchfeed(name, url, feedfile) +fetchfeed() { + if curl -L --max-redirs 0 -H "User-Agent:" -f -s -S -m 15 \ + -z "$3" "$2" 2>/dev/null; then + printf " OK %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2 + else + printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2 + fi +} + # fetch and parse feed. # feed(name, feedurl, [basesiteurl], [encoding]) feed() { @@ -72,14 +81,14 @@ feed() { sfeedfile="${sfeedpath}/${filename}" if [ ! "${encoding}" = "" ]; then - fetchfeed "${feedurl}" "${name}" "${sfeedfile}" | \ + fetchfeed "${name}" "${feedurl}" "${sfeedfile}" | \ convertencoding "${encoding}" "utf-8" else # detect encoding. 
tmpencfile="${tmpfeedfile}.enc" - fetchfeed "${feedurl}" "${name}" "${sfeedfile}" > "${tmpencfile}" + fetchfeed "${name}" "${feedurl}" "${sfeedfile}" > "${tmpencfile}" detectenc=$(sfeed_xmlenc < "${tmpencfile}") convertencoding "${detectenc}" "utf-8" < "${tmpencfile}" - fi | sfeed "${basesiteurl}" > "${tmpfeedfile}" + fi | sfeed "${basesiteurl}" | filter "${name}" > "${tmpfeedfile}" # get new data and merge with old. sfeedfilenew="${sfeedpath}/${filename}.new" @@ -87,18 +96,20 @@ feed() { if [ -s "${tmpfeedfile}" ]; then # if file exists, merge if [ -e "${sfeedfile}" ]; then - merge "${sfeedfile}" "${tmpfeedfile}" > "${sfeedfilenew}" + merge "${name}" "${sfeedfile}" "${tmpfeedfile}" | \ + order "${name}" > "${sfeedfilenew}" # overwrite old file with updated file mv "${sfeedfilenew}" "${sfeedfile}" else - merge "/dev/null" "${tmpfeedfile}" > "${sfeedfile}" + merge "${name}" "/dev/null" "${tmpfeedfile}" | \ + order "${name}" > "${sfeedfile}" fi fi) & } cleanup() { - # remove temporary files + # remove temporary files. rm -rf "${sfeedtmpdir}" } @@ -114,9 +125,9 @@ feeds() { # kill whole current process group on ^C (SIGINT). isinterrupted="0" # SIGTERM: signal to terminate parent. -trap -- "interrupted" "15" +trap -- "interrupted" "TERM" # SIGINT: kill all running childs >:D -trap -- "kill -TERM -$$" "2" +trap -- "kill -TERM -$$" "INT" # load config file. loadconfig "$1" # fetch feeds and store in temporary file. diff --git a/sfeed_update.1 b/sfeed_update.1 index 622e08c..2260de0 100644 --- a/sfeed_update.1 +++ b/sfeed_update.1 @@ -1,4 +1,4 @@ -.Dd August 5, 2015 +.Dd September 28, 2018 .Dt SFEED_UPDATE 1 .Os .Sh NAME @@ -29,15 +29,28 @@ section for more information. Config file, see the sfeedrc.example file for an example. This file is evaluated as a shellscript in .Nm . 
-You can for example override the fetchfeed() function to -use -.Xr curl 1 , +.Pp +At least the following functions can be overridden per feed: +.Bl -tag -width 17n +.It fetchfeed +to use .Xr wget 1 , -or an other network downloader or you can override the merge() function to -change the merge logic. +OpenBSD +.Xr ftp 1 +or another download program. +.It merge +to change the merge logic. +.It filter +to filter on fields. +.It order +to change the sort order. +.El +.Pp The function feeds() is called to fetch the feeds. -By default the function feed() is executed concurrently as a background job to -speedup updating. +The function feed() can safely be executed concurrently as a background job in +your +.Xr sfeedrc 5 +config file to make updating faster. .El .Sh FILES WRITTEN .Bl -tag -width 17n -- cgit v1.2.3