sfeed_update (5193B) - raw


      1 #!/bin/sh
      2 # update feeds, merge with old feeds.
      3 # NOTE: assumes "sfeed_*" executables are in $PATH.
      4 
      5 # defaults
      6 sfeedpath="$HOME/.sfeed/feeds"
      7 
      8 # used for processing feeds concurrently: wait until ${maxjobs} amount of
      9 # feeds are finished at a time.
     10 maxjobs=8
     11 
     12 # load config (evaluate shellscript).
     13 # loadconfig(configfile)
     14 loadconfig() {
     15 	# allow to specify config via argv[1].
     16 	if [ "$1" != "" ]; then
     17 		# get absolute path of config file required for including.
     18 		config="$1"
     19 		path=$(readlink -f "${config}" 2>/dev/null)
     20 	else
     21 		# default config location.
     22 		config="$HOME/.sfeed/sfeedrc"
     23 		path="${config}"
     24 	fi
     25 
     26 	# config is loaded here to be able to override $sfeedpath or functions.
     27 	if [ -r "${path}" ]; then
     28 		. "${path}"
     29 	else
     30 		echo "Configuration file \"${config}\" cannot be read." >&2
     31 		echo "See sfeedrc.example for an example." >&2
     32 		exit 1
     33 	fi
     34 }
     35 
     36 # log(name, s)
     37 log() {
     38 	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
     39 }
     40 
     41 # fetch a feed via HTTP/HTTPS etc.
     42 # fetch(name, url, feedfile)
     43 fetch() {
     44 	# fail on redirects, hide User-Agent, timeout is 15 seconds.
     45 	curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
     46 		"$2" 2>/dev/null
     47 }
     48 
     49 # convert encoding from one encoding to another.
     50 # convertencoding(name, from, to)
     51 convertencoding() {
     52 	if [ "$2" != "" ] && [ "$3" != "" ] && [ "$2" != "$3" ]; then
     53 		iconv -cs -f "$2" -t "$3" 2> /dev/null
     54 	else
     55 		# else no convert, just output.
     56 		cat
     57 	fi
     58 }
     59 
     60 # parse and convert input, by default XML to the sfeed(5) TSV format.
     61 # parse(name, feedurl, basesiteurl)
     62 parse() {
     63 	sfeed "$3"
     64 }
     65 
     66 # filter fields.
     67 # filter(name)
     68 filter() {
     69 	cat
     70 }
     71 
     72 # merge raw files: unique sort by id, title, link.
     73 # merge(name, oldfile, newfile)
     74 merge() {
     75 	sort -t '	' -u -k6,6 -k2,2 -k3,3 "$2" "$3" 2>/dev/null
     76 }
     77 
     78 # order by timestamp (descending).
     79 # order(name)
     80 order() {
     81 	sort -t '	' -k1rn,1
     82 }
     83 
     84 # internal handler to fetch and process a feed.
     85 # _feed(name, feedurl, [basesiteurl], [encoding])
     86 _feed() {
     87 	name="$1"
     88 	feedurl="$2"
     89 	basesiteurl="$3"
     90 	encoding="$4"
     91 
     92 	filename="$(printf '%s' "${name}" | tr '/' '_')"
     93 	sfeedfile="${sfeedpath}/${filename}"
     94 	tmpfeedfile="${sfeedtmpdir}/${filename}"
     95 
     96 	# if file does not exist yet create it.
     97 	[ -e "${sfeedfile}" ] || touch "${sfeedfile}" 2>/dev/null
     98 
     99 	if ! fetch "${name}" "${feedurl}" "${sfeedfile}" > "${tmpfeedfile}.fetch"; then
    100 		log "${name}" "FAIL (FETCH)"
    101 		return
    102 	fi
    103 
    104 	# try to detect encoding (if not specified). if detecting the encoding fails assume utf-8.
    105 	[ "${encoding}" = "" ] && encoding=$(sfeed_xmlenc < "${tmpfeedfile}.fetch")
    106 
    107 	if ! convertencoding "${name}" "${encoding}" "utf-8" < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.utf8"; then
    108 		log "${name}" "FAIL (ENCODING)"
    109 		return
    110 	fi
    111 	rm -f "${tmpfeedfile}.fetch"
    112 
    113 	# if baseurl is empty then use feedurl.
    114 	if ! parse "${name}" "${feedurl}" "${basesiteurl:-${feedurl}}" < "${tmpfeedfile}.utf8" > "${tmpfeedfile}.tsv"; then
    115 		log "${name}" "FAIL (PARSE)"
    116 		return
    117 	fi
    118 	rm -f "${tmpfeedfile}.utf8"
    119 
    120 	if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
    121 		log "${name}" "FAIL (FILTER)"
    122 		return
    123 	fi
    124 	rm -f "${tmpfeedfile}.tsv"
    125 
    126 	# new feed data is empty: no need for below stages.
    127 	if [ ! -s "${tmpfeedfile}.filter" ]; then
    128 		log "${name}" "OK"
    129 		return
    130 	fi
    131 
    132 	if ! merge "${name}" "${sfeedfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
    133 		log "${name}" "FAIL (MERGE)"
    134 		return
    135 	fi
    136 	rm -f "${tmpfeedfile}.filter"
    137 
    138 	if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
    139 		log "${name}" "FAIL (ORDER)"
    140 		return
    141 	fi
    142 	rm -f "${tmpfeedfile}.merge"
    143 
    144 	# copy
    145 	if ! cp "${tmpfeedfile}.order" "${sfeedfile}"; then
    146 		log "${name}" "FAIL (COPY)"
    147 		return
    148 	fi
    149 	rm -f "${tmpfeedfile}.order"
    150 
    151 	# OK
    152 	log "${name}" "OK"
    153 }
    154 
    155 # fetch and process a feed in parallel.
    156 # feed(name, feedurl, [basesiteurl], [encoding])
    157 feed() {
    158 	# wait until ${maxjobs} are finished: will stall the queue if an item
    159 	# is slow, but it is portable.
    160 	[ ${signo} -ne 0 ] && return
    161 	[ $((curjobs % maxjobs)) -eq 0 ] && wait
    162 	[ ${signo} -ne 0 ] && return
    163 	curjobs=$((curjobs + 1))
    164 
    165 	_feed "$@" &
    166 }
    167 
    168 cleanup() {
    169 	# remove temporary directory with feed files.
    170 	rm -rf "${sfeedtmpdir}"
    171 }
    172 
    173 sighandler() {
    174 	signo="$1"
    175 	# ignore TERM signal for myself.
    176 	trap -- "" TERM
    177 	# kill all running childs >:D
    178 	kill -TERM -$$
    179 }
    180 
    181 feeds() {
    182 	echo "Configuration file \"${config}\" is invalid or does not contain a \"feeds\" function." >&2
    183 	echo "See sfeedrc.example for an example." >&2
    184 }
    185 
    186 main() {
    187 	# job counter.
    188 	curjobs=0
    189 	# signal number received for parent.
    190 	signo=0
    191 	# SIGINT: signal to interrupt parent.
    192 	trap -- "sighandler 2" "INT"
    193 	# SIGTERM: signal to terminate parent.
    194 	trap -- "sighandler 15" "TERM"
    195 	# load config file.
    196 	loadconfig "$1"
    197 	# fetch feeds and store in temporary directory.
    198 	sfeedtmpdir="$(mktemp -d '/tmp/sfeed_XXXXXX')"
    199 	# make sure path exists.
    200 	mkdir -p "${sfeedpath}"
    201 	# fetch feeds specified in config file.
    202 	feeds
    203 	# wait till all feeds are fetched (concurrently).
    204 	[ ${signo} -eq 0 ] && wait
    205 	# cleanup temporary files etc.
    206 	cleanup
    207 	# on signal SIGINT and SIGTERM exit with signal number + 128.
    208 	[ ${signo} -ne 0 ] && exit $((signo+128))
    209 	return 0
    210 }
    211 
    212 [ "${SFEED_UPDATE_INCLUDE}" = "1" ] || main "$@"