diff options
author | Hiltjo Posthuma <hiltjo@codemadness.org> | 2019-04-20 16:01:02 +0200 |
---|---|---|
committer | Hiltjo Posthuma <hiltjo@codemadness.org> | 2019-04-20 16:01:02 +0200 |
commit | eb9e9a957a75069edecbfcc1872930d07e146a1a (patch) | |
tree | 8b49afe7815db0ff5bde9ded948e5d5cec5fe4cb | |
parent | 4aeb397ef388962380cb5ce5b3de48bd22dbfb40 (diff) |
README: update filter example
- Show how to filter protocol schemes more strictly, for example to allow only
http://, https:// and gopher:// (not file://, javascript:, etc.).
- Filter links and now also enclosures.
-rw-r--r-- | README | 26 |
1 files changed, 18 insertions, 8 deletions
@@ -260,20 +260,30 @@ filter() {
 		sed 's@www.youtube.com/watch?v=@www.youtube.com/embed/@g' | \
 
 		LC_LOCALE=C awk -F ' ' 'BEGIN { OFS = " "; }
-		{
+		function filterlink(s) { # returns the cleaned-up URL s, or "" when its scheme is not allowed.
+			# protocol must start with http, https or gopher.
+			if (match(s, /^(http|https|gopher):\/\//) == 0) {
+				return "";
+			}
+
 			# shorten feedburner links.
-			if (match($3, /^(http|https):\/\/[^/]+\/~r\/.*\/~3\/[^\/]+\//)) {
-				$3 = substr($3, RSTART, RLENGTH);
+			if (match(s, /^(http|https):\/\/[^/]+\/~r\/.*\/~3\/[^\/]+\//)) {
+				s = substr(s, RSTART, RLENGTH);
 			}
 
 			# strip tracking parameters
-
 			# urchin, facebook, piwik, webtrekk and generic.
-			gsub(/\?(ad|campaign|pk|tm|wt)_([^&]+)/, "?", $3);
-			gsub(/&(ad|campaign|pk|tm|wt)_([^&]+)/, "", $3);
+			gsub(/\?(ad|campaign|pk|tm|wt)_([^&]+)/, "?", s);
+			gsub(/&(ad|campaign|pk|tm|wt)_([^&]+)/, "", s);
+
+			gsub(/\?&/, "?", s);
+			gsub(/[\?&]+$/, "", s);
 
-			gsub(/\?&/, "?", $3);
-			gsub(/[\?&]+$/, "", $3);
+			return s;
+		}
+		{
+			$3 = filterlink($3); # link
+			$8 = filterlink($8); # enclosure
 
 			print $0;
 		}' |