README: newsboat sqlite3 export script: improvements

- Export read/unread state to a separate plain-text "urls" file, line by line.
- Handle white-space control-chars better. From the sfeed(1) man page: "The fields: title, id, author are not allowed to have newlines and TABs, all whitespace characters are replaced by a single space character. Control characters are removed." So do the reverse for newsboat as well: change white-space characters which are also control-characters (such as TABs and newlines) to a single space character.
author: Hiltjo Posthuma <hiltjo@codemadness.org> 2021-01-15 18:31:52 +0100
committer: Hiltjo Posthuma <hiltjo@codemadness.org> 2021-01-16 12:26:16 +0100
commit: 7feab0fd885c1c93344fd427904eae91943493b7 (patch)
tree: e183706253651adc9dabf05a7d715392a906e2ac
parent: f18f4818ed2c992aa9b7b91c74bb9ce7cc1bc745 (diff)
1 files changed, 22 insertions, 9 deletions
diff --git a/README b/README
index 586a25e..2bf3dcb 100644
--- a/README
+++ b/README
@@ -628,10 +628,12 @@ sfeedrc file and change the curl options "-L --max-redirs 0".
 
 Shellscript to export existing newsboat cached items from sqlite3 to the sfeed
 TSV format.
-	
+
 	#!/bin/sh
 	# Export newsbeuter/newsboat cached items from sqlite3 to the sfeed TSV format.
 	# The data is split per file per feed with the name of the newsboat title/url.
+	# It writes the urls of the read items line by line to a "urls" file.
+	#
 	# Dependencies: sqlite3, awk.
 	#
 	# Usage: create some directory to store the feeds, run this script.
@@ -653,8 +655,8 @@ TSV format.
 	SELECT
 		i.pubDate, i.title, i.url, i.content, i.guid, i.author,
 		i.enclosure_url,
-		f.rssurl AS rssurl, f.title AS feedtitle --,
-		-- i.id, i.unread, i.enclosure_type, i.enqueued, i.flags, i.deleted,
+		f.rssurl AS rssurl, f.title AS feedtitle, i.unread --,
+		-- i.id, i.enclosure_type, i.enqueued, i.flags, i.deleted,
 		-- i.base
 	FROM rss_feed f
 	INNER JOIN rss_item i ON i.feedurl = f.rssurl
@@ -668,13 +670,19 @@ TSV format.
 		FS = "\x1f";
 		RS = "\x1e";
 	}
-	# strip all control-chars for normal fields.
-	function strip(s) {
+	# normal non-content fields.
+	function field(s) {
+		gsub("^[[:space:]]*", "", s);
+		gsub("[[:space:]]*$", "", s);
+		gsub("[[:space:]]", " ", s);
 		gsub("[[:cntrl:]]", "", s);
 		return s;
 	}
-	# escape chars in content field.
-	function escape(s) {
+	# content field.
+	function content(s) {
+		gsub("^[[:space:]]*", "", s);
+		gsub("[[:space:]]*$", "", s);
+		# escape chars in content field.
 		gsub("\\\\", "\\\\", s);
 		gsub("\n", "\\n", s);
 		gsub("\t", "\\t", s);
@@ -690,9 +698,14 @@ TSV format.
 			print "Writing file: \"" fname "\" (title: " $9 ", url: " $8 ")" > "/dev/stderr";
 		}
 	
-		print $1 "\t" strip($2) "\t" strip($3) "\t" escape($4) "\t" \
-			"html" "\t" strip($5) "\t" strip($6) "\t" strip($7) \
+		print $1 "\t" field($2) "\t" field($3) "\t" content($4) "\t" \
+			"html" "\t" field($5) "\t" field($6) "\t" field($7) \
 			> fname;
+	
+		# write urls of the read items to a file line by line.
+		if ($10 == "0") {
+			print $3 > "urls";
+		}
 	}'
author	Hiltjo Posthuma <hiltjo@codemadness.org>	2021-01-15 18:31:52 +0100
committer	Hiltjo Posthuma <hiltjo@codemadness.org>	2021-01-16 12:26:16 +0100
commit	7feab0fd885c1c93344fd427904eae91943493b7 (patch)
tree	e183706253651adc9dabf05a7d715392a906e2ac
parent	f18f4818ed2c992aa9b7b91c74bb9ce7cc1bc745 (diff)