summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Hiltjo Posthuma <hiltjo@codemadness.org> 2021-01-08 19:08:59 +0100
committer Hiltjo Posthuma <hiltjo@codemadness.org> 2021-01-08 19:33:29 +0100
commit 04b832539cd5b5392c56ef238ec9b42b689de3ae (patch)
tree f934b2769023a800ee0befe7608bdf70642fd39f
parent c7e3ec5f37738c43b3918cba6977fa51631a23af (diff)
util.c: printutf8pad(): improve padded printing and printing invalid unicode characters
This affects sfeed_plain. - Use the unicode replacement character (codepoint 0xfffd) when a codepoint is invalid and proceed printing the rest of the characters. - When a codepoint is invalid, reset the internal state of mbtowc(3), from the OpenBSD man page: " If a call to mbtowc() resulted in an undefined internal state, mbtowc() must be called with s set to NULL to reset the internal state before it can safely be used again." - Optimize for the common ASCII case and use a macro to print the character instead of a wasteful fwrite() function call. With 250k lines (approx. 350MB) this improves printing performance from 1.7s to 1.0s on my laptop. On another system it improved by approx. 25%. Tested with clang and gcc and also tested the worst-case (non-ASCII) with no penalty. To test: printf '0\tabc\xc3 def' | sfeed_plain Before: 1970-01-01 01:00 abc After: 1970-01-01 01:00 abc� def
-rw-r--r-- util.c 47
1 files changed, 33 insertions, 14 deletions
diff --git a/util.c b/util.c
index 1c5f40e..955332b 100644
--- a/util.c
+++ b/util.c
@@ -234,29 +234,48 @@ printutf8pad(FILE *fp, const char *s, size_t len, int pad)
{
wchar_t wc;
size_t col = 0, i, slen;
- int rl, w;
+ int rl, siz, w;
if (!len)
return;
slen = strlen(s);
- for (i = 0; i < slen; i += rl) {
- rl = w = 1;
- if ((unsigned char)s[i] < 32)
- continue;
- if ((unsigned char)s[i] >= 127) {
- if ((rl = mbtowc(&wc, s + i, slen - i < 4 ? slen - i : 4)) <= 0)
+ for (i = 0; i < slen; i += siz) {
+ siz = 1;
+ if ((unsigned char)s[i] < 32) {
+ continue; /* skip control characters */
+ } else if ((unsigned char)s[i] >= 127) {
+ rl = siz = mbtowc(&wc, s + i, slen - i < 4 ? slen - i : 4);
+ if (rl < 0) {
+ mbtowc(NULL, NULL, 0); /* reset state */
+ siz = 1; /* next byte */
+ w = 1; /* replacement char is one width */
+ } else if ((w = wcwidth(wc)) == -1) {
+ continue;
+ }
+
+ if (col + w > len || (col + w == len && s[i + siz])) {
+ fputs("\xe2\x80\xa6", fp); /* ellipsis */
+ col++;
break;
- if ((w = wcwidth(wc)) == -1)
+ } else if (rl < 0) {
+ fputs("\xef\xbf\xbd", fp); /* replacement */
+ col++;
continue;
- }
- if (col + w > len || (col + w == len && s[i + rl])) {
- fputs("\xe2\x80\xa6", fp);
+ }
+ fwrite(&s[i], 1, siz, fp);
+ col += w;
+ } else {
+ /* simple ASCII character */
+ if (col + 1 > len || (col + 1 == len && s[i + 1])) {
+ fputs("\xe2\x80\xa6", fp); /* ellipsis */
+ col++;
+ break;
+ }
+ putc(s[i], fp);
col++;
- break;
}
- fwrite(&s[i], 1, rl, fp);
- col += w;
+
}
for (; col < len; ++col)
putc(pad, fp);