changeset 1128:fbe2ce98a4b8

make tab expansion for plain text utf8 aware. In discussion with corvid <corvid@lavabit.com>.
author Johannes Hofmann <Johannes.Hofmann@gmx.de>
date Mon, 25 May 2009 18:42:24 +0200
parents 65c7e33e4466
children 205e83fe66c0
files src/misc.c src/utf8.cc src/utf8.hh
diffstat 3 files changed, 29 insertions(+), 14 deletions(-) [+]
line wrap: on
line diff
--- a/src/misc.c	Sun May 24 21:52:59 2009 +0000
+++ b/src/misc.c	Mon May 25 18:42:24 2009 +0200
@@ -16,6 +16,7 @@
 #include <string.h>
 #include <ctype.h>
 
+#include "utf8.hh"
 #include "msg.h"
 #include "misc.h"
 
@@ -47,7 +48,6 @@
    return p;
 }
 
-
 #define TAB_SIZE 8
 /*
  * Takes a string and converts any tabs to spaces.
@@ -55,23 +55,27 @@
 char *a_Misc_expand_tabs(const char *str, int len)
 {
    Dstr *New = dStr_new("");
-   int i, j, pos, old_pos;
+   int i = 0, j, pos = 0, old_pos, char_len;
+   uint_t code;
    char *val;
 
-   if (len) {
-      for (pos = 0, i = 0; i < len; i++) {
-         if (str[i] == '\t') {
-            /* Fill with whitespaces until the next tab. */
-            old_pos = pos;
-            pos += TAB_SIZE - (pos % TAB_SIZE);
-            for (j = old_pos; j < pos; j++)
-               dStr_append_c(New, ' ');
-         } else {
-            dStr_append_c(New, str[i]);
-            pos++;
-         }
+   while (i < len) {
+      code = a_Utf8_decode(&str[i], str + len, &char_len);
+
+      if (code == '\t') {
+         /* Fill with whitespaces until the next tab. */
+         old_pos = pos;
+         pos += TAB_SIZE - (pos % TAB_SIZE);
+         for (j = old_pos; j < pos; j++)
+            dStr_append_c(New, ' ');
+      } else {
+         dStr_append_l(New, &str[i], char_len);
+         pos++;
       }
+
+      i += char_len;
    }
+
    val = New->str;
    dStr_free(New, FALSE);
    return val;
--- a/src/utf8.cc	Sun May 24 21:52:59 2009 +0000
+++ b/src/utf8.cc	Mon May 25 18:42:24 2009 +0200
@@ -36,6 +36,16 @@
 }
 
 /*
+ * Decode a single UTF-8-encoded character starting at str.
+ * The resulting Unicode value (in the range 0-0x10ffff) is returned,
+ * and len is set to the number of bytes in the UTF-8 encoding.
+ */
+uint_t a_Utf8_decode(const char* str, const char* end, int* len)
+{
+   return utf8decode(str, end, len);
+}
+
+/*
  * Write UTF-8 encoding of ucs into buf and return number of bytes written.
  */
 int a_Utf8_encode(unsigned int ucs, char *buf)
--- a/src/utf8.hh	Sun May 24 21:52:59 2009 +0000
+++ b/src/utf8.hh	Mon May 25 18:42:24 2009 +0200
@@ -9,6 +9,7 @@
 #include "d_size.h"
 
 uint_t a_Utf8_end_of_char(const char *str, uint_t i);
+uint_t a_Utf8_decode(const char*, const char* end, int* len);
 int a_Utf8_encode(unsigned int ucs, char *buf);
 int a_Utf8_test(const char* src, unsigned int srclen);