changeset 1453:328111d18d57

Respect UTF-8 when splitting long lines in plain.cc (noticed by corvid). When splitting long lines in plain text to avoid X11 coordinate overflows, we need to make sure that multibyte UTF-8 chars are not split. Additionally, combining chars like accents should stay together with their base char.
author Johannes Hofmann <Johannes.Hofmann@gmx.de>
date Sun, 29 Nov 2009 21:40:02 +0100
parents 3f94a991d848
children 4f088370cfda
files src/misc.c src/misc.h src/plain.cc src/utf8.cc src/utf8.hh
diffstat 5 files changed, 57 insertions(+), 40 deletions(-) [+]
line wrap: on
line diff
--- a/src/misc.c	Fri Nov 27 20:05:36 2009 +0100
+++ b/src/misc.c	Sun Nov 29 21:40:02 2009 +0100
@@ -14,11 +14,11 @@
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
+#include <assert.h>
 
 #include "utf8.hh"
 #include "msg.h"
 #include "misc.h"
-#include "utf8.hh"
 
 /*
  * Escape characters as %XX sequences.
@@ -51,38 +51,50 @@
 /*
  * Takes a string and converts any tabs to spaces.
  */
-char *a_Misc_expand_tabs(const char *str, int len)
+int
+a_Misc_expand_tabs(char **start, char *end, char *buf, int buflen)
 {
-   int i = 0, j, pos = 0, old_pos, char_len;
+   int j, pos = 0, written = 0, old_pos, char_len;
    uint_t code;
-   char *val;
-
-   if (memchr(str, '\t', len) == NULL) {
-      val = dStrndup(str, len);
-   } else {
-      Dstr *New = dStr_new("");
+   static const int combining_char_space = 32;
 
-      while (i < len) {
-         code = a_Utf8_decode(&str[i], str + len, &char_len);
+   while (*start < end && written < buflen - TAB_SIZE - combining_char_space) {
+      code = a_Utf8_decode(*start, end, &char_len);
 
-         if (code == '\t') {
-            /* Fill with whitespaces until the next tab. */
-            old_pos = pos;
-            pos += TAB_SIZE - (pos % TAB_SIZE);
-            for (j = old_pos; j < pos; j++)
-               dStr_append_c(New, ' ');
-         } else {
-            dStr_append_l(New, &str[i], char_len);
-            pos++;
-         }
-
-         i += char_len;
+      if (code == '\t') {
+         /* Fill with whitespaces until the next tab. */
+         old_pos = pos;
+         pos += TAB_SIZE - (pos % TAB_SIZE);
+         for (j = old_pos; j < pos; j++)
+            buf[written++] = ' ';
+      } else {
+         assert(char_len <= 4);
+         for (j = 0; j < char_len; j++)
+            buf[written++] = (*start)[j];
+         pos++;
       }
 
-      val = New->str;
-      dStr_free(New, FALSE);
+      *start += char_len;
    }
-   return val;
+
+   /* If following chars are combining chars (e.g. accents) add them to the
+    * buffer. We have reserved combining_char_space bytes for this.
+    * If there should be more combining chars, we split nevertheless.
+    */
+   while (*start < end && written < buflen - 4) {
+      code = a_Utf8_decode(*start, end, &char_len);
+
+      if (! a_Utf8_combining_char(code))
+         break;
+
+      assert(char_len <= 4);
+      for (j = 0; j < char_len; j++)
+         buf[written++] = (*start)[j];
+    
+      *start += char_len;
+   }
+
+   return written;
 }
 
 /* TODO: could use dStr ADT! */
--- a/src/misc.h	Fri Nov 27 20:05:36 2009 +0100
+++ b/src/misc.h	Sun Nov 29 21:40:02 2009 +0100
@@ -10,7 +10,7 @@
 
 
 char *a_Misc_escape_chars(const char *str, const char *esc_set);
-char *a_Misc_expand_tabs(const char *str, int len);
+int a_Misc_expand_tabs(char **start, char *end, char *buf, int buflen);
 int a_Misc_get_content_type_from_data(void *Data, size_t Size,const char **PT);
 int a_Misc_content_type_check(const char *EntryType, const char *DetectedType);
 void a_Misc_parse_content_type(const char *str, char **major, char **minor,
--- a/src/plain.cc	Fri Nov 27 20:05:36 2009 +0100
+++ b/src/plain.cc	Sun Nov 29 21:40:02 2009 +0100
@@ -135,20 +135,16 @@
 
 void DilloPlain::addLine(char *Buf, uint_t BufSize)
 {
-   uint_t remaining;
-   char *dp, *data;
-   const uint_t maxWordLen = 128; // Limit word len to avoid X11 coordinate
-                                  // overflow with extremely long lines.
-   dp = data = a_Misc_expand_tabs(Buf, BufSize);
-   remaining = strlen(data);
-   while (remaining > maxWordLen) {
-      DW2TB(dw)->addText(dp, maxWordLen, widgetStyle);
-      remaining -= maxWordLen;
-      dp += maxWordLen;
-   }
-   DW2TB(dw)->addText(dp, widgetStyle);
+   int len;
+   char buf[128];
+   char *end = Buf + BufSize;
+
+   // Limit word len to avoid X11 coordinate
+   // overflow with extremely long lines.
+   while ((len = a_Misc_expand_tabs(&Buf, end, buf, sizeof(buf))))
+      DW2TB(dw)->addText(buf, len, widgetStyle);
+
    DW2TB(dw)->addParbreak(0, widgetStyle);
-   dFree(data);
 }
 
 /*
--- a/src/utf8.cc	Fri Nov 27 20:05:36 2009 +0100
+++ b/src/utf8.cc	Sun Nov 29 21:40:02 2009 +0100
@@ -92,3 +92,11 @@
    }
    return ret;
 }
+
+bool_t a_Utf8_combining_char(int unicode)
+{
+   return ((unicode >= 0x0300 && unicode <= 0x036f) ||
+           (unicode >= 0x1dc0 && unicode <= 0x1dff) ||
+           (unicode >= 0x20d0 && unicode <= 0x20ff) ||
+           (unicode >= 0xfe20 && unicode <= 0xfe2f));
+}
--- a/src/utf8.hh	Fri Nov 27 20:05:36 2009 +0100
+++ b/src/utf8.hh	Sun Nov 29 21:40:02 2009 +0100
@@ -20,6 +20,7 @@
 int a_Utf8_encode(unsigned int ucs, char *buf);
 int a_Utf8_test(const char* src, unsigned int srclen);
 bool_t a_Utf8_ideographic(const char *s, const char *end, int *len);
+bool_t a_Utf8_combining_char(int unicode);
 
 #ifdef __cplusplus
 }