changeset 1255:5d6869b28e4d

treat ideographic characters (Chinese/Japanese) as words
author corvid <corvid@lavabit.com>
date Sun, 02 Aug 2009 03:59:14 +0000
parents 68190badd2bf
children 6a1e98ad782e
files src/html.cc src/utf8.cc src/utf8.hh
diffstat 3 files changed, 44 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/src/html.cc	Sun Aug 02 03:31:55 2009 +0000
+++ b/src/html.cc	Sun Aug 02 03:59:14 2009 +0000
@@ -1189,17 +1189,30 @@
          }
       }
       for (start = i = 0; Pword[i]; start = i) {
+         int len;
+
          if (isspace(Pword[i])) {
             while (Pword[++i] && isspace(Pword[i])) ;
             Html_process_space(html, Pword + start, i - start);
-         } else {
-            while (Pword[++i] && !isspace(Pword[i])) ;
+         } else if (a_Utf8_ideographic(Pword+i, Pword_end, &len)) {
+            i += len;
             ch = Pword[i];
             Pword[i] = '\0';
             HT2TB(html)->addText(Pword + start,
                                  html->styleEngine->wordStyle ());
             Pword[i] = ch;
             html->PrevWasSPC = false;
+         } else {
+            do {
+               i += len;
+            } while (Pword[i] && !isspace(Pword[i]) &&
+                     (!a_Utf8_ideographic(Pword+i, Pword_end, &len)));
+            ch = Pword[i];
+            Pword[i] = 0;
+            HT2TB(html)->addText(Pword + start,
+                                 html->styleEngine->wordStyle ());
+            Pword[i] = ch;
+            html->PrevWasSPC = false;
          }
       }
       if (word != Pword)
--- a/src/utf8.cc	Sun Aug 02 03:31:55 2009 +0000
+++ b/src/utf8.cc	Sun Aug 02 03:59:14 2009 +0000
@@ -11,6 +11,7 @@
 
 #include <fltk/utf.h>
 
+#include "../dlib/dlib.h"    /* TRUE/FALSE */
 #include "utf8.hh"
 
 // C++ functions with C linkage ----------------------------------------------
@@ -64,3 +65,30 @@
 {
    return utf8test(src, srclen);
 }
+
+/*
+ * Does s point to a UTF-8-encoded ideographic character? Returns TRUE/FALSE
+ * and stores in *len the byte count that callers add to step past s.
+ * This is based on http://unicode.org/reports/tr14/#ID plus some guesses
+ * for what might make the most sense for Dillo. Surprisingly, they include
+ * Hangul Compatibility Jamo, but they're the experts, so I'll follow along.
+ */
+bool_t a_Utf8_ideographic(const char *s, const char *end, int *len)
+{
+   bool_t ret = FALSE;
+
+   if ((uchar_t)*s >= 0xe2) {
+      /* Unicode char >= U+2000; *len is left to a_Utf8_decode() to set. */
+      unsigned unicode = a_Utf8_decode(s, end, len);
+      /* CJK radicals through Yi, CJK compat. ideographs, full/halfwidth. */
+      if (unicode >= 0x2e80 &&
+           ((unicode <= 0xa4cf) ||
+            (unicode >= 0xf900 && unicode <= 0xfaff) ||
+            (unicode >= 0xff00 && unicode <= 0xff9f))) {
+         ret = TRUE;
+     }
+   } else {   /* lead byte < 0xe2: never a match, so only *len is computed */
+      *len = 1 + (int)a_Utf8_end_of_char(s, 0);
+   }
+   return ret;
+}
--- a/src/utf8.hh	Sun Aug 02 03:31:55 2009 +0000
+++ b/src/utf8.hh	Sun Aug 02 03:59:14 2009 +0000
@@ -19,6 +19,7 @@
 uint_t a_Utf8_decode(const char*, const char* end, int* len);
 int a_Utf8_encode(unsigned int ucs, char *buf);
 int a_Utf8_test(const char* src, unsigned int srclen);
+bool_t a_Utf8_ideographic(const char *s, const char *end, int *len);
 
 #ifdef __cplusplus
 }