comparison src/utf8.cc @ 1255:5d6869b28e4d

treat ideographic characters (Chinese/Japanese) as words
author corvid <corvid@lavabit.com>
date Sun, 02 Aug 2009 03:59:14 +0000
parents cdcb6c1fb148
children 328111d18d57
comparison
equal deleted inserted replaced
1254:68190badd2bf 1255:5d6869b28e4d
9 * (at your option) any later version. 9 * (at your option) any later version.
10 */ 10 */
11 11
12 #include <fltk/utf.h> 12 #include <fltk/utf.h>
13 13
14 #include "../dlib/dlib.h" /* TRUE/FALSE */
14 #include "utf8.hh" 15 #include "utf8.hh"
15 16
16 // C++ functions with C linkage ---------------------------------------------- 17 // C++ functions with C linkage ----------------------------------------------
17 18
18 /* 19 /*
62 */ 63 */
63 int a_Utf8_test(const char* src, unsigned int srclen) 64 int a_Utf8_test(const char* src, unsigned int srclen)
64 { 65 {
65 return utf8test(src, srclen); 66 return utf8test(src, srclen);
66 } 67 }
68
69 /*
70 * Does s point to a UTF-8-encoded ideographic character?
71 *
72 * This is based on http://unicode.org/reports/tr14/#ID plus some guesses
73 * for what might make the most sense for Dillo. Surprisingly, they include
74 * Hangul Compatibility Jamo, but they're the experts, so I'll follow along.
75 */
76 bool_t a_Utf8_ideographic(const char *s, const char *end, int *len)
77 {
78 bool_t ret = FALSE;
79
80 if ((uchar_t)*s >= 0xe2) {
81 /* Unicode char >= U+2000. */
82 unsigned unicode = a_Utf8_decode(s, end, len);
83
84 if (unicode >= 0x2e80 &&
85 ((unicode <= 0xa4cf) ||
86 (unicode >= 0xf900 && unicode <= 0xfaff) ||
87 (unicode >= 0xff00 && unicode <= 0xff9f))) {
88 ret = TRUE;
89 }
90 } else {
91 *len = 1 + (int)a_Utf8_end_of_char(s, 0);
92 }
93 return ret;
94 }