Mercurial > dillo_port1.3
annotate src/utf8.cc @ 1255:5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
author | corvid <corvid@lavabit.com> |
---|---|
date | Sun, 02 Aug 2009 03:59:14 +0000 |
parents | cdcb6c1fb148 |
children | 328111d18d57 |
rev | line source |
---|---|
1098
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
1 /* |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
2 * File: utf8.cc |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
3 * |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
4 * Copyright (C) 2009 Jorge Arellano Cid <jcid@dillo.org> |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
5 * |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
6 * This program is free software; you can redistribute it and/or modify |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
7 * it under the terms of the GNU General Public License as published by |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
8 * the Free Software Foundation; either version 3 of the License, or |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
9 * (at your option) any later version. |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
10 */ |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
11 |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
12 #include <fltk/utf.h> |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
13 |
1255
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
14 #include "../dlib/dlib.h" /* TRUE/FALSE */ |
1098
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
15 #include "utf8.hh" |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
16 |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
17 // C++ functions with C linkage ---------------------------------------------- |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
18 |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
/*
 * Locate the final byte of the UTF-8-encoded character that str + i
 * points to or into, and return its index.
 *
 * If str is NULL, if the string is empty, or if the byte at index i does
 * not have its high bit set, i is returned unchanged.
 */
uint_t a_Utf8_end_of_char(const char *str, uint_t i)
{
   int seen;

   if (!str || !*str)
      return i;
   if (!(str[i] & 0x80))
      return i;   /* high bit clear: a single-byte character */

   /* utf8fwd(p+1,...)-1 gets close to what is wanted here, but it fails
    * on the last character of a string and makes assumptions that do not
    * suit us, so a deliberately simple scan is used instead.
    *
    * 'seen' counts bytes that follow the first byte of the sequence.
    * A byte of the form 10xxxxxx (0x40 clear) means we were handed a
    * pointer into the middle of a character, so it already counts as one.
    */
   seen = (str[i] & 0x40) ? 0 : 1;

   for (;;) {
      if ((str[i + 1] & 0xc0) != 0x80)
         break;   /* next byte is not of the form 10xxxxxx */
      if (++seen >= 4)
         break;   /* never walk further than this cap allows */
      i++;
   }
   return i;
}
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
38 |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
39 /* |
1128
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
40 * Decode a single UTF-8-encoded character starting at p. |
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
41 * The resulting Unicode value (in the range 0-0x10ffff) is returned, |
1130 | 42 * and len is set to the number of bytes in the UTF-8 encoding. |
43 * Note that utf8decode(), if given non-UTF-8 data, will interpret | |
44 * it as ISO-8859-1 or CP1252 if possible. | |
1128
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
45 */ |
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
46 uint_t a_Utf8_decode(const char* str, const char* end, int* len) |
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
47 { |
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
48 return utf8decode(str, end, len); |
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
49 } |
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
50 |
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
51 /* |
1098
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
52 * Write UTF-8 encoding of ucs into buf and return number of bytes written. |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
53 */ |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
54 int a_Utf8_encode(unsigned int ucs, char *buf) |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
55 { |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
56 return utf8encode(ucs, buf); |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
57 } |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
58 |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
59 /* |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
60 * Examine first srclen bytes of src. |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
61 * Return 0 if not legal UTF-8, 1 if all ASCII, 2 if all below 0x800, |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
62 * 3 if all below 0x10000, and 4 otherwise. |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
63 */ |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
64 int a_Utf8_test(const char* src, unsigned int srclen) |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
65 { |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
66 return utf8test(src, srclen); |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
67 } |
1255
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
68 |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
69 /* |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
70 * Does s point to a UTF-8-encoded ideographic character? |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
71 * |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
72 * This is based on http://unicode.org/reports/tr14/#ID plus some guesses |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
73 * for what might make the most sense for Dillo. Surprisingly, they include |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
74 * Hangul Compatibility Jamo, but they're the experts, so I'll follow along. |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
75 */ |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
76 bool_t a_Utf8_ideographic(const char *s, const char *end, int *len) |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
77 { |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
78 bool_t ret = FALSE; |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
79 |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
80 if ((uchar_t)*s >= 0xe2) { |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
81 /* Unicode char >= U+2000. */ |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
82 unsigned unicode = a_Utf8_decode(s, end, len); |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
83 |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
84 if (unicode >= 0x2e80 && |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
85 ((unicode <= 0xa4cf) || |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
86 (unicode >= 0xf900 && unicode <= 0xfaff) || |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
87 (unicode >= 0xff00 && unicode <= 0xff9f))) { |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
88 ret = TRUE; |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
89 } |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
90 } else { |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
91 *len = 1 + (int)a_Utf8_end_of_char(s, 0); |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
92 } |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
93 return ret; |
5d6869b28e4d
treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents:
1130
diff
changeset
|
94 } |