Mercurial > dillo_port1.3
annotate src/utf8.cc @ 1128:fbe2ce98a4b8
make tab expansion for plain text utf8 aware
In discussion with corvid <corvid@lavabit.com>.
author | Johannes Hofmann <Johannes.Hofmann@gmx.de> |
---|---|
date | Mon, 25 May 2009 18:42:24 +0200 |
parents | 94b9265663f6 |
children | cdcb6c1fb148 |
rev | line source |
---|---|
1098
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
1 /* |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
2 * File: utf8.cc |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
3 * |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
4 * Copyright (C) 2009 Jorge Arellano Cid <jcid@dillo.org> |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
5 * |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
6 * This program is free software; you can redistribute it and/or modify |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
7 * it under the terms of the GNU General Public License as published by |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
8 * the Free Software Foundation; either version 3 of the License, or |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
9 * (at your option) any later version. |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
10 */ |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
11 |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
12 #include <fltk/utf.h> |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
13 |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
14 #include "utf8.hh" |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
15 |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
16 // C++ functions with C linkage ---------------------------------------------- |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
17 |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
18 /* |
1103
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
19 * Return index of the last byte of the UTF-8-encoded character that str + i |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
20 * points to or into. |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
21 */ |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
22 uint_t a_Utf8_end_of_char(const char *str, uint_t i) |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
23 { |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
24 /* We can almost get what we want from utf8fwd(p+1,...)-1, but that |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
25 * does not work for the last character in a string, and the fn makes some |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
26 * assumptions that do not suit us. |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
27 * Here's something very simpleminded instead: |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
28 */ |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
29 if (str && *str && (str[i] & 0x80)) { |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
30 int internal_bytes = (str[i] & 0x40) ? 0 : 1; |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
31 |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
32 while (((str[i + 1] & 0xc0) == 0x80) && (++internal_bytes < 4)) |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
33 i++; |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
34 } |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
35 return i; |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
36 } |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
37 |
94b9265663f6
New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents:
1098
diff
changeset
|
38 /* |
1128
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
39 * Decode a single UTF-8-encoded character starting at str. |
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
40 * The resulting Unicode value (in the range 0-0x10ffff) is returned, |
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
41 * and len is set to the number of bytes in the UTF-8 encoding. |
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
42 */ |
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
43 uint_t a_Utf8_decode(const char* str, const char* end, int* len) |
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
44 { |
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
45 return utf8decode(str, end, len); |
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
46 } |
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
47 |
fbe2ce98a4b8
make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents:
1103
diff
changeset
|
48 /* |
1098
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
49 * Write UTF-8 encoding of ucs into buf and return number of bytes written. |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
50 */ |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
51 int a_Utf8_encode(unsigned int ucs, char *buf) |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
52 { |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
53 return utf8encode(ucs, buf); |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
54 } |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
55 |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
56 /* |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
57 * Examine first srclen bytes of src. |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
58 * Return 0 if not legal UTF-8, 1 if all ASCII, 2 if all below 0x800, |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
59 * 3 if all below 0x10000, and 4 otherwise. |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
60 */ |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
61 int a_Utf8_test(const char* src, unsigned int srclen) |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
62 { |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
63 return utf8test(src, srclen); |
614b1d02e6c3
Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff
changeset
|
64 } |