annotate src/utf8.cc @ 2104:3e7e5395f0bc

non-ASCII keybindings. Alexander Voigt has kindly done some testing, and it seems that this makes bindings to most keys on a German keyboard possible -- except that those needing AltGr don't work yet.
author corvid <corvid@lavabit.com>
date Thu, 23 Jun 2011 19:24:11 +0000
parents 719f4bae567c
children
rev   line source
1098
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
1 /*
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
2 * File: utf8.cc
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
3 *
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
4 * Copyright (C) 2009 Jorge Arellano Cid <jcid@dillo.org>
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
5 *
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
6 * This program is free software; you can redistribute it and/or modify
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
7 * it under the terms of the GNU General Public License as published by
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
8 * the Free Software Foundation; either version 3 of the License, or
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
9 * (at your option) any later version.
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
10 */
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
11
1801
corvid <corvid@lavabit.com>
parents: 1453
diff changeset
12 #include <FL/fl_utf8.h>
1098
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
13
1255
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
14 #include "../dlib/dlib.h" /* TRUE/FALSE */
1098
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
15 #include "utf8.hh"
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
16
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
17 // C++ functions with C linkage ----------------------------------------------
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
18
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
19 /*
1103
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
20 * Return index of the last byte of the UTF-8-encoded character that str + i
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
21 * points to or into.
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
22 */
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
23 uint_t a_Utf8_end_of_char(const char *str, uint_t i)
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
24 {
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
25 /* We can almost get what we want from utf8fwd(p+1,...)-1, but that
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
26 * does not work for the last character in a string, and the fn makes some
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
27 * assumptions that do not suit us.
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
28 * Here's something very simpleminded instead:
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
29 */
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
30 if (str && *str && (str[i] & 0x80)) {
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
31 int internal_bytes = (str[i] & 0x40) ? 0 : 1;
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
32
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
33 while (((str[i + 1] & 0xc0) == 0x80) && (++internal_bytes < 4))
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
34 i++;
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
35 }
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
36 return i;
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
37 }
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
38
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
39 /*
1128
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
40 * Decode a single UTF-8-encoded character starting at p.
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
41 * The resulting Unicode value (in the range 0-0x10ffff) is returned,
1130
cdcb6c1fb148 comment
corvid <corvid@lavabit.com>
parents: 1128
diff changeset
42 * and len is set to the number of bytes in the UTF-8 encoding.
cdcb6c1fb148 comment
corvid <corvid@lavabit.com>
parents: 1128
diff changeset
43 * Note that utf8decode(), if given non-UTF-8 data, will interpret
cdcb6c1fb148 comment
corvid <corvid@lavabit.com>
parents: 1128
diff changeset
44 * it as ISO-8859-1 or CP1252 if possible.
1128
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
45 */
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
46 uint_t a_Utf8_decode(const char* str, const char* end, int* len)
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
47 {
1801
corvid <corvid@lavabit.com>
parents: 1453
diff changeset
48 return fl_utf8decode(str, end, len);
1128
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
49 }
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
50
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
51 /*
1098
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
52 * Write UTF-8 encoding of ucs into buf and return number of bytes written.
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
53 */
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
54 int a_Utf8_encode(unsigned int ucs, char *buf)
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
55 {
1801
corvid <corvid@lavabit.com>
parents: 1453
diff changeset
56 return fl_utf8encode(ucs, buf);
1098
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
57 }
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
58
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
59 /*
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
60 * Examine first srclen bytes of src.
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
61 * Return 0 if not legal UTF-8, 1 if all ASCII, 2 if all below 0x800,
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
62 * 3 if all below 0x10000, and 4 otherwise.
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
63 */
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
64 int a_Utf8_test(const char* src, unsigned int srclen)
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
65 {
1801
corvid <corvid@lavabit.com>
parents: 1453
diff changeset
66 return fl_utf8test(src, srclen);
1098
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
67 }
1255
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
68
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
69 /*
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
70 * Does s point to a UTF-8-encoded ideographic character?
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
71 *
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
72 * This is based on http://unicode.org/reports/tr14/#ID plus some guesses
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
73 * for what might make the most sense for Dillo. Surprisingly, they include
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
74 * Hangul Compatibility Jamo, but they're the experts, so I'll follow along.
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
75 */
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
76 bool_t a_Utf8_ideographic(const char *s, const char *end, int *len)
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
77 {
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
78 bool_t ret = FALSE;
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
79
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
80 if ((uchar_t)*s >= 0xe2) {
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
81 /* Unicode char >= U+2000. */
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
82 unsigned unicode = a_Utf8_decode(s, end, len);
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
83
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
84 if (unicode >= 0x2e80 &&
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
85 ((unicode <= 0xa4cf) ||
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
86 (unicode >= 0xf900 && unicode <= 0xfaff) ||
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
87 (unicode >= 0xff00 && unicode <= 0xff9f))) {
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
88 ret = TRUE;
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
89 }
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
90 } else {
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
91 *len = 1 + (int)a_Utf8_end_of_char(s, 0);
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
92 }
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
93 return ret;
5d6869b28e4d treat ideographic characters (Chinese/Japanese) as words
corvid <corvid@lavabit.com>
parents: 1130
diff changeset
94 }
1453
328111d18d57 respect UTF-8 when splitting long lines in plain.cc (noticed by corvid)
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1255
diff changeset
95
328111d18d57 respect UTF-8 when splitting long lines in plain.cc (noticed by corvid)
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1255
diff changeset
96 bool_t a_Utf8_combining_char(int unicode)
328111d18d57 respect UTF-8 when splitting long lines in plain.cc (noticed by corvid)
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1255
diff changeset
97 {
328111d18d57 respect UTF-8 when splitting long lines in plain.cc (noticed by corvid)
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1255
diff changeset
98 return ((unicode >= 0x0300 && unicode <= 0x036f) ||
328111d18d57 respect UTF-8 when splitting long lines in plain.cc (noticed by corvid)
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1255
diff changeset
99 (unicode >= 0x1dc0 && unicode <= 0x1dff) ||
328111d18d57 respect UTF-8 when splitting long lines in plain.cc (noticed by corvid)
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1255
diff changeset
100 (unicode >= 0x20d0 && unicode <= 0x20ff) ||
328111d18d57 respect UTF-8 when splitting long lines in plain.cc (noticed by corvid)
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1255
diff changeset
101 (unicode >= 0xfe20 && unicode <= 0xfe2f));
328111d18d57 respect UTF-8 when splitting long lines in plain.cc (noticed by corvid)
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1255
diff changeset
102 }
2104
3e7e5395f0bc non-ASCII keybindings
corvid <corvid@lavabit.com>
parents: 1801
diff changeset
103
3e7e5395f0bc non-ASCII keybindings
corvid <corvid@lavabit.com>
parents: 1801
diff changeset
104 int a_Utf8_char_count(const char *str, int len)
3e7e5395f0bc non-ASCII keybindings
corvid <corvid@lavabit.com>
parents: 1801
diff changeset
105 {
3e7e5395f0bc non-ASCII keybindings
corvid <corvid@lavabit.com>
parents: 1801
diff changeset
106 return fl_utf_nb_char((const uchar_t*)str, len);
3e7e5395f0bc non-ASCII keybindings
corvid <corvid@lavabit.com>
parents: 1801
diff changeset
107 }