annotate src/utf8.cc @ 1128:fbe2ce98a4b8

make tab expansion for plain text utf8 aware In discussion with corvid <corvid@lavabit.com>.
author Johannes Hofmann <Johannes.Hofmann@gmx.de>
date Mon, 25 May 2009 18:42:24 +0200
parents 94b9265663f6
children cdcb6c1fb148
rev   line source
1098
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
1 /*
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
2 * File: utf8.cc
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
3 *
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
4 * Copyright (C) 2009 Jorge Arellano Cid <jcid@dillo.org>
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
5 *
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
6 * This program is free software; you can redistribute it and/or modify
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
7 * it under the terms of the GNU General Public License as published by
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
8 * the Free Software Foundation; either version 3 of the License, or
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
9 * (at your option) any later version.
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
10 */
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
11
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
12 #include <fltk/utf.h>
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
13
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
14 #include "utf8.hh"
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
15
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
16 // C++ functions with C linkage ----------------------------------------------
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
17
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
18 /*
1103
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
19 * Return index of the last byte of the UTF-8-encoded character that str + i
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
20 * points to or into.
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
21 */
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
22 uint_t a_Utf8_end_of_char(const char *str, uint_t i)
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
23 {
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
24 /* We can almost get what we want from utf8fwd(p+1,...)-1, but that
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
25 * does not work for the last character in a string, and the fn makes some
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
26 * assumptions that do not suit us.
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
27 * Here's something very simpleminded instead:
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
28 */
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
29 if (str && *str && (str[i] & 0x80)) {
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
30 int internal_bytes = (str[i] & 0x40) ? 0 : 1;
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
31
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
32 while (((str[i + 1] & 0xc0) == 0x80) && (++internal_bytes < 4))
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
33 i++;
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
34 }
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
35 return i;
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
36 }
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
37
94b9265663f6 New utility function: a_Utf8_end_of_char()
corvid <corvid@lavabit.com>
parents: 1098
diff changeset
38 /*
1128
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
39 * Decode a single UTF-8-encoded character starting at p.
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
40 * The resulting Unicode value (in the range 0-0x10ffff) is returned,
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
41 * and len is set the the number of bytes in the UTF-8 encoding.
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
42 */
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
43 uint_t a_Utf8_decode(const char* str, const char* end, int* len)
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
44 {
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
45 return utf8decode(str, end, len);
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
46 }
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
47
fbe2ce98a4b8 make tab expansion for plain text utf8 aware
Johannes Hofmann <Johannes.Hofmann@gmx.de>
parents: 1103
diff changeset
48 /*
1098
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
49 * Write UTF-8 encoding of ucs into buf and return number of bytes written.
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
50 */
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
51 int a_Utf8_encode(unsigned int ucs, char *buf)
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
52 {
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
53 return utf8encode(ucs, buf);
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
54 }
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
55
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
56 /*
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
57 * Examine first srclen bytes of src.
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
58 * Return 0 if not legal UTF-8, 1 if all ASCII, 2 if all below 0x800,
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
59 * 3 if all below 0x10000, and 4 otherwise.
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
60 */
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
61 int a_Utf8_test(const char* src, unsigned int srclen)
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
62 {
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
63 return utf8test(src, srclen);
614b1d02e6c3 Refactor: isolate calls to utf8 functions into a single source file.
corvid <corvid@lavabit.com>
parents:
diff changeset
64 }