utf8_functions.h revision 647b5a29
1/*
2 * Copyright 2004-2010, Haiku, Inc.
3 * Distributed under the terms of the MIT License.
4 */
5#ifndef _UTF8_FUNCTIONS_H
6#define _UTF8_FUNCTIONS_H
7
8
9#include <SupportDefs.h>
10
11
12static inline bool
13IsInsideGlyph(uchar ch)
14{
15	return (ch & 0xc0) == 0x80;
16}
17
18
19static inline uint32
20UTF8NextCharLenUnsafe(const char *text)
21{
22	const char *ptr = text;
23
24	do {
25		ptr++;
26	} while (IsInsideGlyph(*ptr));
27
28	return ptr - text;
29}
30
31
32static inline uint32
33UTF8NextCharLen(const char *text)
34{
35	if (text == NULL || *text == 0)
36		return 0;
37
38	return UTF8NextCharLenUnsafe(text);
39}
40
41
42static inline uint32
43UTF8NextCharLen(const char *bytes, size_t length)
44{
45	if (bytes == NULL || length == 0 || bytes[0] == 0)
46		return 0;
47
48	if ((bytes[0] & 0x80) == 0) {
49		// A single ASCII char - or so...
50		return 1;
51	}
52
53	if (IsInsideGlyph(bytes[0])) {
54		// Not a proper multibyte start.
55		return 0;
56	}
57
58	// We already know that we have the upper two bits set due to the above
59	// two checks.
60	uint8 mask = 0x20;
61	size_t bytesExpected = 2;
62	while ((bytes[0] & mask) != 0) {
63		if (mask == 0x02) {
64			// Seven byte char - invalid.
65			return 0;
66		}
67
68		bytesExpected++;
69		mask >>= 1;
70	}
71
72	// There would need to be more bytes to satisfy the char.
73	if (bytesExpected > length)
74		return 0;
75
76	// We already know the first byte is fine, check the rest.
77	for (size_t i = 1; i < bytesExpected; i++) {
78		if (!IsInsideGlyph(bytes[i])) {
79			// The sequence is incomplete.
80			return 0;
81		}
82	}
83
84	// Puh, everything's fine.
85	return bytesExpected;
86}
87
88
89static inline uint32
90UTF8PreviousCharLen(const char *text, const char *limit)
91{
92	const char *ptr = text;
93
94	if (ptr == NULL || limit == NULL)
95		return 0;
96
97	do {
98		if (ptr == limit)
99			break;
100		ptr--;
101	} while (IsInsideGlyph(*ptr));
102
103	return text - ptr;
104}
105
106
107/*!	UTF8CountBytes gets the length (in bytes) of a UTF8 string. Up to
108	numChars characters are read. If numChars is a negative value it is ignored
109	and the string is read up to the terminating 0.
110*/
111static inline uint32
112UTF8CountBytes(const char *bytes, int32 numChars)
113{
114	if (bytes == NULL)
115		return 0;
116
117	if (numChars < 0)
118		numChars = INT_MAX;
119
120	const char *base = bytes;
121	while (bytes[0] != '\0') {
122		if ((bytes[0] & 0xc0) != 0x80) {
123			if (--numChars < 0)
124				break;
125		}
126		bytes++;
127	}
128
129	return bytes - base;
130}
131
132
133/*!	UTF8CountChars gets the length (in characters) of a UTF8 string. Up to
134	numBytes bytes are read. If numBytes is a negative value it is ignored
135	and the string is read up to the terminating 0.
136*/
137static inline uint32
138UTF8CountChars(const char *bytes, int32 numBytes)
139{
140	if (bytes == NULL)
141		return 0;
142
143	uint32 length = 0;
144	if (numBytes < 0) {
145		while (bytes[0]) {
146			if ((bytes++[0] & 0xc0) != 0x80)
147				length++;
148		}
149	} else {
150		const char *last = bytes + numBytes - 1;
151		while (bytes[0] && bytes <= last) {
152			if ((bytes++[0] & 0xc0) != 0x80)
153				length++;
154		}
155	}
156
157	return length;
158}
159
160
161/*!	UTF8ToCharCode converts the input that includes potential multibyte chars
162	to UTF-32 char codes that can be used by FreeType. The string pointer is
163	then advanced to the next character in the string. In case the terminating
164	0 is reached, the string pointer is not advanced anymore and nulls are
165	returned. This makes it safe to overruns and enables streamed processing
166	of UTF8 strings.
167*/
168static inline uint32
169UTF8ToCharCode(const char **bytes)
170{
171	#define UTF8_SUBSTITUTE_CHARACTER	0xfffd
172
173	uint32 result;
174	if (((*bytes)[0] & 0x80) == 0) {
175		// a single byte character
176		result = (*bytes)[0];
177		if (result != '\0') {
178			// do not advance beyond the terminating '\0'
179			(*bytes)++;
180		}
181
182		return result;
183	}
184
185	if (((*bytes)[0] & 0xc0) == 0x80) {
186		// not a proper multibyte start
187		(*bytes)++;
188		return UTF8_SUBSTITUTE_CHARACTER;
189	}
190
191	// start of a multibyte character
192	uint8 mask = 0x80;
193	result = (uint32)((*bytes)[0] & 0xff);
194	(*bytes)++;
195
196	while (result & mask) {
197		if (mask == 0x02) {
198			// seven byte char - invalid
199			return UTF8_SUBSTITUTE_CHARACTER;
200		}
201
202		result &= ~mask;
203		mask >>= 1;
204	}
205
206	while (((*bytes)[0] & 0xc0) == 0x80) {
207		result <<= 6;
208		result += (*bytes)[0] & 0x3f;
209		(*bytes)++;
210
211		mask <<= 1;
212		if (mask == 0x40)
213			return result;
214	}
215
216	if (mask == 0x40)
217		return result;
218
219	if ((*bytes)[0] == '\0') {
220		// string terminated within multibyte char
221		return 0x00;
222	}
223
224	// not enough bytes in multibyte char
225	return UTF8_SUBSTITUTE_CHARACTER;
226
227	#undef UTF8_SUBSTITUTE_CHARACTER
228}
229
230#endif	// _UTF8_FUNCTIONS_H
231