naev 0.12.6
utf8.c
1// clang-format off
2/*
3 Basic UTF-8 manipulation routines
4 by Jeff Bezanson
5 placed in the public domain Fall 2005
6
7 This code is designed to provide the utilities you need to manipulate
8 UTF-8 as an internal string encoding. These functions do not perform the
9 error checking normally needed when handling UTF-8 data, so if you happen
10 to be from the Unicode Consortium you will want to flay me alive.
11 I do this because error checking can be performed at the boundaries (I/O),
12 with these routines reserved for higher performance on data known to be
13 valid.
14 A UTF-8 validation routine is included.
15*/
16
18#if NAEV_HAVE_ALLOCA_H
19# include <alloca.h> /* Not available in windows, necessary for linux. */
20#endif /* NAEV_HAVE_ALLOCA_H */
21#include <assert.h>
22#if HAVE_MALLOC_H
23# include <malloc.h>
24#endif /* HAVE_MALLOC_H */
25#include <stdio.h>
26#include <string.h>
28
29#include "utf8.h"
30
/* Per-length correction constants, indexed by the number of trailing bytes.
   The decoders in this file accumulate the raw bytes of a sequence (shifting
   left by 6 between bytes) and then subtract the entry for that sequence
   length to obtain the decoded value. */
static const uint32_t offsetsFromUTF8[6] = {
    0x00000000u, /* 0 trailing bytes */
    0x00003080u, /* 1 trailing byte  */
    0x000E2080u, /* 2 trailing bytes */
    0x03C82080u, /* 3 trailing bytes */
    0xFA082080u, /* 4 trailing bytes */
    0x82082080u  /* 5 trailing bytes */
};
35
/* Number of bytes that follow a given first byte of a sequence, indexed by
   that byte's value (0..255). Laid out 16 entries per row; the comment on
   each row gives the value of its first index. */
static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x00 */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x10 */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x20 */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x30 */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x40 */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x50 */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x60 */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x70 */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80 */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90 */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0 */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xC0 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xD0 */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xE0 */
    3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5  /* 0xF0 */
};
46
47/* returns length of next utf-8 sequence */
48size_t u8_seqlen(const char *s)
49{
50 return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1;
51}
52
/* Number of bytes needed to encode the character ch.
   Returns 1..4 for values below 0x110000, and 0 for anything larger,
   meaning the character cannot (or should not) be encoded. */
size_t u8_charlen(uint32_t ch)
{
    if (ch >= 0x110000)
        return 0;
    if (ch >= 0x10000)
        return 4;
    if (ch >= 0x800)
        return 3;
    if (ch >= 0x80)
        return 2;
    return 1;
}
67
/* Total number of bytes needed to encode the first n characters of wcstr,
   i.e. the sum of u8_charlen() over those characters. */
size_t u8_codingsize(uint32_t *wcstr, size_t n)
{
    size_t total = 0;

    while (n > 0) {
        total += u8_charlen(*wcstr);
        wcstr++;
        n--;
    }
    return total;
}
76
77/* conversions without error checking
78 only works for valid UTF-8, i.e. no 5- or 6-byte sequences
79 srcsz = source size in bytes
80 sz = dest size in # of wide characters
81
82 returns # characters converted
83 if sz == srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
84*/
85size_t u8_toucs(uint32_t *dest, size_t sz, const char *src, size_t srcsz)
86{
87 uint32_t ch;
88 const char *src_end = src + srcsz;
89 size_t nb;
90 size_t i=0;
91
92 if (sz == 0 || srcsz == 0)
93 return 0;
94
95 while (i < sz) {
96 if (!isutf(*src)) { // invalid sequence
97 dest[i++] = 0xFFFD;
98 src++;
99 if (src >= src_end) break;
100 continue;
101 }
102 nb = trailingBytesForUTF8[(unsigned char)*src];
103 if (src + nb >= src_end)
104 break;
105 ch = 0;
106 switch (nb) {
107 /* these fall through deliberately */
108 case 5: ch += (unsigned char)*src++; ch <<= 6;
109 /* Falls through. */
110 case 4: ch += (unsigned char)*src++; ch <<= 6;
111 /* Falls through. */
112 case 3: ch += (unsigned char)*src++; ch <<= 6;
113 /* Falls through. */
114 case 2: ch += (unsigned char)*src++; ch <<= 6;
115 /* Falls through. */
116 case 1: ch += (unsigned char)*src++; ch <<= 6;
117 /* Falls through. */
118 case 0: ch += (unsigned char)*src++;
119 }
120 ch -= offsetsFromUTF8[nb];
121 dest[i++] = ch;
122 }
123 return i;
124}
125
/* Encode wide characters as UTF-8.

   dest  = output buffer
   sz    = size of dest in bytes
   src   = input wide characters
   srcsz = number of source characters

   Returns the number of bytes stored in dest. No terminator is written.
   Encoding stops at the first character that does not fit in the space
   that is left. Characters >= 0x110000 are skipped without output.
*/
size_t u8_toutf8(char *dest, size_t sz, const uint32_t *src, size_t srcsz)
{
    size_t used = 0;
    size_t i;

    for (i = 0; i < srcsz; i++) {
        const uint32_t ch = src[i];
        /* Track free space as a byte count rather than computing
           "dest_end - k", which would point before the buffer when sz < k. */
        const size_t room = sz - used;
        char *out = dest + used;

        if (ch < 0x80) {
            if (room < 1)
                break;
            out[0] = (char)ch;
            used += 1;
        }
        else if (ch < 0x800) {
            if (room < 2)
                break;
            out[0] = (char)((ch >> 6) | 0xC0);
            out[1] = (char)((ch & 0x3F) | 0x80);
            used += 2;
        }
        else if (ch < 0x10000) {
            if (room < 3)
                break;
            out[0] = (char)((ch >> 12) | 0xE0);
            out[1] = (char)(((ch >> 6) & 0x3F) | 0x80);
            out[2] = (char)((ch & 0x3F) | 0x80);
            used += 3;
        }
        else if (ch < 0x110000) {
            if (room < 4)
                break;
            out[0] = (char)((ch >> 18) | 0xF0);
            out[1] = (char)(((ch >> 12) & 0x3F) | 0x80);
            out[2] = (char)(((ch >> 6) & 0x3F) | 0x80);
            out[3] = (char)((ch & 0x3F) | 0x80);
            used += 4;
        }
        /* else: out of range, nothing is emitted for this character */
    }
    return used;
}
171
/* Encode the single character ch into dest and return the number of bytes
   written (1..4). Returns 0 and writes nothing when ch >= 0x110000.
   dest is not bounds-checked and no terminator is appended. */
size_t u8_wc_toutf8(char *dest, uint32_t ch)
{
    if (ch >= 0x110000)
        return 0;

    if (ch >= 0x10000) {
        dest[0] = (char)((ch >> 18) | 0xF0);
        dest[1] = (char)(((ch >> 12) & 0x3F) | 0x80);
        dest[2] = (char)(((ch >> 6) & 0x3F) | 0x80);
        dest[3] = (char)((ch & 0x3F) | 0x80);
        return 4;
    }

    if (ch >= 0x800) {
        dest[0] = (char)((ch >> 12) | 0xE0);
        dest[1] = (char)(((ch >> 6) & 0x3F) | 0x80);
        dest[2] = (char)((ch & 0x3F) | 0x80);
        return 3;
    }

    if (ch >= 0x80) {
        dest[0] = (char)((ch >> 6) | 0xC0);
        dest[1] = (char)((ch & 0x3F) | 0x80);
        return 2;
    }

    dest[0] = (char)ch;
    return 1;
}
198
/* Character index => byte offset.
   Walks charnum characters forward from s[0] and returns the byte index
   reached. The string terminator is not checked, so charnum must not exceed
   the number of characters in s. */
size_t u8_offset(const char *s, size_t charnum)
{
    size_t pos = 0;
    size_t left;

    for (left = charnum; left > 0; left--) {
        const int high_bit = (s[pos] & 0x80) != 0;

        pos++;
        if (!high_bit)
            continue; /* single-byte character */

        /* Multi-byte: probe up to two further positions for the start of
           the next sequence; otherwise step one more byte. */
        pos++;
        if (isutf(s[pos]))
            continue;
        pos++;
        if (isutf(s[pos]))
            continue;
        pos++;
    }
    return pos;
}
212
/* Byte offset => character index.
   Counts the characters that start before byte position offset in s.
   The string terminator is not checked. */
size_t u8_charnum(const char *s, size_t offset)
{
    size_t pos = 0;
    size_t count;

    for (count = 0; pos < offset; count++) {
        const int high_bit = (s[pos] & 0x80) != 0;

        pos++;
        if (!high_bit)
            continue; /* single-byte character */

        /* Multi-byte: probe up to two further positions for the start of
           the next sequence; otherwise step one more byte. */
        pos++;
        if (isutf(s[pos]))
            continue;
        pos++;
        if (isutf(s[pos]))
            continue;
        pos++;
    }
    return count;
}
226
/* Number of characters in the NUL-terminated string s.
   Runs of bytes in 0x01..0x7F are counted one character per byte; every
   other non-NUL byte starts a multi-byte character that is skipped as a
   unit and counted once. */
size_t u8_strlen(const char *s)
{
    size_t count = 0;
    size_t i = 0;

    for (;;) {
        const size_t run_start = i;

        /* Test the high bit explicitly instead of "s[i] > 0": that form is
           only correct when plain char is signed, and would count every
           byte as a character where char is unsigned. */
        while (s[i] != '\0' && (s[i] & 0x80) == 0)
            i++;
        count += (i - run_start);

        if (s[i++] == '\0')
            break;
        /* i is now on the second byte of a multi-byte character; advance to
           the start of the next character. */
        (void)(isutf(s[++i]) || isutf(s[++i]) || ++i);
        count++;
    }
    return count;
}
244
245/* reads the next utf-8 sequence out of a string, updating an index */
246uint32_t u8_nextchar(const char *s, size_t *i)
247{
248 uint32_t ch = 0;
249 size_t sz = 0;
250
251 do {
252 ch <<= 6;
253 ch += (unsigned char)s[(*i)];
254 sz++;
255 } while (s[*i] && (++(*i)) && !isutf(s[*i]));
256 ch -= offsetsFromUTF8[sz-1];
257
258 return ch;
259}
260
261/* next character without NUL character terminator */
262uint32_t u8_nextmemchar(const char *s, size_t *i)
263{
264 uint32_t ch = 0;
265 size_t sz = 0;
266
267 do {
268 ch <<= 6;
269 ch += (unsigned char)s[(*i)++];
270 sz++;
271 } while (!isutf(s[*i]));
272 ch -= offsetsFromUTF8[sz-1];
273
274 return ch;
275}
276
/* Advance *i to the start of the next sequence in s: step forward one byte
   at a time, stopping at the first of up to three positions for which
   isutf() holds; if none does, step a fourth time unconditionally. */
void u8_inc(const char *s, size_t *i)
{
    int tries;

    for (tries = 0; tries < 3; tries++) {
        ++(*i);
        if (isutf(s[*i]))
            return;
    }
    ++(*i);
}
281
/* Move *i back to the start of the previous sequence in s: step backward
   one byte at a time, stopping at the first of up to three positions for
   which isutf() holds; if none does, step a fourth time unconditionally.
   No lower bound is checked on *i. */
void u8_dec(const char *s, size_t *i)
{
    int tries;

    for (tries = 0; tries < 3; tries++) {
        --(*i);
        if (isutf(s[*i]))
            return;
    }
    --(*i);
}
286
/* Returns 1 when c is one of '0'..'7', otherwise 0. */
int octal_digit(char c)
{
    if (c < '0')
        return 0;
    if (c > '7')
        return 0;
    return 1;
}
291
/* Returns 1 when c is a hexadecimal digit ('0'..'9', 'A'..'F', 'a'..'f'),
   otherwise 0. */
int hex_digit(char c)
{
    if (c >= '0' && c <= '9')
        return 1;
    if (c >= 'A' && c <= 'F')
        return 1;
    if (c >= 'a' && c <= 'f')
        return 1;
    return 0;
}
298
/* Map the letter of a single-character escape (the character following a
   backslash) to the control character it denotes. Any other character is
   returned unchanged. */
char read_escape_control_char(char c)
{
    switch (c) {
    case 'n': return '\n';
    case 't': return '\t';
    case 'r': return '\r';
    case 'e': return 033; /* ESC; '\e' is not standard C */
    case 'b': return '\b';
    case 'f': return '\f';
    case 'v': return '\v';
    case 'a': return '\a';
    default:  return c;
    }
}
319
/* Parse one escape sequence. str points to the character AFTER the
   backslash and ssz (> 0) is the number of readable characters at str.

   Recognized forms:
     - 1 to 3 octal digits
     - 'x' + up to 2 hex digits, 'u' + up to 4, 'U' + up to 8
     - any other character, translated by read_escape_control_char()

   On success the decoded value is stored in *dest and the number of input
   characters consumed is returned. Returns 0, leaving *dest untouched, when
   'x', 'u' or 'U' is not followed by at least one hex digit. */
size_t u8_read_escape_sequence(const char *str, size_t ssz, uint32_t *dest)
{
    char digs[10];
    size_t dno = 0;
    size_t maxdig;
    size_t i;
    char c0;

    /* Check the precondition before reading str[0]. */
    assert(ssz > 0);
    c0 = str[0];

    if (octal_digit(c0)) {
        i = 0;
        do {
            digs[dno++] = str[i++];
        } while (i < ssz && octal_digit(str[i]) && dno < 3);
        digs[dno] = '\0';
        *dest = (uint32_t)strtoul(digs, NULL, 8);
        return i;
    }

    switch (c0) {
    case 'x': maxdig = 2; break;
    case 'u': maxdig = 4; break;
    case 'U': maxdig = 8; break;
    default:  maxdig = 0; break;
    }

    if (maxdig > 0) {
        i = 1;
        while (i < ssz && hex_digit(str[i]) && dno < maxdig) {
            digs[dno++] = str[i++];
        }
        if (dno == 0)
            return 0;
        digs[dno] = '\0';
        /* strtoul, not strtol: eight hex digits can exceed LONG_MAX where
           long is 32 bits, and strtol would then saturate the result. */
        *dest = (uint32_t)strtoul(digs, NULL, 16);
        return i;
    }

    *dest = (uint32_t)read_escape_control_char(c0);
    return 1;
}
356
/* Convert a string containing backslash escapes (for example literal \uxxxx
   or \Uxxxxxxxx) to UTF-8.
   example: u8_unescape(mybuf, 256, "hello\\u220e")
   note the double backslash is needed if called on a C string literal

   buf = output buffer, sz = its size in bytes, src = NUL-terminated input.
   Returns the number of bytes written. buf is NUL-terminated only when
   there is room left for the terminator (return value < sz).

   Bytes that are not part of an escape are copied unchanged. A backslash
   that ends the string, or that starts a sequence rejected by
   u8_read_escape_sequence(), is copied literally. */
size_t u8_unescape(char *buf, size_t sz, const char *src)
{
    size_t c = 0;

    while (*src && c < sz) {
        char temp[4];
        size_t amt;

        if (*src == '\\' && src[1] != '\0') {
            uint32_t ch = 0;
            size_t used = u8_read_escape_sequence(src + 1, 1000, &ch);

            if (used == 0) {
                /* Malformed escape: keep the backslash and let the following
                   characters be handled as ordinary input. */
                temp[0] = '\\';
                amt = 1;
                src += 1;
            }
            else {
                src += 1 + used;
                amt = u8_wc_toutf8(temp, ch); /* 0 if ch is not encodable */
            }
        }
        else {
            /* Ordinary byte (or a lone trailing backslash): copy as-is, so
               the terminator is never skipped and bytes >= 0x80 are not
               reinterpreted as code points. */
            temp[0] = *src++;
            amt = 1;
        }

        if (amt > sz - c)
            break;
        memcpy(&buf[c], temp, amt);
        c += amt;
    }
    if (c < sz)
        buf[c] = '\0';
    return c;
}
386
/* Find the first occurrence of character ch in the NUL-terminated string s.
   Returns a pointer to the first byte of the match, or NULL if ch does not
   occur. *charn receives the character index of the match, or the number of
   characters examined when there is no match. */
char *u8_strchr(const char *s, uint32_t ch, size_t *charn)
{
    size_t next = 0;

    for (*charn = 0; s[next]; (*charn)++) {
        const size_t start = next;

        if (u8_nextchar(s, &next) == ch) {
            /* it's const for us, but not necessarily the caller */
            return (char*)&s[start];
        }
    }
    return NULL;
}
404
405char *u8_memchr(const char *s, uint32_t ch, size_t sz, size_t *charn)
406{
407 size_t i = 0, lasti=0;
408 uint32_t c;
409 int csz;
410
411 *charn = 0;
412 while (i < sz) {
413 c = csz = 0;
414 do {
415 c <<= 6;
416 c += (unsigned char)s[i++];
417 csz++;
418 } while (i < sz && !isutf(s[i]));
419 c -= offsetsFromUTF8[csz-1];
420
421 if (c == ch) {
422 return (char*)&s[lasti];
423 }
424 lasti = i;
425 (*charn)++;
426 }
427 return NULL;
428}
429
430char *u8_memrchr(const char *s, uint32_t ch, size_t sz)
431{
432 size_t i = sz-1, tempi=0;
433 uint32_t c;
434
435 if (sz == 0) return NULL;
436
437 while (i && !isutf(s[i])) i--;
438
439 while (1) {
440 tempi = i;
441 c = u8_nextmemchar(s, &tempi);
442 if (c == ch) {
443 return (char*)&s[i];
444 }
445 if (i == 0)
446 break;
447 tempi = i;
448 u8_dec(s, &i);
449 if (i > tempi)
450 break;
451 }
452 return NULL;
453}
454
455/* based on the valid_utf8 routine from the PCRE library by Philip Hazel
456
457 length is in bytes, since without knowing whether the string is valid
458 it's hard to know how many characters there are! */
459int u8_isvalid(const char *str, size_t length)
460{
461 const unsigned char *p, *pend = (unsigned char*)str + length;
462 unsigned char c;
463 int ret = 1; /* ASCII */
464 size_t ab;
465
466 for (p = (unsigned char*)str; p < pend; p++) {
467 c = *p;
468 if (c < 128)
469 continue;
470 ret = 2; /* non-ASCII UTF-8 */
471 if ((c & 0xc0) != 0xc0)
472 return 0;
473 ab = trailingBytesForUTF8[c];
474 if (length < ab)
475 return 0;
476 length -= ab;
477
478 p++;
479 /* Check top bits in the second byte */
480 if ((*p & 0xc0) != 0x80)
481 return 0;
482
483 /* Check for overlong sequences for each different length */
484 switch (ab) {
485 /* Check for xx00 000x */
486 case 1:
487 if ((c & 0x3e) == 0) return 0;
488 continue; /* We know there aren't any more bytes to check */
489
490 /* Check for 1110 0000, xx0x xxxx */
491 case 2:
492 if (c == 0xe0 && (*p & 0x20) == 0) return 0;
493 break;
494
495 /* Check for 1111 0000, xx00 xxxx */
496 case 3:
497 if (c == 0xf0 && (*p & 0x30) == 0) return 0;
498 break;
499
500 /* Check for 1111 1000, xx00 0xxx */
501 case 4:
502 if (c == 0xf8 && (*p & 0x38) == 0) return 0;
503 break;
504
505 /* Check for leading 0xfe or 0xff,
506 and then for 1111 1100, xx00 00xx */
507 case 5:
508 if (c == 0xfe || c == 0xff ||
509 (c == 0xfc && (*p & 0x3c) == 0)) return 0;
510 break;
511 }
512
513 /* Check for valid bytes after the 2nd, if any; all must start 10 */
514 while (--ab > 0) {
515 if ((*(++p) & 0xc0) != 0x80) return 0;
516 }
517 }
518
519 return ret;
520}
521
/* Write the characters of src (len bytes) into dest in reverse order,
   keeping the bytes of each multi-byte sequence in their original order.
   dest must have room for len + 1 bytes; dest[len] is set to NUL.

   Returns 0 on success. Returns 1 when a byte that cannot start a sequence
   is found where a character should begin, or when the final sequence is
   cut short by len; dest contents are then unspecified apart from the
   terminator. */
int u8_reverse(char *dest, char *src, size_t len)
{
    size_t si = 0, di = len;

    dest[di] = '\0';
    while (si < len) {
        const unsigned char c = (unsigned char)src[si];
        size_t n;

        if ((c & 0x80) == 0) {
            n = 1;
        }
        else {
            /* Sequence length from the high nibble of the first byte. */
            switch (c >> 4) {
            case 0xC:
            case 0xD:
                n = 2;
                break;
            case 0xE:
                n = 3;
                break;
            case 0xF:
                n = 4;
                break;
            default:
                return 1;
            }
        }

        /* si + di == len throughout, so this one test guards both the read
           from src and the "di -= n" write position against running outside
           the buffers on a truncated sequence. */
        if (n > len - si)
            return 1;

        di -= n;
        memcpy(&dest[di], &src[si], n);
        si += n;
    }
    return 0;
}
static const double c[]
Definition rng.c:256