/*
 * Per-length correction constants used by the decoders in this file.
 * The decoders add the raw bytes of a sequence together (shifting left by 6
 * between bytes) and then subtract offsetsFromUTF8[index], where the index
 * is the number of trailing bytes (or sequence length minus one).
 * Each entry equals the lead-byte marker plus the 0x80 continuation markers
 * at their shifted positions, e.g.
 *   0x00003080 == (0xC0 << 6) + 0x80
 *   0x000E2080 == (0xE0 << 12) + (0x80 << 6) + 0x80
 *   0x03C82080 == (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80
 */
31static const uint32_t offsetsFromUTF8[6] = {
32 0x00000000UL, 0x00003080UL, 0x000E2080UL,
33 0x03C82080UL, 0xFA082080UL, 0x82082080UL
/*
 * Lookup table indexed by a byte value (0..255) giving the number of bytes
 * that follow it when it is treated as the first byte of a sequence:
 *   0x00-0xBF -> 0   (includes the 0x80-0xBF range)
 *   0xC0-0xDF -> 1
 *   0xE0-0xEF -> 2
 *   0xF0-0xF7 -> 3
 *   0xF8-0xFB -> 4
 *   0xFC-0xFF -> 5
 * Callers cast the byte to unsigned char before indexing.
 */
36static const char trailingBytesForUTF8[256] = {
37 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
38 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
39 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
40 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
41 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
42 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
43 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
44 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
/*
 * Return the length in bytes of the sequence whose first byte is s[0]:
 * the trailing-byte count from trailingBytesForUTF8 plus one for the first
 * byte itself. Only s[0] is read.
 */
48size_t u8_seqlen(
const char *s)
/* Cast through unsigned char so bytes >= 0x80 yield a non-negative index. */
50 return trailingBytesForUTF8[(
unsigned int)(
unsigned char)s[0]] + 1;
/*
 * Classify the 32-bit value ch by range and return a size for it.
 * NOTE(review): presumably the number of UTF-8 bytes needed to encode ch;
 * the returned values are not shown next to these branches - confirm.
 */
55size_t u8_charlen(uint32_t ch)
/* Range test: ch below 0x10000. */
61 else if (ch < 0x10000)
/* Range test: ch below 0x110000. */
63 else if (ch < 0x110000)
/*
 * Accumulate u8_charlen() over elements of wcstr.
 * NOTE(review): n is presumably the number of elements of wcstr to visit and
 * the accumulated total the return value - confirm.
 */
68size_t u8_codingsize(uint32_t *wcstr,
size_t n)
/* Add the size reported for element i to the running total c. */
73 c += u8_charlen(wcstr[i]);
/*
 * Read UTF-8 sequences from src (srcsz bytes) and compute one 32-bit value
 * per sequence.
 * NOTE(review): presumably each value is stored into dest (capacity sz
 * elements) and the count of stored values is returned - confirm.
 */
85size_t u8_toucs(uint32_t *dest,
size_t sz,
const char *src,
size_t srcsz)
/* One-past-the-end pointer of the input, used by the bounds checks below. */
88 const char *src_end = src + srcsz;
/* Guard for an empty destination or an empty source. */
92 if (sz == 0 || srcsz == 0)
/* Stop once the read pointer has reached the end of the input. */
99 if (src >= src_end)
break;
/* Number of bytes that follow the current first byte. */
102 nb = trailingBytesForUTF8[(
unsigned char)*src];
/* src + nb is the address of the sequence's last byte; it must lie before
 * src_end for the whole sequence to be inside the input. */
103 if (src + nb >= src_end)
/* Each case adds one raw byte to ch and, except for case 0, shifts ch left
 * by 6. NOTE(review): presumably the cases fall through so that nb + 1 bytes
 * are consumed in total - confirm. */
108 case 5: ch += (
unsigned char)*src++; ch <<= 6;
110 case 4: ch += (
unsigned char)*src++; ch <<= 6;
112 case 3: ch += (
unsigned char)*src++; ch <<= 6;
114 case 2: ch += (
unsigned char)*src++; ch <<= 6;
116 case 1: ch += (
unsigned char)*src++; ch <<= 6;
118 case 0: ch += (
unsigned char)*src++;
/* Remove the lead/continuation marker bits that were summed in with the
 * payload bits. */
120 ch -= offsetsFromUTF8[nb];
/*
 * Write UTF-8 encodings of 32-bit values into dest, which holds sz bytes.
 * Every write is preceded by a check against dest_end for the space it needs.
 * NOTE(review): ch is presumably read from src (srcsz elements) - confirm.
 */
132size_t u8_toutf8(
char *dest,
size_t sz,
const uint32_t *src,
size_t srcsz)
/* One-past-the-end pointer of the output buffer. */
137 char *dest_end = dest + sz;
/* No room left for even a single byte. */
142 if (dest >= dest_end)
/* Two-byte form: 110xxxxx 10xxxxxx. */
146 else if (ch < 0x800) {
/* Needs two bytes of room. */
147 if (dest >= dest_end-1)
149 *dest++ = (ch>>6) | 0xC0;
150 *dest++ = (ch & 0x3F) | 0x80;
/* Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx. */
152 else if (ch < 0x10000) {
/* Needs three bytes of room. */
153 if (dest >= dest_end-2)
155 *dest++ = (ch>>12) | 0xE0;
156 *dest++ = ((ch>>6) & 0x3F) | 0x80;
157 *dest++ = (ch & 0x3F) | 0x80;
/* Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
159 else if (ch < 0x110000) {
/* Needs four bytes of room. */
160 if (dest >= dest_end-3)
162 *dest++ = (ch>>18) | 0xF0;
163 *dest++ = ((ch>>12) & 0x3F) | 0x80;
164 *dest++ = ((ch>>6) & 0x3F) | 0x80;
165 *dest++ = (ch & 0x3F) | 0x80;
/*
 * Encode the single value ch as UTF-8 into dest[0..].
 * No buffer size is passed in; the visible writes reach dest[3], so dest
 * must have room for at least four bytes.
 * u8_unescape() uses the return value as the number of bytes to copy.
 */
172size_t u8_wc_toutf8(
char *dest, uint32_t ch)
/* Two-byte form. */
179 dest[0] = (ch>>6) | 0xC0;
180 dest[1] = (ch & 0x3F) | 0x80;
/* Three-byte form. */
184 dest[0] = (ch>>12) | 0xE0;
185 dest[1] = ((ch>>6) & 0x3F) | 0x80;
186 dest[2] = (ch & 0x3F) | 0x80;
/* Four-byte form. */
190 dest[0] = (ch>>18) | 0xF0;
191 dest[1] = ((ch>>12) & 0x3F) | 0x80;
192 dest[2] = ((ch>>6) & 0x3F) | 0x80;
193 dest[3] = (ch & 0x3F) | 0x80;
/*
 * Walk forward through s while charnum is positive, advancing a byte index.
 * NOTE(review): presumably converts a character index into a byte offset
 * and returns that offset - confirm.
 */
200size_t u8_offset(
const char *s,
size_t charnum)
204 while (charnum > 0) {
/* Short-circuit chain: increment i one to three times, stopping after the
 * first increment that lands on a byte for which isutf() is true; the third
 * increment is unconditional. The (void) cast discards the boolean result. */
206 (void)(isutf(s[++i]) || isutf(s[++i]) || ++i);
/*
 * Walk forward through s keeping a character counter and a byte index.
 * NOTE(review): presumably converts the byte offset into a character
 * index - confirm.
 */
214size_t u8_charnum(
const char *s,
size_t offset)
/* Both the character counter and the byte index start at zero. */
216 size_t charnum = 0, i=0;
/* Increment i one to three times, stopping after the first increment that
 * lands on a byte for which isutf() is true. */
220 (void)(isutf(s[++i]) || isutf(s[++i]) || ++i);
/*
 * Scan the NUL-terminated string s, stepping over multi-byte sequences.
 * NOTE(review): presumably returns the number of characters (not bytes)
 * in s - confirm.
 */
228size_t u8_strlen(
const char *s)
/* Stop if the byte at i is the terminator; i is advanced either way. */
238 if (s[i++]==0)
break;
/* Increment i one to three more times, stopping after the first increment
 * that lands on a byte for which isutf() is true. */
239 (void)(isutf(s[++i]) || isutf(s[++i]) || ++i);
/*
 * Decode the sequence of s that starts at byte index *i and advance *i.
 * Raw bytes are summed into ch and the marker bits are removed afterwards
 * via offsetsFromUTF8. The loop never advances past a NUL byte.
 */
246uint32_t u8_nextchar(
const char *s,
size_t *i)
/* Add the current raw byte to the accumulator. */
253 ch += (
unsigned char)s[(*i)];
255 }
/* Continue while the current byte is not NUL and, after advancing *i, the
 * new byte is one for which isutf() is false. */
while (s[*i] && (++(*i)) && !isutf(s[*i]));
/* sz - 1 selects the correction constant for this sequence. */
256 ch -= offsetsFromUTF8[sz-1];
/*
 * Decode the sequence of s that starts at byte index *i and advance *i past
 * the bytes consumed. Unlike u8_nextchar(), the loop condition does not test
 * for a NUL byte, and no length is passed in, so the caller is responsible
 * for keeping the reads inside the buffer.
 */
262uint32_t u8_nextmemchar(
const char *s,
size_t *i)
/* Add the current raw byte to the accumulator and step to the next byte. */
269 ch += (
unsigned char)s[(*i)++];
271 }
/* Keep consuming while the next byte is one for which isutf() is false. */
while (!isutf(s[*i]));
/* sz - 1 selects the correction constant for this sequence. */
272 ch -= offsetsFromUTF8[sz-1];
/*
 * Advance the byte index *i to the next position in s for which isutf() is
 * true, moving between one and four bytes.
 */
277void u8_inc(
const char *s,
size_t *i)
/* Short-circuit chain: pre-increment *i and test the byte up to three times;
 * if none of those bytes satisfies isutf(), a fourth increment is applied
 * unconditionally. The (void) cast discards the boolean result. */
279 (void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) || isutf(s[++(*i)]) || ++(*i));
/*
 * Move the byte index *i backwards to the previous position in s for which
 * isutf() is true, moving between one and four bytes.
 * No lower bound is checked here; *i is a size_t, so decrementing it from 0
 * wraps around. Callers must ensure *i is far enough from the start.
 */
282void u8_dec(
const char *s,
size_t *i)
/* Short-circuit chain: pre-decrement *i and test the byte up to three times;
 * if none of those bytes satisfies isutf(), a fourth decrement is applied
 * unconditionally. */
284 (void)(isutf(s[--(*i)]) || isutf(s[--(*i)]) || isutf(s[--(*i)]) || --(*i));
/* Return nonzero when c is an octal digit ('0' through '7'), zero otherwise. */
287int octal_digit(
char c)
289 return (
c >=
'0' &&
c <=
'7');
/* Nonzero when c is a hexadecimal digit: '0'-'9', 'A'-'F' or 'a'-'f'. */
294 return ((
c >=
'0' &&
c <=
'9') ||
295 (
c >=
'A' &&
c <=
'F') ||
296 (
c >=
'a' &&
c <=
'f'));
/*
 * Map the character c to a char result; u8_read_escape_sequence() calls this
 * for escape characters that are neither octal digits nor x/u/U.
 * NOTE(review): presumably translates escape letters such as 'n' or 't' to
 * the corresponding control characters - confirm against the body.
 */
299char read_escape_control_char(
char c)
/*
 * Parse one escape sequence body from str (at most ssz bytes) into a 32-bit
 * value. Three forms are handled: up to three octal digits; 'x', 'u' or 'U'
 * followed by up to 2, 4 or 8 hex digits respectively; and any other
 * character, which is passed to read_escape_control_char().
 * Returns 0 when x/u/U is not followed by any hex digit.
 * NOTE(review): presumably the value is stored through dest and the number
 * of bytes consumed is returned - confirm.
 */
322size_t u8_read_escape_sequence(
const char *str,
size_t ssz, uint32_t *dest)
/* Octal form: the first character itself is a digit. */
331 if (octal_digit(c0)) {
334 digs[dno++] = str[i++];
335 }
/* Collect at most three octal digits without reading past ssz. */
while (i<ssz && octal_digit(str[i]) && dno<3);
337 ch = strtol(digs, NULL, 8);
/* Hex forms: the assignments inside the condition set the maximum digit
 * count for the matched prefix (each assigned value is nonzero, so the &&
 * succeeds whenever the prefix matches). */
339 else if ((c0==
'x' && (ndig=2)) ||
340 (c0==
'u' && (ndig=4)) ||
341 (c0==
'U' && (ndig=8))) {
342 while (i<ssz && hex_digit(str[i]) && dno<ndig) {
343 digs[dno++] = str[i++];
/* A prefix with no digits after it is not a valid escape. */
345 if (dno == 0)
return 0;
/* NOTE(review): strtol returns long; where long is 32 bits wide, an 8-digit
 * value above 0x7FFFFFFF is out of range and strtol returns LONG_MAX.
 * strtoul would avoid that - confirm whether such inputs matter. */
347 ch = strtol(digs, NULL, 16);
/* Any other character: single-character escape. */
350 ch = (uint32_t)read_escape_control_char(c0);
/*
 * Copy src into buf (capacity sz bytes) while converting escape sequences:
 * each sequence is parsed with u8_read_escape_sequence(), the resulting
 * value is encoded with u8_wc_toutf8() into a temporary, and the encoded
 * bytes are copied into buf at index c.
 */
360size_t u8_unescape(
char *buf,
size_t sz,
const char *src)
/* Stop at the end of src or when the output index reaches sz. */
366 while (*src &&
c < sz) {
/* NOTE(review): the length passed is the constant 1000, not the number of
 * bytes actually remaining in src; the parser's own digit tests are what
 * stop it at the terminator - confirm this is intended. */
369 amt = u8_read_escape_sequence(src, 1000, &ch);
/* Encode the parsed value; amt becomes the number of bytes to copy. */
376 amt = u8_wc_toutf8(temp, ch);
379 memcpy(&buf[
c], temp, amt);
/*
 * Search the string s for the value ch by decoding it one character at a
 * time with u8_nextchar(). The result points at the byte index saved in
 * lasti; the const qualifier of s is cast away on return.
 * NOTE(review): lasti presumably holds the start of the character just
 * decoded, and charn presumably receives its character index - confirm.
 */
387char *u8_strchr(
const char *s, uint32_t ch,
size_t *charn)
389 size_t i = 0, lasti=0;
/* Decode the character at byte index i; i is advanced by the call. */
394 c = u8_nextchar(s, &i);
397 return (
char*)&s[lasti];
/*
 * Search the first sz bytes of s for the value ch. Characters are decoded
 * inline (same summing scheme as u8_nextchar) with the byte index bounded by
 * sz. The result points at the byte index saved in lasti; the const
 * qualifier of s is cast away on return.
 * NOTE(review): charn presumably receives the character index - confirm.
 */
405char *u8_memchr(
const char *s, uint32_t ch,
size_t sz,
size_t *charn)
407 size_t i = 0, lasti=0;
/* Add the current raw byte to the accumulator and step to the next byte. */
416 c += (
unsigned char)s[i++];
418 }
/* Keep consuming while still inside the buffer and the next byte is one for
 * which isutf() is false. */
while (i < sz && !isutf(s[i]));
/* csz - 1 selects the correction constant for this sequence. */
419 c -= offsetsFromUTF8[csz-1];
422 return (
char*)&s[lasti];
/*
 * Search the first sz bytes of s for the value ch, working from the end of
 * the buffer. Returns NULL when sz is 0.
 */
430char *u8_memrchr(
const char *s, uint32_t ch,
size_t sz)
/* For sz == 0 this initializer wraps around (size_t is unsigned); the
 * sz == 0 check below returns before i is used. */
432 size_t i = sz-1, tempi=0;
435 if (sz == 0)
return NULL;
/* Back up to a byte for which isutf() is true, without going below index 0. */
437 while (i && !isutf(s[i])) i--;
/* Decode using a scratch index that the call advances. */
441 c = u8_nextmemchar(s, &tempi);
/*
 * Check the length bytes starting at str for well-formed UTF-8.
 * Returns 0 on each of the failure paths visible below.
 * NOTE(review): the values returned for acceptable input are not shown
 * beside these checks - confirm.
 */
459int u8_isvalid(
const char *str,
size_t length)
/* pend is the one-past-the-end pointer of the input. */
461 const unsigned char *p, *pend = (
unsigned char*)str + length;
466 for (p = (
unsigned char*)str; p < pend; p++) {
/* A byte whose top two bits are not both set cannot be a multi-byte lead
 * byte. */
471 if ((
c & 0xc0) != 0xc0)
/* Number of additional bytes expected after lead byte c. */
473 ab = trailingBytesForUTF8[
c];
/* The following byte must have the 10xxxxxx continuation form. */
480 if ((*p & 0xc0) != 0x80)
/* Lead byte with bits 1-5 all zero (0xC0 or 0xC1): overlong two-byte form. */
487 if ((
c & 0x3e) == 0)
return 0;
/* 0xE0 followed by a byte with bit 5 clear: overlong three-byte form. */
492 if (
c == 0xe0 && (*p & 0x20) == 0)
return 0;
/* 0xF0 followed by a byte with bits 4-5 clear: overlong four-byte form. */
497 if (
c == 0xf0 && (*p & 0x30) == 0)
return 0;
/* 0xF8 followed by a byte with bits 3-5 clear: overlong five-byte form. */
502 if (
c == 0xf8 && (*p & 0x38) == 0)
return 0;
/* 0xFE and 0xFF are rejected outright; 0xFC followed by a byte with
 * bits 2-5 clear is an overlong six-byte form. */
508 if (
c == 0xfe ||
c == 0xff ||
509 (
c == 0xfc && (*p & 0x3c) == 0))
return 0;
/* Advance to the next byte and require the continuation form. */
515 if ((*(++p) & 0xc0) != 0x80)
return 0;
/*
 * Copy multi-byte groups from src (index si) into dest (index di), keeping
 * the bytes of each group in their original order.
 * NOTE(review): presumably writes the characters of src into dest in
 * reverse order; how si and di move is not shown beside these copies -
 * confirm.
 */
522int u8_reverse(
char *dest,
char * src,
size_t len)
/* Current source byte, read as unsigned. */
529 c = (
unsigned char)src[si];
/* Copy two bytes starting at the current positions. */
540 memcpy(&dest[di], &src[si],
sizeof(int16_t));
/* Copy two bytes starting one past the current positions.
 * NOTE(review): presumably the tail of a three-byte sequence whose first
 * byte is copied separately - confirm. */
546 memcpy(&dest[di+1], &src[si+1],
sizeof(int16_t));
/* Copy four bytes starting at the current positions. */
551 memcpy(&dest[di], &src[si],
sizeof(int32_t));