naev 0.12.6
lutf8lib.c
1// clang-format off
2/*
3 * From: https://github.com/starwing/luautf8/releases/tag/0.1.6
4
5MIT License
6
7Copyright (c) 2018 Xavier Wang
8
9Permission is hereby granted, free of charge, to any person obtaining a copy
10of this software and associated documentation files (the "Software"), to deal
11in the Software without restriction, including without limitation the rights
12to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13copies of the Software, and to permit persons to whom the Software is
14furnished to do so, subject to the following conditions:
15
16The above copyright notice and this permission notice shall be included in all
17copies or substantial portions of the Software.
18
19THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25SOFTWARE.
26*/
27#include "lutf8lib.h"
28
30#include <assert.h>
31#include <string.h>
32#include <stdint.h>
33#include <limits.h>
34#include <stdlib.h>
36
37#include "unidata.h"
38
39/* UTF-8 string operations */
40
/* Size in bytes of the scratch buffer that utf8_encode() fills from the back. */
#define UTF8_BUFFSZ 8
/* Largest value accepted by utf8_encode() and produced by utf8_decode(). */
#define UTF8_MAX 0x7FFFFFFFu
/* Largest code point treated as valid by utf8_invalid() and strict decoding. */
#define UTF8_MAXCP 0x10FFFFu
/* Non-zero when the byte pointed to by p has the form 10xxxxxx
 * (a UTF-8 continuation byte). */
#define iscont(p) ((*(p) & 0xC0) == 0x80)
/* Explicit cast of expr to type tp. */
#define CAST(tp,expr) ((tp)(expr))

/* Provide LUA_QL (wraps a string literal in single quotes) when it is not
 * already defined. */
#ifndef LUA_QL
# define LUA_QL(x) "'" x "'"
#endif
50
51static int utf8_invalid (utfint ch)
52{ return (ch > UTF8_MAXCP || (0xD800u <= ch && ch <= 0xDFFFu)); }
53
54static size_t utf8_encode (char *buff, utfint x) {
55 int n = 1; /* number of bytes put in buffer (backwards) */
56 lua_assert(x <= UTF8_MAX);
57 if (x < 0x80) /* ascii? */
58 buff[UTF8_BUFFSZ - 1] = x & 0x7F;
59 else { /* need continuation bytes */
60 utfint mfb = 0x3f; /* maximum that fits in first byte */
61 do { /* add continuation bytes */
62 buff[UTF8_BUFFSZ - (n++)] = 0x80 | (x & 0x3f);
63 x >>= 6; /* remove added bits */
64 mfb >>= 1; /* now there is one less bit available in first byte */
65 } while (x > mfb); /* still needs continuation byte? */
66 buff[UTF8_BUFFSZ - n] = ((~mfb << 1) | x) & 0xFF; /* add first byte */
67 }
68 return n;
69}
70
71static const char *utf8_decode (const char *s, utfint *val, int strict) {
72 static const utfint limits[] =
73 {~0u, 0x80u, 0x800u, 0x10000u, 0x200000u, 0x4000000u};
74 unsigned int c = (unsigned char)s[0];
75 utfint res = 0; /* final result */
76 if (c < 0x80) /* ascii? */
77 res = c;
78 else {
79 int count = 0; /* to count number of continuation bytes */
80 for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */
81 unsigned int cc = (unsigned char)s[++count]; /* read next byte */
82 if ((cc & 0xC0) != 0x80) /* not a continuation byte? */
83 return NULL; /* invalid byte sequence */
84 res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
85 }
86 res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */
87 if (count > 5 || res > UTF8_MAX || res < limits[count])
88 return NULL; /* invalid byte sequence */
89 s += count; /* skip continuation bytes read */
90 }
91 if (strict) {
92 /* check for invalid code points; too large or surrogates */
93 if (res > UTF8_MAXCP || (0xD800u <= res && res <= 0xDFFFu))
94 return NULL;
95 }
96 if (val) *val = res;
97 return s + 1; /* +1 to include first byte */
98}
99
/* Returns the start of the character that ends at e, searching no further
 * back than s: trailing continuation bytes are skipped, then one more byte
 * is stepped over. Returns s when nothing precedes e. */
static const char *utf8_prev (const char *s, const char *e) {
  const char *p = e;
  for (; p > s; --p) {
    if (!iscont(p - 1))
      break;  /* p - 1 is a lead (or ASCII) byte */
  }
  if (p > s)
    return p - 1;
  return s;
}
104
/* Returns the start of the character following the one at s, never going
 * beyond e. Note that the byte at p + 1 is inspected while p < e, so the
 * byte AT e itself may be read. */
static const char *utf8_next (const char *s, const char *e) {
  const char *p = s;
  for (; p < e; ++p) {
    if (!iscont(p + 1))
      break;  /* the next byte starts a new character */
  }
  if (p < e)
    return p + 1;
  return e;
}
109
/* Counts how many utf8_next() steps are needed to walk from s to e. */
static size_t utf8_length (const char *s, const char *e) {
  size_t count = 0;
  while (s < e) {
    s = utf8_next(s, e);
    ++count;
  }
  return count;
}
116
117static const char *utf8_offset (const char *s, const char *e, lua_Integer offset, lua_Integer idx) {
118 const char *p = s + offset - 1;
119 if (idx >= 0) {
120 while (p < e && idx > 0)
121 p = utf8_next(p, e), --idx;
122 return idx == 0 ? p : NULL;
123 } else {
124 while (s < p && idx < 0)
125 p = utf8_prev(s, p), ++idx;
126 return idx == 0 ? p : NULL;
127 }
128}
129
/* Resolves a character index into a pointer inside [s, e).
 * Non-negative indices count from the start (1 = first character);
 * negative indices count back from e. Returns NULL when the index cannot be
 * reached (see utf8_offset). */
static const char *utf8_relat (const char *s, const char *e, int idx) {
  if (idx < 0)
    return utf8_offset(s, e, e-s+1, idx);
  return utf8_offset(s, e, 1, idx - 1);
}
135
136static int utf8_range(const char *s, const char *e, lua_Integer *i, lua_Integer *j) {
137 const char *ps = utf8_relat(s, e, CAST(int, *i));
138 const char *pe = utf8_relat(s, e, CAST(int, *j));
139 *i = (ps ? ps : (*i > 0 ? e : s)) - s;
140 *j = (pe ? utf8_next(pe, e) : (*j > 0 ? e : s)) - s;
141 return *i < *j;
142}
143
/* Indexed by top nibble of first byte in code unit.
 * Entries 0x8..0xB (continuation bytes) hold 0xFF as a "not a lead byte"
 * marker; utf8_invalid_offset() rejects those bytes before consulting the
 * table. The table is read-only. */
static const uint8_t utf8_code_unit_len[] = {
  1, 1, 1, 1, 1, 1, 1, 1, 0xFF, 0xFF, 0xFF, 0xFF, 2, 2, 3, 4
};

/* Return pointer to first invalid UTF-8 sequence in 's', or NULL if valid.
 * 'e' points one past the last byte to examine. Overlong forms, surrogates
 * (U+D800-U+DFFF), codepoints above U+10FFFF and truncated sequences are all
 * reported as invalid. */
static const char *utf8_invalid_offset(const char *s, const char *e) {
  while (s < e) {
    uint8_t c = (uint8_t)*s;
    if (c < 0x80) {  /* ASCII byte: always fine */
      s++;
      continue;
    }
    /* c < 0xC0 means a continuation byte, but we are not in the middle of a multi-byte code unit
     * c >= 0xC0 && c < 0xC2 means an overlong 2-byte code unit
     * c >= 0xF8 means a 5-byte or 6-byte code unit, which is illegal, or else illegal byte 0xFE/0xFF
     * c >= 0xF5 && c < 0xF8 means a 4-byte code unit encoding invalid codepoint > U+10FFFF */
    if (c < 0xC2 || c >= 0xF5)
      return s;
    uint8_t needed_bytes = utf8_code_unit_len[c >> 4];
    if (e - s < needed_bytes)
      return s; /* String is truncated */
    uint8_t c2 = (uint8_t)s[1];
    if ((c2 & 0xC0) != 0x80)
      return s; /* 2nd byte of code unit is not a continuation byte */
    if (needed_bytes >= 3) {
      uint8_t c3 = (uint8_t)s[2];
      if ((c3 & 0xC0) != 0x80)
        return s; /* 3rd byte of code unit is not a continuation byte */
      if (needed_bytes == 3) {
        if (c == 0xE0 && c2 < 0xA0)
          return s; /* Overlong 3-byte code unit */
        if (c == 0xED && c2 >= 0xA0)
          return s; /* Reserved codepoint from U+D800-U+DFFF */
      } else {
        uint8_t c4 = (uint8_t)s[3];
        if ((c4 & 0xC0) != 0x80)
          return s; /* 4th byte of code unit is not a continuation byte */
        if (c == 0xF0 && c2 < 0x90)
          return s; /* Overlong 4-byte code unit */
        if (c == 0xF4 && c2 >= 0x90)
          return s; /* Illegal codepoint > U+10FFFF */
      }
    }
    s += needed_bytes;
  }
  return NULL;
}
192
193/* Unicode character categories */
194
/* Element count of a true array `t` (meaningless for pointers). */
#define table_size(t) (sizeof(t) / sizeof((t)[0]))

/* X-macro list of character categories.
 * ENTRY is invoked once per category as ENTRY(code, stem), where `code` is a
 * single-character identifier and `stem` names the category. */
#define utf8_categories(ENTRY) \
  ENTRY('a', alpha)   \
  ENTRY('c', cntrl)   \
  ENTRY('d', digit)   \
  ENTRY('l', lower)   \
  ENTRY('p', punct)   \
  ENTRY('s', space)   \
  ENTRY('t', compose) \
  ENTRY('u', upper)   \
  ENTRY('x', xdigit)

/* X-macro list of case converters; ENTRY is invoked as ENTRY(stem). */
#define utf8_converters(ENTRY) \
  ENTRY(lower) \
  ENTRY(upper) \
  ENTRY(title) \
  ENTRY(fold)
213
214static int find_in_range (range_table *t, size_t size, utfint ch) {
215 size_t begin, end;
216
217 begin = 0;
218 end = size;
219
220 while (begin < end) {
221 size_t mid = (begin + end) / 2;
222 if (t[mid].last < ch)
223 begin = mid + 1;
224 else if (t[mid].first > ch)
225 end = mid;
226 else
227 return (ch - t[mid].first) % t[mid].step == 0;
228 }
229
230 return 0;
231}
232
233static int convert_char (conv_table *t, size_t size, utfint ch) {
234 size_t begin, end;
235
236 begin = 0;
237 end = size;
238
239 while (begin < end) {
240 size_t mid = (begin + end) / 2;
241 if (t[mid].last < ch)
242 begin = mid + 1;
243 else if (t[mid].first > ch)
244 end = mid;
245 else if ((ch - t[mid].first) % t[mid].step == 0)
246 return ch + t[mid].offset;
247 else
248 return ch;
249 }
250
251 return ch;
252}
253
254/* Normalization */
255
256static int lookup_canon_cls (utfint ch) {
257 /* The first codepoint with canonicalization class != 0 is U+0300 COMBINING GRAVE ACCENT */
258 if (ch < 0x300) {
259 return 0;
260 }
261 size_t begin = 0, end = table_size(nfc_combining_table);
262
263 while (begin < end) {
264 size_t mid = (begin + end) / 2;
265 if (nfc_combining_table[mid].last < ch)
266 begin = mid + 1;
267 else if (nfc_combining_table[mid].first > ch)
268 end = mid;
269 else
270 return nfc_combining_table[mid].canon_cls;
271 }
272
273 return 0;
274}
275
276static nfc_table *nfc_quickcheck (utfint ch) {
277 /* The first character which needs to be checked for possible NFC violations
278 * is U+0300 COMBINING GRAVE ACCENT */
279 if (ch < 0x300) {
280 return NULL;
281 }
282 size_t begin = 0, end = table_size(nfc_quickcheck_table);
283
284 while (begin < end) {
285 size_t mid = (begin + end) / 2;
286 utfint found = nfc_quickcheck_table[mid].cp;
287 if (found < ch)
288 begin = mid + 1;
289 else if (found > ch)
290 end = mid;
291 else
292 return &nfc_quickcheck_table[mid];
293 }
294
295 return NULL;
296}
297
298static int nfc_combine (utfint cp1, utfint cp2, utfint *dest) {
299 size_t begin = 0, end = table_size(nfc_composite_table);
300 unsigned int hash = (cp1 * 213) + cp2;
301
302 while (begin < end) {
303 size_t mid = (begin + end) / 2;
304 utfint val = nfc_composite_table[mid].hash;
305 if (val < hash) {
306 begin = mid + 1;
307 } else if (val > hash) {
308 end = mid;
309 } else if (nfc_composite_table[mid].cp1 == cp1 && nfc_composite_table[mid].cp2 == cp2) {
310 if (dest)
311 *dest = nfc_composite_table[mid].dest;
312 return 1;
313 } else {
314 return 0;
315 }
316 }
317
318 return 0;
319}
320
321static decompose_table *nfc_decompose (utfint ch) {
322 size_t begin = 0, end = table_size(nfc_decompose_table);
323
324 while (begin < end) {
325 size_t mid = (begin + end) / 2;
326 utfint found = nfc_decompose_table[mid].cp;
327 if (found < ch)
328 begin = mid + 1;
329 else if (found > ch)
330 end = mid;
331 else
332 return &nfc_decompose_table[mid];
333 }
334
335 return NULL;
336}
337
338static int nfc_check (utfint ch, nfc_table *entry, utfint starter, unsigned int canon_cls, unsigned int prev_canon_cls) {
339 int reason = entry->reason;
340
341 if (reason == REASON_MUST_CONVERT_1 || reason == REASON_MUST_CONVERT_2) {
342 /* This codepoint has a different, canonical form, so this string is not NFC */
343 return 0;
344 } else if (reason == REASON_STARTER_CAN_COMBINE) {
345 /* It is possible that this 'starter' codepoint should have been combined with the
346 * preceding 'starter' codepoint; if so, this string is not NFC */
347 if (!prev_canon_cls && nfc_combine(starter, ch, NULL)) {
348 /* These codepoints should have been combined */
349 return 0;
350 }
351 } else if (reason == REASON_COMBINING_MARK) {
352 /* Combining mark; check if it should have been combined with preceding starter codepoint */
353 if (canon_cls <= prev_canon_cls) {
354 return 1;
355 }
356 if (nfc_combine(starter, ch, NULL)) {
357 /* Yes, they should have been combined. This string is not NFC */
358 return 0;
359 }
360 /* Could it be that preceding 'starter' codepoint is already combined, but with a
361 * combining mark which is out of order with this one? */
362 decompose_table *decomp = nfc_decompose(starter);
363 if (decomp) {
364 if (decomp->canon_cls2 > canon_cls && nfc_combine(decomp->to1, ch, NULL)) {
365 return 0;
366 } else {
367 decompose_table *decomp2 = nfc_decompose(decomp->to1);
368 if (decomp2 && decomp2->canon_cls2 > canon_cls && nfc_combine(decomp2->to1, ch, NULL)) {
369 return 0;
370 }
371 }
372 }
373 } else if (reason == REASON_JAMO_VOWEL) {
374 if (!prev_canon_cls && starter >= 0x1100 && starter <= 0x1112) {
375 /* Preceding codepoint was a leading jamo; they should have been combined */
376 return 0;
377 }
378 } else if (reason == REASON_JAMO_TRAILING) {
379 if (!prev_canon_cls && starter >= 0xAC00 && starter <= 0xD7A3) {
380 /* Preceding codepoint was a precomposed Hangul syllable; check if it had no trailing jamo */
381 if ((starter - 0xAC00) % 28 == 0) {
382 /* It didn't have a trailing jamo, so this trailing jamo should have been combined */
383 return 0;
384 }
385 }
386 }
387
388 return 1;
389}
390
/* Merges two runs, each already ordered by the low 8 bits of its elements
 * (the canonicalization class), into dest. On ties the element from src1 is
 * taken first, which keeps the merge stable. dest must not overlap the
 * inputs and must have room for size1 + size2 elements. */
static void merge_combining_marks (uint32_t *src1, uint32_t *src2, uint32_t *dest, size_t size1, size_t size2) {
  uint32_t *end1 = src1 + size1;
  uint32_t *end2 = src2 + size2;

  while (src1 < end1 && src2 < end2) {
    if ((*src2 & 0xFF) < (*src1 & 0xFF))
      *dest++ = *src2++;
    else
      *dest++ = *src1++;
  }
  while (src1 < end1)
    *dest++ = *src1++;
  while (src2 < end2)
    *dest++ = *src2++;
}

/* Stable in-place sort of `size` elements of `vector` by their low 8 bits
 * (canonicalization class).
 *
 * vector  - elements to sort; holds the sorted result on return
 * scratch - work buffer with room for `size` elements
 * size    - number of elements; 0 and 1 are accepted and are no-ops
 */
static void stable_sort_combining_marks (uint32_t *vector, uint32_t *scratch, size_t size) {
  /* We need to use a stable sort for sorting combining marks which are in the wrong order
   * when doing NFC normalization; bottom-up merge sort is fast and stable */

  /* Nothing to order. This guard also keeps `size - 1` style arithmetic from
   * wrapping around when size == 0. */
  if (size < 2)
    return;

  /* Pass 0: order every adjacent pair, giving sorted runs of length 2 */
  for (size_t i = 0; i + 1 < size; i += 2) {
    if ((vector[i] & 0xFF) > (vector[i+1] & 0xFF)) {
      uint32_t temp = vector[i];
      vector[i] = vector[i+1];
      vector[i+1] = temp;
    }
  }
  if (size <= 2)
    return;

  uint32_t *src = vector, *dest = scratch;
  size_t runsize = 2; /* Every consecutive slice of this size is sorted */
  while (runsize < size) {
    size_t blocksize = runsize * 2; /* We will now sort slices of this size */
    size_t limit = size & ~(blocksize - 1); /* end of the last full block */
    for (size_t i = 0; i < limit; i += blocksize)
      merge_combining_marks(&src[i], &src[i+runsize], &dest[i], runsize, runsize);
    if (size - limit > runsize) {
      /* Tail holds one full run plus a shorter one: merge them */
      merge_combining_marks(&src[limit], &src[limit+runsize], &dest[limit], runsize, size - limit - runsize);
    } else {
      /* Tail is a single (already sorted) run: carry it over unchanged */
      memcpy(&dest[limit], &src[limit], (size - limit) * sizeof(uint32_t));
    }
    /* After each series of (progressively larger) merges, we swap src & dest to
     * avoid memcpy'ing the partially sorted results from dest back into src */
    uint32_t *temp = src; src = dest; dest = temp;
    runsize = blocksize;
  }

  if (dest == vector) {
    /* Since src & dest are swapped on each iteration of the above loop,
     * this actually means the last buffer which was written into
     * was 'scratch' */
    memcpy(vector, scratch, size * sizeof(uint32_t));
  }
}
450
/* Moves the element at index `i` toward the front or the back of `vector`
 * until its low 8 bits (canonicalization class) are in order with its
 * neighbours. It only moves past elements with a strictly different class,
 * so elements of equal class keep their relative order.
 * `vec_size` is the number of elements in `vector`. */
static void stable_insert_combining_mark (uint32_t *vector, size_t vec_size, unsigned int i)
{
  const unsigned int moved = vector[i];
  const unsigned int cls = moved & 0xFF;
  unsigned int pos = i;

  /* Try moving toward the front first */
  if (pos > 0 && cls < (vector[pos-1] & 0xFF)) {
    while (pos > 0 && cls < (vector[pos-1] & 0xFF)) {
      vector[pos] = vector[pos-1];
      pos--;
    }
    vector[pos] = moved;
    return;
  }

  /* Otherwise try moving toward the back */
  if (pos < vec_size-1 && cls > (vector[pos+1] & 0xFF)) {
    while (pos < vec_size-1 && cls > (vector[pos+1] & 0xFF)) {
      vector[pos] = vector[pos+1];
      pos++;
    }
    vector[pos] = moved;
  }
}
477
478static void add_utf8char (luaL_Buffer *b, utfint ch);
479
/* Ensures that *vector can hold more than `needed` elements.
 *
 * vector  - in/out: current buffer; replaced by a larger malloc'd buffer
 *           when it is too small. The first *size elements are copied over.
 * onstack - address of the caller's stack array; the old buffer is free()d
 *           only when it is not this array.
 * size    - in/out: capacity of *vector, in elements.
 * needed  - element count the caller is about to require.
 *
 * Nothing happens while needed < *size. Otherwise the capacity is doubled,
 * or raised to needed + 1 if doubling would still be too small.
 * Allocation failure cannot be reported through this interface, so the
 * process is aborted rather than continuing with a NULL buffer. */
static inline void grow_vector_if_needed (uint32_t **vector, uint32_t *onstack, size_t *size, size_t needed)
{
  size_t current_size = *size;
  size_t new_size;
  uint32_t *new_vector;

  if (needed < current_size)
    return;

  new_size = current_size * 2;
  if (new_size <= needed)
    new_size = needed + 1;  /* doubling was not enough (or overflowed) */

  /* Guard the byte-count multiplication against overflow */
  if (new_size > SIZE_MAX / sizeof *new_vector)
    abort();
  new_vector = malloc(new_size * sizeof *new_vector);
  if (new_vector == NULL)
    abort();

  memcpy(new_vector, *vector, current_size * sizeof *new_vector);
  if (*vector != onstack)
    free(*vector);
  *vector = new_vector;
  *size = new_size;
}
493
494static void string_to_nfc (lua_State *L, luaL_Buffer *buff, const char *s, const char *e)
495{
496 /* Converting a string to Normal Form C involves:
497 * 1) Ensuring that codepoints with "built-in" accents are used whenever possible
498 * rather than separate codepoints for a base character and combining mark
499 * 2) Where combining marks must be used, putting them into canonical order
500 * 3) Converting some deprecated codepoints to the recommended variant
501 * 4) Ensuring that Korean Hangul are represented as precomposed syllable
502 * codepoints whenever possible, rather than sequences of Jamo codepoints
503 *
504 * (Combining marks are accents which appear on top of or below the preceding
505 * character. Starter codepoints are the base characters which combining marks can
506 * 'combine' with. Almost all codepoints are starters, including all the Latin alphabet.
507 * Every Unicode codepoint has a numeric 'canonicalization class'; starters have class = 0.
508 * Combining marks must be sorted in order of their canonicalization class. Since the
509 * canonicalization class numbers are not unique, the sort must be stable.)
510 *
511 * When converting to NFC, the largest scope which we need to work on at once
512 * consists of a 'starter' codepoint and either 1 or more ensuing combining marks,
513 * OR else a directly following starter codepoint.
514 *
515 * As we walk through the string, whenever we pass by a complete sequence of starter +
516 * combining marks or starter + starter, we process that sequence to see if it is NFC or not.
517 * If it is, we memcpy the bytes verbatim into the output buffer. If it is not, then we
518 * convert the codepoints to NFC and then emit those codepoints as UTF-8 bytes. */
519
520 utfint starter = -1, ch; /* 'starter' is last starter codepoint seen */
521 const char *to_copy = s; /* pointer to next bytes we might need to memcpy into output buffer */
522 unsigned int prev_canon_cls = 0, canon_cls = 0;
523 int fixedup = 0; /* has the sequence currently under consideration been modified to make it NFC? */
524
525 /* Temporary storage for a sequence of consecutive combining marks
526 * In the vast majority of cases, this small on-stack array will provide enough
527 * space; if not, we will switch to a malloc'd buffer */
528 uint32_t onstack[8];
529 size_t vec_size = 0, vec_max = sizeof(onstack)/sizeof(uint32_t);
530 uint32_t *vector = onstack;
531
532 while (s < e) {
533 const char *new_s = utf8_decode(s, &ch, 1);
534 if (new_s == NULL) {
535 if (vector != onstack)
536 free(vector);
537 lua_pushstring(L, "string is not valid UTF-8");
538 lua_error(L);
539 }
540 unsigned int canon_cls = lookup_canon_cls(ch);
541
542 if (!canon_cls) {
543 /* This is a starter codepoint */
544 nfc_table *entry = nfc_quickcheck(ch);
545
546 /* But in rare cases, a deprecated 'starter' codepoint may convert
547 * to combining marks instead!
548 * Why, oh why, did the Unicode Consortium do this?? */
549 if (entry && entry->reason == REASON_MUST_CONVERT_2) {
550 utfint conv1 = entry->data1;
551 unsigned int canon_cls1 = lookup_canon_cls(conv1);
552 if (canon_cls1) {
553 utfint conv2 = entry->data2;
554 unsigned int canon_cls2 = lookup_canon_cls(conv2);
555 grow_vector_if_needed(&vector, onstack, &vec_max, vec_size + 2);
556 vector[vec_size++] = (conv1 << 8) | (canon_cls1 & 0xFF);
557 vector[vec_size++] = (conv2 << 8) | (canon_cls2 & 0xFF);
558 s = new_s;
559 prev_canon_cls = canon_cls2;
560 fixedup = 1;
561 continue;
562 }
563 }
564
565 /* Handle preceding starter and optional sequence of combining marks which may have followed it */
566 if (prev_canon_cls) {
567 /* Before this starter, there was a sequence of combining marks.
568 * Check those over and emit output to 'buff' */
569process_combining_marks:
570
571 /* Check if accumulated combining marks were in correct order */
572 for (unsigned int i = 1; i < vec_size; i++) {
573 if ((vector[i-1] & 0xFF) > (vector[i] & 0xFF)) {
574 /* Order is incorrect, we need to sort */
575 uint32_t *scratch = malloc(vec_size * sizeof(uint32_t));
576 stable_sort_combining_marks(vector, scratch, vec_size);
577 free(scratch);
578 fixedup = 1;
579 break;
580 }
581 }
582
583 /* Check if any of those combining marks are in violation of NFC */
584 unsigned int i = 0;
585 while (i < vec_size) {
586 utfint combine_mark = vector[i] >> 8;
587 nfc_table *mark_entry = nfc_quickcheck(combine_mark);
588 if (mark_entry) {
589 if (mark_entry->reason == REASON_MUST_CONVERT_1) {
590 /* This combining mark must be converted to a different one */
591 vector[i] = (mark_entry->data1 << 8) | mark_entry->data2;
592 fixedup = 1;
593 continue;
594 } else if (mark_entry->reason == REASON_MUST_CONVERT_2) {
595 /* This combining mark must be converted to two others */
596 grow_vector_if_needed(&vector, onstack, &vec_max, vec_size + 1);
597 memmove(&vector[i+2], &vector[i+1], sizeof(uint32_t) * (vec_size - i - 1));
598 vector[i] = (mark_entry->data1 << 8) | lookup_canon_cls(mark_entry->data1);
599 vector[i+1] = (mark_entry->data2 << 8) | lookup_canon_cls(mark_entry->data2);
600 vec_size++;
601 fixedup = 1;
602 continue;
603 } else if (mark_entry->reason == REASON_COMBINING_MARK) {
604 unsigned int mark_canon_cls = vector[i] & 0xFF;
605 if (i == 0 || mark_canon_cls > (vector[i-1] & 0xFF)) {
606 if (nfc_combine(starter, combine_mark, &starter)) {
607 /* This combining mark must be combined with preceding starter */
608 vec_size--;
609 memmove(&vector[i], &vector[i+1], sizeof(uint32_t) * (vec_size - i)); /* Remove element i */
610 fixedup = 1;
611 continue;
612 }
613
614 decompose_table *decomp = nfc_decompose(starter);
615 if (decomp) {
616 if (decomp->canon_cls2 > mark_canon_cls && nfc_combine(decomp->to1, combine_mark, &starter)) {
617 /* The preceding starter already included an accent, but when represented as a combining
618 * mark, that accent has a HIGHER canonicalization class than this one
619 * Further, this one is able to combine with the same base character
620 * In other words, the base character was wrongly combined with a "lower-priority"
621 * combining mark; fix that up */
622 unsigned int class2 = lookup_canon_cls(decomp->to2);
623 memmove(&vector[1], &vector[0], sizeof(uint32_t) * i);
624 vector[0] = (decomp->to2 << 8) | class2;
625 stable_insert_combining_mark(vector, vec_size, 0);
626 fixedup = 1;
627 continue;
628 } else {
629 decompose_table *decomp2 = nfc_decompose(decomp->to1);
630 if (decomp2 && decomp2->canon_cls2 > mark_canon_cls && nfc_combine(decomp2->to1, combine_mark, &starter)) {
631 grow_vector_if_needed(&vector, onstack, &vec_max, vec_size + 1);
632 memmove(&vector[i+2], &vector[i+1], sizeof(uint32_t) * (vec_size - i - 1));
633 memmove(&vector[2], &vector[0], sizeof(uint32_t) * i);
634 vector[0] = (decomp2->to2 << 8) | lookup_canon_cls(decomp2->to2);
635 vector[1] = (decomp->to2 << 8) | lookup_canon_cls(decomp->to2);
636 vec_size++;
637 stable_insert_combining_mark(vector, vec_size, 1);
638 stable_insert_combining_mark(vector, vec_size, 0);
639 fixedup = 1;
640 continue;
641 }
642 }
643 }
644 }
645 }
646 }
647 i++;
648 }
649
650 if (fixedup) {
651 /* The preceding starter/combining mark sequence was bad; convert fixed-up codepoints
652 * to UTF-8 bytes */
653 if (starter != -1)
654 add_utf8char(buff, starter);
655 for (unsigned int i = 0; i < vec_size; i++)
656 add_utf8char(buff, vector[i] >> 8);
657 } else {
658 /* The preceding starter/combining mark sequence was good; copy raw bytes to output */
659 luaL_addlstring(buff, to_copy, s - to_copy);
660 }
661 if (s >= e) {
662 /* We jumped in to the middle of the main loop to finish processing trailing
663 * combining marks... we are actually done now */
664 if (vector != onstack)
665 free(vector);
666 return;
667 }
668 vec_size = 0; /* Clear vector of combining marks in readiness for next such sequence */
669 fixedup = 0;
670 } else if (starter != -1) {
671 /* This starter was preceded immediately by another starter
672 * Check if this one should combine with it */
673 fixedup = 0;
674 if (entry) {
675 if (entry->reason == REASON_STARTER_CAN_COMBINE && nfc_combine(starter, ch, &ch)) {
676 fixedup = 1;
677 } else if (entry->reason == REASON_JAMO_VOWEL && starter >= 0x1100 && starter <= 0x1112) {
678 ch = 0xAC00 + ((starter - 0x1100) * 588) + ((ch - 0x1161) * 28);
679 fixedup = 1;
680 } else if (entry->reason == REASON_JAMO_TRAILING) {
681 if (starter >= 0xAC00 && starter <= 0xD7A3 && (starter - 0xAC00) % 28 == 0) {
682 ch = starter + ch - 0x11A7;
683 fixedup = 1;
684 }
685 }
686 }
687 if (!fixedup)
688 add_utf8char(buff, starter); /* Emit previous starter to output */
689 }
690 starter = ch;
691 to_copy = s;
692
693 /* We are finished processing the preceding starter and optional sequence of combining marks
694 * Now check if this (possibly deprecated) starter needs to be converted to a canonical variant */
695 if (entry) {
696 if (entry->reason == REASON_MUST_CONVERT_1) {
697 starter = entry->data1;
698 fixedup = 1;
699 } else if (entry->reason == REASON_MUST_CONVERT_2) {
700 utfint conv1 = entry->data1;
701 utfint conv2 = entry->data2;
702 /* It's possible that 'ch' might convert to two other codepoints,
703 * where the 2nd one is a combining mark */
704 unsigned int canon_cls2 = lookup_canon_cls(conv2);
705 if (canon_cls2) {
706 /* It's possible that the 1st resulting codepoint may need to be
707 * split again into more codepoints */
708 nfc_table *conv_entry = nfc_quickcheck(conv1);
709 if (conv_entry && conv_entry->reason == REASON_MUST_CONVERT_2) {
710 utfint conv3 = conv2;
711 unsigned int canon_cls3 = canon_cls2;
712 conv1 = conv_entry->data1;
713 conv2 = conv_entry->data2;
714 canon_cls2 = lookup_canon_cls(conv2);
715 if (canon_cls2) {
716 starter = conv1;
717 vector[0] = (conv2 << 8) | canon_cls2;
718 vector[1] = (conv3 << 8) | canon_cls3;
719 vec_size = 2;
720 } else {
721 add_utf8char(buff, conv1);
722 starter = conv2;
723 vector[0] = (conv3 << 8) | canon_cls3;
724 vec_size = 1;
725 }
726 canon_cls = canon_cls3;
727 } else {
728 starter = conv1;
729 vector[0] = (conv2 << 8) | canon_cls2;
730 vec_size = 1;
731 canon_cls = canon_cls2;
732 }
733 } else {
734 add_utf8char(buff, conv1);
735 starter = conv2;
736 }
737 fixedup = 1;
738 }
739 }
740 } else {
741 /* Accumulate combining marks in vector */
742 grow_vector_if_needed(&vector, onstack, &vec_max, vec_size + 1);
743 vector[vec_size++] = (ch << 8) | (canon_cls & 0xFF);
744 }
745
746 s = new_s;
747 prev_canon_cls = canon_cls;
748 }
749
750 if (vec_size)
751 goto process_combining_marks; /* Finish processing trailing combining marks */
752 if (starter != -1)
753 add_utf8char(buff, starter);
754
755 if (vector != onstack)
756 free(vector);
757}
758
759/* Grapheme cluster support */
760
761static int hangul_type (utfint ch) {
762 /* The first Hangul codepoint is U+1100 */
763 if (ch < 0x1100) {
764 return 0;
765 }
766 size_t begin = 0, end = table_size(hangul_table);
767
768 while (begin < end) {
769 size_t mid = (begin + end) / 2;
770 if (hangul_table[mid].last < ch)
771 begin = mid + 1;
772 else if (hangul_table[mid].first > ch)
773 end = mid;
774 else
775 return hangul_table[mid].type;
776 }
777
778 return 0;
779}
780
781static int indic_conjunct_type (utfint ch) {
782 /* The first Indic conjunct codepoint is U+0300 */
783 if (ch < 0x300) {
784 return 0;
785 }
786 size_t begin = 0, end = table_size(indic_table);
787
788 while (begin < end) {
789 size_t mid = (begin + end) / 2;
790 if (indic_table[mid].last < ch)
791 begin = mid + 1;
792 else if (indic_table[mid].first > ch)
793 end = mid;
794 else
795 return indic_table[mid].type;
796 }
797
798 return 0;
799}
800
801#define define_category(cls, name) static int utf8_is##name (utfint ch)\
802{ return find_in_range(name##_table, table_size(name##_table), ch); }
803#define define_converter(name) static utfint utf8_to##name (utfint ch) \
804{ return convert_char(to##name##_table, table_size(to##name##_table), ch); }
805utf8_categories(define_category)
806utf8_converters(define_converter)
807#undef define_category
808#undef define_converter
809
810static int utf8_isgraph (utfint ch) {
811 if (find_in_range(space_table, table_size(space_table), ch))
812 return 0;
813 if (find_in_range(graph_table, table_size(graph_table), ch))
814 return 1;
815 if (find_in_range(compose_table, table_size(compose_table), ch))
816 return 1;
817 return 0;
818}
819
820static int utf8_isalnum (utfint ch) {
821 if (find_in_range(alpha_table, table_size(alpha_table), ch))
822 return 1;
823 if (find_in_range(alnum_extend_table, table_size(alnum_extend_table), ch))
824 return 1;
825 return 0;
826}
827
828static int utf8_width (utfint ch, int ambi_is_single) {
829 if (find_in_range(doublewidth_table, table_size(doublewidth_table), ch))
830 return 2;
831 if (find_in_range(ambiwidth_table, table_size(ambiwidth_table), ch))
832 return ambi_is_single ? 1 : 2;
833 if (find_in_range(compose_table, table_size(compose_table), ch))
834 return 0;
835 if (find_in_range(unprintable_table, table_size(unprintable_table), ch))
836 return 0;
837 return 1;
838}
839
840/* string module compatible interface */
841
842static int typeerror (lua_State *L, int idx, const char *tname)
843{ return luaL_error(L, "%s expected, got %s", tname, luaL_typename(L, idx)); }
844
845static const char *check_utf8 (lua_State *L, int idx, const char **end) {
846 size_t len;
847 const char *s = luaL_checklstring(L, idx, &len);
848 if (end) *end = s+len;
849 return s;
850}
851
852static const char *to_utf8 (lua_State *L, int idx, const char **end) {
853 size_t len;
854 const char *s = lua_tolstring(L, idx, &len);
855 if (end) *end = s+len;
856 return s;
857}
858
859static const char *utf8_safe_decode (lua_State *L, const char *p, utfint *pval) {
860 p = utf8_decode(p, pval, 0);
861 if (p == NULL) luaL_error(L, "invalid UTF-8 code");
862 return p;
863}
864
865static void add_utf8char (luaL_Buffer *b, utfint ch) {
866 char buff[UTF8_BUFFSZ];
867 size_t n = utf8_encode(buff, ch);
868 luaL_addlstring(b, buff+UTF8_BUFFSZ-n, n);
869}
870
871static lua_Integer byte_relat (lua_Integer pos, size_t len) {
872 if (pos >= 0) return pos;
873 else if (0u - (size_t)pos > len) return 0;
874 else return (lua_Integer)len + pos + 1;
875}
876
877static int Lutf8_len (lua_State *L) {
878 size_t len, n;
879 const char *s = luaL_checklstring(L, 1, &len), *p, *e;
880 lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len);
881 lua_Integer pose = byte_relat(luaL_optinteger(L, 3, -1), len);
882 int lax = lua_toboolean(L, 4);
883 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
884 "initial position out of string");
885 luaL_argcheck(L, --pose < (lua_Integer)len, 3,
886 "final position out of string");
887 for (n = 0, p=s+posi, e=s+pose+1; p < e; ++n) {
888 if (lax)
889 p = utf8_next(p, e);
890 else {
891 utfint ch;
892 const char *np = utf8_decode(p, &ch, !lax);
893 if (np == NULL || utf8_invalid(ch)) {
894 lua_pushnil(L);
895 lua_pushinteger(L, p - s + 1);
896 return 2;
897 }
898 p = np;
899 }
900 }
901 lua_pushinteger(L, n);
902 return 1;
903}
904
905static int Lutf8_sub (lua_State *L) {
906 const char *e, *s = check_utf8(L, 1, &e);
907 lua_Integer posi = luaL_checkinteger(L, 2);
908 lua_Integer pose = luaL_optinteger(L, 3, -1);
909 if (utf8_range(s, e, &posi, &pose))
910 lua_pushlstring(L, s+posi, pose-posi);
911 else
912 lua_pushliteral(L, "");
913 return 1;
914}
915
916static int Lutf8_reverse (lua_State *L) {
917 luaL_Buffer b;
918 const char *prev, *pprev, *ends, *e, *s = check_utf8(L, 1, &e);
919 (void) ends;
920 int lax = lua_toboolean(L, 2);
921 luaL_buffinit(L, &b);
922 if (lax) {
923 for (prev = e; s < prev; e = prev) {
924 prev = utf8_prev(s, prev);
925 luaL_addlstring(&b, prev, e-prev);
926 }
927 } else {
928 for (prev = e; s < prev; prev = pprev) {
929 utfint code = 0;
930 ends = utf8_safe_decode(L, pprev = utf8_prev(s, prev), &code);
931 assert(ends == prev);
932 if (utf8_invalid(code))
933 return luaL_error(L, "invalid UTF-8 code");
934 if (!utf8_iscompose(code)) {
935 luaL_addlstring(&b, pprev, e-pprev);
936 e = pprev;
937 }
938 }
939 }
940 luaL_pushresult(&b);
941 return 1;
942}
943
944static int Lutf8_byte (lua_State *L) {
945 size_t n = 0;
946 const char *e, *s = check_utf8(L, 1, &e);
947 lua_Integer posi = luaL_optinteger(L, 2, 1);
948 lua_Integer pose = luaL_optinteger(L, 3, posi);
949 if (utf8_range(s, e, &posi, &pose)) {
950 for (e = s + pose, s = s + posi; s < e; ++n) {
951 utfint ch = 0;
952 s = utf8_safe_decode(L, s, &ch);
953 lua_pushinteger(L, ch);
954 }
955 }
956 return CAST(int, n);
957}
958
959static int Lutf8_codepoint (lua_State *L) {
960 const char *e, *s = check_utf8(L, 1, &e);
961 size_t len = e-s;
962 lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len);
963 lua_Integer pose = byte_relat(luaL_optinteger(L, 3, posi), len);
964 int lax = lua_toboolean(L, 4);
965 int n;
966 const char *se;
967 luaL_argcheck(L, posi >= 1, 2, "out of range");
968 luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range");
969 if (posi > pose) return 0; /* empty interval; return no values */
970 if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */
971 return luaL_error(L, "string slice too long");
972 n = (int)(pose - posi + 1);
973 luaL_checkstack(L, n, "string slice too long");
974 n = 0; /* count the number of returns */
975 se = s + pose; /* string end */
976 for (n = 0, s += posi - 1; s < se;) {
977 utfint code = 0;
978 s = utf8_safe_decode(L, s, &code);
979 if (!lax && utf8_invalid(code))
980 return luaL_error(L, "invalid UTF-8 code");
981 lua_pushinteger(L, code);
982 n++;
983 }
984 return n;
985}
986
987static int Lutf8_char (lua_State *L) {
988 int i, n = lua_gettop(L); /* number of arguments */
989 luaL_Buffer b;
990 luaL_buffinit(L, &b);
991 for (i = 1; i <= n; ++i) {
992 lua_Integer code = luaL_checkinteger(L, i);
993 luaL_argcheck(L, code <= UTF8_MAXCP, i, "value out of range");
994 add_utf8char(&b, CAST(utfint, code));
995 }
996 luaL_pushresult(&b);
997 return 1;
998}
999
1000#define bind_converter(name) \
1001static int Lutf8_##name (lua_State *L) { \
1002 int t = lua_type(L, 1); \
1003 if (t == LUA_TNUMBER) \
1004 lua_pushinteger(L, utf8_to##name(CAST(utfint, lua_tointeger(L, 1)))); \
1005 else if (t == LUA_TSTRING) { \
1006 luaL_Buffer b; \
1007 const char *e, *s = to_utf8(L, 1, &e); \
1008 luaL_buffinit(L, &b); \
1009 while (s < e) { \
1010 utfint ch = 0; \
1011 s = utf8_safe_decode(L, s, &ch); \
1012 add_utf8char(&b, utf8_to##name(ch)); \
1013 } \
1014 luaL_pushresult(&b); \
1015 } \
1016 else return typeerror(L, 1, "number/string"); \
1017 return 1; \
1018}
1019utf8_converters(bind_converter)
1020#undef bind_converter
1021
1022
1023/* unicode extra interface */
1024
1025static const char *parse_escape (lua_State *L, const char *s, const char *e, int hex, utfint *pch) {
1026 utfint code = 0;
1027 int in_bracket = 0;
1028 if (*s == '{') ++s, in_bracket = 1;
1029 for (; s < e; ++s) {
1030 utfint ch = (unsigned char)*s;
1031 if (ch >= '0' && ch <= '9') ch = ch - '0';
1032 else if (hex && ch >= 'A' && ch <= 'F') ch = 10 + (ch - 'A');
1033 else if (hex && ch >= 'a' && ch <= 'f') ch = 10 + (ch - 'a');
1034 else if (!in_bracket) break;
1035 else if (ch == '}') { ++s; break; }
1036 else luaL_error(L, "invalid escape '%c'", ch);
1037 code *= hex ? 16 : 10;
1038 code += ch;
1039 }
1040 *pch = code;
1041 return s;
1042}
1043
1044static int Lutf8_escape (lua_State *L) {
1045 const char *e, *s = check_utf8(L, 1, &e);
1046 luaL_Buffer b;
1047 luaL_buffinit(L, &b);
1048 while (s < e) {
1049 utfint ch = 0;
1050 s = utf8_safe_decode(L, s, &ch);
1051 if (ch == '%') {
1052 int hex = 0;
1053 switch (*s) {
1054 case '0': case '1': case '2': case '3':
1055 case '4': case '5': case '6': case '7':
1056 case '8': case '9': case '{':
1057 break;
1058 case 'x': case 'X': hex = 1; /* fall through */
1059 case 'u': case 'U': if (s+1 < e) { ++s; break; }
1060 /* fall through */
1061 default:
1062 s = utf8_safe_decode(L, s, &ch);
1063 goto next;
1064 }
1065 s = parse_escape(L, s, e, hex, &ch);
1066 }
1067next:
1068 add_utf8char(&b, ch);
1069 }
1070 luaL_pushresult(&b);
1071 return 1;
1072}
1073
1074static int Lutf8_insert (lua_State *L) {
1075 const char *e, *s = check_utf8(L, 1, &e);
1076 size_t sublen;
1077 const char *subs;
1078 luaL_Buffer b;
1079 int nargs = 2;
1080 const char *first = e;
1081 if (lua_type(L, 2) == LUA_TNUMBER) {
1082 int idx = (int)lua_tointeger(L, 2);
1083 if (idx != 0) first = utf8_relat(s, e, idx);
1084 luaL_argcheck(L, first, 2, "invalid index");
1085 ++nargs;
1086 }
1087 subs = luaL_checklstring(L, nargs, &sublen);
1088 luaL_buffinit(L, &b);
1089 luaL_addlstring(&b, s, first-s);
1090 luaL_addlstring(&b, subs, sublen);
1091 luaL_addlstring(&b, first, e-first);
1092 luaL_pushresult(&b);
1093 return 1;
1094}
1095
1096static int Lutf8_remove (lua_State *L) {
1097 const char *e, *s = check_utf8(L, 1, &e);
1098 lua_Integer posi = luaL_optinteger(L, 2, -1);
1099 lua_Integer pose = luaL_optinteger(L, 3, -1);
1100 if (!utf8_range(s, e, &posi, &pose))
1101 lua_settop(L, 1);
1102 else {
1103 luaL_Buffer b;
1104 luaL_buffinit(L, &b);
1105 luaL_addlstring(&b, s, posi);
1106 luaL_addlstring(&b, s+pose, e-s-pose);
1107 luaL_pushresult(&b);
1108 }
1109 return 1;
1110}
1111
1112static int push_offset (lua_State *L, const char *s, const char *e, lua_Integer offset, lua_Integer idx) {
1113 utfint ch = 0;
1114 const char *p;
1115 if (idx != 0)
1116 p = utf8_offset(s, e, offset, idx);
1117 else if (p = s+offset-1, iscont(p))
1118 p = utf8_prev(s, p);
1119 if (p == NULL || p == e) return 0;
1120 utf8_decode(p, &ch, 0);
1121 lua_pushinteger(L, p-s+1);
1122 lua_pushinteger(L, ch);
1123 return 2;
1124}
1125
1126static int Lutf8_charpos (lua_State *L) {
1127 const char *e, *s = check_utf8(L, 1, &e);
1128 lua_Integer offset = 1;
1129 if (lua_isnoneornil(L, 3)) {
1130 lua_Integer idx = luaL_optinteger(L, 2, 0);
1131 if (idx > 0) --idx;
1132 else if (idx < 0) offset = e-s+1;
1133 return push_offset(L, s, e, offset, idx);
1134 }
1135 offset = byte_relat(luaL_optinteger(L, 2, 1), e-s);
1136 if (offset < 1) offset = 1;
1137 return push_offset(L, s, e, offset, luaL_checkinteger(L, 3));
1138}
1139
1140static int Lutf8_offset (lua_State *L) {
1141 size_t len;
1142 const char *s = luaL_checklstring(L, 1, &len);
1143 lua_Integer n = luaL_checkinteger(L, 2);
1144 lua_Integer posi = (n >= 0) ? 1 : len + 1;
1145 posi = byte_relat(luaL_optinteger(L, 3, posi), len);
1146 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
1147 "position out of range");
1148 if (n == 0) {
1149 /* find beginning of current byte sequence */
1150 while (posi > 0 && iscont(s + posi)) posi--;
1151 } else {
1152 if (iscont(s + posi))
1153 return luaL_error(L, "initial position is a continuation byte");
1154 if (n < 0) {
1155 while (n < 0 && posi > 0) { /* move back */
1156 do { /* find beginning of previous character */
1157 posi--;
1158 } while (posi > 0 && iscont(s + posi));
1159 n++;
1160 }
1161 } else {
1162 n--; /* do not move for 1st character */
1163 while (n > 0 && posi < (lua_Integer)len) {
1164 do { /* find beginning of next character */
1165 posi++;
1166 } while (iscont(s + posi)); /* (cannot pass final '\0') */
1167 n--;
1168 }
1169 }
1170 }
1171 if (n == 0) /* did it find given character? */
1172 lua_pushinteger(L, posi + 1);
1173 else /* no such character */
1174 lua_pushnil(L);
1175 return 1;
1176}
1177
1178static int Lutf8_next (lua_State *L) {
1179 const char *e, *s = check_utf8(L, 1, &e);
1180 lua_Integer offset = byte_relat(luaL_optinteger(L, 2, 1), e-s);
1181 lua_Integer idx = luaL_optinteger(L, 3, !lua_isnoneornil(L, 2));
1182 return push_offset(L, s, e, offset, idx);
1183}
1184
1185static int iter_aux (lua_State *L, int strict) {
1186 const char *e, *s = check_utf8(L, 1, &e);
1187 int n = CAST(int, lua_tointeger(L, 2));
1188 const char *p = n <= 0 ? s : utf8_next(s+n-1, e);
1189 if (p < e) {
1190 utfint code = 0;
1191 utf8_safe_decode(L, p, &code);
1192 if (strict && utf8_invalid(code))
1193 return luaL_error(L, "invalid UTF-8 code");
1194 lua_pushinteger(L, p-s+1);
1195 lua_pushinteger(L, code);
1196 return 2;
1197 }
1198 return 0; /* no more codepoints */
1199}
1200
1201static int iter_auxstrict (lua_State *L) { return iter_aux(L, 1); }
1202static int iter_auxlax (lua_State *L) { return iter_aux(L, 0); }
1203
1204static int Lutf8_codes (lua_State *L) {
1205 int lax = lua_toboolean(L, 2);
1206 luaL_checkstring(L, 1);
1207 lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict);
1208 lua_pushvalue(L, 1);
1209 lua_pushinteger(L, 0);
1210 return 3;
1211}
1212
1213static int Lutf8_width (lua_State *L) {
1214 int t = lua_type(L, 1);
1215 int ambi_is_single = !lua_toboolean(L, 2);
1216 int default_width = CAST(int, luaL_optinteger(L, 3, 0));
1217 if (t == LUA_TNUMBER) {
1218 size_t chwidth = utf8_width(CAST(utfint, lua_tointeger(L, 1)), ambi_is_single);
1219 if (chwidth == 0) chwidth = default_width;
1220 lua_pushinteger(L, (lua_Integer)chwidth);
1221 } else if (t != LUA_TSTRING)
1222 return typeerror(L, 1, "number/string");
1223 else {
1224 const char *e, *s = to_utf8(L, 1, &e);
1225 int width = 0;
1226 while (s < e) {
1227 utfint ch = 0;
1228 int chwidth;
1229 s = utf8_safe_decode(L, s, &ch);
1230 chwidth = utf8_width(ch, ambi_is_single);
1231 width += chwidth == 0 ? default_width : chwidth;
1232 }
1233 lua_pushinteger(L, (lua_Integer)width);
1234 }
1235 return 1;
1236}
1237
1238static int Lutf8_widthindex (lua_State *L) {
1239 const char *e, *s = check_utf8(L, 1, &e);
1240 int width = CAST(int, luaL_checkinteger(L, 2));
1241 int ambi_is_single = !lua_toboolean(L, 3);
1242 int default_width = CAST(int, luaL_optinteger(L, 4, 0));
1243 size_t idx = 1;
1244 while (s < e) {
1245 utfint ch = 0;
1246 size_t chwidth;
1247 s = utf8_safe_decode(L, s, &ch);
1248 chwidth = utf8_width(ch, ambi_is_single);
1249 if (chwidth == 0) chwidth = default_width;
1250 width -= CAST(int, chwidth);
1251 if (width <= 0) {
1252 lua_pushinteger(L, idx);
1253 lua_pushinteger(L, width + chwidth);
1254 lua_pushinteger(L, chwidth);
1255 return 3;
1256 }
1257 ++idx;
1258 }
1259 lua_pushinteger(L, (lua_Integer)idx);
1260 return 1;
1261}
1262
1263static int Lutf8_ncasecmp (lua_State *L) {
1264 const char *e1, *s1 = check_utf8(L, 1, &e1);
1265 const char *e2, *s2 = check_utf8(L, 2, &e2);
1266 while (s1 < e1 || s2 < e2) {
1267 utfint ch1 = 0, ch2 = 0;
1268 if (s1 == e1)
1269 ch2 = 1;
1270 else if (s2 == e2)
1271 ch1 = 1;
1272 else {
1273 s1 = utf8_safe_decode(L, s1, &ch1);
1274 s2 = utf8_safe_decode(L, s2, &ch2);
1275 ch1 = utf8_tofold(ch1);
1276 ch2 = utf8_tofold(ch2);
1277 }
1278 if (ch1 != ch2) {
1279 lua_pushinteger(L, ch1 > ch2 ? 1 : -1);
1280 return 1;
1281 }
1282 }
1283 lua_pushinteger(L, 0);
1284 return 1;
1285}
1286
1287
/* utf8 pattern matching implementation */
1289
/* Size of the capture table in MatchState; start_capture() raises
 * "too many captures" once this many captures are open or closed.
 * Only defined here when the including environment has not set it. */
#ifndef LUA_MAXCAPTURES
# define LUA_MAXCAPTURES  32
#endif /* LUA_MAXCAPTURES */

/* Sentinel values stored in a capture's `len` field instead of a length:
 * CAP_UNFINISHED marks a capture that has been opened but not yet closed,
 * CAP_POSITION marks a position capture "()". */
#define CAP_UNFINISHED (-1)
#define CAP_POSITION   (-2)
1297
/* State shared by all the pattern-matching helpers for one subject/pattern
 * pair.  Callers fill in every field except `capture` before calling
 * match(), and reset `level` to 0 before each attempt. */
typedef struct MatchState {
  int matchdepth;  /* control for recursive depth (to avoid C stack overflow) */
  const char *src_init;  /* init of source string */
  const char *src_end;  /* end ('\0') of source string */
  const char *p_end;  /* end ('\0') of pattern */
  lua_State *L;  /* Lua state used for decoding errors and pushing results */
  int level;  /* total number of captures (finished or unfinished) */
  struct {
    const char *init;  /* start of the captured text in the source */
    ptrdiff_t len;  /* byte length, or CAP_UNFINISHED / CAP_POSITION */
  } capture[LUA_MAXCAPTURES];
} MatchState;
1310
/* recursive function (declared here because the helpers below call it) */
static const char *match (MatchState *ms, const char *s, const char *p);

/* maximum recursion depth for 'match' */
#if !defined(MAXCCALLS)
#define MAXCCALLS 200
#endif

/* escape character used in patterns and replacement strings */
#define L_ESC '%'
/* characters whose presence means a pattern cannot be searched for as
 * plain text (see nospecials()) */
#define SPECIALS "^$*+?.([%-"
1321
1322static int check_capture (MatchState *ms, int l) {
1323 l -= '1';
1324 if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
1325 return luaL_error(ms->L, "invalid capture index %%%d", l + 1);
1326 return l;
1327}
1328
1329static int capture_to_close (MatchState *ms) {
1330 int level = ms->level;
1331 while (--level >= 0)
1332 if (ms->capture[level].len == CAP_UNFINISHED) return level;
1333 return luaL_error(ms->L, "invalid pattern capture");
1334}
1335
1336static const char *classend (MatchState *ms, const char *p) {
1337 utfint ch = 0;
1338 p = utf8_safe_decode(ms->L, p, &ch);
1339 switch (ch) {
1340 case L_ESC: {
1341 if (p == ms->p_end)
1342 luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")");
1343 return utf8_next(p, ms->p_end);
1344 }
1345 case '[': {
1346 if (*p == '^') p++;
1347 do { /* look for a `]' */
1348 if (p == ms->p_end)
1349 luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")");
1350 if (*(p++) == L_ESC && p < ms->p_end)
1351 p++; /* skip escapes (e.g. `%]') */
1352 } while (*p != ']');
1353 return p+1;
1354 }
1355 default: {
1356 return p;
1357 }
1358 }
1359}
1360
1361static int match_class (utfint c, utfint cl) {
1362 int res;
1363 switch (utf8_tolower(cl)) {
1364#define X(cls, name) case cls: res = utf8_is##name(c); break;
1365 utf8_categories(X)
1366#undef X
1367 case 'g' : res = utf8_isgraph(c); break;
1368 case 'w' : res = utf8_isalnum(c); break;
1369 case 'z' : res = (c == 0); break; /* deprecated option */
1370 default: return (cl == c);
1371 }
1372 return (utf8_islower(cl) ? res : !res);
1373}
1374
1375static int matchbracketclass (MatchState *ms, utfint c, const char *p, const char *ec) {
1376 int sig = 1;
1377 assert(*p == '[');
1378 if (*++p == '^') {
1379 sig = 0;
1380 p++; /* skip the `^' */
1381 }
1382 while (p < ec) {
1383 utfint ch = 0;
1384 p = utf8_safe_decode(ms->L, p, &ch);
1385 if (ch == L_ESC) {
1386 p = utf8_safe_decode(ms->L, p, &ch);
1387 if (match_class(c, ch))
1388 return sig;
1389 } else {
1390 utfint next = 0;
1391 const char *np = utf8_safe_decode(ms->L, p, &next);
1392 if (next == '-' && np < ec) {
1393 p = utf8_safe_decode(ms->L, np, &next);
1394 if (ch <= c && c <= next)
1395 return sig;
1396 }
1397 else if (ch == c) return sig;
1398 }
1399 }
1400 return !sig;
1401}
1402
1403static int singlematch (MatchState *ms, const char *s, const char *p, const char *ep) {
1404 if (s >= ms->src_end)
1405 return 0;
1406 else {
1407 utfint ch=0, pch=0;
1408 utf8_safe_decode(ms->L, s, &ch);
1409 p = utf8_safe_decode(ms->L, p, &pch);
1410 switch (pch) {
1411 case '.': return 1; /* matches any char */
1412 case L_ESC: utf8_safe_decode(ms->L, p, &pch);
1413 return match_class(ch, pch);
1414 case '[': return matchbracketclass(ms, ch, p-1, ep-1);
1415 default: return pch == ch;
1416 }
1417 }
1418}
1419
1420static const char *matchbalance (MatchState *ms, const char *s, const char **p) {
1421 utfint ch=0, begin=0, end=0;
1422 *p = utf8_safe_decode(ms->L, *p, &begin);
1423 if (*p >= ms->p_end)
1424 luaL_error(ms->L, "malformed pattern "
1425 "(missing arguments to " LUA_QL("%%b") ")");
1426 *p = utf8_safe_decode(ms->L, *p, &end);
1427 s = utf8_safe_decode(ms->L, s, &ch);
1428 if (ch != begin) return NULL;
1429 else {
1430 int cont = 1;
1431 while (s < ms->src_end) {
1432 s = utf8_safe_decode(ms->L, s, &ch);
1433 if (ch == end) {
1434 if (--cont == 0) return s;
1435 }
1436 else if (ch == begin) cont++;
1437 }
1438 }
1439 return NULL; /* string ends out of balance */
1440}
1441
1442static const char *max_expand (MatchState *ms, const char *s, const char *p, const char *ep) {
1443 const char *m = s; /* matched end of single match p */
1444 while (singlematch(ms, m, p, ep))
1445 m = utf8_next(m, ms->src_end);
1446 /* keeps trying to match with the maximum repetitions */
1447 while (s <= m) {
1448 const char *res = match(ms, m, ep+1);
1449 if (res) return res;
1450 /* else didn't match; reduce 1 repetition to try again */
1451 if (s == m) break;
1452 m = utf8_prev(s, m);
1453 }
1454 return NULL;
1455}
1456
1457static const char *min_expand (MatchState *ms, const char *s, const char *p, const char *ep) {
1458 for (;;) {
1459 const char *res = match(ms, s, ep+1);
1460 if (res != NULL)
1461 return res;
1462 else if (singlematch(ms, s, p, ep))
1463 s = utf8_next(s, ms->src_end); /* try with one more repetition */
1464 else return NULL;
1465 }
1466}
1467
1468static const char *start_capture (MatchState *ms, const char *s, const char *p, int what) {
1469 const char *res;
1470 int level = ms->level;
1471 if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures");
1472 ms->capture[level].init = s;
1473 ms->capture[level].len = what;
1474 ms->level = level+1;
1475 if ((res=match(ms, s, p)) == NULL) /* match failed? */
1476 ms->level--; /* undo capture */
1477 return res;
1478}
1479
1480static const char *end_capture (MatchState *ms, const char *s, const char *p) {
1481 int l = capture_to_close(ms);
1482 const char *res;
1483 ms->capture[l].len = s - ms->capture[l].init; /* close capture */
1484 if ((res = match(ms, s, p)) == NULL) /* match failed? */
1485 ms->capture[l].len = CAP_UNFINISHED; /* undo capture */
1486 return res;
1487}
1488
1489static const char *match_capture (MatchState *ms, const char *s, int l) {
1490 size_t len;
1491 l = check_capture(ms, l);
1492 len = ms->capture[l].len;
1493 if ((size_t)(ms->src_end-s) >= len &&
1494 memcmp(ms->capture[l].init, s, len) == 0)
1495 return s+len;
1496 else return NULL;
1497}
1498
/*
 * Core of the pattern matcher: tries to match the pattern starting at `p`
 * against the source starting at `s`.
 *
 * Returns the position in the source where the match ends, or NULL when the
 * pattern does not match at `s`.  An exhausted pattern matches trivially
 * and returns `s` unchanged.
 *
 * ms->matchdepth is decremented on entry and restored before the normal
 * return; hitting zero raises "pattern too complex".  Continuations that
 * would be tail calls are implemented with `goto init` so they do not
 * consume recursion depth.
 */
static const char *match (MatchState *ms, const char *s, const char *p) {
  if (ms->matchdepth-- == 0)
    luaL_error(ms->L, "pattern too complex");
  init: /* using goto's to optimize tail recursion */
  if (p != ms->p_end) {  /* end of pattern? */
    utfint ch = 0;
    /* peek at the next pattern code point; `p` itself is not advanced */
    utf8_safe_decode(ms->L, p, &ch);
    switch (ch) {
      case '(': {  /* start capture */
        if (*(p + 1) == ')')  /* position capture? */
          s = start_capture(ms, s, p + 2, CAP_POSITION);
        else
          s = start_capture(ms, s, p + 1, CAP_UNFINISHED);
        break;
      }
      case ')': {  /* end capture */
        s = end_capture(ms, s, p + 1);
        break;
      }
      case '$': {
        if ((p + 1) != ms->p_end)  /* is the `$' the last char in pattern? */
          goto dflt;  /* no; go to default */
        s = (s == ms->src_end) ? s : NULL;  /* check end of string */
        break;
      }
      case L_ESC: {  /* escaped sequence not in the format class[*+?-]? */
        /* remember where the '%' was so the default case can re-read it */
        const char *prev_p = p;
        /* `ch` becomes the character after '%', `p` points past it */
        p = utf8_safe_decode(ms->L, p+1, &ch);
        switch (ch) {
          case 'b': {  /* balanced string? */
            /* matchbalance() also advances `p` past both delimiters */
            s = matchbalance(ms, s, &p);
            if (s != NULL)
              goto init;  /* return match(ms, s, p + 4); */
            /* else fail (s == NULL) */
            break;
          }
          case 'f': {  /* frontier? */
            const char *ep; utfint previous = 0, current = 0;
            if (*p != '[')
              luaL_error(ms->L, "missing " LUA_QL("[") " after "
                                 LUA_QL("%%f") " in pattern");
            ep = classend(ms, p);  /* points to what is next */
            /* `previous`/`current` stay 0 at the start/end of the source */
            if (s != ms->src_init)
              utf8_decode(utf8_prev(ms->src_init, s), &previous, 0);
            if (s != ms->src_end)
              utf8_decode(s, &current, 0);
            /* frontier: previous char outside the set, current char inside */
            if (!matchbracketclass(ms, previous, p, ep - 1) &&
                 matchbracketclass(ms, current, p, ep - 1)) {
              p = ep; goto init;  /* return match(ms, s, ep); */
            }
            s = NULL;  /* match failed */
            break;
          }
          case '0': case '1': case '2': case '3':
          case '4': case '5': case '6': case '7':
          case '8': case '9': {  /* capture results (%0-%9)? */
            s = match_capture(ms, s, ch);
            if (s != NULL) goto init;  /* return match(ms, s, p + 2) */
            break;
          }
          /* ordinary class escape: rewind to the '%' and treat it as an item */
          default: p = prev_p; goto dflt;
        }
        break;
      }
      default: dflt: {  /* pattern class plus optional suffix */
        const char *ep = classend(ms, p);  /* points to optional suffix */
        /* does not match at least once? */
        if (!singlematch(ms, s, p, ep)) {
          if (*ep == '*' || *ep == '?' || *ep == '-') {  /* accept empty? */
            p = ep + 1; goto init;  /* return match(ms, s, ep + 1); */
          } else  /* '+' or no suffix */
            s = NULL;  /* fail */
        } else {  /* matched once */
          /* source position after the one character just matched */
          const char *next_s = utf8_next(s, ms->src_end);
          switch (*ep) {  /* handle optional suffix */
            case '?': {  /* optional */
              const char *res;
              const char *next_ep = utf8_next(ep, ms->p_end);
              /* first try with the item consumed, then without it */
              if ((res = match(ms, next_s, next_ep)) != NULL)
                s = res;
              else {
                p = next_ep; goto init;  /* else return match(ms, s, ep + 1); */
              }
              break;
            }
            case '+':  /* 1 or more repetitions */
              s = next_s;  /* 1 match already done */
              /* fall through */
            case '*':  /* 0 or more repetitions */
              s = max_expand(ms, s, p, ep);
              break;
            case '-':  /* 0 or more repetitions (minimum) */
              s = min_expand(ms, s, p, ep);
              break;
            default:  /* no suffix */
              s = next_s; p = ep; goto init;  /* return match(ms, s + 1, ep); */
          }
        }
        break;
      }
    }
  }
  ms->matchdepth++;
  return s;
}
1604
/*
 * Binary-safe substring search: looks for the `l2` bytes at `s2` inside the
 * `l1` bytes at `s1`.  Returns a pointer to the first occurrence, `s1`
 * itself for an empty needle, or NULL when there is no occurrence.
 */
static const char *lmemfind (const char *s1, size_t l1, const char *s2, size_t l2) {
  size_t tail_len;  /* bytes of the needle after its first byte */
  size_t span;      /* candidate start positions still to be examined */
  if (l2 == 0)
    return s1;  /* empty strings are everywhere */
  if (l2 > l1)
    return NULL;  /* needle longer than haystack */
  tail_len = l2 - 1;      /* the first byte is located with memchr */
  span = l1 - tail_len;   /* the needle cannot start after that */
  while (span > 0) {
    const char *hit = (const char *)memchr(s1, *s2, span);
    if (hit == NULL)
      break;
    if (memcmp(hit + 1, s2 + 1, tail_len) == 0)
      return hit;
    /* resume the search right after the rejected candidate */
    span -= (size_t)(hit + 1 - s1);
    s1 = hit + 1;
  }
  return NULL;  /* not found */
}
1624
/*
 * Counts the utf8_next() steps needed to get from `s` to `p` without going
 * past `e`.  When the walk lands exactly on `p` that count is returned;
 * otherwise (it overshot `p` or stopped at `e`) the count minus one.
 */
static int get_index (const char *p, const char *s, const char *e) {
  int steps = 0;
  while (s < e && s < p) {
    s = utf8_next(s, e);
    ++steps;
  }
  if (s == p)
    return steps;
  return steps - 1;
}
1631
1632static void push_onecapture (MatchState *ms, int i, const char *s, const char *e) {
1633 if (i >= ms->level) {
1634 if (i == 0) /* ms->level == 0, too */
1635 lua_pushlstring(ms->L, s, e - s); /* add whole match */
1636 else
1637 luaL_error(ms->L, "invalid capture index");
1638 } else {
1639 ptrdiff_t l = ms->capture[i].len;
1640 if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture");
1641 if (l == CAP_POSITION) {
1642 int idx = get_index(ms->capture[i].init, ms->src_init, ms->src_end);
1643 lua_pushinteger(ms->L, idx+1);
1644 } else
1645 lua_pushlstring(ms->L, ms->capture[i].init, l);
1646 }
1647}
1648
1649static int push_captures (MatchState *ms, const char *s, const char *e) {
1650 int i;
1651 int nlevels = (ms->level == 0 && s) ? 1 : ms->level;
1652 luaL_checkstack(ms->L, nlevels, "too many captures");
1653 for (i = 0; i < nlevels; i++)
1654 push_onecapture(ms, i, s, e);
1655 return nlevels; /* number of strings pushed */
1656}
1657
1658/* check whether pattern has no special characters */
1659static int nospecials (const char *p, const char * ep) {
1660 while (p < ep) {
1661 if (strpbrk(p, SPECIALS))
1662 return 0; /* pattern has a special character */
1663 p += strlen(p) + 1; /* may have more after \0 */
1664 }
1665 return 1; /* no special chars found */
1666}
1667
1668
1669/* utf8 pattern matching interface */
1670
1671static int find_aux (lua_State *L, int find) {
1672 const char *es, *s = check_utf8(L, 1, &es);
1673 const char *ep, *p = check_utf8(L, 2, &ep);
1674 lua_Integer idx = luaL_optinteger(L, 3, 1);
1675 const char *init;
1676 if (!idx) idx = 1;
1677 init = utf8_relat(s, es, CAST(int, idx));
1678 if (init == NULL) {
1679 if (idx > 0) {
1680 lua_pushnil(L); /* cannot find anything */
1681 return 1;
1682 }
1683 init = s;
1684 }
1685 /* explicit request or no special characters? */
1686 if (find && (lua_toboolean(L, 4) || nospecials(p, ep))) {
1687 /* do a plain search */
1688 const char *s2 = lmemfind(init, es-init, p, ep-p);
1689 if (s2) {
1690 const char *e2 = s2 + (ep - p);
1691 if (iscont(e2)) e2 = utf8_next(e2, es);
1692 lua_pushinteger(L, idx = get_index(s2, s, es) + 1);
1693 lua_pushinteger(L, idx + get_index(e2, s2, es) - 1);
1694 return 2;
1695 }
1696 } else {
1697 MatchState ms;
1698 int anchor = (*p == '^');
1699 if (anchor) p++; /* skip anchor character */
1700 if (idx < 0) idx += utf8_length(s, es)+1; /* TODO not very good */
1701 ms.L = L;
1702 ms.matchdepth = MAXCCALLS;
1703 ms.src_init = s;
1704 ms.src_end = es;
1705 ms.p_end = ep;
1706 do {
1707 const char *res;
1708 ms.level = 0;
1709 assert(ms.matchdepth == MAXCCALLS);
1710 if ((res=match(&ms, init, p)) != NULL) {
1711 if (find) {
1712 lua_pushinteger(L, idx); /* start */
1713 lua_pushinteger(L, idx + utf8_length(init, res) - 1); /* end */
1714 return push_captures(&ms, NULL, 0) + 2;
1715 } else
1716 return push_captures(&ms, init, res);
1717 }
1718 if (init == es) break;
1719 idx += 1;
1720 init = utf8_next(init, es);
1721 } while (init <= es && !anchor);
1722 }
1723 lua_pushnil(L); /* not found */
1724 return 1;
1725}
1726
1727static int Lutf8_find (lua_State *L) { return find_aux(L, 1); }
1728static int Lutf8_match (lua_State *L) { return find_aux(L, 0); }
1729
1730static int gmatch_aux (lua_State *L) {
1731 MatchState ms;
1732 const char *es, *s = check_utf8(L, lua_upvalueindex(1), &es);
1733 const char *ep, *p = check_utf8(L, lua_upvalueindex(2), &ep);
1734 const char *src;
1735 ms.L = L;
1736 ms.matchdepth = MAXCCALLS;
1737 ms.src_init = s;
1738 ms.src_end = es;
1739 ms.p_end = ep;
1740 for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3));
1741 src <= ms.src_end;
1742 src = utf8_next(src, ms.src_end)) {
1743 const char *e;
1744 ms.level = 0;
1745 assert(ms.matchdepth == MAXCCALLS);
1746 if ((e = match(&ms, src, p)) != NULL) {
1747 lua_Integer newstart = e-s;
1748 if (e == src) newstart++; /* empty match? go at least one position */
1749 lua_pushinteger(L, newstart);
1750 lua_replace(L, lua_upvalueindex(3));
1751 return push_captures(&ms, src, e);
1752 }
1753 if (src == ms.src_end) break;
1754 }
1755 return 0; /* not found */
1756}
1757
1758static int Lutf8_gmatch (lua_State *L) {
1759 luaL_checkstring(L, 1);
1760 luaL_checkstring(L, 2);
1761 lua_settop(L, 2);
1762 lua_pushinteger(L, 0);
1763 lua_pushcclosure(L, gmatch_aux, 3);
1764 return 1;
1765}
1766
1767static void add_s (MatchState *ms, luaL_Buffer *b, const char *s, const char *e) {
1768 const char *new_end, *news = to_utf8(ms->L, 3, &new_end);
1769 while (news < new_end) {
1770 utfint ch = 0;
1771 news = utf8_safe_decode(ms->L, news, &ch);
1772 if (ch != L_ESC)
1773 add_utf8char(b, ch);
1774 else {
1775 news = utf8_safe_decode(ms->L, news, &ch); /* skip ESC */
1776 if (!utf8_isdigit(ch)) {
1777 if (ch != L_ESC)
1778 luaL_error(ms->L, "invalid use of " LUA_QL("%c")
1779 " in replacement string", L_ESC);
1780 add_utf8char(b, ch);
1781 } else if (ch == '0')
1782 luaL_addlstring(b, s, e-s);
1783 else {
1784 push_onecapture(ms, ch-'1', s, e);
1785 luaL_addvalue(b); /* add capture to accumulated result */
1786 }
1787 }
1788 }
1789}
1790
1791static void add_value (MatchState *ms, luaL_Buffer *b, const char *s, const char *e, int tr) {
1792 lua_State *L = ms->L;
1793 switch (tr) {
1794 case LUA_TFUNCTION: {
1795 int n;
1796 lua_pushvalue(L, 3);
1797 n = push_captures(ms, s, e);
1798 lua_call(L, n, 1);
1799 break;
1800 }
1801 case LUA_TTABLE: {
1802 push_onecapture(ms, 0, s, e);
1803 lua_gettable(L, 3);
1804 break;
1805 }
1806 default: { /* LUA_TNUMBER or LUA_TSTRING */
1807 add_s(ms, b, s, e);
1808 return;
1809 }
1810 }
1811 if (!lua_toboolean(L, -1)) { /* nil or false? */
1812 lua_pop(L, 1);
1813 lua_pushlstring(L, s, e - s); /* keep original text */
1814 } else if (!lua_isstring(L, -1))
1815 luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1));
1816 luaL_addvalue(b); /* add result to accumulator */
1817}
1818
1819static int Lutf8_gsub (lua_State *L) {
1820 const char *es, *s = check_utf8(L, 1, &es);
1821 const char *ep, *p = check_utf8(L, 2, &ep);
1822 int tr = lua_type(L, 3);
1823 lua_Integer max_s = luaL_optinteger(L, 4, (es-s)+1);
1824 int anchor = (*p == '^');
1825 lua_Integer n = 0;
1826 MatchState ms;
1827 luaL_Buffer b;
1828 luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING ||
1829 tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3,
1830 "string/function/table expected");
1831 luaL_buffinit(L, &b);
1832 if (anchor) p++; /* skip anchor character */
1833 ms.L = L;
1834 ms.matchdepth = MAXCCALLS;
1835 ms.src_init = s;
1836 ms.src_end = es;
1837 ms.p_end = ep;
1838 while (n < max_s) {
1839 const char *e;
1840 ms.level = 0;
1841 assert(ms.matchdepth == MAXCCALLS);
1842 e = match(&ms, s, p);
1843 if (e) {
1844 n++;
1845 add_value(&ms, &b, s, e, tr);
1846 }
1847 if (e && e > s) /* non empty match? */
1848 s = e; /* skip it */
1849 else if (s < es) {
1850 utfint ch = 0;
1851 s = utf8_safe_decode(L, s, &ch);
1852 add_utf8char(&b, ch);
1853 } else break;
1854 if (anchor) break;
1855 }
1856 luaL_addlstring(&b, s, es-s);
1857 luaL_pushresult(&b);
1858 lua_pushinteger(L, n); /* number of substitutions */
1859 return 2;
1860}
1861
1862static int Lutf8_isvalid(lua_State *L) {
1863 const char *e, *s = check_utf8(L, 1, &e);
1864 const char *invalid = utf8_invalid_offset(s, e);
1865 lua_pushboolean(L, invalid == NULL);
1866 return 1;
1867}
1868
1869static int Lutf8_invalidoffset(lua_State *L) {
1870 const char *e, *s = check_utf8(L, 1, &e);
1871 const char *orig_s = s;
1872 int offset = luaL_optinteger(L, 2, 0);
1873 if (offset > 1) {
1874 offset--;
1875 s += offset;
1876 if (s >= e) {
1877 lua_pushnil(L);
1878 return 1;
1879 }
1880 } else if (offset < 0 && s - e < offset) {
1881 s = e + offset;
1882 }
1883 const char *invalid = utf8_invalid_offset(s, e);
1884 if (invalid == NULL) {
1885 lua_pushnil(L);
1886 } else {
1887 lua_pushinteger(L, invalid - orig_s + 1);
1888 }
1889 return 1;
1890}
1891
1892static int Lutf8_clean(lua_State *L) {
1893 const char *e, *s = check_utf8(L, 1, &e);
1894
1895 /* Default replacement string is REPLACEMENT CHARACTER U+FFFD */
1896 size_t repl_len;
1897 const char *r = luaL_optlstring(L, 2, "\xEF\xBF\xBD", &repl_len);
1898
1899 if (lua_gettop(L) > 1) {
1900 /* Check if replacement string is valid UTF-8 or not */
1901 if (utf8_invalid_offset(r, r + repl_len) != NULL) {
1902 lua_pushstring(L, "replacement string must be valid UTF-8");
1903 lua_error(L);
1904 }
1905 }
1906
1907 const char *invalid = utf8_invalid_offset(s, e);
1908 if (invalid == NULL) {
1909 lua_settop(L, 1); /* Return input string without modification */
1910 lua_pushboolean(L, 1); /* String was clean already */
1911 return 2;
1912 }
1913
1914 luaL_Buffer buff;
1915 luaL_buffinit(L, &buff);
1916
1917 while (1) {
1918 /* Invariant: 's' points to first GOOD byte not in output buffer,
1919 * 'invalid' points to first BAD byte after that */
1920 luaL_addlstring(&buff, s, invalid - s);
1921 luaL_addlstring(&buff, r, repl_len);
1922 /* We do not replace every bad byte with the replacement character,
1923 * but rather a contiguous sequence of bad bytes
1924 * Restore the invariant by stepping forward until we find at least
1925 * one good byte */
1926 s = invalid;
1927 while (s == invalid) {
1928 s++;
1929 invalid = utf8_invalid_offset(s, e);
1930 }
1931 if (invalid == NULL) {
1932 luaL_addlstring(&buff, s, e - s);
1933 luaL_pushresult(&buff);
1934 lua_pushboolean(L, 0); /* String was not clean */
1935 return 2;
1936 }
1937 }
1938}
1939
1940static int Lutf8_isnfc(lua_State *L) {
1941 const char *e, *s = check_utf8(L, 1, &e);
1942 utfint starter = 0, ch;
1943 unsigned int prev_canon_cls = 0;
1944
1945 while (s < e) {
1946 s = utf8_decode(s, &ch, 1);
1947 if (s == NULL) {
1948 lua_pushstring(L, "string is not valid UTF-8");
1949 lua_error(L);
1950 }
1951 if (ch < 0x300) {
1952 starter = ch; /* Fast path */
1953 prev_canon_cls = 0;
1954 continue;
1955 }
1956
1957 unsigned int canon_cls = lookup_canon_cls(ch);
1958 if (canon_cls && canon_cls < prev_canon_cls) {
1959 /* Combining marks are out of order; this string is not NFC */
1960 lua_pushboolean(L, 0); /* Return false */
1961 return 1;
1962 }
1963
1964 nfc_table *entry = nfc_quickcheck(ch);
1965 if (entry && !nfc_check(ch, entry, starter, canon_cls, prev_canon_cls)) {
1966 lua_pushboolean(L, 0); /* Return false */
1967 return 1;
1968 }
1969
1970 prev_canon_cls = canon_cls;
1971 if (!canon_cls)
1972 starter = ch;
1973 }
1974
1975 lua_pushboolean(L, 1); /* Return true */
1976 return 1;
1977}
1978
1979static int Lutf8_normalize_nfc(lua_State *L) {
1980 const char *e, *s = check_utf8(L, 1, &e), *p = s, *starter_p = s;
1981 utfint starter = 0, ch;
1982 unsigned int prev_canon_cls = 0;
1983
1984 /* First scan to see if we can find any problems... if not, we may just return the
1985 * input string unchanged */
1986 while (p < e) {
1987 const char *new_p = utf8_decode(p, &ch, 1);
1988 if (new_p == NULL) {
1989 lua_pushstring(L, "string is not valid UTF-8");
1990 lua_error(L);
1991 }
1992
1993 unsigned int canon_cls = lookup_canon_cls(ch);
1994 if (canon_cls && canon_cls < prev_canon_cls) {
1995 goto build_string; /* Combining marks are out of order; this string is not NFC */
1996 }
1997
1998 nfc_table *entry = nfc_quickcheck(ch);
1999 if (entry && !nfc_check(ch, entry, starter, canon_cls, prev_canon_cls)) {
2000 goto build_string;
2001 }
2002
2003 prev_canon_cls = canon_cls;
2004 if (!canon_cls) {
2005 starter = ch;
2006 starter_p = p;
2007 }
2008 p = new_p;
2009 }
2010
2011 lua_settop(L, 1); /* Return input string without modification */
2012 lua_pushboolean(L, 1); /* String was in normal form already, so 2nd return value is 'true' */
2013 return 2;
2014
2015build_string: ;
2016 /* We will need to build a new string, this one is not NFC */
2017 luaL_Buffer buff;
2018 luaL_buffinit(L, &buff);
2019 luaL_addlstring(&buff, s, starter_p - s);
2020
2021 string_to_nfc(L, &buff, starter_p, e);
2022
2023 luaL_pushresult(&buff);
2024 lua_pushboolean(L, 0);
2025 return 2;
2026}
2027
2028static int iterate_grapheme_indices(lua_State *L) {
2029 const char *s = luaL_checkstring(L, lua_upvalueindex(1));
2030 lua_Integer pos = luaL_checkinteger(L, lua_upvalueindex(2));
2031 lua_Integer end = luaL_checkinteger(L, lua_upvalueindex(3));
2032
2033 if (pos > end) {
2034 lua_pushnil(L);
2035 return 1;
2036 }
2037 const char *e = s + end;
2038
2039 utfint ch, next_ch;
2040 const char *p = utf8_safe_decode(L, s + pos - 1, &ch);
2041
2042 while (1) {
2043 const char *next_p = utf8_safe_decode(L, p, &next_ch);
2044 int bind = 0;
2045
2046 if (ch == '\r') {
2047 if (next_ch == '\n') {
2048 /* CR binds to following LF */
2049 bind = 1;
2050 } else {
2051 break;
2052 }
2053 } else if (ch == '\n' || next_ch == '\r' || next_ch == '\n') {
2054 /* CR/LF do not bind to any other codepoint or in any other way */
2055 break;
2056 } else if (find_in_range(cntrl_table, table_size(cntrl_table), ch) && !find_in_range(prepend_table, table_size(prepend_table), ch) && ch != 0x200D) {
2057 /* Control characters do not bind to anything */
2058 break;
2059 } else if (next_ch == 0x200D) {
2060 /* U+200D is ZERO WIDTH JOINER, it always binds to preceding char */
2061 if (next_p < e && find_in_range(pictographic_table, table_size(pictographic_table), ch)) {
2062 /* After an Extended_Pictographic codepoint and ZWJ, we bind to a following Extended_Pictographic */
2063 utfint nextnext_ch;
2064 const char *probe_ep = utf8_safe_decode(L, next_p, &nextnext_ch);
2065 if (find_in_range(pictographic_table, table_size(pictographic_table), nextnext_ch)) {
2066 p = probe_ep;
2067 ch = nextnext_ch;
2068 continue;
2069 }
2070 }
2071 bind = 1;
2072 } else if (find_in_range(cntrl_table, table_size(cntrl_table), next_ch) && !find_in_range(prepend_table, table_size(prepend_table), next_ch)) {
2073 /* Control characters do not bind to anything */
2074 break;
2075 } else {
2076 if (indic_conjunct_type(ch) == INDIC_CONSONANT) {
2077 utfint probed_ch = next_ch;
2078 const char *probe = next_p;
2079 int indic_type = indic_conjunct_type(probed_ch);
2080 int saw_linker = 0;
2081 while (indic_type) {
2082 /* Consume any number of Extend or Linker codepoints, followed by a single Consonant
2083 * The sequence must contain at least one Linker, however! */
2084 if (indic_type == INDIC_LINKER) {
2085 saw_linker = 1;
2086 } else if (indic_type == INDIC_CONSONANT) {
2087 if (!saw_linker)
2088 break;
2089 p = probe;
2090 ch = probed_ch;
2091 goto next_iteration;
2092 }
2093 if (probe >= e)
2094 break;
2095 probe = utf8_safe_decode(L, probe, &probed_ch);
2096 indic_type = indic_conjunct_type(probed_ch);
2097 }
2098 }
2099
2100 if (find_in_range(compose_table, table_size(compose_table), next_ch) || (next_ch >= 0x1F3FB && next_ch <= 0x1F3FF)) {
2101 /* The 2nd codepoint has property Grapheme_Extend, or is an Emoji_Modifier codepoint */
2102 if (next_p < e && find_in_range(pictographic_table, table_size(pictographic_table), ch)) {
2103 /* Consume any number of 'extend' codepoints, one ZWJ, and following Extended_Pictographic codepoint */
2104 utfint probed_ch;
2105 const char *probe = next_p;
2106 while (probe < e) {
2107 probe = utf8_safe_decode(L, probe, &probed_ch);
2108 if (probed_ch == 0x200D) {
2109 if (probe < e) {
2110 probe = utf8_safe_decode(L, probe, &probed_ch);
2111 if (find_in_range(pictographic_table, table_size(pictographic_table), probed_ch)) {
2112 next_p = probe;
2113 next_ch = probed_ch;
2114 }
2115 }
2116 break;
2117 } else if (find_in_range(compose_table, table_size(compose_table), probed_ch) || (probed_ch >= 0x1F3FB && probed_ch <= 0x1F3FF)) {
2118 next_p = probe;
2119 next_ch = probed_ch;
2120 } else {
2121 break;
2122 }
2123 }
2124 }
2125 bind = 1;
2126 } else if (find_in_range(spacing_mark_table, table_size(spacing_mark_table), next_ch)) {
2127 /* The 2nd codepoint is in general category Spacing_Mark */
2128 bind = 1;
2129 } else if (find_in_range(prepend_table, table_size(prepend_table), ch)) {
2130 /* The 1st codepoint has property Prepend_Concatenation_Mark, or is a type of
2131 * Indic Syllable which binds to the following codepoint */
2132 bind = 1;
2133 } else if (ch >= 0x1F1E6 && ch <= 0x1F1FF && next_ch >= 0x1F1E6 && next_ch <= 0x1F1FF) {
2134 /* Regional Indicator (flag) emoji bind together; but only in twos */
2135 p = next_p;
2136 ch = 0xFFFE; /* Set 'ch' to bogus value so we will not re-enter this branch on next iteration */
2137 continue;
2138 } else {
2139 /* Korean Hangul codepoints have their own special rules about when they
2140 * are considered a single grapheme cluster */
2141 int hangul1 = hangul_type(ch);
2142 if (hangul1) {
2143 int hangul2 = hangul_type(next_ch);
2144 if (hangul2) {
2145 if (hangul1 == HANGUL_L) {
2146 bind = (hangul2 != HANGUL_T);
2147 } else if (hangul1 == HANGUL_LV || hangul1 == HANGUL_V) {
2148 bind = (hangul2 == HANGUL_V || hangul2 == HANGUL_T);
2149 } else if (hangul1 == HANGUL_LVT || hangul1 == HANGUL_T) {
2150 bind = (hangul2 == HANGUL_T);
2151 }
2152 }
2153 }
2154 }
2155 }
2156
2157 if (!bind)
2158 break;
2159 p = next_p;
2160 ch = next_ch;
2161next_iteration: ;
2162 }
2163
2164 lua_pushinteger(L, (p - s) + 1);
2165 lua_replace(L, lua_upvalueindex(2));
2166
2167 lua_pushinteger(L, pos);
2168 lua_pushinteger(L, p - s);
2169 return 2;
2170}
2171
2172static int Lutf8_grapheme_indices(lua_State *L) {
2173 size_t len;
2174 const char *s = luaL_checklstring(L, 1, &len);
2175 lua_Integer start = byte_relat(luaL_optinteger(L, 2, 1), len);
2176 lua_Integer end = byte_relat(luaL_optinteger(L, 3, len), len);
2177 luaL_argcheck(L, start >= 1, 2, "out of range");
2178 luaL_argcheck(L, end <= (lua_Integer)len, 3, "out of range");
2179
2180 lua_settop(L, 1);
2181 lua_pushinteger(L, start);
2182 lua_pushinteger(L, end);
2183 lua_pushcclosure(L, iterate_grapheme_indices, 3);
2184 return 1;
2185}
2186
2187/* lua module import interface */
2188
/* Lua pattern exported as utf8.charpattern.  The string may contain an
 * embedded NUL, so users must take its length from sizeof, not strlen.
 * Lua versions before 5.2 get the variant that spells the zero byte as %z
 * instead of a literal NUL. */
static const char UTF8PATT[] =
#if LUA_VERSION_NUM >= 502
  "[\0-\x7F\xC2-\xF4][\x80-\xBF]*";
#else
  "[%z\1-\x7F\xC2-\xF4][\x80-\xBF]*";
#endif
2194
2195LUALIB_API int luaopen_utf8 (lua_State *L) {
2196 luaL_Reg libs[] = {
2197#define ENTRY(name) { #name, Lutf8_##name }
2198 ENTRY(offset),
2199 ENTRY(codes),
2200 ENTRY(codepoint),
2201
2202 ENTRY(len),
2203 ENTRY(sub),
2204 ENTRY(reverse),
2205 ENTRY(lower),
2206 ENTRY(upper),
2207 ENTRY(title),
2208 ENTRY(fold),
2209 ENTRY(byte),
2210 ENTRY(char),
2211 ENTRY(escape),
2212 ENTRY(insert),
2213 ENTRY(remove),
2214 ENTRY(charpos),
2215 ENTRY(next),
2216 ENTRY(width),
2217 ENTRY(widthindex),
2218 ENTRY(ncasecmp),
2219 ENTRY(find),
2220 ENTRY(gmatch),
2221 ENTRY(gsub),
2222 ENTRY(match),
2223 ENTRY(isvalid),
2224 ENTRY(invalidoffset),
2225 ENTRY(clean),
2226 ENTRY(isnfc),
2227 ENTRY(normalize_nfc),
2228 ENTRY(grapheme_indices),
2229#undef ENTRY
2230 { NULL, NULL }
2231 };
2232
2233#if LUA_VERSION_NUM >= 502
2234 luaL_newlib(L, libs);
2235#else
2236 luaL_register(L, "utf8", libs);
2237#endif
2238
2239 lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)-1);
2240 lua_setfield(L, -2, "charpattern");
2241
2242 return 1;
2243}
2244
2245/* win32cc: flags+='-Wall -Wextra -s -O2 -mdll -DLUA_BUILD_AS_DLL'
2246 * win32cc: libs+='-llua54.dll' output='lua-utf8.dll'
2247 * win32cc: run='lua.exe test.lua'
2248 * maccc: run='lua -- test_compat.lua'
2249 * maccc: flags+='-g --coverage -bundle -undefined dynamic_lookup' output='lua-utf8.so' */
2250
static const double c[]
Definition rng.c:256