Asterisk - The Open Source Telephony Project  21.4.1
utf8.c
Go to the documentation of this file.
1 /*
2  * Asterisk -- An open source telephony toolkit.
3  *
4  * Copyright (C) 2020, Sean Bright
5  *
6  * Sean Bright <sean.bright@gmail.com>
7  *
8  * See http://www.asterisk.org for more information about
9  * the Asterisk project. Please do not directly contact
10  * any of the maintainers of this project for assistance;
11  * the project provides a web site, mailing lists and IRC
12  * channels for your use.
13  *
14  * This program is free software, distributed under the terms of
15  * the GNU General Public License Version 2. See the LICENSE file
16  * at the top of the source tree.
17  */
18 
19 /*! \file
20  *
21  * \brief UTF-8 information and validation functions
22  */
23 
24 /*** MODULEINFO
25  <support_level>core</support_level>
26 ***/
27 
28 #include "asterisk.h"
29 
30 #include "asterisk/utils.h"
31 #include "asterisk/utf8.h"
32 #include "asterisk/test.h"
33 
34 /*
35  * BEGIN THIRD PARTY CODE
36  *
37  * Copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>
38  *
39  * Permission is hereby granted, free of charge, to any person obtaining a copy
40  * of this software and associated documentation files (the "Software"), to deal
41  * in the Software without restriction, including without limitation the rights
42  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
43  * copies of the Software, and to permit persons to whom the Software is
44  * furnished to do so, subject to the following conditions:
45  *
46  * The above copyright notice and this permission notice shall be included in all
47  * copies or substantial portions of the Software.
48  *
49  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
52  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
53  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
54  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
55  * SOFTWARE.
56  *
57  * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
58  */
59 
60 #define UTF8_ACCEPT 0
61 #define UTF8_REJECT 12
62 
63 static const uint8_t utf8d[] = {
64  /* The first part of the table (256 entries, indexed by byte value) maps
65  * each byte to a character class, which reduces the size of the transition table. */
66  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
67  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
68  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
69  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
70  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
71  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
72  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
73  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
74 
75  /* The second part is a transition table that maps a combination
76  * of a state of the automaton and a character class to a state. It is read as utf8d[256 + state + class], with 12 entries per state (UTF8_ACCEPT is row 0, UTF8_REJECT is row 12). */
77  0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
78  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
79  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
80  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
81  12,36,12,12,12,12,12,12,12,12,12,12,
82 };
83 
84 #if 0
85 /* Disabled variant of decode() that additionally accumulates the decoded value into *codep. It can be re-enabled if a caller ever needs the code point rather than just validity. */
86 static uint32_t inline decode(uint32_t *state, uint32_t *codep, uint32_t byte) {
87  uint32_t type = utf8d[byte];
88 
89  *codep = (*state != UTF8_ACCEPT) ?
90  (byte & 0x3fu) | (*codep << 6) :
91  (0xff >> type) & (byte);
92 
93  *state = utf8d[256 + *state + type];
94  return *state;
95 }
96 #endif
97 
98 static uint32_t inline decode(uint32_t *state, uint32_t byte) {
99  uint32_t type = utf8d[byte];
100  *state = utf8d[256 + *state + type];
101  return *state;
102 }
103 
104 /*
105  * END THIRD PARTY CODE
106  *
107  * See copyright notice above.
108  */
109 
110 int ast_utf8_is_valid(const char *src)
111 {
112  uint32_t state = UTF8_ACCEPT;
113 
114  while (*src) {
115  decode(&state, (uint8_t) *src++);
116  }
117 
118  return state == UTF8_ACCEPT;
119 }
120 
121 int ast_utf8_is_validn(const char *src, size_t size)
122 {
123  uint32_t state = UTF8_ACCEPT;
124 
125  while (size && *src) {
126  decode(&state, (uint8_t) *src++);
127  size--;
128  }
129 
130  return state == UTF8_ACCEPT;
131 }
132 
133 void ast_utf8_copy_string(char *dst, const char *src, size_t size)
134 {
135  uint32_t state = UTF8_ACCEPT;
136  char *last_good = dst;
137 
138  ast_assert(size > 0);
139 
140  while (size && *src) {
141  if (decode(&state, (uint8_t) *src) == UTF8_REJECT) {
142  /* We _could_ replace with U+FFFD and try to recover, but for now
143  * we treat this the same as if we had run out of space */
144  break;
145  }
146 
147  *dst++ = *src++;
148  size--;
149 
150  if (size && state == UTF8_ACCEPT) {
151  /* last_good is where we will ultimately write the 0 byte */
152  last_good = dst;
153  }
154  }
155 
156  *last_good = '\0';
157 }
158 
159 /*!
160  * \warning A UTF-8 sequence could be 1, 2, 3 or 4 bytes long depending
161  * on the first byte in the sequence. Don't try to modify this function
162  * without understanding how UTF-8 works.
163  */
164 
165 /*
166  * The official unicode replacement character is U+FFFD
167  * which is actually the 3 following bytes:
168  */
169 #define REPL_SEQ "\xEF\xBF\xBD"
170 #define REPL_SEQ_LEN 3
171 
173 ast_utf8_replace_invalid_chars(char *dst, size_t *dst_size, const char *src,
174  size_t src_len)
175 {
177  size_t src_pos = 0;
178  size_t dst_pos = 0;
179  uint32_t prev_state = UTF8_ACCEPT;
180  uint32_t curr_state = UTF8_ACCEPT;
181  /*
182  * UTF-8 sequences can be 1 - 4 bytes in length so we
183  * have to keep track of where we are.
184  */
185  int seq_len = 0;
186 
187  if (dst) {
188  memset(dst, 0, *dst_size);
189  } else {
190  *dst_size = 0;
191  }
192 
193  if (!src || src_len == 0) {
194  return AST_UTF8_REPLACE_VALID;
195  }
196 
197  for (prev_state = 0, curr_state = 0; src_pos < src_len; prev_state = curr_state, src_pos++) {
198  uint32_t rc;
199 
200  rc = decode(&curr_state, (uint8_t) src[src_pos]);
201 
202  if (dst && dst_pos >= *dst_size - 1) {
203  if (prev_state > UTF8_REJECT) {
204  /*
205  * We ran out of space in the middle of a possible
206  * multi-byte sequence so we have to back up and
207  * overwrite the start of the sequence with the
208  * NULL terminator.
209  */
210  dst_pos -= (seq_len - (prev_state / 36));
211  }
212  dst[dst_pos] = '\0';
213 
215  }
216 
217  if (rc == UTF8_ACCEPT) {
218  if (dst) {
219  dst[dst_pos] = src[src_pos];
220  }
221  dst_pos++;
222  seq_len = 0;
223  }
224 
225  if (rc > UTF8_REJECT) {
226  /*
227  * We're possibly at the start of, or in the middle of,
228  * a multi-byte sequence. The curr_state will tell us how many
229  * bytes _should_ be remaining in the sequence.
230  */
231  if (prev_state == UTF8_ACCEPT) {
232  /* If the previous state was a good character then
233  * this can only be the start of s sequence
234  * which is all we care about.
235  */
236  seq_len = curr_state / 36 + 1;
237  }
238 
239  if (dst) {
240  dst[dst_pos] = src[src_pos];
241  }
242  dst_pos++;
243  }
244 
245  if (rc == UTF8_REJECT) {
246  /* We got at least 1 rejection so the string is invalid */
248 
249  if (prev_state != UTF8_ACCEPT) {
250  /*
251  * If we were in a multi-byte sequence and this
252  * byte isn't valid at this time, we'll back
253  * the destination pointer back to the start
254  * of the now-invalid sequence and write the
255  * replacement bytes there. Then we'll
256  * process the current byte again in the next
257  * loop iteration. It may be quite valid later.
258  */
259  dst_pos -= (seq_len - (prev_state / 36));
260  src_pos--;
261  }
262  if (dst) {
263  /*
264  * If we're not just calculating the needed destination
265  * buffer space, and we don't have enough room to write
266  * the replacement sequence, terminate the output
267  * and return.
268  */
269  if (dst_pos > *dst_size - 4) {
270  dst[dst_pos] = '\0';
272  }
273  memcpy(&dst[dst_pos], REPL_SEQ, REPL_SEQ_LEN);
274  }
275  dst_pos += REPL_SEQ_LEN;
276  /* Reset the state machine */
277  curr_state = UTF8_ACCEPT;
278  }
279  }
280 
281  if (curr_state != UTF8_ACCEPT) {
282  /*
283  * We were probably in the middle of a
284  * sequence and ran out of space.
285  */
286  res = AST_UTF8_INVALID;
287  dst_pos -= (seq_len - (prev_state / 36));
288  if (dst) {
289  if (dst_pos > *dst_size - 4) {
290  dst[dst_pos] = '\0';
292  }
293  memcpy(&dst[dst_pos], REPL_SEQ, REPL_SEQ_LEN);
294  }
295  dst_pos += REPL_SEQ_LEN;
296  }
297 
298  if (dst) {
299  dst[dst_pos] = '\0';
300  } else {
301  *dst_size = dst_pos + 1;
302  }
303 
304  return res;
305 }
306 
/*! \brief Incremental UTF-8 validator: carries the automaton state between feeds. */
struct ast_utf8_validator {
	uint32_t state;
};
310 
312 {
313  struct ast_utf8_validator *tmp = ast_malloc(sizeof(*tmp));
314 
315  if (!tmp) {
316  return 1;
317  }
318 
319  tmp->state = UTF8_ACCEPT;
320  *validator = tmp;
321  return 0;
322 }
323 
325  struct ast_utf8_validator *validator)
326 {
327  switch (validator->state) {
328  case UTF8_ACCEPT:
329  return AST_UTF8_VALID;
330  case UTF8_REJECT:
331  return AST_UTF8_INVALID;
332  default:
333  return AST_UTF8_UNKNOWN;
334  }
335 }
336 
338  struct ast_utf8_validator *validator, const char *data)
339 {
340  while (*data) {
341  decode(&validator->state, (uint8_t) *data++);
342  }
343 
344  return ast_utf8_validator_state(validator);
345 }
346 
348  struct ast_utf8_validator *validator, const char *data, size_t size)
349 {
350  while (size && *data) {
351  decode(&validator->state, (uint8_t) *data++);
352  size--;
353  }
354 
355  return ast_utf8_validator_state(validator);
356 }
357 
359 {
360  validator->state = UTF8_ACCEPT;
361 }
362 
/*! \brief Release a validator; it must not be used afterwards. */
void ast_utf8_validator_destroy(struct ast_utf8_validator *validator)
{
	ast_free(validator);
}
367 
368 #ifdef TEST_FRAMEWORK
369 
370 #include "asterisk/json.h"
371 
/*
 * Unit test for ast_utf8_is_valid() and ast_utf8_is_validn(): checks that
 * well-formed strings are accepted and malformed ones are rejected.
 */
372 AST_TEST_DEFINE(test_utf8_is_valid)
373 {
374  switch (cmd) {
375  case TEST_INIT:
376  info->name = "is_valid";
377  info->category = "/main/utf8/";
378  info->summary = "Test ast_utf8_is_valid and ast_utf8_is_validn";
379  info->description =
380  "Tests UTF-8 string validation code.";
381  return AST_TEST_NOT_RUN;
382  case TEST_EXECUTE:
383  break;
384  }
385 
386  /* Valid UTF-8: plain ASCII, then one 2-byte, one 3-byte and one 4-byte sequence */
387  ast_test_validate(test, ast_utf8_is_valid("Asterisk"));
388  ast_test_validate(test, ast_utf8_is_valid("\xce\xbb"));
389  ast_test_validate(test, ast_utf8_is_valid("\xe2\x8a\x9b"));
390  ast_test_validate(test, ast_utf8_is_valid("\xf0\x9f\x93\x9e"));
391 
392  /* The same strings with leading ASCII text */
393  ast_test_validate(test, ast_utf8_is_valid("aaa Asterisk"));
394  ast_test_validate(test, ast_utf8_is_valid("aaa \xce\xbb"));
395  ast_test_validate(test, ast_utf8_is_valid("aaa \xe2\x8a\x9b"));
396  ast_test_validate(test, ast_utf8_is_valid("aaa \xf0\x9f\x93\x9e"));
397 
398  /* The same strings with trailing ASCII text */
399  ast_test_validate(test, ast_utf8_is_valid("Asterisk aaa"));
400  ast_test_validate(test, ast_utf8_is_valid("\xce\xbb aaa"));
401  ast_test_validate(test, ast_utf8_is_valid("\xe2\x8a\x9b aaa"));
402  ast_test_validate(test, ast_utf8_is_valid("\xf0\x9f\x93\x9e aaa"));
403 
404  /* The same strings with both leading and trailing ASCII text */
405  ast_test_validate(test, ast_utf8_is_valid("aaa Asterisk aaa"));
406  ast_test_validate(test, ast_utf8_is_valid("aaa \xce\xbb aaa"));
407  ast_test_validate(test, ast_utf8_is_valid("aaa \xe2\x8a\x9b aaa"));
408  ast_test_validate(test, ast_utf8_is_valid("aaa \xf0\x9f\x93\x9e aaa"));
409 
410  /* Valid if limited by number of bytes: the appended \xff lies beyond the given length */
411  ast_test_validate(test, ast_utf8_is_validn("Asterisk" "\xff", strlen("Asterisk")));
412  ast_test_validate(test, ast_utf8_is_validn("\xce\xbb" "\xff", strlen("\xce\xbb")));
413  ast_test_validate(test, ast_utf8_is_validn("\xe2\x8a\x9b" "\xff", strlen("\xe2\x8a\x9b")));
414  ast_test_validate(test, ast_utf8_is_validn("\xf0\x9f\x93\x9e" "\xff", strlen("\xf0\x9f\x93\x9e")));
415 
416  /* Invalid: every one of these must be reported as not valid */
417  ast_test_validate(test, !ast_utf8_is_valid("\xc0\x8a")); /* Overlong */
418  ast_test_validate(test, !ast_utf8_is_valid("98.6\xa7")); /* 'High ASCII' */
419  ast_test_validate(test, !ast_utf8_is_valid("\xc3\x28"));
420  ast_test_validate(test, !ast_utf8_is_valid("\xa0\xa1"));
421  ast_test_validate(test, !ast_utf8_is_valid("\xe2\x28\xa1"));
422  ast_test_validate(test, !ast_utf8_is_valid("\xe2\x82\x28"));
423  ast_test_validate(test, !ast_utf8_is_valid("\xf0\x28\x8c\xbc"));
424  ast_test_validate(test, !ast_utf8_is_valid("\xf0\x90\x28\xbc"));
425  ast_test_validate(test, !ast_utf8_is_valid("\xf0\x28\x8c\x28"));
426 
427  return AST_TEST_PASS;
428 }
429 
/*
 * Helper for the copy_string test: copy src into a scratch buffer of
 * dst_len bytes with ast_utf8_copy_string() and report whether the result
 * equals cmp.
 *
 * Returns 1 when the copied string matches cmp exactly, 0 otherwise.
 */
static int test_copy_and_compare(const char *src, size_t dst_len, const char *cmp)
{
	char scratch[dst_len];
	int differs;

	ast_utf8_copy_string(scratch, src, dst_len);
	differs = strcmp(scratch, cmp);

	return !differs;
}
436 
/*
 * Unit test for ast_utf8_copy_string(): each case copies a source string
 * into a buffer of the given size and compares against the expected result.
 */
437 AST_TEST_DEFINE(test_utf8_copy_string)
438 {
439  switch (cmd) {
440  case TEST_INIT:
441  info->name = "copy_string";
442  info->category = "/main/utf8/";
443  info->summary = "Test ast_utf8_copy_string";
444  info->description =
445  "Tests UTF-8 string copying code.";
446  return AST_TEST_NOT_RUN;
447  case TEST_EXECUTE:
448  break;
449  }
450 
/* Plain ASCII is cut to fit the buffer, leaving room for the terminator */
451  ast_test_validate(test, test_copy_and_compare("Asterisk", 6, "Aster"));
/* A 2-byte sequence that does not fit completely is dropped as a whole */
452  ast_test_validate(test, test_copy_and_compare("Asterisk \xc2\xae", 11, "Asterisk "));
453  ast_test_validate(test, test_copy_and_compare("Asterisk \xc2\xae", 12, "Asterisk \xc2\xae"));
/* Copying stops in front of an invalid (overlong) sequence */
454  ast_test_validate(test, test_copy_and_compare("Asterisk \xc0\x8a", 12, "Asterisk "));
/* Same source with buffer sizes 1 through 7 */
455  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 1, ""));
456  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 2, ""));
457  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 3, "\xce\xbb"));
458  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 4, "\xce\xbb "));
459  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 5, "\xce\xbb x"));
460  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 6, "\xce\xbb xy"));
461  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 7, "\xce\xbb xyz"));
462 
463  return AST_TEST_PASS;
464 }
465 
466 /*
467  * Let the replace function determine how much
468  * buffer space is required for the destination.
469  */
470 #define SIZE_REQUIRED 0
471 /*
472  * Set the destination buffer size to the size
473  * we expect it to be. 0xDead has no meaning
474  * other than it's larger than any test needs
475  * a buffer to be.
476  */
477 #define SIZE_EXPECTED 0xDead
478 
479 static int tracs(int run, const char *src, const char *cmp,
480  size_t dst_size, enum ast_utf8_replace_result exp_result)
481 {
482  char *dst = NULL;
483  struct ast_json *blob;
484  enum ast_utf8_replace_result result;
485 
486  if (dst_size == SIZE_REQUIRED) {
487  ast_utf8_replace_invalid_chars(dst, &dst_size, src, src ? strlen(src) : 0);
488  } else if (dst_size == SIZE_EXPECTED) {
489  dst_size = strlen(cmp) + 1;
490  }
491 
492  dst = (char *)ast_alloca(dst_size);
493  result = ast_utf8_replace_invalid_chars(dst, &dst_size, src, src ? strlen(src) : 0);
494  if (result != exp_result || strcmp(dst, cmp) != 0) {
495  ast_log(LOG_ERROR, "Run: %2d Invalid result. Src: '%s', Dst: '%s', ExpDst: '%s' Result: %d ExpResult: %d\n",
496  run, src, dst, cmp, result, exp_result);
497  return 0;
498  }
499 
500  /*
501  * The ultimate test: Does jansson accept the result as valid UTF-8?
502  */
503  blob = ast_json_pack("{s: s, s: s}",
504  "variable", "doesntmatter",
505  "value", dst);
506  ast_json_unref(blob);
507 
508  return blob != NULL;
509 }
510 
511 #define ATV(t, v) ast_test_validate(t, v)
512 
513 AST_TEST_DEFINE(test_utf8_replace_invalid_chars)
514 {
515  const char *src;
516  size_t dst_size;
517  enum ast_utf8_replace_result result;
518  int k = 0;
519 
520  switch (cmd) {
521  case TEST_INIT:
522  info->name = "replace_invalid";
523  info->category = "/main/utf8/";
524  info->summary = "Test ast_utf8_replace_invalid_chars";
525  info->description =
526  "Tests UTF-8 string copying/replacing code.";
527  return AST_TEST_NOT_RUN;
528  case TEST_EXECUTE:
529  break;
530  }
531 
532 /*
533  Table 3-7. Well-Formed UTF-8 Byte Sequences
534  Code Points First Second Third Fourth
535  Byte Byte Byte Byte
536  U+0000..U+007F 00..7F
537  U+0080..U+07FF C2..DF 80..BF
538  U+0800..U+0FFF E0 A0..BF 80..BF
539  U+1000..U+CFFF E1..EC 80..BF 80..BF
540  U+D000..U+D7FF ED 80..9F 80..BF
541  U+E000..U+FFFF EE..EF 80..BF 80..BF
542  U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
543  U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
544  U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
545 
546  Older compilers don't support using the \uXXXX or \UXXXXXXXX
547  universal character notation so we have to manually specify
548  the byte sequences even for valid UTF-8 sequences.
549 
550  These are the ones used for the tests below:
551 
552  \u00B0 = \xC2\xB0
553  \u0800 = \xE0\xA0\x80
554  \uE000 = \xEE\x80\x80
555  \U00040000 = \xF1\x80\x80\x80
556 */
557 
558  /*
559  * Check that NULL destination with a valid source string gives us a
560  * valid result code and buffer size = the length of the input string
561  * plus room for the NULL terminator.
562  */
563  src = "ABC\xC2\xB0xyz";
564  result = ast_utf8_replace_invalid_chars(NULL, &dst_size, src, src ? strlen(src) : 0);
565  ATV(test, result == AST_UTF8_REPLACE_VALID && dst_size == strlen(src) + 1);
566 
567  /*
568  * Check that NULL destination with an invalid source string gives us an
569  * invalid result code and buffer size = the length of the input string
570  * plus room for the NULL terminator plus the 2 extra bytes needed for
571  * the one replacement character.
572  */
573  src = "ABC\xFFxyz";
574  result = ast_utf8_replace_invalid_chars(NULL, &dst_size, src, src ? strlen(src) : 0);
575  ATV(test, result == AST_UTF8_REPLACE_INVALID && dst_size == strlen(src) + 3);
576 
577  /*
578  * NULL or empty input
579  */
580  ATV(test, tracs(__LINE__, NULL, "", 80, AST_UTF8_REPLACE_VALID));
581  ATV(test, tracs(__LINE__, "", "", 80, AST_UTF8_REPLACE_VALID));
582 
583 
584  /* Let the replace function calculate the space needed for result */
585  k = SIZE_REQUIRED;
586 
587  /*
588  * Basic ASCII string
589  */
590  ATV(test, tracs(__LINE__, "ABC xyzA", "ABC xyzA", k, AST_UTF8_REPLACE_VALID));
591 
592  /*
593  * Mid string.
594  */
595  /* good single sequences */
596  ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "ABC\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
597  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC\xE0\xA0\x80xyz", k, AST_UTF8_REPLACE_VALID));
598  ATV(test, tracs(__LINE__, "ABC\xF1\x80\x80\x80xyz", "ABC\xF1\x80\x80\x80xyz", k, AST_UTF8_REPLACE_VALID));
599  /* good multiple adjacent sequences */
600  ATV(test, tracs(__LINE__, "ABC\xC2\xB0\xC2\xB0xyz", "ABC\xC2\xB0\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
601  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80\xC2\xB0xyz", "ABC\xE0\xA0\x80\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
602  ATV(test, tracs(__LINE__, "ABC\xF1\x80\x80\x80\xC2\xB0xyz", "ABC\xF1\x80\x80\x80\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
603  /* Bad sequences */
604  ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
605  ATV(test, tracs(__LINE__, "ABC\xC2\xC2xyz", "ABC\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
606  ATV(test, tracs(__LINE__, "ABC\xB0xyz", "ABC\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
607  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\xC2xyz", "ABC\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
608  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\xF5xyz", "ABC\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
609  ATV(test, tracs(__LINE__, "ABC\xE0\xA0xyz", "ABC\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
610 
611  /*
612  * Beginning of string.
613  */
614  /* good single sequences */
615  ATV(test, tracs(__LINE__, "\xC2\xB0xyz", "\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
616  ATV(test, tracs(__LINE__, "\xE0\xA0\x80xyz", "\xE0\xA0\x80xyz", k, AST_UTF8_REPLACE_VALID));
617  ATV(test, tracs(__LINE__, "\xF1\x80\x80\x80xyz", "\xF1\x80\x80\x80xyz", k, AST_UTF8_REPLACE_VALID));
618  /* good multiple adjacent sequences */
619  ATV(test, tracs(__LINE__, "\xC2\xB0\xC2\xB0xyz", "\xC2\xB0\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
620  ATV(test, tracs(__LINE__, "\xE0\xA0\x80\xC2\xB0xyz", "\xE0\xA0\x80\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
621  ATV(test, tracs(__LINE__, "\xF1\x80\x80\x80\xC2\xB0xyz", "\xF1\x80\x80\x80\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
622  /* Bad sequences */
623  ATV(test, tracs(__LINE__, "\xC2xyz", "\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
624  ATV(test, tracs(__LINE__, "\xC2\xC2xyz", "\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
625  ATV(test, tracs(__LINE__, "\xB0xyz", "\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
626  ATV(test, tracs(__LINE__, "\xE0\xA0\xC2xyz", "\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
627  ATV(test, tracs(__LINE__, "\xE0\xA0\xF5xyz", "\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
628  ATV(test, tracs(__LINE__, "\xE0\xA0xyz", "\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
629 
630  /*
631  * End of string.
632  */
633  /* good single sequences */
634  ATV(test, tracs(__LINE__, "ABC\xC2\xB0", "ABC\xC2\xB0", k, AST_UTF8_REPLACE_VALID));
635  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80", "ABC\xE0\xA0\x80", k, AST_UTF8_REPLACE_VALID));
636  ATV(test, tracs(__LINE__, "ABC\xF1\x80\x80\x80", "ABC\xF1\x80\x80\x80", k, AST_UTF8_REPLACE_VALID));
637  /* good multiple adjacent sequences */
638  ATV(test, tracs(__LINE__, "ABC\xC2\xB0\xC2\xB0", "ABC\xC2\xB0\xC2\xB0", k, AST_UTF8_REPLACE_VALID));
639  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80\xC2\xB0", "ABC\xE0\xA0\x80\xC2\xB0", k, AST_UTF8_REPLACE_VALID));
640  ATV(test, tracs(__LINE__, "ABC\xF1\x80\x80\x80\xC2\xB0", "ABC\xF1\x80\x80\x80\xC2\xB0", k, AST_UTF8_REPLACE_VALID));
641  /* Bad sequences */
642  ATV(test, tracs(__LINE__, "ABC\xC2", "ABC\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
643  ATV(test, tracs(__LINE__, "ABC\xC2\xC2", "ABC\xEF\xBF\xBD\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
644  ATV(test, tracs(__LINE__, "ABC\xB0", "ABC\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
645  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\xC2", "ABC\xEF\xBF\xBD\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
646  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\xF5", "ABC\xEF\xBF\xBD\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
647  ATV(test, tracs(__LINE__, "ABC\xE0\xA0", "ABC\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
648 
649 
650  /* Force destination buffer to be only large enough to hold the expected result */
651  k = SIZE_EXPECTED;
652 
653  /*
654  * Mid string.
655  */
656  /* good single sequences */
657  ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "ABC\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
658  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC\xE0\xA0\x80xyz", k, AST_UTF8_REPLACE_VALID));
659  ATV(test, tracs(__LINE__, "ABC\xF1\x80\x80\x80xyz", "ABC\xF1\x80\x80\x80xyz", k, AST_UTF8_REPLACE_VALID));
660  /* good multiple adjacent sequences */
661  ATV(test, tracs(__LINE__, "ABC\xC2\xB0\xC2\xB0xyz", "ABC\xC2\xB0\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
662  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80\xC2\xB0xyz", "ABC\xE0\xA0\x80\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
663  ATV(test, tracs(__LINE__, "ABC\xF1\x80\x80\x80\xC2\xB0xyz", "ABC\xF1\x80\x80\x80\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
664  /* Bad sequences */
665  ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
666  ATV(test, tracs(__LINE__, "ABC\xC2\xC2xyz", "ABC\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
667  ATV(test, tracs(__LINE__, "ABC\xB0xyz", "ABC\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
668  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\xC2xyz", "ABC\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
669  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\xF5xyz", "ABC\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
670  ATV(test, tracs(__LINE__, "ABC\xE0\xA0xyz", "ABC\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
671 
672  /*
673  * Beginning of string.
674  */
675  /* good single sequences */
676  ATV(test, tracs(__LINE__, "\xC2\xB0xyz", "\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
677  ATV(test, tracs(__LINE__, "\xE0\xA0\x80xyz", "\xE0\xA0\x80xyz", k, AST_UTF8_REPLACE_VALID));
678  ATV(test, tracs(__LINE__, "\xF1\x80\x80\x80xyz", "\xF1\x80\x80\x80xyz", k, AST_UTF8_REPLACE_VALID));
679  /* good multiple adjacent sequences */
680  ATV(test, tracs(__LINE__, "\xC2\xB0\xC2\xB0xyz", "\xC2\xB0\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
681  ATV(test, tracs(__LINE__, "\xE0\xA0\x80\xC2\xB0xyz", "\xE0\xA0\x80\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
682  ATV(test, tracs(__LINE__, "\xF1\x80\x80\x80\xC2\xB0xyz", "\xF1\x80\x80\x80\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
683  /* Bad sequences */
684  ATV(test, tracs(__LINE__, "\xC2xyz", "\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
685  ATV(test, tracs(__LINE__, "\xC2\xC2xyz", "\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
686  ATV(test, tracs(__LINE__, "\xB0xyz", "\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
687  ATV(test, tracs(__LINE__, "\xE0\xA0\xC2xyz", "\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
688  ATV(test, tracs(__LINE__, "\xE0\xA0\xF5xyz", "\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
689  ATV(test, tracs(__LINE__, "\xE0\xA0xyz", "\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
690 
691  /*
692  * End of string.
693  */
694  /* good single sequences */
695  ATV(test, tracs(__LINE__, "ABC\xC2\xB0", "ABC\xC2\xB0", k, AST_UTF8_REPLACE_VALID));
696  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80", "ABC\xE0\xA0\x80", k, AST_UTF8_REPLACE_VALID));
697  ATV(test, tracs(__LINE__, "ABC\xF1\x80\x80\x80", "ABC\xF1\x80\x80\x80", k, AST_UTF8_REPLACE_VALID));
698  /* good multiple adjacent sequences */
699  ATV(test, tracs(__LINE__, "ABC\xC2\xB0\xC2\xB0", "ABC\xC2\xB0\xC2\xB0", k, AST_UTF8_REPLACE_VALID));
700  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80\xC2\xB0", "ABC\xE0\xA0\x80\xC2\xB0", k, AST_UTF8_REPLACE_VALID));
701  ATV(test, tracs(__LINE__, "ABC\xF1\x80\x80\x80\xC2\xB0", "ABC\xF1\x80\x80\x80\xC2\xB0", k, AST_UTF8_REPLACE_VALID));
702  /* Bad sequences */
703  ATV(test, tracs(__LINE__, "ABC\xC2", "ABC\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
704  ATV(test, tracs(__LINE__, "ABC\xC2\xC2", "ABC\xEF\xBF\xBD\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
705  ATV(test, tracs(__LINE__, "ABC\xB0", "ABC\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
706  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\xC2", "ABC\xEF\xBF\xBD\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
707  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\xF5", "ABC\xEF\xBF\xBD\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
708  ATV(test, tracs(__LINE__, "ABC\xE0\xA0", "ABC\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
709 
710 
711  /*
712  * Overrun Prevention
713  */
714 
715  /* No frills. */
716  k = 9;
717  ATV(test, tracs(__LINE__, "ABC xyzA", "ABC xyzA", k--, AST_UTF8_REPLACE_VALID));
718  ATV(test, tracs(__LINE__, "ABC xyzA", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
719  ATV(test, tracs(__LINE__, "ABC xyzA", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
720 
721  /* good single sequences */
722  k = 9; /* \xC2\xB0 needs 2 bytes */
723  ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "ABC\xC2\xB0xyz", k--, AST_UTF8_REPLACE_VALID));
724  ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "ABC\xC2\xB0xy", k--, AST_UTF8_REPLACE_OVERRUN));
725  ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "ABC\xC2\xB0x", k--, AST_UTF8_REPLACE_OVERRUN));
726  ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "ABC\xC2\xB0", k--, AST_UTF8_REPLACE_OVERRUN));
727  ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "ABC", k--, AST_UTF8_REPLACE_OVERRUN));
728  ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "ABC", k--, AST_UTF8_REPLACE_OVERRUN));
729  ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "AB", k--, AST_UTF8_REPLACE_OVERRUN));
730 
731  k = 10; /* \xE0\xA0\x80 needs 3 bytes */
732  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC\xE0\xA0\x80xyz", k--, AST_UTF8_REPLACE_VALID));
733  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC\xE0\xA0\x80xy", k--, AST_UTF8_REPLACE_OVERRUN));
734  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC\xE0\xA0\x80x", k--, AST_UTF8_REPLACE_OVERRUN));
735  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC\xE0\xA0\x80", k--, AST_UTF8_REPLACE_OVERRUN));
736  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC", k--, AST_UTF8_REPLACE_OVERRUN));
737  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC", k--, AST_UTF8_REPLACE_OVERRUN));
738  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC", k--, AST_UTF8_REPLACE_OVERRUN));
739  ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "AB", k--, AST_UTF8_REPLACE_OVERRUN));
740 
741  k = 10; /* \xEF\xBF\xBD needs 3 bytes */
742  ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC\xEF\xBF\xBDxyz", k--, AST_UTF8_REPLACE_INVALID));
743  ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC\xEF\xBF\xBDxy", k--, AST_UTF8_REPLACE_OVERRUN));
744  ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC\xEF\xBF\xBDx", k--, AST_UTF8_REPLACE_OVERRUN));
745  ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
746  ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC", k--, AST_UTF8_REPLACE_OVERRUN));
747  ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC", k--, AST_UTF8_REPLACE_OVERRUN));
748  ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC", k--, AST_UTF8_REPLACE_OVERRUN));
749  ATV(test, tracs(__LINE__, "ABC\xC2xyz", "AB", k--, AST_UTF8_REPLACE_OVERRUN));
750 
751  k = 14; /* Each \xEF\xBF\xBD needs 3 bytes */
752  ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC xyz\xEF\xBF\xBD\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_INVALID));
753  ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
754  ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
755  ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
756  ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
757  ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
758  ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
759  ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
760  ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC x", k--, AST_UTF8_REPLACE_OVERRUN));
761 
762  /*
763  * The following tests are classed as "Everything including the kitchen sink".
764  * Some tests may be redundant.
765  */
766  k = 11;
767  ATV(test, tracs(__LINE__, "ABC xyz\xFF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_INVALID));
768  ATV(test, tracs(__LINE__, "ABC xyz\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
769  ATV(test, tracs(__LINE__, "ABC xyz\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
770  ATV(test, tracs(__LINE__, "ABC xyz\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
771  ATV(test, tracs(__LINE__, "ABC xyz\xFF", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
772 
773  k = 11;
774  ATV(test, tracs(__LINE__, "ABC xyz\xC2\xB0", "ABC xyz\xC2\xB0", k--, AST_UTF8_REPLACE_VALID));
775  ATV(test, tracs(__LINE__, "ABC xyz\xC2\xB0", "ABC xyz\xC2\xB0", k--, AST_UTF8_REPLACE_VALID));
776  ATV(test, tracs(__LINE__, "ABC xyz\xC2\xB0", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
777  ATV(test, tracs(__LINE__, "ABC xyz\xC2\xB0", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
778  ATV(test, tracs(__LINE__, "ABC xyz\xC2\xB0", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
779 
780  k = 11;
781  ATV(test, tracs(__LINE__, "ABC xyz\xC2", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_INVALID));
782  ATV(test, tracs(__LINE__, "ABC xyz\xC2", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
783  ATV(test, tracs(__LINE__, "ABC xyz\xC2", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
784  ATV(test, tracs(__LINE__, "ABC xyz\xC2", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
785  ATV(test, tracs(__LINE__, "ABC xyz\xC2", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
786 
787  k = 12;
788  ATV(test, tracs(__LINE__, "ABC xyz\xEE\x80\x80", "ABC xyz\xEE\x80\x80", k--, AST_UTF8_REPLACE_VALID));
789  ATV(test, tracs(__LINE__, "ABC xyz\xEE\x80\x80", "ABC xyz\xEE\x80\x80", k--, AST_UTF8_REPLACE_VALID));
790  ATV(test, tracs(__LINE__, "ABC xyz\xEE\x80\x80", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
791  ATV(test, tracs(__LINE__, "ABC xyz\xEE\x80\x80", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
792  ATV(test, tracs(__LINE__, "ABC xyz\xEE\x80\x80", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
793  ATV(test, tracs(__LINE__, "ABC xyz\xEE\x80\x80", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
794 
795  k = 11;
796  ATV(test, tracs(__LINE__, "ABC xyz\xED", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_INVALID));
797  ATV(test, tracs(__LINE__, "ABC xyz\xED", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
798  ATV(test, tracs(__LINE__, "ABC xyz\xED", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
799  ATV(test, tracs(__LINE__, "ABC xyz\xED", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
800  ATV(test, tracs(__LINE__, "ABC xyz\xED", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
801 
802  k = 14;
803  ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC xyz\xEF\xBF\xBD\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_INVALID));
804  ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
805  ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
806  ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
807  ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
808  ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
809  ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
810  ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
811  ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC x", k--, AST_UTF8_REPLACE_OVERRUN));
812 
813  k = 14;
814  ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC xyz\xEF\xBF\xBD\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_INVALID));
815  ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
816  ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
817  ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
818  ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
819  ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
820  ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
821  ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
822  ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC x", k--, AST_UTF8_REPLACE_OVERRUN));
823 
824  k = 14;
825  ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC xyz\xEF\xBF\xBD\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_INVALID));
826  ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
827  ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
828  ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
829  ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
830  ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
831  ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
832  ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
833  ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC x", k--, AST_UTF8_REPLACE_OVERRUN));
834 
835  k = 14;
836  ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC xyz\xEF\xBF\xBD\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_INVALID));
837  ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
838  ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
839  ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
840  ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
841  ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
842  ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
843  ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
844  ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC x", k--, AST_UTF8_REPLACE_OVERRUN));
845 
846  k = 13;
847  ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2\xB0", "ABC xyz\xEF\xBF\xBD\xC2\xB0", k--, AST_UTF8_REPLACE_INVALID));
848  ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2\xB0", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
849  ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2\xB0", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
850  ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2\xB0", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
851  ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2\xB0", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
852  ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2\xB0", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
853  ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2\xB0", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
854 
855  return AST_TEST_PASS;
856 }
857 
858 AST_TEST_DEFINE(test_utf8_validator)
859 {
860  struct ast_utf8_validator *validator;
861 
862  switch (cmd) {
863  case TEST_INIT:
864  info->name = "utf8_validator";
865  info->category = "/main/utf8/";
866  info->summary = "Test ast_utf8_validator";
867  info->description =
868  "Tests UTF-8 progressive validator code.";
869  return AST_TEST_NOT_RUN;
870  case TEST_EXECUTE:
871  break;
872  }
873 
874  if (ast_utf8_validator_new(&validator)) {
875  return AST_TEST_FAIL;
876  }
877 
878  ast_test_validate(test, ast_utf8_validator_feed(validator, "Asterisk") == AST_UTF8_VALID);
879  ast_test_validate(test, ast_utf8_validator_feed(validator, "\xc2") == AST_UTF8_UNKNOWN);
880  ast_test_validate(test, ast_utf8_validator_feed(validator, "\xae") == AST_UTF8_VALID);
881  ast_test_validate(test, ast_utf8_validator_feed(validator, "Private") == AST_UTF8_VALID);
882  ast_test_validate(test, ast_utf8_validator_feed(validator, "Branch") == AST_UTF8_VALID);
883  ast_test_validate(test, ast_utf8_validator_feed(validator, "Exchange") == AST_UTF8_VALID);
884  ast_test_validate(test, ast_utf8_validator_feed(validator, "\xe2") == AST_UTF8_UNKNOWN);
885  ast_test_validate(test, ast_utf8_validator_feed(validator, "\x84") == AST_UTF8_UNKNOWN);
886  ast_test_validate(test, ast_utf8_validator_feed(validator, "\xbb") == AST_UTF8_VALID);
887  ast_test_validate(test, ast_utf8_validator_feed(validator, "\xc0\x8a") == AST_UTF8_INVALID);
888  ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
889  ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
890  ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
891 
892  ast_utf8_validator_destroy(validator);
893 
894  return AST_TEST_PASS;
895 }
896 
897 static void test_utf8_shutdown(void)
898 {
899  AST_TEST_UNREGISTER(test_utf8_is_valid);
900  AST_TEST_UNREGISTER(test_utf8_copy_string);
901  AST_TEST_UNREGISTER(test_utf8_validator);
902  AST_TEST_UNREGISTER(test_utf8_replace_invalid_chars);
903 }
904 
905 int ast_utf8_init(void)
906 {
907  AST_TEST_REGISTER(test_utf8_is_valid);
908  AST_TEST_REGISTER(test_utf8_copy_string);
909  AST_TEST_REGISTER(test_utf8_validator);
910  AST_TEST_REGISTER(test_utf8_replace_invalid_chars);
911 
912  ast_register_cleanup(test_utf8_shutdown);
913 
914  return 0;
915 }
916 
917 #else /* !TEST_FRAMEWORK */
918 
/*!
 * \brief No-op initializer used when TEST_FRAMEWORK is not defined.
 *
 * There are no unit tests to register in this configuration, so the
 * function only reports success.
 *
 * \return Always 0.
 */
int ast_utf8_init(void)
{
	/* Nothing to set up without the test framework. */
	return 0;
}
923 
924 #endif
Not enough space to copy entire source.
Definition: utf8.h:93
The consumed sequence is invalid UTF-8.
Definition: utf8.h:138
Source contained at least 1 invalid UTF-8 sequence.
Definition: utf8.h:84
void ast_utf8_validator_destroy(struct ast_utf8_validator *validator)
Destroy a UTF-8 validator.
Definition: utf8.c:363
int ast_utf8_validator_new(struct ast_utf8_validator **validator)
Create a new UTF-8 validator.
Definition: utf8.c:311
Asterisk main include file. File version handling, generic pbx functions.
int ast_utf8_is_validn(const char *src, size_t size)
Check if the first size bytes of a string are valid UTF-8.
Definition: utf8.c:121
void ast_utf8_validator_reset(struct ast_utf8_validator *validator)
Reset the state of a UTF-8 validator.
Definition: utf8.c:358
struct ast_json * ast_json_pack(char const *format,...)
Helper for creating complex JSON values.
Definition: json.c:612
void ast_json_unref(struct ast_json *value)
Decrease refcount on value. If refcount reaches zero, value is freed.
Definition: json.c:73
int ast_utf8_is_valid(const char *src)
Check if a zero-terminated string is valid UTF-8.
Definition: utf8.c:110
Test Framework API.
Source contained fully valid UTF-8.
Definition: utf8.h:76
UTF-8 information and validation functions.
int ast_utf8_init(void)
Register UTF-8 tests.
Definition: utf8.c:919
Utility functions.
enum ast_utf8_validation_result ast_utf8_validator_state(struct ast_utf8_validator *validator)
Get the current UTF-8 validator state.
Definition: utf8.c:324
enum ast_utf8_validation_result ast_utf8_validator_feed(struct ast_utf8_validator *validator, const char *data)
Feed a zero-terminated string into the UTF-8 validator.
Definition: utf8.c:337
Asterisk JSON abstraction layer.
int ast_register_cleanup(void(*func)(void))
Register a function to be executed before Asterisk gracefully exits.
Definition: clicompat.c:19
enum ast_utf8_replace_result ast_utf8_replace_invalid_chars(char *dst, size_t *dst_size, const char *src, size_t src_len)
Copy a string safely replacing any invalid UTF-8 sequences.
Definition: utf8.c:173
#define REPL_SEQ
Definition: utf8.c:169
void ast_utf8_copy_string(char *dst, const char *src, size_t size)
Copy a string safely ensuring valid UTF-8.
Definition: utf8.c:133
The consumed sequence is valid UTF-8.
Definition: utf8.h:130
#define ast_malloc(len)
A wrapper for malloc()
Definition: astmm.h:191
#define ast_alloca(size)
call __builtin_alloca to ensure we get gcc builtin semantics
Definition: astmm.h:288
ast_utf8_validation_result
Definition: utf8.h:123
enum ast_utf8_validation_result ast_utf8_validator_feedn(struct ast_utf8_validator *validator, const char *data, size_t size)
Feed a string into the UTF-8 validator.
Definition: utf8.c:347
ast_utf8_replace_result
Definition: utf8.h:70
#define AST_TEST_DEFINE(hdr)
Definition: test.h:126
Abstract JSON element (object, array, string, int, ...).
The validator is in an intermediate state.
Definition: utf8.h:148