/*
 * Decoder state that callers in this file compare decode()'s result against
 * to detect a byte that cannot continue or start a valid UTF-8 sequence.
 * In the transition rows of utf8d[] every entry for state 12 is 12 again,
 * so the state persists until the caller assigns a different one.
 */
61 #define UTF8_REJECT 12
/*
 * Lookup table driving the byte-at-a-time UTF-8 state machine.
 *
 * Entries 0..255 (the first eight rows) are indexed by byte value and give
 * that byte's character class, a number from 0 to 11; decode() reads it as
 * utf8d[byte].
 *
 * Entries from index 256 onward are the state transitions; decode() reads
 * utf8d[256 + state + class] to obtain the next state. Each state owns a run
 * of 12 entries, so state values are multiples of 12 (UTF8_REJECT, 12, is
 * the second run).
 *
 * NOTE(review): the values look like the table from Bjoern Hoehrmann's
 * "Flexible and Economical UTF-8 Decoder" -- confirm against the attribution
 * in the full file. The end of the initializer is not part of this listing.
 */
63 static const uint8_t utf8d[] = {
/* Byte value -> character class (32 bytes per row, 0x00 through 0xff). */
66 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
67 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
68 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
69 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
70 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
71 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
72 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
73 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
/* (state + class) -> next state; two 12-entry state runs per row. */
77 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
78 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
79 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
80 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
81 12,36,12,12,12,12,12,12,12,12,12,12,
/*
 * Feed one byte to the UTF-8 state machine and accumulate the code point.
 *
 * state: in/out state; replaced with utf8d[256 + *state + type].
 * codep: in/out accumulator for the code point being assembled.
 * byte:  next input byte; used directly as an index into utf8d[], so it is
 *        expected to be in the range 0..255.
 *
 * NOTE(review): the return statement and closing brace are not part of this
 * listing. A second function named decode (without codep) also appears in
 * this file; presumably one of the two is compiled out by a preprocessor
 * conditional that is not shown -- confirm.
 */
86 static uint32_t
inline decode(uint32_t *
state, uint32_t *codep, uint32_t byte) {
/* Character class of this byte, from the first 256 table entries. */
87 uint32_t type = utf8d[byte];
/*
 * If *state is not UTF8_ACCEPT, shift the accumulator left six bits and
 * append the low six bits of this byte. Otherwise start afresh, keeping
 * only the bits of the byte that survive the (0xff >> type) mask.
 */
89 *codep = (*state != UTF8_ACCEPT) ?
90 (byte & 0x3fu) | (*codep << 6) :
91 (0xff >> type) & (byte);
/* Advance the state machine using the transition part of the table. */
93 *state = utf8d[256 + *state + type];
/*
 * Feed one byte to the UTF-8 state machine, tracking validity only (no code
 * point is assembled).
 *
 * state: in/out state; replaced with utf8d[256 + *state + type].
 * byte:  next input byte; used directly as an index into utf8d[], so it is
 *        expected to be in the range 0..255 (callers in this file cast the
 *        source character to uint8_t before passing it).
 *
 * NOTE(review): the return statement and closing brace are not part of this
 * listing; callers compare the result against UTF8_ACCEPT and UTF8_REJECT,
 * so it presumably returns the new *state -- confirm.
 */
98 static uint32_t
inline decode(uint32_t *state, uint32_t byte) {
/* Character class of this byte, from the first 256 table entries. */
99 uint32_t type = utf8d[byte];
/* Advance the state machine using the transition part of the table. */
100 *state = utf8d[256 + *state + type];
112 uint32_t state = UTF8_ACCEPT;
115 decode(&state, (uint8_t) *src++);
118 return state == UTF8_ACCEPT;
123 uint32_t state = UTF8_ACCEPT;
125 while (size && *src) {
126 decode(&state, (uint8_t) *src++);
130 return state == UTF8_ACCEPT;
135 uint32_t state = UTF8_ACCEPT;
136 char *last_good = dst;
138 ast_assert(size > 0);
140 while (size && *src) {
141 if (decode(&state, (uint8_t) *src) == UTF8_REJECT) {
150 if (size && state == UTF8_ACCEPT) {
/*
 * Bytes copied into the destination in place of an invalid sequence.
 * EF BF BD is the UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER.
 * REPL_SEQ_LEN is the byte count of REPL_SEQ, not counting the string
 * literal's terminating NUL.
 */
169 #define REPL_SEQ "\xEF\xBF\xBD"
170 #define REPL_SEQ_LEN 3
/*
 * Body fragment of the copy-and-replace routine (presumably
 * ast_utf8_replace_invalid_chars(); its signature is not in this listing).
 * Many lines are missing, so the notes below describe only what is visible.
 */
179 uint32_t prev_state = UTF8_ACCEPT;
180 uint32_t curr_state = UTF8_ACCEPT;
188 memset(dst, 0, *dst_size);
193 if (!src || src_len == 0) {
/*
 * NOTE(review): both states are re-initialised here with the literal 0,
 * whereas the declarations use UTF8_ACCEPT; presumably UTF8_ACCEPT is 0 --
 * confirm, and consider using the macro for consistency.
 */
197 for (prev_state = 0, curr_state = 0; src_pos < src_len; prev_state = curr_state, src_pos++) {
200 rc = decode(&curr_state, (uint8_t) src[src_pos]);
/*
 * NOTE(review): if *dst_size is an unsigned type (the cross-reference text
 * at the end of this file lists dst_size as size_t *), "*dst_size - 1"
 * wraps to a huge value when *dst_size is 0 -- confirm callers never pass
 * a zero-sized buffer.
 */
202 if (dst && dst_pos >= *dst_size - 1) {
/* Values above UTF8_REJECT are the states reached partway through a sequence. */
203 if (prev_state > UTF8_REJECT) {
210 dst_pos -= (seq_len - (prev_state / 36));
217 if (rc == UTF8_ACCEPT) {
219 dst[dst_pos] = src[src_pos];
225 if (rc > UTF8_REJECT) {
231 if (prev_state == UTF8_ACCEPT) {
/*
 * Integer division: yields 1 for state 24, 2 for states 36/48/60 and 3 for
 * states 72/84/96. Presumably the number of continuation bytes still
 * expected after the lead byte -- confirm.
 */
236 seq_len = curr_state / 36 + 1;
240 dst[dst_pos] = src[src_pos];
245 if (rc == UTF8_REJECT) {
249 if (prev_state != UTF8_ACCEPT) {
259 dst_pos -= (seq_len - (prev_state / 36));
/*
 * NOTE(review): with an unsigned *dst_size smaller than 4, "*dst_size - 4"
 * wraps and this guard cannot fire, so the memcpy below would not be
 * bounded by it -- confirm the minimum buffer size callers may pass.
 */
269 if (dst_pos > *dst_size - 4) {
273 memcpy(&dst[dst_pos],
REPL_SEQ, REPL_SEQ_LEN);
275 dst_pos += REPL_SEQ_LEN;
/* Restart the state machine after handling the rejected byte. */
277 curr_state = UTF8_ACCEPT;
/* A state other than UTF8_ACCEPT here means the input ended mid-sequence. */
281 if (curr_state != UTF8_ACCEPT) {
287 dst_pos -= (seq_len - (prev_state / 36));
/* NOTE(review): same unsigned wrap concern as the earlier "- 4" guard. */
289 if (dst_pos > *dst_size - 4) {
293 memcpy(&dst[dst_pos],
REPL_SEQ, REPL_SEQ_LEN);
295 dst_pos += REPL_SEQ_LEN;
/* Report the output length plus one (presumably for the terminating NUL). */
301 *dst_size = dst_pos + 1;
319 tmp->state = UTF8_ACCEPT;
327 switch (validator->state) {
341 decode(&validator->state, (uint8_t) *data++);
350 while (size && *data) {
351 decode(&validator->state, (uint8_t) *data++);
360 validator->state = UTF8_ACCEPT;
368 #ifdef TEST_FRAMEWORK
376 info->name =
"is_valid";
377 info->category =
"/main/utf8/";
378 info->summary =
"Test ast_utf8_is_valid and ast_utf8_is_validn";
380 "Tests UTF-8 string validation code.";
381 return AST_TEST_NOT_RUN;
427 return AST_TEST_PASS;
430 static int test_copy_and_compare(
const char *src,
size_t dst_len,
const char *cmp)
434 return strcmp(dst, cmp) == 0;
441 info->name =
"copy_string";
442 info->category =
"/main/utf8/";
443 info->summary =
"Test ast_utf8_copy_string";
445 "Tests UTF-8 string copying code.";
446 return AST_TEST_NOT_RUN;
451 ast_test_validate(
test, test_copy_and_compare(
"Asterisk", 6,
"Aster"));
452 ast_test_validate(
test, test_copy_and_compare(
"Asterisk \xc2\xae", 11,
"Asterisk "));
453 ast_test_validate(
test, test_copy_and_compare(
"Asterisk \xc2\xae", 12,
"Asterisk \xc2\xae"));
454 ast_test_validate(
test, test_copy_and_compare(
"Asterisk \xc0\x8a", 12,
"Asterisk "));
455 ast_test_validate(
test, test_copy_and_compare(
"\xce\xbb xyz", 1,
""));
456 ast_test_validate(
test, test_copy_and_compare(
"\xce\xbb xyz", 2,
""));
457 ast_test_validate(
test, test_copy_and_compare(
"\xce\xbb xyz", 3,
"\xce\xbb"));
458 ast_test_validate(
test, test_copy_and_compare(
"\xce\xbb xyz", 4,
"\xce\xbb "));
459 ast_test_validate(
test, test_copy_and_compare(
"\xce\xbb xyz", 5,
"\xce\xbb x"));
460 ast_test_validate(
test, test_copy_and_compare(
"\xce\xbb xyz", 6,
"\xce\xbb xy"));
461 ast_test_validate(
test, test_copy_and_compare(
"\xce\xbb xyz", 7,
"\xce\xbb xyz"));
463 return AST_TEST_PASS;
/*
 * Sentinel values for the dst_size argument of the tracs() test helper,
 * which compares dst_size against each of them.
 * SIZE_EXPECTED makes the helper use strlen(cmp) + 1 as the destination size.
 * NOTE(review): the branch taken for SIZE_REQUIRED is not part of this
 * listing; presumably the helper first asks the function under test for the
 * size it needs -- confirm.
 */
470 #define SIZE_REQUIRED 0
477 #define SIZE_EXPECTED 0xDead
479 static int tracs(
int run,
const char *src,
const char *cmp,
486 if (dst_size == SIZE_REQUIRED) {
488 }
else if (dst_size == SIZE_EXPECTED) {
489 dst_size = strlen(cmp) + 1;
494 if (result != exp_result || strcmp(dst, cmp) != 0) {
495 ast_log(LOG_ERROR,
"Run: %2d Invalid result. Src: '%s', Dst: '%s', ExpDst: '%s' Result: %d ExpResult: %d\n",
496 run, src, dst, cmp, result, exp_result);
504 "variable",
"doesntmatter",
/* Shorthand for ast_test_validate(t, v), to keep test invocations short. */
511 #define ATV(t, v) ast_test_validate(t, v)
522 info->name =
"replace_invalid";
523 info->category =
"/main/utf8/";
524 info->summary =
"Test ast_utf8_replace_invalid_chars";
526 "Tests UTF-8 string copying/replacing code.";
527 return AST_TEST_NOT_RUN;
563 src =
"ABC\xC2\xB0xyz";
855 return AST_TEST_PASS;
864 info->name =
"utf8_validator";
865 info->category =
"/main/utf8/";
866 info->summary =
"Test ast_utf8_validator";
868 "Tests UTF-8 progressive validator code.";
869 return AST_TEST_NOT_RUN;
875 return AST_TEST_FAIL;
894 return AST_TEST_PASS;
/*
 * Unregister the four UTF-8 unit tests defined in this file.
 * NOTE(review): presumably installed as a cleanup callback (e.g. through
 * ast_register_cleanup()); the registering call and this function's braces
 * are not part of this listing -- confirm.
 */
897 static void test_utf8_shutdown(
void)
899 AST_TEST_UNREGISTER(test_utf8_is_valid);
900 AST_TEST_UNREGISTER(test_utf8_copy_string);
901 AST_TEST_UNREGISTER(test_utf8_validator);
902 AST_TEST_UNREGISTER(test_utf8_replace_invalid_chars);
/*
 * Register the four UTF-8 unit tests, in the same order in which
 * test_utf8_shutdown() unregisters them.
 * NOTE(review): the enclosing function is not part of this listing;
 * presumably ast_utf8_init() -- confirm.
 */
907 AST_TEST_REGISTER(test_utf8_is_valid);
908 AST_TEST_REGISTER(test_utf8_copy_string);
909 AST_TEST_REGISTER(test_utf8_validator);
910 AST_TEST_REGISTER(test_utf8_replace_invalid_chars);
Not enough space to copy entire source.
The consumed sequence is invalid UTF-8.
Source contained at least 1 invalid UTF-8 sequence.
void ast_utf8_validator_destroy(struct ast_utf8_validator *validator)
Destroy a UTF-8 validator.
int ast_utf8_validator_new(struct ast_utf8_validator **validator)
Create a new UTF-8 validator.
Asterisk main include file. File version handling, generic pbx functions.
int ast_utf8_is_validn(const char *src, size_t size)
Check if the first size bytes of a string are valid UTF-8.
void ast_utf8_validator_reset(struct ast_utf8_validator *validator)
Reset the state of a UTF-8 validator.
struct ast_json * ast_json_pack(char const *format,...)
Helper for creating complex JSON values.
void ast_json_unref(struct ast_json *value)
Decrease refcount on value. If refcount reaches zero, value is freed.
int ast_utf8_is_valid(const char *src)
Check if a zero-terminated string is valid UTF-8.
Source contained fully valid UTF-8.
UTF-8 information and validation functions.
int ast_utf8_init(void)
Register UTF-8 tests.
enum ast_utf8_validation_result ast_utf8_validator_state(struct ast_utf8_validator *validator)
Get the current UTF-8 validator state.
enum ast_utf8_validation_result ast_utf8_validator_feed(struct ast_utf8_validator *validator, const char *data)
Feed a zero-terminated string into the UTF-8 validator.
Asterisk JSON abstraction layer.
int ast_register_cleanup(void(*func)(void))
Register a function to be executed before Asterisk gracefully exits.
enum ast_utf8_replace_result ast_utf8_replace_invalid_chars(char *dst, size_t *dst_size, const char *src, size_t src_len)
Copy a string safely replacing any invalid UTF-8 sequences.
void ast_utf8_copy_string(char *dst, const char *src, size_t size)
Copy a string safely ensuring valid UTF-8.
The consumed sequence is valid UTF-8.
#define ast_malloc(len)
A wrapper for malloc()
#define ast_alloca(size)
call __builtin_alloca to ensure we get gcc builtin semantics
ast_utf8_validation_result
enum ast_utf8_validation_result ast_utf8_validator_feedn(struct ast_utf8_validator *validator, const char *data, size_t size)
Feed a string into the UTF-8 validator.
#define AST_TEST_DEFINE(hdr)
Abstract JSON element (object, array, string, int, ...).
The validator is in an intermediate state.