Asterisk - The Open Source Telephony Project  21.4.1
utf8.h
Go to the documentation of this file.
1 /*
2  * Asterisk -- An open source telephony toolkit.
3  *
4  * Copyright (C) 2020, Sean Bright
5  *
6  * Sean Bright <sean.bright@gmail.com>
7  *
8  * See http://www.asterisk.org for more information about
9  * the Asterisk project. Please do not directly contact
10  * any of the maintainers of this project for assistance;
11  * the project provides a web site, mailing lists and IRC
12  * channels for your use.
13  *
14  * This program is free software, distributed under the terms of
15  * the GNU General Public License Version 2. See the LICENSE file
16  * at the top of the source tree.
17  */
18 
19 /*! \file
20  *
21  * \brief UTF-8 information and validation functions
22  */
23 
24 #ifndef ASTERISK_UTF8_H
25 #define ASTERISK_UTF8_H
26 
27 /*!
28  * \brief Check if a zero-terminated string is valid UTF-8
29  * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
30  *
31  * \param str The zero-terminated string to check
32  *
33  * \retval 0 if the string is not valid UTF-8
34  * \retval Non-zero if the string is valid UTF-8
35  */
36 int ast_utf8_is_valid(const char *str);
37 
38 /*!
39  * \brief Check if the first \a size bytes of a string are valid UTF-8
40  * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
41  *
42  * Similar to \a ast_utf8_is_valid() but checks the first \a size bytes or until
43  * a zero byte is reached, whichever comes first.
44  *
45  * \param str The string to check
46  * \param size The number of bytes to evaluate
47  *
48  * \retval 0 if the string is not valid UTF-8
49  * \retval Non-zero if the string is valid UTF-8
50  */
51 int ast_utf8_is_validn(const char *str, size_t size);
52 
53 /*!
54  * \brief Copy a string safely ensuring valid UTF-8
55  * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
56  *
57  * This is similar to \ref ast_copy_string, but it will only copy valid UTF-8
58  * sequences from the source string into the destination buffer. If an invalid
59  * UTF-8 sequence is encountered, or the available space in the destination
60  * buffer is exhausted in the middle of an otherwise valid UTF-8 sequence, the
61  * destination buffer will be truncated to ensure that it only contains valid
62  * UTF-8.
63  *
64  * \param dst The destination buffer.
65  * \param src The source string
66  * \param size The size of the destination buffer
67  */
68 void ast_utf8_copy_string(char *dst, const char *src, size_t size);
69 
70 enum ast_utf8_replace_result {
71  /*! \brief Source contained fully valid UTF-8
72  *
73  * The entire string was valid UTF-8 and no replacement
74  * was required.
75  */
77 
78  /*! \brief Source contained at least 1 invalid UTF-8 sequence
79  *
80  * Parts of the string contained invalid UTF-8 sequences
81  * but those were successfully replaced with the U+FFFD
82  * replacement sequence.
83  */
85 
86  /*! \brief Not enough space to copy entire source
87  *
88  * The destination buffer wasn't large enough to copy
89  * all of the source characters. As many of the source
90  * characters as could be copied/replaced were processed,
91  * and a final NUL terminator was added.
92  */
94 };
95 
96 /*!
97  * \brief Copy a string safely replacing any invalid UTF-8 sequences
98  *
99  * This is similar to \ref ast_copy_string, but it will only copy valid UTF-8
100  * sequences from the source string into the destination buffer.
101  * If an invalid sequence is encountered, it's replaced with the U+FFFD
102  * sequence which is the valid UTF-8 sequence that represents an unknown,
103  * unrecognized, or unrepresentable character. Since U+FFFD is actually a
104  * 3-byte sequence, the destination buffer will need to be larger than
105  * the corresponding source string if it contains invalid sequences.
106  * You can pass NULL as the destination buffer pointer to get the actual
107  * size required, then call the function again with the properly sized
108  * buffer.
109  *
110  * \param dst Pointer to the destination buffer. If NULL,
111  * dst_size will be set to the size of the
112  * buffer required to fully process the
113  * source string.
114  * \param dst_size A pointer to the size of the dst buffer
115  * \param src The source string
116  * \param src_len The number of bytes to copy
117  *
118  * \return \ref ast_utf8_replace_result
119  */
120 enum ast_utf8_replace_result ast_utf8_replace_invalid_chars(char *dst,
121  size_t *dst_size, const char *src, size_t src_len);
122 
123 enum ast_utf8_validation_result {
124  /*! \brief The consumed sequence is valid UTF-8
125  *
126  * The bytes consumed thus far by the validator represent a valid sequence of
127  * UTF-8 bytes. If additional bytes are fed into the validator, it can
128  * transition into either \a AST_UTF8_INVALID or \a AST_UTF8_UNKNOWN
129  */
130  AST_UTF8_VALID,
131 
132  /*! \brief The consumed sequence is invalid UTF-8
133  *
134  * The bytes consumed thus far by the validator represent an invalid sequence
135  * of UTF-8 bytes. Feeding additional bytes into the validator will not
136  * change its state.
137  */
138  AST_UTF8_INVALID,
139 
140  /*! \brief The validator is in an intermediate state
141  *
142  * The validator is in the process of validating a multibyte UTF-8 sequence
143  * and requires additional data to be fed into it to determine validity. If
144  * additional bytes are fed into the validator, it can transition into either
145  * \a AST_UTF8_VALID or \a AST_UTF8_INVALID. If you have no additional data
146  * to feed into the validator the UTF-8 sequence is invalid.
147  */
148  AST_UTF8_UNKNOWN,
149 };
150 
151 /*!
152  * \brief Opaque type for UTF-8 validator state.
153  * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
154  */
155 struct ast_utf8_validator;
156 
157 /*!
158  * \brief Create a new UTF-8 validator
159  * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
160  *
161  * \param[out] validator The validator instance
162  *
163  * \retval 0 on success
164  * \retval -1 on failure
165  */
166 int ast_utf8_validator_new(struct ast_utf8_validator **validator);
167 
168 /*!
169  * \brief Feed a zero-terminated string into the UTF-8 validator
170  * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
171  *
172  * \param validator The validator instance
173  * \param data The zero-terminated string to feed into the validator
174  *
175  * \return The \ref ast_utf8_validation_result indicating the current state of
176  * the validator.
177  */
178 enum ast_utf8_validation_result ast_utf8_validator_feed(
179  struct ast_utf8_validator *validator, const char *data);
180 
181 /*!
182  * \brief Feed a string into the UTF-8 validator
183  * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
184  *
185  * Similar to \a ast_utf8_validator_feed but will stop feeding in data if a zero
186  * byte is encountered or \a size bytes have been read.
187  *
188  * \param validator The validator instance
189  * \param data The string to feed into the validator
190  * \param size The number of bytes to feed into the validator
191  *
192  * \return The \ref ast_utf8_validation_result indicating the current state of
193  * the validator.
194  */
195 enum ast_utf8_validation_result ast_utf8_validator_feedn(
196  struct ast_utf8_validator *validator, const char *data, size_t size);
197 
198 /*!
199  * \brief Get the current UTF-8 validator state
200  * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
201  *
202  * \param validator The validator instance
203  *
204  * \return The \ref ast_utf8_validation_result indicating the current state of
205  * the validator.
206  */
207 enum ast_utf8_validation_result ast_utf8_validator_state(
208  struct ast_utf8_validator *validator);
209 
210 /*!
211  * \brief Reset the state of a UTF-8 validator
212  * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
213  *
214  * Resets the provided UTF-8 validator to its initial state so that it can be
215  * reused.
216  *
217  * \param validator The validator instance to reset
218  */
219 void ast_utf8_validator_reset(
220  struct ast_utf8_validator *validator);
221 
222 /*!
223  * \brief Destroy a UTF-8 validator
224  * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
225  *
226  * \param validator The validator instance to destroy
227  */
228 void ast_utf8_validator_destroy(struct ast_utf8_validator *validator);
229 
230 /*!
231  * \brief Register UTF-8 tests
232  * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
233  *
234  * Does nothing unless TEST_FRAMEWORK is defined.
235  *
236  * \retval 0 Always
237  */
238 int ast_utf8_init(void);
239 
240 #endif /* ASTERISK_UTF8_H */
Not enough space to copy entire source.
Definition: utf8.h:93
enum ast_utf8_validation_result ast_utf8_validator_feed(struct ast_utf8_validator *validator, const char *data)
Feed a zero-terminated string into the UTF-8 validator.
Definition: utf8.c:337
The consumed sequence is invalid UTF-8.
Definition: utf8.h:138
Source contained at least 1 invalid UTF-8 sequence.
Definition: utf8.h:84
void ast_utf8_validator_destroy(struct ast_utf8_validator *validator)
Destroy a UTF-8 validator.
Definition: utf8.c:363
enum ast_utf8_validation_result ast_utf8_validator_state(struct ast_utf8_validator *validator)
Get the current UTF-8 validator state.
Definition: utf8.c:324
Source contained fully valid UTF-8.
Definition: utf8.h:76
int ast_utf8_is_valid(const char *str)
Check if a zero-terminated string is valid UTF-8.
Definition: utf8.c:110
int ast_utf8_validator_new(struct ast_utf8_validator **validator)
Create a new UTF-8 validator.
Definition: utf8.c:311
int ast_utf8_init(void)
Register UTF-8 tests.
Definition: utf8.c:919
void ast_utf8_validator_reset(struct ast_utf8_validator *validator)
Reset the state of a UTF-8 validator.
Definition: utf8.c:358
The consumed sequence is valid UTF-8.
Definition: utf8.h:130
ast_utf8_validation_result
Definition: utf8.h:123
int ast_utf8_is_validn(const char *str, size_t size)
Check if the first size bytes of a string are valid UTF-8.
Definition: utf8.c:121
ast_utf8_replace_result
Definition: utf8.h:70
The validator is in an intermediate state.
Definition: utf8.h:148
enum ast_utf8_validation_result ast_utf8_validator_feedn(struct ast_utf8_validator *validator, const char *data, size_t size)
Feed a string into the UTF-8 validator.
Definition: utf8.c:347
enum ast_utf8_replace_result ast_utf8_replace_invalid_chars(char *dst, size_t *dst_size, const char *src, size_t src_len)
Copy a string safely replacing any invalid UTF-8 sequences.
Definition: utf8.c:173
void ast_utf8_copy_string(char *dst, const char *src, size_t size)
Copy a string safely ensuring valid UTF-8.
Definition: utf8.c:133