le_utf8.h

Go to the documentation of this file.
1 /**
2  * @page c_utf8 UTF-8 String Handling API
3  *
4  * @ref le_utf8.h "API Reference"
5  *
6  * <HR>
7  *
8  * This module implements safe and easy to use string handling functions for null-terminated strings
9  * with UTF-8 encoding.
10  *
11  * UTF-8 is a variable length character encoding that supports every character in the Unicode
12  * character set. UTF-8 has become the dominant character encoding because it is self synchronizing,
13  * compatible with ASCII, and avoids the endian issues that other encodings face.
14  *
15  * @section utf8_encoding UTF-8 Encoding
16  *
17  * UTF-8 uses between one and four bytes to encode a character as illustrated in the following
18  * table.
19  *
20  * <table>
21  * <tr> <th> Byte 1 </th> <th> Byte 2 </th> <th> Byte 3 </th> <th> Byte 4 </th> </tr>
22  * <tr> <td> 0xxxxxxx </td> <td> </td> <td> </td> <td> </td> </tr>
23  * <tr> <td> 110xxxxx </td> <td> 10xxxxxx </td> <td> </td> <td> </td> </tr>
24  * <tr> <td> 1110xxxx </td> <td> 10xxxxxx </td> <td> 10xxxxxx </td> <td> </td> </tr>
25  * <tr> <td> 11110xxx </td> <td> 10xxxxxx </td> <td> 10xxxxxx </td> <td> 10xxxxxx </td> </tr>
26  * </table>
27  *
28  * Single byte codes are used only for the ASCII values 0 through 127. In this case, UTF-8 has the
29  * same binary value as ASCII, making ASCII text valid UTF-8 encoded Unicode. All ASCII strings are
30  * UTF-8 compatible.
31  *
32  * Character codes larger than 127 have a multi-byte encoding consisting of a leading byte and one
33  * or more continuation bytes.
34  *
35  * The leading byte has two or more high-order 1's followed by a 0 that can be used to determine the
36  * number of bytes in the character without examining the continuation bytes.
37  *
38  * The continuation bytes have '10' in the high-order position.
39  *
40  * Single bytes, leading bytes and continuation bytes can't have the same values. This means that
41  * UTF-8 strings are self-synchronized, allowing the start of a character to be found by backing up
42  * at most three bytes.
43  *
44  * @c le_utf8_EncodeUnicodeCodePoint() encodes any Unicode code point into the sequence of bytes
45  * that represents the UTF-8 encoding of that code point. The function
46  * @c le_utf8_DecodeUnicodeCodePoint() implements the inverse operation: it converts a UTF-8 encoded
47  * character into the corresponding Unicode code point.
48  *
49  * @section utf8_copy Copy and Append
50  *
51  * @c le_utf8_Copy() copies a string to a specified buffer location.
52  *
53  * @c le_utf8_Append() appends a string to the end of another string by copying the source string to
54  * the destination string's buffer starting at the null-terminator of the destination string.
55  *
56  * The @c le_utf8_CopyUpToSubStr() function is like le_utf8_Copy() except it copies only up to, but
57  * not including, a specified string.
58  *
59  * @section utf8_trunc Truncation
60  *
61  * Because UTF-8 is a variable length encoding, the number of characters in a string is not
62  * necessarily the same as the number of bytes in the string. When using functions like le_utf8_Copy()
63  * and le_utf8_Append(), the size of the destination buffer, in bytes, must be provided to avoid
64  * buffer overruns.
65  *
66  * The copied string may be truncated because of limited space in the destination buffer, and in
67  * that case the destination buffer may not be completely filled. This can occur during the copy
68  * process if the last character to copy is more than one byte long and will not fit within the buffer.
69  *
70  * The character is not copied and a null-terminator is added. Even though we have not filled the
71  * destination buffer, we have truncated the copied string. Essentially, functions like
72  * le_utf8_Copy() and le_utf8_Append() only copy complete characters, not partial characters.
73  *
74  * For le_utf8_Copy(), the number of bytes actually copied is returned in the numBytesPtr parameter.
75  * This parameter can be set to NULL if the number of bytes copied is not needed. le_utf8_Append()
76  * and le_utf8_CopyUpToSubStr() work similarly.
77  *
78  * @code
79  * // In this code sample, we need the number of bytes actually copied:
80  * size_t numBytes;
81  *
82  * if (le_utf8_Copy(destStr, srcStr, sizeof(destStr), &numBytes) == LE_OVERFLOW)
83  * {
84  *     LE_WARN("'%s' was truncated when copied. Only %zu bytes were copied.", srcStr, numBytes);
85  * }
86  *
87  * // In this code sample, we don't care about the number of bytes copied:
88  * LE_ASSERT(le_utf8_Copy(destStr, srcStr, sizeof(destStr), NULL) != LE_OVERFLOW);
89  * @endcode
90  *
91  * @section utf8_length String Lengths
92  *
93  * String length may mean either the number of characters in the string or the number of bytes in
94  * the string. These two meanings are often used interchangeably because in ASCII-only encodings
95  * the number of characters in a string is equal to the number of bytes in a string. But this is not
96  * necessarily true with variable length encodings such as UTF-8. Legato provides both a
97  * le_utf8_NumChars() function and a le_utf8_NumBytes() function.
98  *
99  * @c le_utf8_NumBytes() must be used when determining the memory size of a string.
100  * @c le_utf8_NumChars() is useful for counting the number of characters in a string (i.e., for
101  * display purposes).
102  *
103  * @section utf8_charLength Character Lengths
104  *
105  * The function le_utf8_NumBytesInChar() can be used to determine the number of bytes in a character
106  * by looking at its first byte. This is handy when reading a UTF-8 string from an input stream.
107  * When the first byte is read, it can be passed to le_utf8_NumBytesInChar() to determine how many
108  * more bytes need to be read to get the rest of the character.
109  *
110  * @section utf8_format Checking UTF-8 Format
111  *
112  * As can be seen in the @ref utf8_encoding section, UTF-8 strings have a specific byte sequence.
113  * The @c le_utf8_IsFormatCorrect() function can be used to check if a string conforms to UTF-8
114  * encoding. Not all valid UTF-8 characters are valid for a given character set;
115  * le_utf8_IsFormatCorrect() does not check for this.
116  *
117  * @section utf8_parsing String Parsing
118  *
119  * To assist with converting integer values from UTF-8 strings to binary numerical values,
120  * le_utf8_ParseInt() is provided.
121  *
122  * More parsing functions may be added as required in the future.
123  *
124  * <hr>
125  *
126  * Copyright (C) Sierra Wireless Inc.
127 */
128 
129 //--------------------------------------------------------------------------------------------------
130 /** @file le_utf8.h
131  *
132  * Legato @ref c_utf8 include file.
133  *
134  * Copyright (C) Sierra Wireless Inc.
135  *
136  */
137 
138 #ifndef LEGATO_UTF8_INCLUDE_GUARD
139 #define LEGATO_UTF8_INCLUDE_GUARD
140 
141 
//--------------------------------------------------------------------------------------------------
/**
 * Returns the number of characters in string.
 *
 * UTF-8 encoded characters may be larger than 1 byte so the number of characters is not necessarily
 * equal to the number of bytes in the string.
 *
 * @return
 *      - Number of characters in string if successful.
 *      - LE_FORMAT_ERROR if the string is not UTF-8.
 */
//--------------------------------------------------------------------------------------------------
ssize_t le_utf8_NumChars
(
    const char* string  ///< [IN] Pointer to the string.
);
158 
159 
//--------------------------------------------------------------------------------------------------
/**
 * Returns the number of bytes in string (not including the null-terminator).
 *
 * @return
 *      Number of bytes in string (not including the null-terminator).
 */
//--------------------------------------------------------------------------------------------------
size_t le_utf8_NumBytes
(
    const char* string  ///< [IN] The string.
);
172 
173 
//--------------------------------------------------------------------------------------------------
/**
 * Returns the number of bytes in the character that starts with a given byte.
 *
 * @return
 *      Number of bytes in the character, or 0 if the byte provided is not a valid starting byte.
 */
//--------------------------------------------------------------------------------------------------
size_t le_utf8_NumBytesInChar
(
    const char firstByte    ///< [IN] The first byte in the character.
);
186 
187 
//--------------------------------------------------------------------------------------------------
/**
 * Determines whether a given byte is a continuation (not the first byte) of a multi-byte UTF-8
 * character.
 *
 * Only the two high-order bits of the byte are examined: a continuation byte has the bit pattern
 * 10xxxxxx.
 *
 * @return True if a continuation byte or false otherwise.
 */
//--------------------------------------------------------------------------------------------------
static inline bool le_utf8_IsContinuationByte
(
    const char byte     ///< [IN] The byte to check.
)
{
    // Treat the byte as unsigned so the bit test does not depend on the signedness of plain char,
    // then keep only the top two bits and compare them with the '10' continuation marker.
    return ( ((unsigned char)byte & 0xC0) == 0x80 );
}
203 
204 
205 //--------------------------------------------------------------------------------------------------
206 /**
207  * Copies the string in srcStr to the start of destStr and returns the number of bytes copied (not
208  * including the NULL-terminator) in numBytesPtr. Null can be passed into numBytesPtr if the number
209  * of bytes copied is not needed. The srcStr must be in UTF-8 format.
210  *
211  * If the size of srcStr is less than or equal to the destination buffer size then the entire srcStr
212  * will be copied including the null-character. The rest of the destination buffer is not modified.
213  *
214  * If the size of srcStr is larger than the destination buffer then the maximum number of characters
215  * (from srcStr) plus a null-character that will fit in the destination buffer is copied.
216  *
217  * UTF-8 characters may be more than one byte long and this function will only copy whole characters
218  * not partial characters. Therefore, even if srcStr is larger than the destination buffer, the
219  * copied characters may not fill the entire destination buffer because the last character copied
220  * may not align exactly with the end of the destination buffer.
221  *
222  * The destination string will always be Null-terminated, unless destSize is zero.
223  *
224  * If destStr and srcStr overlap the behaviour of this function is undefined.
225  *
226  * @return
227  * - LE_OK if srcStr was completely copied to the destStr.
228  * - LE_OVERFLOW if srcStr was truncated when it was copied to destStr.
229  */
230 //--------------------------------------------------------------------------------------------------
232 (
233  char* destStr, ///< [IN] Destination where the srcStr is to be copied.
234  const char* srcStr, ///< [IN] UTF-8 source string.
235  const size_t destSize, ///< [IN] Size of the destination buffer in bytes.
236  size_t* numBytesPtr ///< [OUT] Number of bytes copied not including the NULL-terminator.
237  /// Parameter can be set to NULL if the number of bytes copied is
238  /// not needed.
239 );
240 
241 
242 //--------------------------------------------------------------------------------------------------
243 /**
244  * Appends srcStr to destStr by copying characters from srcStr to the end of destStr. The srcStr
245  * must be in UTF-8 format. The number of bytes in the resultant destStr (not including the
246  * NULL-terminator) is returned in destStrLenPtr.
247  *
248  * A null-character is always added to the end of destStr after all srcStr characters have been
249  * copied.
250  *
251  * This function will copy as many characters as possible from srcStr to destStr while ensuring that
252  * the resultant string (including the null-character) will fit within the destination buffer.
253  *
254  * UTF-8 characters may be more than one byte long and this function will only copy whole characters
255  * not partial characters.
256  *
257  * The destination string will always be Null-terminated, unless destSize is zero.
258  *
259  * If destStr and srcStr overlap the behaviour of this function is undefined.
260  *
261  * @return
262  * - LE_OK if srcStr was completely copied to the destStr.
263  * - LE_OVERFLOW if srcStr was truncated when it was copied to destStr.
264  */
265 //--------------------------------------------------------------------------------------------------
267 (
268  char* destStr, ///< [IN] Destination string.
269  const char* srcStr, ///< [IN] UTF-8 source string.
270  const size_t destSize, ///< [IN] Size of the destination buffer in bytes.
271  size_t* destStrLenPtr ///< [OUT] Number of bytes in the resultant destination string (not
272  /// including the NULL-terminator). Parameter can be set to
273  /// NULL if the destination string size is not needed.
274 );
275 
276 
277 //--------------------------------------------------------------------------------------------------
278 /**
279  * Copies all characters from the srcStr to destStr up to the first occurrence of subStr. The
280  * subStr is not copied and instead a null-terminator is added to the destStr. The number of bytes
281  * copied (not including the null-terminator) is returned in numBytesPtr.
282  *
283  * The srcStr and subStr must be in null-terminated UTF-8 strings.
284  *
285  * The destination string will always be null-terminated.
286  *
287  * If subStr is not found in the srcStr then this function behaves just like le_utf8_Copy().
288  *
289  * @return
290  * - LE_OK if srcStr was completely copied to the destStr.
291  * - LE_OVERFLOW if srcStr was truncated when it was copied to destStr.
292  */
293 //--------------------------------------------------------------------------------------------------
295 (
296  char* destStr, ///< [IN] Destination where the srcStr is to be copied.
297  const char* srcStr, ///< [IN] UTF-8 source string.
298  const char* subStr, ///< [IN] Sub-string to copy up to.
299  const size_t destSize, ///< [IN] Size of the destination buffer in bytes.
300  size_t* numBytesPtr ///< [OUT] Number of bytes copied not including the NULL-terminator.
301  /// Parameter can be set to NULL if the number of bytes
302  /// copied is not needed.
303 );
304 
305 
//--------------------------------------------------------------------------------------------------
/**
 * Checks to see if the string is indeed a UTF-8 encoded, null-terminated string.
 *
 * @return
 *      true if the format is correct or false otherwise
 */
//--------------------------------------------------------------------------------------------------
bool le_utf8_IsFormatCorrect
(
    const char* string  ///< [IN] The string.
);
318 
319 
320 //--------------------------------------------------------------------------------------------------
321 /**
322  * Parse an integer value from a string.
323  *
324  * @return
325  * - LE_OK = Success.
326  * - LE_FORMAT_ERROR = The argument string was not an integer value.
327  * - LE_OUT_OF_RANGE = Value is too large to be stored in an int variable.
328  **/
329 //--------------------------------------------------------------------------------------------------
331 (
332  int* valuePtr, ///< [OUT] Ptr to where the value will be stored if successful.
333  const char* arg ///< [IN] The string to parse.
334 );
335 
336 
337 //--------------------------------------------------------------------------------------------------
338 /**
339  * Encode a unicode code point as UTF-8 into a buffer.
340  *
341  * @return
342  * - LE_OK on success
343  * - LE_OUT_OF_RANGE if the code point supplied is outside the range of unicode code points
344  * - LE_OVERFLOW if the out buffer is not large enough to store the UTF-8 encoding of the code
345  * point
346  *
347  * @note
348  * Not all code point values are valid unicode. This function does not validate whether the
349  * code point is valid unicode.
350  */
351 //--------------------------------------------------------------------------------------------------
353 (
354  uint32_t codePoint, ///< [IN] Code point to encode as UTF-8
355  char* out, ///< [OUT] Buffer to store the UTF-8 encoded value in.
356  size_t* outSize ///< [IN/OUT] As an input, this value is interpreted as the size of the out
357  /// buffer. As an output, it is updated to hold the size of the UTF-8
358  /// encoded value (in the case of an LE_OK return value) or size that would
359  /// be required to encode the code point (in the case or an LE_OVERFLOW
360  /// return value).
361 );
362 
363 
364 //--------------------------------------------------------------------------------------------------
365 /**
366  * Decode the first unicode code point from the UTF-8 string src.
367  *
368  * @return
369  * - LE_OK on success
370  * - LE_BAD_PARAMETER if byteLength points to 0
371  * - LE_UNDERFLOW if src appears to be the beginning of a UTF-8 character which extends beyond
372  * the end of the string as specified by byteLength.
373  * - LE_FORMAT_ERROR if src is not valid UTF-8 encoded string data.
374  *
375  * @note
376  * Not all code point values are valid unicode. This function does not validate whether the
377  * code point is valid unicode.
378  */
379 //--------------------------------------------------------------------------------------------------
381 (
382  const char* src, ///< [IN] UTF-8 encoded data to extract a code point from.
383  size_t* byteLength, ///< [IN/OUT] As an input parameter, the value pointed to represents the
384  /// number of bytes in src. As an output parameter, the value pointed to
385  /// is the number of bytes from src that were consumed to decode the code
386  /// point (in the case of an LE_OK return value) or the number of bytes
387  /// that would have been consumed had src been long enough (in the case of
388  /// an LE_UNDERFLOW return value).
389  uint32_t* codePoint ///< [OUT] Code point that was decoded from src. This value is only valid
390  /// when the function returns LE_OK.
391 );
392 
393 #endif // LEGATO_UTF8_INCLUDE_GUARD
le_result_t le_utf8_EncodeUnicodeCodePoint(uint32_t codePoint, char *out, size_t *outSize)
size_t le_utf8_NumBytesInChar(const char firstByte)
ssize_t le_utf8_NumChars(const char *string)
le_result_t
Definition: le_basics.h:35
bool le_utf8_IsFormatCorrect(const char *string)
le_result_t le_utf8_Copy(char *destStr, const char *srcStr, const size_t destSize, size_t *numBytesPtr)
le_result_t le_utf8_Append(char *destStr, const char *srcStr, const size_t destSize, size_t *destStrLenPtr)
le_result_t le_utf8_ParseInt(int *valuePtr, const char *arg)
static bool le_utf8_IsContinuationByte(const char byte)
Definition: le_utf8.h:197
size_t le_utf8_NumBytes(const char *string)
le_result_t le_utf8_DecodeUnicodeCodePoint(const char *src, size_t *byteLength, uint32_t *codePoint)
le_result_t le_utf8_CopyUpToSubStr(char *destStr, const char *srcStr, const char *subStr, const size_t destSize, size_t *numBytesPtr)