le_utf8.h

Go to the documentation of this file.
1 /**
2  * @page c_utf8 UTF-8 String Handling API
3  *
4  * @ref le_utf8.h "API Reference"
5  *
6  * <HR>
7  *
8  * This module implements safe and easy to use string handling functions for null-terminated strings
9  * with UTF-8 encoding.
10  *
11  * UTF-8 is a variable length character encoding that supports every character in the Unicode
12  * character set. UTF-8 has become the dominant character encoding because it is self synchronizing,
13  * compatible with ASCII, and avoids the endian issues that other encodings face.
14  *
15  * @section utf8_encoding UTF-8 Encoding
16  *
17  * UTF-8 uses between one and four bytes to encode a character as illustrated in the following
18  * table.
19  *
20  * <table>
21  * <tr> <th> Byte 1 </th> <th> Byte 2 </th> <th> Byte 3 </th> <th> Byte 4 </th> </tr>
22  * <tr> <td> 0xxxxxxx </td> <td> </td> <td> </td> <td> </td> </tr>
23  * <tr> <td> 110xxxxx </td> <td> 10xxxxxx </td> <td> </td> <td> </td> </tr>
24  * <tr> <td> 1110xxxx </td> <td> 10xxxxxx </td> <td> 10xxxxxx </td> <td> </td> </tr>
25  * <tr> <td> 11110xxx </td> <td> 10xxxxxx </td> <td> 10xxxxxx </td> <td> 10xxxxxx </td> </tr>
26  * </table>
27  *
28  * Single byte codes are used only for the ASCII values 0 through 127. In this case, UTF-8 has the
29  * same binary value as ASCII, making ASCII text valid UTF-8 encoded Unicode. All ASCII
30  * strings are UTF-8 compatible.
31  *
32  * Character codes larger than 127 have a multi-byte encoding consisting of a leading byte and one
33  * or more continuation bytes.
34  *
35  * The leading byte has two or more high-order 1's followed by a 0 that can be used to determine
36  * the number of bytes in the character without examining the continuation bytes.
37  *
38  * The continuation bytes have '10' in the high-order position.
39  *
40  * Single bytes, leading bytes and continuation bytes can't have the
41  * same values. This means that UTF-8 strings are self-synchronized, allowing the start of a
42  * character to be found by backing up at most three bytes.
43  *
44  * @section utf8_copy Copy and Append
45  *
46  * @c le_utf8_Copy() copies a string to a specified buffer location.
47  *
48  * @c le_utf8_Append() appends a string to the end of another string by copying
49  * the source string to the destination string's buffer starting at the null-terminator of the
50  * destination string.
51  *
52  * The @c le_utf8_CopyUpToSubStr() function is like le_utf8_Copy() except it copies only up to,
53  * but not including, a specified string.
54  *
55  * @section utf8_trunc Truncation
56  *
57  * Because UTF-8 is a variable length encoding, the number of characters in a string is
58  * not necessarily the same as the number of bytes in the string. When using functions like
59  * le_utf8_Copy() and le_utf8_Append(), the size of the destination buffer, in bytes, must be
60  * provided to avoid buffer overruns.
61  *
62  * The copied string may be truncated because of limited space in the
63  * destination buffer, and the destination buffer may not be
64  * completely filled. This can occur during the copy process if the last character to copy is more
65  * than one byte long and will not fit within the buffer.
66  *
67  * In that case the character is not copied and a null-terminator is added.
68  * Even though we have not filled the destination buffer, we have truncated the copied string. Essentially, functions like
69  * le_utf8_Copy() and le_utf8_Append() only copy complete characters, not partial characters.
70  *
71  * For le_utf8_Copy(), the number of bytes actually copied is returned in the numBytesPtr parameter.
72  * This parameter can be set to NULL if the number of bytes copied is not needed. le_utf8_Append()
73  * and le_utf8_CopyUpToSubStr() work similarly.
74  *
75  * @code
76  * // In this code sample, we need the number of bytes actually copied:
77  * size_t numBytes;
78  *
79  * if (le_utf8_Copy(destStr, srcStr, sizeof(destStr), &numBytes) == LE_OVERFLOW)
80  * {
81  * LE_WARN("'%s' was truncated when copied. Only %zu bytes were copied.", srcStr, numBytes);
82  * }
83  *
84  * // In this code sample, we don't care about the number of bytes copied:
85  * LE_ASSERT(le_utf8_Copy(destStr, srcStr, sizeof(destStr), NULL) != LE_OVERFLOW);
86  * @endcode
87  *
88  * @section utf8_length String Lengths
89  *
90  * String length may mean either the number of characters in the string or the number of bytes in
91  * the string. These two meanings are often used interchangeably because in ASCII-only encodings
92  * the number of characters in a string is equal to the number of bytes in a string. But this
93  * is not necessarily true with variable length encodings such as UTF-8. Legato provides both
94  * a le_utf8_NumChars() function and a le_utf8_NumBytes() function.
95  *
96  * @c le_utf8_NumBytes() must be used when determining the memory size of a string.
97  * @c le_utf8_NumChars() is useful for counting the number of characters in a string (e.g., for display
98  * purposes).
99  *
100  * @section utf8_charLength Character Lengths
101  *
102  * The function le_utf8_NumBytesInChar() can be used to determine the number of bytes in a character
103  * by looking at its first byte. This is handy when reading a UTF-8 string from an input stream.
104  * When the first byte is read, it can be passed to le_utf8_NumBytesInChar() to determine how many
105  * more bytes need to be read to get the rest of the character.
106  *
107  * @section utf8_format Checking UTF-8 Format
108  *
109  * As can be seen in the @ref utf8_encoding section, UTF-8 strings have a specific
110  * byte sequence. The @c le_utf8_IsFormatCorrect() function can be used to check if a string conforms
111  * to UTF-8 encoding. Not all valid UTF-8 characters are valid for a given character set;
112  * le_utf8_IsFormatCorrect() does not check for this.
113  *
114  * @section utf8_parsing String Parsing
115  *
116  * To assist with converting integer values from UTF-8 strings to binary numerical values,
117  * le_utf8_ParseInt() is provided.
118  *
119  * More parsing functions may be added as required in the future.
120  *
121  * <hr>
122  *
123  * Copyright (C) Sierra Wireless Inc. Use of this work is subject to license.
124 */
125 
126 //--------------------------------------------------------------------------------------------------
127 /** @file le_utf8.h
128  *
129  * Legato @ref c_utf8 include file.
130  *
131  * Copyright (C) Sierra Wireless Inc. Use of this work is subject to license.
132  *
133  */
134 
135 #ifndef LEGATO_UTF8_INCLUDE_GUARD
136 #define LEGATO_UTF8_INCLUDE_GUARD
137 
138 
139 //--------------------------------------------------------------------------------------------------
140 /**
141  * Returns the number of characters in string.
142  *
143  * UTF-8 encoded characters may be larger than 1 byte so the number of characters is not necessarily
144  * equal to the number of bytes in the string.
145  *
146  * @return
147  * Number of characters in string if successful.
148  * LE_FORMAT_ERROR if the string is not UTF-8.
149  */
150 //--------------------------------------------------------------------------------------------------
151 ssize_t le_utf8_NumChars
152 (
153  const char* string ///< [IN] Pointer to the string.
154 );
155 
156 
157 //--------------------------------------------------------------------------------------------------
158 /**
159  * Returns the number of bytes in string (not including the null-terminator).
160  *
161  * @return
162  * Number of bytes in string (not including the null-terminator).
163  */
164 //--------------------------------------------------------------------------------------------------
165 size_t le_utf8_NumBytes
166 (
167  const char* string ///< [IN] The string.
168 );
169 
170 
171 //--------------------------------------------------------------------------------------------------
172 /**
173  * Returns the number of bytes in the character that starts with a given byte.
174  *
175  * @return
176  * Number of bytes in the character, or 0 if the byte provided is not a valid starting byte.
177  */
178 //--------------------------------------------------------------------------------------------------
size_t le_utf8_NumBytesInChar
(
    const char firstByte    ///< [IN] The first byte in the character.
);
183 
184 
185 //--------------------------------------------------------------------------------------------------
186 /**
187  * Determines whether a given byte is a continuation (not the first byte) of a multi-byte
188  * UTF-8 character.
189  *
190  * @return True if a continuation byte. False otherwise.
191  */
192 //--------------------------------------------------------------------------------------------------
static inline bool le_utf8_IsContinuationByte
(
    const char byte     ///< [IN] The byte to check.
)
{
    // A continuation byte has the bit pattern 10xxxxxx: isolate the two
    // high-order bits and compare them against binary 10.
    const int topTwoBits = byte & 0xC0;

    if (topTwoBits == 0x80)
    {
        return true;
    }

    return false;
}
200 
201 
202 //--------------------------------------------------------------------------------------------------
203 /**
204  * Copies the string in srcStr to the start of destStr and returns the number of bytes
205  * copied (not including the NULL-terminator) in numBytesPtr. Null can be passed into numBytesPtr
206  * if the number of bytes copied is not needed. The srcStr must be in UTF-8 format.
207  *
208  * If the size of srcStr is less than or equal to the destination buffer size then the entire srcStr
209  * will be copied including the null-character. The rest of the destination buffer is not modified.
210  *
211  * If the size of srcStr is larger than the destination buffer then the maximum number of characters
212  * (from srcStr) plus a null-character that will fit in the destination buffer is copied.
213  *
214  * UTF-8 characters may be more than one byte long and this function will only copy whole characters
215  * not partial characters. Even if srcStr is larger than the destination buffer, the
216  * copied characters may not fill the entire destination buffer because the last character copied
217  * may not align exactly with the end of the destination buffer.
218  *
219  * The destination string will always be Null-terminated, unless destSize is zero.
220  *
221  * If destStr and srcStr overlap the behaviour of this function is undefined.
222  *
223  * @return
224  * LE_OK if srcStr was completely copied to the destStr.
225  * LE_OVERFLOW if srcStr was truncated when it was copied to destStr.
226  */
227 //--------------------------------------------------------------------------------------------------
229 (
230  char* destStr, ///< [IN] Destination where the srcStr is to be copied.
231  const char* srcStr, ///< [IN] UTF-8 source string.
232  const size_t destSize, ///< [IN] Size of the destination buffer in bytes.
233  size_t* numBytesPtr ///< [OUT] Number of bytes copied not including the NULL-terminator.
234  /// [Parameter can be set to NULL if the number of bytes
235  /// copied is not needed.
236 );
237 
238 
239 //--------------------------------------------------------------------------------------------------
240 /**
241  * Appends srcStr to destStr by copying characters from srcStr to the end of destStr.
242  * The srcStr must be in UTF-8 format. The number of bytes in the resultant destStr (not including
243  * the NULL-terminator) is returned in destStrLenPtr.
244  *
245  * A null-character is always added to the end of destStr after all srcStr characters have been
246  * copied.
247  *
248  * This function will copy as many characters as possible from srcStr to destStr while ensuring that
249  * the resultant string (including the null-character) will fit within the destination buffer.
250  *
251  * UTF-8 characters may be more than one byte long and this function will only copy whole characters
252  * not partial characters.
253  *
254  * The destination string will always be Null-terminated, unless destSize is zero.
255  *
256  * If destStr and srcStr overlap the behaviour of this function is undefined.
257  *
258  * @return
259  * LE_OK if srcStr was completely copied to the destStr.
260  * LE_OVERFLOW if srcStr was truncated when it was copied to destStr.
261  */
262 //--------------------------------------------------------------------------------------------------
264 (
265  char* destStr, ///< [IN] Destination string.
266  const char* srcStr, ///< [IN] UTF-8 source string.
267  const size_t destSize, ///< [IN] Size of the destination buffer in bytes.
268  size_t* destStrLenPtr ///< [OUT] Number of bytes in the resultant destination string (not
269  /// including the NULL-terminator). Parameter can be set to
270  /// NULL if the destination string size is not needed.
271 );
272 
273 
274 //--------------------------------------------------------------------------------------------------
275 /**
276  * Copies all characters from the srcStr to destStr up to the first occurrence of
277  * subStr. The subStr is not copied and instead a null-terminator is added to the destStr.
278  * The number of bytes copied (not including the null-terminator) is returned in numBytesPtr.
279  *
280  * The srcStr and subStr must be null-terminated UTF-8 strings.
281  *
282  * The destination string will always be null-terminated.
283  *
284  * If subStr is not found in the srcStr then this function behaves just like le_utf8_Copy().
285  *
286  * @return
287  * LE_OK if srcStr was completely copied to the destStr.
288  * LE_OVERFLOW if srcStr was truncated when it was copied to destStr.
289  */
290 //--------------------------------------------------------------------------------------------------
292 (
293  char* destStr, ///< [IN] Destination where the srcStr is to be copied.
294  const char* srcStr, ///< [IN] UTF-8 source string.
295  const char* subStr, ///< [IN] Sub-string to copy up to.
296  const size_t destSize, ///< [IN] Size of the destination buffer in bytes.
297  size_t* numBytesPtr ///< [OUT] Number of bytes copied not including the NULL-terminator.
298  /// Parameter can be set to NULL if the number of bytes
299  /// copied is not needed.
300 );
301 
302 
303 //--------------------------------------------------------------------------------------------------
304 /**
305  * Checks to see if the string is indeed a UTF-8 encoded, null-terminated string.
306  *
307  * @return
308  * true if the format is correct.
309  * false if the format is incorrect.
310  */
311 //--------------------------------------------------------------------------------------------------
bool le_utf8_IsFormatCorrect
(
    const char* string      ///< [IN] The string.
);
316 
317 
318 //--------------------------------------------------------------------------------------------------
319 /**
320  * Parse an integer value from a string.
321  *
322  * @return
323  * - LE_OK = Success.
324  * - LE_FORMAT_ERROR = The argument string was not an integer value.
325  * - LE_OUT_OF_RANGE = Value is too large to be stored in an int variable.
326  **/
327 //--------------------------------------------------------------------------------------------------
329 (
330  int* valuePtr, ///< [OUT] Ptr to where the value will be stored if successful.
331  const char* arg ///< [IN] The string to parse.
332 );
333 
334 
335 #endif // LEGATO_UTF8_INCLUDE_GUARD
size_t le_utf8_NumBytesInChar(const char firstByte)
ssize_t le_utf8_NumChars(const char *string)
le_result_t
Definition: le_basics.h:35
bool le_utf8_IsFormatCorrect(const char *string)
le_result_t le_utf8_Copy(char *destStr, const char *srcStr, const size_t destSize, size_t *numBytesPtr)
le_result_t le_utf8_Append(char *destStr, const char *srcStr, const size_t destSize, size_t *destStrLenPtr)
le_result_t le_utf8_ParseInt(int *valuePtr, const char *arg)
static bool le_utf8_IsContinuationByte(const char byte)
Definition: le_utf8.h:194
size_t le_utf8_NumBytes(const char *string)
le_result_t le_utf8_CopyUpToSubStr(char *destStr, const char *srcStr, const char *subStr, const size_t destSize, size_t *numBytesPtr)