le_utf8.h - Legato Docs

Go to the documentation of this file.
    1 /**
    2  * @page c_utf8 UTF-8 String Handling API
    3  *
    4  * @ref le_utf8.h "API Reference"
    5  *
    6  * <HR>
    7  *
    8  * This module implements safe and easy to use string handling functions for null-terminated strings
    9  * with UTF-8 encoding.
   10  *
   11  * UTF-8 is a variable length character encoding that supports every character in the Unicode
   12  * character set. UTF-8 has become the dominant character encoding because it is self synchronizing,
   13  * compatible with ASCII, and avoids the endian issues that other encodings face.
   14  *
   15  *  @section utf8_encoding UTF-8 Encoding
   16  *
   17  * UTF-8 uses between one and four bytes to encode a character as illustrated in the following
   18  * table.
   19  *
   20  * <table>
   21  * <tr> <th> Byte 1   </th> <th> Byte 2   </th> <th> Byte 3   </th> <th> Byte 4   </th> </tr>
   22  * <tr> <td> 0xxxxxxx </td> <td>          </td> <td>          </td> <td>          </td> </tr>
   23  * <tr> <td> 110xxxxx </td> <td> 10xxxxxx </td> <td>          </td> <td>          </td> </tr>
   24  * <tr> <td> 1110xxxx </td> <td> 10xxxxxx </td> <td> 10xxxxxx </td> <td>          </td> </tr>
   25  * <tr> <td> 11110xxx </td> <td> 10xxxxxx </td> <td> 10xxxxxx </td> <td> 10xxxxxx </td> </tr>
   26  * </table>
   27  *
   28  * Single byte codes are used only for the ASCII values 0 through 127.  In this case, UTF-8 has the
   29  * same binary value as ASCII, making ASCII text valid UTF-8 encoded Unicode.  All ASCII
   30  * strings are UTF-8 compatible.
   31  *
   32  * Character codes larger than 127 have a multi-byte encoding consisting of a leading byte and one
   33  * or more continuation bytes.
   34  *
   35  * The leading byte has two or more high-order 1's followed by a 0 that can be used to determine
   36  * the number of bytes in the character without examining the continuation bytes.
   37  *
   38  * The continuation bytes have '10' in the high-order position.
   39  *
   40  * Single bytes, leading bytes and continuation bytes can't have the
   41  * same values. This means that UTF-8 strings are self-synchronized, allowing the start of a
   42  * character to be found by backing up at most three bytes.
   43  *
   44  *  @section utf8_copy Copy and Append
   45  *
   46  * @c le_utf8_Copy() copies a string to a specified buffer location.
   47  *
   48  * @c le_utf8_Append() appends a string to the end of another string by copying
   49  * the source string to the destination string's buffer starting at the null-terminator of the
   50  * destination string.
   51  *
   52  * The @c le_utf8_CopyUpToSubStr() function is like le_utf8_Copy() except it copies only up to,
   53  * but not including, a specified string.
   54  *
   55  *  @section utf8_trunc Truncation
   56  *
   57  * Because UTF-8 is a variable length encoding, the number of characters in a string is
   58  * not necessarily the same as the number of bytes in the string.  When using functions like
   59  * le_utf8_Copy() and le_utf8_Append(), the size of the destination buffer, in bytes, must be
   60  * provided to avoid buffer overruns.
   61  *
   62  * When the copied string is truncated because of limited space in the
   63  * destination buffer, the destination buffer may still not be
   64  * completely filled.  This can occur during the copy process if the last character to copy is more
   65  * than one byte long and will not fit within the buffer.
   66  *
   67  * The character is not copied and a null-terminator is added.  Even though the destination
   68  * buffer has not been filled, the copied string has been truncated.  Essentially, functions like
   69  * le_utf8_Copy() and le_utf8_Append() only copy complete characters, not partial characters.
   70  *
   71  * For le_utf8_Copy(), the number of bytes actually copied is returned in the numBytesPtr parameter.
   72  * This parameter can be set to NULL if the number of bytes copied is not needed.  le_utf8_Append()
   73  * and le_utf8_CopyUpToSubStr() work similarly.
   74  *
   75  * @code
   76  * // In this code sample, we need the number of bytes actually copied:
   77  * size_t numBytes;
   78  *
   79  * if (le_utf8_Copy(destStr, srcStr, sizeof(destStr), &numBytes) == LE_OVERFLOW)
   80  * {
   81  *     LE_WARN("'%s' was truncated when copied.  Only %zu bytes were copied.", srcStr, numBytes);
   82  * }
   83  *
   84  * // In this code sample, we don't care about the number of bytes copied:
   85  * LE_ASSERT(le_utf8_Copy(destStr, srcStr, sizeof(destStr), NULL) != LE_OVERFLOW);
   86  * @endcode
   87  *
   88  *  @section utf8_length String Lengths
   89  *
   90  * String length may mean either the number of characters in the string or the number of bytes in
   91  * the string.  These two meanings are often used interchangeably because in ASCII-only encodings
   92  * the number of characters in a string is equal to the number of bytes in a string. But this
   93  * is not necessarily true with variable length encodings such as UTF-8. Legato provides both
   94  * a le_utf8_NumChars() function and a le_utf8_NumBytes() function.
   95  *
   96  * @c le_utf8_NumBytes() must be used when determining the memory size of a string.
   97  * @c le_utf8_NumChars() is useful for counting the number of characters in a string (e.g. for display
   98  * purposes).
   99  *
  100  *  @section utf8_charLength Character Lengths
  101  *
  102  * The function le_utf8_NumBytesInChar() can be used to determine the number of bytes in a character
  103  * by looking at its first byte.  This is handy when reading a UTF-8 string from an input stream.
  104  * When the first byte is read, it can be passed to le_utf8_NumBytesInChar() to determine how many
  105  * more bytes need to be read to get the rest of the character.
  106  *
  107  *  @section utf8_format Checking UTF-8 Format
  108  *
  109  * As can be seen in the @ref utf8_encoding section, UTF-8 strings have a specific
  110  * byte sequence. The @c le_utf8_IsFormatCorrect() function can be used to check if a string conforms
  111  * to UTF-8 encoding. Not all valid UTF-8 characters are valid for a given character set;
  112  *  le_utf8_IsFormatCorrect() does not check for this.
  113  *
  114  *  @section utf8_parsing String Parsing
  115  *
  116  * To assist with converting integer values from UTF-8 strings to binary numerical values,
  117  * le_utf8_ParseInt() is provided.
  118  *
  119  * More parsing functions may be added as required in the future.
  120  *
  121  * <hr>
  122  *
  123  * Copyright (C) Sierra Wireless Inc. Use of this work is subject to license.
  124 */
  125 
  126 //--------------------------------------------------------------------------------------------------
  127 /** @file le_utf8.h
  128  *
  129  * Legato @ref c_utf8 include file.
  130  *
  131  * Copyright (C) Sierra Wireless Inc. Use of this work is subject to license.
  132  *
  133  */
  134 
  135 #ifndef LEGATO_UTF8_INCLUDE_GUARD
  136 #define LEGATO_UTF8_INCLUDE_GUARD
  137 
  138 
  139 //--------------------------------------------------------------------------------------------------
  140 /**
  141  * Returns the number of characters in string.
  142  *
  143  * UTF-8 encoded characters may be larger than 1 byte so the number of characters is not necessarily
  144  * equal to the number of bytes in the string.
  145  *
  146  * @return
  147  *      Number of characters in string if successful.
  148  *      LE_FORMAT_ERROR if the string is not UTF-8.
  149  */
  150 //--------------------------------------------------------------------------------------------------
  151 ssize_t le_utf8_NumChars
  152 (
  153     const char* string      ///< [IN] Pointer to the UTF-8 string whose characters are counted.
  154 );
  155 
  156 
  157 //--------------------------------------------------------------------------------------------------
  158 /**
  159  * Returns the number of bytes in string (not including the null-terminator).
  160  *
  161  * @return
  162  *      Number of bytes in string (not including the null-terminator).
  163  */
  164 //--------------------------------------------------------------------------------------------------
  165 size_t le_utf8_NumBytes
  166 (
  167     const char* string      ///< [IN] The null-terminated string to measure.
  168 );
  169 
  170 
  171 //--------------------------------------------------------------------------------------------------
  172 /**
  173  * Returns the number of bytes in the character that starts with a given byte.
  174  *
  175  * @return
  176  *      Number of bytes in the character, or 0 if the byte provided is not a valid starting byte.
  177  */
  178 //--------------------------------------------------------------------------------------------------
  179 size_t le_utf8_NumBytesInChar
  180 (
  181     const char firstByte    ///< [IN] The first byte of the character to size.
  182 );
  183 
  184 
  185 //--------------------------------------------------------------------------------------------------
  186 /**
  187  * Determines whether a given byte is a continuation (not the first byte) of a multi-byte
  188  * UTF-8 character.
  189  *
  190  * @return  True if a continuation byte.  False otherwise.
  191  */
  192 //--------------------------------------------------------------------------------------------------
  193 static inline bool le_utf8_IsContinuationByte
  194 (
  195     const char byte     ///< [IN] The byte to check.
  196 )
  197 {
  198     return ( (byte & 0xC0) == 0x80 );   // Keep the two high-order bits; a continuation byte has '10' there.
  199 }
  200 
  201 
  202 //--------------------------------------------------------------------------------------------------
  203 /**
  204  * Copies the string in srcStr to the start of destStr and returns the number of bytes
  205  * copied (not including the NULL-terminator) in numBytesPtr.  Null can be passed into numBytesPtr
  206  * if the number of bytes copied is not needed.  The srcStr must be in UTF-8 format.
  207  *
  208  * If the size of srcStr is less than or equal to the destination buffer size then the entire srcStr
  209  * will be copied including the null-character.  The rest of the destination buffer is not modified.
  210  *
  211  * If the size of srcStr is larger than the destination buffer then the maximum number of characters
  212  * (from srcStr) plus a null-character that will fit in the destination buffer is copied.
  213  *
  214  * UTF-8 characters may be more than one byte long and this function will only copy whole characters
  215  * not partial characters. Even if srcStr is larger than the destination buffer, the
  216  * copied characters may not fill the entire destination buffer because the last character copied
  217  * may not align exactly with the end of the destination buffer.
  218  *
  219  * The destination string will always be Null-terminated, unless destSize is zero.
  220  *
  221  * If destStr and srcStr overlap the behaviour of this function is undefined.
  222  *
  223  * @return
  224  *      LE_OK if srcStr was completely copied to the destStr.
  225  *      LE_OVERFLOW if srcStr was truncated when it was copied to destStr.
  226  */
  227 //--------------------------------------------------------------------------------------------------
  228 le_result_t le_utf8_Copy
  229 (
  230     char* destStr,          ///< [OUT] Destination buffer where the srcStr is to be copied.
  231     const char* srcStr,     ///< [IN] UTF-8 source string.
  232     const size_t destSize,  ///< [IN] Size of the destination buffer in bytes.
  233     size_t* numBytesPtr     ///< [OUT] Number of bytes copied not including the NULL-terminator.
  234                             ///        Parameter can be set to NULL if the number of bytes
  235                             ///        copied is not needed.
  236 );
  237 
  238 
  239 //--------------------------------------------------------------------------------------------------
  240 /**
  241  * Appends srcStr to destStr by copying characters from srcStr to the end of destStr.
  242  * The srcStr must be in UTF-8 format.  The number of bytes in the resultant destStr (not including
  243  * the NULL-terminator) is returned in destStrLenPtr.
  244  *
  245  * A null-character is always added to the end of destStr after all srcStr characters have been
  246  * copied.
  247  *
  248  * This function will copy as many characters as possible from srcStr to destStr while ensuring that
  249  * the resultant string (including the null-character) will fit within the destination buffer.
  250  *
  251  * UTF-8 characters may be more than one byte long and this function will only copy whole characters
  252  * not partial characters.
  253  *
  254  * The destination string will always be Null-terminated, unless destSize is zero.
  255  *
  256  * If destStr and srcStr overlap the behaviour of this function is undefined.
  257  *
  258  * @return
  259  *      LE_OK if srcStr was completely copied to the destStr.
  260  *      LE_OVERFLOW if srcStr was truncated when it was copied to destStr.
  261  */
  262 //--------------------------------------------------------------------------------------------------
  263 le_result_t le_utf8_Append
  264 (
  265     char* destStr,          ///< [IN/OUT] Destination string; srcStr is appended to its end.
  266     const char* srcStr,     ///< [IN] UTF-8 source string.
  267     const size_t destSize,  ///< [IN] Size of the destination buffer in bytes.
  268     size_t* destStrLenPtr   ///< [OUT] Number of bytes in the resultant destination string (not
  269                             ///        including the NULL-terminator).  Parameter can be set to
  270                             ///        NULL if the destination string size is not needed.
  271 );
  272 
  273 
  274 //--------------------------------------------------------------------------------------------------
  275 /**
  276  * Copies all characters from the srcStr to destStr up to the first occurrence of
  277  * subStr.  The subStr is not copied and instead a null-terminator is added to the destStr.
  278  * The number of bytes copied (not including the null-terminator) is returned in numBytesPtr.
  279  *
  280  * The srcStr and subStr must be null-terminated UTF-8 strings.
  281  *
  282  * The destination string will always be null-terminated.
  283  *
  284  * If subStr is not found in the srcStr then this function behaves just like le_utf8_Copy().
  285  *
  286  * @return
  287  *      LE_OK if srcStr was completely copied to the destStr.
  288  *      LE_OVERFLOW if srcStr was truncated when it was copied to destStr.
  289  */
  290 //--------------------------------------------------------------------------------------------------
  291 le_result_t le_utf8_CopyUpToSubStr
  292 (
  293     char* destStr,          ///< [OUT] Destination buffer where the srcStr is to be copied.
  294     const char* srcStr,     ///< [IN] UTF-8 source string.
  295     const char* subStr,     ///< [IN] Sub-string to copy up to (it is not copied itself).
  296     const size_t destSize,  ///< [IN] Size of the destination buffer in bytes.
  297     size_t* numBytesPtr     ///< [OUT] Number of bytes copied not including the NULL-terminator.
  298                             ///        Parameter can be set to NULL if the number of bytes
  299                             ///        copied is not needed.
  300 );
  301 
  302 
  303 //--------------------------------------------------------------------------------------------------
  304 /**
  305  * Checks to see if the string is indeed a UTF-8 encoded, null-terminated string.
  306  *
  307  * @return
  308  *      true if the format is correct.
  309  *      false if the format is incorrect.
  310  */
  311 //--------------------------------------------------------------------------------------------------
  312 bool le_utf8_IsFormatCorrect
  313 (
  314     const char* string      ///< [IN] The string to check.
  315 );
  316 
  317 
  318 //--------------------------------------------------------------------------------------------------
  319 /**
  320  * Parse an integer value from a string.
  321  *
  322  * @return
  323  *  - LE_OK = Success.
  324  *  - LE_FORMAT_ERROR = The argument string was not an integer value.
  325  *  - LE_OUT_OF_RANGE = Value is too large to be stored in an int variable.
  326  **/
  327 //--------------------------------------------------------------------------------------------------
  328 le_result_t le_utf8_ParseInt
  329 (
  330     int* valuePtr,  ///< [OUT] Pointer to where the parsed value will be stored if successful.
  331     const char* arg ///< [IN] The string to parse an integer value from.
  332 );
  333 
  334 
  335 #endif  // LEGATO_UTF8_INCLUDE_GUARD
le_utf8_NumBytesInChar
size_t le_utf8_NumBytesInChar(const char firstByte)
le_utf8_NumChars
ssize_t le_utf8_NumChars(const char *string)
le_result_t
le_result_t
Definition: le_basics.h:35
le_utf8_IsFormatCorrect
bool le_utf8_IsFormatCorrect(const char *string)
le_utf8_Copy
le_result_t le_utf8_Copy(char *destStr, const char *srcStr, const size_t destSize, size_t *numBytesPtr)
le_utf8_Append
le_result_t le_utf8_Append(char *destStr, const char *srcStr, const size_t destSize, size_t *destStrLenPtr)
le_utf8_ParseInt
le_result_t le_utf8_ParseInt(int *valuePtr, const char *arg)
le_utf8_IsContinuationByte
static bool le_utf8_IsContinuationByte(const char byte)
Definition: le_utf8.h:194
le_utf8_NumBytes
size_t le_utf8_NumBytes(const char *string)
le_utf8_CopyUpToSubStr
le_result_t le_utf8_CopyUpToSubStr(char *destStr, const char *srcStr, const char *subStr, const size_t destSize, size_t *numBytesPtr)