le_utf8.h - Legato Docs

Go to the documentation of this file.
    1 /**
    2  * @page c_utf8 UTF-8 String Handling API
    3  *
    4  * @subpage le_utf8.h "API Reference"
    5  *
    6  * <HR>
    7  *
    8  * This module implements safe and easy to use string handling functions for null-terminated strings
    9  * with UTF-8 encoding.
   10  *
   11  * UTF-8 is a variable length character encoding that supports every character in the Unicode
   12  * character set. UTF-8 has become the dominant character encoding because it is self synchronizing,
   13  * compatible with ASCII, and avoids the endian issues that other encodings face.
   14  *
   15  *  @section utf8_encoding UTF-8 Encoding
   16  *
   17  * UTF-8 uses between one and four bytes to encode a character as illustrated in the following
   18  * table.
   19  *
   20  * <table>
   21  * <tr> <th> Byte 1   </th> <th> Byte 2   </th> <th> Byte 3   </th> <th> Byte 4   </th> </tr>
   22  * <tr> <td> 0xxxxxxx </td> <td>          </td> <td>          </td> <td>          </td> </tr>
   23  * <tr> <td> 110xxxxx </td> <td> 10xxxxxx </td> <td>          </td> <td>          </td> </tr>
   24  * <tr> <td> 1110xxxx </td> <td> 10xxxxxx </td> <td> 10xxxxxx </td> <td>          </td> </tr>
   25  * <tr> <td> 11110xxx </td> <td> 10xxxxxx </td> <td> 10xxxxxx </td> <td> 10xxxxxx </td> </tr>
   26  * </table>
   27  *
   28  * Single byte codes are used only for the ASCII values 0 through 127.  In this case, UTF-8 has the
   29  * same binary value as ASCII, making ASCII text valid UTF-8 encoded Unicode.  All ASCII strings are
   30  * UTF-8 compatible.
   31  *
   32  * Character codes larger than 127 have a multi-byte encoding consisting of a leading byte and one
   33  * or more continuation bytes.
   34  *
   35  * The leading byte has two or more high-order 1's followed by a 0 that can be used to determine the
   36  * number of bytes in the character without examining the continuation bytes.
   37  *
   38  * The continuation bytes have '10' in the high-order position.
   39  *
   40  * Single bytes, leading bytes and continuation bytes can't have the same values. This means that
   41  * UTF-8 strings are self-synchronized, allowing the start of a character to be found by backing up
   42  * at most three bytes.
   43  *
   44  * @c le_utf8_EncodeUnicodeCodePoint() encodes any Unicode code point into the sequence of bytes
   45  * that represents the UTF-8 encoding of that code point.  The function
   46  * @c le_utf8_DecodeUnicodeCodePoint() implements the inverse function.  It converts a UTF-8 encoded
   47  * character into the corresponding Unicode code point.
   48  *
   49  *  @section utf8_copy Copy and Append
   50  *
   51  * @c le_utf8_Copy() copies a string to a specified buffer location.
   52  *
   53  * @c le_utf8_Append() appends a string to the end of another string by copying the source string to
   54  * the destination string's buffer starting at the null-terminator of the destination string.
   55  *
   56  * The @c le_utf8_CopyUpToSubStr() function is like le_utf8_Copy() except it copies only up to, but
   57  * not including, a specified string.
   58  *
   59  *  @section utf8_trunc Truncation
   60  *
   61  * Because UTF-8 is a variable length encoding, the number of characters in a string is not
   62  * necessarily the same as the number of bytes in the string.  When using functions like le_utf8_Copy()
   63  * and le_utf8_Append(), the size of the destination buffer, in bytes, must be provided to avoid
   64  * buffer overruns.
   65  *
   66  * The copied string may be truncated because of limited space in the destination buffer, and even
   67  * then the destination buffer may not be completely filled.  This can occur during the copy if the
   68  * last character to copy is more than one byte long and will not fit within the buffer.
   69  *
   70  * The character is not copied and a null-terminator is added.  Even though we have not filled the
   71  * destination buffer, we have truncated the copied string.  Essentially, functions like
   72  * le_utf8_Copy() and le_utf8_Append() only copy complete characters, not partial characters.
   73  *
   74  * For le_utf8_Copy(), the number of bytes actually copied is returned in the numBytesPtr parameter.
   75  * This parameter can be set to NULL if the number of bytes copied is not needed.  le_utf8_Append()
   76  * and le_utf8_CopyUpToSubStr() work similarly.
   77  *
   78  * @code
   79  * // In this code sample, we need the number of bytes actually copied:
   80  * size_t numBytes;
   81  *
   82  * if (le_utf8_Copy(destStr, srcStr, sizeof(destStr), &numBytes) == LE_OVERFLOW)
   83  * {
   84  *     LE_WARN("'%s' was truncated when copied.  Only %zu bytes were copied.", srcStr, numBytes);
   85  * }
   86  *
   87  * // In this code sample, we don't care about the number of bytes copied:
   88  * LE_ASSERT(le_utf8_Copy(destStr, srcStr, sizeof(destStr), NULL) != LE_OVERFLOW);
   89  * @endcode
   90  *
   91  *  @section utf8_stringLength String Lengths
   92  *
   93  * String length may mean either the number of characters in the string or the number of bytes in
   94  * the string.  These two meanings are often used interchangeably because in ASCII-only encodings
   95  * the number of characters in a string is equal to the number of bytes in a string. But this is not
   96  * necessarily true with variable length encodings such as UTF-8. Legato provides both a
   97  * le_utf8_NumChars() function and a le_utf8_NumBytes() function.
   98  *
   99  * @c le_utf8_NumBytes() must be used when determining the memory size of a string.
  100  * @c le_utf8_NumChars() is useful for counting the number of characters in a string (i.e., for
  101  * display purposes).
  102  *
  103  *  @section utf8_charLength Character Lengths
  104  *
  105  * The function le_utf8_NumBytesInChar() can be used to determine the number of bytes in a character
  106  * by looking at its first byte.  This is handy when reading a UTF-8 string from an input stream.
  107  * When the first byte is read, it can be passed to le_utf8_NumBytesInChar() to determine how many
  108  * more bytes need to be read to get the rest of the character.
  109  *
  110  *  @section utf8_format Checking UTF-8 Format
  111  *
  112  * As can be seen in the @ref utf8_encoding section, UTF-8 strings have a specific byte sequence.
  113  * The @c le_utf8_IsFormatCorrect() function can be used to check if a string conforms to UTF-8
  114  * encoding.  Not all valid UTF-8 characters are valid for a given character set;
  115  * le_utf8_IsFormatCorrect() does not check for this.
  116  *
  117  *  @section utf8_parsing String Parsing
  118  *
  119  * To assist with converting integer values from UTF-8 strings to binary numerical values,
  120  * le_utf8_ParseInt() is provided.
  121  *
  122  * More parsing functions may be added as required in the future.
  123  *
  124  * <hr>
  125  *
  126  * Copyright (C) Sierra Wireless Inc.
  127 */
  128 
  129 //--------------------------------------------------------------------------------------------------
  130 /** @file le_utf8.h
  131  *
  132  * Legato @ref c_utf8 include file.
  133  *
  134  * Copyright (C) Sierra Wireless Inc.
  135  *
  136  */
  137 
  138 #ifndef LEGATO_UTF8_INCLUDE_GUARD
  139 #define LEGATO_UTF8_INCLUDE_GUARD
  140 
  141 
  142 //--------------------------------------------------------------------------------------------------
  143 /**
  144  * Returns the number of characters in string.
  145  *
  146  * UTF-8 encoded characters may be larger than 1 byte so the number of characters is not necessarily
  147  * equal to the number of bytes in the string.
  148  *
  149  * @return
  150  *      - Number of characters in string if successful.
  151  *      - LE_FORMAT_ERROR if the string is not UTF-8.
  152  */
  153 //--------------------------------------------------------------------------------------------------
  154 ssize_t le_utf8_NumChars
  155 (
  156     const char* string      ///< [IN] Pointer to the string.
  157 );
  158 
  159 
  160 //--------------------------------------------------------------------------------------------------
  161 /**
  162  * Returns the number of bytes in string (not including the null-terminator).
  163  *
  164  * @return
  165  *      Number of bytes in string (not including the null-terminator).
  166  */
  167 //--------------------------------------------------------------------------------------------------
  168 size_t le_utf8_NumBytes
  169 (
  170     const char* string      ///< [IN] The string.
  171 );
  172 
  173 
  174 //--------------------------------------------------------------------------------------------------
  175 /**
  176  * Returns the number of bytes in the character that starts with a given byte.
  177  *
  178  * @return
  179  *      Number of bytes in the character, or 0 if the byte provided is not a valid starting byte.
  180  */
  181 //--------------------------------------------------------------------------------------------------
  182 size_t le_utf8_NumBytesInChar
  183 (
  184     const char firstByte    ///< [IN] The first byte in the character.
  185 );
  186 
  187 
  188 //--------------------------------------------------------------------------------------------------
  189 /**
  190  * Determines whether a given byte is a continuation byte (i.e., not the first byte) of a
  191  * multi-byte UTF-8 character.
  192  *
  193  * @return  true if the two high-order bits of the byte are '10'; false otherwise.
  194  */
  195 //--------------------------------------------------------------------------------------------------
  196 static inline bool le_utf8_IsContinuationByte
  197 (
  198     const char byte     ///< [IN] The byte to check.
  199 )
  200 {
  201     return ( (byte & 0xC0) == 0x80 );   // Continuation bytes have the form 10xxxxxx.
  202 }
  203 
  204 
  205 //--------------------------------------------------------------------------------------------------
  206 /**
  207  * Copies the string in srcStr to the start of destStr and returns the number of bytes copied (not
  208  * including the NULL-terminator) in numBytesPtr.  Null can be passed into numBytesPtr if the number
  209  * of bytes copied is not needed.  The srcStr must be in UTF-8 format.
  210  *
  211  * If the size of srcStr is less than or equal to the destination buffer size then the entire srcStr
  212  * will be copied including the null-character.  The rest of the destination buffer is not modified.
  213  *
  214  * If the size of srcStr is larger than the destination buffer then the maximum number of characters
  215  * (from srcStr) plus a null-character that will fit in the destination buffer is copied.
  216  *
  217  * UTF-8 characters may be more than one byte long and this function will only copy whole characters
  218  * not partial characters. Therefore, even if srcStr is larger than the destination buffer, the
  219  * copied characters may not fill the entire destination buffer because the last character copied
  220  * may not align exactly with the end of the destination buffer.
  221  *
  222  * The destination string will always be Null-terminated, unless destSize is zero.
  223  *
  224  * If destStr and srcStr overlap the behaviour of this function is undefined.
  225  *
  226  * @return
  227  *      - LE_OK if srcStr was completely copied to the destStr.
  228  *      - LE_OVERFLOW if srcStr was truncated when it was copied to destStr.
  229  */
  230 //--------------------------------------------------------------------------------------------------
  231 le_result_t le_utf8_Copy
  232 (
  233     char* destStr,          ///< [OUT] Destination buffer that receives the copy of srcStr.
  234     const char* srcStr,     ///< [IN] UTF-8 source string.
  235     const size_t destSize,  ///< [IN] Size of the destination buffer in bytes.
  236     size_t* numBytesPtr     ///< [OUT] Number of bytes copied not including the NULL-terminator.
  237                             ///        Parameter can be set to NULL if the number of bytes copied is
  238                             ///        not needed.
  239 );
  240 
  241 
  242 //--------------------------------------------------------------------------------------------------
  243 /**
  244  * Appends srcStr to destStr by copying characters from srcStr to the end of destStr.  The srcStr
  245  * must be in UTF-8 format.  The number of bytes in the resultant destStr (not including the
  246  * NULL-terminator) is returned in destStrLenPtr.
  247  *
  248  * A null-character is always added to the end of destStr after all srcStr characters have been
  249  * copied.
  250  *
  251  * This function will copy as many characters as possible from srcStr to destStr while ensuring that
  252  * the resultant string (including the null-character) will fit within the destination buffer.
  253  *
  254  * UTF-8 characters may be more than one byte long and this function will only copy whole characters
  255  * not partial characters.
  256  *
  257  * The destination string will always be Null-terminated, unless destSize is zero.
  258  *
  259  * If destStr and srcStr overlap the behaviour of this function is undefined.
  260  *
  261  * @return
  262  *      - LE_OK if srcStr was completely copied to the destStr.
  263  *      - LE_OVERFLOW if srcStr was truncated when it was copied to destStr.
  264  */
  265 //--------------------------------------------------------------------------------------------------
  266 le_result_t le_utf8_Append
  267 (
  268     char* destStr,          ///< [IN/OUT] Destination string; srcStr is copied onto its end.
  269     const char* srcStr,     ///< [IN] UTF-8 source string.
  270     const size_t destSize,  ///< [IN] Size of the destination buffer in bytes.
  271     size_t* destStrLenPtr   ///< [OUT] Number of bytes in the resultant destination string (not
  272                             ///        including the NULL-terminator).  Parameter can be set to
  273                             ///        NULL if the destination string size is not needed.
  274 );
  275 
  276 
  277 //--------------------------------------------------------------------------------------------------
  278 /**
  279  * Copies all characters from the srcStr to destStr up to the first occurrence of subStr.  The
  280  * subStr is not copied and instead a null-terminator is added to the destStr.  The number of bytes
  281  * copied (not including the null-terminator) is returned in numBytesPtr.
  282  *
  283  * The srcStr and subStr must be null-terminated UTF-8 strings.
  284  *
  285  * The destination string will always be null-terminated.
  286  *
  287  * If subStr is not found in the srcStr then this function behaves just like le_utf8_Copy().
  288  *
  289  * @return
  290  *      - LE_OK if srcStr was completely copied to the destStr.
  291  *      - LE_OVERFLOW if srcStr was truncated when it was copied to destStr.
  292  */
  293 //--------------------------------------------------------------------------------------------------
  294 le_result_t le_utf8_CopyUpToSubStr
  295 (
  296     char* destStr,          ///< [OUT] Destination buffer that receives the copied characters.
  297     const char* srcStr,     ///< [IN] UTF-8 source string.
  298     const char* subStr,     ///< [IN] Sub-string to copy up to.
  299     const size_t destSize,  ///< [IN] Size of the destination buffer in bytes.
  300     size_t* numBytesPtr     ///< [OUT] Number of bytes copied not including the NULL-terminator.
  301                             ///        Parameter can be set to NULL if the number of bytes
  302                             ///        copied is not needed.
  303 );
  304 
  305 
  306 //--------------------------------------------------------------------------------------------------
  307 /**
  308  * Checks to see if the string is indeed a UTF-8 encoded, null-terminated string.
  309  *
  310  * @return
  311  *      true if the format is correct or false otherwise
  312  */
  313 //--------------------------------------------------------------------------------------------------
  314 bool le_utf8_IsFormatCorrect
  315 (
  316     const char* string      ///< [IN] The string.
  317 );
  318 
  319 
  320 //--------------------------------------------------------------------------------------------------
  321 /**
  322  * Parse an integer value from a string.
  323  *
  324  * @return
  325  *      - LE_OK = Success.
  326  *      - LE_FORMAT_ERROR = The argument string was not an integer value.
  327  *      - LE_OUT_OF_RANGE = Value is too large to be stored in an int variable.
  328  **/
  329 //--------------------------------------------------------------------------------------------------
  330 le_result_t le_utf8_ParseInt
  331 (
  332     int* valuePtr,   ///< [OUT] Ptr to where the value will be stored if successful.
  333     const char* arg  ///< [IN] The string to parse.
  334 );
  335 
  336 
  337 //--------------------------------------------------------------------------------------------------
  338 /**
  339  * Encode a unicode code point as UTF-8 into a buffer.
  340  *
  341  * @return
  342  *      - LE_OK on success
  343  *      - LE_OUT_OF_RANGE if the code point supplied is outside the range of unicode code points
  344  *      - LE_OVERFLOW if the out buffer is not large enough to store the UTF-8 encoding of the code
  345  *        point
  346  *
  347  * @note
  348  *      Not all code point values are valid unicode. This function does not validate whether the
  349  *      code point is valid unicode.
  350  */
  351 //--------------------------------------------------------------------------------------------------
  352 le_result_t le_utf8_EncodeUnicodeCodePoint
  353 (
  354     uint32_t codePoint, ///< [IN] Code point to encode as UTF-8
  355     char* out,          ///< [OUT] Buffer to store the UTF-8 encoded value in.
  356     size_t* outSize     ///< [IN/OUT] As an input, this value is interpreted as the size of the out
  357                         ///  buffer.  As an output, it is updated to hold the size of the UTF-8
  358                         ///  encoded value (in the case of an LE_OK return value) or size that would
  359                         ///  be required to encode the code point (in the case of an LE_OVERFLOW
  360                         ///  return value).
  361 );
  362 
  363 
  364 //--------------------------------------------------------------------------------------------------
  365 /**
  366  * Decode the first unicode code point from the UTF-8 string src.
  367  *
  368  * @return
  369  *      - LE_OK on success
  370  *      - LE_BAD_PARAMETER if byteLength points to 0
  371  *      - LE_UNDERFLOW if src appears to be the beginning of a UTF-8 character which extends beyond
  372  *        the end of the string as specified by byteLength.
  373  *      - LE_FORMAT_ERROR if src is not valid UTF-8 encoded string data.
  374  *
  375  * @note
  376  *      Not all code point values are valid unicode. This function does not validate whether the
  377  *      code point is valid unicode.
  378  */
  379 //--------------------------------------------------------------------------------------------------
  380 le_result_t le_utf8_DecodeUnicodeCodePoint
  381 (
  382     const char* src,     ///< [IN] UTF-8 encoded data to extract a code point from.
  383     size_t* byteLength,  ///< [IN/OUT] As an input parameter, the value pointed to represents the
  384                          ///  number of bytes in src. As an output parameter, the value pointed to
  385                          ///  is the number of bytes from src that were consumed to decode the code
  386                          ///  point (in the case of an LE_OK return value) or the number of bytes
  387                          ///  that would have been consumed had src been long enough (in the case of
  388                          ///  an LE_UNDERFLOW return value).
  389     uint32_t* codePoint  ///< [OUT] Code point that was decoded from src.  This value is only valid
  390                          ///  when the function returns LE_OK.
  391 );
  392 
  393 #endif  // LEGATO_UTF8_INCLUDE_GUARD
le_utf8_EncodeUnicodeCodePoint
le_result_t le_utf8_EncodeUnicodeCodePoint(uint32_t codePoint, char *out, size_t *outSize)
le_utf8_NumBytesInChar
size_t le_utf8_NumBytesInChar(const char firstByte)
le_utf8_NumChars
ssize_t le_utf8_NumChars(const char *string)
le_result_t
le_result_t
Definition: le_basics.h:45
le_utf8_IsFormatCorrect
bool le_utf8_IsFormatCorrect(const char *string)
le_utf8_Copy
le_result_t le_utf8_Copy(char *destStr, const char *srcStr, const size_t destSize, size_t *numBytesPtr)
le_utf8_Append
le_result_t le_utf8_Append(char *destStr, const char *srcStr, const size_t destSize, size_t *destStrLenPtr)
le_utf8_ParseInt
le_result_t le_utf8_ParseInt(int *valuePtr, const char *arg)
le_utf8_IsContinuationByte
static bool le_utf8_IsContinuationByte(const char byte)
Definition: le_utf8.h:197
le_utf8_NumBytes
size_t le_utf8_NumBytes(const char *string)
le_utf8_DecodeUnicodeCodePoint
le_result_t le_utf8_DecodeUnicodeCodePoint(const char *src, size_t *byteLength, uint32_t *codePoint)
le_utf8_CopyUpToSubStr
le_result_t le_utf8_CopyUpToSubStr(char *destStr, const char *srcStr, const char *subStr, const size_t destSize, size_t *numBytesPtr)