StdLib: Fix several problems where characters were not being correctly converted between wide and MBCS.

Add utility functions for determining character length of strings. Contributed-under: TianoCore Contribution Agreement 1.0 Signed-off-by: daryl.mcdaniel@intel.com Reviewed-by: erik.c.bjorge@intel.com Reviewed-by: lee.g.rosenbaum@intel.com StdLib/LibC/ Locale/multibyte_Utf8.c Improve comments. Define implementation-specific MBCS utility functions, as declared in <stdlib.h>. Enhance functionality of EncodeUtf8() and improve error handling. Set correct conversion state in wcrtomb(). Bug fixes in wcsrtombs(). Make wctob() properly MBCS compliant. Main/Main.c Remove code obsoleted by new wcsrtombs() implementation. git-svn-id: https://edk2.svn.sourceforge.net/svnroot/edk2/trunk/edk2@13785 6f19259b-4bc3-4df7-8a09-765794883524
2025-12-07 13:51:04 +01:00 · 2012-10-05 22:00:43 +00:00 · 2012-10-05 22:00:43 +00:00 · c42c9cac8c
commit c42c9cac8c
parent e2a013fa40
2 changed files with 311 additions and 135 deletions
--- a/StdLib/LibC/Locale/multibyte_Utf8.c
+++ b/StdLib/LibC/Locale/multibyte_Utf8.c
@ -15,9 +15,9 @@
 #include  <wchar.h>
 #include  <sys/types.h>

-typedef      int  ch_UCS4;
+typedef      int      ch_UCS4;

-static  mbstate_t         LocalConvState = {0};
+static  mbstate_t     LocalConvState = {0};

 /** Map a UTF-8 encoded prefix byte to a sequence length.
    Zero means illegal prefix, but valid surrogate if < 0xC0.
@ -59,12 +59,12 @@ UINT8 utf8_code_length[256] = {

 /** Process one byte of a multibyte character.

-    @param  ch
-    @param  ps
+    @param[in]      ch    One byte of a multibyte character.
+    @param[in,out]  ps    Pointer to a conversion state object.

-    @retval   -2
-    @retval   -1
-    @retval   1:4
+    @retval   -2      ch is an incomplete but potentially valid character.
+    @retval   -1      ch is not valid in this context.
+    @retval   1:4     The length, in bytes, of the character ch just completed.
 **/
 static
 int
@ -174,10 +174,10 @@ ProcessOneByte(unsigned char ch, mbstate_t *ps)

 /** Convert one Multibyte sequence.

-    @param  Dest
-    @param  Src
-    @param  Len
-    @param  pS
+    @param[out]   Dest      Pointer to output location, or NULL
+    @param[in]    Src       Multibyte Source (UTF8)
+    @param[in]    Len       Max Number of bytes to convert
+    @param[in]    pS        Pointer to State struct., or NULL

    @retval   -2      Bytes processed comprise an incomplete, but potentially valid, character.
    @retval   -1      An encoding error was encountered.  ps->E indicates the number of bytes consumed.
@ -219,87 +219,212 @@ DecodeOneStateful(
  return NumConv;
 }

-/** Convert wide characters (UTF16) into multibyte characters (UTF8)
+/*  Determine the number of bytes needed to represent a Wide character
+    as a MBCS character.
+
+    A single wide character may convert into a one, two, three, or four byte
+    narrow (MBCS or UTF-8) character.  The number of MBCS bytes can be determined
+    as follows.
+
+    If WCS char      < 0x00000080      One Byte
+    Else if WCS char < 0x00000800      Two Bytes
+    Else                               Three Bytes
+
+    Since UEFI only supports the Unicode Base Multilingual Plane (BMP),
+    Four-byte characters are not supported.
+
+    @param[in]    InCh      Wide character to test.
+
+    @retval     -1      Improperly formed character
+    @retval      0      InCh is 0x0000
+    @retval     >0      Number of bytes needed for the MBCS character
+*/
+int
+EFIAPI
+OneWcToMcLen(const wchar_t InCh)
+{
+  ssize_t   NumBytes;
+
+  if(InCh == 0) {             //    Is this a NUL, 0x0000 ?
+    NumBytes = 0;
+  }
+  else if(InCh < 0x0080) {    //    Is this a 1-byte character?
+    NumBytes = 1;
+  }
+  else if(InCh < 0x0800) {    //    Is this a 2-byte character?
+    NumBytes = 2;
+  }
+  else if((InCh >= 0xD800) && (InCh < 0xE000)) {    //    Is this a surrogate?
+    NumBytes = -1;
+  }
+  else {
+    NumBytes = 3;             //    Otherwise, it must be a 3-byte character.
+  }
+  return (int)NumBytes;      // Return estimate of required bytes.
+}
+
+/*  Determine the number of bytes needed to represent a Wide character string
+    as a MBCS string of given maximum length.  Will optionally return the number
+    of wide characters that would be consumed.
+
+    A single wide character may convert into a one, two, three, or four byte
+    narrow (MBCS or UTF-8) character.  The number of MBCS bytes can be determined
+    as follows.
+
+    If WCS char      < 0x00000080      One Byte
+    Else if WCS char < 0x00000800      Two Bytes
+    Else if WCS char < 0x00010000      Three Bytes
+    Else                               Four Bytes
+
+    Since UEFI only supports the Unicode Base Multilingual Plane (BMP),
+    Four-byte characters should not be encountered.
+
+    @param[in]    Src       Pointer to a wide character string.
+    @param[in]    Limit     Maximum number of bytes the converted string may occupy.
+    @param[out]   NumChar   Pointer to where to store the number of wide characters, or NULL.
+
+    @return     The number of bytes required to convert Src to MBCS,
+                not including the terminating NUL.  If NumChar is not NULL, the number
+                of characters represented by the return value will be written to
+                where it points.
+*/
+size_t
+EFIAPI
+EstimateWtoM(const wchar_t * Src, size_t Limit, size_t *NumChar)
+{
+  ssize_t    Estimate;
+  size_t    CharCount;
+  ssize_t   NumBytes;
+  wchar_t   EChar;
+
+  Estimate  = 0;
+  CharCount = 0;
+  EChar = *Src++;               // Get the initial character and point to next
+  while(((NumBytes = OneWcToMcLen(EChar)) > 0)  &&
+        ((size_t)(Estimate + NumBytes) < Limit))
+  {                             // Until one of the source characters is NUL
+    ++CharCount;                //    Count this character.
+    Estimate += NumBytes;       //    Count the Bytes for this character
+    EChar = *Src++;             //    Get the next source character and point to the next.
+  }
+  if(NumChar != NULL) {
+    *NumChar = CharCount;
+  }
+  return (size_t)Estimate;      // Return estimate of required bytes.
+}
+
+/*  Determine the number of characters in a MBCS string.
+    MBCS characters are one to four bytes long.  By examining the first byte
+    of a MBCS character, one can determine the number of bytes comprising the
+    character.
+
+    0x00 - 0x7F     One
+    0xC0 - 0xDF     Two
+    0xE0 - 0xEF     Three
+    0xF0 - 0xF7     Four
+
+    Since UEFI only supports the Unicode Base Multilingual Plane (BMP),
+    Four-byte characters should not be encountered.
+
+    @param[in]    Src     The string to examine
+
+    @return   The number of characters represented by the MBCS string.
+**/
+size_t
+EFIAPI
+CountMbcsChars(const char *Src)
+{
+  size_t      Count;
+  char        EChar;
+
+  Count = 0;
+  EChar = *Src++;
+  while(EChar != 0) {
+    if(EChar < 0x80) {
+      ++Count;
+    }
+    else if(EChar < 0xE0) {
+      Count += 2;
+      ++Src;
+    }
+    else if(EChar < 0xF0) {
+      Count += 3;
+      Src += 2;
+    }
+    else {
+      // Ill-formed character
+      break;
+    }
+  }
+  return Count;
+}
+
+/** Convert a wide character (UTF16) into a multibyte character (UTF8)
+
+    Converts a wide character into a corresponding multibyte character that
+    begins in the conversion state described by the object pointed to by ps.
+    If dst is not a null pointer, the converted character is then stored into
+    the array pointed to by dst.
+
+    It is the caller's responsibility to ensure that Dest is large enough to
+    hold the resulting MBCS sequence.

    @param  s       Pointer to the wide-character string to convert
-    @param  size    Number of wide characters in s.  size <= wcslen(s);
+    @param  Dest    Pointer to the buffer in which to place the converted sequence, or NULL.

-    @return A newly allocated buffer containing the converted string is returned,
-            or NULL if an error occurred.  Global variable errno contains more
-            information if NULL is returned.
+    @retval   -1    An error occurred.  The error reason is in errno.
+    @retval   >=0   The number of bytes stored into Dest.
 **/
 ssize_t
-EncodeUtf8(char *Dest, wchar_t *s, ssize_t size)
+EncodeUtf8(char *Dest, wchar_t ch)
 {
  char       *p;              /* next free byte in build buffer */
-  char       *v;              /* next free byte in destination */
-  ssize_t     nneeded;        /* number of result bytes needed */
-  int         i;              /* index into s of next input byte */
  int         NumInBuff;      // number of bytes in Buff
  char        Buff[4];        // Buffer into which each character is built

-  assert(s != NULL);
-  assert(size >= 0);
-
-  v = Dest;
-  nneeded = 0;
-  if((size * MB_LEN_MAX) / MB_LEN_MAX != size) {
-    // size is too large and resulted in overflow when multiplied by MB_LEN_MAX
-    errno = EINVAL;
-    return (ssize_t)-1;
-  }
-
- for (i = 0; i < size;) {
-    ch_UCS4 ch = s[i++];
    p = Buff;

-    if (ch < 0x80) {
-      /* Encode ASCII -- One Byte */
-      *p++ = (char) ch;
-    }
-    else if (ch < 0x0800) {
-      /* Encode Latin-1 -- Two Byte */
-      *p++ = (char)(0xc0 | (ch >> 6));
-      *p++ = (char)(0x80 | (ch & 0x3f));
-    }
-    else {
+  NumInBuff = 0;
+  if (ch < 0x80) {
+    /* Encode ASCII -- One Byte */
+    *p++ = (char) ch;
+    NumInBuff = 1;
+  }
+  else if (ch < 0x0800) {
+    /* Encode Latin-1 -- Two Byte */
+    *p++ = (char)(0xc0 | (ch >> 6));
+    *p++ = (char)(0x80 | (ch & 0x3f));
+    NumInBuff = 2;
+  }
+  else {
      /* Encode UCS2 Unicode ordinals -- Three Byte */
-      /* Special case: check for high surrogate -- Shouldn't happen in UEFI */
-      if (0xD800 <= ch && ch <= 0xDBFF && i < size) {
-        ch_UCS4 ch2 = s[i];
-        /* Check for low surrogate and combine the two to
-           form a UCS4 value */
-        if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
-          ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
-          i++;
-          /* Encode UCS4 Unicode ordinals -- Four Byte */
-          *p++ = (char)(0xf0 | (ch >> 18));
-          *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
-          *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
-          *p++ = (char)(0x80 | (ch & 0x3f));
-          continue;
-        }
-        /* Fall through: handles isolated high surrogates */
+    /* Special case: check for surrogate -- Shouldn't happen in UEFI */
+    if (0xD800 <= ch && ch < 0xE000) {
+      errno = EILSEQ;
+      return -1;
      }
+    else {
      *p++ = (char)(0xe0 | (ch >> 12));
      *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
      *p++ = (char)(0x80 | (ch & 0x3f));
+      NumInBuff = 3;
    }
-    /*  At this point, Buff holds the converted character which is NumInBuff bytes long.
-        NumInBuff is the value 1, 2, 3, or 4
-    */
-    NumInBuff = (int)(p - Buff);     // Number of bytes in Buff
-    if(Dest != NULL) {        // Save character if Dest is not NULL
-      memcpy(v, Buff, NumInBuff);
-      v += NumInBuff;
+  }
+  /*  At this point, Buff holds the converted character which is NumInBuff bytes long.
+      NumInBuff is the value 1, 2, or 3
+  */
+  if(Dest != NULL) {        // Save character if Dest is not NULL
+    memcpy(Dest, Buff, NumInBuff);
+
+    if(ch != 0) {
+      // Terminate the destination string.
+      Dest[NumInBuff] = '\0';
+    }
+    else {
+      NumInBuff = 0;
    }
-    nneeded += NumInBuff;     // Keep track of the number of bytes put into Dest
  }
-  if(Dest != NULL) {
-    // Terminate the destination string.
-    *v = '\0';
-  }
-  return nneeded;             // Tell the caller
+  return NumInBuff;             // Tell the caller
 }

 // ########################  Narrow to Wide Conversions #######################
@ -307,6 +432,8 @@ EncodeUtf8(char *Dest, wchar_t *s, ssize_t size)
 /** If ps is not a null pointer, the mbsinit function determines whether the
    pointed-to mbstate_t object describes an initial conversion state.

+    @param[in]  ps    Pointer to the conversion state object to test.
+
    @return     The mbsinit function returns nonzero if ps is a null pointer
                or if the pointed-to object describes an initial conversion
                state; otherwise, it returns zero.
@ -329,8 +456,14 @@ mbsinit(const mbstate_t *ps)
    where internal is the mbstate_t object for the mbrlen function, except that
    the expression designated by ps is evaluated only once.

-    @return   The mbrlen function returns a value between zero and n,
-              inclusive, (size_t)(-2), or (size_t)(-1).
+    @param[in]  s     Pointer to a multibyte character sequence.
+    @param[in]  n     Maximum number of bytes to examine.
+    @param[in]  pS    Pointer to the conversion state object.
+
+    @retval   0       The next n or fewer bytes complete a NUL character.
+    @retval   1..n    The number of bytes that complete the multibyte character.
+    @retval   -2      The next n bytes contribute to an incomplete (but potentially valid) multibyte character.
+    @retval   -1      An encoding error occurred.

    Declared in: wchar.h
 **/
@ -338,10 +471,10 @@ size_t
 mbrlen(
  const char *s,
  size_t n,
-  mbstate_t *ps
+  mbstate_t *pS
  )
 {
-  return mbrtowc(NULL, s, n, ps);
+  return mbrtowc(NULL, s, n, pS);
 }

 /** Determine the number of bytes comprising a multibyte character.
@ -392,6 +525,11 @@ corresponding wide character and then, if pwc is not a null pointer, stores that
 the object pointed to by pwc. If the corresponding wide character is the null wide
 character, the resulting state described is the initial conversion state.

+    @param[out]   pwc   Pointer to where the resulting wide character is to be stored.
+    @param[in]     s    Pointer to a multibyte character "string".
+    @param[in]     n    The maximum number of bytes to inspect.
+    @param[in]     ps   Pointer to a conversion state object.
+
    @retval   0             if the next n or fewer bytes complete the multibyte
                            character that corresponds to the null wide
                            character (which is the value stored).
@ -480,6 +618,11 @@ just past the last multibyte character converted (if any). If conversion stopped
 reaching a terminating null character and if dst is not a null pointer, the resulting state
 described is the initial conversion state.

+    @param[out]   dst   Pointer to where the resulting wide character sequence is stored.
+    @param[in]    src   Pointer to a pointer to the multibyte character sequence to convert.
+    @param[in]    len   Maximum number of wide characters to be stored into dst.
+    @param[in]    ps    Pointer to a conversion state object.
+
    @return   If the input conversion encounters a sequence of bytes that do
              not form a valid multibyte character, an encoding error occurs:
              the mbsrtowcs function stores the value of the macro EILSEQ in
@ -564,21 +707,23 @@ mbsrtowcs(
 **/
 size_t
 mbstowcs(
-  wchar_t *pwcs,
-  const char *s,
-  size_t n
+  wchar_t *Dest,
+  const char *Src,
+  size_t Limit
  )
 {

-  /* pwcs may be NULL */
-  /* s may be NULL */
+  /* Dest may be NULL */
+  /* Src may be NULL */

-  return mbsrtowcs(pwcs, &s, n, NULL);
+  return mbsrtowcs(Dest, &Src, Limit, NULL);
 }

 /** The btowc function determines whether C constitutes a valid single-byte
    character in the initial shift state.

+    @param[in]    C   A narrow character to test or convert to wide.
+
    @return   The btowc function returns WEOF if c has the value EOF or if
              (unsigned char)C does not constitute a valid single-byte
              character in the initial shift state. Otherwise, it returns the
@ -621,6 +766,12 @@ array whose first element is pointed to by S. At most MB_CUR_MAX bytes are store
 wc is a null wide character, a null byte is stored, preceded by any shift sequence needed
 to restore the initial shift state; the resulting state described is the initial conversion state.

+    @param[out]     Dest    Pointer to the location in which to store the resulting
+                            multibyte character.  Otherwise, NULL to reset the
+                            conversion state.
+    @param[in]      wchar   The wide character to convert.
+    @param[in,out]  pS      Pointer to a conversion state object, or NULL.
+
    @return   The wcrtomb function returns the number of bytes stored in the
              array object (including any shift sequences). When wc is not a
              valid wide character, an encoding error occurs: the function
@ -631,26 +782,31 @@ to restore the initial shift state; the resulting state described is the initial
 **/
 size_t
 wcrtomb(
-  char *s,
+  char *Dest,
  wchar_t wchar,
-  mbstate_t *ps
+  mbstate_t *pS
  )
 {
  size_t    RetVal;

-  /* s may be NULL */
-  if (s == NULL) {
+  /* Dest may be NULL */
+  if (Dest == NULL) {
    RetVal = 1;
  }
  else {
    if (wchar == L'\0') {
-      *s = '\0';
+      *Dest = '\0';
      RetVal = 1;
    }
    else {
-      RetVal = EncodeUtf8(s, &wchar, 1);
+      RetVal = EncodeUtf8(Dest, wchar);
    }
  }
+  if(pS == NULL) {
+    pS = &LocalConvState;
+  }
+  pS->A = 0;      // Set ps to the initial conversion state
+
  return RetVal;
 }

@ -698,27 +854,31 @@ wctomb(
 }

 /** The wcsrtombs function converts a sequence of wide characters from the array
-    indirectly pointed to by S into a sequence of corresponding multibyte
+    indirectly pointed to by Src into a sequence of corresponding multibyte
    characters that begins in the conversion state described by the object
    pointed to by ps.

-    If S is not a null pointer, the converted characters
-    are then stored into the array pointed to by S.  Conversion continues
-    up to and including a terminating null wide character, which is also
-    stored. Conversion stops earlier in two cases: when a wide character is
-    reached that does not correspond to a valid multibyte character, or
-    (if S is not a null pointer) when the next multibyte character would
-    exceed the limit of N total bytes to be stored into the array pointed
-    to by S. Each conversion takes place as if by a call to the wcrtomb
-    function.)
+    If Dest is not a null pointer, the converted characters are stored into the
+    array pointed to by Dest.  Conversion continues up to and including a
+    terminating null wide character, which is also stored. Conversion stops
+    earlier in two cases: when a wide character is reached that does not
+    correspond to a valid multibyte character, or (if Dest is not a null
+    pointer) when the next multibyte character would exceed the limit of Limit
+    total bytes to be stored into the array pointed to by Dest. Each conversion
+    takes place as if by a call to the wcrtomb function.)

-    If S is not a null pointer, the pointer object pointed to by pwcs is
+    If Dest is not a null pointer, the pointer object pointed to by Src is
    assigned either a null pointer (if conversion stopped due to reaching
    a terminating null wide character) or the address just past the last wide
    character converted (if any). If conversion stopped due to reaching a
    terminating null wide character, the resulting state described is the
    initial conversion state.

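+    @param[out]     Dest    Pointer to where the resulting multibyte sequence is stored, or NULL.
+    @param[in,out]  Src     Pointer to a pointer to the wide character sequence to convert.
+    @param[in]      Limit   Max number of bytes to store in Dest.
+    @param[in,out]  ps      Pointer to a conversion state object; not used by this implementation.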
+
    @return     If conversion stops because a wide character is reached that
                does not correspond to a valid multibyte character, an
                encoding error occurs: the wcsrtombs function stores the
@ -731,38 +891,50 @@ wctomb(
 **/
 size_t
 wcsrtombs(
-  char *s,
-  const wchar_t **pwcs,
-  size_t n,
-  mbstate_t *ps
+  char           *Dest,
+  const wchar_t **Src,
+  size_t          Limit,
+  mbstate_t      *ps
 )
 {
-  int count = 0;
+  size_t  NumStored;
+  ssize_t MaxBytes;
+  int     count;
+  wchar_t InCh;

-  /* s may be NULL */
-  /* pwcs may be NULL */
+  NumStored = 0;
+  MaxBytes  = (ssize_t)Limit;
+
+  /* Dest may be NULL */
+  /* Src may be NULL */
  /* ps appears to be unused */

-  if (pwcs == NULL || *pwcs == NULL)
+  if (Src == NULL || *Src == NULL)
    return (0);

-  if (s == NULL) {
-    while (*(*pwcs)++ != 0)
-      count++;
-    return(count);
+  if (Dest == NULL) {
+    NumStored = EstimateWtoM(*Src, MaxBytes, NULL);
  }
-
-  if (n != 0) {
-    do {
-      if ((*s++ = (char) *(*pwcs)++) == 0) {
-        *pwcs = NULL;
+  else {
+    while (OneWcToMcLen(InCh = *(*Src)++) <= MaxBytes) {
+      if(InCh == 0) {
+        *Src = NULL;
        break;
      }
-      count++;
-    } while (--n != 0);
+      count = (int)wcrtomb(Dest, InCh, NULL);
+      if(count >= 0) {
+        Dest += count;
+        MaxBytes -= count;
+        NumStored += count;
+      }
+      else {
+        NumStored = (size_t)(-1);
+      }
+    }
  }

-  return count;
+
+  return NumStored;
 }

 /** Convert a wide-character string into a multibyte character string.
@ -794,19 +966,23 @@ wcsrtombs(
 **/
 size_t
 wcstombs(
-  char *s,
-  const wchar_t *pwcs,
-  size_t n
+  char           *Dest,
+  const wchar_t  *Src,
+  size_t          Limit
 )
 {
-  /* s may be NULL */
-  return wcsrtombs(s, &pwcs, n, NULL);
+  /* Dest may be NULL */
+  return wcsrtombs(Dest, &Src, Limit, NULL);
 }

 /** The wctob function determines whether C corresponds to a member of the extended
    character set whose multibyte character representation is a single byte when in the initial
    shift state.

+    wctob needs to be consistent with wcrtomb.
+    If wcrtomb says that a character is representable in 1 byte,
+    then wctob needs to also represent the character as 1 byte.
+
    @return     The wctob function returns EOF if C does not correspond to a multibyte
                character with length one in the initial shift state. Otherwise, it
                returns the single-byte representation of that character as an
@ -817,13 +993,14 @@ wcstombs(
 int
 wctob(wint_t c)
 {
-  /*  wctob needs to be consistent with wcrtomb.
-      if wcrtomb says that a character is representable in 1 byte,
-      which this implementation always says, then wctob needs to
-      also represent the character as 1 byte.
-  */
-  if (c == WEOF) {
-    return EOF;
+  int   RetVal;
+
+  RetVal = EOF;
+  if(c == 0) {
+    RetVal = 0;
  }
-  return (int)(c & 0xFF);
+  else if (OneWcToMcLen((const wchar_t)c) == 1) {
+    RetVal = (int)(c & 0xFF);
+  }
+  return RetVal;
 }
--- a/StdLib/LibC/Main/Main.c
+++ b/StdLib/LibC/Main/Main.c
@ -113,10 +113,9 @@ DEBUG_CODE_END();
  for(count = 0; count < Argc; ++count) {
    nArgv[count] = string;
    AVsz = wcstombs(string, Argv[count], nArgvSize);
-    string[AVsz] = 0;   /* NULL terminate the argument */
    DEBUG((DEBUG_INFO, "Cvt[%d] %d \"%s\" --> \"%a\"\n", (INT32)count, (INT32)AVsz, Argv[count], nArgv[count]));
-    string += AVsz + 1;
-    nArgvSize -= AVsz + 1;
+    string += AVsz;
+    nArgvSize -= AVsz;
    if(nArgvSize < 0) {
      Print(L"ABORTING: Internal Argv[%d] conversion error.\n", count);
      exit(EXIT_FAILURE);