[0.4.13][KERNEL32] Fix MultiByteToWideChar(CP_UTF8, ...) (#2007)

CP_UTF8 validation was not strict. 5- or 6-byte sequences are invalid in RFC 3629 UTF-8. CORE-16468 - Fix MultiByteToWideChar(CP_UTF8, ...) by making the check strict. This will definitely fix regression CORE-16678, which was introduced or exposed by 0.4.13-dev-927-g e85664a3d8. It was also reported to bring ros a bit closer to displaying Japanese in ros Notepad with MS Gothic, but I am not sure whether it is already enough for that when backported on its own. Cherry-picked from commit 0.4.14-dev-193-g a63678c9a9
2025-02-22 16:36:33 +00:00 · 2019-11-02 14:16:26 +09:00 · 2019-11-02 14:16:26 +09:00 · e22ac0c2b9
commit e22ac0c2b9
parent 1e5d33ba25
1 changed files with 24 additions and 8 deletions
--- a/dll/win32/kernel32/winnls/string/nls.c
+++ b/dll/win32/kernel32/winnls/string/nls.c
@ -28,10 +28,10 @@ static const char UTF8Length[128] =
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 - 0x9F */
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xAF */
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0 - 0xBF */
-   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xC0 - 0xCF */
+   0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xC0 - 0xCF */
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xD0 - 0xDF */
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xE0 - 0xEF */
-   3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 0, 0  /* 0xF0 - 0xFF */
+   3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0  /* 0xF0 - 0xFF */
 };

 /* First byte mask depending on UTF-8 sequence length. */
@ -389,21 +389,27 @@ IntMultiByteToWideCharUTF8(DWORD Flags,
        for (; MultiByteString < MbsEnd; WideCharCount++)
        {
            Char = *MultiByteString++;
-            if (Char < 0xC0)
+            if (Char < 0x80)
            {
                TrailLength = 0;
                continue;
            }
-            if (Char >= 0xF8 || (Char & 0xC0) == 0x80)
+            if ((Char & 0xC0) == 0x80)
            {
                TrailLength = 0;
                StringIsValid = FALSE;
                continue;
            }

+            TrailLength = UTF8Length[Char - 0x80];
+            if (TrailLength == 0)
+            {
+                StringIsValid = FALSE;
+                continue;
+            }
+
            CharIsValid = TRUE;
            MbsPtrSave = MultiByteString;
-            TrailLength = UTF8Length[Char - 0x80];
            WideChar = Char & UTF8Mask[TrailLength];

            while (TrailLength && MultiByteString < MbsEnd)
@ -427,9 +433,10 @@ IntMultiByteToWideCharUTF8(DWORD Flags,
        if (TrailLength)
        {
            WideCharCount++;
+            StringIsValid = FALSE;
        }

-        if (Flags == MB_ERR_INVALID_CHARS && (!StringIsValid || TrailLength))
+        if (Flags == MB_ERR_INVALID_CHARS && !StringIsValid)
        {
            SetLastError(ERROR_NO_UNICODE_TRANSLATION);
            return 0;
@ -449,16 +456,24 @@ IntMultiByteToWideCharUTF8(DWORD Flags,
            TrailLength = 0;
            continue;
        }
-        if (Char >= 0xF8 || Char == 0x80 || (Char & 0xC0) == 0x80)
+        if ((Char & 0xC0) == 0x80)
        {
            *WideCharString++ = InvalidChar;
            TrailLength = 0;
+            StringIsValid = FALSE;
+            continue;
+        }
+
+        TrailLength = UTF8Length[Char - 0x80];
+        if (TrailLength == 0)
+        {
+            *WideCharString++ = InvalidChar;
+            StringIsValid = FALSE;
            continue;
        }

        CharIsValid = TRUE;
        MbsPtrSave = MultiByteString;
-        TrailLength = UTF8Length[Char - 0x80];
        WideChar = Char & UTF8Mask[TrailLength];

        while (TrailLength && MultiByteString < MbsEnd)
@ -481,6 +496,7 @@ IntMultiByteToWideCharUTF8(DWORD Flags,
        {
            *WideCharString++ = InvalidChar;
            MultiByteString = MbsPtrSave;
+            StringIsValid = FALSE;
        }
    }