[SHELLFIND] Split encoding detection code (#3147)

Split encoding detection code to improve code quality. This is follow-up of #3134 (7376cde). CORE-17250
2025-02-22 16:36:33 +00:00 · 2020-09-10 19:03:20 +09:00 · 2020-09-10 19:03:20 +09:00 · c8e3ef7894
commit c8e3ef7894
parent 92f4746845
1 changed files with 154 additions and 20 deletions
--- a/dll/win32/browseui/shellfind/CFindFolder.cpp
+++ b/dll/win32/browseui/shellfind/CFindFolder.cpp
@ -185,6 +185,137 @@ StrFindNIW(const WCHAR *lpFirst, const CStringW &lpSrch, UINT cchMax)
    return StrStrN<WCHAR, CStringW, _wcsnicmp>(lpFirst, lpSrch, cchMax) != NULL;
 }
 /*
 * The following code is borrowed from base/applications/cmdutils/more/more.c .
 */
 typedef enum
 {
    ENCODING_ANSI    =  0,
    ENCODING_UTF16LE =  1,
    ENCODING_UTF16BE =  2,
    ENCODING_UTF8    =  3
 } ENCODING;
 static BOOL
 IsDataUnicode(
    IN PVOID Buffer,
    IN DWORD BufferSize,
    OUT ENCODING* Encoding OPTIONAL,
    OUT PDWORD SkipBytes OPTIONAL)
 {
    PBYTE pBytes = (PBYTE)Buffer;
    ENCODING encFile = ENCODING_ANSI;
    DWORD dwPos = 0;
    /*
     * See http://archives.miloush.net/michkap/archive/2007/04/22/2239345.html
     * for more details about the algorithm and the pitfalls behind it.
     * Of course it would be actually great to make a nice function that
     * would work, once and for all, and put it into a library.
     */
    /* Look for Byte Order Marks */
    if ((BufferSize >= 2) && (pBytes[0] == 0xFF) && (pBytes[1] == 0xFE))
    {
        encFile = ENCODING_UTF16LE;
        dwPos = 2;
    }
    else if ((BufferSize >= 2) && (pBytes[0] == 0xFE) && (pBytes[1] == 0xFF))
    {
        encFile = ENCODING_UTF16BE;
        dwPos = 2;
    }
    else if ((BufferSize >= 3) && (pBytes[0] == 0xEF) && (pBytes[1] == 0xBB) && (pBytes[2] == 0xBF))
    {
        encFile = ENCODING_UTF8;
        dwPos = 3;
    }
    else
    {
        /*
         * Try using statistical analysis. Do not rely on the return value of
         * IsTextUnicode as we can get FALSE even if the text is in UTF-16 BE
         * (i.e. we have some of the IS_TEXT_UNICODE_REVERSE_MASK bits set).
         * Instead, set all the tests we want to perform, then just check
         * the passed tests and try to deduce the string properties.
         */
 /*
 * This mask contains the 3 highest bits from IS_TEXT_UNICODE_NOT_ASCII_MASK
 * and the 1st highest bit from IS_TEXT_UNICODE_NOT_UNICODE_MASK.
 */
 #define IS_TEXT_UNKNOWN_FLAGS_MASK  ((7 << 13) | (1 << 11))
        /* Flag out the unknown flags here, the passed tests will not have them either */
        INT Tests = (IS_TEXT_UNICODE_NOT_ASCII_MASK   |
                     IS_TEXT_UNICODE_NOT_UNICODE_MASK |
                     IS_TEXT_UNICODE_REVERSE_MASK | IS_TEXT_UNICODE_UNICODE_MASK)
                        & ~IS_TEXT_UNKNOWN_FLAGS_MASK;
        INT Results;
        IsTextUnicode(Buffer, BufferSize, &Tests);
        Results = Tests;
        /*
         * As the IS_TEXT_UNICODE_NULL_BYTES or IS_TEXT_UNICODE_ILLEGAL_CHARS
         * flags are expected to be potentially present in the result without
         * modifying our expectations, filter them out now.
         */
        Results &= ~(IS_TEXT_UNICODE_NULL_BYTES | IS_TEXT_UNICODE_ILLEGAL_CHARS);
        /*
         * NOTE: The flags IS_TEXT_UNICODE_ASCII16 and
         * IS_TEXT_UNICODE_REVERSE_ASCII16 are not reliable.
         *
         * NOTE2: Check for potential "bush hid the facts" effect by also
         * checking the original results (in 'Tests') for the absence of
         * the IS_TEXT_UNICODE_NULL_BYTES flag, as we may presumably expect
         * that in UTF-16 text there will be at some point some NULL bytes.
         * If not, fall back to ANSI. This shows the limitations of using the
         * IsTextUnicode API to perform such tests, and the usage of a more
         * improved encoding detection algorithm would be really welcome.
         */
        if (!(Results & IS_TEXT_UNICODE_NOT_UNICODE_MASK) &&
            !(Results & IS_TEXT_UNICODE_REVERSE_MASK)     &&
             (Results & IS_TEXT_UNICODE_UNICODE_MASK)     &&
             (Tests   & IS_TEXT_UNICODE_NULL_BYTES))
        {
            encFile = ENCODING_UTF16LE;
            dwPos = (Results & IS_TEXT_UNICODE_SIGNATURE) ? 2 : 0;
        }
        else
        if (!(Results & IS_TEXT_UNICODE_NOT_UNICODE_MASK) &&
            !(Results & IS_TEXT_UNICODE_UNICODE_MASK)     &&
             (Results & IS_TEXT_UNICODE_REVERSE_MASK)     &&
             (Tests   & IS_TEXT_UNICODE_NULL_BYTES))
        {
            encFile = ENCODING_UTF16BE;
            dwPos = (Results & IS_TEXT_UNICODE_REVERSE_SIGNATURE) ? 2 : 0;
        }
        else
        {
            /*
             * Either 'Results' has neither of those masks set, as it can be
             * the case for UTF-8 text (or ANSI), or it has both as can be the
             * case when analysing pure binary data chunk. This is therefore
             * invalid and we fall back to ANSI encoding.
             * FIXME: In case of failure, assume ANSI (as long as we do not have
             * correct tests for UTF8, otherwise we should do them, and at the
             * very end, assume ANSI).
             */
            encFile = ENCODING_ANSI; // ENCODING_UTF8;
            dwPos = 0;
        }
    }
    if (Encoding)
        *Encoding = encFile;
    if (SkipBytes)
        *SkipBytes = dwPos;
    return (encFile != ENCODING_ANSI);
 }
 static BOOL SearchFile(LPCWSTR lpFilePath, _SearchData *pSearchData)
 {
    HANDLE hFile = CreateFileW(lpFilePath, GENERIC_READ, 0, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL);
@ -209,28 +340,31 @@ static BOOL SearchFile(LPCWSTR lpFilePath, _SearchData *pSearchData)
    if (!pbContents)
        return FALSE;
    ENCODING encoding;
    IsDataUnicode(pbContents, size, &encoding, NULL);
    BOOL bFound;
-    if (size >= 2 &&
+    switch (encoding)
        (memcmp(pbContents, "\xFF\xFE", 2) == 0 || (pbContents[0] && !pbContents[1])))
    {
        case ENCODING_UTF16LE:
            // UTF-16
            bFound = StrFindNIW((LPCWSTR)pbContents, pSearchData->szQueryW, size / sizeof(WCHAR));
-    }
+            break;
-    else if (size >= 2 &&
+        case ENCODING_UTF16BE:
             (memcmp(pbContents, "\xFE\xFF", 2) == 0 || (!pbContents[0] && pbContents[1])))
    {
            // UTF-16 BE
            bFound = StrFindNIW((LPCWSTR)pbContents, pSearchData->szQueryU16BE, size / sizeof(WCHAR));
-    }
+            break;
-    else if (size >= 3 && memcmp(pbContents, "\xEF\xBB\xBF", 3) == 0)
+        case ENCODING_UTF8:
    {
            // UTF-8
            bFound = StrFindNIA((LPCSTR)pbContents, pSearchData->szQueryU8, size / sizeof(CHAR));
-    }
+            break;
-    else
+        case ENCODING_ANSI:
-    {
+        default:
-        // ANSI
+            // ANSI or UTF-8 without BOM
            bFound = StrFindNIA((LPCSTR)pbContents, pSearchData->szQueryA, size / sizeof(CHAR));
            if (!bFound && pSearchData->szQueryA != pSearchData->szQueryU8)
                bFound = StrFindNIA((LPCSTR)pbContents, pSearchData->szQueryU8, size / sizeof(CHAR));
            break;
    }
    UnmapViewOfFile(pbContents);