mirror of
https://github.com/reactos/reactos.git
synced 2025-02-22 16:36:33 +00:00
[SHELLFIND] Split encoding detection code (#3147)
Split encoding detection code to improve code quality. This is follow-up of #3134 (7376cde
). CORE-17250
This commit is contained in:
parent
92f4746845
commit
c8e3ef7894
1 changed files with 154 additions and 20 deletions
|
@ -185,6 +185,137 @@ StrFindNIW(const WCHAR *lpFirst, const CStringW &lpSrch, UINT cchMax)
|
||||||
return StrStrN<WCHAR, CStringW, _wcsnicmp>(lpFirst, lpSrch, cchMax) != NULL;
|
return StrStrN<WCHAR, CStringW, _wcsnicmp>(lpFirst, lpSrch, cchMax) != NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The following code is borrowed from base/applications/cmdutils/more/more.c .
|
||||||
|
*/
|
||||||
|
typedef enum
|
||||||
|
{
|
||||||
|
ENCODING_ANSI = 0,
|
||||||
|
ENCODING_UTF16LE = 1,
|
||||||
|
ENCODING_UTF16BE = 2,
|
||||||
|
ENCODING_UTF8 = 3
|
||||||
|
} ENCODING;
|
||||||
|
|
||||||
|
static BOOL
|
||||||
|
IsDataUnicode(
|
||||||
|
IN PVOID Buffer,
|
||||||
|
IN DWORD BufferSize,
|
||||||
|
OUT ENCODING* Encoding OPTIONAL,
|
||||||
|
OUT PDWORD SkipBytes OPTIONAL)
|
||||||
|
{
|
||||||
|
PBYTE pBytes = (PBYTE)Buffer;
|
||||||
|
ENCODING encFile = ENCODING_ANSI;
|
||||||
|
DWORD dwPos = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* See http://archives.miloush.net/michkap/archive/2007/04/22/2239345.html
|
||||||
|
* for more details about the algorithm and the pitfalls behind it.
|
||||||
|
* Of course it would be actually great to make a nice function that
|
||||||
|
* would work, once and for all, and put it into a library.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Look for Byte Order Marks */
|
||||||
|
if ((BufferSize >= 2) && (pBytes[0] == 0xFF) && (pBytes[1] == 0xFE))
|
||||||
|
{
|
||||||
|
encFile = ENCODING_UTF16LE;
|
||||||
|
dwPos = 2;
|
||||||
|
}
|
||||||
|
else if ((BufferSize >= 2) && (pBytes[0] == 0xFE) && (pBytes[1] == 0xFF))
|
||||||
|
{
|
||||||
|
encFile = ENCODING_UTF16BE;
|
||||||
|
dwPos = 2;
|
||||||
|
}
|
||||||
|
else if ((BufferSize >= 3) && (pBytes[0] == 0xEF) && (pBytes[1] == 0xBB) && (pBytes[2] == 0xBF))
|
||||||
|
{
|
||||||
|
encFile = ENCODING_UTF8;
|
||||||
|
dwPos = 3;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Try using statistical analysis. Do not rely on the return value of
|
||||||
|
* IsTextUnicode as we can get FALSE even if the text is in UTF-16 BE
|
||||||
|
* (i.e. we have some of the IS_TEXT_UNICODE_REVERSE_MASK bits set).
|
||||||
|
* Instead, set all the tests we want to perform, then just check
|
||||||
|
* the passed tests and try to deduce the string properties.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This mask contains the 3 highest bits from IS_TEXT_UNICODE_NOT_ASCII_MASK
|
||||||
|
* and the 1st highest bit from IS_TEXT_UNICODE_NOT_UNICODE_MASK.
|
||||||
|
*/
|
||||||
|
#define IS_TEXT_UNKNOWN_FLAGS_MASK ((7 << 13) | (1 << 11))
|
||||||
|
|
||||||
|
/* Flag out the unknown flags here, the passed tests will not have them either */
|
||||||
|
INT Tests = (IS_TEXT_UNICODE_NOT_ASCII_MASK |
|
||||||
|
IS_TEXT_UNICODE_NOT_UNICODE_MASK |
|
||||||
|
IS_TEXT_UNICODE_REVERSE_MASK | IS_TEXT_UNICODE_UNICODE_MASK)
|
||||||
|
& ~IS_TEXT_UNKNOWN_FLAGS_MASK;
|
||||||
|
INT Results;
|
||||||
|
|
||||||
|
IsTextUnicode(Buffer, BufferSize, &Tests);
|
||||||
|
Results = Tests;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* As the IS_TEXT_UNICODE_NULL_BYTES or IS_TEXT_UNICODE_ILLEGAL_CHARS
|
||||||
|
* flags are expected to be potentially present in the result without
|
||||||
|
* modifying our expectations, filter them out now.
|
||||||
|
*/
|
||||||
|
Results &= ~(IS_TEXT_UNICODE_NULL_BYTES | IS_TEXT_UNICODE_ILLEGAL_CHARS);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* NOTE: The flags IS_TEXT_UNICODE_ASCII16 and
|
||||||
|
* IS_TEXT_UNICODE_REVERSE_ASCII16 are not reliable.
|
||||||
|
*
|
||||||
|
* NOTE2: Check for potential "bush hid the facts" effect by also
|
||||||
|
* checking the original results (in 'Tests') for the absence of
|
||||||
|
* the IS_TEXT_UNICODE_NULL_BYTES flag, as we may presumably expect
|
||||||
|
* that in UTF-16 text there will be at some point some NULL bytes.
|
||||||
|
* If not, fall back to ANSI. This shows the limitations of using the
|
||||||
|
* IsTextUnicode API to perform such tests, and the usage of a more
|
||||||
|
* improved encoding detection algorithm would be really welcome.
|
||||||
|
*/
|
||||||
|
if (!(Results & IS_TEXT_UNICODE_NOT_UNICODE_MASK) &&
|
||||||
|
!(Results & IS_TEXT_UNICODE_REVERSE_MASK) &&
|
||||||
|
(Results & IS_TEXT_UNICODE_UNICODE_MASK) &&
|
||||||
|
(Tests & IS_TEXT_UNICODE_NULL_BYTES))
|
||||||
|
{
|
||||||
|
encFile = ENCODING_UTF16LE;
|
||||||
|
dwPos = (Results & IS_TEXT_UNICODE_SIGNATURE) ? 2 : 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
if (!(Results & IS_TEXT_UNICODE_NOT_UNICODE_MASK) &&
|
||||||
|
!(Results & IS_TEXT_UNICODE_UNICODE_MASK) &&
|
||||||
|
(Results & IS_TEXT_UNICODE_REVERSE_MASK) &&
|
||||||
|
(Tests & IS_TEXT_UNICODE_NULL_BYTES))
|
||||||
|
{
|
||||||
|
encFile = ENCODING_UTF16BE;
|
||||||
|
dwPos = (Results & IS_TEXT_UNICODE_REVERSE_SIGNATURE) ? 2 : 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Either 'Results' has neither of those masks set, as it can be
|
||||||
|
* the case for UTF-8 text (or ANSI), or it has both as can be the
|
||||||
|
* case when analysing pure binary data chunk. This is therefore
|
||||||
|
* invalid and we fall back to ANSI encoding.
|
||||||
|
* FIXME: In case of failure, assume ANSI (as long as we do not have
|
||||||
|
* correct tests for UTF8, otherwise we should do them, and at the
|
||||||
|
* very end, assume ANSI).
|
||||||
|
*/
|
||||||
|
encFile = ENCODING_ANSI; // ENCODING_UTF8;
|
||||||
|
dwPos = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Encoding)
|
||||||
|
*Encoding = encFile;
|
||||||
|
if (SkipBytes)
|
||||||
|
*SkipBytes = dwPos;
|
||||||
|
|
||||||
|
return (encFile != ENCODING_ANSI);
|
||||||
|
}
|
||||||
|
|
||||||
static BOOL SearchFile(LPCWSTR lpFilePath, _SearchData *pSearchData)
|
static BOOL SearchFile(LPCWSTR lpFilePath, _SearchData *pSearchData)
|
||||||
{
|
{
|
||||||
HANDLE hFile = CreateFileW(lpFilePath, GENERIC_READ, 0, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL);
|
HANDLE hFile = CreateFileW(lpFilePath, GENERIC_READ, 0, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL);
|
||||||
|
@ -209,28 +340,31 @@ static BOOL SearchFile(LPCWSTR lpFilePath, _SearchData *pSearchData)
|
||||||
if (!pbContents)
|
if (!pbContents)
|
||||||
return FALSE;
|
return FALSE;
|
||||||
|
|
||||||
|
ENCODING encoding;
|
||||||
|
IsDataUnicode(pbContents, size, &encoding, NULL);
|
||||||
|
|
||||||
BOOL bFound;
|
BOOL bFound;
|
||||||
if (size >= 2 &&
|
switch (encoding)
|
||||||
(memcmp(pbContents, "\xFF\xFE", 2) == 0 || (pbContents[0] && !pbContents[1])))
|
|
||||||
{
|
{
|
||||||
|
case ENCODING_UTF16LE:
|
||||||
// UTF-16
|
// UTF-16
|
||||||
bFound = StrFindNIW((LPCWSTR)pbContents, pSearchData->szQueryW, size / sizeof(WCHAR));
|
bFound = StrFindNIW((LPCWSTR)pbContents, pSearchData->szQueryW, size / sizeof(WCHAR));
|
||||||
}
|
break;
|
||||||
else if (size >= 2 &&
|
case ENCODING_UTF16BE:
|
||||||
(memcmp(pbContents, "\xFE\xFF", 2) == 0 || (!pbContents[0] && pbContents[1])))
|
|
||||||
{
|
|
||||||
// UTF-16 BE
|
// UTF-16 BE
|
||||||
bFound = StrFindNIW((LPCWSTR)pbContents, pSearchData->szQueryU16BE, size / sizeof(WCHAR));
|
bFound = StrFindNIW((LPCWSTR)pbContents, pSearchData->szQueryU16BE, size / sizeof(WCHAR));
|
||||||
}
|
break;
|
||||||
else if (size >= 3 && memcmp(pbContents, "\xEF\xBB\xBF", 3) == 0)
|
case ENCODING_UTF8:
|
||||||
{
|
|
||||||
// UTF-8
|
// UTF-8
|
||||||
bFound = StrFindNIA((LPCSTR)pbContents, pSearchData->szQueryU8, size / sizeof(CHAR));
|
bFound = StrFindNIA((LPCSTR)pbContents, pSearchData->szQueryU8, size / sizeof(CHAR));
|
||||||
}
|
break;
|
||||||
else
|
case ENCODING_ANSI:
|
||||||
{
|
default:
|
||||||
// ANSI
|
// ANSI or UTF-8 without BOM
|
||||||
bFound = StrFindNIA((LPCSTR)pbContents, pSearchData->szQueryA, size / sizeof(CHAR));
|
bFound = StrFindNIA((LPCSTR)pbContents, pSearchData->szQueryA, size / sizeof(CHAR));
|
||||||
|
if (!bFound && pSearchData->szQueryA != pSearchData->szQueryU8)
|
||||||
|
bFound = StrFindNIA((LPCSTR)pbContents, pSearchData->szQueryU8, size / sizeof(CHAR));
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
UnmapViewOfFile(pbContents);
|
UnmapViewOfFile(pbContents);
|
||||||
|
|
Loading…
Reference in a new issue