reactos/lib/3rdparty/fullfat/ff_unicode.c

295 lines
11 KiB
C
Raw Normal View History

/*****************************************************************************
* FullFAT - High Performance, Thread-Safe Embedded FAT File-System *
* Copyright (C) 2009 James Walmsley (james@worm.me.uk) *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
* IMPORTANT NOTICE: *
* ================= *
* Alternative Licensing is available directly from the Copyright holder, *
* (James Walmsley). For more information consult LICENSING.TXT to obtain *
* a Commercial license. *
* *
* See RESTRICTIONS.TXT for extra restrictions on the use of FullFAT. *
* *
* Removing the above notice is illegal and will invalidate this license. *
*****************************************************************************
* See http://worm.me.uk/fullfat for more information. *
* Or http://fullfat.googlecode.com/ for latest releases and the wiki. *
*****************************************************************************/
/**
* @file ff_unicode.c
* @author James Walmsley
* @ingroup UNICODE
*
* @defgroup UNICODE FullFAT UNICODE Library
* @brief Portable UNICODE Transformation Library for FullFAT
*
**/
#include "ff_unicode.h"
#include "string.h"
// UTF-8 Routines
/*
UCS-4 range (hex.) UTF-8 octet sequence (binary)
0000 0000-0000 007F 0xxxxxxx
0000 0080-0000 07FF 110xxxxx 10xxxxxx
0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx -- We don't encode these because we won't receive them. (Invalid UNICODE).
0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx -- We don't encode these because we won't receive them. (Invalid UNICODE).
*/
FF_T_UINT FF_GetUtf16SequenceLen(FF_T_UINT16 usLeadChar) {
if((usLeadChar & 0xFC00) == 0xD800) {
return 2;
}
return 1;
}
/*
Returns the number of UTF-8 units read.
Will not exceed ulSize UTF-16 units. (ulSize * 2 bytes).
*/
/*
UCS-4 range (hex.) UTF-8 octet sequence (binary)
0000 0000-0000 007F 0xxxxxxx
0000 0080-0000 07FF 110xxxxx 10xxxxxx
0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx -- We don't encode these because we won't receive them. (Invalid UNICODE).
0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx -- We don't encode these because we won't receive them. (Invalid UNICODE).
*/
FF_T_SINT32 FF_Utf8ctoUtf16c(FF_T_UINT16 *utf16Dest, const FF_T_UINT8 *utf8Source, FF_T_UINT32 ulSize) {
FF_T_UINT32 ulUtf32char;
FF_T_UINT16 utf16Source = 0;
register FF_T_INT uiSequenceNumber = 0;
while((*utf8Source & (0x80 >> (uiSequenceNumber)))) { // Count number of set bits before a zero.
uiSequenceNumber++;
}
if(!uiSequenceNumber) {
uiSequenceNumber++;
}
if(!ulSize) {
return FF_ERR_UNICODE_DEST_TOO_SMALL;
}
switch(uiSequenceNumber) {
case 1:
utf16Source = (FF_T_UINT16) *utf8Source;
memcpy(utf16Dest,&utf16Source,sizeof(FF_T_UINT16));
//bobtntfullfat *utf16Dest = (FF_T_UINT16) *utf8Source;
break;
case 2:
utf16Source =(FF_T_UINT16) ((*utf8Source & 0x1F) << 6) | ((*(utf8Source + 1) & 0x3F));
memcpy(utf16Dest,&utf16Source,sizeof(FF_T_UINT16));
//bobtntfullfat *utf16Dest = (FF_T_UINT16) ((*utf8Source & 0x1F) << 6) | ((*(utf8Source + 1) & 0x3F));
break;
case 3:
utf16Source =(FF_T_UINT16) ((*utf8Source & 0x0F) << 12) | ((*(utf8Source + 1) & 0x3F) << 6) | ((*(utf8Source + 2) & 0x3F));
memcpy(utf16Dest,&utf16Source,sizeof(FF_T_UINT16));
//bobtntfullfat *utf16Dest = (FF_T_UINT16) ((*utf8Source & 0x0F) << 12) | ((*(utf8Source + 1) & 0x3F) << 6) | ((*(utf8Source + 2) & 0x3F));
break;
case 4:
// Convert to UTF-32 and then into UTF-16
if(ulSize < 2) {
return FF_ERR_UNICODE_DEST_TOO_SMALL;
}
ulUtf32char = (FF_T_UINT16) ((*utf8Source & 0x0F) << 18) | ((*(utf8Source + 1) & 0x3F) << 12) | ((*(utf8Source + 2) & 0x3F) << 6) | ((*(utf8Source + 3) & 0x3F));
utf16Source = (FF_T_UINT16) (((ulUtf32char - 0x10000) & 0xFFC00) >> 10) | 0xD800;
memcpy(utf16Dest,&utf16Source,sizeof(FF_T_UINT16));
utf16Source = (FF_T_UINT16) (((ulUtf32char - 0x10000) & 0x003FF) >> 00) | 0xDC00;
memcpy(utf16Dest+1,&utf16Source,sizeof(FF_T_UINT16));
//bobtntfullfat *(utf16Dest + 0) = (FF_T_UINT16) (((ulUtf32char - 0x10000) & 0xFFC00) >> 10) | 0xD800;
//bobtntfullfat *(utf16Dest + 1) = (FF_T_UINT16) (((ulUtf32char - 0x10000) & 0x003FF) >> 00) | 0xDC00;
break;
default:
break;
}
return uiSequenceNumber;
}
/*
Returns the number of UTF-8 units required to encode the UTF-16 sequence.
Will not exceed ulSize UTF-8 units. (ulSize * 1 bytes).
*/
FF_T_SINT32 FF_Utf16ctoUtf8c(FF_T_UINT8 *utf8Dest, const FF_T_UINT16 *utf16Source, FF_T_UINT32 ulSize) {
FF_T_UINT32 ulUtf32char;
FF_T_UINT16 ulUtf16char;
if(!ulSize) {
return FF_ERR_UNICODE_DEST_TOO_SMALL;
}
memcpy(&ulUtf16char, utf16Source, sizeof(FF_T_UINT16));
if((/*bobtntfullfat *utf16Source*/ulUtf16char & 0xF800) == 0xD800) { // A surrogate sequence was encountered. Must transform to UTF32 first.
ulUtf32char = ((FF_T_UINT32) (ulUtf16char & 0x003FF) << 10) + 0x10000;
//bobtntfullfat ulUtf32char = ((FF_T_UINT32) (*(utf16Source + 0) & 0x003FF) << 10) + 0x10000;
memcpy(&ulUtf16char, utf16Source + 1, sizeof(FF_T_UINT16));
if((/*bobtntfullfat *(utf16Source + 1)*/ulUtf16char & 0xFC00) != 0xDC00) {
return FF_ERR_UNICODE_INVALID_SEQUENCE; // Invalid UTF-16 sequence.
}
ulUtf32char |= ((FF_T_UINT32) (/*bobtntfullfat *(utf16Source + 1)*/ulUtf16char & 0x003FF));
} else {
ulUtf32char = (FF_T_UINT32) /*bobtntfullfat *utf16Source*/ulUtf16char;
}
// Now convert to the UTF-8 sequence.
if(ulUtf32char < 0x00000080) { // Single byte UTF-8 sequence.
*(utf8Dest + 0) = (FF_T_UINT8) ulUtf32char;
return 1;
}
if(ulUtf32char < 0x00000800) { // Double byte UTF-8 sequence.
if(ulSize < 2) {
return FF_ERR_UNICODE_DEST_TOO_SMALL;
}
*(utf8Dest + 0) = (FF_T_UINT8) (0xC0 | ((ulUtf32char >> 6) & 0x1F));
*(utf8Dest + 1) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 0) & 0x3F));
return 2;
}
if(ulUtf32char < 0x00010000) { // Triple byte UTF-8 sequence.
if(ulSize < 3) {
return FF_ERR_UNICODE_DEST_TOO_SMALL;
}
*(utf8Dest + 0) = (FF_T_UINT8) (0xE0 | ((ulUtf32char >> 12) & 0x0F));
*(utf8Dest + 1) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 6 ) & 0x3F));
*(utf8Dest + 2) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 0 ) & 0x3F));
return 3;
}
if(ulUtf32char < 0x00200000) { // Quadruple byte UTF-8 sequence.
if(ulSize < 4) {
return FF_ERR_UNICODE_DEST_TOO_SMALL;
}
*(utf8Dest + 0) = (FF_T_UINT8) (0xF0 | ((ulUtf32char >> 18) & 0x07));
*(utf8Dest + 1) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 12) & 0x3F));
*(utf8Dest + 2) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 6 ) & 0x3F));
*(utf8Dest + 3) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 0 ) & 0x3F));
return 4;
}
return FF_ERR_UNICODE_INVALID_CODE; // Invalid Charachter
}
// UTF-16 Support Functions
// Converts a UTF-32 Charachter into its equivalent UTF-16 sequence.
FF_T_SINT32 FF_Utf32ctoUtf16c(FF_T_UINT16 *utf16Dest, FF_T_UINT32 utf32char, FF_T_UINT32 ulSize) {
// Check that its a valid UTF-32 wide-char!
if(utf32char >= 0xD800 && utf32char <= 0xDFFF) { // This range is not a valid Unicode code point.
return FF_ERR_UNICODE_INVALID_CODE; // Invalid charachter.
}
if(utf32char < 0x10000) {
*utf16Dest = (FF_T_UINT16) utf32char; // Simple conversion! Char comes within UTF-16 space (without surrogates).
return 1;
}
if(ulSize < 2) {
return FF_ERR_UNICODE_DEST_TOO_SMALL; // Not enough UTF-16 units to record this charachter.
}
if(utf32char < 0x00200000) {
// Conversion to a UTF-16 Surrogate pair!
//valueImage = utf32char - 0x10000;
*(utf16Dest + 0) = (FF_T_UINT16) (((utf32char - 0x10000) & 0xFFC00) >> 10) | 0xD800;
*(utf16Dest + 1) = (FF_T_UINT16) (((utf32char - 0x10000) & 0x003FF) >> 00) | 0xDC00;
return 2; // Surrogate pair encoded value.
}
return FF_ERR_UNICODE_INVALID_CODE; // Invalid Charachter
}
// Converts a UTF-16 sequence into its equivalent UTF-32 code point.
FF_T_SINT32 FF_Utf16ctoUtf32c(FF_T_UINT32 *utf32Dest, const FF_T_UINT16 *utf16Source) {
if((*utf16Source & 0xFC00) != 0xD800) { // Not a surrogate sequence.
*utf32Dest = (FF_T_UINT32) *utf16Source;
return 1; // A single UTF-16 item was used to represent the charachter.
}
*utf32Dest = ((FF_T_UINT32) (*(utf16Source + 0) & 0x003FF) << 10) + 0x10000;
if((*(utf16Source + 1) & 0xFC00) != 0xDC00) {
return FF_ERR_UNICODE_INVALID_SEQUENCE; // Invalid UTF-16 sequence.
}
*utf32Dest |= ((FF_T_UINT32) (*(utf16Source + 1) & 0x003FF));
return 2; // 2 utf-16 units make up the Unicode code-point.
}
/*
Returns the total number of UTF-16 items required to represent
the provided UTF-32 string in UTF-16 form.
*/
/*
FF_T_UINT FF_Utf32GetUtf16Len(const FF_T_UINT32 *utf32String) {
FF_T_UINT utf16len = 0;
while(*utf32String) {
if(*utf32String++ <= 0xFFFF) {
utf16len++;
} else {
utf16len += 2;
}
}
return utf16len;
}*/
// String conversions
FF_T_SINT32 FF_Utf32stoUtf8s(FF_T_UINT8 *Utf8String, FF_T_UINT32 *Utf32String) {
int i = 0,y = 0;
FF_T_UINT16 utf16buffer[2];
while(Utf32String[i]) {
// Convert to a UTF16 char.
FF_Utf32ctoUtf16c(utf16buffer, Utf32String[i], 2);
// Now convert the UTF16 to UTF8 sequence.
y += FF_Utf16ctoUtf8c(&Utf8String[y], utf16buffer, 4);
i++;
}
Utf8String[y] = '\0';
return 0;
}