reactos/lib/3rdparty/fullfat/ff_unicode.c

/*****************************************************************************
 *  FullFAT - High Performance, Thread-Safe Embedded FAT File-System         *
 *  Copyright (C) 2009  James Walmsley (james@worm.me.uk)                    *
 *                                                                           *
 *  This program is free software: you can redistribute it and/or modify     *
 *  it under the terms of the GNU General Public License as published by     *
 *  the Free Software Foundation, either version 3 of the License, or        *
 *  (at your option) any later version.                                      *
 *                                                                           *
 *  This program is distributed in the hope that it will be useful,          *
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of           *
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
 *  GNU General Public License for more details.                             *
 *                                                                           *
 *  You should have received a copy of the GNU General Public License        *
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.    *
 *                                                                           *
 *  IMPORTANT NOTICE:                                                        *
 *  =================                                                        *
 *  Alternative Licensing is available directly from the Copyright holder,   *
 *  (James Walmsley). For more information consult LICENSING.TXT to obtain   *
 *  a Commercial license.                                                    *
 *                                                                           *
 *  See RESTRICTIONS.TXT for extra restrictions on the use of FullFAT.       *
 *                                                                           *
 *  Removing the above notice is illegal and will invalidate this license.   *
 *****************************************************************************
 *  See http://worm.me.uk/fullfat for more information.                      *
 *  Or  http://fullfat.googlecode.com/ for latest releases and the wiki.     *
 *****************************************************************************/

/**
 *	@file		ff_unicode.c
 *	@author		James Walmsley
 *	@ingroup	UNICODE
 *
 *	@defgroup	UNICODE	FullFAT UNICODE Library
 *	@brief		Portable UNICODE Transformation Library for FullFAT
 *
 **/

#include "ff_unicode.h"
#include "string.h"

// UTF-8 Routines

/*
   UCS-4 range (hex.)           UTF-8 octet sequence (binary)
   0000 0000-0000 007F   0xxxxxxx
   0000 0080-0000 07FF   110xxxxx 10xxxxxx
   0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx

   0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx	-- We don't encode these because we won't receive them. (Invalid UNICODE).
   0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx					-- We don't encode these because we won't receive them. (Invalid UNICODE).
*/

/**
 *	@brief	Reports how many UTF-16 units form the sequence that starts with usLeadChar.
 *
 *	@param	usLeadChar	First UTF-16 unit of the sequence.
 *
 *	@return	2 when usLeadChar lies in 0xD800-0xDBFF (top six bits are 110110), otherwise 1.
 **/
FF_T_UINT FF_GetUtf16SequenceLen(FF_T_UINT16 usLeadChar) {
	// Only a unit in 0xD800..0xDBFF opens a two-unit sequence; anything else stands alone.
	return ((usLeadChar & 0xFC00) == 0xD800) ? 2 : 1;
}

/*
   UCS-4 range (hex.)           UTF-8 octet sequence (binary)
   0000 0000-0000 007F   0xxxxxxx
   0000 0080-0000 07FF   110xxxxx 10xxxxxx
   0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx

   0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx	-- Not converted (Invalid UNICODE).
   0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx					-- Not converted (Invalid UNICODE).
*/
/**
 *	@brief	Converts one UTF-8 sequence into its UTF-16 equivalent.
 *
 *	The sequence length is taken from the number of leading 1 bits in the
 *	first byte. A first byte with no leading 1 bit, or with exactly one
 *	(10xxxxxx), is copied through unchanged as a single UTF-16 unit.
 *	First bytes announcing more than 4 bytes are not converted: nothing is
 *	written and the announced length is returned.
 *
 *	@param	utf16Dest	Receives 1 or 2 UTF-16 units. Written with memcpy, so the
 *						pointer does not have to be 16-bit aligned.
 *	@param	utf8Source	First byte of the UTF-8 sequence. Up to 3 further bytes are
 *						read, as announced by the first byte; they are not validated.
 *	@param	ulSize		Number of UTF-16 units available at utf16Dest.
 *
 *	@return	The number of UTF-8 bytes announced by the first byte (1 to 8).
 *	@return	FF_ERR_UNICODE_DEST_TOO_SMALL when ulSize is 0, or when a 4-byte
 *			sequence is given and ulSize is less than 2.
 **/
FF_T_SINT32 FF_Utf8ctoUtf16c(FF_T_UINT16 *utf16Dest, const FF_T_UINT8 *utf8Source, FF_T_UINT32 ulSize) {
	FF_T_UINT32			ulUtf32char;
	FF_T_UINT16			utf16Source = 0;
	register FF_T_INT	uiSequenceNumber = 0;

	// Count the set bits before the first zero. Terminates at 8 for 0xFF because (0x80 >> 8) is 0.
	while((*utf8Source & (0x80 >> (uiSequenceNumber)))) {
		uiSequenceNumber++;
	}

	if(!uiSequenceNumber) {
		uiSequenceNumber++;	// Plain 7-bit character: a sequence of one byte.
	}

	if(!ulSize) {
		return FF_ERR_UNICODE_DEST_TOO_SMALL;
	}

	switch(uiSequenceNumber) {
		case 1:
			utf16Source = (FF_T_UINT16) *utf8Source;
			memcpy(utf16Dest, &utf16Source, sizeof(FF_T_UINT16));
			break;

		case 2:
			utf16Source = (FF_T_UINT16) (((*utf8Source & 0x1F) << 6) | (*(utf8Source + 1) & 0x3F));
			memcpy(utf16Dest, &utf16Source, sizeof(FF_T_UINT16));
			break;

		case 3:
			utf16Source = (FF_T_UINT16) (((*utf8Source & 0x0F) << 12) | ((*(utf8Source + 1) & 0x3F) << 6) | (*(utf8Source + 2) & 0x3F));
			memcpy(utf16Dest, &utf16Source, sizeof(FF_T_UINT16));
			break;

		case 4:
			// Convert to UTF-32 and then into a UTF-16 surrogate pair.
			if(ulSize < 2) {
				return FF_ERR_UNICODE_DEST_TOO_SMALL;
			}
			// Assemble in 32 bits: the lead byte contributes bits 18..20, which a
			// 16-bit intermediate would discard.
			ulUtf32char = ((FF_T_UINT32) (*utf8Source & 0x07) << 18)
						| ((FF_T_UINT32) (*(utf8Source + 1) & 0x3F) << 12)
						| ((FF_T_UINT32) (*(utf8Source + 2) & 0x3F) << 6)
						| ((FF_T_UINT32) (*(utf8Source + 3) & 0x3F));

			ulUtf32char -= 0x10000;	// Surrogates carry the 20-bit offset above the BMP.

			utf16Source = (FF_T_UINT16) (0xD800 | ((ulUtf32char >> 10) & 0x03FF));
			memcpy(utf16Dest, &utf16Source, sizeof(FF_T_UINT16));
			utf16Source = (FF_T_UINT16) (0xDC00 | (ulUtf32char & 0x03FF));
			memcpy(utf16Dest + 1, &utf16Source, sizeof(FF_T_UINT16));
			break;

		default:
			// 5 or more leading 1 bits: not a convertible sequence, nothing is written.
			break;
	}

	return uiSequenceNumber;
}


/**
 *	@brief	Converts one UTF-16 sequence (a single unit or a surrogate pair) to UTF-8.
 *
 *	@param	utf8Dest	Receives 1 to 4 UTF-8 bytes.
 *	@param	utf16Source	First unit of the UTF-16 sequence. Read with memcpy, so the
 *						pointer does not have to be 16-bit aligned. The following
 *						unit is read as well when the first one is a high surrogate.
 *	@param	ulSize		Number of bytes available at utf8Dest.
 *
 *	@return	The number of UTF-8 bytes written (1 to 4).
 *	@return	FF_ERR_UNICODE_DEST_TOO_SMALL when ulSize cannot hold the encoded sequence.
 *	@return	FF_ERR_UNICODE_INVALID_SEQUENCE when the sequence starts with a low
 *			surrogate, or a high surrogate is not followed by a low surrogate.
 *	@return	FF_ERR_UNICODE_INVALID_CODE when the decoded value is 0x200000 or above.
 **/
FF_T_SINT32 FF_Utf16ctoUtf8c(FF_T_UINT8 *utf8Dest, const FF_T_UINT16 *utf16Source, FF_T_UINT32 ulSize) {
	FF_T_UINT32	ulUtf32char;
	FF_T_UINT16	ulUtf16char;

	if(!ulSize) {
		return FF_ERR_UNICODE_DEST_TOO_SMALL;
	}

	memcpy(&ulUtf16char, utf16Source, sizeof(FF_T_UINT16));
	if((ulUtf16char & 0xF800) == 0xD800) {	// A surrogate was encountered. Must transform to UTF32 first.
		if((ulUtf16char & 0xFC00) != 0xD800) {
			// 0xDC00..0xDFFF is a low surrogate; it can never open a pair.
			return FF_ERR_UNICODE_INVALID_SEQUENCE;
		}
		ulUtf32char  = ((FF_T_UINT32) (ulUtf16char & 0x003FF) << 10) + 0x10000;

		memcpy(&ulUtf16char, utf16Source + 1, sizeof(FF_T_UINT16));
		if((ulUtf16char & 0xFC00) != 0xDC00) {
			return FF_ERR_UNICODE_INVALID_SEQUENCE;	// High surrogate without its low partner.
		}
		ulUtf32char |= ((FF_T_UINT32) (ulUtf16char & 0x003FF));

	} else {
		ulUtf32char = (FF_T_UINT32) ulUtf16char;
	}

	// Now convert to the UTF-8 sequence.
	if(ulUtf32char < 0x00000080) {	// Single byte UTF-8 sequence (ulSize >= 1 was checked above).
		*(utf8Dest + 0) = (FF_T_UINT8) ulUtf32char;
		return 1;
	}

	if(ulUtf32char < 0x00000800) {	// Double byte UTF-8 sequence.
		if(ulSize < 2) {
			return FF_ERR_UNICODE_DEST_TOO_SMALL;
		}
		*(utf8Dest + 0) = (FF_T_UINT8) (0xC0 | ((ulUtf32char >> 6) & 0x1F));
		*(utf8Dest + 1) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 0) & 0x3F));
		return 2;
	}

	if(ulUtf32char < 0x00010000) {	// Triple byte UTF-8 sequence.
		if(ulSize < 3) {
			return FF_ERR_UNICODE_DEST_TOO_SMALL;
		}
		*(utf8Dest + 0) = (FF_T_UINT8) (0xE0 | ((ulUtf32char >> 12) & 0x0F));
		*(utf8Dest + 1) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 6 ) & 0x3F));
		*(utf8Dest + 2) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 0 ) & 0x3F));
		return 3;
	}

	if(ulUtf32char < 0x00200000) {	// Quadruple byte UTF-8 sequence.
		if(ulSize < 4) {
			return FF_ERR_UNICODE_DEST_TOO_SMALL;
		}
		*(utf8Dest + 0) = (FF_T_UINT8) (0xF0 | ((ulUtf32char >> 18) & 0x07));
		*(utf8Dest + 1) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 12) & 0x3F));
		*(utf8Dest + 2) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 6 ) & 0x3F));
		*(utf8Dest + 3) = (FF_T_UINT8) (0x80 | ((ulUtf32char >> 0 ) & 0x3F));
		return 4;
	}

	return FF_ERR_UNICODE_INVALID_CODE;	// Invalid character.
}


// UTF-16 Support Functions

/**
 *	@brief	Converts a UTF-32 character into its equivalent UTF-16 sequence.
 *
 *	@param	utf16Dest	Receives 1 or 2 UTF-16 units.
 *	@param	utf32char	The code point to encode.
 *	@param	ulSize		Number of UTF-16 units available at utf16Dest.
 *
 *	@return	1 when a single unit was written, 2 when a surrogate pair was written.
 *	@return	FF_ERR_UNICODE_INVALID_CODE when utf32char lies in the surrogate range
 *			0xD800-0xDFFF or above 0x10FFFF (the largest value a surrogate pair
 *			can represent).
 *	@return	FF_ERR_UNICODE_DEST_TOO_SMALL when ulSize cannot hold the result.
 **/
FF_T_SINT32 FF_Utf32ctoUtf16c(FF_T_UINT16 *utf16Dest, FF_T_UINT32 utf32char, FF_T_UINT32 ulSize) {
	FF_T_UINT32	ulOffset;

	// Check that it's a valid UTF-32 wide-char!
	if(utf32char >= 0xD800 && utf32char <= 0xDFFF) {	// This range is not a valid Unicode code point.
		return FF_ERR_UNICODE_INVALID_CODE;
	}

	if(utf32char > 0x10FFFF) {
		// A surrogate pair carries only 20 bits above 0x10000; larger values
		// would be silently truncated into a different character.
		return FF_ERR_UNICODE_INVALID_CODE;
	}

	if(!ulSize) {
		return FF_ERR_UNICODE_DEST_TOO_SMALL;	// No room for even a single unit.
	}

	if(utf32char < 0x10000) {
		*utf16Dest = (FF_T_UINT16) utf32char;	// Simple conversion! Char fits in UTF-16 without surrogates.
		return 1;
	}

	if(ulSize < 2) {
		return FF_ERR_UNICODE_DEST_TOO_SMALL;	// Not enough UTF-16 units to record this character.
	}

	// Conversion to a UTF-16 surrogate pair: 10 high bits, then 10 low bits of the offset.
	ulOffset = utf32char - 0x10000;
	*(utf16Dest + 0) = (FF_T_UINT16) (0xD800 | ((ulOffset >> 10) & 0x03FF));
	*(utf16Dest + 1) = (FF_T_UINT16) (0xDC00 | (ulOffset & 0x03FF));

	return 2;	// Surrogate pair encoded value.
}

/**
 *	@brief	Converts a UTF-16 sequence into its equivalent UTF-32 code point.
 *
 *	@param	utf32Dest	Receives the decoded value. When the second unit of a pair
 *						turns out to be invalid, it has already been set to the
 *						contribution of the first unit alone.
 *	@param	utf16Source	First unit of the sequence; the following unit is read too
 *						when the first is in 0xD800-0xDBFF.
 *
 *	@return	1 when a single unit was consumed, 2 when a surrogate pair was consumed.
 *	@return	FF_ERR_UNICODE_INVALID_SEQUENCE when a high surrogate is not followed
 *			by a unit in 0xDC00-0xDFFF.
 **/
FF_T_SINT32 FF_Utf16ctoUtf32c(FF_T_UINT32 *utf32Dest, const FF_T_UINT16 *utf16Source) {
	const FF_T_UINT16	usFirst = utf16Source[0];
	FF_T_UINT16			usSecond;

	if((usFirst & 0xFC00) == 0xD800) {
		// High surrogate: its 10 payload bits form the upper half of the offset above 0x10000.
		*utf32Dest = (((FF_T_UINT32) (usFirst & 0x03FF)) << 10) + 0x10000;

		usSecond = utf16Source[1];
		if((usSecond & 0xFC00) == 0xDC00) {
			*utf32Dest |= (FF_T_UINT32) (usSecond & 0x03FF);
			return 2;	// Two UTF-16 units make up the code point.
		}

		return FF_ERR_UNICODE_INVALID_SEQUENCE;	// Second unit is not a low surrogate.
	}

	// Not the start of a surrogate pair: the unit is the code point.
	*utf32Dest = (FF_T_UINT32) usFirst;
	return 1;
}


/*
	Returns the total number of UTF-16 items required to represent
	the provided UTF-32 string in UTF-16 form.
*/
/*
FF_T_UINT FF_Utf32GetUtf16Len(const FF_T_UINT32 *utf32String) {
	FF_T_UINT utf16len = 0;

	while(*utf32String) {
		if(*utf32String++ <= 0xFFFF) {
			utf16len++;
		} else {
			utf16len += 2;
		}
	}

	return utf16len;
}*/


// String conversions

/**
 *	@brief	Converts a zero-terminated UTF-32 string into a zero-terminated UTF-8 string.
 *
 *	Each character is converted to UTF-16 first and from there to UTF-8.
 *
 *	@param	Utf8String	Destination buffer. Its size is not passed in, so the caller
 *						must provide up to 4 bytes per source character plus one
 *						byte for the terminator.
 *	@param	Utf32String	Zero-terminated UTF-32 source string.
 *
 *	@return	0 on success.
 *	@return	The error code of the failing character conversion otherwise; the
 *			output is then terminated after the last character that converted.
 **/
FF_T_SINT32 FF_Utf32stoUtf8s(FF_T_UINT8 *Utf8String, FF_T_UINT32 *Utf32String) {
	int i = 0, y = 0;
	FF_T_SINT32 slResult;
	FF_T_UINT16 utf16buffer[2];

	while(Utf32String[i]) {
		// Convert to a UTF16 char. On failure utf16buffer holds nothing usable, so stop here.
		// NOTE(review): assumes the FF_ERR_UNICODE_* codes are negative, so they
		// cannot be mistaken for a unit count -- confirm against their definition.
		slResult = FF_Utf32ctoUtf16c(utf16buffer, Utf32String[i], 2);
		if(slResult < 0) {
			Utf8String[y] = '\0';
			return slResult;
		}

		// Now convert the UTF16 to a UTF8 sequence. Only a byte count may advance y;
		// adding an error code would move the write position backwards.
		slResult = FF_Utf16ctoUtf8c(&Utf8String[y], utf16buffer, 4);
		if(slResult < 0) {
			Utf8String[y] = '\0';
			return slResult;
		}

		y += slResult;
		i++;
	}

	Utf8String[y] = '\0';

	return 0;
}