/*
 * Usage: utf16le inputfile outputfile [nobom]
 *
 * This is a tool and is compiled using the host compiler,
 * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
 * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
 * to utf-16 LE and especially made for automatic conversions of
 * INF-files from utf-8 to utf-16LE (so we can furthermore
 * store the INF files in utf-8 for subversion).
 *
 * If the optional third argument is "nobom" (case-insensitive), no
 * byte order mark is written to the output file.
 *
 * Author: Matthias Kupfer (mkupfer@reactos.org)
 */

#include <fstream>
#include <iostream>
#include <string>
#include <string.h>
#ifndef _MSC_VER
#include <strings.h> // strcasecmp
#endif

//#define DISPLAY_DETECTED_UNICODE

using namespace std;

#ifdef _MSC_VER
#define strcasecmp _stricmp
#endif

// Reads a text file in one of several UTF encodings and writes it out
// as UTF-16LE.
class utf_converter
{
public:
    // detect can detect utf-8 and both utf-16 variants, but assume utf-32 only
    // due to ambiguous BOM
    enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
    enum err_types { none, iopen, oopen, eof, read, write, decode };
    enum bom_types { bom, nobom };

protected:
    // Result of an attempt to decode one code point from the input.
    enum read_status
    {
        rs_ok,      // a code point was decoded
        rs_end,     // no more (complete) input available
        rs_invalid  // input was consumed but could not be decoded
    };

    err_types error;
    enc_types encoding;
    bom_types bom_type;
    unsigned char buffer[4], index; // need 4 char buffer for optional BOM handling
    std::streamsize fill;           // number of unread bytes left in buffer
    fstream inputfile, outputfile;
    static const unsigned char utf8table[64];

    // Reads four bytes and assembles them according to the current encoding
    // (little endian for utf32le, big endian otherwise).
    // Returns the number of bytes that could be read; v is only written
    // when all four bytes were available.
    std::streamsize readUInt32(unsigned long &v)
    {
        unsigned char c[4];
        for (int i = 0; i < 4; i++)
        {
            if (!getByte(c[i]))
                return i;
        }

        // assemble on unsigned values so no bit is shifted into a sign bit
        const unsigned long b0 = c[0], b1 = c[1], b2 = c[2], b3 = c[3];
        if (encoding == utf32le)
            v = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
        else
            v = b3 | (b2 << 8) | (b1 << 16) | (b0 << 24);
        return 4;
    }

    // Decodes the next code point from the input into cp.
    // The value is kept in an unsigned long because wchar_t is only 16 bits
    // wide with some host compilers and cannot hold every code point.
    // cp is only meaningful when rs_ok is returned.
    read_status readCodePoint(unsigned long &cp)
    {
        if (encoding == detect) // if still unknown
            encoding = utf8;    // assume utf8 as default

        switch (encoding)
        {
            case detect:
            case utf8:
            {
                unsigned char lead, c;
                if (!getByte(lead))
                    return rs_end;
                if (!(lead & 0x80))
                {
                    cp = lead;
                    return rs_ok;
                }
                if ((lead & 0xc0) != 0xc0)
                {
                    cerr << "UTF-8 Error: invalid data byte" << endl;
                    return rs_invalid;
                }
                // table for 64 bytes (all 11xxxxxx resp. >=192)
                // resulting byte is determined:
                // lower 3 bits: number of following bytes
                // upper 5 bits: data filled with 0
                const unsigned char entry = utf8table[lead & 0x3f];
                const unsigned int following = entry & 7;
                cp = entry >> 3;
                for (unsigned int i = 0; i < following; i++)
                {
                    if (!getByte(c))
                        return rs_end; // truncated sequence at end of input
                    cp = (cp << 6) | (c & 0x3f);
                }
                // Sequences with more than three following bytes cannot
                // encode a valid code point; they are consumed as a whole
                // and reported as undecodable.
                if (following > 3)
                    return rs_invalid;
                return rs_ok;
            }
            case utf16le:
            case utf16be:
            {
                unsigned short w, w2;
                if (getWord(w) != 2)
                    return rs_end;
                if ((w & 0xfc00) == 0xd800) // high surrogate first
                {
                    if (getWord(w2) != 2)
                        return rs_end;
                    if ((w2 & 0xfc00) != 0xdc00)
                    {
                        cerr << "UTF-16 Error: invalid low surrogate" << endl;
                        return rs_invalid;
                    }
                    cp = 0x10000UL + ((w & 0x3ffUL) << 10) + (w2 & 0x3ffUL);
                    return rs_ok;
                }
                cp = w;
                return rs_ok;
            }
            case utf32le:
            case utf32be:
                if (readUInt32(cp) != 4)
                    return rs_end;
                return rs_ok;
        }
        return rs_end;
    }

    // Writes one UTF-16 code unit to the output, low byte first.
    void putUnit(unsigned long unit)
    {
        const char bytes[2] = {
            static_cast<char>(unit & 0xff),
            static_cast<char>((unit >> 8) & 0xff)
        };
        outputfile.write(bytes, 2);
    }

public:
    // Opens both files and determines the input encoding.
    //   ifname - path of the file to read
    //   ofname - path of the file to write
    //   ofbom  - whether convert2utf16le() writes a BOM
    //   enc    - input encoding; with detect the result of getBOM() is used,
    //            otherwise the given encoding is forced
    // On failure to open a file, getError() returns iopen or oopen.
    utf_converter(string ifname, string ofname, bom_types ofbom = bom, enc_types enc = detect)
        : error(none), encoding(enc), bom_type(ofbom), index(0), fill(0)
    {
        inputfile.open(ifname.c_str(), ios::in | ios::binary);
        if (!inputfile)
        {
            error = iopen;
            return;
        }
        outputfile.open(ofname.c_str(), ios::out | ios::binary);
        if (!outputfile)
        {
            error = oopen;
            return;
        }
        const enc_types tmp_enc = getBOM();
        if (enc != detect)
        {
            if (enc != tmp_enc)
                cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << endl;
        }
        else
            encoding = tmp_enc;
    }

    // Returns the current error state (none if no error was recorded).
    err_types getError() const
    {
        return error;
    }

    // Reads up to four bytes from the input into the look-ahead buffer and
    // checks them for a byte order mark. BOM bytes are skipped, all other
    // buffered bytes stay available to getByte().
    // Returns the detected encoding; utf8 if no BOM was found.
    enc_types getBOM()
    {
        index = 0;
        /* first byte can also detect with:
        if ((buffer[0] & 0x11) || !buffer[0]))
        valid values are 0xef, 0xff, 0xfe, 0x00
        */
        inputfile.read(reinterpret_cast<char *>(buffer), 4);
        fill = inputfile.gcount();
        // stupid utf8 bom
        if ((fill > 2) &&
            (buffer[0] == 0xef) &&
            (buffer[1] == 0xbb) &&
            (buffer[2] == 0xbf))
        {
            index += 3;
            fill -= 3;
#ifdef DISPLAY_DETECTED_UNICODE
            cerr << "UTF-8 BOM found" << endl;
#endif
            return utf8;
        }
        if ((fill > 1) &&
            (buffer[0] == 0xfe) &&
            (buffer[1] == 0xff))
        {
            index += 2;
            fill -= 2;
#ifdef DISPLAY_DETECTED_UNICODE
            cerr << "UTF-16BE BOM found" << endl;
#endif
            return utf16be;
        }
        if ((fill > 1) &&
            (buffer[0] == 0xff) &&
            (buffer[1] == 0xfe))
        {
            if ((fill == 4) &&
                (buffer[2] == 0x00) &&
                (buffer[3] == 0x00))
            {
                cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << endl;
                fill = 0;
                index = 0;
                return utf32le;
            }
            fill -= 2;
            index += 2;
#ifdef DISPLAY_DETECTED_UNICODE
            cerr << "UTF-16LE BOM found" << endl;
#endif
            return utf16le;
        }
        if ((fill == 4) &&
            (buffer[0] == 0x00) &&
            (buffer[1] == 0x00) &&
            (buffer[2] == 0xfe) &&
            (buffer[3] == 0xff))
        {
            fill = 0;
            index = 0;
#ifdef DISPLAY_DETECTED_UNICODE
            cerr << "UTF-32BE BOM found" << endl;
#endif
            return utf32be;
        }
        return utf8; // no valid bom so use utf8 as default
    }

    // Delivers the next input byte in c, taking it from the look-ahead
    // buffer as long as that still holds unread bytes and from the file
    // afterwards. Returns the number of bytes delivered (1 or 0).
    std::streamsize getByte(unsigned char &c)
    {
        if (fill)
        {
            index %= 4;
            --fill;
            c = buffer[index++];
            return 1;
        }
        inputfile.read(reinterpret_cast<char *>(&c), 1);
        return inputfile.gcount();
    }

    // Reads a 16 bit value in the byte order of the current encoding
    // (little endian for utf16le, big endian otherwise).
    // Returns the number of bytes read; w is only written when the result is 2.
    std::streamsize getWord(unsigned short &w)
    {
        unsigned char c[2];
        if (!getByte(c[0]))
            return 0;
        if (!getByte(c[1]))
            return 1;
        if (encoding == utf16le)
            w = static_cast<unsigned short>(c[0] | (c[1] << 8));
        else
            w = static_cast<unsigned short>(c[1] | (c[0] << 8));
        return 2;
    }

    // Reads a 32 bit value in the byte order of the current encoding.
    // Returns the number of bytes read; d is only written when the result is 4.
    // Note: the value is truncated where wchar_t is narrower than 32 bits.
    std::streamsize getDWord(wchar_t &d)
    {
        unsigned long v;
        const std::streamsize count = readUInt32(v);
        if (count == 4)
            d = static_cast<wchar_t>(v);
        return count;
    }

    // Returns the next decoded character, or wchar_t(-1) if the input is
    // exhausted or could not be decoded.
    // Note: the value is truncated where wchar_t is narrower than the
    // decoded code point.
    wchar_t get_wchar_t()
    {
        unsigned long cp;
        if (readCodePoint(cp) != rs_ok)
            return wchar_t(-1);
        return static_cast<wchar_t>(cp);
    }

    // Converts the remaining input to UTF-16LE and writes it to the output
    // file, preceded by a BOM if requested at construction.
    // Code points above U+FFFF are written as surrogate pairs; undecodable
    // input and values above U+10FFFF are written as U+FFFD.
    // A failed write is recorded and can be queried with getError().
    void convert2utf16le()
    {
        if (bom_type == bom)
        {
            const char bom_bytes[2] = { static_cast<char>(0xff), static_cast<char>(0xfe) };
            outputfile.write(bom_bytes, 2); // write BOM
        }

        unsigned long cp = 0;
        read_status status;
        // Loop on the decoder result instead of inputfile.eof(): bytes still
        // held in the look-ahead buffer must be converted even if the file
        // itself already hit its end, and a failing read must stop the loop.
        while ((status = readCodePoint(cp)) != rs_end)
        {
            if (status == rs_invalid || cp > 0x10ffffUL)
                cp = 0xfffdUL; // replacement character

            if (cp > 0xffffUL)
            {
                const unsigned long v = cp - 0x10000UL;
                putUnit(0xd800UL | (v >> 10));   // high surrogate
                putUnit(0xdc00UL | (v & 0x3ffUL)); // low surrogate
            }
            else
                putUnit(cp);
        }

        outputfile.flush();
        if (!outputfile && error == none)
            error = write;
    }

    ~utf_converter()
    {
        if (inputfile)
            inputfile.close();
        if (outputfile)
            outputfile.close();
    }
};

// Indexed by the lower 6 bits of a UTF-8 lead byte (0xc0..0xff):
// lower 3 bits = number of following bytes, upper 5 bits = data bits.
const unsigned char utf_converter::utf8table[64] = {
    1, 9, 17, 25, 33, 41, 49, 57,
    65, 73, 81, 89, 97, 105, 113, 121,
    129, 137, 145, 153, 161, 169, 177, 185,
    193, 201, 209, 217, 225, 233, 241, 249,
    2, 10, 18, 26, 34, 42, 50, 58,
    66, 74, 82, 90, 98, 106, 114, 122,
    3, 11, 19, 27, 35, 43, 51, 59,
    4, 12, 20, 28, 5, 13, 6, 7
};

// Entry point: converts argv[1] to UTF-16LE in argv[2].
// Returns 0 on success and -1 on a usage error or I/O failure.
int main(int argc, char* argv[])
{
    if (argc < 3)
    {
        cout << "usage: " << argv[0] << " inputfile outputfile" << endl;
        return -1;
    }

    utf_converter::bom_types bom_type = utf_converter::bom;
    if (argc == 4 && strcasecmp(argv[3], "nobom") == 0)
    {
        bom_type = utf_converter::nobom;
    }

    utf_converter conv(argv[1], argv[2], bom_type);
    utf_converter::err_types err = conv.getError();
    if (err == utf_converter::none)
    {
        conv.convert2utf16le();
        err = conv.getError();
    }

    switch (err)
    {
        case utf_converter::none:
            return 0;
        case utf_converter::iopen:
            cerr << "Couldn't open input file." << endl;
            break;
        case utf_converter::oopen:
            cerr << "Couldn't open output file." << endl;
            break;
        case utf_converter::write:
            cerr << "Couldn't write output file." << endl;
            break;
        default:
            cerr << "Unknown error." << endl;
    }
    return -1;
}