/** * \file unicode.C * This file is part of LyX, the document processor. * Licence details can be found in the file COPYING. * * \author Lars Gullik Bjønnes * * Full author contact details are available in file CREDITS. * * A collection of unicode conversion functions, using iconv. */ #include #include "unicode.h" #include "debug.h" #include #include #include #include using std::endl; using std::string; namespace { std::vector iconv_convert(std::string const & tocode, std::string const & fromcode, std::vector const & buf) { if (buf.empty()) return std::vector(); iconv_t cd = iconv_open(tocode.c_str(), fromcode.c_str()); if (cd == (iconv_t)(-1)) { lyxerr << "Error returned from iconv_open" << endl; switch (errno) { case EINVAL: lyxerr << "EINVAL The conversion from " << fromcode << " to " << tocode << " is not supported by the implementation." << endl; break; default: lyxerr << "\tSome other error: " << errno << endl; break; } } char ICONV_CONST * inbuf = const_cast(&buf[0]); size_t inbytesleft = buf.size(); static char out[1000]; char * outbuf = out; size_t outbytesleft = 1000; size_t res = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); if (res == (size_t)(-1)) { lyxerr << "Error returned from iconv" << endl; switch (errno) { case E2BIG: lyxerr << "E2BIG There is not sufficient room at *outbuf." << endl; break; case EILSEQ: lyxerr << "EILSEQ An invalid multibyte sequence" << " has been encountered in the input.\n" << "When converting from " << fromcode << " to " << tocode << ".\n"; lyxerr << "Input: " << std::hex; for (size_t i = 0; i < buf.size(); ++i) { unsigned char const b = buf[i]; lyxerr << "0x" << int(b) << " "; } lyxerr << endl; break; case EINVAL: lyxerr << "EINVAL An incomplete multibyte sequence" << " has been encountered in the input.\n" << "When converting from " << fromcode << " to " << tocode << ".\n"; lyxerr << "Input: " << std::hex; for (size_t i = 0; i < buf.size(); ++i) { unsigned char const b = buf[i]; lyxerr << "0x" << int(b) << " "; } lyxerr << endl; break; default: lyxerr << "\tSome other error: " << errno << endl; break; } } if (iconv_close(cd) == -1) { lyxerr << "Error returned from iconv_close(" << errno << ")" << endl; } //lyxerr << std::dec; //lyxerr << "Inbytesleft: " << inbytesleft << endl; //lyxerr << "Outbytesleft: " << outbytesleft << endl; int bytes = 1000 - outbytesleft; std::vector outvec(out, out + bytes); return outvec; } std::vector bytes_to_ucs4(std::vector const & bytes) { //lyxerr << "Outbuf =" << std::hex; std::vector ucs4; for (size_t i = 0; i < bytes.size(); i += 4) { unsigned char const b1 = bytes[i ]; unsigned char const b2 = bytes[i + 1]; unsigned char const b3 = bytes[i + 2]; unsigned char const b4 = bytes[i + 3]; boost::uint32_t c; char * cc = reinterpret_cast(&c); cc[3] = b1; cc[2] = b2; cc[1] = b3; cc[0] = b4; if (c > 0xffff) { lyxerr << "Strange ucs4 value encountered\n"; lyxerr << "0x" << std::setw(2) << std::setfill('0') << int(b1) << std::setw(2) << std::setfill('0') << int(b2) << std::setw(2) << std::setfill('0') << int(b3) << std::setw(2) << std::setfill('0') << int(b4) << ' ' << "(0x" << c << ") "; } ucs4.push_back(c); } //lyxerr << endl; return ucs4; } std::vector bytes_to_ucs2(std::vector const & bytes) { //lyxerr << "Outbuf =" << std::hex; std::vector ucs2; for (size_t i = 0; i < bytes.size(); i += 2) { unsigned char const b1 = bytes[i ]; unsigned char const b2 = bytes[i + 1]; unsigned short c; char * cc = reinterpret_cast(&c); cc[0] = b1; cc[1] = b2; //lyxerr << "0x" // << std::setw(2) << std::setfill('0') << int(b2) // << std::setw(2) << std::setfill('0') << int(b1) // << ' ' // << "(0x" // << c // << ") "; ucs2.push_back(c); } //lyxerr << endl; return ucs2; } } // anon namespace std::vector utf8_to_ucs4(std::vector const & utf8str) { //lyxerr << "Buff = " << string(utf8str.begin(), utf8str.end()) // << " (" << utf8str.size() << ")" << endl; //lyxerr << "Res = " << string(res.begin(), res.end()) // << " (" << res.size() << ")" << endl; std::vector res = iconv_convert("UCS-4", "UTF-8", utf8str); return bytes_to_ucs4(res); } std::vector ucs2_to_ucs4(std::vector const & ucs2str) { // TODO: Simplify and speed up. std::vector in; std::vector::const_iterator cit = ucs2str.begin(); std::vector::const_iterator end = ucs2str.end(); //lyxerr << std::hex; for (; cit != end; ++cit) { unsigned short s = *cit; in.push_back(static_cast(s & 0x00ff)); in.push_back(static_cast((s & 0xff00) >> 8)); lyxerr << std::setw(2) << std::setfill('0') << (s & 0x00ff) << endl; lyxerr << std::setw(2) << std::setfill('0') << ((s & 0xff00) >> 8) << endl; } std::vector res = iconv_convert("UCS-4", "UCS-2", in); return bytes_to_ucs4(res); } std::vector ucs4_to_ucs2(std::vector const & ucs4str) { std::vector in; std::vector::const_iterator cit = ucs4str.begin(); std::vector::const_iterator end = ucs4str.end(); for (; cit != end; ++cit) { boost::uint32_t s = *cit; in.push_back(static_cast((s & 0xff000000) >> 24)); in.push_back(static_cast((s & 0x00ff0000) >> 16)); in.push_back(static_cast((s & 0x0000ff00) >> 8)); in.push_back(static_cast(s & 0x000000ff)); } std::vector res = iconv_convert("UCS-2", "UCS-4", in); return bytes_to_ucs2(res); } std::vector ucs4_to_ucs2(boost::uint32_t const * s, size_t ls) { std::vector in; for (size_t i = 0; i < ls; ++i) { in.push_back(static_cast((s[i] & 0xff000000) >> 24)); in.push_back(static_cast((s[i] & 0x00ff0000) >> 16)); in.push_back(static_cast((s[i] & 0x0000ff00) >> 8)); in.push_back(static_cast(s[i] & 0x000000ff)); } std::vector res = iconv_convert("UCS-2", "UCS-4", in); return bytes_to_ucs2(res); } unsigned short ucs4_to_ucs2(boost::uint32_t c) { std::vector in; in.push_back(static_cast((c & 0xff000000) >> 24)); in.push_back(static_cast((c & 0x00ff0000) >> 16)); in.push_back(static_cast((c & 0x0000ff00) >> 8)); in.push_back(static_cast(c & 0x000000ff)); std::vector res = iconv_convert("UCS-2", "UCS-4", in); std::vector us = bytes_to_ucs2(res); if (!us.empty()) return us[0]; else return 0xfffd; // unknown character } std::vector ucs4_to_utf8(std::vector const & ucs4str) { std::vector in; std::vector::const_iterator cit = ucs4str.begin(); std::vector::const_iterator end = ucs4str.end(); for (; cit != end; ++cit) { boost::uint32_t s = *cit; in.push_back(static_cast((s & 0xff000000) >> 24)); in.push_back(static_cast((s & 0x00ff0000) >> 16)); in.push_back(static_cast((s & 0x0000ff00) >> 8)); in.push_back(static_cast(s & 0x000000ff)); } std::vector res = iconv_convert("UTF-8", "UCS-4", in); return res; } std::vector ucs4_to_utf8(boost::uint32_t c) { std::vector in; in.push_back(static_cast((c & 0xff000000) >> 24)); in.push_back(static_cast((c & 0x00ff0000) >> 16)); in.push_back(static_cast((c & 0x0000ff00) >> 8)); in.push_back(static_cast(c & 0x000000ff)); std::vector res = iconv_convert("UTF-8", "UCS-4", in); return res; }