git.lyx.org Git - lyx.git/blobdiff - src/support/unicode.C
* support/qstring_helpers.h: erase ucs4_to_qstring() method.
[lyx.git] / src / support / unicode.C
index b4b6385f3ae85c08de86e12fd84dbfe1251df8f6..a9d5a6a634a21a811201dce53b1c8d0fdb7abf80 100644 (file)
 
 #include <cerrno>
 #include <iomanip>
-#include <string>
+#include <map>
 
 using std::endl;
-using std::string;
 
 namespace {
 
-std::vector<char>
-iconv_convert(std::string const & tocode, std::string const & fromcode,
-             std::vector<char> const & buf)
+// Name of the iconv codeset used for UTF-16 data; the big- or
+// little-endian variant is picked by the WORDS_BIGENDIAN configuration
+// macro.
+#ifndef WORDS_BIGENDIAN
+       char const * utf16_codeset = "UTF16-LE";
+#else
+       char const * utf16_codeset = "UTF16-BE";
+#endif
+
+}
+
+
+namespace lyx {
+
+// Name of the iconv codeset used for UCS-4 data; the big- or
+// little-endian variant is picked by the WORDS_BIGENDIAN configuration
+// macro.
+#ifndef WORDS_BIGENDIAN
+       char const * ucs4_codeset = "UCS-4LE";
+#else
+       char const * ucs4_codeset = "UCS-4BE";
+#endif
+
+// The value iconv_open() hands back on failure; used in this file to mark
+// "no conversion descriptor is open".
+static iconv_t const invalid_cd = (iconv_t)(-1);
+
+
+/// Holds the iconv conversion descriptor that belongs to an IconvProcessor.
+struct IconvProcessor::Private {
+       /// Starts out without an open descriptor.
+       Private() : cd(invalid_cd) {}
+
+       /// Closes the descriptor if one is open. A failing iconv_close() is
+       /// only reported on lyxerr.
+       ~Private()
+       {
+               if (cd == invalid_cd)
+                       return;
+               if (iconv_close(cd) == -1)
+                       lyxerr << "Error returned from iconv_close("
+                              << errno << ")" << endl;
+       }
+
+       /// The conversion descriptor, or invalid_cd when none is open.
+       iconv_t cd;
+};
+
+
+/// Remembers the names of the target and source codesets. The iconv
+/// descriptor is not opened here: the fresh Private starts with
+/// invalid_cd.
+IconvProcessor::IconvProcessor(char const * tocode, char const * fromcode)
+       : tocode_(tocode),
+         fromcode_(fromcode),
+         pimpl_(new IconvProcessor::Private)
 {
-       if (buf.empty())
-               return std::vector<char>();
+}
+
 
-       iconv_t cd = iconv_open(tocode.c_str(), fromcode.c_str());
-       if (cd == (iconv_t)(-1)) {
-               lyxerr << "Error returned from iconv_open" << endl;
-               switch (errno) {
+/// Copies only the codeset names of \p other. The copy gets a Private of
+/// its own and therefore starts without an open conversion descriptor.
+IconvProcessor::IconvProcessor(IconvProcessor const & other)
+       : tocode_(other.tocode_),
+         fromcode_(other.fromcode_),
+         pimpl_(new IconvProcessor::Private)
+{}
+
+
+/// Takes over the codeset names of \p other and replaces the private data
+/// by a fresh Private, i.e. one without an open conversion descriptor.
+/// Self-assignment leaves the object untouched.
+IconvProcessor & IconvProcessor::operator=(IconvProcessor const & other)
+{
+       if (this != &other) {
+               tocode_ = other.tocode_;
+               fromcode_ = other.fromcode_;
+               pimpl_.reset(new Private);
+       }
+       return *this;
+}
+
+
+// Nothing is released explicitly here. Presumably pimpl_ is an owning
+// smart pointer (it offers reset()), so that ~Private() closes the
+// descriptor -- confirm against the class declaration.
+IconvProcessor::~IconvProcessor()
+{
+}
+
+
+/// Opens the iconv conversion descriptor for fromcode_ -> tocode_ unless
+/// one is already open.
+/// \return true if a descriptor is open afterwards, false if iconv_open()
+/// failed (the reason is written to lyxerr).
+bool IconvProcessor::init()
+{
+       if (pimpl_->cd != invalid_cd)
+               return true;
+
+       pimpl_->cd = iconv_open(tocode_.c_str(), fromcode_.c_str());
+       if (pimpl_->cd != invalid_cd)
+               return true;
+
+       // Remember errno right away: the stream output below may overwrite
+       // it before it is inspected.
+       int const err = errno;
+       lyxerr << "Error returned from iconv_open" << endl;
+       switch (err) {
                 case EINVAL:
-                       lyxerr << "EINVAL The conversion from " << fromcode
-                              << " to " << tocode
-                              << " is not supported by the implementation."
-                              << endl;
+                       lyxerr << "EINVAL The conversion from " << fromcode_
+                               << " to " << tocode_
+                               << " is not supported by the implementation."
+                               << endl;
                         break;
                 default:
-                       lyxerr << "\tSome other error: " << errno << endl;
+                       lyxerr << "\tSome other error: " << err << endl;
                         break;
-               }
         }
+       return false;
+}
 
-       char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(&buf[0]);
-       size_t inbytesleft = buf.size();
-       static char out[1000];
-       char * outbuf = out;
-       size_t outbytesleft = 1000;
 
-       size_t res = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+/// Converts \p buflen bytes starting at \p buf from fromcode_ to tocode_
+/// and stores the result in \p outbuf, which has room for \p maxoutsize
+/// bytes.
+/// \return the number of bytes written to \p outbuf (0 for empty input),
+/// or -1 if the conversion descriptor could not be opened or iconv()
+/// reported an error. After an iconv() error the cause is written to
+/// lyxerr and the descriptor is closed, so that the next call opens a
+/// new one.
+int IconvProcessor::convert(char const * buf, size_t buflen,
+               char * outbuf, size_t maxoutsize)
+{
+       if (buflen == 0)
+               return 0;
+
+       // init() returns true right away if the descriptor is already open.
+       if (!init())
+               return -1;
+
+       char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(buf);
+       size_t inbytesleft = buflen;
+       size_t outbytesleft = maxoutsize;
+
+       // iconv() returns a size_t and reports failure as (size_t)(-1);
+       // keep that type instead of narrowing the result to int.
+       size_t const res = iconv(pimpl_->cd, &inbuf, &inbytesleft,
+                                &outbuf, &outbytesleft);
+       // Remember errno now, the stream output below may overwrite it.
+       int const err = errno;
+
+       //lyxerr << std::dec;
+       //lyxerr << "Inbytesleft: " << inbytesleft << endl;
+       //lyxerr << "Outbytesleft: " << outbytesleft << endl;
 
-       if (res == (size_t)(-1)) {
-               lyxerr << "Error returned from iconv" << endl;
-               switch (errno) {
+       if (res != static_cast<size_t>(-1))
+               // Everything went well.
+               return static_cast<int>(maxoutsize - outbytesleft);
+
+       // There are some errors in the conversion
+       lyxerr << "Error returned from iconv" << endl;
+       switch (err) {
                 case E2BIG:
                         lyxerr << "E2BIG  There is not sufficient room at *outbuf." << endl;
                         break;
                 case EILSEQ:
                         lyxerr << "EILSEQ An invalid multibyte sequence"
-                              << " has been encountered in the input.\n"
-                              << "When converting from " << fromcode
-                              << " to " << tocode << ".\n";
-                       lyxerr << "Input: " << std::hex;
-                       for (size_t i = 0; i < buf.size(); ++i) {
-                               unsigned char const b = buf[i];
-                               lyxerr << "0x" << int(b) << " ";
+                               << " has been encountered in the input.\n"
+                               << "When converting from " << fromcode_
+                               << " to " << tocode_ << ".\n";
+                       lyxerr << "Input:" << std::hex;
+                       for (size_t i = 0; i < buflen; ++i) {
+                               // char may be signed, avoid output of
+                               // something like 0xffffffc2
+                               boost::uint32_t const b =
+                                       *reinterpret_cast<unsigned char const *>(buf + i);
+                               lyxerr << " 0x" << b;
                         }
-                       lyxerr << endl;
+                       // Switch back to decimal so that numbers sent to
+                       // lyxerr later on are not printed in hex.
+                       lyxerr << std::dec << endl;
                         break;
                 case EINVAL:
                         lyxerr << "EINVAL An incomplete multibyte sequence"
-                              << " has been encountered in the input.\n"
-                              << "When converting from " << fromcode
-                              << " to " << tocode << ".\n";
-                       lyxerr << "Input: " << std::hex;
-                       for (size_t i = 0; i < buf.size(); ++i) {
-                               unsigned char const b = buf[i];
-                               lyxerr << "0x" << int(b) << " ";
+                               << " has been encountered in the input.\n"
+                               << "When converting from " << fromcode_
+                               << " to " << tocode_ << ".\n";
+                       lyxerr << "Input:" << std::hex;
+                       for (size_t i = 0; i < buflen; ++i) {
+                               // char may be signed, avoid output of
+                               // something like 0xffffffc2
+                               boost::uint32_t const b =
+                                       *reinterpret_cast<unsigned char const *>(buf + i);
+                               lyxerr << " 0x" << b;
                         }
-                       lyxerr << endl;
+                       // Switch back to decimal so that numbers sent to
+                       // lyxerr later on are not printed in hex.
+                       lyxerr << std::dec << endl;
                         break;
                 default:
-                       lyxerr << "\tSome other error: " << errno << endl;
+                       lyxerr << "\tSome other error: " << err << endl;
                         break;
-               }
         }
-
-       if (iconv_close(cd) == -1) {
+       // We got an error so we close down the conversion engine
+       if (iconv_close(pimpl_->cd) == -1) {
                 lyxerr << "Error returned from iconv_close("
-                      << errno << ")" << endl;
+                       << errno << ")" << endl;
         }
+       pimpl_->cd = invalid_cd;
+       return -1;
+}
 
-       //lyxerr << std::dec;
-       //lyxerr << "Inbytesleft: " << inbytesleft << endl;
-       //lyxerr << "Outbytesleft: " << outbytesleft << endl;
-       int bytes = 1000 - outbytesleft;
 
-       std::vector<char> outvec(out, out + bytes);
-       return outvec;
-}
+namespace {
 
 
-std::vector<boost::uint32_t> bytes_to_ucs4(std::vector<char> const & bytes)
+/// Runs \p processor over the \p buflen elements of type InType starting
+/// at \p buf and returns the converted bytes as a vector of RetType.
+/// \return an empty vector for empty input and when the conversion
+/// fails or produces no output.
+/// Small inputs are converted into a static buffer shared by all calls
+/// of an instantiation; inputs that might not fit there get a heap
+/// buffer instead of failing for lack of room.
+template<typename RetType, typename InType>
+std::vector<RetType>
+iconv_convert(IconvProcessor & processor,
+             InType const * buf,
+             size_t buflen)
 {
-       //lyxerr << "Outbuf =" << std::hex;
+       if (buflen == 0)
+               return std::vector<RetType>();
 
-       std::vector<boost::uint32_t> ucs4;
-       for (size_t i = 0; i < bytes.size(); i += 4) {
-               unsigned char const b1 = bytes[i    ];
-               unsigned char const b2 = bytes[i + 1];
-               unsigned char const b3 = bytes[i + 2];
-               unsigned char const b4 = bytes[i + 3];
+       char const * inbuf = reinterpret_cast<char const *>(buf);
+       size_t inbytesleft = buflen * sizeof(InType);
 
-               boost::uint32_t c;
-               char * cc = reinterpret_cast<char *>(&c);
-#ifdef WORDS_BIGENDIAN
-               cc[0] = b1;
-               cc[1] = b2;
-               cc[2] = b3;
-               cc[3] = b4;
-#else
-               cc[3] = b1;
-               cc[2] = b2;
-               cc[1] = b3;
-               cc[0] = b4;
-#endif
+       size_t const static_size = 32768;
+       static char static_out[static_size];
+       // Reserve four output bytes per input byte once the static buffer
+       // could be too small; this covers e.g. one-byte input characters
+       // that turn into 4-byte UCS-4 values.
+       std::vector<char> heap_out;
+       char * out = static_out;
+       size_t outsize = static_size;
+       if (inbytesleft > static_size / 4) {
+               outsize = 4 * inbytesleft;
+               heap_out.resize(outsize);
+               out = &heap_out[0];
+       }
 
-               if (c > 0xffff) {
-                       lyxerr << "Strange ucs4 value encountered\n";
-                       lyxerr << "0x"
-                              << std::setw(2) << std::setfill('0') << int(b1)
-                              << std::setw(2) << std::setfill('0') << int(b2)
-                              << std::setw(2) << std::setfill('0') << int(b3)
-                              << std::setw(2) << std::setfill('0') << int(b4)
-                              << ' '
-                              << "(0x"
-                              << c
-                              << ") ";
-               }
+       int bytes = processor.convert(inbuf, inbytesleft, out, outsize);
+       if (bytes <= 0)
+               // Conversion failed
+               // FIXME Maybe throw an exception and handle that in the caller?
+               return std::vector<RetType>();
 
-               ucs4.push_back(c);
-       }
-       //lyxerr << endl;
-       return ucs4;
+       RetType const * tmp = reinterpret_cast<RetType const *>(out);
+       return std::vector<RetType>(tmp, tmp + bytes / sizeof(RetType));
 }
 
+} // anon namespace
 
-std::vector<unsigned short> bytes_to_ucs2(std::vector<char> const & bytes)
-{
-       //lyxerr << "Outbuf =" << std::hex;
 
-       std::vector<unsigned short> ucs2;
-       for (size_t i = 0; i < bytes.size(); i += 2) {
-               unsigned char const b1 = bytes[i    ];
-               unsigned char const b2 = bytes[i + 1];
+/// Converts the UTF-8 bytes in \p utf8str by handing them to the
+/// pointer/length overload; an empty vector yields an empty result.
+std::vector<lyx::char_type> utf8_to_ucs4(std::vector<char> const & utf8str)
+{
+       // &utf8str[0] may only be formed when there is at least one element.
+       bool const nothing_to_do = utf8str.empty();
 
-               unsigned short c;
-               char * cc = reinterpret_cast<char *>(&c);
-#ifdef WORDS_BIGENDIAN
-               cc[0] = b1;
-               cc[1] = b2;
-#else
-               cc[1] = b1;
-               cc[0] = b2;
-#endif
+       return nothing_to_do
+               ? std::vector<lyx::char_type>()
+               : utf8_to_ucs4(&utf8str[0], utf8str.size());
+}
 
-               //lyxerr << "0x"
-               //       << std::setw(2) << std::setfill('0') << int(b2)
-               //       << std::setw(2) << std::setfill('0') << int(b1)
-               //       << ' '
-               //       << "(0x"
-               //       << c
-               //       << ") ";
 
-               ucs2.push_back(c);
-       }
-       //lyxerr << endl;
-       return ucs2;
+/// Converts the \p ls bytes of UTF-8 starting at \p utf8str to the
+/// codeset named by ucs4_codeset.
+std::vector<lyx::char_type>
+utf8_to_ucs4(char const * utf8str, size_t ls)
+{
+       // A single converter object is kept for all calls of this function.
+       static IconvProcessor utf8_decoder(ucs4_codeset, "UTF-8");
+       return iconv_convert<lyx::char_type>(utf8_decoder, utf8str, ls);
 }
 
-} // anon namespace
-
 
-std::vector<boost::uint32_t> utf8_to_ucs4(std::vector<char> const & utf8str)
+/// Converts the \p ls 16-bit units starting at \p s from the codeset
+/// named by utf16_codeset to the one named by ucs4_codeset.
+std::vector<char_type>
+utf16_to_ucs4(unsigned short const * s, size_t ls)
 {
-       //lyxerr << "Buff = " << string(utf8str.begin(), utf8str.end())
-       //       << " (" << utf8str.size() << ")" << endl;
-       //lyxerr << "Res = " << string(res.begin(), res.end())
-       //       << " (" << res.size() << ")" << endl;
-
-       std::vector<char> res = iconv_convert("UCS-4BE", "UTF-8", utf8str);
-       return bytes_to_ucs4(res);
+       // A single converter object is kept for all calls of this function.
+       static IconvProcessor utf16_decoder(ucs4_codeset, utf16_codeset);
+       return iconv_convert<char_type>(utf16_decoder, s, ls);
 }
 
 
-std::vector<boost::uint32_t>
-ucs2_to_ucs4(std::vector<unsigned short> const & ucs2str)
+/// Converts the \p ls characters starting at \p s from the codeset named
+/// by ucs4_codeset to the one named by utf16_codeset.
+std::vector<unsigned short>
+ucs4_to_utf16(char_type const * s, size_t ls)
 {
-       // TODO: Simplify and speed up.
-       std::vector<char> in;
-       std::vector<unsigned short>::const_iterator cit = ucs2str.begin();
-       std::vector<unsigned short>::const_iterator end = ucs2str.end();
-       //lyxerr << std::hex;
-       for (; cit != end; ++cit) {
-               unsigned short s = *cit;
-               in.push_back(static_cast<char>((s & 0xff00) >> 8));
-               in.push_back(static_cast<char>(s & 0x00ff));
-               lyxerr << std::setw(2) << std::setfill('0') << ((s & 0xff00) >> 8) << endl;
-               lyxerr << std::setw(2) << std::setfill('0') << (s & 0x00ff) << endl;
-       }
-
-       std::vector<char> res = iconv_convert("UCS-4BE", "UCS-2BE", in);
-       return bytes_to_ucs4(res);
+       // A single converter object is kept for all calls of this function.
+       static IconvProcessor utf16_encoder(utf16_codeset, ucs4_codeset);
+       return iconv_convert<unsigned short>(utf16_encoder, s, ls);
 }
 
 
-std::vector<unsigned short>
-ucs4_to_ucs2(std::vector<boost::uint32_t> const & ucs4str)
+/// Converts the single character \p c from the codeset named by
+/// ucs4_codeset to UTF-8.
+std::vector<char>
+ucs4_to_utf8(lyx::char_type c)
 {
-       std::vector<char> in;
-       std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
-       std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
-       for (; cit != end; ++cit) {
-               boost::uint32_t s = *cit;
-               in.push_back(static_cast<char>((s & 0xff000000) >> 24));
-               in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
-               in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
-               in.push_back(static_cast<char>(s & 0x000000ff));
-       }
-       std::vector<char> res = iconv_convert("UCS-2BE", "UCS-4BE", in);
-       return bytes_to_ucs2(res);
+       // A single converter object is kept for all calls of this function.
+       static IconvProcessor utf8_encoder("UTF-8", ucs4_codeset);
+       // The character is passed as a sequence of length one.
+       return iconv_convert<char>(utf8_encoder, &c, 1);
 }
 
 
-std::vector<unsigned short>
-ucs4_to_ucs2(boost::uint32_t const * s, size_t ls)
+/// Converts the characters in \p ucs4str to UTF-8 by handing them to the
+/// pointer/length overload; an empty vector yields an empty result.
+std::vector<char>
+ucs4_to_utf8(std::vector<lyx::char_type> const & ucs4str)
 {
-       std::vector<char> in;
-       for (size_t i = 0; i < ls; ++i) {
-               in.push_back(static_cast<char>((s[i] & 0xff000000) >> 24));
-               in.push_back(static_cast<char>((s[i] & 0x00ff0000) >> 16));
-               in.push_back(static_cast<char>((s[i] & 0x0000ff00) >> 8));
-               in.push_back(static_cast<char>(s[i] & 0x000000ff));
-       }
-       std::vector<char> res = iconv_convert("UCS-2BE", "UCS-4BE", in);
-       return bytes_to_ucs2(res);
+       // &ucs4str[0] may only be formed when there is at least one element.
+       return ucs4str.empty()
+               ? std::vector<char>()
+               : ucs4_to_utf8(&ucs4str[0], ucs4str.size());
 }
 
 
-unsigned short
-ucs4_to_ucs2(boost::uint32_t c)
+/// Converts the \p ls characters starting at \p ucs4str from the codeset
+/// named by ucs4_codeset to UTF-8.
+std::vector<char>
+ucs4_to_utf8(lyx::char_type const * ucs4str, size_t ls)
 {
-       std::vector<char> in;
-       in.push_back(static_cast<char>((c & 0xff000000) >> 24));
-       in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
-       in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
-       in.push_back(static_cast<char>(c & 0x000000ff));
-       std::vector<char> res = iconv_convert("UCS-2BE", "UCS-4BE", in);
-       std::vector<unsigned short> us = bytes_to_ucs2(res);
-       if (!us.empty())
-               return us[0];
-       else
-               return 0xfffd; // unknown character
+       // A single converter object is kept for all calls of this function.
+       static IconvProcessor utf8_encoder("UTF-8", ucs4_codeset);
+       return iconv_convert<char>(utf8_encoder, ucs4str, ls);
 }
 
 
-std::vector<char> ucs4_to_utf8(std::vector<boost::uint32_t> const & ucs4str)
+/// Converts the \p ls bytes starting at \p s from the codeset named
+/// \p encoding to the one named by ucs4_codeset.
+/// One IconvProcessor per encoding name is created on first use and kept
+/// in a function-local map for later calls.
+std::vector<lyx::char_type>
+eightbit_to_ucs4(char const * s, size_t ls, std::string const & encoding)
 {
-       std::vector<char> in;
-       std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
-       std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
-       for (; cit != end; ++cit) {
-               boost::uint32_t s = *cit;
-               in.push_back(static_cast<char>((s & 0xff000000) >> 24));
-               in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
-               in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
-               in.push_back(static_cast<char>(s & 0x000000ff));
+       typedef std::map<std::string, IconvProcessor> Processors;
+       static Processors processors;
+       // Look the encoding up once and keep the iterator, instead of
+       // searching the map again with insert() and operator[].
+       Processors::iterator it = processors.find(encoding);
+       if (it == processors.end()) {
+               IconvProcessor processor(ucs4_codeset, encoding.c_str());
+               it = processors.insert(std::make_pair(encoding, processor)).first;
         }
-       std::vector<char> res = iconv_convert("UTF-8", "UCS-4BE", in);
-       return res;
+       return iconv_convert<char_type>(it->second, s, ls);
 }
 
 
-std::vector<char> ucs4_to_utf8(boost::uint32_t c)
+/// Converts the \p ls characters starting at \p ucs4str from the codeset
+/// named by ucs4_codeset to the one named \p encoding.
+/// One IconvProcessor per encoding name is created on first use and kept
+/// in a function-local map for later calls.
+std::vector<char>
+ucs4_to_eightbit(lyx::char_type const * ucs4str, size_t ls, std::string const & encoding)
 {
-       std::vector<char> in;
-       in.push_back(static_cast<char>((c & 0xff000000) >> 24));
-       in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
-       in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
-       in.push_back(static_cast<char>(c & 0x000000ff));
-       std::vector<char> res = iconv_convert("UTF-8", "UCS-4BE", in);
-       return res;
+       typedef std::map<std::string, IconvProcessor> Processors;
+       static Processors processors;
+       // Look the encoding up once and keep the iterator, instead of
+       // searching the map again with insert() and operator[].
+       Processors::iterator it = processors.find(encoding);
+       if (it == processors.end()) {
+               IconvProcessor processor(encoding.c_str(), ucs4_codeset);
+               it = processors.insert(std::make_pair(encoding, processor)).first;
+       }
+       return iconv_convert<char>(it->second, ucs4str, ls);
 }
+
+} // namespace lyx