src/support/unicode.cpp

   1 /**
   2  * \file unicode.cpp
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Lars Gullik Bjønnes
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  *
  10  * A collection of unicode conversion functions, using iconv.
  11  */
  12
  13 #include <config.h>
  14
  15 #include "support/unicode.h"
  16 #include "support/debug.h"
  17 #include "support/mutex.h"
  18
  19 #include <iconv.h>
  20
  21 #include <boost/cstdint.hpp>
  22
  23 #include <cerrno>
  24 #include <map>
  25 #include <ostream>
  26 #include <string>
  27
  28
  29 using namespace std;
  30
  31 namespace {
  32
  33 #ifdef WORDS_BIGENDIAN
  34         char const * utf16_codeset = "UTF16-BE";
  35 #else
  36         char const * utf16_codeset = "UTF16-LE";
  37 #endif
  38
  39 }
  40
  41
  42 namespace lyx {
  43
  44 #ifdef WORDS_BIGENDIAN
  45         char const * ucs4_codeset = "UCS-4BE";
  46 #else
  47         char const * ucs4_codeset = "UCS-4LE";
  48 #endif
  49
  50 static const iconv_t invalid_cd = (iconv_t)(-1);
  51
  52
  53 struct IconvProcessor::Impl
  54 {
  55         Impl(string const & to, string const & from)
  56                 : cd(invalid_cd), tocode_(to), fromcode_(from)
  57         {}
  58
  59         ~Impl()
  60         {
  61                 if (cd != invalid_cd && iconv_close(cd) == -1)
  62                                 LYXERR0("Error returned from iconv_close(" << errno << ")");
  63         }
  64
  65         iconv_t cd;
  66         string tocode_;
  67         string fromcode_;
  68
  69         Mutex mutex_; // iconv() is not thread save, see #7240
  70 };
  71
  72
  73 IconvProcessor::IconvProcessor(char const * tocode, char const * fromcode)
  74         : pimpl_(new IconvProcessor::Impl(tocode, fromcode))
  75 {
  76 }
  77
  78
  79 IconvProcessor::IconvProcessor(IconvProcessor const & other)
  80         : pimpl_(new IconvProcessor::Impl(other.pimpl_->tocode_, other.pimpl_->fromcode_))
  81 {
  82 }
  83
  84
  85 IconvProcessor::~IconvProcessor()
  86 {
  87         delete pimpl_;
  88 }
  89
  90
  91 void IconvProcessor::operator=(IconvProcessor const & other)
  92 {
  93         if (&other != this)
  94                 pimpl_ = new Impl(other.pimpl_->tocode_, other.pimpl_->fromcode_);
  95 }
  96
  97
  98 bool IconvProcessor::init()
  99 {
 100         if (pimpl_->cd != invalid_cd)
 101                 return true;
 102
 103         pimpl_->cd = iconv_open(pimpl_->tocode_.c_str(), pimpl_->fromcode_.c_str());
 104         if (pimpl_->cd != invalid_cd)
 105                 return true;
 106
 107         lyxerr << "Error returned from iconv_open" << endl;
 108         switch (errno) {
 109                 case EINVAL:
 110                         lyxerr << "EINVAL The conversion from " << pimpl_->fromcode_
 111                                 << " to " << pimpl_->tocode_
 112                                 << " is not supported by the implementation."
 113                                 << endl;
 114                         break;
 115                 default:
 116                         lyxerr << "\tSome other error: " << errno << endl;
 117                         break;
 118         }
 119         return false;
 120 }
 121
 122
 123 int IconvProcessor::convert(char const * buf, size_t buflen,
 124                 char * outbuf, size_t maxoutsize)
 125 {
 126         Mutex::Locker lock(&pimpl_->mutex_);
 127
 128         if (buflen == 0)
 129                 return 0;
 130
 131         if (pimpl_->cd == invalid_cd) {
 132                 if (!init())
 133                         return -1;
 134         }
 135
 136         char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(buf);
 137         size_t inbytesleft = buflen;
 138         size_t outbytesleft = maxoutsize;
 139
 140         int res = iconv(pimpl_->cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
 141
 142         // flush out remaining data. This is needed because iconv sometimes
 143         // holds back chars in the stream, waiting for a combination character
 144         // (see e.g. http://sources.redhat.com/bugzilla/show_bug.cgi?id=1124)
 145         iconv(pimpl_->cd, NULL, NULL, &outbuf, &outbytesleft);
 146
 147         //lyxerr << dec;
 148         //lyxerr << "Inbytesleft: " << inbytesleft << endl;
 149         //lyxerr << "Outbytesleft: " << outbytesleft << endl;
 150
 151         if (res != -1)
 152                 // Everything went well.
 153                 return maxoutsize - outbytesleft;
 154
 155         // There are some errors in the conversion
 156         lyxerr << "Error returned from iconv" << endl;
 157         switch (errno) {
 158                 case E2BIG:
 159                         lyxerr << "E2BIG  There is not sufficient room at *outbuf." << endl;
 160                         break;
 161                 case EILSEQ:
 162                         lyxerr << "EILSEQ An invalid multibyte sequence"
 163                                 << " has been encountered in the input.\n"
 164                                 << "When converting from " << pimpl_->fromcode_
 165                                 << " to " << pimpl_->tocode_ << ".\n";
 166                         lyxerr << "Input:" << hex;
 167                         for (size_t i = 0; i < buflen; ++i) {
 168                                 // char may be signed, avoid output of
 169                                 // something like 0xffffffc2
 170                                 boost::uint32_t const b =
 171                                         *reinterpret_cast<unsigned char const *>(buf + i);
 172                                 lyxerr << " 0x" << (unsigned int)b;
 173                         }
 174                         lyxerr << dec << endl;
 175                         break;
 176                 case EINVAL:
 177                         lyxerr << "EINVAL An incomplete multibyte sequence"
 178                                 << " has been encountered in the input.\n"
 179                                 << "When converting from " << pimpl_->fromcode_
 180                                 << " to " << pimpl_->tocode_ << ".\n";
 181                         lyxerr << "Input:" << hex;
 182                         for (size_t i = 0; i < buflen; ++i) {
 183                                 // char may be signed, avoid output of
 184                                 // something like 0xffffffc2
 185                                 boost::uint32_t const b =
 186                                         *reinterpret_cast<unsigned char const *>(buf + i);
 187                                 lyxerr << " 0x" << (unsigned int)b;
 188                         }
 189                         lyxerr << dec << endl;
 190                         break;
 191                 default:
 192                         lyxerr << "\tSome other error: " << errno << endl;
 193                         break;
 194         }
 195         // We got an error so we close down the conversion engine
 196         if (iconv_close(pimpl_->cd) == -1) {
 197                 lyxerr << "Error returned from iconv_close("
 198                         << errno << ")" << endl;
 199         }
 200         pimpl_->cd = invalid_cd;
 201         return -1;
 202 }
 203
 204
 205 std::string IconvProcessor::from() const
 206 {
 207         return pimpl_->fromcode_;
 208 }
 209
 210
 211 std::string IconvProcessor::to() const
 212 {
 213         return pimpl_->tocode_;
 214 }
 215
 216
 217 namespace {
 218
 219
 220 template<typename RetType, typename InType>
 221 vector<RetType>
 222 iconv_convert(IconvProcessor & processor, InType const * buf, size_t buflen)
 223 {
 224         if (buflen == 0)
 225                 return vector<RetType>();
 226
 227         char const * inbuf = reinterpret_cast<char const *>(buf);
 228         size_t inbytesleft = buflen * sizeof(InType);
 229
 230         static std::vector<char> outbuf(32768);
 231         // The number of UCS4 code points in buf is at most inbytesleft.
 232         // The output encoding will use at most
 233         // max_encoded_bytes(pimpl_->tocode_) per UCS4 code point.
 234         size_t maxoutbufsize = max_encoded_bytes(processor.to()) * inbytesleft;
 235         if (outbuf.size() < maxoutbufsize)
 236                 outbuf.resize(maxoutbufsize);
 237
 238         int bytes = processor.convert(inbuf, inbytesleft, &outbuf[0], outbuf.size());
 239         if (bytes <= 0)
 240                 // Conversion failed
 241                 // FIXME Maybe throw an exception and handle that in the caller?
 242                 return vector<RetType>();
 243
 244         RetType const * tmp = reinterpret_cast<RetType const *>(&outbuf[0]);
 245         return vector<RetType>(tmp, tmp + bytes / sizeof(RetType));
 246 }
 247
 248 } // anon namespace
 249
 250
 251 vector<char_type> utf8_to_ucs4(vector<char> const & utf8str)
 252 {
 253         if (utf8str.empty())
 254                 return vector<char_type>();
 255
 256         return utf8_to_ucs4(&utf8str[0], utf8str.size());
 257 }
 258
 259
 260 vector<char_type>
 261 utf8_to_ucs4(char const * utf8str, size_t ls)
 262 {
 263         static IconvProcessor processor(ucs4_codeset, "UTF-8");
 264         return iconv_convert<char_type>(processor, utf8str, ls);
 265 }
 266
 267
 268 vector<char_type>
 269 utf16_to_ucs4(unsigned short const * s, size_t ls)
 270 {
 271         static IconvProcessor processor(ucs4_codeset, utf16_codeset);
 272         return iconv_convert<char_type>(processor, s, ls);
 273 }
 274
 275
 276 vector<unsigned short>
 277 ucs4_to_utf16(char_type const * s, size_t ls)
 278 {
 279         static IconvProcessor processor(utf16_codeset, ucs4_codeset);
 280         return iconv_convert<unsigned short>(processor, s, ls);
 281 }
 282
 283
 284 vector<char>
 285 ucs4_to_utf8(char_type c)
 286 {
 287         static IconvProcessor processor("UTF-8", ucs4_codeset);
 288         return iconv_convert<char>(processor, &c, 1);
 289 }
 290
 291
 292 vector<char>
 293 ucs4_to_utf8(vector<char_type> const & ucs4str)
 294 {
 295         if (ucs4str.empty())
 296                 return vector<char>();
 297
 298         return ucs4_to_utf8(&ucs4str[0], ucs4str.size());
 299 }
 300
 301
 302 vector<char>
 303 ucs4_to_utf8(char_type const * ucs4str, size_t ls)
 304 {
 305         static IconvProcessor processor("UTF-8", ucs4_codeset);
 306         return iconv_convert<char>(processor, ucs4str, ls);
 307 }
 308
 309
 310 vector<char_type>
 311 eightbit_to_ucs4(char const * s, size_t ls, string const & encoding)
 312 {
 313         static map<string, IconvProcessor> processors;
 314         if (processors.find(encoding) == processors.end()) {
 315                 IconvProcessor processor(ucs4_codeset, encoding.c_str());
 316                 processors.insert(make_pair(encoding, processor));
 317         }
 318         return iconv_convert<char_type>(processors[encoding], s, ls);
 319 }
 320
 321
 322 vector<char>
 323 ucs4_to_eightbit(char_type const * ucs4str, size_t ls, string const & encoding)
 324 {
 325         static map<string, IconvProcessor> processors;
 326         if (processors.find(encoding) == processors.end()) {
 327                 IconvProcessor processor(encoding.c_str(), ucs4_codeset);
 328                 processors.insert(make_pair(encoding, processor));
 329         }
 330         return iconv_convert<char>(processors[encoding], ucs4str, ls);
 331 }
 332
 333
 334 char ucs4_to_eightbit(char_type ucs4, string const & encoding)
 335 {
 336         static map<string, IconvProcessor> processors;
 337         map<string, IconvProcessor>::iterator it = processors.find(encoding);
 338         if (it == processors.end()) {
 339                 IconvProcessor processor(encoding.c_str(), ucs4_codeset);
 340                 it = processors.insert(make_pair(encoding, processor)).first;
 341         }
 342
 343         char out;
 344         int const bytes = it->second.convert((char *)(&ucs4), 4, &out, 1);
 345         if (bytes > 0)
 346                 return out;
 347         return 0;
 348 }
 349
 350
 351 void ucs4_to_multibytes(char_type ucs4, vector<char> & out,
 352         string const & encoding)
 353 {
 354         static map<string, IconvProcessor> processors;
 355         map<string, IconvProcessor>::iterator it = processors.find(encoding);
 356         if (it == processors.end()) {
 357                 IconvProcessor processor(encoding.c_str(), ucs4_codeset);
 358                 it = processors.insert(make_pair(encoding, processor)).first;
 359         }
 360
 361         out.resize(4);
 362         int bytes = it->second.convert((char *)(&ucs4), 4, &out[0], 4);
 363         if (bytes > 0)
 364                 out.resize(bytes);
 365         else
 366                 out.clear();
 367 }
 368
 369 int max_encoded_bytes(std::string const & encoding)
 370 {
 371         // FIXME: this information should be transferred to lib/encodings
 372         // UTF8 uses at most 4 bytes to represent one UCS4 code point
 373         // (see RFC 3629). RFC 2279 specifies 6 bytes, but that
 374         // information is outdated, and RFC 2279 has been superseded by
 375         // RFC 3629.
 376         // The CJK encodings use (different) multibyte representation as well.
 377         // All other encodings encode one UCS4 code point in one byte
 378         // (and can therefore only encode a subset of UCS4)
 379         // Note that BIG5 and SJIS do not work with LaTeX (see lib/encodings).
 380         // Furthermore, all encodings that use shifting (like SJIS) do not work with
 381         // iconv_codecvt_facet.
 382         if (encoding == "UTF-8" ||
 383             encoding == "GB" ||
 384             encoding == "EUC-TW")
 385                 return 4;
 386         else if (encoding == "EUC-JP")
 387                 return 3;
 388         else if (encoding == "ISO-2022-JP")
 389                 return 8;
 390         else if (encoding == "BIG5" ||
 391                  encoding == "EUC-KR" ||
 392                  encoding == "EUC-CN" ||
 393                  encoding == "SJIS" ||
 394                  encoding == "GBK")
 395                 return 2;
 396         else
 397                 return 1;
 398 }
 399
 400 } // namespace lyx