src/support/unicode.cpp

   1 /**
   2  * \file unicode.cpp
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Lars Gullik Bjønnes
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  *
  10  * A collection of unicode conversion functions, using iconv.
  11  */
  12
  13 #include <config.h>
  14
  15 #include "support/unicode.h"
  16 #include "support/debug.h"
  17 #include "support/mutex.h"
  18
  19 #include <iconv.h>
  20
  21 #include <boost/cstdint.hpp>
  22
  23 #include <cerrno>
  24 #include <map>
  25 #include <ostream>
  26
  27
  28 using namespace std;
  29
  30 namespace {
  31
  32 #ifdef WORDS_BIGENDIAN
  33         char const * utf16_codeset = "UTF16-BE";
  34 #else
  35         char const * utf16_codeset = "UTF16-LE";
  36 #endif
  37
  38 }
  39
  40
  41 namespace lyx {
  42
  43 #ifdef WORDS_BIGENDIAN
  44         char const * ucs4_codeset = "UCS-4BE";
  45 #else
  46         char const * ucs4_codeset = "UCS-4LE";
  47 #endif
  48
// Marker for "no open conversion descriptor": the value iconv_open() is
// compared against to detect failure, and the value IconvProcessor stores
// while no descriptor is open.
static const iconv_t invalid_cd = (iconv_t)(-1);
  50
  51
  52 struct IconvProcessor::Impl
  53 {
  54         Impl(string const & to, string const & from)
  55                 : cd(invalid_cd), tocode_(to), fromcode_(from)
  56         {}
  57
  58         ~Impl()
  59         {
  60                 if (cd != invalid_cd && iconv_close(cd) == -1)
  61                                 LYXERR0("Error returned from iconv_close(" << errno << ")");
  62         }
  63
  64         iconv_t cd;
  65         string tocode_;
  66         string fromcode_;
  67
  68         Mutex mutex_; // iconv() is not thread save, see #7240
  69 };
  70
  71
  72 IconvProcessor::IconvProcessor(char const * tocode, char const * fromcode)
  73         : pimpl_(new IconvProcessor::Impl(tocode, fromcode))
  74 {
  75 }
  76
  77
  78 IconvProcessor::IconvProcessor(IconvProcessor const & other)
  79         : pimpl_(new IconvProcessor::Impl(other.pimpl_->tocode_, other.pimpl_->fromcode_))
  80 {
  81 }
  82
  83
/// Destroy the private implementation object; its destructor closes the
/// conversion descriptor if one is open.
IconvProcessor::~IconvProcessor()
{
	delete pimpl_;
}
  88
  89
  90 void IconvProcessor::operator=(IconvProcessor const & other)
  91 {
  92         if (&other != this)
  93                 pimpl_ = new Impl(other.pimpl_->tocode_, other.pimpl_->fromcode_);
  94 }
  95
  96
  97 bool IconvProcessor::init()
  98 {
  99         if (pimpl_->cd != invalid_cd)
 100                 return true;
 101
 102         pimpl_->cd = iconv_open(pimpl_->tocode_.c_str(), pimpl_->fromcode_.c_str());
 103         if (pimpl_->cd != invalid_cd)
 104                 return true;
 105
 106         lyxerr << "Error returned from iconv_open" << endl;
 107         switch (errno) {
 108                 case EINVAL:
 109                         lyxerr << "EINVAL The conversion from " << pimpl_->fromcode_
 110                                 << " to " << pimpl_->tocode_
 111                                 << " is not supported by the implementation."
 112                                 << endl;
 113                         break;
 114                 default:
 115                         lyxerr << "\tSome other error: " << errno << endl;
 116                         break;
 117         }
 118         return false;
 119 }
 120
 121
 122 int IconvProcessor::convert(char const * buf, size_t buflen,
 123                 char * outbuf, size_t maxoutsize)
 124 {
 125         Mutex::Locker lock(&pimpl_->mutex_);
 126
 127         if (buflen == 0)
 128                 return 0;
 129
 130         if (pimpl_->cd == invalid_cd) {
 131                 if (!init())
 132                         return -1;
 133         }
 134
 135         char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(buf);
 136         size_t inbytesleft = buflen;
 137         size_t outbytesleft = maxoutsize;
 138
 139         int res = iconv(pimpl_->cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
 140
 141         // flush out remaining data. This is needed because iconv sometimes
 142         // holds back chars in the stream, waiting for a combination character
 143         // (see e.g. http://sources.redhat.com/bugzilla/show_bug.cgi?id=1124)
 144         iconv(pimpl_->cd, NULL, NULL, &outbuf, &outbytesleft);
 145
 146         //lyxerr << dec;
 147         //lyxerr << "Inbytesleft: " << inbytesleft << endl;
 148         //lyxerr << "Outbytesleft: " << outbytesleft << endl;
 149
 150         if (res != -1)
 151                 // Everything went well.
 152                 return maxoutsize - outbytesleft;
 153
 154         // There are some errors in the conversion
 155         lyxerr << "Error returned from iconv" << endl;
 156         switch (errno) {
 157                 case E2BIG:
 158                         lyxerr << "E2BIG  There is not sufficient room at *outbuf." << endl;
 159                         break;
 160                 case EILSEQ:
 161                         lyxerr << "EILSEQ An invalid multibyte sequence"
 162                                 << " has been encountered in the input.\n"
 163                                 << "When converting from " << pimpl_->fromcode_
 164                                 << " to " << pimpl_->tocode_ << ".\n";
 165                         lyxerr << "Input:" << hex;
 166                         for (size_t i = 0; i < buflen; ++i) {
 167                                 // char may be signed, avoid output of
 168                                 // something like 0xffffffc2
 169                                 boost::uint32_t const b =
 170                                         *reinterpret_cast<unsigned char const *>(buf + i);
 171                                 lyxerr << " 0x" << (unsigned int)b;
 172                         }
 173                         lyxerr << dec << endl;
 174                         break;
 175                 case EINVAL:
 176                         lyxerr << "EINVAL An incomplete multibyte sequence"
 177                                 << " has been encountered in the input.\n"
 178                                 << "When converting from " << pimpl_->fromcode_
 179                                 << " to " << pimpl_->tocode_ << ".\n";
 180                         lyxerr << "Input:" << hex;
 181                         for (size_t i = 0; i < buflen; ++i) {
 182                                 // char may be signed, avoid output of
 183                                 // something like 0xffffffc2
 184                                 boost::uint32_t const b =
 185                                         *reinterpret_cast<unsigned char const *>(buf + i);
 186                                 lyxerr << " 0x" << (unsigned int)b;
 187                         }
 188                         lyxerr << dec << endl;
 189                         break;
 190                 default:
 191                         lyxerr << "\tSome other error: " << errno << endl;
 192                         break;
 193         }
 194         // We got an error so we close down the conversion engine
 195         if (iconv_close(pimpl_->cd) == -1) {
 196                 lyxerr << "Error returned from iconv_close("
 197                         << errno << ")" << endl;
 198         }
 199         pimpl_->cd = invalid_cd;
 200         return -1;
 201 }
 202
 203
 204 std::string IconvProcessor::from() const
 205 {
 206         return pimpl_->fromcode_;
 207 }
 208
 209
 210 std::string IconvProcessor::to() const
 211 {
 212         return pimpl_->tocode_;
 213 }
 214
 215
 216 namespace {
 217
 218
 219 template<typename RetType, typename InType>
 220 vector<RetType>
 221 iconv_convert(IconvProcessor & processor, InType const * buf, size_t buflen)
 222 {
 223         if (buflen == 0)
 224                 return vector<RetType>();
 225
 226         char const * inbuf = reinterpret_cast<char const *>(buf);
 227         size_t inbytesleft = buflen * sizeof(InType);
 228
 229         static std::vector<char> outbuf(32768);
 230         // The number of UCS4 code points in buf is at most inbytesleft.
 231         // The output encoding will use at most
 232         // max_encoded_bytes(pimpl_->tocode_) per UCS4 code point.
 233         size_t maxoutbufsize = max_encoded_bytes(processor.to()) * inbytesleft;
 234         if (outbuf.size() < maxoutbufsize)
 235                 outbuf.resize(maxoutbufsize);
 236
 237         int bytes = processor.convert(inbuf, inbytesleft, &outbuf[0], outbuf.size());
 238         if (bytes <= 0)
 239                 // Conversion failed
 240                 // FIXME Maybe throw an exception and handle that in the caller?
 241                 return vector<RetType>();
 242
 243         RetType const * tmp = reinterpret_cast<RetType const *>(&outbuf[0]);
 244         return vector<RetType>(tmp, tmp + bytes / sizeof(RetType));
 245 }
 246
 247 } // anon namespace
 248
 249
 250 vector<char_type> utf8_to_ucs4(vector<char> const & utf8str)
 251 {
 252         if (utf8str.empty())
 253                 return vector<char_type>();
 254
 255         return utf8_to_ucs4(&utf8str[0], utf8str.size());
 256 }
 257
 258
 259 vector<char_type>
 260 utf8_to_ucs4(char const * utf8str, size_t ls)
 261 {
 262         static IconvProcessor processor(ucs4_codeset, "UTF-8");
 263         return iconv_convert<char_type>(processor, utf8str, ls);
 264 }
 265
 266
 267 vector<char_type>
 268 utf16_to_ucs4(unsigned short const * s, size_t ls)
 269 {
 270         static IconvProcessor processor(ucs4_codeset, utf16_codeset);
 271         return iconv_convert<char_type>(processor, s, ls);
 272 }
 273
 274
 275 vector<unsigned short>
 276 ucs4_to_utf16(char_type const * s, size_t ls)
 277 {
 278         static IconvProcessor processor(utf16_codeset, ucs4_codeset);
 279         return iconv_convert<unsigned short>(processor, s, ls);
 280 }
 281
 282
 283 vector<char>
 284 ucs4_to_utf8(char_type c)
 285 {
 286         static IconvProcessor processor("UTF-8", ucs4_codeset);
 287         return iconv_convert<char>(processor, &c, 1);
 288 }
 289
 290
 291 vector<char>
 292 ucs4_to_utf8(vector<char_type> const & ucs4str)
 293 {
 294         if (ucs4str.empty())
 295                 return vector<char>();
 296
 297         return ucs4_to_utf8(&ucs4str[0], ucs4str.size());
 298 }
 299
 300
 301 vector<char>
 302 ucs4_to_utf8(char_type const * ucs4str, size_t ls)
 303 {
 304         static IconvProcessor processor("UTF-8", ucs4_codeset);
 305         return iconv_convert<char>(processor, ucs4str, ls);
 306 }
 307
 308
 309 vector<char_type>
 310 eightbit_to_ucs4(char const * s, size_t ls, string const & encoding)
 311 {
 312         static map<string, IconvProcessor> processors;
 313         if (processors.find(encoding) == processors.end()) {
 314                 IconvProcessor processor(ucs4_codeset, encoding.c_str());
 315                 processors.insert(make_pair(encoding, processor));
 316         }
 317         return iconv_convert<char_type>(processors[encoding], s, ls);
 318 }
 319
 320
 321 vector<char>
 322 ucs4_to_eightbit(char_type const * ucs4str, size_t ls, string const & encoding)
 323 {
 324         static map<string, IconvProcessor> processors;
 325         if (processors.find(encoding) == processors.end()) {
 326                 IconvProcessor processor(encoding.c_str(), ucs4_codeset);
 327                 processors.insert(make_pair(encoding, processor));
 328         }
 329         return iconv_convert<char>(processors[encoding], ucs4str, ls);
 330 }
 331
 332
 333 char ucs4_to_eightbit(char_type ucs4, string const & encoding)
 334 {
 335         static map<string, IconvProcessor> processors;
 336         map<string, IconvProcessor>::iterator it = processors.find(encoding);
 337         if (it == processors.end()) {
 338                 IconvProcessor processor(encoding.c_str(), ucs4_codeset);
 339                 it = processors.insert(make_pair(encoding, processor)).first;
 340         }
 341
 342         char out;
 343         int const bytes = it->second.convert((char *)(&ucs4), 4, &out, 1);
 344         if (bytes > 0)
 345                 return out;
 346         return 0;
 347 }
 348
 349
 350 void ucs4_to_multibytes(char_type ucs4, vector<char> & out,
 351         string const & encoding)
 352 {
 353         static map<string, IconvProcessor> processors;
 354         map<string, IconvProcessor>::iterator it = processors.find(encoding);
 355         if (it == processors.end()) {
 356                 IconvProcessor processor(encoding.c_str(), ucs4_codeset);
 357                 it = processors.insert(make_pair(encoding, processor)).first;
 358         }
 359
 360         out.resize(4);
 361         int bytes = it->second.convert((char *)(&ucs4), 4, &out[0], 4);
 362         if (bytes > 0)
 363                 out.resize(bytes);
 364         else
 365                 out.clear();
 366 }
 367
 368 int max_encoded_bytes(std::string const & encoding)
 369 {
 370         // FIXME: this information should be transferred to lib/encodings
 371         // UTF8 uses at most 4 bytes to represent one UCS4 code point
 372         // (see RFC 3629). RFC 2279 specifies 6 bytes, but that
 373         // information is outdated, and RFC 2279 has been superseded by
 374         // RFC 3629.
 375         // The CJK encodings use (different) multibyte representation as well.
 376         // All other encodings encode one UCS4 code point in one byte
 377         // (and can therefore only encode a subset of UCS4)
 378         // Note that BIG5 and SJIS do not work with LaTeX (see lib/encodings).
 379         // Furthermore, all encodings that use shifting (like SJIS) do not work with
 380         // iconv_codecvt_facet.
 381         if (encoding == "UTF-8" ||
 382             encoding == "GB" ||
 383             encoding == "EUC-TW")
 384                 return 4;
 385         else if (encoding == "EUC-JP")
 386                 return 3;
 387         else if (encoding == "ISO-2022-JP")
 388                 return 8;
 389         else if (encoding == "BIG5" ||
 390                  encoding == "EUC-KR" ||
 391                  encoding == "EUC-CN" ||
 392                  encoding == "SJIS" ||
 393                  encoding == "GBK")
 394                 return 2;
 395         else
 396                 return 1;
 397 }
 398
 399 } // namespace lyx