src/support/unicode.cpp

   1 /**
   2  * \file unicode.cpp
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Lars Gullik Bjønnes
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  *
  10  * A collection of unicode conversion functions, using iconv.
  11  */
  12
  13 #include <config.h>
  14
  15 #include "support/unicode.h"
  16 #include "support/debug.h"
  17 #include "support/mutex.h"
  18
  19 #include <iconv.h>
  20
  21 #include <boost/cstdint.hpp>
  22
  23 #include <cerrno>
  24 #include <iomanip>
  25 #include <map>
  26 #include <ostream>
  27 #include <string>
  28
  29
  30 using namespace std;
  31
  32 namespace {
  33
  34 #ifdef WORDS_BIGENDIAN
  35         char const * utf16_codeset = "UTF16-BE";
  36 #else
  37         char const * utf16_codeset = "UTF16-LE";
  38 #endif
  39
  40 }
  41
  42
  43 namespace lyx {
  44
  45 #ifdef WORDS_BIGENDIAN
  46         char const * ucs4_codeset = "UCS-4BE";
  47 #else
  48         char const * ucs4_codeset = "UCS-4LE";
  49 #endif
  50
  51 static const iconv_t invalid_cd = (iconv_t)(-1);
  52
  53
  54 struct IconvProcessor::Impl
  55 {
  56         Impl(string const & to, string const & from)
  57                 : cd(invalid_cd), tocode_(to), fromcode_(from)
  58         {}
  59
  60         ~Impl()
  61         {
  62                 if (cd != invalid_cd && iconv_close(cd) == -1)
  63                                 LYXERR0("Error returned from iconv_close(" << errno << ")");
  64         }
  65
  66         iconv_t cd;
  67         string tocode_;
  68         string fromcode_;
  69
  70         Mutex mutex_; // iconv() is not thread save, see #7240
  71 };
  72
  73
  74 IconvProcessor::IconvProcessor(char const * tocode, char const * fromcode)
  75         : pimpl_(new IconvProcessor::Impl(tocode, fromcode))
  76 {
  77 }
  78
  79
  80 IconvProcessor::IconvProcessor(IconvProcessor const & other)
  81         : pimpl_(new IconvProcessor::Impl(other.pimpl_->tocode_, other.pimpl_->fromcode_))
  82 {
  83 }
  84
  85
  86 IconvProcessor::~IconvProcessor()
  87 {
  88         delete pimpl_;
  89 }
  90
  91
  92 void IconvProcessor::operator=(IconvProcessor const & other)
  93 {
  94         if (&other != this)
  95                 pimpl_ = new Impl(other.pimpl_->tocode_, other.pimpl_->fromcode_);
  96 }
  97
  98
  99 bool IconvProcessor::init()
 100 {
 101         if (pimpl_->cd != invalid_cd)
 102                 return true;
 103
 104         pimpl_->cd = iconv_open(pimpl_->tocode_.c_str(), pimpl_->fromcode_.c_str());
 105         if (pimpl_->cd != invalid_cd)
 106                 return true;
 107
 108         lyxerr << "Error returned from iconv_open" << endl;
 109         switch (errno) {
 110                 case EINVAL:
 111                         lyxerr << "EINVAL The conversion from " << pimpl_->fromcode_
 112                                 << " to " << pimpl_->tocode_
 113                                 << " is not supported by the implementation."
 114                                 << endl;
 115                         break;
 116                 default:
 117                         lyxerr << "\tSome other error: " << errno << endl;
 118                         break;
 119         }
 120         return false;
 121 }
 122
 123
 124 int IconvProcessor::convert(char const * buf, size_t buflen,
 125                 char * outbuf, size_t maxoutsize)
 126 {
 127         Mutex::Locker lock(&pimpl_->mutex_);
 128
 129         if (buflen == 0)
 130                 return 0;
 131
 132         if (pimpl_->cd == invalid_cd) {
 133                 if (!init())
 134                         return -1;
 135         }
 136
 137         char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(buf);
 138         size_t inbytesleft = buflen;
 139         size_t outbytesleft = maxoutsize;
 140
 141         int res = iconv(pimpl_->cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
 142
 143         // flush out remaining data. This is needed because iconv sometimes
 144         // holds back chars in the stream, waiting for a combination character
 145         // (see e.g. http://sources.redhat.com/bugzilla/show_bug.cgi?id=1124)
 146         iconv(pimpl_->cd, NULL, NULL, &outbuf, &outbytesleft);
 147
 148         //lyxerr << dec;
 149         //lyxerr << "Inbytesleft: " << inbytesleft << endl;
 150         //lyxerr << "Outbytesleft: " << outbytesleft << endl;
 151
 152         if (res != -1)
 153                 // Everything went well.
 154                 return maxoutsize - outbytesleft;
 155
 156         // There are some errors in the conversion
 157         lyxerr << "Error returned from iconv" << endl;
 158         switch (errno) {
 159                 case E2BIG:
 160                         lyxerr << "E2BIG  There is not sufficient room at *outbuf." << endl;
 161                         break;
 162                 case EILSEQ:
 163                         lyxerr << "EILSEQ An invalid multibyte sequence"
 164                                 << " has been encountered in the input.\n"
 165                                 << "When converting from " << pimpl_->fromcode_
 166                                 << " to " << pimpl_->tocode_ << ".\n";
 167                         lyxerr << "Input:" << hex;
 168                         for (size_t i = 0; i < buflen; ++i) {
 169                                 // char may be signed, avoid output of
 170                                 // something like 0xffffffc2
 171                                 boost::uint32_t const b =
 172                                         *reinterpret_cast<unsigned char const *>(buf + i);
 173                                 lyxerr << " 0x" << (unsigned int)b;
 174                         }
 175                         lyxerr << dec << endl;
 176                         break;
 177                 case EINVAL:
 178                         lyxerr << "EINVAL An incomplete multibyte sequence"
 179                                 << " has been encountered in the input.\n"
 180                                 << "When converting from " << pimpl_->fromcode_
 181                                 << " to " << pimpl_->tocode_ << ".\n";
 182                         lyxerr << "Input:" << hex;
 183                         for (size_t i = 0; i < buflen; ++i) {
 184                                 // char may be signed, avoid output of
 185                                 // something like 0xffffffc2
 186                                 boost::uint32_t const b =
 187                                         *reinterpret_cast<unsigned char const *>(buf + i);
 188                                 lyxerr << " 0x" << (unsigned int)b;
 189                         }
 190                         lyxerr << dec << endl;
 191                         break;
 192                 default:
 193                         lyxerr << "\tSome other error: " << errno << endl;
 194                         break;
 195         }
 196         // We got an error so we close down the conversion engine
 197         if (iconv_close(pimpl_->cd) == -1) {
 198                 lyxerr << "Error returned from iconv_close("
 199                         << errno << ")" << endl;
 200         }
 201         pimpl_->cd = invalid_cd;
 202         return -1;
 203 }
 204
 205
 206 std::string IconvProcessor::from() const
 207 {
 208         return pimpl_->fromcode_;
 209 }
 210
 211
 212 std::string IconvProcessor::to() const
 213 {
 214         return pimpl_->tocode_;
 215 }
 216
 217
 218 namespace {
 219
 220
 221 template<typename RetType, typename InType>
 222 vector<RetType>
 223 iconv_convert(IconvProcessor & processor, InType const * buf, size_t buflen)
 224 {
 225         if (buflen == 0)
 226                 return vector<RetType>();
 227
 228         char const * inbuf = reinterpret_cast<char const *>(buf);
 229         size_t inbytesleft = buflen * sizeof(InType);
 230
 231         static std::vector<char> outbuf(32768);
 232         // The number of UCS4 code points in buf is at most inbytesleft.
 233         // The output encoding will use at most
 234         // max_encoded_bytes(pimpl_->tocode_) per UCS4 code point.
 235         size_t maxoutbufsize = max_encoded_bytes(processor.to()) * inbytesleft;
 236         if (outbuf.size() < maxoutbufsize)
 237                 outbuf.resize(maxoutbufsize);
 238
 239         int bytes = processor.convert(inbuf, inbytesleft, &outbuf[0], outbuf.size());
 240         if (bytes <= 0)
 241                 // Conversion failed
 242                 // FIXME Maybe throw an exception and handle that in the caller?
 243                 return vector<RetType>();
 244
 245         RetType const * tmp = reinterpret_cast<RetType const *>(&outbuf[0]);
 246         return vector<RetType>(tmp, tmp + bytes / sizeof(RetType));
 247 }
 248
 249 } // anon namespace
 250
 251
 252 vector<char_type> utf8_to_ucs4(vector<char> const & utf8str)
 253 {
 254         if (utf8str.empty())
 255                 return vector<char_type>();
 256
 257         return utf8_to_ucs4(&utf8str[0], utf8str.size());
 258 }
 259
 260
 261 vector<char_type>
 262 utf8_to_ucs4(char const * utf8str, size_t ls)
 263 {
 264         static IconvProcessor processor(ucs4_codeset, "UTF-8");
 265         return iconv_convert<char_type>(processor, utf8str, ls);
 266 }
 267
 268
 269 vector<char_type>
 270 utf16_to_ucs4(unsigned short const * s, size_t ls)
 271 {
 272         static IconvProcessor processor(ucs4_codeset, utf16_codeset);
 273         return iconv_convert<char_type>(processor, s, ls);
 274 }
 275
 276
 277 vector<unsigned short>
 278 ucs4_to_utf16(char_type const * s, size_t ls)
 279 {
 280         static IconvProcessor processor(utf16_codeset, ucs4_codeset);
 281         return iconv_convert<unsigned short>(processor, s, ls);
 282 }
 283
 284
 285 vector<char>
 286 ucs4_to_utf8(char_type c)
 287 {
 288         static IconvProcessor processor("UTF-8", ucs4_codeset);
 289         return iconv_convert<char>(processor, &c, 1);
 290 }
 291
 292
 293 vector<char>
 294 ucs4_to_utf8(vector<char_type> const & ucs4str)
 295 {
 296         if (ucs4str.empty())
 297                 return vector<char>();
 298
 299         return ucs4_to_utf8(&ucs4str[0], ucs4str.size());
 300 }
 301
 302
 303 vector<char>
 304 ucs4_to_utf8(char_type const * ucs4str, size_t ls)
 305 {
 306         static IconvProcessor processor("UTF-8", ucs4_codeset);
 307         return iconv_convert<char>(processor, ucs4str, ls);
 308 }
 309
 310
 311 vector<char_type>
 312 eightbit_to_ucs4(char const * s, size_t ls, string const & encoding)
 313 {
 314         static map<string, IconvProcessor> processors;
 315         if (processors.find(encoding) == processors.end()) {
 316                 IconvProcessor processor(ucs4_codeset, encoding.c_str());
 317                 processors.insert(make_pair(encoding, processor));
 318         }
 319         return iconv_convert<char_type>(processors[encoding], s, ls);
 320 }
 321
 322
 323 vector<char>
 324 ucs4_to_eightbit(char_type const * ucs4str, size_t ls, string const & encoding)
 325 {
 326         static map<string, IconvProcessor> processors;
 327         if (processors.find(encoding) == processors.end()) {
 328                 IconvProcessor processor(encoding.c_str(), ucs4_codeset);
 329                 processors.insert(make_pair(encoding, processor));
 330         }
 331         return iconv_convert<char>(processors[encoding], ucs4str, ls);
 332 }
 333
 334
 335 char ucs4_to_eightbit(char_type ucs4, string const & encoding)
 336 {
 337         static map<string, IconvProcessor> processors;
 338         map<string, IconvProcessor>::iterator it = processors.find(encoding);
 339         if (it == processors.end()) {
 340                 IconvProcessor processor(encoding.c_str(), ucs4_codeset);
 341                 it = processors.insert(make_pair(encoding, processor)).first;
 342         }
 343
 344         char out;
 345         int const bytes = it->second.convert((char *)(&ucs4), 4, &out, 1);
 346         if (bytes > 0)
 347                 return out;
 348         return 0;
 349 }
 350
 351
 352 void ucs4_to_multibytes(char_type ucs4, vector<char> & out,
 353         string const & encoding)
 354 {
 355         static map<string, IconvProcessor> processors;
 356         map<string, IconvProcessor>::iterator it = processors.find(encoding);
 357         if (it == processors.end()) {
 358                 IconvProcessor processor(encoding.c_str(), ucs4_codeset);
 359                 it = processors.insert(make_pair(encoding, processor)).first;
 360         }
 361
 362         out.resize(4);
 363         int bytes = it->second.convert((char *)(&ucs4), 4, &out[0], 4);
 364         if (bytes > 0)
 365                 out.resize(bytes);
 366         else
 367                 out.clear();
 368 }
 369
 370 int max_encoded_bytes(std::string const & encoding)
 371 {
 372         // FIXME: this information should be transferred to lib/encodings
 373         // UTF8 uses at most 4 bytes to represent one UCS4 code point
 374         // (see RFC 3629). RFC 2279 specifies 6 bytes, but that
 375         // information is outdated, and RFC 2279 has been superseded by
 376         // RFC 3629.
 377         // The CJK encodings use (different) multibyte representation as well.
 378         // All other encodings encode one UCS4 code point in one byte
 379         // (and can therefore only encode a subset of UCS4)
 380         // Note that BIG5 and SJIS do not work with LaTeX (see lib/encodings).
 381         // Furthermore, all encodings that use shifting (like SJIS) do not work with
 382         // iconv_codecvt_facet.
 383         if (encoding == "UTF-8" ||
 384             encoding == "GB" ||
 385             encoding == "EUC-TW")
 386                 return 4;
 387         else if (encoding == "EUC-JP")
 388                 return 3;
 389         else if (encoding == "ISO-2022-JP")
 390                 return 8;
 391         else if (encoding == "BIG5" ||
 392                  encoding == "EUC-KR" ||
 393                  encoding == "EUC-CN" ||
 394                  encoding == "SJIS" ||
 395                  encoding == "GBK")
 396                 return 2;
 397         else
 398                 return 1;
 399 }
 400
 401 } // namespace lyx