src/support/unicode.cpp

   1 /**
   2  * \file unicode.cpp
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Lars Gullik Bjønnes
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  *
  10  * A collection of unicode conversion functions, using iconv.
  11  */
  12
  13 #include <config.h>
  14
  15 #include "support/unicode.h"
  16 #include "support/debug.h"
  17
  18 #include <iconv.h>
  19
  20 #include <boost/cstdint.hpp>
  21
  22 #include <cerrno>
  23 #include <iomanip>
  24 #include <map>
  25 #include <ostream>
  26 #include <string>
  27
  28 using namespace std;
  29
  30 namespace {
  31
  32 #ifdef WORDS_BIGENDIAN
  33         char const * utf16_codeset = "UTF16-BE";
  34 #else
  35         char const * utf16_codeset = "UTF16-LE";
  36 #endif
  37
  38 }
  39
  40
  41 namespace lyx {
  42
  43 #ifdef WORDS_BIGENDIAN
  44         char const * ucs4_codeset = "UCS-4BE";
  45 #else
  46         char const * ucs4_codeset = "UCS-4LE";
  47 #endif
  48
  49 static const iconv_t invalid_cd = (iconv_t)(-1);
  50
  51
  52 struct IconvProcessor::Impl
  53 {
  54         Impl(string const & to, string const & from)
  55                 : cd(invalid_cd), tocode_(to), fromcode_(from)
  56         {}
  57
  58         ~Impl()
  59         {
  60                 if (cd != invalid_cd && iconv_close(cd) == -1)
  61                                 LYXERR0("Error returned from iconv_close(" << errno << ")");
  62         }
  63
  64         iconv_t cd;
  65         string tocode_;
  66         string fromcode_;
  67 };
  68
  69
  70 IconvProcessor::IconvProcessor(char const * tocode, char const * fromcode)
  71         : pimpl_(new IconvProcessor::Impl(tocode, fromcode))
  72 {
  73 }
  74
  75
  76 IconvProcessor::IconvProcessor(IconvProcessor const & other)
  77         : pimpl_(new IconvProcessor::Impl(other.pimpl_->tocode_, other.pimpl_->fromcode_))
  78 {
  79 }
  80
  81
  82 IconvProcessor::~IconvProcessor()
  83 {
  84         delete pimpl_;
  85 }
  86
  87
  88 void IconvProcessor::operator=(IconvProcessor const & other)
  89 {
  90         if (&other != this)
  91                 pimpl_ = new Impl(other.pimpl_->tocode_, other.pimpl_->fromcode_);
  92 }
  93
  94
  95 bool IconvProcessor::init()
  96 {
  97         if (pimpl_->cd != invalid_cd)
  98                 return true;
  99
 100         pimpl_->cd = iconv_open(pimpl_->tocode_.c_str(), pimpl_->fromcode_.c_str());
 101         if (pimpl_->cd != invalid_cd)
 102                 return true;
 103
 104         lyxerr << "Error returned from iconv_open" << endl;
 105         switch (errno) {
 106                 case EINVAL:
 107                         lyxerr << "EINVAL The conversion from " << pimpl_->fromcode_
 108                                 << " to " << pimpl_->tocode_
 109                                 << " is not supported by the implementation."
 110                                 << endl;
 111                         break;
 112                 default:
 113                         lyxerr << "\tSome other error: " << errno << endl;
 114                         break;
 115         }
 116         return false;
 117 }
 118
 119
 120 int IconvProcessor::convert(char const * buf, size_t buflen,
 121                 char * outbuf, size_t maxoutsize)
 122 {
 123         if (buflen == 0)
 124                 return 0;
 125
 126         if (pimpl_->cd == invalid_cd) {
 127                 if (!init())
 128                         return -1;
 129         }
 130
 131         char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(buf);
 132         size_t inbytesleft = buflen;
 133         size_t outbytesleft = maxoutsize;
 134
 135         int res = iconv(pimpl_->cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
 136
 137         // flush out remaining data. This is needed because iconv sometimes
 138         // holds back chars in the stream, waiting for a combination character
 139         // (see e.g. http://sources.redhat.com/bugzilla/show_bug.cgi?id=1124)
 140         iconv(pimpl_->cd, NULL, NULL, &outbuf, &outbytesleft);
 141
 142         //lyxerr << dec;
 143         //lyxerr << "Inbytesleft: " << inbytesleft << endl;
 144         //lyxerr << "Outbytesleft: " << outbytesleft << endl;
 145
 146         if (res != -1)
 147                 // Everything went well.
 148                 return maxoutsize - outbytesleft;
 149
 150         // There are some errors in the conversion
 151         lyxerr << "Error returned from iconv" << endl;
 152         switch (errno) {
 153                 case E2BIG:
 154                         lyxerr << "E2BIG  There is not sufficient room at *outbuf." << endl;
 155                         break;
 156                 case EILSEQ:
 157                         lyxerr << "EILSEQ An invalid multibyte sequence"
 158                                 << " has been encountered in the input.\n"
 159                                 << "When converting from " << pimpl_->fromcode_
 160                                 << " to " << pimpl_->tocode_ << ".\n";
 161                         lyxerr << "Input:" << hex;
 162                         for (size_t i = 0; i < buflen; ++i) {
 163                                 // char may be signed, avoid output of
 164                                 // something like 0xffffffc2
 165                                 boost::uint32_t const b =
 166                                         *reinterpret_cast<unsigned char const *>(buf + i);
 167                                 lyxerr << " 0x" << (unsigned int)b;
 168                         }
 169                         lyxerr << dec << endl;
 170                         break;
 171                 case EINVAL:
 172                         lyxerr << "EINVAL An incomplete multibyte sequence"
 173                                 << " has been encountered in the input.\n"
 174                                 << "When converting from " << pimpl_->fromcode_
 175                                 << " to " << pimpl_->tocode_ << ".\n";
 176                         lyxerr << "Input:" << hex;
 177                         for (size_t i = 0; i < buflen; ++i) {
 178                                 // char may be signed, avoid output of
 179                                 // something like 0xffffffc2
 180                                 boost::uint32_t const b =
 181                                         *reinterpret_cast<unsigned char const *>(buf + i);
 182                                 lyxerr << " 0x" << (unsigned int)b;
 183                         }
 184                         lyxerr << dec << endl;
 185                         break;
 186                 default:
 187                         lyxerr << "\tSome other error: " << errno << endl;
 188                         break;
 189         }
 190         // We got an error so we close down the conversion engine
 191         if (iconv_close(pimpl_->cd) == -1) {
 192                 lyxerr << "Error returned from iconv_close("
 193                         << errno << ")" << endl;
 194         }
 195         pimpl_->cd = invalid_cd;
 196         return -1;
 197 }
 198
 199
 200 std::string IconvProcessor::from() const
 201 {
 202         return pimpl_->fromcode_;
 203 }
 204
 205
 206 std::string IconvProcessor::to() const
 207 {
 208         return pimpl_->tocode_;
 209 }
 210
 211
 212 namespace {
 213
 214
 215 template<typename RetType, typename InType>
 216 vector<RetType>
 217 iconv_convert(IconvProcessor & processor, InType const * buf, size_t buflen)
 218 {
 219         if (buflen == 0)
 220                 return vector<RetType>();
 221
 222         char const * inbuf = reinterpret_cast<char const *>(buf);
 223         size_t inbytesleft = buflen * sizeof(InType);
 224
 225         static std::vector<char> outbuf(32768);
 226         // The number of UCS4 code points in buf is at most inbytesleft.
 227         // The output encoding will use at most
 228         // max_encoded_bytes(pimpl_->tocode_) per UCS4 code point.
 229         size_t maxoutbufsize = max_encoded_bytes(processor.to()) * inbytesleft;
 230         if (outbuf.size() < maxoutbufsize)
 231                 outbuf.resize(maxoutbufsize);
 232
 233         int bytes = processor.convert(inbuf, inbytesleft, &outbuf[0], outbuf.size());
 234         if (bytes <= 0)
 235                 // Conversion failed
 236                 // FIXME Maybe throw an exception and handle that in the caller?
 237                 return vector<RetType>();
 238
 239         RetType const * tmp = reinterpret_cast<RetType const *>(&outbuf[0]);
 240         return vector<RetType>(tmp, tmp + bytes / sizeof(RetType));
 241 }
 242
 243 } // anon namespace
 244
 245
 246 vector<char_type> utf8_to_ucs4(vector<char> const & utf8str)
 247 {
 248         if (utf8str.empty())
 249                 return vector<char_type>();
 250
 251         return utf8_to_ucs4(&utf8str[0], utf8str.size());
 252 }
 253
 254
 255 vector<char_type>
 256 utf8_to_ucs4(char const * utf8str, size_t ls)
 257 {
 258         static IconvProcessor processor(ucs4_codeset, "UTF-8");
 259         return iconv_convert<char_type>(processor, utf8str, ls);
 260 }
 261
 262
 263 vector<char_type>
 264 utf16_to_ucs4(unsigned short const * s, size_t ls)
 265 {
 266         static IconvProcessor processor(ucs4_codeset, utf16_codeset);
 267         return iconv_convert<char_type>(processor, s, ls);
 268 }
 269
 270
 271 vector<unsigned short>
 272 ucs4_to_utf16(char_type const * s, size_t ls)
 273 {
 274         static IconvProcessor processor(utf16_codeset, ucs4_codeset);
 275         return iconv_convert<unsigned short>(processor, s, ls);
 276 }
 277
 278
 279 vector<char>
 280 ucs4_to_utf8(char_type c)
 281 {
 282         static IconvProcessor processor("UTF-8", ucs4_codeset);
 283         return iconv_convert<char>(processor, &c, 1);
 284 }
 285
 286
 287 vector<char>
 288 ucs4_to_utf8(vector<char_type> const & ucs4str)
 289 {
 290         if (ucs4str.empty())
 291                 return vector<char>();
 292
 293         return ucs4_to_utf8(&ucs4str[0], ucs4str.size());
 294 }
 295
 296
 297 vector<char>
 298 ucs4_to_utf8(char_type const * ucs4str, size_t ls)
 299 {
 300         static IconvProcessor processor("UTF-8", ucs4_codeset);
 301         return iconv_convert<char>(processor, ucs4str, ls);
 302 }
 303
 304
 305 vector<char_type>
 306 eightbit_to_ucs4(char const * s, size_t ls, string const & encoding)
 307 {
 308         static map<string, IconvProcessor> processors;
 309         if (processors.find(encoding) == processors.end()) {
 310                 IconvProcessor processor(ucs4_codeset, encoding.c_str());
 311                 processors.insert(make_pair(encoding, processor));
 312         }
 313         return iconv_convert<char_type>(processors[encoding], s, ls);
 314 }
 315
 316
 317 vector<char>
 318 ucs4_to_eightbit(char_type const * ucs4str, size_t ls, string const & encoding)
 319 {
 320         static map<string, IconvProcessor> processors;
 321         if (processors.find(encoding) == processors.end()) {
 322                 IconvProcessor processor(encoding.c_str(), ucs4_codeset);
 323                 processors.insert(make_pair(encoding, processor));
 324         }
 325         return iconv_convert<char>(processors[encoding], ucs4str, ls);
 326 }
 327
 328
 329 char ucs4_to_eightbit(char_type ucs4, string const & encoding)
 330 {
 331         static map<string, IconvProcessor> processors;
 332         map<string, IconvProcessor>::iterator it = processors.find(encoding);
 333         if (it == processors.end()) {
 334                 IconvProcessor processor(encoding.c_str(), ucs4_codeset);
 335                 it = processors.insert(make_pair(encoding, processor)).first;
 336         }
 337
 338         char out;
 339         int const bytes = it->second.convert((char *)(&ucs4), 4, &out, 1);
 340         if (bytes > 0)
 341                 return out;
 342         return 0;
 343 }
 344
 345
 346 void ucs4_to_multibytes(char_type ucs4, vector<char> & out,
 347         string const & encoding)
 348 {
 349         static map<string, IconvProcessor> processors;
 350         map<string, IconvProcessor>::iterator it = processors.find(encoding);
 351         if (it == processors.end()) {
 352                 IconvProcessor processor(encoding.c_str(), ucs4_codeset);
 353                 it = processors.insert(make_pair(encoding, processor)).first;
 354         }
 355
 356         out.resize(4);
 357         int bytes = it->second.convert((char *)(&ucs4), 4, &out[0], 4);
 358         if (bytes > 0)
 359                 out.resize(bytes);
 360         else
 361                 out.clear();
 362 }
 363
 364 int max_encoded_bytes(std::string const & encoding)
 365 {
 366         // FIXME: this information should be transferred to lib/encodings
 367         // UTF8 uses at most 4 bytes to represent one UCS4 code point
 368         // (see RFC 3629). RFC 2279 specifies 6 bytes, but that
 369         // information is outdated, and RFC 2279 has been superseded by
 370         // RFC 3629.
 371         // The CJK encodings use (different) multibyte representation as well.
 372         // All other encodings encode one UCS4 code point in one byte
 373         // (and can therefore only encode a subset of UCS4)
 374         // Note that BIG5 and SJIS do not work with LaTeX (see lib/encodings).
 375         // Furthermore, all encodings that use shifting (like SJIS) do not work with
 376         // iconv_codecvt_facet.
 377         if (encoding == "UTF-8" ||
 378             encoding == "GB" ||
 379             encoding == "EUC-TW")
 380                 return 4;
 381         else if (encoding == "EUC-JP")
 382                 return 3;
 383         else if (encoding == "ISO-2022-JP")
 384                 return 8;
 385         else if (encoding == "BIG5" ||
 386                  encoding == "EUC-KR" ||
 387                  encoding == "EUC-CN" ||
 388                  encoding == "SJIS" ||
 389                  encoding == "GBK")
 390                 return 2;
 391         else
 392                 return 1;
 393 }
 394
 395 } // namespace lyx