src/support/unicode.C

   1 /**
   2  * \file unicode.C
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Lars Gullik Bjønnes
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  *
  10  * A collection of unicode conversion functions, using iconv.
  11  */
  12
  13 #include <config.h>
  14
  15 #include "unicode.h"
  16
  17 #include "debug.h"
  18
  19 #include <iconv.h>
  20
  21 #include <cerrno>
  22 #include <iomanip>
  23 #include <string>
  24
  25 using std::endl;
  26 using std::string;
  27
  28 namespace {
  29
  30 #ifdef WORDS_BIGENDIAN
  31         char const * ucs4_codeset = "UCS-4BE";
  32         char const * ucs2_codeset = "UCS-2BE";
  33 #else
  34         char const * ucs4_codeset = "UCS-4LE";
  35         char const * ucs2_codeset = "UCS-2LE";
  36 #endif
  37
  38 std::vector<char>
  39 iconv_convert(std::string const & tocode, std::string const & fromcode,
  40               std::vector<char> const & buf)
  41 {
  42         if (buf.empty())
  43                 return std::vector<char>();
  44
  45         iconv_t cd = iconv_open(tocode.c_str(), fromcode.c_str());
  46         if (cd == (iconv_t)(-1)) {
  47                 lyxerr << "Error returned from iconv_open" << endl;
  48                 switch (errno) {
  49                 case EINVAL:
  50                         lyxerr << "EINVAL The conversion from " << fromcode
  51                                << " to " << tocode
  52                                << " is not supported by the implementation."
  53                                << endl;
  54                         break;
  55                 default:
  56                         lyxerr << "\tSome other error: " << errno << endl;
  57                         break;
  58                 }
  59         }
  60
  61         char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(&buf[0]);
  62         size_t inbytesleft = buf.size();
  63         static char out[1000];
  64         char * outbuf = out;
  65         size_t outbytesleft = 1000;
  66
  67         size_t res = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
  68
  69         if (res == (size_t)(-1)) {
  70                 lyxerr << "Error returned from iconv" << endl;
  71                 switch (errno) {
  72                 case E2BIG:
  73                         lyxerr << "E2BIG  There is not sufficient room at *outbuf." << endl;
  74                         break;
  75                 case EILSEQ:
  76                         lyxerr << "EILSEQ An invalid multibyte sequence"
  77                                << " has been encountered in the input.\n"
  78                                << "When converting from " << fromcode
  79                                << " to " << tocode << ".\n";
  80                         lyxerr << "Input: " << std::hex;
  81                         for (size_t i = 0; i < buf.size(); ++i) {
  82                                 unsigned char const b = buf[i];
  83                                 lyxerr << "0x" << int(b) << " ";
  84                         }
  85                         lyxerr << endl;
  86                         break;
  87                 case EINVAL:
  88                         lyxerr << "EINVAL An incomplete multibyte sequence"
  89                                << " has been encountered in the input.\n"
  90                                << "When converting from " << fromcode
  91                                << " to " << tocode << ".\n";
  92                         lyxerr << "Input: " << std::hex;
  93                         for (size_t i = 0; i < buf.size(); ++i) {
  94                                 unsigned char const b = buf[i];
  95                                 lyxerr << "0x" << int(b) << " ";
  96                         }
  97                         lyxerr << endl;
  98                         break;
  99                 default:
 100                         lyxerr << "\tSome other error: " << errno << endl;
 101                         break;
 102                 }
 103         }
 104
 105         if (iconv_close(cd) == -1) {
 106                 lyxerr << "Error returned from iconv_close("
 107                        << errno << ")" << endl;
 108         }
 109
 110         //lyxerr << std::dec;
 111         //lyxerr << "Inbytesleft: " << inbytesleft << endl;
 112         //lyxerr << "Outbytesleft: " << outbytesleft << endl;
 113         int bytes = 1000 - outbytesleft;
 114
 115         std::vector<char> outvec(out, out + bytes);
 116         return outvec;
 117 }
 118
 119
 120 std::vector<boost::uint32_t> bytes_to_ucs4(std::vector<char> const & bytes)
 121 {
 122         boost::uint32_t const * tmp = reinterpret_cast<uint32_t const *>(&bytes[0]);
 123         return std::vector<boost::uint32_t>(tmp, tmp + bytes.size() / 4);
 124 }
 125
 126
 127 std::vector<unsigned short> bytes_to_ucs2(std::vector<char> const & bytes)
 128 {
 129         unsigned short const * tmp = reinterpret_cast<unsigned short const *>(&bytes[0]);
 130         return std::vector<unsigned short>(tmp, tmp + bytes.size() / 2);
 131 }
 132
 133 } // anon namespace
 134
 135
 136 std::vector<boost::uint32_t> utf8_to_ucs4(std::vector<char> const & utf8str)
 137 {
 138         //lyxerr << "Buff = " << string(utf8str.begin(), utf8str.end())
 139         //       << " (" << utf8str.size() << ")" << endl;
 140         //lyxerr << "Res = " << string(res.begin(), res.end())
 141         //       << " (" << res.size() << ")" << endl;
 142
 143         std::vector<char> res = iconv_convert(ucs4_codeset, "UTF-8", utf8str);
 144         return bytes_to_ucs4(res);
 145 }
 146
 147
 148 std::vector<boost::uint32_t>
 149 ucs2_to_ucs4(std::vector<unsigned short> const & ucs2str)
 150 {
 151         char const * tin = reinterpret_cast<char const *>(&ucs2str[0]);
 152         std::vector<char> in(tin, tin + ucs2str.size() * 2);
 153         std::vector<char> res = iconv_convert(ucs4_codeset, ucs2_codeset, in);
 154         return bytes_to_ucs4(res);
 155 }
 156
 157
 158 std::vector<unsigned short>
 159 ucs4_to_ucs2(std::vector<boost::uint32_t> const & ucs4str)
 160 {
 161         char const * tin = reinterpret_cast<char const *>(&ucs4str[0]);
 162         std::vector<char> in(tin, tin + ucs4str.size() * 4);
 163         std::vector<char> res = iconv_convert(ucs2_codeset, ucs4_codeset, in);
 164         return bytes_to_ucs2(res);
 165 }
 166
 167
 168 std::vector<unsigned short>
 169 ucs4_to_ucs2(boost::uint32_t const * s, size_t ls)
 170 {
 171         char const * tin = reinterpret_cast<char const *>(s);
 172         std::vector<char> in(tin, tin + ls * 4);
 173         std::vector<char> res = iconv_convert(ucs2_codeset, ucs4_codeset, in);
 174         return bytes_to_ucs2(res);
 175 }
 176
 177
 178 unsigned short
 179 ucs4_to_ucs2(boost::uint32_t c)
 180 {
 181         char const * tin = reinterpret_cast<char const *>(&c);
 182         std::vector<char> in(tin, tin + 4);
 183         std::vector<char> res = iconv_convert(ucs2_codeset, ucs4_codeset, in);
 184         return bytes_to_ucs2(res)[0];
 185 }
 186
 187
 188 std::vector<char> ucs4_to_utf8(std::vector<boost::uint32_t> const & ucs4str)
 189 {
 190         char const * tin = reinterpret_cast<char const *>(&ucs4str[0]);
 191         std::vector<char> in(tin, tin + ucs4str.size() * 4);
 192         std::vector<char> res = iconv_convert("UTF-8", ucs4_codeset, in);
 193         return res;
 194 }
 195
 196
 197 std::vector<char> ucs4_to_utf8(boost::uint32_t c)
 198 {
 199         char const * tin = reinterpret_cast<char const *>(&c);
 200         std::vector<char> in(tin, tin + 4);
 201         std::vector<char> res = iconv_convert("UTF-8", ucs4_codeset, in);
 202         return res;
 203 }