3 * This file is part of LyX, the document processor.
4 * Licence details can be found in the file COPYING.
6 * \author Lars Gullik Bjønnes
8 * Full author contact details are available in file CREDITS.
10 * A collection of unicode conversion functions, using iconv.
31 iconv_convert(std::string const & tocode, std::string const & fromcode,
32 std::vector<char> const & buf)
35 return std::vector<char>();
37 iconv_t cd = iconv_open(tocode.c_str(), fromcode.c_str());
38 if (cd == (iconv_t)(-1)) {
39 lyxerr << "Error returned from iconv_open" << endl;
42 lyxerr << "EINVAL The conversion from " << fromcode
44 << " is not supported by the implementation."
48 lyxerr << "\tSome other error: " << errno << endl;
53 char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(&buf[0]);
54 size_t inbytesleft = buf.size();
55 static char out[1000];
57 size_t outbytesleft = 1000;
59 size_t res = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
61 if (res == (size_t)(-1)) {
62 lyxerr << "Error returned from iconv" << endl;
65 lyxerr << "E2BIG There is not sufficient room at *outbuf." << endl;
68 lyxerr << "EILSEQ An invalid multibyte sequence"
69 << " has been encountered in the input.\n"
70 << "When converting from " << fromcode
71 << " to " << tocode << ".\n";
72 lyxerr << "Input: " << std::hex;
73 for (size_t i = 0; i < buf.size(); ++i) {
74 unsigned char const b = buf[i];
75 lyxerr << "0x" << int(b) << " ";
80 lyxerr << "EINVAL An incomplete multibyte sequence"
81 << " has been encountered in the input.\n"
82 << "When converting from " << fromcode
83 << " to " << tocode << ".\n";
84 lyxerr << "Input: " << std::hex;
85 for (size_t i = 0; i < buf.size(); ++i) {
86 unsigned char const b = buf[i];
87 lyxerr << "0x" << int(b) << " ";
92 lyxerr << "\tSome other error: " << errno << endl;
97 if (iconv_close(cd) == -1) {
98 lyxerr << "Error returned from iconv_close("
99 << errno << ")" << endl;
102 //lyxerr << std::dec;
103 //lyxerr << "Inbytesleft: " << inbytesleft << endl;
104 //lyxerr << "Outbytesleft: " << outbytesleft << endl;
105 int bytes = 1000 - outbytesleft;
107 std::vector<char> outvec(out, out + bytes);
112 std::vector<boost::uint32_t> bytes_to_ucs4(std::vector<char> const & bytes)
114 //lyxerr << "Outbuf =" << std::hex;
116 std::vector<boost::uint32_t> ucs4;
117 for (size_t i = 0; i < bytes.size(); i += 4) {
118 unsigned char const b1 = bytes[i ];
119 unsigned char const b2 = bytes[i + 1];
120 unsigned char const b3 = bytes[i + 2];
121 unsigned char const b4 = bytes[i + 3];
124 char * cc = reinterpret_cast<char *>(&c);
125 #ifdef WORDS_BIGENDIAN
138 lyxerr << "Strange ucs4 value encountered\n";
140 << std::setw(2) << std::setfill('0') << int(b1)
141 << std::setw(2) << std::setfill('0') << int(b2)
142 << std::setw(2) << std::setfill('0') << int(b3)
143 << std::setw(2) << std::setfill('0') << int(b4)
/**
 * Assemble 16 bit values from a big-endian byte stream.
 *
 * Every pair of consecutive bytes b1 b2 in \p bytes yields the value
 * (b1 << 8) | b2, independent of the byte order of the host.
 *
 * \param bytes  byte stream, most significant byte of each value first.
 * \return one value per complete pair of bytes. A trailing single byte
 *         is ignored instead of being paired with a byte read past the
 *         end of the vector.
 */
std::vector<unsigned short> bytes_to_ucs2(std::vector<char> const & bytes)
{
	std::vector<unsigned short> ucs2;
	ucs2.reserve(bytes.size() / 2);

	for (size_t i = 0; i + 2 <= bytes.size(); i += 2) {
		unsigned char const b1 = bytes[i    ];
		unsigned char const b2 = bytes[i + 1];

		unsigned short const c =
			static_cast<unsigned short>((b1 << 8) | b2);

		ucs2.push_back(c);
	}
	return ucs2;
}
193 std::vector<boost::uint32_t> utf8_to_ucs4(std::vector<char> const & utf8str)
195 //lyxerr << "Buff = " << string(utf8str.begin(), utf8str.end())
196 // << " (" << utf8str.size() << ")" << endl;
197 //lyxerr << "Res = " << string(res.begin(), res.end())
198 // << " (" << res.size() << ")" << endl;
200 std::vector<char> res = iconv_convert("UCS-4BE", "UTF-8", utf8str);
201 return bytes_to_ucs4(res);
205 std::vector<boost::uint32_t>
206 ucs2_to_ucs4(std::vector<unsigned short> const & ucs2str)
208 // TODO: Simplify and speed up.
209 std::vector<char> in;
210 std::vector<unsigned short>::const_iterator cit = ucs2str.begin();
211 std::vector<unsigned short>::const_iterator end = ucs2str.end();
212 //lyxerr << std::hex;
213 for (; cit != end; ++cit) {
214 unsigned short s = *cit;
215 in.push_back(static_cast<char>((s & 0xff00) >> 8));
216 in.push_back(static_cast<char>(s & 0x00ff));
217 lyxerr << std::setw(2) << std::setfill('0') << ((s & 0xff00) >> 8) << endl;
218 lyxerr << std::setw(2) << std::setfill('0') << (s & 0x00ff) << endl;
221 std::vector<char> res = iconv_convert("UCS-4BE", "UCS-2BE", in);
222 return bytes_to_ucs4(res);
226 std::vector<unsigned short>
227 ucs4_to_ucs2(std::vector<boost::uint32_t> const & ucs4str)
229 std::vector<char> in;
230 std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
231 std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
232 for (; cit != end; ++cit) {
233 boost::uint32_t s = *cit;
234 in.push_back(static_cast<char>((s & 0xff000000) >> 24));
235 in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
236 in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
237 in.push_back(static_cast<char>(s & 0x000000ff));
239 std::vector<char> res = iconv_convert("UCS-2BE", "UCS-4BE", in);
240 return bytes_to_ucs2(res);
244 std::vector<unsigned short>
245 ucs4_to_ucs2(boost::uint32_t const * s, size_t ls)
247 std::vector<char> in;
248 for (size_t i = 0; i < ls; ++i) {
249 in.push_back(static_cast<char>((s[i] & 0xff000000) >> 24));
250 in.push_back(static_cast<char>((s[i] & 0x00ff0000) >> 16));
251 in.push_back(static_cast<char>((s[i] & 0x0000ff00) >> 8));
252 in.push_back(static_cast<char>(s[i] & 0x000000ff));
254 std::vector<char> res = iconv_convert("UCS-2BE", "UCS-4BE", in);
255 return bytes_to_ucs2(res);
260 ucs4_to_ucs2(boost::uint32_t c)
262 std::vector<char> in;
263 in.push_back(static_cast<char>((c & 0xff000000) >> 24));
264 in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
265 in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
266 in.push_back(static_cast<char>(c & 0x000000ff));
267 std::vector<char> res = iconv_convert("UCS-2BE", "UCS-4BE", in);
268 std::vector<unsigned short> us = bytes_to_ucs2(res);
272 return 0xfffd; // unknown character
276 std::vector<char> ucs4_to_utf8(std::vector<boost::uint32_t> const & ucs4str)
278 std::vector<char> in;
279 std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
280 std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
281 for (; cit != end; ++cit) {
282 boost::uint32_t s = *cit;
283 in.push_back(static_cast<char>((s & 0xff000000) >> 24));
284 in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
285 in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
286 in.push_back(static_cast<char>(s & 0x000000ff));
288 std::vector<char> res = iconv_convert("UTF-8", "UCS-4BE", in);
293 std::vector<char> ucs4_to_utf8(boost::uint32_t c)
295 std::vector<char> in;
296 in.push_back(static_cast<char>((c & 0xff000000) >> 24));
297 in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
298 in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
299 in.push_back(static_cast<char>(c & 0x000000ff));
300 std::vector<char> res = iconv_convert("UTF-8", "UCS-4BE", in);