/**
 * This file is part of LyX, the document processor.
 * Licence details can be found in the file COPYING.
 *
 * \author Lars Gullik Bjønnes
 *
 * Full author contact details are available in file CREDITS.
 *
 * A collection of unicode conversion functions, using iconv.
 */
29 iconv_convert(std::string const & tocode, std::string const & fromcode,
30 std::vector<char> const & buf)
32 iconv_t cd = iconv_open(tocode.c_str(), fromcode.c_str());
33 if (cd == (iconv_t)(-1)) {
34 lyxerr << "Error returned from iconv_open" << endl;
37 lyxerr << "EINVAL The conversion from " << fromcode
39 << " is not supported by the implementation."
43 lyxerr << "\tSome other error: " << errno << endl;
48 char * inbuf = const_cast<char *>(&buf[0]);
49 size_t inbytesleft = buf.size();
50 char out[1000] = { 0 };
52 size_t outbytesleft = 1000;
54 size_t res = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
56 if (res == (size_t)(-1)) {
57 lyxerr << "Error returned from iconv" << endl;
60 lyxerr << "E2BIG There is not sufficient room at *outbuf." << endl;
63 lyxerr << "EILSEQ An invalid multibyte sequence"
64 << " has been encountered in the input.\n"
65 << "When converting from " << fromcode
66 << " to " << tocode << ".\n";
67 lyxerr << "Input: " << std::hex;
68 for (size_t i = 0; i < buf.size(); ++i) {
69 unsigned char const b = buf[i];
70 lyxerr << "0x" << int(b) << " ";
75 lyxerr << "EINVAL An incomplete multibyte sequence"
76 << " has been encountered in the input.\n"
77 << "When converting from " << fromcode
78 << " to " << tocode << ".\n";
79 lyxerr << "Input: " << std::hex;
80 for (size_t i = 0; i < buf.size(); ++i) {
81 unsigned char const b = buf[i];
82 lyxerr << "0x" << int(b) << " ";
87 lyxerr << "\tSome other error: " << errno << endl;
92 if (iconv_close(cd) == -1) {
93 lyxerr << "Error returned from iconv_close("
94 << errno << ")" << endl;
98 //lyxerr << "Inbytesleft: " << inbytesleft << endl;
99 //lyxerr << "Outbytesleft: " << outbytesleft << endl;
100 int bytes = 1000 - outbytesleft;
102 std::vector<char> outvec(out, out + bytes);
107 std::vector<boost::uint32_t> bytes_to_ucs4(std::vector<char> const & bytes)
109 //lyxerr << "Outbuf =" << std::hex;
111 std::vector<uint32_t> ucs4;
112 for (size_t i = 0; i < bytes.size(); i += 4) {
113 unsigned char const b1 = bytes[i ];
114 unsigned char const b2 = bytes[i + 1];
115 unsigned char const b3 = bytes[i + 2];
116 unsigned char const b4 = bytes[i + 3];
119 char * cc = reinterpret_cast<char *>(&c);
126 lyxerr << "Strange ucs4 value encountered\n";
128 << std::setw(2) << std::setfill('0') << int(b1)
129 << std::setw(2) << std::setfill('0') << int(b2)
130 << std::setw(2) << std::setfill('0') << int(b3)
131 << std::setw(2) << std::setfill('0') << int(b4)
/**
 * Assemble UCS-2 code units from a stream of raw bytes.
 *
 * \param bytes  raw bytes, two per code unit
 * \return the decoded code units. A trailing single byte cannot form
 *         a code unit and is ignored.
 */
std::vector<unsigned short> bytes_to_ucs2(std::vector<char> const & bytes)
{
	// Only complete pairs are decoded, so bytes[2 * i + 1] is always
	// inside the vector.
	size_t const count = bytes.size() / 2;

	std::vector<unsigned short> ucs2;
	ucs2.reserve(count);
	for (size_t i = 0; i < count; ++i) {
		unsigned short c = 0;
		char * cc = reinterpret_cast<char *>(&c);
		// NOTE(review): the two bytes are copied in storage order,
		// i.e. the input is taken to be in host byte order --
		// confirm against what iconv emits for "UCS-2".
		cc[0] = bytes[2 * i];
		cc[1] = bytes[2 * i + 1];

		ucs2.push_back(c);
	}
	return ucs2;
}
176 std::vector<boost::uint32_t> utf8_to_ucs4(std::vector<char> const & utf8str)
178 //lyxerr << "Buff = " << string(utf8str.begin(), utf8str.end())
179 // << " (" << utf8str.size() << ")" << endl;
180 //lyxerr << "Res = " << string(res.begin(), res.end())
181 // << " (" << res.size() << ")" << endl;
183 std::vector<char> res = iconv_convert("UCS-4", "UTF-8", utf8str);
184 return bytes_to_ucs4(res);
188 std::vector<boost::uint32_t>
189 ucs2_to_ucs4(std::vector<unsigned short> const & ucs2str)
191 // TODO: Simplify and speed up.
192 std::vector<char> in;
193 std::vector<unsigned short>::const_iterator cit = ucs2str.begin();
194 std::vector<unsigned short>::const_iterator end = ucs2str.end();
195 //lyxerr << std::hex;
196 for (; cit != end; ++cit) {
197 unsigned short s = *cit;
198 in.push_back(static_cast<char>(s & 0x00ff));
199 in.push_back(static_cast<char>((s & 0xff00) >> 8));
200 lyxerr << std::setw(2) << std::setfill('0') << (s & 0x00ff) << endl;
201 lyxerr << std::setw(2) << std::setfill('0') << ((s & 0xff00) >> 8) << endl;
204 std::vector<char> res = iconv_convert("UCS-4", "UCS-2", in);
205 return bytes_to_ucs4(res);
209 std::vector<unsigned short>
210 ucs4_to_ucs2(std::vector<boost::uint32_t> const & ucs4str)
212 std::vector<char> in;
213 std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
214 std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
215 for (; cit != end; ++cit) {
216 boost::uint32_t s = *cit;
217 in.push_back(static_cast<char>((s & 0xff000000) >> 24));
218 in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
219 in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
220 in.push_back(static_cast<char>(s & 0x000000ff));
222 std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
223 return bytes_to_ucs2(res);
227 std::vector<unsigned short>
228 ucs4_to_ucs2(boost::uint32_t const * s, size_t ls)
230 std::vector<char> in;
231 for (size_t i = 0; i < ls; ++i) {
232 in.push_back(static_cast<char>((s[i] & 0xff000000) >> 24));
233 in.push_back(static_cast<char>((s[i] & 0x00ff0000) >> 16));
234 in.push_back(static_cast<char>((s[i] & 0x0000ff00) >> 8));
235 in.push_back(static_cast<char>(s[i] & 0x000000ff));
237 std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
238 return bytes_to_ucs2(res);
243 ucs4_to_ucs2(boost::uint32_t c)
245 std::vector<char> in;
246 in.push_back(static_cast<char>((c & 0xff000000) >> 24));
247 in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
248 in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
249 in.push_back(static_cast<char>(c & 0x000000ff));
250 std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
251 std::vector<unsigned short> us = bytes_to_ucs2(res);
255 return 0xfffd; // unknown character
259 std::vector<char> ucs4_to_utf8(std::vector<boost::uint32_t> const & ucs4str)
261 std::vector<char> in;
262 std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
263 std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
264 for (; cit != end; ++cit) {
265 boost::uint32_t s = *cit;
266 in.push_back(static_cast<char>((s & 0xff000000) >> 24));
267 in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
268 in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
269 in.push_back(static_cast<char>(s & 0x000000ff));
271 std::vector<char> res = iconv_convert("UTF-8", "UCS-4", in);
276 std::vector<char> ucs4_to_utf8(boost::uint32_t c)
278 std::vector<char> in;
279 in.push_back(static_cast<char>((c & 0xff000000) >> 24));
280 in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
281 in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
282 in.push_back(static_cast<char>(c & 0x000000ff));
283 std::vector<char> res = iconv_convert("UTF-8", "UCS-4", in);