]> git.lyx.org Git - lyx.git/blob - src/support/unicode.C
Fix byte order problems
[lyx.git] / src / support / unicode.C
1 /**
2  * \file unicode.C
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author Lars Gullik Bjønnes
7  *
8  * Full author contact details are available in file CREDITS.
9  *
10  * A collection of unicode conversion functions, using iconv.
11  */
12
13 #include <config.h>
14
15 #include "unicode.h"
16
17 #include "debug.h"
18
19 #include <iconv.h>
20
21 #include <cerrno>
22 #include <iomanip>
23 #include <string>
24
25 using std::endl;
26 using std::string;
27
28 namespace {
29
30 std::vector<char>
31 iconv_convert(std::string const & tocode, std::string const & fromcode,
32               std::vector<char> const & buf)
33 {
34         if (buf.empty())
35                 return std::vector<char>();
36
37         iconv_t cd = iconv_open(tocode.c_str(), fromcode.c_str());
38         if (cd == (iconv_t)(-1)) {
39                 lyxerr << "Error returned from iconv_open" << endl;
40                 switch (errno) {
41                 case EINVAL:
42                         lyxerr << "EINVAL The conversion from " << fromcode
43                                << " to " << tocode
44                                << " is not supported by the implementation."
45                                << endl;
46                         break;
47                 default:
48                         lyxerr << "\tSome other error: " << errno << endl;
49                         break;
50                 }
51         }
52
53         char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(&buf[0]);
54         size_t inbytesleft = buf.size();
55         static char out[1000];
56         char * outbuf = out;
57         size_t outbytesleft = 1000;
58
59         size_t res = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
60
61         if (res == (size_t)(-1)) {
62                 lyxerr << "Error returned from iconv" << endl;
63                 switch (errno) {
64                 case E2BIG:
65                         lyxerr << "E2BIG  There is not sufficient room at *outbuf." << endl;
66                         break;
67                 case EILSEQ:
68                         lyxerr << "EILSEQ An invalid multibyte sequence"
69                                << " has been encountered in the input.\n"
70                                << "When converting from " << fromcode
71                                << " to " << tocode << ".\n";
72                         lyxerr << "Input: " << std::hex;
73                         for (size_t i = 0; i < buf.size(); ++i) {
74                                 unsigned char const b = buf[i];
75                                 lyxerr << "0x" << int(b) << " ";
76                         }
77                         lyxerr << endl;
78                         break;
79                 case EINVAL:
80                         lyxerr << "EINVAL An incomplete multibyte sequence"
81                                << " has been encountered in the input.\n"
82                                << "When converting from " << fromcode
83                                << " to " << tocode << ".\n";
84                         lyxerr << "Input: " << std::hex;
85                         for (size_t i = 0; i < buf.size(); ++i) {
86                                 unsigned char const b = buf[i];
87                                 lyxerr << "0x" << int(b) << " ";
88                         }
89                         lyxerr << endl;
90                         break;
91                 default:
92                         lyxerr << "\tSome other error: " << errno << endl;
93                         break;
94                 }
95         }
96
97         if (iconv_close(cd) == -1) {
98                 lyxerr << "Error returned from iconv_close("
99                        << errno << ")" << endl;
100         }
101
102         //lyxerr << std::dec;
103         //lyxerr << "Inbytesleft: " << inbytesleft << endl;
104         //lyxerr << "Outbytesleft: " << outbytesleft << endl;
105         int bytes = 1000 - outbytesleft;
106
107         std::vector<char> outvec(out, out + bytes);
108         return outvec;
109 }
110
111
112 std::vector<boost::uint32_t> bytes_to_ucs4(std::vector<char> const & bytes)
113 {
114         //lyxerr << "Outbuf =" << std::hex;
115
116         std::vector<boost::uint32_t> ucs4;
117         for (size_t i = 0; i < bytes.size(); i += 4) {
118                 unsigned char const b1 = bytes[i    ];
119                 unsigned char const b2 = bytes[i + 1];
120                 unsigned char const b3 = bytes[i + 2];
121                 unsigned char const b4 = bytes[i + 3];
122
123                 boost::uint32_t c;
124                 char * cc = reinterpret_cast<char *>(&c);
125 #ifdef WORDS_BIGENDIAN
126                 cc[0] = b1;
127                 cc[1] = b2;
128                 cc[2] = b3;
129                 cc[3] = b4;
130 #else
131                 cc[3] = b1;
132                 cc[2] = b2;
133                 cc[1] = b3;
134                 cc[0] = b4;
135 #endif
136
137                 if (c > 0xffff) {
138                         lyxerr << "Strange ucs4 value encountered\n";
139                         lyxerr << "0x"
140                                << std::setw(2) << std::setfill('0') << int(b1)
141                                << std::setw(2) << std::setfill('0') << int(b2)
142                                << std::setw(2) << std::setfill('0') << int(b3)
143                                << std::setw(2) << std::setfill('0') << int(b4)
144                                << ' '
145                                << "(0x"
146                                << c
147                                << ") ";
148                 }
149
150                 ucs4.push_back(c);
151         }
152         //lyxerr << endl;
153         return ucs4;
154 }
155
156
157 std::vector<unsigned short> bytes_to_ucs2(std::vector<char> const & bytes)
158 {
159         //lyxerr << "Outbuf =" << std::hex;
160
161         std::vector<unsigned short> ucs2;
162         for (size_t i = 0; i < bytes.size(); i += 2) {
163                 unsigned char const b1 = bytes[i    ];
164                 unsigned char const b2 = bytes[i + 1];
165
166                 unsigned short c;
167                 char * cc = reinterpret_cast<char *>(&c);
168 #ifdef WORDS_BIGENDIAN
169                 cc[0] = b1;
170                 cc[1] = b2;
171 #else
172                 cc[1] = b1;
173                 cc[0] = b2;
174 #endif
175
176                 //lyxerr << "0x"
177                 //       << std::setw(2) << std::setfill('0') << int(b2)
178                 //       << std::setw(2) << std::setfill('0') << int(b1)
179                 //       << ' '
180                 //       << "(0x"
181                 //       << c
182                 //       << ") ";
183
184                 ucs2.push_back(c);
185         }
186         //lyxerr << endl;
187         return ucs2;
188 }
189
190 } // anon namespace
191
192
193 std::vector<boost::uint32_t> utf8_to_ucs4(std::vector<char> const & utf8str)
194 {
195         //lyxerr << "Buff = " << string(utf8str.begin(), utf8str.end())
196         //       << " (" << utf8str.size() << ")" << endl;
197         //lyxerr << "Res = " << string(res.begin(), res.end())
198         //       << " (" << res.size() << ")" << endl;
199
200         std::vector<char> res = iconv_convert("UCS-4BE", "UTF-8", utf8str);
201         return bytes_to_ucs4(res);
202 }
203
204
205 std::vector<boost::uint32_t>
206 ucs2_to_ucs4(std::vector<unsigned short> const & ucs2str)
207 {
208         // TODO: Simplify and speed up.
209         std::vector<char> in;
210         std::vector<unsigned short>::const_iterator cit = ucs2str.begin();
211         std::vector<unsigned short>::const_iterator end = ucs2str.end();
212         //lyxerr << std::hex;
213         for (; cit != end; ++cit) {
214                 unsigned short s = *cit;
215                 in.push_back(static_cast<char>((s & 0xff00) >> 8));
216                 in.push_back(static_cast<char>(s & 0x00ff));
217                 lyxerr << std::setw(2) << std::setfill('0') << ((s & 0xff00) >> 8) << endl;
218                 lyxerr << std::setw(2) << std::setfill('0') << (s & 0x00ff) << endl;
219         }
220
221         std::vector<char> res = iconv_convert("UCS-4BE", "UCS-2BE", in);
222         return bytes_to_ucs4(res);
223 }
224
225
226 std::vector<unsigned short>
227 ucs4_to_ucs2(std::vector<boost::uint32_t> const & ucs4str)
228 {
229         std::vector<char> in;
230         std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
231         std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
232         for (; cit != end; ++cit) {
233                 boost::uint32_t s = *cit;
234                 in.push_back(static_cast<char>((s & 0xff000000) >> 24));
235                 in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
236                 in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
237                 in.push_back(static_cast<char>(s & 0x000000ff));
238         }
239         std::vector<char> res = iconv_convert("UCS-2BE", "UCS-4BE", in);
240         return bytes_to_ucs2(res);
241 }
242
243
244 std::vector<unsigned short>
245 ucs4_to_ucs2(boost::uint32_t const * s, size_t ls)
246 {
247         std::vector<char> in;
248         for (size_t i = 0; i < ls; ++i) {
249                 in.push_back(static_cast<char>((s[i] & 0xff000000) >> 24));
250                 in.push_back(static_cast<char>((s[i] & 0x00ff0000) >> 16));
251                 in.push_back(static_cast<char>((s[i] & 0x0000ff00) >> 8));
252                 in.push_back(static_cast<char>(s[i] & 0x000000ff));
253         }
254         std::vector<char> res = iconv_convert("UCS-2BE", "UCS-4BE", in);
255         return bytes_to_ucs2(res);
256 }
257
258
259 unsigned short
260 ucs4_to_ucs2(boost::uint32_t c)
261 {
262         std::vector<char> in;
263         in.push_back(static_cast<char>((c & 0xff000000) >> 24));
264         in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
265         in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
266         in.push_back(static_cast<char>(c & 0x000000ff));
267         std::vector<char> res = iconv_convert("UCS-2BE", "UCS-4BE", in);
268         std::vector<unsigned short> us = bytes_to_ucs2(res);
269         if (!us.empty())
270                 return us[0];
271         else
272                 return 0xfffd; // unknown character
273 }
274
275
276 std::vector<char> ucs4_to_utf8(std::vector<boost::uint32_t> const & ucs4str)
277 {
278         std::vector<char> in;
279         std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
280         std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
281         for (; cit != end; ++cit) {
282                 boost::uint32_t s = *cit;
283                 in.push_back(static_cast<char>((s & 0xff000000) >> 24));
284                 in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
285                 in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
286                 in.push_back(static_cast<char>(s & 0x000000ff));
287         }
288         std::vector<char> res = iconv_convert("UTF-8", "UCS-4BE", in);
289         return res;
290 }
291
292
293 std::vector<char> ucs4_to_utf8(boost::uint32_t c)
294 {
295         std::vector<char> in;
296         in.push_back(static_cast<char>((c & 0xff000000) >> 24));
297         in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
298         in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
299         in.push_back(static_cast<char>(c & 0x000000ff));
300         std::vector<char> res = iconv_convert("UTF-8", "UCS-4BE", in);
301         return res;
302 }