]> git.lyx.org Git - lyx.git/blob - src/support/unicode.C
Windows compilation fixes.
[lyx.git] / src / support / unicode.C
1 /**
2  * \file unicode.C
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author Lars Gullik Bjønnes
7  *
8  * Full author contact details are available in file CREDITS.
9  *
10  * A collection of unicode conversion functions, using iconv.
11  */
12
13 #include <config.h>
14
15 #include "unicode.h"
16
17 #include "debug.h"
18
19 #include <boost/cstdint.hpp>
20
21 #include <iconv.h>
22
23 #include <cerrno>
24 #include <iomanip>
25 #include <string>
26
27 using std::endl;
28 using std::string;
29
30 namespace {
31
32 std::vector<char>
33 iconv_convert(std::string const & tocode, std::string const & fromcode,
34               std::vector<char> const & buf)
35 {
36         iconv_t cd = iconv_open(tocode.c_str(), fromcode.c_str());
37         if (cd == (iconv_t)(-1)) {
38                 lyxerr << "Error returned from iconv_open" << endl;
39                 switch (errno) {
40                 case EINVAL:
41                         lyxerr << "EINVAL The conversion from " << fromcode
42                                << " to " << tocode
43                                << " is not supported by the implementation."
44                                << endl;
45                         break;
46                 default:
47                         lyxerr << "\tSome other error: " << errno << endl;
48                         break;
49                 }
50         }
51
52         char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(&buf[0]);
53         size_t inbytesleft = buf.size();
54         char out[1000] = { 0 };
55         char * outbuf = out;
56         size_t outbytesleft = 1000;
57
58         size_t res = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
59
60         if (res == (size_t)(-1)) {
61                 lyxerr << "Error returned from iconv" << endl;
62                 switch (errno) {
63                 case E2BIG:
64                         lyxerr << "E2BIG  There is not sufficient room at *outbuf." << endl;
65                         break;
66                 case EILSEQ:
67                         lyxerr << "EILSEQ An invalid multibyte sequence"
68                                << " has been encountered in the input.\n"
69                                << "When converting from " << fromcode
70                                << " to " << tocode << ".\n";
71                         lyxerr << "Input: " << std::hex;
72                         for (size_t i = 0; i < buf.size(); ++i) {
73                                 unsigned char const b = buf[i];
74                                 lyxerr << "0x" << int(b) << " ";
75                         }
76                         lyxerr << endl;
77                         break;
78                 case EINVAL:
79                         lyxerr << "EINVAL An incomplete multibyte sequence"
80                                << " has been encountered in the input.\n"
81                                << "When converting from " << fromcode
82                                << " to " << tocode << ".\n";
83                         lyxerr << "Input: " << std::hex;
84                         for (size_t i = 0; i < buf.size(); ++i) {
85                                 unsigned char const b = buf[i];
86                                 lyxerr << "0x" << int(b) << " ";
87                         }
88                         lyxerr << endl;
89                         break;
90                 default:
91                         lyxerr << "\tSome other error: " << errno << endl;
92                         break;
93                 }
94         }
95
96         if (iconv_close(cd) == -1) {
97                 lyxerr << "Error returned from iconv_close("
98                        << errno << ")" << endl;
99         }
100
101         //lyxerr << std::dec;
102         //lyxerr << "Inbytesleft: " << inbytesleft << endl;
103         //lyxerr << "Outbytesleft: " << outbytesleft << endl;
104         int bytes = 1000 - outbytesleft;
105
106         std::vector<char> outvec(out, out + bytes);
107         return outvec;
108 }
109
110
111 std::vector<boost::uint32_t> bytes_to_ucs4(std::vector<char> const & bytes)
112 {
113         //lyxerr << "Outbuf =" << std::hex;
114
115         std::vector<boost::uint32_t> ucs4;
116         for (size_t i = 0; i < bytes.size(); i += 4) {
117                 unsigned char const b1 = bytes[i    ];
118                 unsigned char const b2 = bytes[i + 1];
119                 unsigned char const b3 = bytes[i + 2];
120                 unsigned char const b4 = bytes[i + 3];
121
122                 boost::uint32_t c;
123                 char * cc = reinterpret_cast<char *>(&c);
124                 cc[3] = b1;
125                 cc[2] = b2;
126                 cc[1] = b3;
127                 cc[0] = b4;
128
129                 if (c > 0xffff) {
130                         lyxerr << "Strange ucs4 value encountered\n";
131                         lyxerr << "0x"
132                                << std::setw(2) << std::setfill('0') << int(b1)
133                                << std::setw(2) << std::setfill('0') << int(b2)
134                                << std::setw(2) << std::setfill('0') << int(b3)
135                                << std::setw(2) << std::setfill('0') << int(b4)
136                                << ' '
137                                << "(0x"
138                                << c
139                                << ") ";
140                 }
141
142                 ucs4.push_back(c);
143         }
144         //lyxerr << endl;
145         return ucs4;
146 }
147
148
/**
 * Assemble a byte stream into 16-bit code units.
 *
 * Each pair of consecutive bytes is copied, in the order given, into the
 * storage of one unsigned short (i.e. the bytes are taken to be in host
 * byte order). A trailing single byte that does not form a complete pair
 * is ignored.
 *
 * \param bytes the raw byte stream.
 * \return one value per complete two-byte group.
 */
std::vector<unsigned short> bytes_to_ucs2(std::vector<char> const & bytes)
{
        // Only whole pairs can be decoded; stopping here avoids reading
        // past the end of the vector when the size is odd.
        size_t const usable = bytes.size() - bytes.size() % 2;

        std::vector<unsigned short> ucs2;
        ucs2.reserve(usable / 2);
        for (size_t i = 0; i < usable; i += 2) {
                // Zero-initialised so that no byte of c is left
                // indeterminate.
                unsigned short c = 0;
                char * cc = reinterpret_cast<char *>(&c);
                cc[0] = bytes[i    ];
                cc[1] = bytes[i + 1];

                ucs2.push_back(c);
        }
        return ucs2;
}
176
177 } // anon namespace
178
179
180 std::vector<boost::uint32_t> utf8_to_ucs4(std::vector<char> const & utf8str)
181 {
182         //lyxerr << "Buff = " << string(utf8str.begin(), utf8str.end())
183         //       << " (" << utf8str.size() << ")" << endl;
184         //lyxerr << "Res = " << string(res.begin(), res.end())
185         //       << " (" << res.size() << ")" << endl;
186
187         std::vector<char> res = iconv_convert("UCS-4", "UTF-8", utf8str);
188         return bytes_to_ucs4(res);
189 }
190
191
192 std::vector<boost::uint32_t>
193 ucs2_to_ucs4(std::vector<unsigned short> const & ucs2str)
194 {
195         // TODO: Simplify and speed up.
196         std::vector<char> in;
197         std::vector<unsigned short>::const_iterator cit = ucs2str.begin();
198         std::vector<unsigned short>::const_iterator end = ucs2str.end();
199         //lyxerr << std::hex;
200         for (; cit != end; ++cit) {
201                 unsigned short s = *cit;
202                 in.push_back(static_cast<char>(s & 0x00ff));
203                 in.push_back(static_cast<char>((s & 0xff00) >> 8));
204                 lyxerr << std::setw(2) << std::setfill('0') << (s & 0x00ff) << endl;
205                 lyxerr << std::setw(2) << std::setfill('0') << ((s & 0xff00) >> 8) << endl;
206         }
207
208         std::vector<char> res = iconv_convert("UCS-4", "UCS-2", in);
209         return bytes_to_ucs4(res);
210 }
211
212
213 std::vector<unsigned short>
214 ucs4_to_ucs2(std::vector<boost::uint32_t> const & ucs4str)
215 {
216         std::vector<char> in;
217         std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
218         std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
219         for (; cit != end; ++cit) {
220                 boost::uint32_t s = *cit;
221                 in.push_back(static_cast<char>((s & 0xff000000) >> 24));
222                 in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
223                 in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
224                 in.push_back(static_cast<char>(s & 0x000000ff));
225         }
226         std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
227         return bytes_to_ucs2(res);
228 }
229
230
231 std::vector<unsigned short>
232 ucs4_to_ucs2(boost::uint32_t const * s, size_t ls)
233 {
234         std::vector<char> in;
235         for (size_t i = 0; i < ls; ++i) {
236                 in.push_back(static_cast<char>((s[i] & 0xff000000) >> 24));
237                 in.push_back(static_cast<char>((s[i] & 0x00ff0000) >> 16));
238                 in.push_back(static_cast<char>((s[i] & 0x0000ff00) >> 8));
239                 in.push_back(static_cast<char>(s[i] & 0x000000ff));
240         }
241         std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
242         return bytes_to_ucs2(res);
243 }
244
245
246 unsigned short
247 ucs4_to_ucs2(boost::uint32_t c)
248 {
249         std::vector<char> in;
250         in.push_back(static_cast<char>((c & 0xff000000) >> 24));
251         in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
252         in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
253         in.push_back(static_cast<char>(c & 0x000000ff));
254         std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
255         std::vector<unsigned short> us = bytes_to_ucs2(res);
256         if (!us.empty())
257                 return us[0];
258         else
259                 return 0xfffd; // unknown character
260 }
261
262
263 std::vector<char> ucs4_to_utf8(std::vector<boost::uint32_t> const & ucs4str)
264 {
265         std::vector<char> in;
266         std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
267         std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
268         for (; cit != end; ++cit) {
269                 boost::uint32_t s = *cit;
270                 in.push_back(static_cast<char>((s & 0xff000000) >> 24));
271                 in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
272                 in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
273                 in.push_back(static_cast<char>(s & 0x000000ff));
274         }
275         std::vector<char> res = iconv_convert("UTF-8", "UCS-4", in);
276         return res;
277 }
278
279
280 std::vector<char> ucs4_to_utf8(boost::uint32_t c)
281 {
282         std::vector<char> in;
283         in.push_back(static_cast<char>((c & 0xff000000) >> 24));
284         in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
285         in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
286         in.push_back(static_cast<char>(c & 0x000000ff));
287         std::vector<char> res = iconv_convert("UTF-8", "UCS-4", in);
288         return res;
289 }