]> git.lyx.org Git - lyx.git/blob - src/support/unicode.C
deleted "#include <boost/cstdint.hpp>" as it is already in unicode.h.
[lyx.git] / src / support / unicode.C
1 /**
2  * \file unicode.C
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author Lars Gullik Bjønnes
7  *
8  * Full author contact details are available in file CREDITS.
9  *
10  * A collection of unicode conversion functions, using iconv.
11  */
12
13 #include <config.h>
14
15 #include "unicode.h"
16
17 #include "debug.h"
18
19 #include <iconv.h>
20
21 #include <cerrno>
22 #include <iomanip>
23 #include <string>
24
25 using std::endl;
26 using std::string;
27
28 namespace {
29
30 std::vector<char>
31 iconv_convert(std::string const & tocode, std::string const & fromcode,
32               std::vector<char> const & buf)
33 {
34         iconv_t cd = iconv_open(tocode.c_str(), fromcode.c_str());
35         if (cd == (iconv_t)(-1)) {
36                 lyxerr << "Error returned from iconv_open" << endl;
37                 switch (errno) {
38                 case EINVAL:
39                         lyxerr << "EINVAL The conversion from " << fromcode
40                                << " to " << tocode
41                                << " is not supported by the implementation."
42                                << endl;
43                         break;
44                 default:
45                         lyxerr << "\tSome other error: " << errno << endl;
46                         break;
47                 }
48         }
49
50         char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(&buf[0]);
51         size_t inbytesleft = buf.size();
52         char out[1000] = { 0 };
53         char * outbuf = out;
54         size_t outbytesleft = 1000;
55
56         size_t res = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
57
58         if (res == (size_t)(-1)) {
59                 lyxerr << "Error returned from iconv" << endl;
60                 switch (errno) {
61                 case E2BIG:
62                         lyxerr << "E2BIG  There is not sufficient room at *outbuf." << endl;
63                         break;
64                 case EILSEQ:
65                         lyxerr << "EILSEQ An invalid multibyte sequence"
66                                << " has been encountered in the input.\n"
67                                << "When converting from " << fromcode
68                                << " to " << tocode << ".\n";
69                         lyxerr << "Input: " << std::hex;
70                         for (size_t i = 0; i < buf.size(); ++i) {
71                                 unsigned char const b = buf[i];
72                                 lyxerr << "0x" << int(b) << " ";
73                         }
74                         lyxerr << endl;
75                         break;
76                 case EINVAL:
77                         lyxerr << "EINVAL An incomplete multibyte sequence"
78                                << " has been encountered in the input.\n"
79                                << "When converting from " << fromcode
80                                << " to " << tocode << ".\n";
81                         lyxerr << "Input: " << std::hex;
82                         for (size_t i = 0; i < buf.size(); ++i) {
83                                 unsigned char const b = buf[i];
84                                 lyxerr << "0x" << int(b) << " ";
85                         }
86                         lyxerr << endl;
87                         break;
88                 default:
89                         lyxerr << "\tSome other error: " << errno << endl;
90                         break;
91                 }
92         }
93
94         if (iconv_close(cd) == -1) {
95                 lyxerr << "Error returned from iconv_close("
96                        << errno << ")" << endl;
97         }
98
99         //lyxerr << std::dec;
100         //lyxerr << "Inbytesleft: " << inbytesleft << endl;
101         //lyxerr << "Outbytesleft: " << outbytesleft << endl;
102         int bytes = 1000 - outbytesleft;
103
104         std::vector<char> outvec(out, out + bytes);
105         return outvec;
106 }
107
108
109 std::vector<boost::uint32_t> bytes_to_ucs4(std::vector<char> const & bytes)
110 {
111         //lyxerr << "Outbuf =" << std::hex;
112
113         std::vector<boost::uint32_t> ucs4;
114         for (size_t i = 0; i < bytes.size(); i += 4) {
115                 unsigned char const b1 = bytes[i    ];
116                 unsigned char const b2 = bytes[i + 1];
117                 unsigned char const b3 = bytes[i + 2];
118                 unsigned char const b4 = bytes[i + 3];
119
120                 boost::uint32_t c;
121                 char * cc = reinterpret_cast<char *>(&c);
122                 cc[3] = b1;
123                 cc[2] = b2;
124                 cc[1] = b3;
125                 cc[0] = b4;
126
127                 if (c > 0xffff) {
128                         lyxerr << "Strange ucs4 value encountered\n";
129                         lyxerr << "0x"
130                                << std::setw(2) << std::setfill('0') << int(b1)
131                                << std::setw(2) << std::setfill('0') << int(b2)
132                                << std::setw(2) << std::setfill('0') << int(b3)
133                                << std::setw(2) << std::setfill('0') << int(b4)
134                                << ' '
135                                << "(0x"
136                                << c
137                                << ") ";
138                 }
139
140                 ucs4.push_back(c);
141         }
142         //lyxerr << endl;
143         return ucs4;
144 }
145
146
/**
 * Assemble 16 bit values from a byte sequence, two bytes per value.
 * The two bytes are copied into the unsigned short in memory order,
 * i.e. they are interpreted in the byte order of the host.
 *
 * \param bytes the raw bytes, normally the output of iconv_convert().
 * \return one value per complete pair of bytes. A single trailing byte
 *         that does not form a complete pair is ignored.
 */
std::vector<unsigned short> bytes_to_ucs2(std::vector<char> const & bytes)
{
	std::vector<unsigned short> ucs2;
	ucs2.reserve(bytes.size() / 2);

	// "i + 2 <= size" instead of "i < size": never read past the end
	// when the number of bytes is odd.
	for (size_t i = 0; i + 2 <= bytes.size(); i += 2) {
		unsigned short c = 0;
		// Accessing the object representation through char * is
		// allowed for any type.
		char * cc = reinterpret_cast<char *>(&c);
		cc[0] = bytes[i];
		cc[1] = bytes[i + 1];

		ucs2.push_back(c);
	}
	return ucs2;
}
174
175 } // anon namespace
176
177
178 std::vector<boost::uint32_t> utf8_to_ucs4(std::vector<char> const & utf8str)
179 {
180         //lyxerr << "Buff = " << string(utf8str.begin(), utf8str.end())
181         //       << " (" << utf8str.size() << ")" << endl;
182         //lyxerr << "Res = " << string(res.begin(), res.end())
183         //       << " (" << res.size() << ")" << endl;
184
185         std::vector<char> res = iconv_convert("UCS-4", "UTF-8", utf8str);
186         return bytes_to_ucs4(res);
187 }
188
189
190 std::vector<boost::uint32_t>
191 ucs2_to_ucs4(std::vector<unsigned short> const & ucs2str)
192 {
193         // TODO: Simplify and speed up.
194         std::vector<char> in;
195         std::vector<unsigned short>::const_iterator cit = ucs2str.begin();
196         std::vector<unsigned short>::const_iterator end = ucs2str.end();
197         //lyxerr << std::hex;
198         for (; cit != end; ++cit) {
199                 unsigned short s = *cit;
200                 in.push_back(static_cast<char>(s & 0x00ff));
201                 in.push_back(static_cast<char>((s & 0xff00) >> 8));
202                 lyxerr << std::setw(2) << std::setfill('0') << (s & 0x00ff) << endl;
203                 lyxerr << std::setw(2) << std::setfill('0') << ((s & 0xff00) >> 8) << endl;
204         }
205
206         std::vector<char> res = iconv_convert("UCS-4", "UCS-2", in);
207         return bytes_to_ucs4(res);
208 }
209
210
211 std::vector<unsigned short>
212 ucs4_to_ucs2(std::vector<boost::uint32_t> const & ucs4str)
213 {
214         std::vector<char> in;
215         std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
216         std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
217         for (; cit != end; ++cit) {
218                 boost::uint32_t s = *cit;
219                 in.push_back(static_cast<char>((s & 0xff000000) >> 24));
220                 in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
221                 in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
222                 in.push_back(static_cast<char>(s & 0x000000ff));
223         }
224         std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
225         return bytes_to_ucs2(res);
226 }
227
228
229 std::vector<unsigned short>
230 ucs4_to_ucs2(boost::uint32_t const * s, size_t ls)
231 {
232         std::vector<char> in;
233         for (size_t i = 0; i < ls; ++i) {
234                 in.push_back(static_cast<char>((s[i] & 0xff000000) >> 24));
235                 in.push_back(static_cast<char>((s[i] & 0x00ff0000) >> 16));
236                 in.push_back(static_cast<char>((s[i] & 0x0000ff00) >> 8));
237                 in.push_back(static_cast<char>(s[i] & 0x000000ff));
238         }
239         std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
240         return bytes_to_ucs2(res);
241 }
242
243
244 unsigned short
245 ucs4_to_ucs2(boost::uint32_t c)
246 {
247         std::vector<char> in;
248         in.push_back(static_cast<char>((c & 0xff000000) >> 24));
249         in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
250         in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
251         in.push_back(static_cast<char>(c & 0x000000ff));
252         std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
253         std::vector<unsigned short> us = bytes_to_ucs2(res);
254         if (!us.empty())
255                 return us[0];
256         else
257                 return 0xfffd; // unknown character
258 }
259
260
261 std::vector<char> ucs4_to_utf8(std::vector<boost::uint32_t> const & ucs4str)
262 {
263         std::vector<char> in;
264         std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
265         std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
266         for (; cit != end; ++cit) {
267                 boost::uint32_t s = *cit;
268                 in.push_back(static_cast<char>((s & 0xff000000) >> 24));
269                 in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
270                 in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
271                 in.push_back(static_cast<char>(s & 0x000000ff));
272         }
273         std::vector<char> res = iconv_convert("UTF-8", "UCS-4", in);
274         return res;
275 }
276
277
278 std::vector<char> ucs4_to_utf8(boost::uint32_t c)
279 {
280         std::vector<char> in;
281         in.push_back(static_cast<char>((c & 0xff000000) >> 24));
282         in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
283         in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
284         in.push_back(static_cast<char>(c & 0x000000ff));
285         std::vector<char> res = iconv_convert("UTF-8", "UCS-4", in);
286         return res;
287 }