]> git.lyx.org Git - lyx.git/blob - src/support/unicode.C
iconv_convert(): return empty vector in case of empty input
[lyx.git] / src / support / unicode.C
1 /**
2  * \file unicode.C
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author Lars Gullik Bjønnes
7  *
8  * Full author contact details are available in file CREDITS.
9  *
10  * A collection of unicode conversion functions, using iconv.
11  */
12
13 #include <config.h>
14
15 #include "unicode.h"
16
17 #include "debug.h"
18
19 #include <iconv.h>
20
21 #include <cerrno>
22 #include <iomanip>
23 #include <string>
24
25 using std::endl;
26 using std::string;
27
28 namespace {
29
30 std::vector<char>
31 iconv_convert(std::string const & tocode, std::string const & fromcode,
32               std::vector<char> const & buf)
33 {
34         if (buf.empty())
35                 return std::vector<char>();
36
37         iconv_t cd = iconv_open(tocode.c_str(), fromcode.c_str());
38         if (cd == (iconv_t)(-1)) {
39                 lyxerr << "Error returned from iconv_open" << endl;
40                 switch (errno) {
41                 case EINVAL:
42                         lyxerr << "EINVAL The conversion from " << fromcode
43                                << " to " << tocode
44                                << " is not supported by the implementation."
45                                << endl;
46                         break;
47                 default:
48                         lyxerr << "\tSome other error: " << errno << endl;
49                         break;
50                 }
51         }
52
53         char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(&buf[0]);
54         size_t inbytesleft = buf.size();
55         static char out[1000];
56         char * outbuf = out;
57         size_t outbytesleft = 1000;
58
59         size_t res = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
60
61         if (res == (size_t)(-1)) {
62                 lyxerr << "Error returned from iconv" << endl;
63                 switch (errno) {
64                 case E2BIG:
65                         lyxerr << "E2BIG  There is not sufficient room at *outbuf." << endl;
66                         break;
67                 case EILSEQ:
68                         lyxerr << "EILSEQ An invalid multibyte sequence"
69                                << " has been encountered in the input.\n"
70                                << "When converting from " << fromcode
71                                << " to " << tocode << ".\n";
72                         lyxerr << "Input: " << std::hex;
73                         for (size_t i = 0; i < buf.size(); ++i) {
74                                 unsigned char const b = buf[i];
75                                 lyxerr << "0x" << int(b) << " ";
76                         }
77                         lyxerr << endl;
78                         break;
79                 case EINVAL:
80                         lyxerr << "EINVAL An incomplete multibyte sequence"
81                                << " has been encountered in the input.\n"
82                                << "When converting from " << fromcode
83                                << " to " << tocode << ".\n";
84                         lyxerr << "Input: " << std::hex;
85                         for (size_t i = 0; i < buf.size(); ++i) {
86                                 unsigned char const b = buf[i];
87                                 lyxerr << "0x" << int(b) << " ";
88                         }
89                         lyxerr << endl;
90                         break;
91                 default:
92                         lyxerr << "\tSome other error: " << errno << endl;
93                         break;
94                 }
95         }
96
97         if (iconv_close(cd) == -1) {
98                 lyxerr << "Error returned from iconv_close("
99                        << errno << ")" << endl;
100         }
101
102         //lyxerr << std::dec;
103         //lyxerr << "Inbytesleft: " << inbytesleft << endl;
104         //lyxerr << "Outbytesleft: " << outbytesleft << endl;
105         int bytes = 1000 - outbytesleft;
106
107         std::vector<char> outvec(out, out + bytes);
108         return outvec;
109 }
110
111
112 std::vector<boost::uint32_t> bytes_to_ucs4(std::vector<char> const & bytes)
113 {
114         //lyxerr << "Outbuf =" << std::hex;
115
116         std::vector<boost::uint32_t> ucs4;
117         for (size_t i = 0; i < bytes.size(); i += 4) {
118                 unsigned char const b1 = bytes[i    ];
119                 unsigned char const b2 = bytes[i + 1];
120                 unsigned char const b3 = bytes[i + 2];
121                 unsigned char const b4 = bytes[i + 3];
122
123                 boost::uint32_t c;
124                 char * cc = reinterpret_cast<char *>(&c);
125                 cc[3] = b1;
126                 cc[2] = b2;
127                 cc[1] = b3;
128                 cc[0] = b4;
129
130                 if (c > 0xffff) {
131                         lyxerr << "Strange ucs4 value encountered\n";
132                         lyxerr << "0x"
133                                << std::setw(2) << std::setfill('0') << int(b1)
134                                << std::setw(2) << std::setfill('0') << int(b2)
135                                << std::setw(2) << std::setfill('0') << int(b3)
136                                << std::setw(2) << std::setfill('0') << int(b4)
137                                << ' '
138                                << "(0x"
139                                << c
140                                << ") ";
141                 }
142
143                 ucs4.push_back(c);
144         }
145         //lyxerr << endl;
146         return ucs4;
147 }
148
149
/**
 * Assemble 16 bit code units from a byte sequence.
 *
 * Each pair of bytes is copied into the storage of an unsigned short in
 * the order given, i.e. the pair is interpreted in the byte order of
 * the host.
 * NOTE(review): this presumably matches the byte order iconv uses for
 * "UCS-2" on the supported platforms -- confirm.
 *
 * \param bytes raw bytes, two per code unit.
 * \return one value per complete pair of bytes. A trailing single byte
 *         cannot form a code unit and is ignored.
 */
std::vector<unsigned short> bytes_to_ucs2(std::vector<char> const & bytes)
{
        // Only complete pairs can be decoded; reading bytes[i + 1] for a
        // dangling last byte would run past the end of the vector.
        size_t const usable = bytes.size() - bytes.size() % 2;

        std::vector<unsigned short> ucs2;
        ucs2.reserve(usable / 2);
        for (size_t i = 0; i < usable; i += 2) {
                // Zero first, so that no byte of c is left uninitialised
                // should unsigned short be wider than two bytes.
                unsigned short c = 0;
                char * cc = reinterpret_cast<char *>(&c);
                cc[0] = bytes[i];
                cc[1] = bytes[i + 1];

                ucs2.push_back(c);
        }
        return ucs2;
}
177
178 } // anon namespace
179
180
181 std::vector<boost::uint32_t> utf8_to_ucs4(std::vector<char> const & utf8str)
182 {
183         //lyxerr << "Buff = " << string(utf8str.begin(), utf8str.end())
184         //       << " (" << utf8str.size() << ")" << endl;
185         //lyxerr << "Res = " << string(res.begin(), res.end())
186         //       << " (" << res.size() << ")" << endl;
187
188         std::vector<char> res = iconv_convert("UCS-4", "UTF-8", utf8str);
189         return bytes_to_ucs4(res);
190 }
191
192
193 std::vector<boost::uint32_t>
194 ucs2_to_ucs4(std::vector<unsigned short> const & ucs2str)
195 {
196         // TODO: Simplify and speed up.
197         std::vector<char> in;
198         std::vector<unsigned short>::const_iterator cit = ucs2str.begin();
199         std::vector<unsigned short>::const_iterator end = ucs2str.end();
200         //lyxerr << std::hex;
201         for (; cit != end; ++cit) {
202                 unsigned short s = *cit;
203                 in.push_back(static_cast<char>(s & 0x00ff));
204                 in.push_back(static_cast<char>((s & 0xff00) >> 8));
205                 lyxerr << std::setw(2) << std::setfill('0') << (s & 0x00ff) << endl;
206                 lyxerr << std::setw(2) << std::setfill('0') << ((s & 0xff00) >> 8) << endl;
207         }
208
209         std::vector<char> res = iconv_convert("UCS-4", "UCS-2", in);
210         return bytes_to_ucs4(res);
211 }
212
213
214 std::vector<unsigned short>
215 ucs4_to_ucs2(std::vector<boost::uint32_t> const & ucs4str)
216 {
217         std::vector<char> in;
218         std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
219         std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
220         for (; cit != end; ++cit) {
221                 boost::uint32_t s = *cit;
222                 in.push_back(static_cast<char>((s & 0xff000000) >> 24));
223                 in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
224                 in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
225                 in.push_back(static_cast<char>(s & 0x000000ff));
226         }
227         std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
228         return bytes_to_ucs2(res);
229 }
230
231
232 std::vector<unsigned short>
233 ucs4_to_ucs2(boost::uint32_t const * s, size_t ls)
234 {
235         std::vector<char> in;
236         for (size_t i = 0; i < ls; ++i) {
237                 in.push_back(static_cast<char>((s[i] & 0xff000000) >> 24));
238                 in.push_back(static_cast<char>((s[i] & 0x00ff0000) >> 16));
239                 in.push_back(static_cast<char>((s[i] & 0x0000ff00) >> 8));
240                 in.push_back(static_cast<char>(s[i] & 0x000000ff));
241         }
242         std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
243         return bytes_to_ucs2(res);
244 }
245
246
247 unsigned short
248 ucs4_to_ucs2(boost::uint32_t c)
249 {
250         std::vector<char> in;
251         in.push_back(static_cast<char>((c & 0xff000000) >> 24));
252         in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
253         in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
254         in.push_back(static_cast<char>(c & 0x000000ff));
255         std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
256         std::vector<unsigned short> us = bytes_to_ucs2(res);
257         if (!us.empty())
258                 return us[0];
259         else
260                 return 0xfffd; // unknown character
261 }
262
263
264 std::vector<char> ucs4_to_utf8(std::vector<boost::uint32_t> const & ucs4str)
265 {
266         std::vector<char> in;
267         std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
268         std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
269         for (; cit != end; ++cit) {
270                 boost::uint32_t s = *cit;
271                 in.push_back(static_cast<char>((s & 0xff000000) >> 24));
272                 in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
273                 in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
274                 in.push_back(static_cast<char>(s & 0x000000ff));
275         }
276         std::vector<char> res = iconv_convert("UTF-8", "UCS-4", in);
277         return res;
278 }
279
280
281 std::vector<char> ucs4_to_utf8(boost::uint32_t c)
282 {
283         std::vector<char> in;
284         in.push_back(static_cast<char>((c & 0xff000000) >> 24));
285         in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
286         in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
287         in.push_back(static_cast<char>(c & 0x000000ff));
288         std::vector<char> res = iconv_convert("UTF-8", "UCS-4", in);
289         return res;
290 }