]> git.lyx.org Git - lyx.git/blob - src/support/unicode.C
final compilation: wheel was already invented :(
[lyx.git] / src / support / unicode.C
1 /**
2  * \file unicode.C
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author Lars Gullik Bjønnes
7  *
8  * Full author contact details are available in file CREDITS.
9  *
10  * A collection of unicode conversion functions, using iconv.
11  */
12
13 #include <config.h>
14
15 #include "unicode.h"
16
17 #include "debug.h"
18
19 #include <iconv.h>
20
21 #include <cerrno>
22 #include <iomanip>
23 #include <map>
24
25 using std::endl;
26
namespace {

/// iconv name of UTF-16 in the byte order of the host.
/// "UTF-16BE" / "UTF-16LE" are the canonical spellings; the former
/// "UTF16-BE" / "UTF16-LE" spellings are not among the aliases listed
/// by glibc or GNU libiconv.
#ifdef WORDS_BIGENDIAN
	char const * utf16_codeset = "UTF-16BE";
#else
	char const * utf16_codeset = "UTF-16LE";
#endif

}
36
37
38 namespace lyx {
39
/// iconv name of UCS-4 in the byte order of the host.
char const * ucs4_codeset =
#ifdef WORDS_BIGENDIAN
	"UCS-4BE";
#else
	"UCS-4LE";
#endif

/// Descriptor value meaning "no conversion open"; iconv_open() results
/// are compared against it to detect failure.
static const iconv_t invalid_cd = (iconv_t)(-1);
47
48
49 struct IconvProcessor::Private {
50         Private(): cd(invalid_cd) {}
51         ~Private()
52         {
53                 if (cd != invalid_cd) {
54                         if (iconv_close(cd) == -1) {
55                                 lyxerr << "Error returned from iconv_close("
56                                        << errno << ")" << endl;
57                         }
58                 }
59         }
60         iconv_t cd;
61 };
62
63
64 IconvProcessor::IconvProcessor(char const * tocode,
65                 char const * fromcode): tocode_(tocode), fromcode_(fromcode),
66                 pimpl_(new IconvProcessor::Private)
67 {
68 }
69
70
71 IconvProcessor::IconvProcessor(IconvProcessor const & other)
72         : tocode_(other.tocode_), fromcode_(other.fromcode_),
73           pimpl_(new IconvProcessor::Private)
74 {
75 }
76
77
78 IconvProcessor & IconvProcessor::operator=(IconvProcessor const & other)
79 {
80         if (&other == this)
81                 return *this;
82         tocode_ = other.tocode_;
83         fromcode_ = other.fromcode_;
84         pimpl_.reset(new Private);
85         return *this;
86 }
87
88
89 IconvProcessor::~IconvProcessor() {}
90
91
92 bool IconvProcessor::init()
93 {
94         if (pimpl_->cd != invalid_cd)
95                 return true;
96
97         pimpl_->cd = iconv_open(tocode_.c_str(), fromcode_.c_str());
98         if (pimpl_->cd != invalid_cd)
99                 return true;
100
101         lyxerr << "Error returned from iconv_open" << endl;
102         switch (errno) {
103                 case EINVAL:
104                         lyxerr << "EINVAL The conversion from " << fromcode_
105                                 << " to " << tocode_
106                                 << " is not supported by the implementation."
107                                 << endl;
108                         break;
109                 default:
110                         lyxerr << "\tSome other error: " << errno << endl;
111                         break;
112         }
113         return false;
114 }
115
116
117 int IconvProcessor::convert(char const * buf, size_t buflen,
118                 char * outbuf, size_t maxoutsize)
119 {
120         if (buflen == 0)
121                 return 0;
122
123         if (pimpl_->cd == invalid_cd) {
124                 if (!init())
125                         return -1;
126         }
127
128         char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(buf);
129         size_t inbytesleft = buflen;
130         size_t outbytesleft = maxoutsize;
131
132         int res = iconv(pimpl_->cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
133
134         //lyxerr << std::dec;
135         //lyxerr << "Inbytesleft: " << inbytesleft << endl;
136         //lyxerr << "Outbytesleft: " << outbytesleft << endl;
137
138         if (res != -1)
139                 // Everything went well.
140                 return maxoutsize - outbytesleft;
141
142         // There are some errors in the conversion
143         lyxerr << "Error returned from iconv" << endl;
144         switch (errno) {
145                 case E2BIG:
146                         lyxerr << "E2BIG  There is not sufficient room at *outbuf." << endl;
147                         break;
148                 case EILSEQ:
149                         lyxerr << "EILSEQ An invalid multibyte sequence"
150                                 << " has been encountered in the input.\n"
151                                 << "When converting from " << fromcode_
152                                 << " to " << tocode_ << ".\n";
153                         lyxerr << "Input:" << std::hex;
154                         for (size_t i = 0; i < buflen; ++i) {
155                                 // char may be signed, avoid output of
156                                 // something like 0xffffffc2
157                                 boost::uint32_t const b =
158                                         *reinterpret_cast<unsigned char const *>(buf + i);
159                                 lyxerr << " 0x" << b;
160                         }
161                         lyxerr << endl;
162                         break;
163                 case EINVAL:
164                         lyxerr << "EINVAL An incomplete multibyte sequence"
165                                 << " has been encountered in the input.\n"
166                                 << "When converting from " << fromcode_
167                                 << " to " << tocode_ << ".\n";
168                         lyxerr << "Input:" << std::hex;
169                         for (size_t i = 0; i < buflen; ++i) {
170                                 // char may be signed, avoid output of
171                                 // something like 0xffffffc2
172                                 boost::uint32_t const b =
173                                         *reinterpret_cast<unsigned char const *>(buf + i);
174                                 lyxerr << " 0x" << b;
175                         }
176                         lyxerr << endl;
177                         break;
178                 default:
179                         lyxerr << "\tSome other error: " << errno << endl;
180                         break;
181         }
182         // We got an error so we close down the conversion engine
183         if (iconv_close(pimpl_->cd) == -1) {
184                 lyxerr << "Error returned from iconv_close("
185                         << errno << ")" << endl;
186         }
187         pimpl_->cd = invalid_cd;
188         return -1;
189 }
190
191
192 namespace {
193
194
195 template<typename RetType, typename InType>
196 std::vector<RetType>
197 iconv_convert(IconvProcessor & processor,
198               InType const * buf,
199               size_t buflen)
200 {
201         if (buflen == 0)
202                 return std::vector<RetType>();
203
204         char const * inbuf = reinterpret_cast<char const *>(buf);
205         size_t inbytesleft = buflen * sizeof(InType);
206
207         size_t const outsize = 32768;
208         static char out[outsize];
209         char * outbuf = out;
210
211         int bytes = processor.convert(inbuf, inbytesleft, outbuf, outsize);
212         if (bytes <= 0)
213                 // Conversion failed
214                 // FIXME Maybe throw an exception and handle that in the caller?
215                 return std::vector<RetType>();
216
217         RetType const * tmp = reinterpret_cast<RetType const *>(out);
218         return std::vector<RetType>(tmp, tmp + bytes / sizeof(RetType));
219 }
220
221 } // anon namespace
222
223
224 std::vector<lyx::char_type> utf8_to_ucs4(std::vector<char> const & utf8str)
225 {
226         if (utf8str.empty())
227                 return std::vector<lyx::char_type>();
228
229         return utf8_to_ucs4(&utf8str[0], utf8str.size());
230 }
231
232
233 std::vector<lyx::char_type>
234 utf8_to_ucs4(char const * utf8str, size_t ls)
235 {
236         static IconvProcessor processor(ucs4_codeset, "UTF-8");
237         return iconv_convert<lyx::char_type>(processor, utf8str, ls);
238 }
239
240
241 std::vector<char_type>
242 utf16_to_ucs4(unsigned short const * s, size_t ls)
243 {
244         static IconvProcessor processor(ucs4_codeset, utf16_codeset);
245         return iconv_convert<char_type>(processor, s, ls);
246 }
247
248
249 std::vector<unsigned short>
250 ucs4_to_utf16(char_type const * s, size_t ls)
251 {
252         static IconvProcessor processor(utf16_codeset, ucs4_codeset);
253         return iconv_convert<unsigned short>(processor, s, ls);
254 }
255
256
257 std::vector<char>
258 ucs4_to_utf8(lyx::char_type c)
259 {
260         static IconvProcessor processor("UTF-8", ucs4_codeset);
261         return iconv_convert<char>(processor, &c, 1);
262 }
263
264
265 std::vector<char>
266 ucs4_to_utf8(std::vector<lyx::char_type> const & ucs4str)
267 {
268         if (ucs4str.empty())
269                 return std::vector<char>();
270
271         return ucs4_to_utf8(&ucs4str[0], ucs4str.size());
272 }
273
274
275 std::vector<char>
276 ucs4_to_utf8(lyx::char_type const * ucs4str, size_t ls)
277 {
278         static IconvProcessor processor("UTF-8", ucs4_codeset);
279         return iconv_convert<char>(processor, ucs4str, ls);
280 }
281
282
283 std::vector<lyx::char_type>
284 eightbit_to_ucs4(char const * s, size_t ls, std::string const & encoding)
285 {
286         static std::map<std::string, IconvProcessor> processors;
287         if (processors.find(encoding) == processors.end()) {
288                 IconvProcessor processor(ucs4_codeset, encoding.c_str());
289                 processors.insert(std::make_pair(encoding, processor));
290         }
291         return iconv_convert<char_type>(processors[encoding], s, ls);
292 }
293
294
295 std::vector<char>
296 ucs4_to_eightbit(lyx::char_type const * ucs4str, size_t ls, std::string const & encoding)
297 {
298         static std::map<std::string, IconvProcessor> processors;
299         if (processors.find(encoding) == processors.end()) {
300                 IconvProcessor processor(encoding.c_str(), ucs4_codeset);
301                 processors.insert(std::make_pair(encoding, processor));
302         }
303         return iconv_convert<char>(processors[encoding], ucs4str, ls);
304 }
305
306 } // namespace lyx