]> git.lyx.org Git - lyx.git/blob - src/support/unicode.C
* LyXLex::Pimpl::buff is now a string.
[lyx.git] / src / support / unicode.C
1 /**
2  * \file unicode.C
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author Lars Gullik Bjønnes
7  *
8  * Full author contact details are available in file CREDITS.
9  *
10  * A collection of unicode conversion functions, using iconv.
11  */
12
13 #include <config.h>
14
15 #include "unicode.h"
16
17 #include "debug.h"
18
19 #include <iconv.h>
20
21 #include <cerrno>
22 #include <iomanip>
23 #include <map>
24
25
26 namespace lyx {
27
28 using std::endl;
29
// Names of the iconv codesets for UCS-4 and UCS-2 with an explicit byte
// order. WORDS_BIGENDIAN selects the big-endian ("BE") variants; without
// it the little-endian ("LE") variants are used.
#ifndef WORDS_BIGENDIAN
char const * ucs4_codeset = "UCS-4LE";
char const * ucs2_codeset = "UCS-2LE";
#else
char const * ucs4_codeset = "UCS-4BE";
char const * ucs2_codeset = "UCS-2BE";
#endif
37
38 int iconv_convert(int & cd,
39               char const * tocode,
40               char const * fromcode,
41               char const * buf,
42               size_t buflen,
43                   char * outbuf,
44                   size_t maxoutsize)
45 {
46         if (buflen == 0)
47                 return 0;
48
49         if (cd == -1) {
50                 cd = (int)(iconv_open(tocode, fromcode));
51                 if (cd == -1) {
52                         lyxerr << "Error returned from iconv_open" << endl;
53                         switch (errno) {
54                         case EINVAL:
55                                 lyxerr << "EINVAL The conversion from " << fromcode
56                                        << " to " << tocode
57                                        << " is not supported by the implementation."
58                                        << endl;
59                                 break;
60                         default:
61                                 lyxerr << "\tSome other error: " << errno << endl;
62                                 break;
63                         }
64                 }
65         }
66
67         char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(buf);
68         size_t inbytesleft = buflen;
69         size_t outbytesleft = maxoutsize;
70
71         int res = iconv((iconv_t)(cd), &inbuf, &inbytesleft, &outbuf, &outbytesleft);
72
73         if (res == -1) {
74                 lyxerr << "Error returned from iconv" << endl;
75                 switch (errno) {
76                 case E2BIG:
77                         lyxerr << "E2BIG  There is not sufficient room at *outbuf." << endl;
78                         break;
79                 case EILSEQ:
80                         lyxerr << "EILSEQ An invalid multibyte sequence"
81                                << " has been encountered in the input.\n"
82                                << "When converting from " << fromcode
83                                << " to " << tocode << ".\n";
84                         lyxerr << "Input: " << std::hex;
85                         for (size_t i = 0; i < buflen; ++i) {
86                                 boost::uint32_t const b = buf[i];
87                                 lyxerr << "0x" << b << " ";
88                         }
89                         lyxerr << endl;
90                         break;
91                 case EINVAL:
92                         lyxerr << "EINVAL An incomplete multibyte sequence"
93                                << " has been encountered in the input.\n"
94                                << "When converting from " << fromcode
95                                << " to " << tocode << ".\n";
96                         lyxerr << "Input: " << std::hex;
97                         for (size_t i = 0; i < buflen; ++i) {
98                                 boost::uint32_t const b = buf[i];
99                                 lyxerr << "0x" << b << " ";
100                         }
101                         lyxerr << endl;
102                         break;
103                 default:
104                         lyxerr << "\tSome other error: " << errno << endl;
105                         break;
106                 }
107                 // We got an error so we close down the conversion engine
108                 if (iconv_close((iconv_t)(cd)) == -1) {
109                         lyxerr << "Error returned from iconv_close("
110                                << errno << ")" << endl;
111                 }
112                 cd = -1;
113         }
114
115         //lyxerr << std::dec;
116         //lyxerr << "Inbytesleft: " << inbytesleft << endl;
117         //lyxerr << "Outbytesleft: " << outbytesleft << endl;
118
119         return maxoutsize - outbytesleft;
120 }
121
122
123 namespace {
124
125
126 template<typename RetType, typename InType>
127 std::vector<RetType>
128 iconv_convert(int & cd,
129               char const * tocode,
130               char const * fromcode,
131               InType const * buf,
132               size_t buflen)
133 {
134         if (buflen == 0)
135                 return std::vector<RetType>();
136
137         char const * inbuf = reinterpret_cast<char const *>(buf);
138         size_t inbytesleft = buflen * sizeof(InType);
139
140         size_t const outsize = 32768;
141         static char out[outsize];
142         char * outbuf = out;
143
144         int bytes = lyx::iconv_convert(cd, tocode, fromcode, inbuf, inbytesleft, outbuf, outsize);
145
146         RetType const * tmp = reinterpret_cast<RetType const *>(out);
147         return std::vector<RetType>(tmp, tmp + bytes / sizeof(RetType));
148 }
149
150 } // anon namespace
151
152
153 std::vector<lyx::char_type> utf8_to_ucs4(std::vector<char> const & utf8str)
154 {
155         if (utf8str.empty())
156                 return std::vector<lyx::char_type>();
157
158         return utf8_to_ucs4(&utf8str[0], utf8str.size());
159 }
160
161
162 std::vector<lyx::char_type>
163 utf8_to_ucs4(char const * utf8str, size_t ls)
164 {
165         static int cd = -1;
166         return iconv_convert<lyx::char_type>(cd, ucs4_codeset, "UTF-8",
167                                               utf8str, ls);
168 }
169
170
171 lyx::char_type
172 ucs2_to_ucs4(unsigned short c)
173 {
174         return ucs2_to_ucs4(&c, 1)[0];
175 }
176
177
178 std::vector<lyx::char_type>
179 ucs2_to_ucs4(std::vector<unsigned short> const & ucs2str)
180 {
181         if (ucs2str.empty())
182                 return std::vector<lyx::char_type>();
183
184         return ucs2_to_ucs4(&ucs2str[0], ucs2str.size());
185 }
186
187
188 std::vector<lyx::char_type>
189 ucs2_to_ucs4(unsigned short const * ucs2str, size_t ls)
190 {
191         static int cd = -1;
192         return iconv_convert<lyx::char_type>(cd, ucs4_codeset, ucs2_codeset,
193                                               ucs2str, ls);
194 }
195
196
197 unsigned short
198 ucs4_to_ucs2(lyx::char_type c)
199 {
200         return ucs4_to_ucs2(&c, 1)[0];
201 }
202
203
204 std::vector<unsigned short>
205 ucs4_to_ucs2(std::vector<lyx::char_type> const & ucs4str)
206 {
207         if (ucs4str.empty())
208                 return std::vector<unsigned short>();
209
210         return ucs4_to_ucs2(&ucs4str[0], ucs4str.size());
211 }
212
213
214 std::vector<unsigned short>
215 ucs4_to_ucs2(lyx::char_type const * s, size_t ls)
216 {
217         static int cd = -1;
218         return iconv_convert<unsigned short>(cd, ucs2_codeset, ucs4_codeset,
219                                              s, ls);
220 }
221
222
223 std::vector<char>
224 ucs4_to_utf8(lyx::char_type c)
225 {
226         static int cd = -1;
227         return iconv_convert<char>(cd, "UTF-8", ucs4_codeset, &c, 1);
228 }
229
230
231 std::vector<char>
232 ucs4_to_utf8(std::vector<lyx::char_type> const & ucs4str)
233 {
234         if (ucs4str.empty())
235                 return std::vector<char>();
236
237         return ucs4_to_utf8(&ucs4str[0], ucs4str.size());
238 }
239
240
241 std::vector<char>
242 ucs4_to_utf8(lyx::char_type const * ucs4str, size_t ls)
243 {
244         static int cd = -1;
245         return iconv_convert<char>(cd, "UTF-8", ucs4_codeset,
246                                    ucs4str, ls);
247 }
248
249
250 std::vector<lyx::char_type>
251 eightbit_to_ucs4(char const * s, size_t ls, std::string const & encoding)
252 {
253         static std::map<std::string, int> cd;
254         if (cd.find(encoding) == cd.end())
255                 cd[encoding] = -1;
256         return iconv_convert<char_type>(cd[encoding], ucs4_codeset,
257                                         encoding.c_str(), s, ls);
258 }
259
260
261 std::vector<char>
262 ucs4_to_eightbit(lyx::char_type const * ucs4str, size_t ls, std::string const & encoding)
263 {
264         static std::map<std::string, int> cd;
265         if (cd.find(encoding) == cd.end())
266                 cd[encoding] = -1;
267         return iconv_convert<char>(cd[encoding], encoding.c_str(),
268                                    ucs4_codeset, ucs4str, ls);
269 }
270
271 } // namespace lyx