]> git.lyx.org Git - lyx.git/blob - src/support/unicode.C
add docstring.h
[lyx.git] / src / support / unicode.C
1 /**
2  * \file unicode.C
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author Lars Gullik Bjønnes
7  *
8  * Full author contact details are available in file CREDITS.
9  *
10  * A collection of unicode conversion functions, using iconv.
11  */
12
13 #include <config.h>
14
15 #include "unicode.h"
16
17 #include "debug.h"
18
19 #include <cerrno>
20 #include <iomanip>
21 #include <string>
22
23 using std::endl;
24 using std::string;
25
26 namespace {
27
28 std::vector<char>
29 iconv_convert(std::string const & tocode, std::string const & fromcode,
30               std::vector<char> const & buf)
31 {
32         iconv_t cd = iconv_open(tocode.c_str(), fromcode.c_str());
33         if (cd == (iconv_t)(-1)) {
34                 lyxerr << "Error returned from iconv_open" << endl;
35                 switch (errno) {
36                 case EINVAL:
37                         lyxerr << "EINVAL The conversion from " << fromcode
38                                << " to " << tocode
39                                << " is not supported by the implementation."
40                                << endl;
41                         break;
42                 default:
43                         lyxerr << "\tSome other error: " << errno << endl;
44                         break;
45                 }
46         }
47
48         char * inbuf = const_cast<char *>(&buf[0]);
49         size_t inbytesleft = buf.size();
50         char out[1000] = { 0 };
51         char * outbuf = out;
52         size_t outbytesleft = 1000;
53
54         size_t res = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
55
56         if (res == (size_t)(-1)) {
57                 lyxerr << "Error returned from iconv" << endl;
58                 switch (errno) {
59                 case E2BIG:
60                         lyxerr << "E2BIG  There is not sufficient room at *outbuf." << endl;
61                         break;
62                 case EILSEQ:
63                         lyxerr << "EILSEQ An invalid multibyte sequence"
64                                << " has been encountered in the input.\n"
65                                << "When converting from " << fromcode
66                                << " to " << tocode << ".\n";
67                         lyxerr << "Input: " << std::hex;
68                         for (size_t i = 0; i < buf.size(); ++i) {
69                                 unsigned char const b = buf[i];
70                                 lyxerr << "0x" << int(b) << " ";
71                         }
72                         lyxerr << endl;
73                         break;
74                 case EINVAL:
75                         lyxerr << "EINVAL An incomplete multibyte sequence"
76                                << " has been encountered in the input.\n"
77                                << "When converting from " << fromcode
78                                << " to " << tocode << ".\n";
79                         lyxerr << "Input: " << std::hex;
80                         for (size_t i = 0; i < buf.size(); ++i) {
81                                 unsigned char const b = buf[i];
82                                 lyxerr << "0x" << int(b) << " ";
83                         }
84                         lyxerr << endl;
85                         break;
86                 default:
87                         lyxerr << "\tSome other error: " << errno << endl;
88                         break;
89                 }
90         }
91
92         if (iconv_close(cd) == -1) {
93                 lyxerr << "Error returned from iconv_close("
94                        << errno << ")" << endl;
95         }
96
97         //lyxerr << std::dec;
98         //lyxerr << "Inbytesleft: " << inbytesleft << endl;
99         //lyxerr << "Outbytesleft: " << outbytesleft << endl;
100         int bytes = 1000 - outbytesleft;
101
102         std::vector<char> outvec(out, out + bytes);
103         return outvec;
104 }
105
106
107 std::vector<boost::uint32_t> bytes_to_ucs4(std::vector<char> const & bytes)
108 {
109         //lyxerr << "Outbuf =" << std::hex;
110
111         std::vector<uint32_t> ucs4;
112         for (size_t i = 0; i < bytes.size(); i += 4) {
113                 unsigned char const b1 = bytes[i    ];
114                 unsigned char const b2 = bytes[i + 1];
115                 unsigned char const b3 = bytes[i + 2];
116                 unsigned char const b4 = bytes[i + 3];
117
118                 boost::uint32_t c;
119                 char * cc = reinterpret_cast<char *>(&c);
120                 cc[3] = b1;
121                 cc[2] = b2;
122                 cc[1] = b3;
123                 cc[0] = b4;
124
125                 if (c > 0xffff) {
126                         lyxerr << "Strange ucs4 value encountered\n";
127                         lyxerr << "0x"
128                                << std::setw(2) << std::setfill('0') << int(b1)
129                                << std::setw(2) << std::setfill('0') << int(b2)
130                                << std::setw(2) << std::setfill('0') << int(b3)
131                                << std::setw(2) << std::setfill('0') << int(b4)
132                                << ' '
133                                << "(0x"
134                                << c
135                                << ") ";
136                 }
137
138                 ucs4.push_back(c);
139         }
140         //lyxerr << endl;
141         return ucs4;
142 }
143
144
/**
 * Assemble 16 bit code units from a byte sequence.
 *
 * Each pair of bytes is copied verbatim into the storage of an
 * unsigned short, i.e. the pair is interpreted in the byte order of the
 * host.
 *
 * \param bytes  the raw bytes; a single trailing byte that does not form
 *               a complete pair is ignored.
 * \return one code unit per complete pair of bytes.
 */
std::vector<unsigned short> bytes_to_ucs2(std::vector<char> const & bytes)
{
	std::vector<unsigned short> ucs2;
	ucs2.reserve(bytes.size() / 2);

	// Only consume complete pairs; reading bytes[i + 1] for an odd sized
	// input would run past the end of the vector.
	for (size_t i = 0; i + 2 <= bytes.size(); i += 2) {
		unsigned short c = 0;
		char * cc = reinterpret_cast<char *>(&c);
		cc[0] = bytes[i];
		cc[1] = bytes[i + 1];

		ucs2.push_back(c);
	}
	return ucs2;
}
172
173 } // anon namespace
174
175
176 std::vector<boost::uint32_t> utf8_to_ucs4(std::vector<char> const & utf8str)
177 {
178         //lyxerr << "Buff = " << string(utf8str.begin(), utf8str.end())
179         //       << " (" << utf8str.size() << ")" << endl;
180         //lyxerr << "Res = " << string(res.begin(), res.end())
181         //       << " (" << res.size() << ")" << endl;
182
183         std::vector<char> res = iconv_convert("UCS-4", "UTF-8", utf8str);
184         return bytes_to_ucs4(res);
185 }
186
187
188 std::vector<boost::uint32_t>
189 ucs2_to_ucs4(std::vector<unsigned short> const & ucs2str)
190 {
191         // TODO: Simplify and speed up.
192         std::vector<char> in;
193         std::vector<unsigned short>::const_iterator cit = ucs2str.begin();
194         std::vector<unsigned short>::const_iterator end = ucs2str.end();
195         //lyxerr << std::hex;
196         for (; cit != end; ++cit) {
197                 unsigned short s = *cit;
198                 in.push_back(static_cast<char>(s & 0x00ff));
199                 in.push_back(static_cast<char>((s & 0xff00) >> 8));
200                 lyxerr << std::setw(2) << std::setfill('0') << (s & 0x00ff) << endl;
201                 lyxerr << std::setw(2) << std::setfill('0') << ((s & 0xff00) >> 8) << endl;
202         }
203
204         std::vector<char> res = iconv_convert("UCS-4", "UCS-2", in);
205         return bytes_to_ucs4(res);
206 }
207
208
209 std::vector<unsigned short>
210 ucs4_to_ucs2(std::vector<boost::uint32_t> const & ucs4str)
211 {
212         std::vector<char> in;
213         std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
214         std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
215         for (; cit != end; ++cit) {
216                 boost::uint32_t s = *cit;
217                 in.push_back(static_cast<char>((s & 0xff000000) >> 24));
218                 in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
219                 in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
220                 in.push_back(static_cast<char>(s & 0x000000ff));
221         }
222         std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
223         return bytes_to_ucs2(res);
224 }
225
226
227 std::vector<unsigned short>
228 ucs4_to_ucs2(boost::uint32_t const * s, size_t ls)
229 {
230         std::vector<char> in;
231         for (size_t i = 0; i < ls; ++i) {
232                 in.push_back(static_cast<char>((s[i] & 0xff000000) >> 24));
233                 in.push_back(static_cast<char>((s[i] & 0x00ff0000) >> 16));
234                 in.push_back(static_cast<char>((s[i] & 0x0000ff00) >> 8));
235                 in.push_back(static_cast<char>(s[i] & 0x000000ff));
236         }
237         std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
238         return bytes_to_ucs2(res);
239 }
240
241
242 unsigned short
243 ucs4_to_ucs2(boost::uint32_t c)
244 {
245         std::vector<char> in;
246         in.push_back(static_cast<char>((c & 0xff000000) >> 24));
247         in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
248         in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
249         in.push_back(static_cast<char>(c & 0x000000ff));
250         std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
251         std::vector<unsigned short> us = bytes_to_ucs2(res);
252         if (!us.empty())
253                 return us[0];
254         else
255                 return 0xfffd; // unknown character
256 }
257
258
259 std::vector<char> ucs4_to_utf8(std::vector<boost::uint32_t> const & ucs4str)
260 {
261         std::vector<char> in;
262         std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
263         std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
264         for (; cit != end; ++cit) {
265                 boost::uint32_t s = *cit;
266                 in.push_back(static_cast<char>((s & 0xff000000) >> 24));
267                 in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
268                 in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
269                 in.push_back(static_cast<char>(s & 0x000000ff));
270         }
271         std::vector<char> res = iconv_convert("UTF-8", "UCS-4", in);
272         return res;
273 }
274
275
276 std::vector<char> ucs4_to_utf8(boost::uint32_t c)
277 {
278         std::vector<char> in;
279         in.push_back(static_cast<char>((c & 0xff000000) >> 24));
280         in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
281         in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
282         in.push_back(static_cast<char>(c & 0x000000ff));
283         std::vector<char> res = iconv_convert("UTF-8", "UCS-4", in);
284         return res;
285 }