]> git.lyx.org Git - lyx.git/blob - src/support/unicode.C
Cleanup and speedup some of the conversion functions a bit.
[lyx.git] / src / support / unicode.C
1 /**
2  * \file unicode.C
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author Lars Gullik Bjønnes
7  *
8  * Full author contact details are available in file CREDITS.
9  *
10  * A collection of unicode conversion functions, using iconv.
11  */
12
#include <config.h>

#include "unicode.h"

#include "debug.h"

#include <iconv.h>

#include <cerrno>
#include <cstring>
#include <iomanip>
#include <string>
#include <vector>
24
25 using std::endl;
26 using std::string;
27
28 namespace {
29
30 #ifdef WORDS_BIGENDIAN
31         char const * ucs4_codeset = "UCS-4BE";
32         char const * ucs2_codeset = "UCS-2BE";
33 #else
34         char const * ucs4_codeset = "UCS-4LE";
35         char const * ucs2_codeset = "UCS-2LE";
36 #endif
37
38 std::vector<char>
39 iconv_convert(std::string const & tocode, std::string const & fromcode,
40               std::vector<char> const & buf)
41 {
42         if (buf.empty())
43                 return std::vector<char>();
44
45         iconv_t cd = iconv_open(tocode.c_str(), fromcode.c_str());
46         if (cd == (iconv_t)(-1)) {
47                 lyxerr << "Error returned from iconv_open" << endl;
48                 switch (errno) {
49                 case EINVAL:
50                         lyxerr << "EINVAL The conversion from " << fromcode
51                                << " to " << tocode
52                                << " is not supported by the implementation."
53                                << endl;
54                         break;
55                 default:
56                         lyxerr << "\tSome other error: " << errno << endl;
57                         break;
58                 }
59         }
60
61         char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(&buf[0]);
62         size_t inbytesleft = buf.size();
63         static char out[1000];
64         char * outbuf = out;
65         size_t outbytesleft = 1000;
66
67         size_t res = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
68
69         if (res == (size_t)(-1)) {
70                 lyxerr << "Error returned from iconv" << endl;
71                 switch (errno) {
72                 case E2BIG:
73                         lyxerr << "E2BIG  There is not sufficient room at *outbuf." << endl;
74                         break;
75                 case EILSEQ:
76                         lyxerr << "EILSEQ An invalid multibyte sequence"
77                                << " has been encountered in the input.\n"
78                                << "When converting from " << fromcode
79                                << " to " << tocode << ".\n";
80                         lyxerr << "Input: " << std::hex;
81                         for (size_t i = 0; i < buf.size(); ++i) {
82                                 unsigned char const b = buf[i];
83                                 lyxerr << "0x" << int(b) << " ";
84                         }
85                         lyxerr << endl;
86                         break;
87                 case EINVAL:
88                         lyxerr << "EINVAL An incomplete multibyte sequence"
89                                << " has been encountered in the input.\n"
90                                << "When converting from " << fromcode
91                                << " to " << tocode << ".\n";
92                         lyxerr << "Input: " << std::hex;
93                         for (size_t i = 0; i < buf.size(); ++i) {
94                                 unsigned char const b = buf[i];
95                                 lyxerr << "0x" << int(b) << " ";
96                         }
97                         lyxerr << endl;
98                         break;
99                 default:
100                         lyxerr << "\tSome other error: " << errno << endl;
101                         break;
102                 }
103         }
104
105         if (iconv_close(cd) == -1) {
106                 lyxerr << "Error returned from iconv_close("
107                        << errno << ")" << endl;
108         }
109
110         //lyxerr << std::dec;
111         //lyxerr << "Inbytesleft: " << inbytesleft << endl;
112         //lyxerr << "Outbytesleft: " << outbytesleft << endl;
113         int bytes = 1000 - outbytesleft;
114
115         std::vector<char> outvec(out, out + bytes);
116         return outvec;
117 }
118
119
120 std::vector<boost::uint32_t> bytes_to_ucs4(std::vector<char> const & bytes)
121 {
122         boost::uint32_t const * tmp = reinterpret_cast<uint32_t const *>(&bytes[0]);
123         return std::vector<boost::uint32_t>(tmp, tmp + bytes.size() / 4);
124 }
125
126
/**
 * Reassemble a byte buffer into 16-bit code units in host byte order.
 *
 * \param bytes raw bytes, e.g. the output of a conversion to
 *              ucs2_codeset.
 * \return bytes.size() / sizeof(unsigned short) code units; trailing
 *         bytes that do not form a complete unit are dropped. An empty
 *         input gives an empty result.
 *
 * The bytes are copied with memcpy instead of being read through a
 * casted pointer, which avoids alignment and aliasing problems and
 * never touches element 0 of an empty vector.
 */
std::vector<unsigned short> bytes_to_ucs2(std::vector<char> const & bytes)
{
        std::vector<unsigned short> result(bytes.size() / sizeof(unsigned short));
        if (!result.empty())
                std::memcpy(&result[0], &bytes[0],
                            result.size() * sizeof(unsigned short));
        return result;
}
132
133 } // anon namespace
134
135
136 std::vector<boost::uint32_t> utf8_to_ucs4(std::vector<char> const & utf8str)
137 {
138         //lyxerr << "Buff = " << string(utf8str.begin(), utf8str.end())
139         //       << " (" << utf8str.size() << ")" << endl;
140         //lyxerr << "Res = " << string(res.begin(), res.end())
141         //       << " (" << res.size() << ")" << endl;
142
143         std::vector<char> res = iconv_convert(ucs4_codeset, "UTF-8", utf8str);
144         return bytes_to_ucs4(res);
145 }
146
147
148 std::vector<boost::uint32_t>
149 ucs2_to_ucs4(std::vector<unsigned short> const & ucs2str)
150 {
151         char const * tin = reinterpret_cast<char const *>(&ucs2str[0]);
152         std::vector<char> in(tin, tin + ucs2str.size() * 2);
153         std::vector<char> res = iconv_convert(ucs4_codeset, ucs2_codeset, in);
154         return bytes_to_ucs4(res);
155 }
156
157
158 std::vector<unsigned short>
159 ucs4_to_ucs2(std::vector<boost::uint32_t> const & ucs4str)
160 {
161         char const * tin = reinterpret_cast<char const *>(&ucs4str[0]);
162         std::vector<char> in(tin, tin + ucs4str.size() * 4);
163         std::vector<char> res = iconv_convert(ucs2_codeset, ucs4_codeset, in);
164         return bytes_to_ucs2(res);
165 }
166
167
168 std::vector<unsigned short>
169 ucs4_to_ucs2(boost::uint32_t const * s, size_t ls)
170 {
171         char const * tin = reinterpret_cast<char const *>(s);
172         std::vector<char> in(tin, tin + ls * 4);
173         std::vector<char> res = iconv_convert(ucs2_codeset, ucs4_codeset, in);
174         return bytes_to_ucs2(res);
175 }
176
177
178 unsigned short
179 ucs4_to_ucs2(boost::uint32_t c)
180 {
181         char const * tin = reinterpret_cast<char const *>(&c);
182         std::vector<char> in(tin, tin + 4);
183         std::vector<char> res = iconv_convert(ucs2_codeset, ucs4_codeset, in);
184         return bytes_to_ucs2(res)[0];
185 }
186
187
188 std::vector<char> ucs4_to_utf8(std::vector<boost::uint32_t> const & ucs4str)
189 {
190         char const * tin = reinterpret_cast<char const *>(&ucs4str[0]);
191         std::vector<char> in(tin, tin + ucs4str.size() * 4);
192         std::vector<char> res = iconv_convert("UTF-8", ucs4_codeset, in);
193         return res;
194 }
195
196
197 std::vector<char> ucs4_to_utf8(boost::uint32_t c)
198 {
199         char const * tin = reinterpret_cast<char const *>(&c);
200         std::vector<char> in(tin, tin + 4);
201         std::vector<char> res = iconv_convert("UTF-8", ucs4_codeset, in);
202         return res;
203 }