]> git.lyx.org Git - lyx.git/blob - src/encoding.C
use more unicode in math
[lyx.git] / src / encoding.C
1 /**
2  * \file encoding.C
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author Lars Gullik Bjønnes
7  * \author Jean-Marc Lasgouttes
8  * \author Dekel Tsur
9  *
10  * Full author contact details are available in file CREDITS.
11  */
12
13 #include <config.h>
14
15 #include "encoding.h"
16
17 #include "debug.h"
18 #include "lyxlex.h"
19 #include "lyxrc.h"
20
21
22 namespace lyx {
23
24 #ifndef CXX_GLOBAL_CSTD
25 using std::strtol;
26 #endif
27
28 using std::endl;
29 using std::string;
30
31
32 Encodings encodings;
33
34 namespace {
35
36 char_type tab_iso8859_1[256] = {
37    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
38    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
39    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
40    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
41    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
42    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
43    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
44    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
45    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
46    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
47    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
48    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
49    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
50    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
51    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
52    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
53 };
54
#ifdef USE_UNICODE_FOR_SYMBOLS
// Maps each byte of the "symbol" encoding (0x00..0xff) to a Unicode code
// point.  Only compiled when USE_UNICODE_FOR_SYMBOLS is defined; the
// Encodings constructor then uses it for the "symbol" encoding.
//
// 0xffff fills the slots that have no code point assigned here
// (presumably meaning "no mapping" -- confirm with the consumers of the
// table).
//
// In the 0x40..0x5f range the Greek capitals that look like Latin letters
// are mapped to the Latin letters; the pure Greek variant of these rows is
// kept below as a comment for reference.
char_type tab_symbol[256] = {
	// 0x00 - 0x1f
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,

	// 0x20 - 0x2f
	0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220b,
	0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,

	// 0x30 - 0x3f
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,

	// Pure Greek variant of 0x40 - 0x5f (not used):
//	0x2245, 0x0391, 0x0392, 0x03a7, 0x0394, 0x0395, 0x03a6, 0x0393,
//	0x0397, 0x0399, 0x03d1, 0x039a, 0x039b, 0x039c, 0x039d, 0x039f,

//	0x03a0, 0x0398, 0x03a1, 0x03a3, 0x03a4, 0x03a5, 0x03c2, 0x03a9,
//	0x039e, 0x03a8, 0x0396, 0x005b, 0x2234, 0x005d, 0x22a5, 0xffff,

	// 0x40 - 0x4f
	// Slots 0x46 and 0x47 are capital Phi (0x03a6) and capital Gamma
	// (0x0393), as in the pure Greek row above.  They have no Latin
	// look-alike and must not repeat the 0x56/0x57 entries
	// (0x03c2, 0x03a9).
	0x2245, 0x0041, 0x0042, 0x0058, 0x0394, 0x0045, 0x03a6, 0x0393,
	0x0048, 0x0049, 0x03d1, 0x004b, 0x039b, 0x004d, 0x004e, 0x004f,

	// 0x50 - 0x5f
	0x03a0, 0x0398, 0x0050, 0x03a3, 0x0054, 0x0059, 0x03c2, 0x03a9,
	0x039e, 0x03a8, 0x005a, 0x005b, 0x2234, 0x005d, 0x22a5, 0xffff,

	// 0x60 - 0x6f
	0xffff, 0x03b1, 0x03b2, 0x03c7, 0x03b4, 0x03b5, 0x03d5, 0x03b3,
	0x03b7, 0x03b9, 0x03c6, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03bf,

	// 0x70 - 0x7f
	0x03c0, 0x03b8, 0x03c1, 0x03c3, 0x03c4, 0x03c5, 0x03d6, 0x03c9,
	0x03be, 0x03c8, 0x03b6, 0x007b, 0x007c, 0x007d, 0x007e, 0xffff,

	// 0x80 - 0x8f
	0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
	0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,

	// 0x90 - 0x9f
	0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
	0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,

	// 0xa0 - 0xaf
	0xffff, 0x03d2, 0x2032, 0x2264, 0x2215, 0x221e, 0xffff, 0x2663,
	0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,

	// 0xb0 - 0xbf
	0x00b0, 0x00b1, 0x2033, 0x2265, 0x00d7, 0x221d, 0x2202, 0x2022,
	0x00f7, 0x2260, 0x2261, 0x2248, 0x22ef, 0xffff, 0xffff, 0x21b5,

	// 0xc0 - 0xcf
	0x2135, 0x2111, 0x211c, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
	0x222a, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,

	// 0xd0 - 0xdf
	0x2220, 0x2207, 0x00ae, 0x00a9, 0x2122, 0x220f, 0x221a, 0x22c5,
	0x00ac, 0x2227, 0x2228, 0x21d4, 0x21d0, 0x21d1, 0x21d2, 0x21d3,

	// 0xe0 - 0xef
	0x2662, 0x2329, 0x00ae, 0x00a9, 0x2122, 0x2211, 0xffff, 0xffff,
	0xffff, 0x2308, 0xffff, 0x230a, 0xffff, 0xffff, 0xffff, 0xffff,

	// 0xf0 - 0xff
	0xffff, 0x232a, 0x222b, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
	0xffff, 0x2309, 0xffff, 0x230b, 0xffff, 0xffff, 0xffff, 0xffff
};
#endif
107
108
109 char_type arabic_table2[63][4] = {
110         {0x41, 0x41, 0x41, 0x41}, // 0xc1 = hamza
111         {0x42, 0xa1, 0x42, 0xa1}, // 0xc2 = ligature madda on alef
112         {0x43, 0xa2, 0x43, 0xa2}, // 0xc3 = ligature hamza on alef
113         {0x44, 0xa3, 0x44, 0xa3}, // 0xc4 = ligature hamza on waw
114         {0x45, 0xa4, 0x45, 0xa4}, // 0xc5 = ligature hamza under alef
115         {0x46, 0xf9, 0xf8, 0xa0}, // 0xc6 = ligature hamza on ya
116         {0x47, 0xa5, 0x47, 0xa5}, // 0xc7 = alef
117         {0x48, 0xae, 0xac, 0xad}, // 0xc8 = baa
118         {0x49, 0xb1, 0x49, 0xb1}, // 0xc9 = taa marbuta
119         {0x4a, 0xb4, 0xb2, 0xb3}, // 0xca = taa
120         {0x4b, 0xb7, 0xb5, 0xb6}, // 0xcb = thaa
121         {0x4c, 0xba, 0xb8, 0xb9}, // 0xcc = jeem
122         {0x4d, 0xbd, 0xbb, 0xbc}, // 0xcd = haa
123         {0x4e, 0xc0, 0xbe, 0xbf}, // 0xce = khaa
124         {0x4f, 0xa6, 0x4f, 0xa6}, // 0xcf = dal
125
126         {0x50, 0xa7, 0x50, 0xa7}, // 0xd0 = thal
127         {0x51, 0xa8, 0x51, 0xa8}, // 0xd1 = ra
128         {0x52, 0xa9, 0x52, 0xa9}, // 0xd2 = zain
129         {0x53, 0xc3, 0xc1, 0xc2}, // 0xd3 = seen
130         {0x54, 0xc6, 0xc4, 0xc5}, // 0xd4 = sheen
131         {0x55, 0xc9, 0xc7, 0xc8}, // 0xd5 = sad
132         {0x56, 0xcc, 0xca, 0xcb}, // 0xd6 = dad
133         {0x57, 0xcf, 0xcd, 0xce}, // 0xd7 = tah
134         {0x58, 0xd2, 0xd0, 0xd1}, // 0xd8 = zah
135         {0x59, 0xd5, 0xd3, 0xd4}, // 0xd9 = ain
136         {0x5a, 0xd8, 0xd6, 0xd7}, // 0xda = ghain
137         {0,0,0,0}, // 0xdb
138         {0,0,0,0}, // 0xdc
139         {0,0,0,0}, // 0xdd
140         {0,0,0,0}, // 0xde
141         {0,0,0,0}, // 0xdf
142
143         {0,0,0,0}, // 0xe0
144         {0x61, 0xdb, 0xd9, 0xda}, // 0xe1 = fa
145         {0x62, 0xde, 0xdc, 0xdd}, // 0xe2 = qaf
146         {0x63, 0xe1, 0xdf, 0xe0}, // 0xe3 = kaf
147         {0x64, 0xe4, 0xe2, 0xe3}, // 0xe4 = lam
148         {0x65, 0xe7, 0xe5, 0xe6}, // 0xe5 = meem
149         {0x66, 0xea, 0xe8, 0xe9}, // 0xe6 = noon
150         {0x67, 0xed, 0xeb, 0xec}, // 0xe7 = ha
151         {0x68, 0xaa, 0x68, 0xaa}, // 0xe8 = waw
152         {0x69, 0xab, 0x69, 0xab}, // 0xe9 = alef maksura
153         {0x6a, 0xf0, 0xee, 0xef}, // 0xea = ya
154         {0x6b, 0x6b, 0x6b, 0x6b}, // 0xeb = fathatan
155         {0x6c, 0x6c, 0x6c, 0x6c}, // 0xec = dammatan
156         {0x6d, 0x6d, 0x6d, 0x6d}, // 0xed = kasratan
157         {0x6e, 0x6e, 0x6e, 0x6e}, // 0xee = fatha
158         {0x6f, 0x6f, 0x6f, 0x6f}, // 0xef = damma
159
160         {0x70, 0x70, 0x70, 0x70}, // 0xf0 = kasra
161         {0x71, 0x71, 0x71, 0x71}, // 0xf1 = shadda
162         {0x72, 0x72, 0x72, 0x72}, // 0xf2 = sukun
163         {0,0,0,0}, // 0xf3
164         {0,0,0,0}, // 0xf4
165         {0,0,0,0}, // 0xf5
166         {0,0,0,0}, // 0xf6
167         {0,0,0,0}, // 0xf7
168         {0,0,0,0}, // 0xf8
169         {0,0,0,0}, // 0xf9
170         {0,0,0,0}, // 0xfa
171         {0,0,0,0}, // 0xfb
172         {0,0,0,0}, // 0xfc
173         {0,0,0,0}, // 0xfd
174         {0,0,0,0}, // 0xfe
175         {0,0,0,0}, // 0xff
176 };
177
178
179 char_type arabic_table[63][2] = {
180         {0xc1, 0xc1}, // 0xc1 = hamza
181         {0xc2, 0xc2}, // 0xc2 = ligature madda on alef
182         {0xc3, 0xc3}, // 0xc3 = ligature hamza on alef
183         {0xc4, 0xc4}, // 0xc4 = ligature hamza on waw
184         {0xc5, 0xc5}, // 0xc5 = ligature hamza under alef
185         {0xc6, 0xc0}, // 0xc6 = ligature hamza on ya
186         {0xc7, 0xc7}, // 0xc7 = alef
187         {0xc8, 0xeb}, // 0xc8 = baa
188         {0xc9, 0xc9}, // 0xc9 = taa marbuta
189         {0xca, 0xec}, // 0xca = taa
190         {0xcb, 0xed}, // 0xcb = thaa
191         {0xcc, 0xee}, // 0xcc = jeem
192         {0xcd, 0xef}, // 0xcd = haa
193         {0xce, 0xf0}, // 0xce = khaa
194         {0xcf, 0xcf}, // 0xcf = dal
195
196         {0xd0, 0xd0}, // 0xd0 = thal
197         {0xd1, 0xd1}, // 0xd1 = ra
198         {0xd2, 0xd2}, // 0xd2 = zain
199         {0xd3, 0xf1}, // 0xd3 = seen
200         {0xd4, 0xf2}, // 0xd4 = sheen
201         {0xd5, 0xf3}, // 0xd5 = sad
202         {0xd6, 0xf4}, // 0xd6 = dad
203         {0xd7, 0xd7}, // 0xd7 = tah
204         {0xd8, 0xd8}, // 0xd8 = zah
205         {0xd9, 0xf5}, // 0xd9 = ain
206         {0xda, 0xf6}, // 0xda = ghain
207         {0,0}, // 0xdb
208         {0,0}, // 0xdc
209         {0,0}, // 0xdd
210         {0,0}, // 0xde
211         {0,0}, // 0xdf
212
213         {0,0},  // 0xe0
214         {0xe1, 0xf7},   // 0xe1 = fa
215         {0xe2, 0xf8},   // 0xe2 = qaf
216         {0xe3, 0xf9},   // 0xe3 = kaf
217         {0xe4, 0xfa},   // 0xe4 = lam
218         {0xe5, 0xfb},   // 0xe5 = meem
219         {0xe6, 0xfc},   // 0xe6 = noon
220         {0xe7, 0xfd},   // 0xe7 = ha
221         {0xe8, 0xe8},   // 0xe8 = waw
222         {0xe9, 0xe9},   // 0xe9 = alef maksura
223         {0xea, 0xfe},   // 0xea = ya
224         {0xa8, 0xa8},   // 0xeb = fathatan
225         {0xa9, 0xa9},   // 0xec = dammatan
226         {0xaa, 0xaa},   // 0xed = kasratan
227         {0xab, 0xab},   // 0xee = fatha
228         {0xac, 0xac},   // 0xef = damma
229
230         {0xad, 0xad},   // 0xf0 = kasra
231         {0xae, 0xae},   // 0xf1 = shadda
232         {0xaf, 0xaf},   // 0xf2 = sukun
233         {0,0}, // 0xf3
234         {0,0}, // 0xf4
235         {0,0}, // 0xf5
236         {0,0}, // 0xf6
237         {0,0}, // 0xf7
238         {0,0}, // 0xf8
239         {0,0}, // 0xf9
240         {0,0}, // 0xfa
241         {0,0}, // 0xfb
242         {0,0}, // 0xfc
243         {0,0}, // 0xfd
244         {0,0}, // 0xfe
245         {0,0} // 0xff
246 };
247
248
249 char_type const arabic_start = 0xc1;
250
251 } // namespace anon
252
253
254
255 char_type Encoding::ucs(char_type c) const
256 {
257         BOOST_ASSERT(c < 256);
258         return encoding_table[c];
259 }
260
261
262 bool Encodings::isComposeChar_hebrew(char_type c)
263 {
264         return c <= 0xd2 && c >= 0xc0 &&
265                 c != 0xce && c != 0xd0;
266 }
267
268
269 // Special Arabic letters are ones that do not get connected from left
270 // they are hamza, alef_madda, alef_hamza, waw_hamza, alef_hamza_under,
271 // alef, tah_marbota, dal, thal, rah, zai, wow, alef_maksoura
272
273 bool Encodings::is_arabic_special(char_type c)
274 {
275         return  (c >= 0xc1 && c <= 0xc5) ||
276                  c == 0xc7 || c  == 0xc9  ||
277                  c == 0xcf || c  == 0xe8  ||
278                 (c >= 0xd0 && c <= 0xd2) ||
279                  c == 0xe9;
280 }
281
282 bool Encodings::isComposeChar_arabic(char_type c)
283 {
284         return c >= 0xeb && c <= 0xf2;
285 }
286
287
288 bool Encodings::is_arabic(char_type c)
289 {
290         return c >= arabic_start && arabic_table[c-arabic_start][0];
291 }
292
293
294 char_type Encodings::transformChar(char_type c,
295                                       Encodings::Letter_Form form)
296 {
297         if (!is_arabic(c))
298                 return c;
299
300         if (lyxrc.font_norm_type == LyXRC::ISO_10646_1)
301                 return arabic_table2[c-arabic_start][form];
302         else
303                 return arabic_table[c-arabic_start][form >> 1];
304 }
305
306
307 Encoding const * Encodings::getEncoding(string const & encoding) const
308 {
309         EncodingList::const_iterator it = encodinglist.find(encoding);
310         if (it != encodinglist.end())
311                 return &it->second;
312         else
313                 return 0;
314 }
315
316 Encodings::Encodings()
317 {
318         encodinglist["iso8859-1"] = Encoding("iso8859-1", "latin1", tab_iso8859_1);
319         symbol_encoding_ =
320 #ifdef USE_UNICODE_FOR_SYMBOLS
321                 Encoding("symbol", "", tab_symbol);
322 #else
323                 Encoding("symbol", "", tab_iso8859_1);
324 #endif
325 }
326
327 void Encodings::read(string const & filename)
328 {
329         enum Encodingtags {
330                 et_encoding = 1,
331                 et_end,
332                 et_last
333         };
334
335         struct keyword_item encodingtags[et_last - 1] = {
336                 { "encoding", et_encoding },
337                 { "end", et_end }
338         };
339
340         LyXLex lex(encodingtags, et_last - 1);
341         lex.setFile(filename);
342         while (lex.isOK()) {
343                 switch (lex.lex()) {
344                 case et_encoding:
345                 {
346                         lex.next();
347                         string const name = lex.getString();
348                         lex.next();
349                         string const latexname = lex.getString();
350                         lyxerr[Debug::INFO] << "Reading encoding " << name << endl;
351                         char_type table[256];
352                         for (unsigned int i = 0; i < 256; ++i) {
353                                 lex.next();
354                                 string const tmp = lex.getString();
355                                 table[i] = ::strtol(tmp.c_str(), 0 , 16);
356                         }
357                         encodinglist[name] = Encoding(name, latexname, table);
358                         if (lex.lex() != et_end)
359                                 lex.printError("Encodings::read: "
360                                                "missing end");
361                         break;
362                 }
363                 case et_end:
364                         lex.printError("Encodings::read: Misplaced end");
365                         break;
366                 case LyXLex::LEX_FEOF:
367                         break;
368                 default:
369                         lex.printError("Encodings::read: "
370                                        "Unknown tag: `$$Token'");
371                         break;
372                 }
373         }
374 }
375
376
377 } // namespace lyx