]> git.lyx.org Git - lyx.git/blob - src/encoding.C
9ff1b3cfeef0ad7c5de503fcb3079dc989ff7efa
[lyx.git] / src / encoding.C
1 /**
2  * \file encoding.C
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author Lars Gullik Bjønnes
7  * \author Jean-Marc Lasgouttes
8  * \author Dekel Tsur
9  *
10  * Full author contact details are available in file CREDITS.
11  */
12
13 #include <config.h>
14
15 #include "encoding.h"
16
17 #include "debug.h"
18 #include "lyxlex.h"
19 #include "lyxrc.h"
20
21
22 namespace lyx {
23
24 #ifndef CXX_GLOBAL_CSTD
25 using std::strtol;
26 #endif
27
28 using std::endl;
29 using std::string;
30
31
// Namespace-scope Encodings object defined by this file; its constructor
// (further down) registers "iso8859-1" and sets up the symbol encoding.
32 Encodings encodings;
33
34 namespace {
35
// Identity table: entry i holds the value i for every i in 0x00..0xff.
// The Encodings constructor uses it for the "iso8859-1" encoding and,
// when USE_UNICODE_FOR_SYMBOLS is not defined, for the "symbol" encoding.
36 Uchar tab_iso8859_1[256] = {
37    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
38    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
39    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
40    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
41    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
42    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
43    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
44    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
45    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
46    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
47    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
48    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
49    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
50    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
51    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
52    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
53 };
54
#ifdef USE_UNICODE_FOR_SYMBOLS
// Table for the "symbol" encoding: entry i is the 16-bit code that byte i
// of the symbol font is mapped to. It is handed to the Encoding object
// built in the Encodings constructor.
// 0xffff fills the slots that have no mapping in this table
// (NOTE(review): presumably "no Unicode equivalent" -- confirm with users
// of Encoding).
Uchar tab_symbol[256] = {
   // 0x00 - 0x1f: passed through unchanged
   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,

   // 0x20 - 0x2f
   0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220b,
   0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,

   // 0x30 - 0x3f
   0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,

   // 0x40 - 0x5f, reference rows using the Greek capital code points
   // throughout (kept disabled). The active rows below replace the
   // capitals that look like Latin letters by the Latin code points.
//   0x2245, 0x0391, 0x0392, 0x03a7, 0x0394, 0x0395, 0x03a6, 0x0393,
//   0x0397, 0x0399, 0x03d1, 0x039a, 0x039b, 0x039c, 0x039d, 0x039f,

//   0x03a0, 0x0398, 0x03a1, 0x03a3, 0x03a4, 0x03a5, 0x03c2, 0x03a9,
//   0x039e, 0x03a8, 0x0396, 0x005b, 0x2234, 0x005d, 0x22a5, 0xffff,

   // 0x40 - 0x4f. Slots 0x46/0x47 are Phi (0x03a6) and Gamma (0x0393),
   // as in the reference row; they previously repeated the 0x56/0x57
   // values (0x03c2, 0x03a9) by mistake.
   0x2245, 0x0041, 0x0042, 0x0058, 0x0394, 0x0045, 0x03a6, 0x0393,
   0x0048, 0x0049, 0x03d1, 0x004b, 0x039b, 0x004d, 0x004e, 0x004f,

   // 0x50 - 0x5f
   0x03a0, 0x0398, 0x0050, 0x03a3, 0x0054, 0x0059, 0x03c2, 0x03a9,
   0x039e, 0x03a8, 0x005a, 0x005b, 0x2234, 0x005d, 0x22a5, 0xffff,

   // 0x60 - 0x6f
   0xffff, 0x03b1, 0x03b2, 0x03c7, 0x03b4, 0x03b5, 0x03d5, 0x03b3,
   0x03b7, 0x03b9, 0x03c6, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03bf,

   // 0x70 - 0x7f
   0x03c0, 0x03b8, 0x03c1, 0x03c3, 0x03c4, 0x03c5, 0x03d6, 0x03c9,
   0x03be, 0x03c8, 0x03b6, 0x007b, 0x007c, 0x007d, 0x007e, 0xffff,

   // 0x80 - 0x8f
   0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
   0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,

   // 0x90 - 0x9f
   0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
   0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,

   // 0xa0 - 0xaf
   0xffff, 0x03d2, 0x2032, 0x2264, 0x2215, 0x221e, 0xffff, 0x2663,
   0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,

   // 0xb0 - 0xbf
   0x00b0, 0x00b1, 0x2033, 0x2265, 0x00d7, 0x221d, 0x2202, 0x2022,
   0x00f7, 0x2260, 0x2261, 0x2248, 0x22ef, 0xffff, 0xffff, 0x21b5,

   // 0xc0 - 0xcf
   0x2135, 0x2111, 0x211c, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
   0x222a, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,

   // 0xd0 - 0xdf
   0x2220, 0x2207, 0x00ae, 0x00a9, 0x2122, 0x220f, 0x221a, 0x22c5,
   0x00ac, 0x2227, 0x2228, 0x21d4, 0x21d0, 0x21d1, 0x21d2, 0x21d3,

   // 0xe0 - 0xef
   0x2662, 0x2329, 0x00ae, 0x00a9, 0x2122, 0x2211, 0xffff, 0xffff,
   0xffff, 0x2308, 0xffff, 0x230a, 0xffff, 0xffff, 0xffff, 0xffff,

   // 0xf0 - 0xff
   0xffff, 0x232a, 0x222b, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
   0xffff, 0x2309, 0xffff, 0x230b, 0xffff, 0xffff, 0xffff, 0xffff
};
#endif
107
108
// Glyph lookup used by Encodings::transformChar() when
// lyxrc.font_norm_type is LyXRC::ISO_10646_1.
// Row    = input byte minus arabic_start (0xc1), so the 63 rows cover
//          0xc1..0xff; the trailing comment on each row names that byte.
// Column = the Letter_Form value passed to transformChar().
// NOTE(review): which column is the isolated/final/initial/medial shape
// depends on the order of the Letter_Form enum, declared elsewhere -- confirm.
// The all-zero rows are the same rows that are all-zero in arabic_table;
// is_arabic() rejects those bytes, so transformChar() never reads them.
109 unsigned char arabic_table2[63][4] = {
110         {0x41, 0x41, 0x41, 0x41}, // 0xc1 = hamza
111         {0x42, 0xa1, 0x42, 0xa1}, // 0xc2 = ligature madda on alef
112         {0x43, 0xa2, 0x43, 0xa2}, // 0xc3 = ligature hamza on alef
113         {0x44, 0xa3, 0x44, 0xa3}, // 0xc4 = ligature hamza on waw
114         {0x45, 0xa4, 0x45, 0xa4}, // 0xc5 = ligature hamza under alef
115         {0x46, 0xf9, 0xf8, 0xa0}, // 0xc6 = ligature hamza on ya
116         {0x47, 0xa5, 0x47, 0xa5}, // 0xc7 = alef
117         {0x48, 0xae, 0xac, 0xad}, // 0xc8 = baa
118         {0x49, 0xb1, 0x49, 0xb1}, // 0xc9 = taa marbuta
119         {0x4a, 0xb4, 0xb2, 0xb3}, // 0xca = taa
120         {0x4b, 0xb7, 0xb5, 0xb6}, // 0xcb = thaa
121         {0x4c, 0xba, 0xb8, 0xb9}, // 0xcc = jeem
122         {0x4d, 0xbd, 0xbb, 0xbc}, // 0xcd = haa
123         {0x4e, 0xc0, 0xbe, 0xbf}, // 0xce = khaa
124         {0x4f, 0xa6, 0x4f, 0xa6}, // 0xcf = dal
125
126         {0x50, 0xa7, 0x50, 0xa7}, // 0xd0 = thal
127         {0x51, 0xa8, 0x51, 0xa8}, // 0xd1 = ra
128         {0x52, 0xa9, 0x52, 0xa9}, // 0xd2 = zain
129         {0x53, 0xc3, 0xc1, 0xc2}, // 0xd3 = seen
130         {0x54, 0xc6, 0xc4, 0xc5}, // 0xd4 = sheen
131         {0x55, 0xc9, 0xc7, 0xc8}, // 0xd5 = sad
132         {0x56, 0xcc, 0xca, 0xcb}, // 0xd6 = dad
133         {0x57, 0xcf, 0xcd, 0xce}, // 0xd7 = tah
134         {0x58, 0xd2, 0xd0, 0xd1}, // 0xd8 = zah
135         {0x59, 0xd5, 0xd3, 0xd4}, // 0xd9 = ain
136         {0x5a, 0xd8, 0xd6, 0xd7}, // 0xda = ghain
137         {0,0,0,0}, // 0xdb
138         {0,0,0,0}, // 0xdc
139         {0,0,0,0}, // 0xdd
140         {0,0,0,0}, // 0xde
141         {0,0,0,0}, // 0xdf
142
143         {0,0,0,0}, // 0xe0
144         {0x61, 0xdb, 0xd9, 0xda}, // 0xe1 = fa
145         {0x62, 0xde, 0xdc, 0xdd}, // 0xe2 = qaf
146         {0x63, 0xe1, 0xdf, 0xe0}, // 0xe3 = kaf
147         {0x64, 0xe4, 0xe2, 0xe3}, // 0xe4 = lam
148         {0x65, 0xe7, 0xe5, 0xe6}, // 0xe5 = meem
149         {0x66, 0xea, 0xe8, 0xe9}, // 0xe6 = noon
150         {0x67, 0xed, 0xeb, 0xec}, // 0xe7 = ha
151         {0x68, 0xaa, 0x68, 0xaa}, // 0xe8 = waw
152         {0x69, 0xab, 0x69, 0xab}, // 0xe9 = alef maksura
153         {0x6a, 0xf0, 0xee, 0xef}, // 0xea = ya
154         {0x6b, 0x6b, 0x6b, 0x6b}, // 0xeb = fathatan
155         {0x6c, 0x6c, 0x6c, 0x6c}, // 0xec = dammatan
156         {0x6d, 0x6d, 0x6d, 0x6d}, // 0xed = kasratan
157         {0x6e, 0x6e, 0x6e, 0x6e}, // 0xee = fatha
158         {0x6f, 0x6f, 0x6f, 0x6f}, // 0xef = damma
159
160         {0x70, 0x70, 0x70, 0x70}, // 0xf0 = kasra
161         {0x71, 0x71, 0x71, 0x71}, // 0xf1 = shadda
162         {0x72, 0x72, 0x72, 0x72}, // 0xf2 = sukun
163         {0,0,0,0}, // 0xf3
164         {0,0,0,0}, // 0xf4
165         {0,0,0,0}, // 0xf5
166         {0,0,0,0}, // 0xf6
167         {0,0,0,0}, // 0xf7
168         {0,0,0,0}, // 0xf8
169         {0,0,0,0}, // 0xf9
170         {0,0,0,0}, // 0xfa
171         {0,0,0,0}, // 0xfb
172         {0,0,0,0}, // 0xfc
173         {0,0,0,0}, // 0xfd
174         {0,0,0,0}, // 0xfe
175         {0,0,0,0}, // 0xff
176 };
177
178
// Two-column glyph lookup used by Encodings::transformChar() whenever
// lyxrc.font_norm_type is not LyXRC::ISO_10646_1.
// Row    = input byte minus arabic_start (0xc1), so the 63 rows cover
//          0xc1..0xff; the trailing comment on each row names that byte.
// Column = form >> 1, i.e. Letter_Form values 0 and 1 share column 0 and
//          values 2 and 3 share column 1.
// Column 0 doubles as the membership test of Encodings::is_arabic():
// a byte counts as Arabic only if its row has a non-zero first entry,
// so the {0,0} rows mark bytes that are not Arabic letters.
179 unsigned char arabic_table[63][2] = {
180         {0xc1, 0xc1}, // 0xc1 = hamza
181         {0xc2, 0xc2}, // 0xc2 = ligature madda on alef
182         {0xc3, 0xc3}, // 0xc3 = ligature hamza on alef
183         {0xc4, 0xc4}, // 0xc4 = ligature hamza on waw
184         {0xc5, 0xc5}, // 0xc5 = ligature hamza under alef
185         {0xc6, 0xc0}, // 0xc6 = ligature hamza on ya
186         {0xc7, 0xc7}, // 0xc7 = alef
187         {0xc8, 0xeb}, // 0xc8 = baa
188         {0xc9, 0xc9}, // 0xc9 = taa marbuta
189         {0xca, 0xec}, // 0xca = taa
190         {0xcb, 0xed}, // 0xcb = thaa
191         {0xcc, 0xee}, // 0xcc = jeem
192         {0xcd, 0xef}, // 0xcd = haa
193         {0xce, 0xf0}, // 0xce = khaa
194         {0xcf, 0xcf}, // 0xcf = dal
195
196         {0xd0, 0xd0}, // 0xd0 = thal
197         {0xd1, 0xd1}, // 0xd1 = ra
198         {0xd2, 0xd2}, // 0xd2 = zain
199         {0xd3, 0xf1}, // 0xd3 = seen
200         {0xd4, 0xf2}, // 0xd4 = sheen
201         {0xd5, 0xf3}, // 0xd5 = sad
202         {0xd6, 0xf4}, // 0xd6 = dad
203         {0xd7, 0xd7}, // 0xd7 = tah
204         {0xd8, 0xd8}, // 0xd8 = zah
205         {0xd9, 0xf5}, // 0xd9 = ain
206         {0xda, 0xf6}, // 0xda = ghain
207         {0,0}, // 0xdb
208         {0,0}, // 0xdc
209         {0,0}, // 0xdd
210         {0,0}, // 0xde
211         {0,0}, // 0xdf
212
213         {0,0},  // 0xe0
214         {0xe1, 0xf7},   // 0xe1 = fa
215         {0xe2, 0xf8},   // 0xe2 = qaf
216         {0xe3, 0xf9},   // 0xe3 = kaf
217         {0xe4, 0xfa},   // 0xe4 = lam
218         {0xe5, 0xfb},   // 0xe5 = meem
219         {0xe6, 0xfc},   // 0xe6 = noon
220         {0xe7, 0xfd},   // 0xe7 = ha
221         {0xe8, 0xe8},   // 0xe8 = waw
222         {0xe9, 0xe9},   // 0xe9 = alef maksura
223         {0xea, 0xfe},   // 0xea = ya
224         {0xa8, 0xa8},   // 0xeb = fathatan
225         {0xa9, 0xa9},   // 0xec = dammatan
226         {0xaa, 0xaa},   // 0xed = kasratan
227         {0xab, 0xab},   // 0xee = fatha
228         {0xac, 0xac},   // 0xef = damma
229
230         {0xad, 0xad},   // 0xf0 = kasra
231         {0xae, 0xae},   // 0xf1 = shadda
232         {0xaf, 0xaf},   // 0xf2 = sukun
233         {0,0}, // 0xf3
234         {0,0}, // 0xf4
235         {0,0}, // 0xf5
236         {0,0}, // 0xf6
237         {0,0}, // 0xf7
238         {0,0}, // 0xf8
239         {0,0}, // 0xf9
240         {0,0}, // 0xfa
241         {0,0}, // 0xfb
242         {0,0}, // 0xfc
243         {0,0}, // 0xfd
244         {0,0}, // 0xfe
245         {0,0} // 0xff
246 };
247
248
// Byte value that row 0 of arabic_table and arabic_table2 corresponds to;
// both tables are indexed with (c - arabic_start).
249 unsigned char const arabic_start = 0xc1;
250
251 } // namespace anon
252
253
254 bool Encodings::isComposeChar_hebrew(unsigned char c)
255 {
256         return c <= 0xd2 && c >= 0xc0 &&
257                 c != 0xce && c != 0xd0;
258 }
259
260
261 // Special Arabic letters are ones that do not get connected from left
262 // they are hamza, alef_madda, alef_hamza, waw_hamza, alef_hamza_under,
263 // alef, tah_marbota, dal, thal, rah, zai, wow, alef_maksoura
264
265 bool Encodings::is_arabic_special(unsigned char c)
266 {
267         return  (c >= 0xc1 && c <= 0xc5) ||
268                  c == 0xc7 || c  == 0xc9  ||
269                  c == 0xcf || c  == 0xe8  ||
270                 (c >= 0xd0 && c <= 0xd2) ||
271                  c == 0xe9;
272 }
273
274 bool Encodings::isComposeChar_arabic(unsigned char c)
275 {
276         return c >= 0xeb && c <= 0xf2;
277 }
278
279
280 bool Encodings::is_arabic(unsigned char c)
281 {
282         return c >= arabic_start && arabic_table[c-arabic_start][0];
283 }
284
285
286 unsigned char Encodings::transformChar(unsigned char c,
287                                       Encodings::Letter_Form form)
288 {
289         if (!is_arabic(c))
290                 return c;
291
292         if (lyxrc.font_norm_type == LyXRC::ISO_10646_1)
293                 return arabic_table2[c-arabic_start][form];
294         else
295                 return arabic_table[c-arabic_start][form >> 1];
296 }
297
298
299 Encoding const * Encodings::getEncoding(string const & encoding) const
300 {
301         EncodingList::const_iterator it = encodinglist.find(encoding);
302         if (it != encodinglist.end())
303                 return &it->second;
304         else
305                 return 0;
306 }
307
308 Encodings::Encodings()
309 {
310         encodinglist["iso8859-1"] = Encoding("iso8859-1", "latin1", tab_iso8859_1);
311         symbol_encoding_ =
312 #ifdef USE_UNICODE_FOR_SYMBOLS
313                 Encoding("symbol", "", tab_symbol);
314 #else
315                 Encoding("symbol", "", tab_iso8859_1);
316 #endif
317 }
318
319 void Encodings::read(string const & filename)
320 {
321         enum Encodingtags {
322                 et_encoding = 1,
323                 et_end,
324                 et_last
325         };
326
327         struct keyword_item encodingtags[et_last - 1] = {
328                 { "encoding", et_encoding },
329                 { "end", et_end }
330         };
331
332         LyXLex lex(encodingtags, et_last - 1);
333         lex.setFile(filename);
334         while (lex.isOK()) {
335                 switch (lex.lex()) {
336                 case et_encoding:
337                 {
338                         lex.next();
339                         string const name = lex.getString();
340                         lex.next();
341                         string const latexname = lex.getString();
342                         lyxerr[Debug::INFO] << "Reading encoding " << name << endl;
343                         Uchar table[256];
344                         for (unsigned int i = 0; i < 256; ++i) {
345                                 lex.next();
346                                 string const tmp = lex.getString();
347                                 table[i] = ::strtol(tmp.c_str(), 0 , 16);
348                         }
349                         encodinglist[name] = Encoding(name, latexname, table);
350                         if (lex.lex() != et_end)
351                                 lex.printError("Encodings::read: "
352                                                "missing end");
353                         break;
354                 }
355                 case et_end:
356                         lex.printError("Encodings::read: Misplaced end");
357                         break;
358                 case LyXLex::LEX_FEOF:
359                         break;
360                 default:
361                         lex.printError("Encodings::read: "
362                                        "Unknown tag: `$$Token'");
363                         break;
364                 }
365         }
366 }
367
368
369 } // namespace lyx