]> git.lyx.org Git - lyx.git/blob - src/Encoding.cpp
Add ObsoletedBy tag to InsetLayout
[lyx.git] / src / Encoding.cpp
1 /**
2  * \file Encoding.cpp
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author Lars Gullik Bjønnes
7  * \author Jean-Marc Lasgouttes
8  * \author Dekel Tsur
9  *
10  * Full author contact details are available in file CREDITS.
11  */
12
13 #include <config.h>
14
15 #include "Encoding.h"
16
17 #include "Lexer.h"
18
19 #include "support/debug.h"
20 #include "support/gettext.h"
21 #include "support/lstrings.h"
22 #include "support/textutils.h"
23 #include "support/unicode.h"
24
25 #include <boost/cstdint.hpp>
26
27 #include <sstream>
28 #include <algorithm>
29
30 using namespace std;
31 using namespace lyx::support;
32
33 namespace lyx {
34
/// Definition of the static constant Encoding::any.
/// NOTE(review): with all bits set, (package() & any) is non-zero for every
/// non-zero package value, so this presumably serves as the "any package"
/// argument of fromLaTeXName()/fromIconvName() -- confirm against Encoding.h.
35 int const Encoding::any = -1;
36
/// The global Encodings object defined by this translation unit.
37 Encodings encodings;
38
/// Storage for the static sets declared in class Encodings.
/// NOTE(review): presumably filled through addMathCmd(), addTextCmd() and
/// addMathSym(), which latexMathChar() calls -- confirm in Encoding.h.
39 Encodings::MathCommandSet Encodings::mathcmd;
40 Encodings::TextCommandSet Encodings::textcmd;
41 Encodings::MathSymbolSet  Encodings::mathsym;
42
43 namespace {
44
    /// Per code point information, keyed by the UCS4 code point. The entries
    /// are the ones "from the unicodesymbols file" (see symbolsList()).
45 typedef map<char_type, CharInfo> CharInfoMap;
46 CharInfoMap unicodesymbols;
47
    /// forced: code points carrying the plain "force" flag.
    /// forcedselected: code points carrying a "force=<encodings>" flag, keyed
    /// by encoding name. Every Encoding keeps a pointer to its own entry.
48 typedef set<char_type> CharSet;
49 typedef map<string, CharSet> CharSetMap;
50 CharSet forced;
51 CharSetMap forcedselected;
52
    /// Code points carrying the "mathalpha" flag (queried by isMathAlpha()).
53 typedef set<char_type> MathAlphaSet;
54 MathAlphaSet mathalpha;
55
56
57 /// One past the highest code point in UCS4 encoding (1<<20 + 1<<16).
    /// The highest valid code point is 0x10FFFF; this value is used as an
    /// exclusive upper bound.
58 char_type const max_ucs4 = 0x110000;
59
60 } // namespace anon
61
62
63 EncodingException::EncodingException(char_type c)
64         : failed_char(c), par_id(0), pos(0)
65 {
66 }
67
68
69 const char * EncodingException::what() const throw()
70 {
71         return "Could not find LaTeX command for a character";
72 }
73
74
75 CharInfo::CharInfo(
76         docstring const & textcommand, docstring const & mathcommand,
77         std::string const & textpreamble, std::string const & mathpreamble,
78         std::string const & tipashortcut, unsigned int flags)
79         : textcommand_(textcommand), mathcommand_(mathcommand),
80           textpreamble_(textpreamble), mathpreamble_(mathpreamble),
81           tipashortcut_(tipashortcut), flags_(flags)
82 {
83 }
84
85
86 Encoding::Encoding(string const & n, string const & l, string const & g,
87                    string const & i, bool f, bool u, Encoding::Package p)
88         : name_(n), latexName_(l), guiName_(g), iconvName_(i), fixedwidth_(f),
89           unsafe_(u), forced_(&forcedselected[n]), package_(p)
90 {
91         if (n == "ascii") {
92                 // ASCII can encode 128 code points and nothing else
93                 start_encodable_ = 128;
94                 complete_ = true;
95         } else if (i == "UTF-8") {
96                 // UTF8 can encode all UCS4 code points
97                 start_encodable_ = max_ucs4;
98                 complete_ = true;
99         } else {
100                 start_encodable_ = 0;
101                 complete_ = false;
102         }
103 }
104
105
106 void Encoding::init() const
107 {
108         if (complete_)
109                 return;
110
111         start_encodable_ = 0;
112         // temporarily switch off lyxerr, since we will generate iconv errors
113         lyxerr.disable();
114         if (fixedwidth_) {
115                 // We do not need to check all UCS4 code points, it is enough
116                 // if we check all 256 code points of this encoding.
117                 for (unsigned short j = 0; j < 256; ++j) {
118                         char const c = char(j);
119                         vector<char_type> const ucs4 = eightbit_to_ucs4(&c, 1, iconvName_);
120                         if (ucs4.size() != 1)
121                                 continue;
122                         char_type const uc = ucs4[0];
123                         CharInfoMap::const_iterator const it = unicodesymbols.find(uc);
124                         if (it == unicodesymbols.end())
125                                 encodable_.insert(uc);
126                         else if (!it->second.force()) {
127                                 if (forced_->empty() || forced_->find(uc) == forced_->end())
128                                         encodable_.insert(uc);
129                         }
130                 }
131         } else {
132                 // We do not know how many code points this encoding has, and
133                 // they do not have a direct representation as a single byte,
134                 // therefore we need to check all UCS4 code points.
135                 // This is expensive!
136                 for (char_type c = 0; c < max_ucs4; ++c) {
137                         vector<char> const eightbit = ucs4_to_eightbit(&c, 1, iconvName_);
138                         if (!eightbit.empty()) {
139                                 CharInfoMap::const_iterator const it = unicodesymbols.find(c);
140                                 if (it == unicodesymbols.end())
141                                         encodable_.insert(c);
142                                 else if (!it->second.force()) {
143                                         if (forced_->empty() || forced_->find(c) == forced_->end())
144                                                 encodable_.insert(c);
145                                 }
146                         }
147                 }
148         }
149         lyxerr.enable();
150         CharSet::iterator it = encodable_.find(start_encodable_);
151         while (it != encodable_.end()) {
152                 encodable_.erase(it);
153                 ++start_encodable_;
154                 it = encodable_.find(start_encodable_);
155         }
156         complete_ = true;
157 }
158
159
160 bool Encoding::isForced(char_type c) const
161 {
162         if (!forced.empty() && forced.find(c) != forced.end())
163                 return true;
164         return !forced_->empty() && forced_->find(c) != forced_->end();
165 }
166
167
168 bool Encoding::encodable(char_type c) const
169 {
170         // assure the used encoding is properly initialized
171         init();
172
173         if (iconvName_ == "UTF-8" && package_ == none)
174                 return true;
175         if (c < start_encodable_ && !isForced(c))
176                 return true;
177         if (encodable_.find(c) != encodable_.end())
178                 return true;
179         return false;
180 }
181
182
183 pair<docstring, bool> Encoding::latexChar(char_type c) const
184 {
185         if (encodable(c))
186                 return make_pair(docstring(1, c), false);
187
188         // c cannot (or should not) be encoded in this encoding
189         CharInfoMap::const_iterator const it = unicodesymbols.find(c);
190         if (it == unicodesymbols.end())
191                 throw EncodingException(c);
192         // at least one of mathcommand and textcommand is nonempty
193         if (it->second.textcommand().empty())
194                 return make_pair(
195                         "\\ensuremath{" + it->second.mathcommand() + '}', false);
196         return make_pair(it->second.textcommand(), !it->second.textnotermination());
197 }
198
199
200 pair<docstring, docstring> Encoding::latexString(docstring const & input, bool dryrun) const
201 {
202         docstring result;
203         docstring uncodable;
204         bool terminate = false;
205         for (size_t n = 0; n < input.size(); ++n) {
206                 try {
207                         char_type const c = input[n];
208                         pair<docstring, bool> latex_char = latexChar(c);
209                         docstring const latex = latex_char.first;
210                         if (terminate && !prefixIs(latex, '\\')
211                             && !prefixIs(latex, '{')
212                             && !prefixIs(latex, '}')) {
213                                         // Prevent eating of a following
214                                         // space or command corruption by
215                                         // following characters
216                                         if (latex == " ")
217                                                 result += "{}";
218                                         else
219                                                 result += " ";
220                                 }
221                         result += latex;
222                         terminate = latex_char.second;
223                 } catch (EncodingException & /* e */) {
224                         LYXERR0("Uncodable character in latexString!");
225                         if (dryrun) {
226                                 result += "<" + _("LyX Warning: ")
227                                            + _("uncodable character") + " '";
228                                 result += docstring(1, input[n]);
229                                 result += "'>";
230                         } else
231                                 uncodable += input[n];
232                 }
233         }
234         return make_pair(result, uncodable);
235 }
236
237
238 vector<char_type> Encoding::symbolsList() const
239 {
240         // assure the used encoding is properly initialized
241         init();
242
243         // first all encodable characters
244         vector<char_type> symbols(encodable_.begin(), encodable_.end());
245         // add those below start_encodable_
246         for (char_type c = 0; c < start_encodable_; ++c)
247                 symbols.push_back(c);
248         // now the ones from the unicodesymbols file
249         CharInfoMap::const_iterator const end = unicodesymbols.end();
250         CharInfoMap::const_iterator it = unicodesymbols.begin();
251         for (; it != end; ++it)
252                 symbols.push_back(it->first);
253         return symbols;
254 }
255
256
257 bool Encodings::latexMathChar(char_type c, bool mathmode,
258                         Encoding const * encoding, docstring & command,
259                         bool & needsTermination)
260 {
261         command = empty_docstring();
262         if (encoding)
263                 if (encoding->encodable(c))
264                         command = docstring(1, c);
265         needsTermination = false;
266
267         CharInfoMap::const_iterator const it = unicodesymbols.find(c);
268         if (it == unicodesymbols.end()) {
269                 if (!encoding || command.empty())
270                         throw EncodingException(c);
271                 if (mathmode)
272                         addMathSym(c);
273                 return false;
274         }
275         // at least one of mathcommand and textcommand is nonempty
276         bool use_math = (mathmode && !it->second.mathcommand().empty()) ||
277                         (!mathmode && it->second.textcommand().empty());
278         if (use_math) {
279                 command = it->second.mathcommand();
280                 needsTermination = !it->second.mathnotermination();
281                 addMathCmd(c);
282         } else {
283                 if (!encoding || command.empty()) {
284                         command = it->second.textcommand();
285                         needsTermination = !it->second.textnotermination();
286                         addTextCmd(c);
287                 }
288                 if (mathmode)
289                         addMathSym(c);
290         }
291         return use_math;
292 }
293
294
295 char_type Encodings::fromLaTeXCommand(docstring const & cmd, int cmdtype,
296                 bool & combining, bool & needsTermination, set<string> * req)
297 {
298         CharInfoMap::const_iterator const end = unicodesymbols.end();
299         CharInfoMap::const_iterator it = unicodesymbols.begin();
300         for (combining = false; it != end; ++it) {
301                 docstring const math = it->second.mathcommand();
302                 docstring const text = it->second.textcommand();
303                 if ((cmdtype & MATH_CMD) && math == cmd) {
304                         combining = it->second.combining();
305                         needsTermination = !it->second.mathnotermination();
306                         if (req && it->second.mathfeature() &&
307                             !it->second.mathpreamble().empty())
308                                 req->insert(it->second.mathpreamble());
309                         return it->first;
310                 }
311                 if ((cmdtype & TEXT_CMD) && text == cmd) {
312                         combining = it->second.combining();
313                         needsTermination = !it->second.textnotermination();
314                         if (req && it->second.textfeature() &&
315                             !it->second.textpreamble().empty())
316                                 req->insert(it->second.textpreamble());
317                         return it->first;
318                 }
319         }
320         needsTermination = false;
321         return 0;
322 }
323
324
/// Convert a string of LaTeX commands into the symbols they stand for.
/// The string is scanned from left to right. For the not yet converted part
/// the unicodesymbols table is searched for the longest math and/or text
/// command matching at its start. A combining command followed by a braced
/// argument is converted to the argument character followed by the combining
/// character.
/// \param cmd the LaTeX input
/// \param cmdtype bit mask of MATH_CMD and/or TEXT_CMD selecting which
/// command columns of unicodesymbols are considered
/// \param needsTermination receives whether the last accepted command lacks
/// the matching "no termination" flag; it is reset to false when the
/// remainder starts with "{}" or with a space
/// \param rem receives the part of \p cmd following the last converted
/// command if the end of \p cmd could not be converted, and is empty
/// otherwise
/// \param req if non-null, receives the math or text preamble of each accepted
/// command when that preamble is non-empty and flagged as a feature
/// \return the converted symbols
325 docstring Encodings::fromLaTeXCommand(docstring const & cmd, int cmdtype,
326                 bool & needsTermination, docstring & rem, set<string> * req)
327 {
328         needsTermination = false;
329         rem = empty_docstring();
330         bool const mathmode = cmdtype & MATH_CMD;
331         bool const textmode = cmdtype & TEXT_CMD;
332         docstring symbols;
333         size_t const cmdend = cmd.size();
            // Number of characters after the backslash that form a one
            // character macro: 1 for \x, 2 for \x* (0 until a backslash is seen).
            // NOTE(review): this is not reset at the top of the loop, so an
            // iteration that does not start at a backslash sees the value left
            // by an earlier one -- confirm that this is intended.
334         size_t prefix = 0;
335         CharInfoMap::const_iterator const uniend = unicodesymbols.end();
            // i: start of the part of cmd that has not been converted yet
            // j: last position of the candidate substring currently examined
336         for (size_t i = 0, j = 0; j < cmdend; ++j) {
337                 // Also get the char after a backslash
338                 if (j + 1 < cmdend && cmd[j] == '\\') {
339                         ++j;
340                         prefix = 1;
341                         // Detect things like \=*{e} as well
342                         if (j + 3 < cmdend && cmd[j+1] == '*' &&
343                             cmd[j+2] == '{') {
344                                 ++j;
345                                 prefix = 2;
346                         }
347                 }
348                 // position of the last character before a possible macro
349                 // argument
350                 size_t m = j;
351                 // If a macro argument follows, get it, too
352                 // Do it here only for single character commands. Other
353                 // combining commands need this too, but they are handled in
354                 // the loop below for performance reasons.
355                 if (j + 1 < cmdend && cmd[j + 1] == '{') {
356                         size_t k = j + 1;
                            // nesting depth of the braces seen so far
357                         int count = 1;
358                         while (k < cmdend && count) {
359                                 k = cmd.find_first_of(from_ascii("{}"), k + 1);
360                                 // braces may not be balanced
361                                 if (k == docstring::npos)
362                                         break;
363                                 if (cmd[k] == '{')
364                                         ++count;
365                                 else
366                                         --count;
367                         }
368                         if (k != docstring::npos)
369                                 j = k;
370                 } else if (m + 1 < cmdend && isAlphaASCII(cmd[m])) {
                            // advance m over the following ASCII letters
371                         while (m + 2 < cmdend && isAlphaASCII(cmd[m+1]))
372                                 m++;
373                 }
374                 // Start with this substring and try augmenting it when it is
375                 // the prefix of some command in the unicodesymbols file
376                 docstring subcmd = cmd.substr(i, j - i + 1);
377
378                 CharInfoMap::const_iterator it = unicodesymbols.begin();
379                 // First part of subcmd which might be a combining character
380                 docstring combcmd = (m == j) ? docstring() : cmd.substr(i, m - i + 1);
381                 // The combining character of combcmd if it exists
382                 CharInfoMap::const_iterator combining = uniend;
                    // size of the longest entry accepted so far for this
                    // substring (0 if none), and the symbol of that entry
383                 size_t unicmd_size = 0;
384                 char_type c = 0;
385                 for (; it != uniend; ++it) {
386                         docstring const math = mathmode ? it->second.mathcommand()
387                                                         : docstring();
388                         docstring const text = textmode ? it->second.textcommand()
389                                                         : docstring();
390                         if (!combcmd.empty() && it->second.combining() &&
391                             (math == combcmd || text == combcmd))
392                                 combining = it;
393                         size_t cur_size = max(math.size(), text.size());
394                         // The current math or text unicode command cannot
395                         // match, or we already matched a longer one
396                         if (cur_size < subcmd.size() || cur_size <= unicmd_size)
397                                 continue;
398
399                         docstring tmp = subcmd;
400                         size_t k = j;
401                         while (prefixIs(math, tmp) || prefixIs(text, tmp)) {
402                                 ++k;
403                                 if (k >= cmdend || cur_size <= tmp.size())
404                                         break;
405                                 tmp += cmd[k];
406                         }
407                         // No match
408                         if (k == j)
409                                 continue;
410
411                         // The last added char caused a mismatch, because
412                         // we didn't exhaust the chars in cmd and didn't
413                         // exceed the maximum size of the current unicmd
414                         if (k < cmdend && cur_size > tmp.size())
415                                 tmp.resize(tmp.size() - 1);
416
417                         // If this is an exact match, we found a (longer)
418                         // matching entry in the unicodesymbols file.
419                         if (math != tmp && text != tmp)
420                                 continue;
421                         // If we found a combining command, we need to append
422                         // the macro argument if this has not been done above.
423                         if (tmp == combcmd && combining != uniend &&
424                             k < cmdend && cmd[k] == '{') {
425                                 size_t l = k;
426                                 int count = 1;
427                                 while (l < cmdend && count) {
428                                         l = cmd.find_first_of(from_ascii("{}"), l + 1);
429                                         // braces may not be balanced
430                                         if (l == docstring::npos)
431                                                 break;
432                                         if (cmd[l] == '{')
433                                                 ++count;
434                                         else
435                                                 --count;
436                                 }
437                                 if (l != docstring::npos) {
438                                         j = l;
439                                         subcmd = cmd.substr(i, j - i + 1);
440                                 }
441                         }
                            // NOTE(review): the "else" below pairs with the
                            // "if" just above, so an entry for which the macro
                            // argument was appended is not accepted here; it is
                            // left to the combining branch after this loop.
442                         // If the entry doesn't start with '\', we take note
443                         // of the match and continue (this is not a ultimate
444                         // acceptance, as some other entry may match a longer
445                         // portion of the cmd string). However, if the entry
446                         // does start with '\', we accept the match only if
447                         // this is a valid macro, i.e., either it is a single
448                         // (nonletter) char macro, or nothing else follows,
449                         // or what follows is a nonletter char, or the last
450                         // character is a }.
451                         else if (tmp[0] != '\\'
452                                    || (tmp.size() == prefix + 1 &&
453                                        !isAlphaASCII(tmp[1]) &&
454                                        (prefix == 1 || !isAlphaASCII(tmp[2])))
455                                    || k == cmdend
456                                    || !isAlphaASCII(cmd[k])
457                                    || tmp[tmp.size() - 1] == '}'
458                                  ) {
459                                 c = it->first;
                                    // continue the outer scan right after the
                                    // matched text
460                                 j = k - 1;
461                                 i = j + 1;
462                                 unicmd_size = cur_size;
463                                 if (math == tmp)
464                                         needsTermination = !it->second.mathnotermination();
465                                 else
466                                         needsTermination = !it->second.textnotermination();
467                                 if (req) {
468                                         if (math == tmp && it->second.mathfeature() &&
469                                             !it->second.mathpreamble().empty())
470                                                 req->insert(it->second.mathpreamble());
471                                         if (text == tmp && it->second.textfeature() &&
472                                             !it->second.textpreamble().empty())
473                                                 req->insert(it->second.textpreamble());
474                                 }
475                         }
476                 }
477                 if (unicmd_size)
478                         symbols += c;
479                 else if (combining != uniend &&
480                          prefixIs(subcmd, combcmd + '{')) {
481                         // We know that subcmd starts with combcmd and
482                         // contains an argument in braces.
483                         docstring const arg = subcmd.substr(
484                                 combcmd.length() + 1,
485                                 subcmd.length() - combcmd.length() - 2);
486                         // If arg is a single character we can construct a
487                         // combining sequence.
488                         char_type a;
489                         bool argcomb = false;
490                         if (arg.size() == 1 && isAlnumASCII(arg[0]))
491                                 a = arg[0];
492                         else {
493                                 // Use the version of fromLaTeXCommand() that
494                                 // parses only one command, since we cannot
495                                 // use more than one character.
496                                 bool dummy = false;
497                                 set<string> r;
498                                 a = fromLaTeXCommand(arg, cmdtype, argcomb,
499                                                      dummy, &r);
500                                 if (a && req && !argcomb)
501                                         req->insert(r.begin(), r.end());
502                         }
503                         if (a && !argcomb) {
504                                 // In unicode the combining character comes
505                                 // after its base
506                                 symbols += a;
507                                 symbols += combining->first;
508                                 i = j + 1;
509                                 unicmd_size = 2;
510                         }
511                 }
512                 if (j + 1 == cmdend && !unicmd_size) {
513                         // No luck. Return what remains
514                         rem = cmd.substr(i);
515                         if (needsTermination && !rem.empty()) {
516                                 if (rem.substr(0, 2) == "{}") {
517                                         rem = rem.substr(2);
518                                         needsTermination = false;
519                                 } else if (rem[0] == ' ') {
520                                         needsTermination = false;
521                                         // LaTeX would swallow all spaces
522                                         rem = ltrim(rem);
523                                 }
524                         }
525                 }
526         }
527         return symbols;
528 }
529
530
531 CharInfo const & Encodings::unicodeCharInfo(char_type c)
532 {
533         static CharInfo empty;
534         CharInfoMap::const_iterator const it = unicodesymbols.find(c);
535         return it != unicodesymbols.end() ? it->second : empty;
536 }
537
538
539 bool Encodings::isCombiningChar(char_type c)
540 {
541         CharInfoMap::const_iterator const it = unicodesymbols.find(c);
542         if (it != unicodesymbols.end())
543                 return it->second.combining();
544         return false;
545 }
546
547
548 string const Encodings::TIPAShortcut(char_type c)
549 {
550         CharInfoMap::const_iterator const it = unicodesymbols.find(c);
551         if (it != unicodesymbols.end())
552                 return it->second.tipashortcut();
553         return string();
554 }
555
556
557 bool Encodings::isKnownScriptChar(char_type const c, string & preamble)
558 {
559         CharInfoMap::const_iterator const it = unicodesymbols.find(c);
560
561         if (it == unicodesymbols.end())
562                 return false;
563
564         if (it->second.textpreamble() != "textgreek" && it->second.textpreamble() != "textcyr")
565                 return false;
566
567         if (preamble.empty()) {
568                 preamble = it->second.textpreamble();
569                 return true;
570         }
571         return it->second.textpreamble() == preamble;
572 }
573
574
575 bool Encodings::isMathAlpha(char_type c)
576 {
577         return mathalpha.count(c);
578 }
579
580
581 Encoding const *
582 Encodings::fromLyXName(string const & name, bool allowUnsafe) const
583 {
584         EncodingList::const_iterator const it = encodinglist.find(name);
585         if (!allowUnsafe && it->second.unsafe())
586                 return 0;
587         return it != encodinglist.end() ? &it->second : 0;
588 }
589
590
591 Encoding const *
592 Encodings::fromLaTeXName(string const & n, int const & p, bool allowUnsafe) const
593 {
594         string name = n;
595         // FIXME: if we have to test for too many of these synonyms,
596         // we should instead extend the format of lib/encodings
597         if (n == "ansinew")
598                 name = "cp1252";
599
600         // We don't use find_if because it makes copies of the pairs in
601         // the map.
602         // This linear search is OK since we don't have many encodings.
603         // Users could even optimize it by putting the encodings they use
604         // most at the top of lib/encodings.
605         EncodingList::const_iterator const end = encodinglist.end();
606         for (EncodingList::const_iterator it = encodinglist.begin(); it != end; ++it)
607                 if ((it->second.latexName() == name) && (it->second.package() & p)
608                                 && (!it->second.unsafe() || allowUnsafe))
609                         return &it->second;
610         return 0;
611 }
612
613
614 Encoding const *
615 Encodings::fromIconvName(string const & n, int const & p, bool allowUnsafe) const
616 {
617         EncodingList::const_iterator const end = encodinglist.end();
618         for (EncodingList::const_iterator it = encodinglist.begin(); it != end; ++it)
619                 if ((it->second.iconvName() == n) && (it->second.package() & p)
620                                 && (!it->second.unsafe() || allowUnsafe))
621                         return &it->second;
622         return 0;
623 }
624
625
626 Encodings::Encodings()
627 {}
628
629
630 void Encodings::read(FileName const & encfile, FileName const & symbolsfile)
631 {
632         // We must read the symbolsfile first, because the Encoding
633         // constructor depends on it.
634         CharSetMap forcednotselected;
635         Lexer symbolslex;
636         symbolslex.setFile(symbolsfile);
637         bool getNextToken = true;
638         while (symbolslex.isOK()) {
639                 char_type symbol;
640
641                 if (getNextToken) {
642                         if (!symbolslex.next(true))
643                                 break;
644                 } else
645                         getNextToken = true;
646
647                 istringstream is(symbolslex.getString());
648                 // reading symbol directly does not work if
649                 // char_type == wchar_t.
650                 boost::uint32_t tmp;
651                 if(!(is >> hex >> tmp))
652                         break;
653                 symbol = tmp;
654
655                 if (!symbolslex.next(true))
656                         break;
657                 docstring textcommand = symbolslex.getDocString();
658                 if (!symbolslex.next(true))
659                         break;
660                 string textpreamble = symbolslex.getString();
661                 if (!symbolslex.next(true))
662                         break;
663                 string sflags = symbolslex.getString();
664
665                 string tipashortcut;
666                 int flags = 0;
667
668                 if (suffixIs(textcommand, '}'))
669                         flags |= CharInfoTextNoTermination;
670                 while (!sflags.empty()) {
671                         string flag;
672                         sflags = split(sflags, flag, ',');
673                         if (flag == "combining") {
674                                 flags |= CharInfoCombining;
675                         } else if (flag == "force") {
676                                 flags |= CharInfoForce;
677                                 forced.insert(symbol);
678                         } else if (prefixIs(flag, "force=")) {
679                                 vector<string> encodings =
680                                         getVectorFromString(flag.substr(6), ";");
681                                 for (size_t i = 0; i < encodings.size(); ++i)
682                                         forcedselected[encodings[i]].insert(symbol);
683                                 flags |= CharInfoForceSelected;
684                         } else if (prefixIs(flag, "force!=")) {
685                                 vector<string> encodings =
686                                         getVectorFromString(flag.substr(7), ";");
687                                 for (size_t i = 0; i < encodings.size(); ++i)
688                                         forcednotselected[encodings[i]].insert(symbol);
689                                 flags |= CharInfoForceSelected;
690                         } else if (flag == "mathalpha") {
691                                 mathalpha.insert(symbol);
692                         } else if (flag == "notermination=text") {
693                                 flags |= CharInfoTextNoTermination;
694                         } else if (flag == "notermination=math") {
695                                 flags |= CharInfoMathNoTermination;
696                         } else if (flag == "notermination=both") {
697                                 flags |= CharInfoTextNoTermination;
698                                 flags |= CharInfoMathNoTermination;
699                         } else if (flag == "notermination=none") {
700                                 flags &= ~CharInfoTextNoTermination;
701                                 flags &= ~CharInfoMathNoTermination;
702                         } else if (contains(flag, "tipashortcut=")) {
703                                 tipashortcut = split(flag, '=');
704                         } else {
705                                 lyxerr << "Ignoring unknown flag `" << flag
706                                        << "' for symbol `0x"
707                                        << hex << symbol << dec
708                                        << "'." << endl;
709                         }
710                 }
711                 // mathcommand and mathpreamble have been added for 1.6.0.
712                 // make them optional so that old files still work.
713                 int const lineno = symbolslex.lineNumber();
714                 bool breakout = false;
715                 docstring mathcommand;
716                 string mathpreamble;
717                 if (symbolslex.next(true)) {
718                         if (symbolslex.lineNumber() != lineno) {
719                                 // line in old format without mathcommand and mathpreamble
720                                 getNextToken = false;
721                         } else {
722                                 mathcommand = symbolslex.getDocString();
723                                 if (suffixIs(mathcommand, '}'))
724                                         flags |= CharInfoMathNoTermination;
725                                 if (symbolslex.next(true)) {
726                                         if (symbolslex.lineNumber() != lineno) {
727                                                 // line in new format with mathcommand only
728                                                 getNextToken = false;
729                                         } else {
730                                                 // line in new format with mathcommand and mathpreamble
731                                                 mathpreamble = symbolslex.getString();
732                                         }
733                                 } else
734                                         breakout = true;
735                         }
736                 } else {
737                         breakout = true;
738                 }
739
740                 // backward compatibility
741                 if (mathpreamble == "esintoramsmath")
742                         mathpreamble = "esint|amsmath";
743
744                 if (!textpreamble.empty())
745                         if (textpreamble[0] != '\\')
746                                 flags |= CharInfoTextFeature;
747                 if (!mathpreamble.empty())
748                         if (mathpreamble[0] != '\\')
749                                 flags |= CharInfoMathFeature;
750
751                 CharInfo info = CharInfo(
752                         textcommand, mathcommand,
753                         textpreamble, mathpreamble,
754                         tipashortcut, flags);
755                 LYXERR(Debug::INFO, "Read unicode symbol " << symbol << " '"
756                            << to_utf8(info.textcommand()) << "' '" << info.textpreamble()
757                            << " '" << info.textfeature() << ' ' << info.textnotermination()
758                            << ' ' << to_utf8(info.mathcommand()) << "' '" << info.mathpreamble()
759                            << "' " << info.mathfeature() << ' ' << info.mathnotermination()
760                            << ' ' << info.combining() << ' ' << info.force()
761                            << ' ' << info.forceselected());
762
763                 // we assume that at least one command is nonempty when using unicodesymbols
764                 if (info.isUnicodeSymbol()) {
765                         unicodesymbols[symbol] = info;
766                 }
767
768                 if (breakout)
769                         break;
770         }
771
772         // Now read the encodings
773         enum {
774                 et_encoding = 1,
775                 et_end
776         };
777
778         LexerKeyword encodingtags[] = {
779                 { "encoding", et_encoding },
780                 { "end", et_end }
781         };
782
783         Lexer lex(encodingtags);
784         lex.setFile(encfile);
785         lex.setContext("Encodings::read");
786         while (lex.isOK()) {
787                 switch (lex.lex()) {
788                 case et_encoding:
789                 {
790                         lex.next();
791                         string const name = lex.getString();
792                         lex.next();
793                         string const latexname = lex.getString();
794                         lex.next();
795                         string const guiname = lex.getString();
796                         lex.next();
797                         string const iconvname = lex.getString();
798                         lex.next();
799                         string const width = lex.getString();
800                         bool fixedwidth = false;
801                         bool unsafe = false;
802                         if (width == "fixed")
803                                 fixedwidth = true;
804                         else if (width == "variable")
805                                 fixedwidth = false;
806                         else if (width == "variableunsafe") {
807                                 fixedwidth = false;
808                                 unsafe = true;
809                         }
810                         else
811                                 lex.printError("Unknown width");
812
813                         lex.next();
814                         string const p = lex.getString();
815                         Encoding::Package package = Encoding::none;
816                         if (p == "none")
817                                 package = Encoding::none;
818                         else if (p == "inputenc")
819                                 package = Encoding::inputenc;
820                         else if (p == "CJK")
821                                 package = Encoding::CJK;
822                         else if (p == "japanese")
823                                 package = Encoding::japanese;
824                         else
825                                 lex.printError("Unknown package");
826
827                         LYXERR(Debug::INFO, "Reading encoding " << name);
828                         encodinglist[name] = Encoding(name, latexname,
829                                 guiname, iconvname, fixedwidth, unsafe,
830                                 package);
831
832                         if (lex.lex() != et_end)
833                                 lex.printError("Missing end");
834                         break;
835                 }
836                 case et_end:
837                         lex.printError("Misplaced end");
838                         break;
839                 case Lexer::LEX_FEOF:
840                         break;
841                 default:
842                         lex.printError("Unknown tag");
843                         break;
844                 }
845         }
846
847         // Move all information from forcednotselected to forcedselected
848         for (CharSetMap::const_iterator it1 = forcednotselected.begin(); it1 != forcednotselected.end(); ++it1) {
849                 for (CharSetMap::iterator it2 = forcedselected.begin(); it2 != forcedselected.end(); ++it2) {
850                         if (it2->first != it1->first)
851                                 it2->second.insert(it1->second.begin(), it1->second.end());
852                 }
853         }
854
855 }
856
857
858 } // namespace lyx