X-Git-Url: https://git.lyx.org/gitweb/?a=blobdiff_plain;f=src%2FEncoding.cpp;h=c6441f1dc05607d49ee8f780c7f0400ada17c3c6;hb=2417d9d911dbca181c48f45d1aad26d31c9aa815;hp=e37293e993a9f6fb592d1cf4501d448d2414cb30;hpb=138b23fac84930cdbfada0067c61480989041113;p=lyx.git diff --git a/src/Encoding.cpp b/src/Encoding.cpp index e37293e993..c6441f1dc0 100644 --- a/src/Encoding.cpp +++ b/src/Encoding.cpp @@ -16,7 +16,7 @@ #include "debug.h" #include "LaTeXFeatures.h" -#include "LyXLex.h" +#include "Lexer.h" #include "LyXRC.h" #include "support/FileName.h" @@ -25,24 +25,22 @@ #include - -namespace lyx { - -using support::FileName; - #ifndef CXX_GLOBAL_CSTD using std::strtol; #endif - using std::endl; using std::string; +namespace lyx { + +using support::FileName; + Encodings encodings; namespace { -char_type arabic_table[50][4] = { +char_type arabic_table[172][4] = { {0xfe80, 0xfe80, 0xfe80, 0xfe80}, // 0x0621 = hamza {0xfe81, 0xfe82, 0xfe81, 0xfe82}, // 0x0622 = ligature madda on alef {0xfe83, 0xfe84, 0xfe83, 0xfe84}, // 0x0623 = ligature hamza on alef @@ -96,11 +94,134 @@ char_type arabic_table[50][4] = { {0x0650, 0x0650, 0x0650, 0x0650}, // 0x0650 = kasra {0x0651, 0x0651, 0x0651, 0x0651}, // 0x0651 = shadda {0x0652, 0x0652, 0x0652, 0x0652}, // 0x0652 = sukun + + {0, 0, 0, 0}, // 0x0653 + {0, 0, 0, 0}, // 0x0654 + {0, 0, 0, 0}, // 0x0655 + {0, 0, 0, 0}, // 0x0656 + {0, 0, 0, 0}, // 0x0657 + {0, 0, 0, 0}, // 0x0658 + {0, 0, 0, 0}, // 0x0659 + {0, 0, 0, 0}, // 0x065a + {0, 0, 0, 0}, // 0x065b + {0, 0, 0, 0}, // 0x065c + {0, 0, 0, 0}, // 0x065d + {0, 0, 0, 0}, // 0x065e + {0, 0, 0, 0}, // 0x065f + {0, 0, 0, 0}, // 0x0660 + {0, 0, 0, 0}, // 0x0661 + {0, 0, 0, 0}, // 0x0662 + {0, 0, 0, 0}, // 0x0663 + {0, 0, 0, 0}, // 0x0664 + {0, 0, 0, 0}, // 0x0665 + {0, 0, 0, 0}, // 0x0666 + {0, 0, 0, 0}, // 0x0667 + {0, 0, 0, 0}, // 0x0668 + {0, 0, 0, 0}, // 0x0669 + {0, 0, 0, 0}, // 0x066a + {0, 0, 0, 0}, // 0x066b + {0, 0, 0, 0}, // 0x066c + {0, 0, 0, 0}, // 0x066d + {0, 0, 0, 0}, // 0x066e + {0, 0, 0, 0}, // 0x066f + {0, 0, 0, 0}, // 0x0670 + {0, 0, 0, 0}, // 0x0671 + {0, 0, 0, 0}, // 0x0672 + {0, 0, 0, 0}, // 0x0673 + {0, 0, 0, 0}, // 0x0674 + {0, 0, 0, 0}, // 0x0675 + {0, 0, 0, 0}, // 0x0676 + {0, 0, 0, 0}, // 0x0677 + {0, 0, 0, 0}, // 0x0678 + {0, 0, 0, 0}, // 0x0679 + {0, 0, 0, 0}, // 0x067a + {0, 0, 0, 0}, // 0x067b + {0, 0, 0, 0}, // 0x067c + {0, 0, 0, 0}, // 0x067d + {0xfb56, 0xfb57, 0xfb58, 0xfb59}, // 0x067e = peh + {0, 0, 0, 0}, // 0x067f + {0, 0, 0, 0}, // 0x0680 + {0, 0, 0, 0}, // 0x0681 + {0, 0, 0, 0}, // 0x0682 + {0, 0, 0, 0}, // 0x0683 + {0, 0, 0, 0}, // 0x0684 + {0, 0, 0, 0}, // 0x0685 + {0xfb7a, 0xfb7b, 0xfb7c, 0xfb7d}, // 0x0686 = tcheh + {0, 0, 0, 0}, // 0x0687 + {0, 0, 0, 0}, // 0x0688 + {0, 0, 0, 0}, // 0x0689 + {0, 0, 0, 0}, // 0x068a + {0, 0, 0, 0}, // 0x068b + {0, 0, 0, 0}, // 0x068c + {0, 0, 0, 0}, // 0x068d + {0, 0, 0, 0}, // 0x068e + {0, 0, 0, 0}, // 0x068f + {0, 0, 0, 0}, // 0x0690 + {0, 0, 0, 0}, // 0x0691 + {0, 0, 0, 0}, // 0x0692 + {0, 0, 0, 0}, // 0x0693 + {0, 0, 0, 0}, // 0x0694 + {0, 0, 0, 0}, // 0x0695 + {0, 0, 0, 0}, // 0x0696 + {0, 0, 0, 0}, // 0x0697 + {0xfb8a, 0xfb8b, 0xfb8a, 0xfb8b}, // 0x0698 = jeh + {0, 0, 0, 0}, // 0x0699 + {0, 0, 0, 0}, // 0x069a + {0, 0, 0, 0}, // 0x069b + {0, 0, 0, 0}, // 0x069c + {0, 0, 0, 0}, // 0x069d + {0, 0, 0, 0}, // 0x069e + {0, 0, 0, 0}, // 0x069f + {0, 0, 0, 0}, // 0x06a0 + {0, 0, 0, 0}, // 0x06a1 + {0, 0, 0, 0}, // 0x06a2 + {0, 0, 0, 0}, // 0x06a3 + {0, 0, 0, 0}, // 0x06a4 + {0, 0, 0, 0}, // 0x06a5 + {0, 0, 0, 0}, // 0x06a6 + {0, 0, 0, 0}, // 0x06a7 + {0, 0, 0, 0}, // 0x06a8 + {0xfb8e, 0xfb8f, 0xfb90, 0xfb91}, // 0x06a9 = farsi kaf + {0, 0, 0, 0}, // 0x06aa + {0, 0, 0, 0}, // 0x06ab + {0, 0, 0, 0}, // 0x06ac + {0, 0, 0, 0}, // 0x06ad + {0, 0, 0, 0}, // 0x06ae + {0xfb92, 0xfb93, 0xfb94, 0xfb95}, // 0x06af = gaf + {0, 0, 0, 0}, // 0x06b0 + {0, 0, 0, 0}, // 0x06b1 + {0, 0, 0, 0}, // 0x06b2 + {0, 0, 0, 0}, // 0x06b3 + {0, 0, 0, 0}, // 0x06b4 + {0, 0, 0, 0}, // 0x06b5 + {0, 0, 0, 0}, // 0x06b6 + {0, 0, 0, 0}, // 0x06b7 + {0, 0, 0, 0}, // 0x06b8 + {0, 0, 0, 0}, // 0x06b9 + {0, 0, 0, 0}, // 0x06ba + {0, 0, 0, 0}, // 0x06bb + {0, 0, 0, 0}, // 0x06bc + {0, 0, 0, 0}, // 0x06bd + {0, 0, 0, 0}, // 0x06be + {0, 0, 0, 0}, // 0x06bf + {0, 0, 0, 0}, // 0x06c0 + {0, 0, 0, 0}, // 0x06c1 + {0, 0, 0, 0}, // 0x06c2 + {0, 0, 0, 0}, // 0x06c3 + {0, 0, 0, 0}, // 0x06c4 + {0, 0, 0, 0}, // 0x06c5 + {0, 0, 0, 0}, // 0x06c6 + {0, 0, 0, 0}, // 0x06c7 + {0, 0, 0, 0}, // 0x06c8 + {0, 0, 0, 0}, // 0x06c9 + {0, 0, 0, 0}, // 0x06ca + {0, 0, 0, 0}, // 0x06cb + {0xfbfc, 0xfbfd, 0xfbfe, 0xfbff} // 0x06cc = farsi yeh }; char_type const arabic_start = 0x0621; -char_type const arabic_end = 0x0652; +char_type const arabic_end = 0x06cc; /// Information about a single UCS4 character @@ -123,25 +244,42 @@ struct CharInfo { typedef std::map CharInfoMap; CharInfoMap unicodesymbols; + +/// The highest code point in UCS4 encoding (1<<20 + 1<<16) +char_type const max_ucs4 = 0x110000; + } // namespace anon -Encoding::Encoding(string const & n, string const & l, string const & i) - : Name_(n), LatexName_(l), iconvName_(i) +Encoding::Encoding(string const & n, string const & l, string const & i, + bool f, Encoding::Package p) + : Name_(n), LatexName_(l), iconvName_(i), fixedwidth_(f), package_(p) { - if (n == "ascii") + if (n == "ascii") { // ASCII can encode 128 code points and nothing else start_encodable_ = 128; - else if (i == "UTF-8") - // UTF8 can encode all 1<<20 + 1<<16 UCS4 code points - start_encodable_ = 0x110000; - else { - start_encodable_ = 0; - // temporarily switch off lyxerr, since we will generate iconv errors - lyxerr.disable(); + complete_ = true; + } else if (i == "UTF-8") { + // UTF8 can encode all UCS4 code points + start_encodable_ = max_ucs4; + complete_ = true; + } else { + complete_ = false; + } +} + + +void Encoding::init() const +{ + start_encodable_ = 0; + // temporarily switch off lyxerr, since we will generate iconv errors + lyxerr.disable(); + if (fixedwidth_) { + // We do not need to check all UCS4 code points, it is enough + // if we check all 256 code points of this encoding. for (unsigned short j = 0; j < 256; ++j) { char const c = j; - std::vector const ucs4 = eightbit_to_ucs4(&c, 1, i); + std::vector const ucs4 = eightbit_to_ucs4(&c, 1, iconvName_); if (ucs4.size() == 1) { char_type const c = ucs4[0]; CharInfoMap::const_iterator const it = unicodesymbols.find(c); @@ -149,19 +287,38 @@ Encoding::Encoding(string const & n, string const & l, string const & i) encodable_.insert(c); } } - lyxerr.enable(); - CharSet::iterator it = encodable_.find(start_encodable_); - while (it != encodable_.end()) { - encodable_.erase(it); - ++start_encodable_; - it = encodable_.find(start_encodable_); + } else { + // We do not know how many code points this encoding has, and + // they do not have a direct representation as a single byte, + // therefore we need to check all UCS4 code points. + // This is expensive! + for (char_type c = 0; c < max_ucs4; ++c) { + std::vector const eightbit = ucs4_to_eightbit(&c, 1, iconvName_); + if (!eightbit.empty()) { + CharInfoMap::const_iterator const it = unicodesymbols.find(c); + if (it == unicodesymbols.end() || !it->second.force) + encodable_.insert(c); + } } } + lyxerr.enable(); + CharSet::iterator it = encodable_.find(start_encodable_); + while (it != encodable_.end()) { + encodable_.erase(it); + ++start_encodable_; + it = encodable_.find(start_encodable_); + } + complete_ = true; } docstring const Encoding::latexChar(char_type c) const { + // assure the used encoding is properly initialized + if (!complete_) + init(); + BOOST_ASSERT(complete_); + if (c < start_encodable_) return docstring(1, c); if (encodable_.find(c) == encodable_.end()) { @@ -205,10 +362,10 @@ bool Encodings::isComposeChar_hebrew(char_type c) bool Encodings::is_arabic_special(char_type c) { return (c >= 0x0621 && c <= 0x0625) || - c == 0x0627 || c == 0x0629 || - c == 0x062f || c == 0x0648 || + c == 0x0627 || c == 0x0629 || + c == 0x062f || c == 0x0648 || (c >= 0x0630 && c <= 0x0632) || - c == 0x0649; + c == 0x0649 || c == 0x0698; } @@ -278,7 +435,7 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) { // We must read the symbolsfile first, because the Encoding // constructor depends on it. - LyXLex symbolslex(0, 0); + Lexer symbolslex(0, 0); symbolslex.setFile(symbolsfile); while (symbolslex.isOK()) { char_type symbol; @@ -320,8 +477,8 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) info.force = true; else lyxerr << "Ignoring unknown flag `" << flag - << "' for symbol `0x" - << std::hex << symbol << std::dec + << "' for symbol `0x" + << std::hex << symbol << std::dec << "'." << endl; } @@ -348,7 +505,7 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) { "end", et_end } }; - LyXLex lex(encodingtags, et_last - 1); + Lexer lex(encodingtags, et_last - 1); lex.setFile(encfile); while (lex.isOK()) { switch (lex.lex()) { @@ -360,8 +517,36 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) string const latexname = lex.getString(); lex.next(); string const iconvname = lex.getString(); + lex.next(); + string const width = lex.getString(); + bool fixedwidth = false; + if (width == "fixed") + fixedwidth = true; + else if (width == "variable") + fixedwidth = false; + else { + lex.printError("Encodings::read: " + "Unknown width: `$$Token'"); + } + + lex.next(); + string const p = lex.getString(); + Encoding::Package package = Encoding::none; + if (p == "none") + package = Encoding::none; + else if (p == "inputenc") + package = Encoding::inputenc; + else if (p == "CJK") + package = Encoding::CJK; + else { + lex.printError("Encodings::read: " + "Unknown package: `$$Token'"); + } + LYXERR(Debug::INFO) << "Reading encoding " << name << endl; - encodinglist[name] = Encoding(name, latexname, iconvname); + encodinglist[name] = Encoding(name, latexname, + iconvname, fixedwidth, + package); if (lex.lex() != et_end) lex.printError("Encodings::read: " "missing end"); @@ -370,7 +555,7 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) case et_end: lex.printError("Encodings::read: Misplaced end"); break; - case LyXLex::LEX_FEOF: + case Lexer::LEX_FEOF: break; default: lex.printError("Encodings::read: "