X-Git-Url: https://git.lyx.org/gitweb/?a=blobdiff_plain;f=src%2FEncoding.cpp;h=71ea8c63e49a944691d5c760bbbc2c7cfa372bbd;hb=6b651f2ad9f698c01993dcc6e340682c279f1c55;hp=e37293e993a9f6fb592d1cf4501d448d2414cb30;hpb=138b23fac84930cdbfada0067c61480989041113;p=lyx.git diff --git a/src/Encoding.cpp b/src/Encoding.cpp index e37293e993..71ea8c63e4 100644 --- a/src/Encoding.cpp +++ b/src/Encoding.cpp @@ -14,35 +14,29 @@ #include "Encoding.h" -#include "debug.h" #include "LaTeXFeatures.h" -#include "LyXLex.h" +#include "Lexer.h" #include "LyXRC.h" +#include "support/debug.h" #include "support/FileName.h" #include "support/lstrings.h" #include "support/unicode.h" +#include + #include +using namespace std; +using namespace lyx::support; namespace lyx { -using support::FileName; - -#ifndef CXX_GLOBAL_CSTD -using std::strtol; -#endif - -using std::endl; -using std::string; - - Encodings encodings; namespace { -char_type arabic_table[50][4] = { +char_type arabic_table[172][4] = { {0xfe80, 0xfe80, 0xfe80, 0xfe80}, // 0x0621 = hamza {0xfe81, 0xfe82, 0xfe81, 0xfe82}, // 0x0622 = ligature madda on alef {0xfe83, 0xfe84, 0xfe83, 0xfe84}, // 0x0623 = ligature hamza on alef @@ -96,11 +90,134 @@ char_type arabic_table[50][4] = { {0x0650, 0x0650, 0x0650, 0x0650}, // 0x0650 = kasra {0x0651, 0x0651, 0x0651, 0x0651}, // 0x0651 = shadda {0x0652, 0x0652, 0x0652, 0x0652}, // 0x0652 = sukun + + {0, 0, 0, 0}, // 0x0653 + {0, 0, 0, 0}, // 0x0654 + {0, 0, 0, 0}, // 0x0655 + {0, 0, 0, 0}, // 0x0656 + {0, 0, 0, 0}, // 0x0657 + {0, 0, 0, 0}, // 0x0658 + {0, 0, 0, 0}, // 0x0659 + {0, 0, 0, 0}, // 0x065a + {0, 0, 0, 0}, // 0x065b + {0, 0, 0, 0}, // 0x065c + {0, 0, 0, 0}, // 0x065d + {0, 0, 0, 0}, // 0x065e + {0, 0, 0, 0}, // 0x065f + {0, 0, 0, 0}, // 0x0660 + {0, 0, 0, 0}, // 0x0661 + {0, 0, 0, 0}, // 0x0662 + {0, 0, 0, 0}, // 0x0663 + {0, 0, 0, 0}, // 0x0664 + {0, 0, 0, 0}, // 0x0665 + {0, 0, 0, 0}, // 0x0666 + {0, 0, 0, 0}, // 0x0667 + {0, 0, 0, 0}, // 0x0668 + {0, 0, 0, 0}, // 0x0669 + {0, 0, 0, 0}, // 0x066a + {0, 0, 0, 0}, // 0x066b + {0, 0, 0, 0}, // 0x066c + {0, 0, 0, 0}, // 0x066d + {0, 0, 0, 0}, // 0x066e + {0, 0, 0, 0}, // 0x066f + {0, 0, 0, 0}, // 0x0670 + {0, 0, 0, 0}, // 0x0671 + {0, 0, 0, 0}, // 0x0672 + {0, 0, 0, 0}, // 0x0673 + {0, 0, 0, 0}, // 0x0674 + {0, 0, 0, 0}, // 0x0675 + {0, 0, 0, 0}, // 0x0676 + {0, 0, 0, 0}, // 0x0677 + {0, 0, 0, 0}, // 0x0678 + {0, 0, 0, 0}, // 0x0679 + {0, 0, 0, 0}, // 0x067a + {0, 0, 0, 0}, // 0x067b + {0, 0, 0, 0}, // 0x067c + {0, 0, 0, 0}, // 0x067d + {0xfb56, 0xfb57, 0xfb58, 0xfb59}, // 0x067e = peh + {0, 0, 0, 0}, // 0x067f + {0, 0, 0, 0}, // 0x0680 + {0, 0, 0, 0}, // 0x0681 + {0, 0, 0, 0}, // 0x0682 + {0, 0, 0, 0}, // 0x0683 + {0, 0, 0, 0}, // 0x0684 + {0, 0, 0, 0}, // 0x0685 + {0xfb7a, 0xfb7b, 0xfb7c, 0xfb7d}, // 0x0686 = tcheh + {0, 0, 0, 0}, // 0x0687 + {0, 0, 0, 0}, // 0x0688 + {0, 0, 0, 0}, // 0x0689 + {0, 0, 0, 0}, // 0x068a + {0, 0, 0, 0}, // 0x068b + {0, 0, 0, 0}, // 0x068c + {0, 0, 0, 0}, // 0x068d + {0, 0, 0, 0}, // 0x068e + {0, 0, 0, 0}, // 0x068f + {0, 0, 0, 0}, // 0x0690 + {0, 0, 0, 0}, // 0x0691 + {0, 0, 0, 0}, // 0x0692 + {0, 0, 0, 0}, // 0x0693 + {0, 0, 0, 0}, // 0x0694 + {0, 0, 0, 0}, // 0x0695 + {0, 0, 0, 0}, // 0x0696 + {0, 0, 0, 0}, // 0x0697 + {0xfb8a, 0xfb8b, 0xfb8a, 0xfb8b}, // 0x0698 = jeh + {0, 0, 0, 0}, // 0x0699 + {0, 0, 0, 0}, // 0x069a + {0, 0, 0, 0}, // 0x069b + {0, 0, 0, 0}, // 0x069c + {0, 0, 0, 0}, // 0x069d + {0, 0, 0, 0}, // 0x069e + {0, 0, 0, 0}, // 0x069f + {0, 0, 0, 0}, // 0x06a0 + {0, 0, 0, 0}, // 0x06a1 + {0, 0, 0, 0}, // 0x06a2 + {0, 0, 0, 0}, // 0x06a3 + {0, 0, 0, 0}, // 0x06a4 + {0, 0, 0, 0}, // 0x06a5 + {0, 0, 0, 0}, // 0x06a6 + {0, 0, 0, 0}, // 0x06a7 + {0, 0, 0, 0}, // 0x06a8 + {0xfb8e, 0xfb8f, 0xfb90, 0xfb91}, // 0x06a9 = farsi kaf + {0, 0, 0, 0}, // 0x06aa + {0, 0, 0, 0}, // 0x06ab + {0, 0, 0, 0}, // 0x06ac + {0, 0, 0, 0}, // 0x06ad + {0, 0, 0, 0}, // 0x06ae + {0xfb92, 0xfb93, 0xfb94, 0xfb95}, // 0x06af = gaf + {0, 0, 0, 0}, // 0x06b0 + {0, 0, 0, 0}, // 0x06b1 + {0, 0, 0, 0}, // 0x06b2 + {0, 0, 0, 0}, // 0x06b3 + {0, 0, 0, 0}, // 0x06b4 + {0, 0, 0, 0}, // 0x06b5 + {0, 0, 0, 0}, // 0x06b6 + {0, 0, 0, 0}, // 0x06b7 + {0, 0, 0, 0}, // 0x06b8 + {0, 0, 0, 0}, // 0x06b9 + {0, 0, 0, 0}, // 0x06ba + {0, 0, 0, 0}, // 0x06bb + {0, 0, 0, 0}, // 0x06bc + {0, 0, 0, 0}, // 0x06bd + {0, 0, 0, 0}, // 0x06be + {0, 0, 0, 0}, // 0x06bf + {0, 0, 0, 0}, // 0x06c0 + {0, 0, 0, 0}, // 0x06c1 + {0, 0, 0, 0}, // 0x06c2 + {0, 0, 0, 0}, // 0x06c3 + {0, 0, 0, 0}, // 0x06c4 + {0, 0, 0, 0}, // 0x06c5 + {0, 0, 0, 0}, // 0x06c6 + {0, 0, 0, 0}, // 0x06c7 + {0, 0, 0, 0}, // 0x06c8 + {0, 0, 0, 0}, // 0x06c9 + {0, 0, 0, 0}, // 0x06ca + {0, 0, 0, 0}, // 0x06cb + {0xfbfc, 0xfbfd, 0xfbfe, 0xfbff} // 0x06cc = farsi yeh }; char_type const arabic_start = 0x0621; -char_type const arabic_end = 0x0652; +char_type const arabic_end = 0x06cc; /// Information about a single UCS4 character @@ -120,28 +237,57 @@ struct CharInfo { }; -typedef std::map CharInfoMap; +typedef map CharInfoMap; CharInfoMap unicodesymbols; + +/// The highest code point in UCS4 encoding (1<<20 + 1<<16) +char_type const max_ucs4 = 0x110000; + } // namespace anon -Encoding::Encoding(string const & n, string const & l, string const & i) - : Name_(n), LatexName_(l), iconvName_(i) +EncodingException::EncodingException(char_type c) + : failed_char(c), par_id(0), pos(0) +{ +} + + +const char * EncodingException::what() const throw() +{ + return "Could not find LaTeX command for a character"; +} + + +Encoding::Encoding(string const & n, string const & l, string const & i, + bool f, Encoding::Package p) + : Name_(n), LatexName_(l), iconvName_(i), fixedwidth_(f), package_(p) { - if (n == "ascii") + if (n == "ascii") { // ASCII can encode 128 code points and nothing else start_encodable_ = 128; - else if (i == "UTF-8") - // UTF8 can encode all 1<<20 + 1<<16 UCS4 code points - start_encodable_ = 0x110000; - else { - start_encodable_ = 0; - // temporarily switch off lyxerr, since we will generate iconv errors - lyxerr.disable(); + complete_ = true; + } else if (i == "UTF-8") { + // UTF8 can encode all UCS4 code points + start_encodable_ = max_ucs4; + complete_ = true; + } else { + complete_ = false; + } +} + + +void Encoding::init() const +{ + start_encodable_ = 0; + // temporarily switch off lyxerr, since we will generate iconv errors + lyxerr.disable(); + if (fixedwidth_) { + // We do not need to check all UCS4 code points, it is enough + // if we check all 256 code points of this encoding. for (unsigned short j = 0; j < 256; ++j) { - char const c = j; - std::vector const ucs4 = eightbit_to_ucs4(&c, 1, i); + char const c = char(j); + vector const ucs4 = eightbit_to_ucs4(&c, 1, iconvName_); if (ucs4.size() == 1) { char_type const c = ucs4[0]; CharInfoMap::const_iterator const it = unicodesymbols.find(c); @@ -149,29 +295,45 @@ Encoding::Encoding(string const & n, string const & l, string const & i) encodable_.insert(c); } } - lyxerr.enable(); - CharSet::iterator it = encodable_.find(start_encodable_); - while (it != encodable_.end()) { - encodable_.erase(it); - ++start_encodable_; - it = encodable_.find(start_encodable_); + } else { + // We do not know how many code points this encoding has, and + // they do not have a direct representation as a single byte, + // therefore we need to check all UCS4 code points. + // This is expensive! + for (char_type c = 0; c < max_ucs4; ++c) { + vector const eightbit = ucs4_to_eightbit(&c, 1, iconvName_); + if (!eightbit.empty()) { + CharInfoMap::const_iterator const it = unicodesymbols.find(c); + if (it == unicodesymbols.end() || !it->second.force) + encodable_.insert(c); + } } } + lyxerr.enable(); + CharSet::iterator it = encodable_.find(start_encodable_); + while (it != encodable_.end()) { + encodable_.erase(it); + ++start_encodable_; + it = encodable_.find(start_encodable_); + } + complete_ = true; } docstring const Encoding::latexChar(char_type c) const { + // assure the used encoding is properly initialized + if (!complete_) + init(); + BOOST_ASSERT(complete_); + if (c < start_encodable_) return docstring(1, c); if (encodable_.find(c) == encodable_.end()) { // c cannot be encoded in this encoding CharInfoMap::const_iterator const it = unicodesymbols.find(c); if (it == unicodesymbols.end()) - lyxerr << "Could not find LaTeX command for character 0x" - << std::hex << c << std::dec - << ".\nLaTeX export will fail." - << endl; + throw EncodingException(c); else return it->second.command; } @@ -205,10 +367,10 @@ bool Encodings::isComposeChar_hebrew(char_type c) bool Encodings::is_arabic_special(char_type c) { return (c >= 0x0621 && c <= 0x0625) || - c == 0x0627 || c == 0x0629 || - c == 0x062f || c == 0x0648 || + c == 0x0627 || c == 0x0629 || + c == 0x062f || c == 0x0648 || (c >= 0x0630 && c <= 0x0632) || - c == 0x0649; + c == 0x0649 || c == 0x0698; } @@ -244,6 +406,25 @@ bool Encodings::isCombiningChar(char_type c) } +bool Encodings::isKnownScriptChar(char_type const c, string & preamble) +{ + CharInfoMap::const_iterator const it = unicodesymbols.find(c); + + if (it == unicodesymbols.end()) + return false; + + if (it->second.preamble != "textgreek" && + it->second.preamble != "textcyr") + return false; + + if (preamble.empty()) { + preamble = it->second.preamble; + return true; + } + return it->second.preamble == preamble; +} + + Encoding const * Encodings::getFromLyXName(string const & name) const { EncodingList::const_iterator it = encodinglist.find(name); @@ -256,7 +437,7 @@ Encoding const * Encodings::getFromLyXName(string const & name) const Encoding const * Encodings::getFromLaTeXName(string const & name) const { - // We don't use std::find_if because it makes copies of the pairs in + // We don't use find_if because it makes copies of the pairs in // the map. // This linear search is OK since we don't have many encodings. // Users could even optimize it by putting the encodings they use @@ -278,7 +459,7 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) { // We must read the symbolsfile first, because the Encoding // constructor depends on it. - LyXLex symbolslex(0, 0); + Lexer symbolslex(0, 0); symbolslex.setFile(symbolsfile); while (symbolslex.isOK()) { char_type symbol; @@ -286,11 +467,11 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) string flags; if (symbolslex.next(true)) { - std::istringstream is(symbolslex.getString()); + istringstream is(symbolslex.getString()); // reading symbol directly does not work if - // char_type == std::wchar_t. + // char_type == wchar_t. boost::uint32_t tmp; - if(!(is >> std::hex >> tmp)) + if(!(is >> hex >> tmp)) break; symbol = tmp; } else @@ -313,26 +494,24 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) info.force = false; while (!flags.empty()) { string flag; - flags = support::split(flags, flag, ','); + flags = split(flags, flag, ','); if (flag == "combining") info.combining = true; else if (flag == "force") info.force = true; else lyxerr << "Ignoring unknown flag `" << flag - << "' for symbol `0x" - << std::hex << symbol << std::dec + << "' for symbol `0x" + << hex << symbol << dec << "'." << endl; } if (!info.preamble.empty()) info.feature = info.preamble[0] != '\\'; - LYXERR(Debug::INFO) - << "Read unicode symbol " << symbol << " '" + LYXERR(Debug::INFO, "Read unicode symbol " << symbol << " '" << to_utf8(info.command) << "' '" << info.preamble - << "' " << info.combining << ' ' << info.feature - << endl; + << "' " << info.combining << ' ' << info.feature); unicodesymbols[symbol] = info; } @@ -348,7 +527,7 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) { "end", et_end } }; - LyXLex lex(encodingtags, et_last - 1); + Lexer lex(encodingtags, et_last - 1); lex.setFile(encfile); while (lex.isOK()) { switch (lex.lex()) { @@ -360,8 +539,36 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) string const latexname = lex.getString(); lex.next(); string const iconvname = lex.getString(); - LYXERR(Debug::INFO) << "Reading encoding " << name << endl; - encodinglist[name] = Encoding(name, latexname, iconvname); + lex.next(); + string const width = lex.getString(); + bool fixedwidth = false; + if (width == "fixed") + fixedwidth = true; + else if (width == "variable") + fixedwidth = false; + else { + lex.printError("Encodings::read: " + "Unknown width: `$$Token'"); + } + + lex.next(); + string const p = lex.getString(); + Encoding::Package package = Encoding::none; + if (p == "none") + package = Encoding::none; + else if (p == "inputenc") + package = Encoding::inputenc; + else if (p == "CJK") + package = Encoding::CJK; + else { + lex.printError("Encodings::read: " + "Unknown package: `$$Token'"); + } + + LYXERR(Debug::INFO, "Reading encoding " << name); + encodinglist[name] = Encoding(name, latexname, + iconvname, fixedwidth, + package); if (lex.lex() != et_end) lex.printError("Encodings::read: " "missing end"); @@ -370,7 +577,7 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) case et_end: lex.printError("Encodings::read: Misplaced end"); break; - case LyXLex::LEX_FEOF: + case Lexer::LEX_FEOF: break; default: lex.printError("Encodings::read: "