X-Git-Url: https://git.lyx.org/gitweb/?a=blobdiff_plain;f=src%2FEncoding.cpp;h=71ea8c63e49a944691d5c760bbbc2c7cfa372bbd;hb=6b651f2ad9f698c01993dcc6e340682c279f1c55;hp=eb2a1e736257c53b13213ac684ca09a3c73d96b1;hpb=b6407071cfffdd6e48b35420c60768a1555d7c9d;p=lyx.git diff --git a/src/Encoding.cpp b/src/Encoding.cpp index eb2a1e7362..71ea8c63e4 100644 --- a/src/Encoding.cpp +++ b/src/Encoding.cpp @@ -14,35 +14,29 @@ #include "Encoding.h" -#include "debug.h" #include "LaTeXFeatures.h" #include "Lexer.h" #include "LyXRC.h" +#include "support/debug.h" #include "support/FileName.h" #include "support/lstrings.h" #include "support/unicode.h" +#include + #include +using namespace std; +using namespace lyx::support; namespace lyx { -using support::FileName; - -#ifndef CXX_GLOBAL_CSTD -using std::strtol; -#endif - -using std::endl; -using std::string; - - Encodings encodings; namespace { -char_type arabic_table[50][4] = { +char_type arabic_table[172][4] = { {0xfe80, 0xfe80, 0xfe80, 0xfe80}, // 0x0621 = hamza {0xfe81, 0xfe82, 0xfe81, 0xfe82}, // 0x0622 = ligature madda on alef {0xfe83, 0xfe84, 0xfe83, 0xfe84}, // 0x0623 = ligature hamza on alef @@ -96,11 +90,134 @@ char_type arabic_table[50][4] = { {0x0650, 0x0650, 0x0650, 0x0650}, // 0x0650 = kasra {0x0651, 0x0651, 0x0651, 0x0651}, // 0x0651 = shadda {0x0652, 0x0652, 0x0652, 0x0652}, // 0x0652 = sukun + + {0, 0, 0, 0}, // 0x0653 + {0, 0, 0, 0}, // 0x0654 + {0, 0, 0, 0}, // 0x0655 + {0, 0, 0, 0}, // 0x0656 + {0, 0, 0, 0}, // 0x0657 + {0, 0, 0, 0}, // 0x0658 + {0, 0, 0, 0}, // 0x0659 + {0, 0, 0, 0}, // 0x065a + {0, 0, 0, 0}, // 0x065b + {0, 0, 0, 0}, // 0x065c + {0, 0, 0, 0}, // 0x065d + {0, 0, 0, 0}, // 0x065e + {0, 0, 0, 0}, // 0x065f + {0, 0, 0, 0}, // 0x0660 + {0, 0, 0, 0}, // 0x0661 + {0, 0, 0, 0}, // 0x0662 + {0, 0, 0, 0}, // 0x0663 + {0, 0, 0, 0}, // 0x0664 + {0, 0, 0, 0}, // 0x0665 + {0, 0, 0, 0}, // 0x0666 + {0, 0, 0, 0}, // 0x0667 + {0, 0, 0, 0}, // 0x0668 + {0, 0, 0, 0}, // 0x0669 + {0, 0, 0, 0}, // 0x066a + {0, 0, 0, 0}, // 0x066b + {0, 0, 0, 0}, // 0x066c + {0, 0, 0, 0}, // 0x066d + {0, 0, 0, 0}, // 0x066e + {0, 0, 0, 0}, // 0x066f + {0, 0, 0, 0}, // 0x0670 + {0, 0, 0, 0}, // 0x0671 + {0, 0, 0, 0}, // 0x0672 + {0, 0, 0, 0}, // 0x0673 + {0, 0, 0, 0}, // 0x0674 + {0, 0, 0, 0}, // 0x0675 + {0, 0, 0, 0}, // 0x0676 + {0, 0, 0, 0}, // 0x0677 + {0, 0, 0, 0}, // 0x0678 + {0, 0, 0, 0}, // 0x0679 + {0, 0, 0, 0}, // 0x067a + {0, 0, 0, 0}, // 0x067b + {0, 0, 0, 0}, // 0x067c + {0, 0, 0, 0}, // 0x067d + {0xfb56, 0xfb57, 0xfb58, 0xfb59}, // 0x067e = peh + {0, 0, 0, 0}, // 0x067f + {0, 0, 0, 0}, // 0x0680 + {0, 0, 0, 0}, // 0x0681 + {0, 0, 0, 0}, // 0x0682 + {0, 0, 0, 0}, // 0x0683 + {0, 0, 0, 0}, // 0x0684 + {0, 0, 0, 0}, // 0x0685 + {0xfb7a, 0xfb7b, 0xfb7c, 0xfb7d}, // 0x0686 = tcheh + {0, 0, 0, 0}, // 0x0687 + {0, 0, 0, 0}, // 0x0688 + {0, 0, 0, 0}, // 0x0689 + {0, 0, 0, 0}, // 0x068a + {0, 0, 0, 0}, // 0x068b + {0, 0, 0, 0}, // 0x068c + {0, 0, 0, 0}, // 0x068d + {0, 0, 0, 0}, // 0x068e + {0, 0, 0, 0}, // 0x068f + {0, 0, 0, 0}, // 0x0690 + {0, 0, 0, 0}, // 0x0691 + {0, 0, 0, 0}, // 0x0692 + {0, 0, 0, 0}, // 0x0693 + {0, 0, 0, 0}, // 0x0694 + {0, 0, 0, 0}, // 0x0695 + {0, 0, 0, 0}, // 0x0696 + {0, 0, 0, 0}, // 0x0697 + {0xfb8a, 0xfb8b, 0xfb8a, 0xfb8b}, // 0x0698 = jeh + {0, 0, 0, 0}, // 0x0699 + {0, 0, 0, 0}, // 0x069a + {0, 0, 0, 0}, // 0x069b + {0, 0, 0, 0}, // 0x069c + {0, 0, 0, 0}, // 0x069d + {0, 0, 0, 0}, // 0x069e + {0, 0, 0, 0}, // 0x069f + {0, 0, 0, 0}, // 0x06a0 + {0, 0, 0, 0}, // 0x06a1 + {0, 0, 0, 0}, // 0x06a2 + {0, 0, 0, 0}, // 0x06a3 + {0, 0, 0, 0}, // 0x06a4 + {0, 0, 0, 0}, // 0x06a5 + {0, 0, 0, 0}, // 0x06a6 + {0, 0, 0, 0}, // 0x06a7 + {0, 0, 0, 0}, // 0x06a8 + {0xfb8e, 0xfb8f, 0xfb90, 0xfb91}, // 0x06a9 = farsi kaf + {0, 0, 0, 0}, // 0x06aa + {0, 0, 0, 0}, // 0x06ab + {0, 0, 0, 0}, // 0x06ac + {0, 0, 0, 0}, // 0x06ad + {0, 0, 0, 0}, // 0x06ae + {0xfb92, 0xfb93, 0xfb94, 0xfb95}, // 0x06af = gaf + {0, 0, 0, 0}, // 0x06b0 + {0, 0, 0, 0}, // 0x06b1 + {0, 0, 0, 0}, // 0x06b2 + {0, 0, 0, 0}, // 0x06b3 + {0, 0, 0, 0}, // 0x06b4 + {0, 0, 0, 0}, // 0x06b5 + {0, 0, 0, 0}, // 0x06b6 + {0, 0, 0, 0}, // 0x06b7 + {0, 0, 0, 0}, // 0x06b8 + {0, 0, 0, 0}, // 0x06b9 + {0, 0, 0, 0}, // 0x06ba + {0, 0, 0, 0}, // 0x06bb + {0, 0, 0, 0}, // 0x06bc + {0, 0, 0, 0}, // 0x06bd + {0, 0, 0, 0}, // 0x06be + {0, 0, 0, 0}, // 0x06bf + {0, 0, 0, 0}, // 0x06c0 + {0, 0, 0, 0}, // 0x06c1 + {0, 0, 0, 0}, // 0x06c2 + {0, 0, 0, 0}, // 0x06c3 + {0, 0, 0, 0}, // 0x06c4 + {0, 0, 0, 0}, // 0x06c5 + {0, 0, 0, 0}, // 0x06c6 + {0, 0, 0, 0}, // 0x06c7 + {0, 0, 0, 0}, // 0x06c8 + {0, 0, 0, 0}, // 0x06c9 + {0, 0, 0, 0}, // 0x06ca + {0, 0, 0, 0}, // 0x06cb + {0xfbfc, 0xfbfd, 0xfbfe, 0xfbff} // 0x06cc = farsi yeh }; char_type const arabic_start = 0x0621; -char_type const arabic_end = 0x0652; +char_type const arabic_end = 0x06cc; /// Information about a single UCS4 character @@ -120,7 +237,7 @@ struct CharInfo { }; -typedef std::map CharInfoMap; +typedef map CharInfoMap; CharInfoMap unicodesymbols; @@ -130,8 +247,20 @@ char_type const max_ucs4 = 0x110000; } // namespace anon +EncodingException::EncodingException(char_type c) + : failed_char(c), par_id(0), pos(0) +{ +} + + +const char * EncodingException::what() const throw() +{ + return "Could not find LaTeX command for a character"; +} + + Encoding::Encoding(string const & n, string const & l, string const & i, - bool f, Encoding::Package p) + bool f, Encoding::Package p) : Name_(n), LatexName_(l), iconvName_(i), fixedwidth_(f), package_(p) { if (n == "ascii") { @@ -157,8 +286,8 @@ void Encoding::init() const // We do not need to check all UCS4 code points, it is enough // if we check all 256 code points of this encoding. for (unsigned short j = 0; j < 256; ++j) { - char const c = j; - std::vector const ucs4 = eightbit_to_ucs4(&c, 1, iconvName_); + char const c = char(j); + vector const ucs4 = eightbit_to_ucs4(&c, 1, iconvName_); if (ucs4.size() == 1) { char_type const c = ucs4[0]; CharInfoMap::const_iterator const it = unicodesymbols.find(c); @@ -172,7 +301,7 @@ void Encoding::init() const // therefore we need to check all UCS4 code points. // This is expensive! for (char_type c = 0; c < max_ucs4; ++c) { - std::vector const eightbit = ucs4_to_eightbit(&c, 1, iconvName_); + vector const eightbit = ucs4_to_eightbit(&c, 1, iconvName_); if (!eightbit.empty()) { CharInfoMap::const_iterator const it = unicodesymbols.find(c); if (it == unicodesymbols.end() || !it->second.force) @@ -204,10 +333,7 @@ docstring const Encoding::latexChar(char_type c) const // c cannot be encoded in this encoding CharInfoMap::const_iterator const it = unicodesymbols.find(c); if (it == unicodesymbols.end()) - lyxerr << "Could not find LaTeX command for character 0x" - << std::hex << c << std::dec - << ".\nLaTeX export will fail." - << endl; + throw EncodingException(c); else return it->second.command; } @@ -241,10 +367,10 @@ bool Encodings::isComposeChar_hebrew(char_type c) bool Encodings::is_arabic_special(char_type c) { return (c >= 0x0621 && c <= 0x0625) || - c == 0x0627 || c == 0x0629 || - c == 0x062f || c == 0x0648 || + c == 0x0627 || c == 0x0629 || + c == 0x062f || c == 0x0648 || (c >= 0x0630 && c <= 0x0632) || - c == 0x0649; + c == 0x0649 || c == 0x0698; } @@ -280,6 +406,25 @@ bool Encodings::isCombiningChar(char_type c) } +bool Encodings::isKnownScriptChar(char_type const c, string & preamble) +{ + CharInfoMap::const_iterator const it = unicodesymbols.find(c); + + if (it == unicodesymbols.end()) + return false; + + if (it->second.preamble != "textgreek" && + it->second.preamble != "textcyr") + return false; + + if (preamble.empty()) { + preamble = it->second.preamble; + return true; + } + return it->second.preamble == preamble; +} + + Encoding const * Encodings::getFromLyXName(string const & name) const { EncodingList::const_iterator it = encodinglist.find(name); @@ -292,7 +437,7 @@ Encoding const * Encodings::getFromLyXName(string const & name) const Encoding const * Encodings::getFromLaTeXName(string const & name) const { - // We don't use std::find_if because it makes copies of the pairs in + // We don't use find_if because it makes copies of the pairs in // the map. // This linear search is OK since we don't have many encodings. // Users could even optimize it by putting the encodings they use @@ -322,11 +467,11 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) string flags; if (symbolslex.next(true)) { - std::istringstream is(symbolslex.getString()); + istringstream is(symbolslex.getString()); // reading symbol directly does not work if - // char_type == std::wchar_t. + // char_type == wchar_t. boost::uint32_t tmp; - if(!(is >> std::hex >> tmp)) + if(!(is >> hex >> tmp)) break; symbol = tmp; } else @@ -349,26 +494,24 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) info.force = false; while (!flags.empty()) { string flag; - flags = support::split(flags, flag, ','); + flags = split(flags, flag, ','); if (flag == "combining") info.combining = true; else if (flag == "force") info.force = true; else lyxerr << "Ignoring unknown flag `" << flag - << "' for symbol `0x" - << std::hex << symbol << std::dec + << "' for symbol `0x" + << hex << symbol << dec << "'." << endl; } if (!info.preamble.empty()) info.feature = info.preamble[0] != '\\'; - LYXERR(Debug::INFO) - << "Read unicode symbol " << symbol << " '" + LYXERR(Debug::INFO, "Read unicode symbol " << symbol << " '" << to_utf8(info.command) << "' '" << info.preamble - << "' " << info.combining << ' ' << info.feature - << endl; + << "' " << info.combining << ' ' << info.feature); unicodesymbols[symbol] = info; } @@ -398,30 +541,34 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) string const iconvname = lex.getString(); lex.next(); string const width = lex.getString(); - bool fixedwidth; + bool fixedwidth = false; if (width == "fixed") fixedwidth = true; else if (width == "variable") fixedwidth = false; - else + else { lex.printError("Encodings::read: " - "Unknown width: `$$Token'"); + "Unknown width: `$$Token'"); + } + lex.next(); string const p = lex.getString(); - Encoding::Package package; + Encoding::Package package = Encoding::none; if (p == "none") - package = Encoding::none; + package = Encoding::none; else if (p == "inputenc") package = Encoding::inputenc; else if (p == "CJK") package = Encoding::CJK; - else + else { lex.printError("Encodings::read: " - "Unknown package: `$$Token'"); - LYXERR(Debug::INFO) << "Reading encoding " << name << endl; + "Unknown package: `$$Token'"); + } + + LYXERR(Debug::INFO, "Reading encoding " << name); encodinglist[name] = Encoding(name, latexname, - iconvname, fixedwidth, - package); + iconvname, fixedwidth, + package); if (lex.lex() != et_end) lex.printError("Encodings::read: " "missing end");