Set CJK document language instead of adding a note.

author Georg Baum <baum@lyx.org>

Sat, 6 Oct 2012 07:38:14 +0000 (09:38 +0200)

committer Georg Baum <baum@lyx.org>

Sat, 6 Oct 2012 07:38:14 +0000 (09:38 +0200)
author Georg Baum <baum@lyx.org>
Sat, 6 Oct 2012 07:38:14 +0000 (09:38 +0200)
committer Georg Baum <baum@lyx.org>
Sat, 6 Oct 2012 07:38:14 +0000 (09:38 +0200)
diff --git a/src/tex2lyx/Preamble.cpp b/src/tex2lyx/Preamble.cpp

index 6ea980eb1db69c5ab1ed0b0d8a1cc735e1638523..430708c0fc670606024167ba029b4671e67723ea 100644 (file)
--- a/src/tex2lyx/Preamble.cpp
+++ b/src/tex2lyx/Preamble.cpp
@@ -44,9 +44,8 @@ Preamble preamble;
  
  namespace {
  
-// "chinese-simplified", "chinese-traditional", "japanese-cjk", "korean"
-// cannot be supported because it is impossible to determine the correct document
-// language if CJK is used.
+// CJK languages are handled in text.cpp; polyglossia languages are listed
+// further down.
  /**
   * known babel language names (including synonyms)
   * not in standard babel: arabic, arabtex, armenian, belarusian, serbian-latin, thai
@@ -88,6 +87,9 @@ const char * const known_coded_languages[] = {"french", "afrikaans", "albanian",
  "uppersorbian", "uppersorbian", "english", "english", "vietnamese", "welsh",
  0};
  
+/// languages with danish quotes (.lyx names)
+const char * const known_danish_quotes_languages[] = {"danish", 0};
+
  /// languages with english quotes (.lyx names)
  const char * const known_english_quotes_languages[] = {"american", "australian",
  "bahasa", "bahasam", "brazilian", "canadian", "chinese-simplified", "english",
@@ -639,6 +641,7 @@ void Preamble::handle_package(Parser &p, string const & name,
         vector<string> options = split_options(opts);
         add_package(name, options);
         string scale;
+       char const * const * where = 0;
  
         if (is_known(name, known_xetex_packages)) {
                 xetex = true;
@@ -753,9 +756,6 @@ void Preamble::handle_package(Parser &p, string const & name,
         }
  
         else if (name == "CJK") {
-               // It is impossible to determine the document language if CJK is used.
-               // All we can do is to notify the user that he has to set this by himself.
-               have_CJK = true;
                 // set the encoding to "auto" because it might be set to "default" by the babel handling
                 // and this would not be correct for CJK
                 if (h_inputencoding == "default")
@@ -833,8 +833,8 @@ void Preamble::handle_package(Parser &p, string const & name,
         else if (name == "subfig")
                 ; // ignore this FIXME: Use the package separator mechanism instead
  
-       else if (is_known(name, known_languages))
-               h_language = name;
+       else if ((where = is_known(name, known_languages)))
+               h_language = known_coded_languages[where - known_languages];
  
         else if (name == "natbib") {
                 h_biblio_style = "plainnat";
@@ -914,7 +914,7 @@ bool Preamble::writeLyXHeader(ostream & os, bool subdoc)
         // http://en.wikipedia.org/wiki/Quotation_mark,_non-English_usage
         // (quotes for kazakh and interlingua are unknown)
         // danish
-       if (h_language == "danish")
+       if (is_known(h_language, known_danish_quotes_languages))
                 h_quotes_language = "danish";
         // french
         else if (is_known(h_language, known_french_quotes_languages))
@@ -1644,6 +1644,16 @@ void Preamble::parse(Parser & p, string const & forceclass,
                 ss << tc.sides();
                 h_papersides = ss.str();
         }
+
+       // If the CJK package is used, we cannot set the document language from
+       // the babel options. Instead, we guess which language is used most
+       // and set it as the document language.
+       default_language = h_language;
+       if (is_full_document && auto_packages.find("CJK") != auto_packages.end()) {
+               p.pushPosition();
+               h_language = guessLanguage(p, default_language);
+               p.popPosition();
+       }
  }
  
  
diff --git a/src/tex2lyx/Preamble.h b/src/tex2lyx/Preamble.h

index 2ce75fd0653c0fa4f0a6111cc3940d40a2cc133e..db2242f3ce60a24e56a9e6f141ba6f8d15c14a09 100644 (file)
--- a/src/tex2lyx/Preamble.h
+++ b/src/tex2lyx/Preamble.h
@@ -38,8 +38,10 @@ public:
         std::string inputencoding() const { return h_inputencoding; }
         ///
         std::string notefontcolor() const { return h_notefontcolor; }
-       ///
-       std::string language() const { return h_language; }
+       /// The document language
+       std::string docLanguage() const { return h_language; }
+       /// The language of text which is not explicitly marked
+       std::string defaultLanguage() const  { return default_language; }
         ///
         std::string use_indices() const { return h_use_indices; }
         ///
@@ -89,6 +91,8 @@ private:
  
         /// needed to handle encodings with babel
         bool one_language;
+       /// the main non-CJK language
+       std::string default_language;
  
         /// was at least one title layout found?
         bool title_layout_found;
diff --git a/src/tex2lyx/tex2lyx.cpp b/src/tex2lyx/tex2lyx.cpp

index 384c22836d02e6ab4a113a193d60c2000caf20c8..57436cbcdaf7692958e2076eb75074bda710104c 100644 (file)
--- a/src/tex2lyx/tex2lyx.cpp
+++ b/src/tex2lyx/tex2lyx.cpp
@@ -332,7 +332,6 @@ bool checkModule(string const & name, bool command)
  bool noweb_mode = false;
  bool pdflatex = false;
  bool xetex = false;
-bool have_CJK = false;
  bool is_nonCJKJapanese = false;
  bool roundtrip = false;
  
@@ -704,7 +703,7 @@ bool tex2lyx(idocstream & is, ostream & os, string encoding)
         stringstream ss;
         // store the document language in the context to be able to handle the
         // commands like \foreignlanguage and \textenglish etc.
-       context.font.language = preamble.language();
+       context.font.language = preamble.defaultLanguage();
         // parse the main text
         parse_text(p, ss, FLAG_END, true, context);
         if (Context::empty)
diff --git a/src/tex2lyx/tex2lyx.h b/src/tex2lyx/tex2lyx.h

index a471a0f3f447627ef133ddb15dd100366d109661..1a98a5f91a75a9897876657bd116073d48a48a3a 100644 (file)
--- a/src/tex2lyx/tex2lyx.h
+++ b/src/tex2lyx/tex2lyx.h
@@ -67,6 +67,10 @@ void parse_text_in_inset(Parser & p, std::ostream & os, unsigned flags,
                           bool outer, Context const & context,
                           InsetLayout const * layout = 0);
  
+/// Guess the document language from \p p if CJK is used.
+/// \p lang is used for all non-CJK content.
+std::string guessLanguage(Parser & p, std::string const & lang);
+
  
  /// in math.cpp
  void parse_math(Parser & p, std::ostream & os, unsigned flags, mode_type mode);
@@ -166,8 +170,6 @@ extern bool noweb_mode;
  extern bool pdflatex;
  /// Did we recognize any xetex-only construct?
  extern bool xetex;
-/// Do we have CJK?
-extern bool have_CJK;
  /// Do we have non-CJK Japanese?
  extern bool is_nonCJKJapanese;
  /// LyX format that is created by tex2lyx
diff --git a/src/tex2lyx/text.cpp b/src/tex2lyx/text.cpp

index a97d00bd3693314fd5bc5d8e1c4b503974424298..34ffbe86c1d1c198f9d1489234cbfc783f6010b0 100644 (file)
--- a/src/tex2lyx/text.cpp
+++ b/src/tex2lyx/text.cpp
@@ -130,17 +130,9 @@ const char * const supported_CJK_encodings[] = {
   * the same as supported_CJK_encodings with their corresponding LyX language name
   * please keep this in sync with supported_CJK_encodings line by line!
   */
-const char * const coded_supported_CJK_encodings[] = {
+const char * const supported_CJK_languages[] = {
  "japanese-cjk", "korean", "chinese-simplified", "chinese-traditional", 0};
  
-string CJK2lyx(string const & encoding)
-{
-       char const * const * where = is_known(encoding, supported_CJK_encodings);
-       if (where)
-               return coded_supported_CJK_encodings[where - supported_CJK_encodings];
-       return encoding;
-}
-
  /*!
   * natbib commands.
   * The starred forms are also known except for "citefullauthor",
@@ -1440,8 +1432,9 @@ void parse_environment(Parser & p, ostream & os, bool outer,
                 // LyX doesn't support the second argument so if
                 // this is used we need to output everything as ERT
                 string const mapping = p.getArg('{', '}');
-               if ((!mapping.empty() && mapping != " ")
-                       || (!is_known(encoding, supported_CJK_encodings))) {
+               char const * const * const where =
+                       is_known(encoding, supported_CJK_encodings);
+               if ((!mapping.empty() && mapping != " ") || !where) {
                         parent_context.check_layout(os);
                         handle_ert(os, "\\begin{" + name + "}{" + encoding + "}{" + mapping + "}",
                                        parent_context);
@@ -1459,7 +1452,8 @@ void parse_environment(Parser & p, ostream & os, bool outer,
                         handle_ert(os, "\\end{" + name + "}",
                                        parent_context);
                 } else {
-                       string const lang = CJK2lyx(encoding);
+                       string const lang =
+                               supported_CJK_languages[where - supported_CJK_encodings];
                         // store the language because we must reset it at the end
                         string const lang_old = parent_context.font.language;
                         parent_context.font.language = lang;
@@ -2142,24 +2136,6 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
         while (p.good()) {
                 Token const & t = p.get_token();
  
-       // it is impossible to determine the correct document language if CJK is used.
-       // Therefore write a note at the beginning of the document
-       if (have_CJK) {
-               context.check_layout(os);
-               begin_inset(os, "Note Note\n");
-               os << "status open\n\\begin_layout Plain Layout\n"
-                  << "\\series bold\n"
-                  << "Important information:\n"
-                  << "\\end_layout\n\n"
-                  << "\\begin_layout Plain Layout\n"
-                  << "This document contains text in Chinese, Japanese or Korean.\n"
-                  << " It was therefore impossible for tex2lyx to set the correct document language for your document."
-                  << " Please set the language manually in the document settings.\n"
-                  << "\\end_layout\n";
-               end_inset(os);
-               have_CJK = false;
-       }
-
         // it is impossible to determine the correct encoding for non-CJK Japanese.
         // Therefore write a note at the beginning of the document
         if (is_nonCJKJapanese) {
@@ -4426,6 +4402,79 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
         }
  }
  
+
+string guessLanguage(Parser & p, string const & lang)
+{
+       typedef std::map<std::string, size_t> LangMap;
+       // map from language names to number of characters
+       LangMap used;
+       used[lang] = 0;
+       for (char const * const * i = supported_CJK_languages; *i; i++)
+               used[string(*i)] = 0;
+
+       while (p.good()) {
+               Token const t = p.get_token();
+               // comments are not counted for any language
+               if (t.cat() == catComment)
+                       continue;
+               // commands are not counted either, but we need to detect
+               // \begin{CJK} and switch encoding if needed
+               if (t.cat() == catEscape) {
+                       if (t.cs() == "inputencoding") {
+                               string const enc = subst(p.verbatim_item(), "\n", " ");
+                               p.setEncoding(enc);
+                               continue;
+                       }
+                       if (t.cs() != "begin")
+                               continue;
+               } else {
+                       // Non-CJK content is counted for lang.
+                       // We do not care about the real language here:
+                       // If there is more non-CJK content than CJK content,
+                       // we simply use the language that was specified as
+                       // the babel main language.
+                       used[lang] += t.asInput().length();
+                       continue;
+               }
+               // Now we are starting an environment
+               p.pushPosition();
+               string const name = p.getArg('{', '}');
+               if (name != "CJK") {
+                       p.popPosition();
+                       continue;
+               }
+               // It is a CJK environment
+               p.popPosition();
+               /* name = */ p.getArg('{', '}');
+               string const encoding = p.getArg('{', '}');
+               /* mapping = */ p.getArg('{', '}');
+               string const encoding_old = p.getEncoding();
+               char const * const * const where =
+                       is_known(encoding, supported_CJK_encodings);
+               if (where)
+                       p.setEncoding(encoding);
+               else
+                       p.setEncoding("utf8");
+               string const text = p.verbatimEnvironment("CJK");
+               p.setEncoding(encoding_old);
+               p.skip_spaces();
+               if (!where) {
+                       // ignore contents in unknown CJK encoding
+                       continue;
+               }
+               // the language of the text
+               string const cjk =
+                       supported_CJK_languages[where - supported_CJK_encodings];
+               used[cjk] += text.length();
+       }
+       LangMap::const_iterator use = used.begin();
+       for (LangMap::const_iterator it = used.begin(); it != used.end(); ++it) {
+               if (it->second > use->second)
+                       use = it;
+       }
+       return use->first;
+}
+
  // }])
author	Georg Baum <baum@lyx.org>
	Sat, 6 Oct 2012 07:38:14 +0000 (09:38 +0200)
committer	Georg Baum <baum@lyx.org>
	Sat, 6 Oct 2012 07:38:14 +0000 (09:38 +0200)
src/tex2lyx/Preamble.cpp		patch \| blob \| history
src/tex2lyx/Preamble.h		patch \| blob \| history
src/tex2lyx/tex2lyx.cpp		patch \| blob \| history
src/tex2lyx/tex2lyx.h		patch \| blob \| history
src/tex2lyx/text.cpp		patch \| blob \| history