tex2lyx: improve CJK handling

[lyx.git] / src / tex2lyx / text.cpp
diff --git a/src/tex2lyx/text.cpp b/src/tex2lyx/text.cpp

index 2590e00bc205ccbf41a0fc53cee5a572ca046717..62d5e8df9ebfaf80c6d3458a94bf37d5ffdba6af 100644 (file)
--- a/src/tex2lyx/text.cpp
+++ b/src/tex2lyx/text.cpp
@@ -118,39 +118,28 @@ char const * const known_coded_ref_commands[] = { "ref", "pageref", "vref",
   "vpageref", "formatted", "eqref", 0 };
  
  /**
- * known polyglossia language names (including variants)
+ * supported CJK encodings
+ * SJIS and Bg5 cannot be supported as they are not
+ * supported by iconv
+ * JIS does not work with LyX's encoding conversion
   */
-const char * const polyglossia_languages[] = {
-"albanian", "croatian", "hebrew", "norsk", "swedish", "amharic", "czech", "hindi",
-"nynorsk", "syriac", "arabic", "danish", "icelandic", "occitan", "tamil",
-"armenian", "divehi", "interlingua", "polish", "telugu", "asturian", "dutch",
-"irish", "portuges", "thai", "bahasai", "english", "italian", "romanian", "turkish",
-"bahasam", "esperanto", "lao", "russian", "turkmen", "basque", "estonian", "latin",
-"samin", "ukrainian", "bengali", "farsi", "latvian", "sanskrit", "urdu", "brazil",
-"brazilian", "finnish", "lithuanian", "scottish", "usorbian", "breton", "french",
-"lsorbian", "serbian", "vietnamese", "bulgarian", "galician", "magyar", "slovak",
-"welsh", "catalan", "german", "malayalam", "slovenian", "coptic", "greek",
-"marathi", "spanish"
-"american", "ancient", "australian", "british", "monotonic", "newzealand",
-"polytonic", 0};
+const char * const supported_CJK_encodings[] = {
+"EUC-JP", "KS", "GB", "UTF8", 0}; // trailing 0 ends the list; order must match coded_supported_CJK_encodings
  
  /**
- * the same as polyglossia_languages with .lyx names
- * please keep this in sync with polyglossia_languages line by line!
+ * the same as supported_CJK_encodings with their corresponding LyX language name
+ * please keep this in sync with supported_CJK_encodings line by line!
   */
-const char * const coded_polyglossia_languages[] = {
-"albanian", "croatian", "hebrew", "norsk", "swedish", "amharic", "czech", "hindi",
-"nynorsk", "syriac", "arabic_arabi", "danish", "icelandic", "occitan", "tamil",
-"armenian", "divehi", "interlingua", "polish", "telugu", "asturian", "dutch",
-"irish", "portuges", "thai", "bahasa", "english", "italian", "romanian", "turkish",
-"bahasam", "esperanto", "lao", "russian", "turkmen", "basque", "estonian", "latin",
-"samin", "ukrainian", "bengali", "farsi", "latvian", "sanskrit", "urdu", "brazilian",
-"brazilian", "finnish", "lithuanian", "scottish", "uppersorbian", "breton", "french",
-"lowersorbian", "serbian", "vietnamese", "bulgarian", "galician", "magyar", "slovak",
-"welsh", "catalan", "ngerman", "malayalam", "slovene", "coptic", "greek",
-"marathi", "spanish"
-"american", "ancientgreek", "australian", "british", "greek", "newzealand",
-"polutonikogreek", 0};
+const char * const coded_supported_CJK_encodings[] = {
+"japanese-cjk", "korean", "chinese-simplified", "chinese-traditional", 0}; // index-wise: EUC-JP, KS, GB, UTF8
+
+string CJK2lyx(string const & encoding) // translate a CJK encoding name into the matching LyX language name
+{
+       char const * const * where = is_known(encoding, supported_CJK_encodings); // used below as a pointer into the table, or null
+       if (where) // known encoding: take the entry at the same index in the parallel language table
+               return coded_supported_CJK_encodings[where - supported_CJK_encodings];
+       return encoding; // not in the table: hand the input back unchanged
+}
  
  /*!
   * natbib commands.
@@ -1247,7 +1236,7 @@ void parse_environment(Parser & p, ostream & os, bool outer,
                 }
         }
  
-       else if (is_known(name, polyglossia_languages)) {
+       else if (is_known(name, preamble.polyglossia_languages)) {
                 // We must begin a new paragraph if not already done
                 if (! parent_context.atParagraphStart()) {
                         parent_context.check_end_layout(os);
@@ -1255,7 +1244,7 @@ void parse_environment(Parser & p, ostream & os, bool outer,
                 }
                 // save the language in the context so that it is
                 // handled by parse_text
-               parent_context.font.language = polyglossia2lyx(name);
+               parent_context.font.language = preamble.polyglossia2lyx(name);
                 parse_text(p, os, FLAG_END, outer, parent_context);
                 // Just in case the environment is empty
                 parent_context.extra_stuff.erase();
@@ -1433,6 +1422,55 @@ void parse_environment(Parser & p, ostream & os, bool outer,
                 os << "\n\\begin_layout Standard\n";
         }
  
+       else if (name == "CJK") {
+               // the scheme is \begin{CJK}{encoding}{mapping}text\end{CJK}
+               // It is impossible to decide if a CJK environment was in its own paragraph or within
+               // a line. We therefore always assume a paragraph since the latter is a rare case.
+               eat_whitespace(p, os, parent_context, false);
+               parent_context.check_end_layout(os);
+               // store the encoding to be able to reset it
+               string const encoding_old = p.getEncoding();
+               string const encoding = p.getArg('{', '}');
+               // SJIS and Bg5 cannot be handled by iconv
+               // JIS does not work with LyX's encoding conversion
+               if (encoding != "Bg5" && encoding != "JIS" && encoding != "SJIS")
+                       p.setEncoding(encoding);
+               else
+                       p.setEncoding("utf8");
+               // LyX doesn't support the second argument so if
+               // this is used we need to output everything as ERT
+               string const mapping = p.getArg('{', '}');
+               if ((!mapping.empty() && mapping != " ")
+                       || (!is_known(encoding, supported_CJK_encodings))) {
+                       parent_context.check_layout(os);
+                       handle_ert(os, "\\begin{" + name + "}{" + encoding + "}{" + mapping + "}",
+                                      parent_context);
+                       // we must parse the content as verbatim because e.g. JIS can contain
+                       // normally invalid characters
+                       string const s = p.plainEnvironment("CJK");
+                       for (string::const_iterator it = s.begin(), et = s.end(); it != et; ++it) {
+                               if (*it == '\\')
+                                       handle_ert(os, "\\", parent_context);
+                               else if (*it == '$')
+                                       handle_ert(os, "$", parent_context);
+                               else 
+                                       os << *it;
+                       }
+                       handle_ert(os, "\\end{" + name + "}",
+                                      parent_context);
+               } else {
+                       string const lang = CJK2lyx(encoding);
+                       // store the language because we must reset it at the end
+                       string const lang_old = parent_context.font.language;
+                       parent_context.font.language = lang;
+                       parse_text_in_inset(p, os, FLAG_END, outer, parent_context);
+                       parent_context.font.language = lang_old;
+                       parent_context.new_paragraph(os);
+               }
+               p.setEncoding(encoding_old);
+               p.skip_spaces();
+       }
+
         else if (name == "lyxgreyedout") {
                 eat_whitespace(p, os, parent_context, false);
                 parent_context.check_layout(os);
@@ -2029,6 +2067,43 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
         while (p.good()) {
                 Token const & t = p.get_token();
  
+       // it is impossible to determine the correct document language if CJK is used.
+       // Therefore write a note at the beginning of the document
+       if (have_CJK) {
+               context.check_layout(os);
+               begin_inset(os, "Note Note\n");
+               os << "status open\n\\begin_layout Plain Layout\n"
+                  << "\\series bold\n"
+                  << "Important information:\n"
+                  << "\\end_layout\n\n"
+                  << "\\begin_layout Plain Layout\n"
+                  << "This document contains text in Chinese, Japanese or Korean.\n"
+                  << " It was therefore impossible for tex2lyx to set the correct document langue for your document."
+                  << " Please set the language manually in the document settings.\n"
+                  << "\\end_layout\n";
+               end_inset(os);
+               have_CJK = false;
+       }
+
+       // it is impossible to determine the correct encoding for non-CJK Japanese.
+       // Therefore write a note at the beginning of the document
+       if (is_nonCJKJapanese) {
+               context.check_layout(os);
+               begin_inset(os, "Note Note\n");
+               os << "status open\n\\begin_layout Plain Layout\n"
+                  << "\\series bold\n"
+                  << "Important information:\n"
+                  << "\\end_layout\n\n"
+                  << "\\begin_layout Plain Layout\n"
+                  << "This document is in Japanese (non-CJK).\n"
+                  << " It was therefore impossible for tex2lyx to determine the correct encoding."
+                  << " The encoding EUC-JP was assumed. If this is incorrect, please set the correct"
+                  << " encoding in the document settings.\n"
+                  << "\\end_layout\n";
+               end_inset(os);
+               is_nonCJKJapanese = false;
+       }
+
  #ifdef FILEDEBUG
                 debugToken(cerr, t, flags);
  #endif
@@ -3464,7 +3539,8 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
                                               context.font.language, lang);
                 }
                 
-               else if (is_known(t.cs().substr(4, string::npos), polyglossia_languages)) {
+               else if (prefixIs(t.cs(), "text") 
+                        && is_known(t.cs().substr(4), preamble.polyglossia_languages)) {
                         // scheme is \textLANGUAGE{text} where LANGUAGE is in polyglossia_languages[]
                         string lang;
                         // We have to output the whole command if it has an option
@@ -3475,21 +3551,18 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
                                 // check if the option contains a variant, if yes, extract it
                                 string::size_type pos_var = langopts.find("variant");
                                 string::size_type i = langopts.find(',');
-                               if (pos_var != string::npos){
+                               string::size_type k = langopts.find('=', pos_var);
+                               if (pos_var != string::npos && i == string::npos) {
                                         string variant;
-                                       if (i == string::npos) {
-                                               variant = langopts.substr(pos_var + 8, langopts.length() - pos_var - 9);
-                                               lang = polyglossia2lyx(variant);
-                                               parse_text_attributes(p, os, FLAG_ITEM, outer,
-                                                                         context, "\\lang",
-                                                                         context.font.language, lang);
-                                       }
-                                       else
-                                               handle_ert(os, t.asInput() + langopts, context);
+                                       variant = langopts.substr(k + 1, langopts.length() - k - 2);
+                                       lang = preamble.polyglossia2lyx(variant);
+                                       parse_text_attributes(p, os, FLAG_ITEM, outer,
+                                                                 context, "\\lang",
+                                                                 context.font.language, lang);
                                 } else
                                         handle_ert(os, t.asInput() + langopts, context);
                         } else {
-                               lang = polyglossia2lyx(t.cs().substr(4, string::npos));
+                               lang = preamble.polyglossia2lyx(t.cs().substr(4, string::npos));
                                 parse_text_attributes(p, os, FLAG_ITEM, outer,
                                                           context, "\\lang",
                                                           context.font.language, lang);
@@ -3838,14 +3911,21 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
                                 arg += p.getFullOpt();
                                 eat_whitespace(p, os, context, false);
                                 handle_ert(os, arg + '{', context);
-                               eat_whitespace(p, os, context, false);
                                 parse_text(p, os, FLAG_ITEM, outer, context);
                                 handle_ert(os, "}", context);
                         } else {
                                 string special = p.getFullOpt();
                                 special += p.getOpt();
-                               parse_outer_box(p, os, FLAG_ITEM, outer,
-                                               context, t.cs(), special);
+                               // LyX does not yet support \framebox without any option
+                               if (!special.empty())
+                                       parse_outer_box(p, os, FLAG_ITEM, outer,
+                                                       context, t.cs(), special);
+                               else {
+                                       eat_whitespace(p, os, context, false);
+                                       handle_ert(os, "\\framebox{", context);
+                                       parse_text(p, os, FLAG_ITEM, outer, context);
+                                       handle_ert(os, "}", context);
+                               }
                         }
                 }
  
@@ -3859,7 +3939,6 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
                                 arg += p.getFullOpt();
                                 eat_whitespace(p, os, context, false);
                                 handle_ert(os, arg + '{', context);
-                               eat_whitespace(p, os, context, false);
                                 parse_text(p, os, FLAG_ITEM, outer, context);
                                 handle_ert(os, "}", context);
                         } else