Use CJKutf8 package if input encoding is "utf8" and a used language requires CJK.

[lyx.git] / src / Paragraph.cpp
diff --git a/src/Paragraph.cpp b/src/Paragraph.cpp

index 427818a76150b88951de516e9fa8086669133449..27aecb5eaee7fc4af6a211e6617136b572ed97a4 100644 (file)
--- a/src/Paragraph.cpp
+++ b/src/Paragraph.cpp
@@ -869,7 +869,7 @@ int Paragraph::eraseChars(pos_type start, pos_type end, bool trackChanges)
         return end - i;
  }
  
-
+// Handle combining characters
  int Paragraph::Private::latexSurrogatePair(BufferParams const & bparams,
                 otexstream & os, char_type c, char_type next,
                 OutputParams const & runparams)
@@ -891,22 +891,24 @@ int Paragraph::Private::latexSurrogatePair(BufferParams const & bparams,
         }
         docstring latex2 = encoding.latexChar(c).first;
  
-       if (docstring(1, next) == latex1) {
-               // The encoding supports the combination:
+       // Handle combining characters in "script" context (i.e., \textgreek and \textcyrillic)
+       docstring::size_type const brace1 = latex2.find_first_of(from_ascii("{"));
+       docstring::size_type const brace2 = latex2.find_last_of(from_ascii("}"));
+       string script = to_ascii(latex2.substr(1, brace1 - 1));
+
+       // Greek and Cyrillic letters need to be wrapped in \textcyrillic and \textgreek  if they
+       // are not encodable in the current font encoding (regardless of the input encoding).
+       bool scriptchar = false;
+       if (!bparams.useNonTeXFonts) // With non-TeX fonts the font encoding is Unicode.
+               scriptchar = Encodings::isKnownScriptChar(c, script);
+
+       if (!scriptchar && docstring(1, next) == latex1) {
+               // Font and input encoding support the combination:
                 // output as is (combining char after base char).
                 os << latex2 << latex1;
                 return latex1.length() + latex2.length();
         }
  
-       // Handle combining characters in "script" context (i.e., \textgreek and \textcyrillic)
-       docstring::size_type const brace1 = latex2.find_first_of(from_ascii("{"));
-       docstring::size_type const brace2 = latex2.find_last_of(from_ascii("}"));
-       string script = to_ascii(latex2.substr(1, brace1 - 1));
-       // "Script chars" need to embraced in \textcyrillic and \textgreek notwithstanding
-       // whether they are encodable or not (it only depends on the font encoding)
-       if (!runparams.isFullUnicode())
-               // This will get us a script value to deal with below
-               Encodings::isKnownScriptChar(c, script);
         int pos = 0;
         int length = brace2;
         string fontenc;
@@ -917,30 +919,19 @@ int Paragraph::Private::latexSurrogatePair(BufferParams const & bparams,
         docstring scriptmacro;
         docstring cb;
         if (script == "textgreek" || script == "textcyrillic") {
-               // We separate the script macro (\text[greek|cyr]) from the rest,
-               // since we need to include the combining char in it (#6463).
-               // This is "the rest":
+               // Strip the \text(greek|cyrillic) script macro  ...
                 pos = brace1 + 1;
                 length -= pos;
                 latex2 = latex2.substr(pos, length);
-               // We only need the script macro with non-native font encodings
+               // and place it before the accent macro if required (#6463)
                 if (Encodings::needsScriptWrapper(script, fontenc)) {
                         scriptmacro = from_ascii("\\" + script + "{");
                         cb = from_ascii("}");
                 }
         }
  
-       docstring lb;
-       docstring rb;
-       // polutonikogreek does not play nice with brackets
-       if (!runparams.local_font
-           || runparams.local_font->language()->lang() != "polutonikogreek") {
-               lb = from_ascii("{");
-               rb = from_ascii("}");
-       }
-
-       os << scriptmacro << latex1 << lb << latex2 << rb << cb;
-       return latex1.length() + latex2.length() + lb.length() + rb.length() + cb.length();
+       os << scriptmacro << latex1 << "{" << latex2 << "}" << cb;
+       return latex1.length() + 1 + latex2.length() + 1 + cb.length();
  }
  
  
@@ -1017,8 +1008,9 @@ int Paragraph::Private::writeScriptChars(BufferParams const & bparams,
         int pos = 0;
         int length = brace2;
         bool closing_brace = true;
+       // We only need the script macro with non-native font encodings
         if (!Encodings::needsScriptWrapper(script, fontenc)) {
-               // Correct font encoding is being used, so we can avoid \text[greek|cyr].
+               // Correct font encoding is being used, so we can avoid \text(greek|cyrillic).
                 pos = brace1 + 1;
                 length -= pos;
                 closing_brace = false;
@@ -1143,13 +1135,15 @@ void Paragraph::Private::latexInset(BufferParams const & bparams,
         odocstream::pos_type const len = os.os().tellp();
  
         if (inset->forceLTR()
-           && !runparams.use_polyglossia
             && running_font.isRightToLeft()
             // ERT is an exception, it should be output with no
             // decorations at all
             && inset->lyxCode() != ERT_CODE) {
-               if (running_font.language()->lang() == "farsi")
-                       os << "\\beginL" << termcmd;
+               if (runparams.use_polyglossia) {
+                       os << "\\LRE{";
+               } else if (running_font.language()->lang() == "farsi"
+                          || running_font.language()->lang() == "arabic_arabi")
+                       os << "\\textLR{" << termcmd;
                 else
                         os << "\\L{";
                 close = true;
@@ -1205,12 +1199,8 @@ void Paragraph::Private::latexInset(BufferParams const & bparams,
                 throw(e);
         }
  
-       if (close) {
-               if (running_font.language()->lang() == "farsi")
-                               os << "\\endL" << termcmd;
-                       else
-                               os << '}';
-       }
+       if (close)
+               os << '}';
  
         if (os.texrow().rows() > previous_row_count) {
                 os.texrow().start(owner_->id(), i + 1);
@@ -1394,8 +1384,9 @@ void Paragraph::Private::latexSpecialChar(otexstream & os,
                 string fontenc;
                 fontenc = running_font.language()->fontenc(bparams);
                 // "Script chars" need to embraced in \textcyrillic and \textgreek notwithstanding
-               // whether they are encodable or not (it only depends on the font encoding)
-               if (!runparams.isFullUnicode() && Encodings::isKnownScriptChar(c, script)) {
+               // whether they are encodable or not (it only depends on the font encoding),
+               // except if we are using fontspec.
+               if (!bparams.useNonTeXFonts && Encodings::isKnownScriptChar(c, script)) {
                         docstring const wrapper = from_ascii("\\" + script + "{");
                         docstring ltx = latex.first;
                         if (!prefixIs(ltx, wrapper))
@@ -2000,47 +1991,44 @@ char_type Paragraph::getUChar(BufferParams const & bparams,
  {
         char_type c = d->text_[pos];
  
-       // Return unchanged character in LTR languages.
-       if (!getFontSettings(bparams, pos).isRightToLeft())
+       // Return unchanged character in LTR languages
+       // or if we use polyglossia/bidi.
+       if (rp.use_polyglossia || !getFontSettings(bparams, pos).isRightToLeft())
                 return c;
  
-       // FIXME This is a complete mess due to all the language-specific
-       // special cases. We need to unify this eventually, but this
-       // requires a file format change and some thought.
-       // We also need to unify the input of parentheses in different RTL
-       // languages. Currently, some have their own methods (Arabic:
-       // 18599/lyxsvn, Hebrew: e5f42f67d/lyxgit), some don't (Urdu, Syriac).
-       // Also note that the representation in the LyX file is probably wrong
-       // (see FIXME in TextMetrics::breakRow).
-       // Most likely, we should simply rely on Qt's unicode handling here.
-       string const & lang = getFontSettings(bparams, pos).language()->lang();
+       // Without polyglossia/bidi, we need to account for some special cases.
+       // FIXME This needs to be audited!
+       // Check if:
+       // * The input is as expected for all delimiters
+       //   => checked for Hebrew!
+       // * The output matches the display in the LyX workarea
+       //   => checked for Hebrew!
+       // * The special cases below are really necessary
+       //   => checked for Hebrew!
+       // * In arabic_arabi, brackets are transformed to Arabic
+       //   Ornate Parentheses. Is this really wanted?
  
-       // With polyglossia, brackets and stuff need not be reversed in RTL scripts
-       // FIXME: The special casing for Hebrew parens is due to the special
-       // handling on input (for Hebrew in e5f42f67d/lyxgit); see #8251.
+       string const & lang = getFontSettings(bparams, pos).language()->lang();
         char_type uc = c;
-       if (rp.use_polyglossia) {
-               switch (c) {
-               case '(':
-                       if (lang == "hebrew")
-                               uc = ')';
-                       break;
-               case ')':
-                       if (lang == "hebrew")
-                               uc = '(';
-                       break;
-               }
-               return uc;
-       }
  
-       // In the following languages, brackets don't need to be reversed.
-       // Furthermore, in arabic_arabi, they are transformed to Arabic
-       // Ornate Parentheses (dunno if this is really wanted)
+       // 1. In the following languages, parentheses need to be reversed.
+       bool const reverseparens = lang == "hebrew";
+
+       // 2. In the following languages, brackets don't need to be reversed.
         bool const reversebrackets = lang != "arabic_arabtex"
                         && lang != "arabic_arabi"
-                       && lang != "farsi"; 
+                       && lang != "farsi";
  
+       // Now swap delimiters if needed.
         switch (c) {
+       case '(':
+               if (reverseparens)
+                       uc = ')';
+               break;
+       case ')':
+               if (reverseparens)
+                       uc = '(';
+               break;
         case '[':
                 if (reversebrackets)
                         uc = ']';
@@ -2374,6 +2362,9 @@ int Paragraph::Private::startTeXParParams(BufferParams const & bparams,
         string const begin_tag = "\\begin";
         InsetCode code = ownerCode();
         bool const lastpar = runparams.isLastPar;
+       // RTL in classic (PDF)LaTeX (without the Bidi package)
+       bool const rtl_classic = owner_->getParLanguage(bparams)->rightToLeft()
+               && !runparams.use_polyglossia;
  
         switch (curAlign) {
         case LYX_ALIGN_NONE:
@@ -2383,16 +2374,18 @@ int Paragraph::Private::startTeXParParams(BufferParams const & bparams,
         case LYX_ALIGN_DECIMAL:
                 break;
         case LYX_ALIGN_LEFT: {
-               if (owner_->getParLanguage(bparams)->babel() != "hebrew")
-                       corrected_env(os, begin_tag, "flushleft", code, lastpar, column);
-               else
+               if (rtl_classic)
+                       // Classic (PDF)LaTeX switches the left/right logic in RTL mode
                         corrected_env(os, begin_tag, "flushright", code, lastpar, column);
+               else
+                       corrected_env(os, begin_tag, "flushleft", code, lastpar, column);
                 break;
         } case LYX_ALIGN_RIGHT: {
-               if (owner_->getParLanguage(bparams)->babel() != "hebrew")
-                       corrected_env(os, begin_tag, "flushright", code, lastpar, column);
-               else
+               if (rtl_classic)
+                       // Classic (PDF)LaTeX switches the left/right logic in RTL mode
                         corrected_env(os, begin_tag, "flushleft", code, lastpar, column);
+               else
+                       corrected_env(os, begin_tag, "flushright", code, lastpar, column);
                 break;
         } case LYX_ALIGN_CENTER: {
                 corrected_env(os, begin_tag, "center", code, lastpar, column);
@@ -2432,6 +2425,9 @@ bool Paragraph::Private::endTeXParParams(BufferParams const & bparams,
         string const end_tag = "\\par\\end";
         InsetCode code = ownerCode();
         bool const lastpar = runparams.isLastPar;
+       // RTL in classic (PDF)LaTeX (without the Bidi package)
+       bool const rtl_classic = owner_->getParLanguage(bparams)->rightToLeft()
+               && !runparams.use_polyglossia;
  
         switch (curAlign) {
         case LYX_ALIGN_NONE:
@@ -2441,16 +2437,18 @@ bool Paragraph::Private::endTeXParParams(BufferParams const & bparams,
         case LYX_ALIGN_DECIMAL:
                 break;
         case LYX_ALIGN_LEFT: {
-               if (owner_->getParLanguage(bparams)->babel() != "hebrew")
-                       output = corrected_env(os, end_tag, "flushleft", code, lastpar, col);
-               else
+               if (rtl_classic)
+                       // Classic (PDF)LaTeX switches the left/right logic in RTL mode
                         output = corrected_env(os, end_tag, "flushright", code, lastpar, col);
+               else
+                       output = corrected_env(os, end_tag, "flushleft", code, lastpar, col);
                 break;
         } case LYX_ALIGN_RIGHT: {
-               if (owner_->getParLanguage(bparams)->babel() != "hebrew")
-                       output = corrected_env(os, end_tag, "flushright", code, lastpar, col);
-               else
+               if (rtl_classic)
+                       // Classic (PDF)LaTeX switches the left/right logic in RTL mode
                         output = corrected_env(os, end_tag, "flushleft", code, lastpar, col);
+               else
+                       output = corrected_env(os, end_tag, "flushright", code, lastpar, col);
                 break;
         } case LYX_ALIGN_CENTER: {
                 corrected_env(os, end_tag, "center", code, lastpar, col);
@@ -3414,7 +3412,10 @@ docstring Paragraph::simpleLyXHTMLOnePar(Buffer const & buf,
                 } else {
                         char_type c = getUChar(buf.masterBuffer()->params(),
                                                runparams, i);
-                       xs << c;
+                       if (c == ' ' && (style.free_spacing || runparams.free_spacing))
+                               xs << XHTMLStream::ESCAPE_NONE << "&nbsp;";
+                       else
+                               xs << c;
                 }
                 font_old = font.fontInfo();
         }