Fix Python unicode string in lyx2lyx

[lyx.git] / lib / lyx2lyx / lyx_2_3.py
diff --git a/lib/lyx2lyx/lyx_2_3.py b/lib/lyx2lyx/lyx_2_3.py

index 1da16f2b3456ee67a45e77ef5b977d79418a7416..d0481ae8ff22161fc36dad6c183f23d146750b3c 100644 (file)
--- a/lib/lyx2lyx/lyx_2_3.py
+++ b/lib/lyx2lyx/lyx_2_3.py
@@ -26,15 +26,15 @@ import sys, os
  
  from parser_tools import find_end_of, find_token_backwards, find_end_of_layout, \
      find_token, find_end_of_inset, get_value,  get_bool_value, \
-    get_containing_layout, get_quoted_value, del_token
+    get_containing_layout, get_quoted_value, del_token, find_re
  #  find_tokens, find_token_exact, is_in_inset, \
  #  check_token, get_option_value
  
-from lyx2lyx_tools import add_to_preamble, put_cmd_in_ert
+from lyx2lyx_tools import add_to_preamble, put_cmd_in_ert, revert_font_attrs, \
+    insert_to_preamble
  #  get_ert, lyx2latex, \
  #  lyx2verbatim, length_in_bp, convert_info_insets
-#  insert_to_preamble, latex_length, revert_flex_inset, \
-#  revert_font_attrs, hex2ratio, str2bool
+#  latex_length, revert_flex_inset, hex2ratio, str2bool
  
  ####################################################################
  # Private helper functions
@@ -166,7 +166,7 @@ def revert_ibranches(document):
          i += 1
  
      # now we need to add the new branches to the header
-    for old, new in ibranches.iteritems():
+    for old, new in ibranches.items():
          i = find_token(document.header, "\\branch " + old, 0)
          if i == -1:
              document.warning("Can't find branch %s even though we found it before!" % (old))
@@ -1146,11 +1146,12 @@ def revert_noprefix(document):
              i += 1
              continue
          k = find_token(document.body, "LatexCommand labelonly", i, j)
-        if k == -1:
-            i = j
-            continue
-        noprefix = get_bool_value(document.body, "noprefix", i, j)
+        noprefix = False
+        if k != -1:
+            noprefix = get_bool_value(document.body, "noprefix", i, j)
          if not noprefix:
+            # either it was not a labelonly command, or else noprefix was not set.
+            # in that case, we just delete the option.
              del_token(document.body, "noprefix", i, j)
              i = j
              continue
@@ -1550,9 +1551,6 @@ command_insets = ["bibitem", "citation", "href", "index_print", "nomenclature"]
  def convert_literalparam(document):
      " Add param literal "
  
-    # These already had some sort of latexify method
-    latexified_insets = ["href", "index_print", "nomenclature"]
-
      for inset in command_insets:
          i = 0
          while True:
@@ -1566,7 +1564,8 @@ def convert_literalparam(document):
                  continue
              while i < j and document.body[i].strip() != '':
                  i += 1
-            if inset in latexified_insets:
+            # href is already fully latexified. Here we can switch off literal.
+            if inset == "href":
                  document.body.insert(i, "literal \"false\"")
              else:
                  document.body.insert(i, "literal \"true\"")
@@ -1842,58 +1841,63 @@ def revert_chapterbib(document):
  
  
  def convert_dashligatures(document):
-    " Remove a zero-length space (U+200B) after en- and em-dashes. "
-
-    i = find_token(document.header, "\\use_microtype", 0)
-    if i != -1:
-        if document.start > 474 and document.start < 509:
-            # This was created by LyX 2.2
-            document.header[i+1:i+1] = ["\\use_dash_ligatures false"]
-        else:
-            # This was created by LyX 2.1 or earlier
-            document.header[i+1:i+1] = ["\\use_dash_ligatures true"]
-
-    i = 0
-    while i < len(document.body):
-        words = document.body[i].split()
-        # Skip some document parts where dashes are not converted
-        if len(words) > 1 and words[0] == "\\begin_inset" and \
-           words[1] in ["CommandInset", "ERT", "External", "Formula", \
-                        "FormulaMacro", "Graphics", "IPA", "listings"]:
-            j = find_end_of_inset(document.body, i)
-            if j == -1:
-                document.warning("Malformed LyX document: Can't find end of " \
-                                 + words[1] + " inset at line " + str(i))
-                i += 1
-            else:
-                i = j
-            continue
-        if len(words) > 0 and words[0] in ["\\leftindent", \
-                "\\paragraph_spacing", "\\align", "\\labelwidthstring"]:
-            i += 1
-            continue
-
-        start = 0
-        while True:
-            j = document.body[i].find(u"\u2013", start) # en-dash
-            k = document.body[i].find(u"\u2014", start) # em-dash
-            if j == -1 and k == -1:
-                break
-            if j == -1 or (k != -1 and k < j):
-                j = k
-            after = document.body[i][j+1:]
-            if after.startswith(u"\u200B"):
-                document.body[i] = document.body[i][:j+1] + after[1:]
-            else:
-                if len(after) == 0 and document.body[i+1].startswith(u"\u200B"):
-                    document.body[i+1] = document.body[i+1][1:]
-                    break
-            start = j+1
-        i += 1
-
+    "Set \\use_dash_ligatures from reversion preamble code or from the kind of dashes in the body."
+    use_dash_ligatures = None
+    # if present, remove the preamble code added by the 2.3->2.2 reversion:
+    for i, line in enumerate(document.preamble):
+        if i > 1 and line == r'\renewcommand{\textemdash}{---}':
+            if (document.preamble[i-1] == r'\renewcommand{\textendash}{--}'
+                and document.preamble[i-2] == '% Added by lyx2lyx'):
+                del document.preamble[i-2:i+1]
+                use_dash_ligatures = True
+    if use_dash_ligatures is None:
+        # Look for dashes:
+        # (Documents by LyX 2.1 or older have "\twohyphens\n" or "\threehyphens\n"
+        # as interim representation for dash ligatures in 2.2.)
+        has_literal_dashes = False
+        has_ligature_dashes = False
+        j = 0
+        for i, line in enumerate(document.body):
+            # Skip some document parts where dashes are not converted
+            if (i < j) or line.startswith("\\labelwidthstring"):
+                continue
+            words = line.split()
+            if len(words) > 1 and words[0] == "\\begin_inset" and \
+            words[1] in ["CommandInset", "ERT", "External", "Formula",
+                         "FormulaMacro", "Graphics", "IPA", "listings"]:
+                j = find_end_of_inset(document.body, i)
+                if j == -1:
+                    document.warning("Malformed LyX document: "
+                        "Can't find end of %s inset at line %d" % (words[1],i))
+                continue
+            # literal dash followed by a word or no-break space:
+            if re.search(u"[\u2013\u2014]([\\w\u00A0]|$)", line,
+                         flags=re.UNICODE):
+                has_literal_dashes = True
+            # ligature dash followed by word or no-break space on next line (if any):
+            if (re.search(r"\\(twohyphens|threehyphens)", line)  # raw: "\\t" would be a regex TAB
+                    and i + 1 < len(document.body)
+                    and re.match(u"[\\w\u00A0]", document.body[i+1], flags=re.UNICODE)):
+                has_ligature_dashes = True
+        if has_literal_dashes and has_ligature_dashes:
+            # TODO: insert a warning note in the document?
+            document.warning('This document contained both literal and '
+                '"ligature" dashes.\n Line breaks may have changed. '
+                'See UserGuide chapter 3.9.1 for details.')
+        elif has_literal_dashes:
+            use_dash_ligatures = False
+        elif has_ligature_dashes:
+            use_dash_ligatures = True
+    # insert the setting if there is a preferred value
+    if use_dash_ligatures is not None:
+        i = find_token(document.header, "\\use_microtype", 0)
+        if i != -1:
+            document.header.insert(i+1, "\\use_dash_ligatures %s"
+                                % str(use_dash_ligatures).lower())
  
  def revert_dashligatures(document):
-    " Remove font ligature settings for en- and em-dashes. "
+    """Remove font ligature settings for en- and em-dashes.
+    Revert conversion of \twodashes or \threedashes to literal dashes."""
      i = find_token(document.header, "\\use_dash_ligatures", 0)
      if i == -1:
          return
@@ -1903,42 +1907,341 @@ def revert_dashligatures(document):
      i = find_token(document.header, "\\use_non_tex_fonts", 0)
      if i != -1:
          use_non_tex_fonts = get_bool_value(document.header, "\\use_non_tex_fonts", i)
-    if not use_dash_ligatures or use_non_tex_fonts:
+    if not use_dash_ligatures or document.backend != "latex":
          return
  
-    # Add a zero-length space (U+200B) after en- and em-dashes
-    i = 0
-    while i < len(document.body):
-        words = document.body[i].split()
+    j = 0
+    new_body = []
+    for i, line in enumerate(document.body):
          # Skip some document parts where dashes are not converted
+        if (i < j) or line.startswith("\\labelwidthstring"):
+            new_body.append(line)
+            continue
+        words = line.split()
          if len(words) > 1 and words[0] == "\\begin_inset" and \
-           words[1] in ["CommandInset", "ERT", "External", "Formula", \
+           words[1] in ["CommandInset", "ERT", "External", "Formula",
                          "FormulaMacro", "Graphics", "IPA", "listings"]:
              j = find_end_of_inset(document.body, i)
              if j == -1:
-                document.warning("Malformed LyX document: Can't find end of " \
+                document.warning("Malformed LyX document: Can't find end of "
                                   + words[1] + " inset at line " + str(i))
-                i += 1
-            else:
-                i = j
-            continue
-        if len(words) > 0 and words[0] in ["\\leftindent", \
-                "\\paragraph_spacing", "\\align", "\\labelwidthstring"]:
-            i += 1
+            new_body.append(line)
              continue
+        line = line.replace(u'\u2013', '\\twohyphens\n')
+        line = line.replace(u'\u2014', '\\threehyphens\n')
+        lines = line.split('\n')
+        new_body.extend(line.split('\n'))
+    document.body = new_body
+    # redefine the dash LICRs to use ligature dashes:
+    add_to_preamble(document, [r'\renewcommand{\textendash}{--}',
+                               r'\renewcommand{\textemdash}{---}'])
  
-        start = 0
-        while True:
-            j = document.body[i].find(u"\u2013", start) # en-dash
-            k = document.body[i].find(u"\u2014", start) # em-dash
-            if j == -1 and k == -1:
-                break
-            if j == -1 or (k != -1 and k < j):
-                j = k
-            after = document.body[i][j+1:]
-            document.body[i] = document.body[i][:j+1] + u"\u200B" + after
-            start = j+1
-        i += 1
+
+def revert_noto(document):
+    """Revert Noto font definitions to LaTeX.
+
+    Only acts when the header contains "\\use_non_tex_fonts false"; each Noto
+    family found is redefined in the preamble and reset to "default"."""
+    if find_token(document.header, "\\use_non_tex_fonts false", 0) == -1:
+        return
+    # (header token, Noto font name, LaTeX family-default macro)
+    for token, font, macro in (("\\font_roman", "NotoSerif-TLF", "\\rmdefault"),
+                               ("\\font_sans", "NotoSans-TLF", "\\sfdefault"),
+                               ("\\font_typewriter", "NotoMono-TLF", "\\ttdefault")):
+        i = find_token(document.header, '%s "%s"' % (token, font), 0)
+        if i == -1:
+            continue
+        add_to_preamble(document, ["\\renewcommand{%s}{%s}" % (macro, font)])
+        # only the font name is replaced; the rest of the header line is kept
+        document.header[i] = document.header[i].replace(font, "default")
+
+
+def revert_xout(document):
+    " Revert the \\xout font attribute; load ulem in the preamble when it was used "
+    ulem_lines = ['%  for proper cross-out',
+                  '\\PassOptionsToPackage{normalem}{ulem}',
+                  '\\usepackage{ulem}']
+    # presumably True when at least one \xout attribute was reverted -- confirm in lyx2lyx_tools
+    if revert_font_attrs(document.body, "\\xout", "\\xout") == True:
+        insert_to_preamble(document, ulem_lines)
+
+
+def convert_mathindent(document):
+    " add the \\is_math_indent tag (1 if the class option fleqn was set, else 0) "
+    # Parse the \options line itself: a search of the whole header for "fleqn"
+    # could hit an unrelated line (e.g. a path) and delete it.
+    i = find_token(document.header, "\\options", 0)
+    options = []
+    if i != -1:
+        options = [o.strip() for o in document.header[i].partition("\\options")[2].split(",")]
+    is_indented = "fleqn" in options
+    if is_indented:
+        # delete the found option; drop the whole line if it was the only one
+        options = [o for o in options if o and o != "fleqn"]
+        if options:
+            document.header[i] = "\\options " + ",".join(options)
+        else:
+            del document.header[i]
+    k = find_token(document.header, "\\quotes_style", 0)  # looked up after the edit above
+    document.header.insert(k, "\\is_math_indent %d" % is_indented)
+
+
+def revert_mathindent(document):
+    " Define mathindent if set in the document "
+    # first output the length
+    i = find_token(document.header, "\\math_indentation", 0)
+    if i != -1:
+        value = get_value(document.header, "\\math_indentation", i).split()
+        # an empty value is treated like "default": nothing to set
+        if value and value[0] != "default":
+            add_to_preamble(document, ["\\setlength{\\mathindent}{" + value[0] + '}'])
+        del document.header[i]
+    # now remove the tag, remembering whether math indentation was switched on
+    i = find_token(document.header, "\\is_math_indent", 0)
+    if i == -1:
+        # tag missing: nothing to revert ("del header[-1]" would drop an unrelated line)
+        return
+    is_indented = document.header[i].split()[1:] == ["1"]
+    del document.header[i]
+    if not is_indented:
+        return
+    k = find_token(document.header, "\\options", 0)
+    if k != -1:
+        document.header[k] = document.header[k].replace("\\options", "\\options fleqn,")
+        return
+    # no \options line yet: create one in front of \use_default_options
+    pos = find_token(document.header, "\\use_default_options", 0)
+    document.header.insert(pos, "\\options fleqn")
+
+
+def _baselineskip_factor(line, keyword):
+    """Return the baselineskip factor stored in line as a string.
+
+    The number between the last occurrence of keyword and "baselineskip%"
+    is read; it is stored in percent, thus it is divided by 100.
+    """
+    beg = line.rfind(keyword)
+    end = line.rfind("baselineskip%")
+    return str(float(line[beg + len(keyword):end]) / 100)
+
+
+def revert_baselineskip(document):
+    """Revert baselineskip lengths to TeX code.
+
+    A VSpace inset or \\hspace space inset whose length is given in
+    "baselineskip%" is replaced by ERT with the matching \\vspace or \\hspace
+    command, the length being written as a multiple of \\baselineskip.
+    Conversion stops with a warning at the first inset whose end is missing.
+
+    The body is modified in place.
+    """
+    # compiled once; the pattern does not change between iterations
+    regexp = re.compile(r'^.*baselineskip%.*$')
+    i = 0
+    while True:
+        i = find_re(document.body, regexp, i)
+        if i == -1:
+            return
+        line = document.body[i]
+        if find_token(document.body, "\\begin_inset VSpace", i) == i:
+            # VSpace inset: value and optional star are on the inset line itself
+            name = "VSpace"
+            command = "\\vspace"
+            first = i
+            factor = _baselineskip_factor(line, "VSpace ")
+            starred = line.find('*') != -1
+        elif i > 0 and find_token(document.body, "\\begin_inset space \\hspace", i - 1) == i - 1:
+            # space inset: the length is on line i, the inset starts one line above
+            name = "space"
+            command = "\\hspace"
+            first = i - 1
+            factor = _baselineskip_factor(line, "\\length ")
+            starred = document.body[i - 1].find('*') != -1
+        else:
+            # "baselineskip%" in some other context: leave the line alone
+            i += 1
+            continue
+        if starred:
+            command += '*'
+        # now output TeX code
+        endInset = find_end_of_inset(document.body, i)
+        if endInset == -1:
+            document.warning("Malformed LyX document: Missing '\\end_inset' of "
+                             + name + " inset.")
+            return
+        # the whole inset is replaced by one ERT inset holding the command
+        document.body[first: endInset + 1] = put_cmd_in_ert(
+            command + '{' + factor + "\\baselineskip}")
+        i += 1
+
+
+def revert_rotfloat(document):
+  " Revert placement options for rotated floats: such floats are rewritten as ERT sideways environments "
+  i = 0
+  j = 0
+  k = 0  # k is not used below
+  while True:
+    i = find_token(document.body, "sideways true", i)  # search resumes at line i
+    if i != -1:
+      regexp = re.compile(r'^.*placement.*$')
+      j = find_re(document.body, regexp, i-2)  # first "placement" line from two lines above
+      if j == -1:
+          return
+      if j != i-2:  # placement is not exactly two lines above: skip this float
+          i = i + 1
+          continue
+    else:
+      return
+    # we found a sideways float with placement options
+    # first store the placement: the text after the last space of line i-2
+    beg = document.body[i-2].rfind(" ");
+    placement = document.body[i-2][beg+1:]
+    # check if the option 'H' is used; it needs the float package
+    if placement.find("H") != -1:
+      add_to_preamble(document, ["\\usepackage{float}"])
+    # now check if it is a starred type ("wide true" on the line above)
+    if document.body[i-1].find("wide true") != -1:
+      star = '*'
+    else:
+      star = ''
+    # store the float type: the last word of line i-3 (presumably the line opening the Float inset)
+    beg = document.body[i-3].rfind(" ");
+    fType = document.body[i-3][beg+1:]
+    # now output TeX code
+    endInset = find_end_of_inset(document.body, i-3)
+    if endInset == -1:
+      document.warning("Malformed LyX document: Missing '\\end_inset' of Float inset.")
+      return
+    else:
+      document.body[endInset-2: endInset+1] = put_cmd_in_ert("\\end{sideways" + fType + star + '}')  # tail first, so i-3..i+1 stay valid
+      document.body[i-3: i+2] = put_cmd_in_ert("\\begin{sideways" + fType + star + "}[" + placement + ']')  # replaces lines i-3 .. i+1
+      add_to_preamble(document, ["\\usepackage{rotfloat}"])
+
+    i = i + 1
+
+
+def convert_allowbreak(document):
+    " Zero-width Space-inset -> \\SpecialChar allowbreak. "
+    body = "\n".join(document.body)
+    body = body.replace("\\begin_inset space \\hspace{}\n"
+                        "\\length 0dd\n"
+                        "\\end_inset\n\n",
+                        "\\SpecialChar allowbreak\n")
+    document.body = body.split("\n")
+
+
+def revert_allowbreak(document):
+    " \\SpecialChar allowbreak -> Zero-width Space-inset. "
+    body = "\n".join(document.body)
+    body = body.replace("\\SpecialChar allowbreak\n",
+                        "\\begin_inset space \\hspace{}\n"
+                        "\\length 0dd\n"
+                        "\\end_inset\n\n")
+    document.body = body.split("\n")
+
+
+def convert_mathnumberpos(document):
+    " add the \\math_number_before tag (1 if the class option leqno was set, else 0) "
+    # Parse the \options line itself: searching the whole header for "leqno" picks
+    # the first matching line, which may be an unrelated one before \options.
+    i = find_token(document.header, "\\options", 0)
+    options = []
+    if i != -1:
+        options = [o.strip() for o in
+                   document.header[i].partition("\\options")[2].split(",")]
+    number_before = "leqno" in options
+    if number_before:
+        # delete the found option; drop the whole line if it was the only one
+        options = [o for o in options if o and o != "leqno"]
+        if options:
+            document.header[i] = "\\options " + ",".join(options)
+        else:
+            del document.header[i]
+    k = find_token(document.header, "\\quotes_style", 0)  # looked up after the edit above
+    document.header.insert(k, "\\math_number_before %d" % number_before)
+
+
+def revert_mathnumberpos(document):
+    " remove the \\math_number_before tag; add the class option leqno if it was 1 "
+    i = find_token(document.header, "\\math_number_before", 0)
+    if i == -1:
+        # tag missing: nothing to revert ("del header[-1]" would drop an unrelated line)
+        return
+    number_before = document.header[i].split()[1:] == ["1"]
+    del document.header[i]
+    if not number_before:
+        return
+    k = find_token(document.header, "\\options", 0)
+    if k != -1:
+        document.header[k] = document.header[k].replace("\\options", "\\options leqno,")
+        return
+    # no \options line yet: create one in front of \use_default_options
+    pos = find_token(document.header, "\\use_default_options", 0)
+    document.header.insert(pos, "\\options leqno")
+
+
+def convert_mathnumberingname(document):
+    " rename the \\math_number_before tag to \\math_numbering_side "
+    k = find_token(document.header, "\\math_number_before", 0)
+    if k != -1:
+        value = document.header[k].split()[1:]
+        if value == ["1"]:
+            document.header[k] = "\\math_numbering_side left"
+        elif value == ["0"]:
+            document.header[k] = "\\math_numbering_side default"
+    # check if the document uses the class option "reqno"
+    k = find_token(document.header, "\\math_numbering_side", 0)
+    i = find_token(document.header, "\\options", 0)
+    if k == -1 or i == -1:
+        # no tag to update (header[-1] must not be overwritten) or no class options
+        return
+    options = [o.strip() for o in
+               document.header[i].partition("\\options")[2].split(",")]
+    if "reqno" in options:
+        document.header[k] = "\\math_numbering_side right"
+        # delete the found option; drop the whole line if it was the only one
+        options = [o for o in options if o and o != "reqno"]
+        if options:
+            document.header[i] = "\\options " + ",".join(options)
+        else:
+            del document.header[i]
+
+
+def revert_mathnumberingname(document):
+    """Rename the \\math_numbering_side tag back to \\math_number_before.
+
+    "left" becomes 1, "right" and "default" become 0; for "right" the class
+    option reqno is added as well."""
+    # (side, value of the old tag, whether the class option reqno is needed)
+    for side, before, needs_reqno in (("left", "1", False),
+                                      ("right", "0", True),
+                                      ("default", "0", False)):
+        pattern = re.compile(r'(\\math_numbering_side ' + side + ')')
+        tag = find_re(document.header, pattern, 0)
+        if tag == -1:
+            continue
+        document.header[tag] = "\\math_number_before " + before
+        if not needs_reqno:
+            continue
+        # add the option reqno, creating the \options line if necessary
+        opts = find_token(document.header, "\\options", 0)
+        if opts != -1:
+            document.header[opts] = document.header[opts].replace("\\options", "\\options reqno,")
+        else:
+            anchor = find_token(document.header, "\\use_default_options", 0)
+            document.header.insert(anchor, "\\options reqno")
+
+
+def convert_minted(document):
+    " add the \\use_minted tag (switched off) in front of the last header line "
+    document.header[-1:-1] = ["\\use_minted 0"]
+
+
+def revert_minted(document):
+    " drop the \\use_minted tag from the header, if present "
+    tag = find_token(document.header, "\\use_minted", 0)
+    if tag != -1:
+        del document.header[tag]
  
  
  ##
@@ -1973,10 +2276,28 @@ convert = [
             [532, [convert_literalparam]],
             [533, []],
             [534, []],
-           [535, [convert_dashligatures]]
+           [535, [convert_dashligatures]],
+           [536, []],
+           [537, []],
+           [538, [convert_mathindent]],
+           [539, []],
+           [540, []],
+           [541, [convert_allowbreak]],
+           [542, [convert_mathnumberpos]],
+           [543, [convert_mathnumberingname]],
+           [544, [convert_minted]]
            ]
  
  revert =  [
+           [543, [revert_minted]],
+           [542, [revert_mathnumberingname]],
+           [541, [revert_mathnumberpos]],
+           [540, [revert_allowbreak]],
+           [539, [revert_rotfloat]],
+           [538, [revert_baselineskip]],
+           [537, [revert_mathindent]],
+           [536, [revert_xout]],
+           [535, [revert_noto]],
             [534, [revert_dashligatures]],
             [533, [revert_chapterbib]],
             [532, [revert_multibib]],