less fingerpainting involved now. and no warnings...

[lyx.git] / lib / lyx2lyx / lyx_1_5.py
diff --git a/lib/lyx2lyx/lyx_1_5.py b/lib/lyx2lyx/lyx_1_5.py

index 9cd4ba5b803212d79e8c9421ecbac7ad47c883e9..05fd93fdc23688b55898a09e5aa18a2d13812188 100644 (file)
--- a/lib/lyx2lyx/lyx_1_5.py
+++ b/lib/lyx2lyx/lyx_1_5.py
@@ -253,8 +253,12 @@ necessary parsing in modern formats than in ancient ones.
      insets = []
      lang_re = re.compile(r"^\\lang\s(\S+)")
      inset_re = re.compile(r"^\\begin_inset\s(\S+)")
+    if not forward: # no need to read file unless we are reverting
+        spec_chars = read_unicodesymbols()
+
      if document.inputencoding == "auto" or document.inputencoding == "default":
-        for i in range(len(document.body)):
+        i = 0
+        while i < len(document.body):
              result = lang_re.match(document.body[i])
              if result:
                  language = result.group(1)
@@ -297,13 +301,19 @@ necessary parsing in modern formats than in ancient ones.
                      # with the correct encoding.
                      document.body[i] = orig.decode(encoding_stack[-1])
                  else:
-                    # Convert unicode to the 8bit string that will be written
-                    # to the file with the correct encoding.
-                    orig = document.body[i].encode(encoding_stack[-1])
-                    # Convert the 8bit string that will be written to the
-                    # file to fake unicode with the encoding that will later
-                    # be used when writing to the file.
-                    document.body[i] = orig.decode(document.encoding)
+                    try:
+                        # Convert unicode to the 8bit string that will be written
+                        # to the file with the correct encoding.
+                        orig = document.body[i].encode(encoding_stack[-1])
+                        # Convert the 8bit string that will be written to the
+                        # file to fake unicode with the encoding that will later
+                        # be used when writing to the file.
+                        document.body[i] = orig.decode(document.encoding)
+                    except:
+                        mod_line = revert_unicode_line(document, i, insets, spec_chars)
+                        document.body[i:i+1] = mod_line.split('\n')
+                        i += len(mod_line.split('\n')) - 1
+            i += 1
  
  
  def convert_utf8(document):
@@ -341,33 +351,92 @@ def read_unicodesymbols():
              except:
                  pass
      fp.close()
-
      return spec_chars
  
  
+def revert_unicode_line(document, i, insets, spec_chars, replacement_character = '???'):
+    # Define strings to start and end ERT and math insets
+    ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s\n\\backslash\n' % document.default_layout
+    ert_outro='\n\\end_layout\n\n\\end_inset\n'
+    math_intro='\n\\begin_inset Formula $'
+    math_outro='$\n\\end_inset'
+
+    mod_line = u''
+    if i and not is_inset_line(document, i-1):
+        last_char = document.body[i - 1][-1:]
+    else:
+        last_char = ''
+
+    line = document.body[i]
+    for character in line:
+        try:
+            # Try to write the character
+            dummy = character.encode(document.encoding)
+            mod_line += character
+            last_char = character
+        except:
+            # Try to replace with ERT/math inset
+            if spec_chars.has_key(character):
+                command = spec_chars[character][0] # the command to replace unicode
+                flag1 = spec_chars[character][1]
+                flag2 = spec_chars[character][2]
+                if flag1.find('combining') > -1 or flag2.find('combining') > -1:
+                    # We have a character that should be combined with the previous
+                    command += '{' + last_char + '}'
+                    # Remove the last character. Ignore if it is whitespace
+                    if len(last_char.rstrip()):
+                        # last_char was found and is not whitespace
+                        if mod_line:
+                            mod_line = mod_line[:-1]
+                        else: # last_char belongs to the last line
+                            document.body[i-1] = document.body[i-1][:-1]
+                    else:
+                        # The last character was replaced by a command. For now it is
+                        # ignored. This could be handled better.
+                        pass
+                if command[0:2] == '\\\\':
+                    if command[2:12]=='ensuremath':
+                        if insets and insets[-1] == "ERT":
+                            # math in ERT
+                            command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash\n')
+                            command = command.replace('}', '$\n')
+                        elif not insets or insets[-1] != "Formula":
+                            # add a math inset with the replacement character
+                            command = command.replace('\\\\ensuremath{\\', math_intro)
+                            command = command.replace('}', math_outro)
+                        else:
+                            # we are already in a math inset
+                            command = command.replace('\\\\ensuremath{\\', '')
+                            command = command.replace('}', '')
+                    else:
+                        if insets and insets[-1] == "Formula":
+                            # avoid putting an ERT in a math; instead put command as text
+                            command = command.replace('\\\\', '\mathrm{')
+                            command = command + '}'
+                        elif not insets or insets[-1] != "ERT":
+                            # add an ERT inset with the replacement character
+                            command = command.replace('\\\\', ert_intro)
+                            command = command + ert_outro
+                        else:
+                            command = command.replace('\\\\', '\n\\backslash\n')
+                    last_char = '' # indicate that the character should not be removed
+                mod_line += command
+            else:
+                # Replace with replacement string
+                mod_line += replacement_character
+    return mod_line
+
+
  def revert_unicode(document):
      '''Transform unicode characters that can not be written using the
  document encoding to commands according to the unicodesymbols
  file. Characters that can not be replaced by commands are replaced by
  a replacement string.  Flags other than 'combining' are currently not
  implemented.'''
-
-    replacement_character = '???'
      spec_chars = read_unicodesymbols()
-
-    # Define strings to start and end ERT and math insets
-    ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s\n\\backslash\n' % document.default_layout
-    ert_outro='\n\\end_layout\n\n\\end_inset\n'
-    math_intro='\n\\begin_inset Formula $'
-    math_outro='$\n\\end_inset'
-    # Find unicode characters and replace them
-    in_ert = False # flag set to 1 if in ERT inset
-    in_math = False # flag set to 1 if in math inset
      insets = [] # list of active insets
  
-    # Go through the file to capture all combining characters
-    last_char = '' # to store the previous character
-
+    # Go through the document to capture all combining characters
      i = 0
      while i < len(document.body):
          line = document.body[i]
@@ -381,67 +450,10 @@ implemented.'''
          try:
              # If all goes well the line is written here
              dummy = line.encode(document.encoding)
-            last_char = line[-1]
              i += 1
          except:
              # Error, some character(s) in the line need to be replaced
-            mod_line = u''
-            for character in line:
-                try:
-                    # Try to write the character
-                    dummy = character.encode(document.encoding)
-                    mod_line += character
-                    last_char = character
-                except:
-                    # Try to replace with ERT/math inset
-                    if spec_chars.has_key(character):
-                        command = spec_chars[character][0] # the command to replace unicode
-                        flag1 = spec_chars[character][1]
-                        flag2 = spec_chars[character][2]
-                        if flag1.find('combining') > -1 or flag2.find('combining') > -1:
-                            # We have a character that should be combined with the previous
-                            command += '{' + last_char + '}'
-                            # Remove the last character. Ignore if it is whitespace
-                            if len(last_char.rstrip()):
-                                # last_char was found and is not whitespace
-                                if mod_line:
-                                    mod_line = mod_line[:-1]
-                                else: # last_char belongs to the last line
-                                    document.body[i-1] = document.body[i-1][:-1]
-                            else:
-                                # The last character was replaced by a command. For now it is
-                                # ignored. This could be handled better.
-                                pass
-                        if command[0:2] == '\\\\':
-                            if command[2:12]=='ensuremath':
-                                if insets[-1] == "ERT":
-                                    # math in ERT
-                                    command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash\n')
-                                    command = command.replace('}', '$\n')
-                                elif insets[-1] != "Formula":
-                                    # add a math inset with the replacement character
-                                    command = command.replace('\\\\ensuremath{\\', math_intro)
-                                    command = command.replace('}', math_outro)
-                                else:
-                                    # we are already in a math inset
-                                    command = command.replace('\\\\ensuremath{\\', '')
-                                    command = command.replace('}', '')
-                            else:
-                                if insets[-1] == "Formula":
-                                    # avoid putting an ERT in a math; instead put command as text
-                                    command = command.replace('\\\\', '\mathrm{')
-                                    command = command + '}'
-                                elif insets[-1] != "ERT":
-                                    # add an ERT inset with the replacement character
-                                    command = command.replace('\\\\', ert_intro)
-                                    command = command + ert_outro
-                                else:
-                                    command = command.replace('\\\\', '\n\\backslash\n')
-                            last_char = '' # indicate that the character should not be removed
-                        mod_line += command
-                    else:
-                        # Replace with replacement string
-                        mod_line += replacement_character
+            mod_line = revert_unicode_line(document, i, insets, spec_chars)
              document.body[i:i+1] = mod_line.split('\n')
              i += len(mod_line.split('\n'))
  
@@ -641,17 +653,17 @@ def convert_commandparams(document):
              if commandparams_info[name][0] == "":
                  document.warning("Ignoring invalid option `%s' of command `%s'." % (option1, name))
              else:
-                lines.append('%s "%s"' % (commandparams_info[name][0], option1.replace('"', '\\"')))
+                lines.append('%s "%s"' % (commandparams_info[name][0], option1.replace('\\', '\\\\').replace('"', '\\"')))
          if option2 != "":
              if commandparams_info[name][1] == "":
                  document.warning("Ignoring invalid second option `%s' of command `%s'." % (option2, name))
              else:
-                lines.append('%s "%s"' % (commandparams_info[name][1], option2.replace('"', '\\"')))
+                lines.append('%s "%s"' % (commandparams_info[name][1], option2.replace('\\', '\\\\').replace('"', '\\"')))
          if argument != "":
              if commandparams_info[name][2] == "":
                  document.warning("Ignoring invalid argument `%s' of command `%s'." % (argument, name))
              else:
-                lines.append('%s "%s"' % (commandparams_info[name][2], argument.replace('"', '\\"')))
+                lines.append('%s "%s"' % (commandparams_info[name][2], argument.replace('\\', '\\\\').replace('"', '\\"')))
          document.body[i:i+1] = lines
          i = i + 1
  
@@ -678,13 +690,13 @@ def revert_commandparams(document):
                      preview_line = document.body[k]
                  elif (commandparams_info[name][0] != "" and
                        pname == commandparams_info[name][0]):
-                    option1 = pvalue.strip('"').replace('\\"', '"')
+                    option1 = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
                  elif (commandparams_info[name][1] != "" and
                        pname == commandparams_info[name][1]):
-                    option2 = pvalue.strip('"').replace('\\"', '"')
+                    option2 = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
                  elif (commandparams_info[name][2] != "" and
                        pname == commandparams_info[name][2]):
-                    argument = pvalue.strip('"').replace('\\"', '"')
+                    argument = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
              elif document.body[k].strip() != "":
                  document.warning("Ignoring unknown contents `%s' in command inset %s." % (document.body[k], name))
          if name == "bibitem":
@@ -1138,6 +1150,33 @@ def convert_accent(document):
          i += 3
  
  
+def is_inset_line(document, i):
+    """ Line i of body has an inset """
+    if document.body[i][:1] == '\\':
+        return True
+    last_tokens = "".join(document.body[i].split()[-2:])
+    return last_tokens.find('\\') != -1
+
+
+# A wrapper around normalize that handles special cases (cf. bug 3313)
+def normalize(form, text):
+    # do not normalize OHM, ANGSTROM
+    keep_characters = [0x2126,0x212b]
+    result = ''
+    convert = ''
+    for i in text:
+        if ord(i) in keep_characters:
+            if len(convert) > 0:
+                result = result + unicodedata.normalize(form, convert)
+                convert = ''
+            result = result + i
+        else:
+            convert = convert + i
+    if len(convert) > 0:
+        result = result + unicodedata.normalize(form, convert)
+    return result
+
+
  def revert_accent(document):
      inverse_accent_map = {}
      for k in accent_map:
@@ -1153,36 +1192,35 @@ def revert_accent(document):
      # words before unicode normalization.
      # We do this only if the next line starts with an accent, otherwise we
      # would create things like '\begin_inset ERTstatus'.
-    numberoflines = len(document.body)
-    for i in range(numberoflines-1):
+    for i in range(len(document.body) - 1):
          if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
              continue
-        if (document.body[i+1][0] in inverse_accent_map):
+        if (document.body[i+1][0] in inverse_accent_map and not is_inset_line(document, i)):
              # the last character of this line and the first of the next line
-            # form probably a surrogate pair.
+            # probably form a surrogate pair; inline insets are excluded (second part of the test)
              while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '):
                  document.body[i] += document.body[i+1][0]
                  document.body[i+1] = document.body[i+1][1:]
  
      # Normalize to "Normal form D" (NFD, also known as canonical decomposition).
      # This is needed to catch all accented characters.
-    for i in range(numberoflines):
+    for i in range(len(document.body)):
          # Unfortunately we have a mixture of unicode strings and plain strings,
          # because we never use u'xxx' for string literals, but 'xxx'.
          # Therefore we may have to try two times to normalize the data.
          try:
-            document.body[i] = unicodedata.normalize("NFD", document.body[i])
+            document.body[i] = normalize("NFD", document.body[i])
          except TypeError:
-            document.body[i] = unicodedata.normalize("NFD", unicode(document.body[i], 'utf-8'))
+            document.body[i] = normalize("NFD", unicode(document.body[i], 'utf-8'))
  
      # Replace accented characters with InsetLaTeXAccent
      # Do not convert characters that can be represented in the chosen
      # encoding.
      encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
      lang_re = re.compile(r"^\\lang\s(\S+)")
+
      i = 0
      while i < len(document.body):
-
          if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '':
              # Track the encoding of the current line
              result = lang_re.match(document.body[i])
@@ -1215,10 +1253,7 @@ def revert_accent(document):
                      if j < len(document.body[i]) - 1:
                          document.body.insert(i+1, document.body[i][j+1:])
                      # Delete the accented character
-                    if j > 0:
-                        document.body[i] = document.body[i][:j-1]
-                    else:
-                        document.body[i] = u''
+                    document.body[i] = document.body[i][:j]
                      # Finally add the InsetLaTeXAccent
                      document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
                      break
@@ -1231,24 +1266,21 @@ def revert_accent(document):
                      accented_char = inverse_accented_map[accented_char]
                  accent = document.body[i][j]
                  try:
-                    dummy = unicodedata.normalize("NFC", accented_char + accent).encode(encoding_stack[-1])
+                    dummy = normalize("NFC", accented_char + accent).encode(encoding_stack[-1])
                  except UnicodeEncodeError:
                      # Insert the rest of the line as new line
                      if j < len(document.body[i]) - 1:
                          document.body.insert(i+1, document.body[i][j+1:])
                      # Delete the accented characters
-                    if j > 1:
-                        document.body[i] = document.body[i][:j-2]
-                    else:
-                        document.body[i] = u''
+                    document.body[i] = document.body[i][:j-1]
                      # Finally add the InsetLaTeXAccent
                      document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
                      break
          i = i + 1
  
      # Normalize to "Normal form C" (NFC, pre-composed characters) again
-    for i in range(numberoflines):
-        document.body[i] = unicodedata.normalize("NFC", document.body[i])
+    for i in range(len(document.body)):
+        document.body[i] = normalize("NFC", document.body[i])
  
  
  def normalize_font_whitespace_259(document):
@@ -1790,7 +1822,9 @@ after label
                                        '',
                                        r'\backslash',
                                        r'begin{lstlisting}%s' % params,
-                                      r'\end_layout'
+                                      r'\end_layout',
+                                      '',
+                                      r'\begin_layout %s' % document.default_layout,
                                      ] + document.body[k : j - 1] + \
                                       ['',
                                        r'\begin_layout %s' % document.default_layout,