(Atempt to) Fix bug 4048

author José Matox <jamatos@lyx.org>

Fri, 20 Jul 2007 16:02:24 +0000 (16:02 +0000)

committer José Matox <jamatos@lyx.org>

Fri, 20 Jul 2007 16:02:24 +0000 (16:02 +0000)
author José Matox <jamatos@lyx.org>
Fri, 20 Jul 2007 16:02:24 +0000 (16:02 +0000)
committer José Matox <jamatos@lyx.org>
Fri, 20 Jul 2007 16:02:24 +0000 (16:02 +0000)
diff --git a/lib/lyx2lyx/lyx_1_5.py b/lib/lyx2lyx/lyx_1_5.py

index 9cd4ba5b803212d79e8c9421ecbac7ad47c883e9..dc0c99eae91bbe326cf76a4147501da1564c6ae1 100644 (file)
--- a/lib/lyx2lyx/lyx_1_5.py
+++ b/lib/lyx2lyx/lyx_1_5.py
@@ -253,8 +253,12 @@ necessary parsing in modern formats than in ancient ones.
      insets = []
      lang_re = re.compile(r"^\\lang\s(\S+)")
      inset_re = re.compile(r"^\\begin_inset\s(\S+)")
+    if not forward: # no need to read file unless we are reverting
+        spec_chars = read_unicodesymbols()
+
      if document.inputencoding == "auto" or document.inputencoding == "default":
-        for i in range(len(document.body)):
+        i = 0
+        while i < len(document.body):
              result = lang_re.match(document.body[i])
              if result:
                  language = result.group(1)
@@ -297,13 +301,20 @@ necessary parsing in modern formats than in ancient ones.
                      # with the correct encoding.
                      document.body[i] = orig.decode(encoding_stack[-1])
                  else:
-                    # Convert unicode to the 8bit string that will be written
-                    # to the file with the correct encoding.
-                    orig = document.body[i].encode(encoding_stack[-1])
-                    # Convert the 8bit string that will be written to the
-                    # file to fake unicode with the encoding that will later
-                    # be used when writing to the file.
-                    document.body[i] = orig.decode(document.encoding)
+                    try:
+                        # Convert unicode to the 8bit string that will be written
+                        # to the file with the correct encoding.
+                        orig = document.body[i].encode(encoding_stack[-1])
+                        # Convert the 8bit string that will be written to the
+                        # file to fake unicode with the encoding that will later
+                        # be used when writing to the file.
+                        document.body[i] = orig.decode(document.encoding)
+                    except:
+                        last_char = document.body[i-1][-1]
+                        mod_line, last_char = revert_unicode_line(document, i, last_char, insets, spec_chars)
+                        document.body[i:i+1] = mod_line.split('\n')
+                        i += len(mod_line.split('\n')) - 1
+            i += 1
  
  
  def convert_utf8(document):
@@ -341,33 +352,88 @@ def read_unicodesymbols():
              except:
                  pass
      fp.close()
-
      return spec_chars
  
  
+def revert_unicode_line(document, i, last_char, insets, spec_chars, replacement_character = '???'):
+    # Define strings to start and end ERT and math insets
+    ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s\n\\backslash\n' % document.default_layout
+    ert_outro='\n\\end_layout\n\n\\end_inset\n'
+    math_intro='\n\\begin_inset Formula $'
+    math_outro='$\n\\end_inset'
+
+    mod_line = u''
+    line = document.body[i]
+    for character in line:
+        try:
+            # Try to write the character
+            dummy = character.encode(document.encoding)
+            mod_line += character
+            last_char = character
+        except:
+            # Try to replace with ERT/math inset
+            if spec_chars.has_key(character):
+                command = spec_chars[character][0] # the command to replace unicode
+                flag1 = spec_chars[character][1]
+                flag2 = spec_chars[character][2]
+                if flag1.find('combining') > -1 or flag2.find('combining') > -1:
+                    # We have a character that should be combined with the previous
+                    command += '{' + last_char + '}'
+                    # Remove the last character. Ignore if it is whitespace
+                    if len(last_char.rstrip()):
+                        # last_char was found and is not whitespace
+                        if mod_line:
+                            mod_line = mod_line[:-1]
+                        else: # last_char belongs to the last line
+                            document.body[i-1] = document.body[i-1][:-1]
+                    else:
+                        # The last character was replaced by a command. For now it is
+                        # ignored. This could be handled better.
+                        pass
+                if command[0:2] == '\\\\':
+                    if command[2:12]=='ensuremath':
+                        if insets and insets[-1] == "ERT":
+                            # math in ERT
+                            command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash\n')
+                            command = command.replace('}', '$\n')
+                        elif not insets or insets[-1] != "Formula":
+                            # add a math inset with the replacement character
+                            command = command.replace('\\\\ensuremath{\\', math_intro)
+                            command = command.replace('}', math_outro)
+                        else:
+                            # we are already in a math inset
+                            command = command.replace('\\\\ensuremath{\\', '')
+                            command = command.replace('}', '')
+                    else:
+                        if insets and insets[-1] == "Formula":
+                            # avoid putting an ERT in a math; instead put command as text
+                            command = command.replace('\\\\', '\mathrm{')
+                            command = command + '}'
+                        elif not insets or insets[-1] != "ERT":
+                            # add an ERT inset with the replacement character
+                            command = command.replace('\\\\', ert_intro)
+                            command = command + ert_outro
+                        else:
+                            command = command.replace('\\\\', '\n\\backslash\n')
+                    last_char = '' # indicate that the character should not be removed
+                mod_line += command
+            else:
+                # Replace with replacement string
+                mod_line += replacement_character
+    return mod_line, last_char
+
+
  def revert_unicode(document):
      '''Transform unicode characters that can not be written using the
  document encoding to commands according to the unicodesymbols
  file. Characters that can not be replaced by commands are replaced by
  an replacement string.  Flags other than 'combined' are currently not
  implemented.'''
-
-    replacement_character = '???'
      spec_chars = read_unicodesymbols()
-
-    # Define strings to start and end ERT and math insets
-    ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s\n\\backslash\n' % document.default_layout
-    ert_outro='\n\\end_layout\n\n\\end_inset\n'
-    math_intro='\n\\begin_inset Formula $'
-    math_outro='$\n\\end_inset'
-    # Find unicode characters and replace them
-    in_ert = False # flag set to 1 if in ERT inset
-    in_math = False # flag set to 1 if in math inset
      insets = [] # list of active insets
-
-    # Go through the file to capture all combining characters
      last_char = '' # to store the previous character
  
+    # Go through the document to capture all combining characters
      i = 0
      while i < len(document.body):
          line = document.body[i]
@@ -385,63 +451,7 @@ implemented.'''
              i += 1
          except:
              # Error, some character(s) in the line need to be replaced
-            mod_line = u''
-            for character in line:
-                try:
-                    # Try to write the character
-                    dummy = character.encode(document.encoding)
-                    mod_line += character
-                    last_char = character
-                except:
-                    # Try to replace with ERT/math inset
-                    if spec_chars.has_key(character):
-                        command = spec_chars[character][0] # the command to replace unicode
-                        flag1 = spec_chars[character][1]
-                        flag2 = spec_chars[character][2]
-                        if flag1.find('combining') > -1 or flag2.find('combining') > -1:
-                            # We have a character that should be combined with the previous
-                            command += '{' + last_char + '}'
-                            # Remove the last character. Ignore if it is whitespace
-                            if len(last_char.rstrip()):
-                                # last_char was found and is not whitespace
-                                if mod_line:
-                                    mod_line = mod_line[:-1]
-                                else: # last_char belongs to the last line
-                                    document.body[i-1] = document.body[i-1][:-1]
-                            else:
-                                # The last character was replaced by a command. For now it is
-                                # ignored. This could be handled better.
-                                pass
-                        if command[0:2] == '\\\\':
-                            if command[2:12]=='ensuremath':
-                                if insets[-1] == "ERT":
-                                    # math in ERT
-                                    command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash\n')
-                                    command = command.replace('}', '$\n')
-                                elif insets[-1] != "Formula":
-                                    # add a math inset with the replacement character
-                                    command = command.replace('\\\\ensuremath{\\', math_intro)
-                                    command = command.replace('}', math_outro)
-                                else:
-                                    # we are already in a math inset
-                                    command = command.replace('\\\\ensuremath{\\', '')
-                                    command = command.replace('}', '')
-                            else:
-                                if insets[-1] == "Formula":
-                                    # avoid putting an ERT in a math; instead put command as text
-                                    command = command.replace('\\\\', '\mathrm{')
-                                    command = command + '}'
-                                elif insets[-1] != "ERT":
-                                    # add an ERT inset with the replacement character
-                                    command = command.replace('\\\\', ert_intro)
-                                    command = command + ert_outro
-                                else:
-                                    command = command.replace('\\\\', '\n\\backslash\n')
-                            last_char = '' # indicate that the character should not be removed
-                        mod_line += command
-                    else:
-                        # Replace with replacement string
-                        mod_line += replacement_character
+            mod_line, last_char = revert_unicode_line(document, i, last_char, insets, spec_chars)
              document.body[i:i+1] = mod_line.split('\n')
              i += len(mod_line.split('\n'))
author	José Matox <jamatos@lyx.org>
	Fri, 20 Jul 2007 16:02:24 +0000 (16:02 +0000)
committer	José Matox <jamatos@lyx.org>
	Fri, 20 Jul 2007 16:02:24 +0000 (16:02 +0000)