Fix a few edge cases in the lyx2lyx conversion to format 249

[lyx.git] / lib / lyx2lyx / lyx_1_5.py
diff --git a/lib/lyx2lyx/lyx_1_5.py b/lib/lyx2lyx/lyx_1_5.py

index 3374d8567f229dd571c19a0476fa62feb0742c7e..d874ef3101bb27a4b9c574833ecb1aaeecec3b34 100644 (file)
--- a/lib/lyx2lyx/lyx_1_5.py
+++ b/lib/lyx2lyx/lyx_1_5.py
@@ -246,10 +246,13 @@ document.encoding must be set to the old value (format 248) in both cases.
  We do this here and not in LyX.py because it is far easier to do the
  necessary parsing in modern formats than in ancient ones.
  """
+    inset_types = ["Foot", "Note"]
      if document.cjk_encoding != '':
          return
      encoding_stack = [document.encoding]
+    inset_stack = []
      lang_re = re.compile(r"^\\lang\s(\S+)")
+    inset_re = re.compile(r"^\\begin_inset\s(\S+)")
      if document.inputencoding == "auto" or document.inputencoding == "default":
          for i in range(len(document.body)):
              result = lang_re.match(document.body[i])
@@ -264,7 +267,11 @@ necessary parsing in modern formats than in ancient ones.
                      encoding_stack[-1] = lang[language][3]
              elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
                  document.warning("Adding nested encoding %s." % encoding_stack[-1], 3)
-                encoding_stack.append(encoding_stack[-1])
+                if len(inset_stack) > 0 and inset_stack[-1] in inset_types:
+                    from lyx2lyx_lang import lang
+                    encoding_stack.append(lang[document.language][3])
+                else:
+                    encoding_stack.append(encoding_stack[-1])
              elif find_token(document.body, "\\end_layout", i, i + 1) == i:
                  document.warning("Removing nested encoding %s." % encoding_stack[-1], 3)
                  if len(encoding_stack) == 1:
@@ -272,6 +279,15 @@ necessary parsing in modern formats than in ancient ones.
                      document.warning("Malformed LyX document: Unexpected `\\end_layout'.")
                  else:
                      del encoding_stack[-1]
+            elif find_token(document.body, "\\begin_inset", i, i + 1) == i:
+                inset_result = inset_re.match(document.body[i])
+                if inset_result:
+                    inset_type = inset_result.group(1)
+                    inset_stack.append(inset_type)
+                else: 
+                    inset_stack.append("")
+            elif find_token(document.body, "\\end_inset", i, i + 1) == i:
+                del inset_stack[-1]
              if encoding_stack[-1] != document.encoding:
                  if forward:
                      # This line has been incorrectly interpreted as if it was
@@ -1043,7 +1059,8 @@ def revert_accent(document):
      # encoding.
      encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
      lang_re = re.compile(r"^\\lang\s(\S+)")
-    for i in range(len(document.body)):
+    i = 0
+    while i < len(document.body):
  
          if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '':
              # Track the encoding of the current line
@@ -1075,7 +1092,7 @@ def revert_accent(document):
                  except UnicodeEncodeError:
                      # Insert the rest of the line as new line
                      if j < len(document.body[i]) - 1:
-                        document.body[i+1:i+1] = document.body[i][j+1:]
+                        document.body.insert(i+1, document.body[i][j+1:])
                      # Delete the accented character
                      if j > 0:
                          document.body[i] = document.body[i][:j-1]
@@ -1097,7 +1114,7 @@ def revert_accent(document):
                  except UnicodeEncodeError:
                      # Insert the rest of the line as new line
                      if j < len(document.body[i]) - 1:
-                        document.body[i+1:i+1] = document.body[i][j+1:]
+                        document.body.insert(i+1, document.body[i][j+1:])
                      # Delete the accented characters
                      if j > 1:
                          document.body[i] = document.body[i][:j-2]
@@ -1106,6 +1123,8 @@ def revert_accent(document):
                      # Finally add the InsetLaTeXAccent
                      document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
                      break
+        i = i + 1
+
      # Normalize to "Normal form C" (NFC, pre-composed characters) again
      for i in range(numberoflines):
          document.body[i] = unicodedata.normalize("NFC", document.body[i])
@@ -1616,7 +1635,7 @@ after label
          inlinecode = ''
          # looking for the oneline code for lstinline
          inlinecode = document.body[find_end_of_layout(document.body, 
-            find_token(document.body, '\\begin_layout Standard', i + 1) +1 ) - 1]
+            find_token(document.body,  '\\begin_layout %s' % document.default_layout, i + 1) +1 ) - 1]
          if len(caption) > 0:
              if len(params) == 0:
                  params = 'caption={%s}' % caption
@@ -1633,7 +1652,7 @@ after label
          if inline == 'true':
              document.body[i:(j+1)] = [r'\begin_inset ERT',
                                        'status %s' % status,
-                                      r'\begin_layout Standard',
+                                      r'\begin_layout %s' % document.default_layout,
                                        '', 
                                        '',
                                        r'\backslash',
@@ -1645,7 +1664,7 @@ after label
              document.body[i: j+1] =  [r'\begin_inset ERT',
                                        'status %s' % status,
                                        '',
-                                      r'\begin_layout Standard',
+                                      r'\begin_layout %s' % document.default_layout,
                                        '',
                                        '',
                                        r'\backslash',
@@ -1653,7 +1672,7 @@ after label
                                        r'\end_layout'
                                      ] + document.body[k : j - 1] + \
                                       ['',
-                                      r'\begin_layout Standard',
+                                      r'\begin_layout %s' % document.default_layout,
                                        '',
                                        r'\backslash',
                                        'end{lstlisting}',
@@ -1704,7 +1723,7 @@ lstinputlisting{file}[opt]
          document.body[i : j + 1] = [r'\begin_inset ERT',
                                      'status open',
                                      '',
-                                    r'\begin_layout Standard',
+                                    r'\begin_layout %s' % document.default_layout,
                                      '',
                                      '',
                                      r'\backslash',
@@ -1769,6 +1788,7 @@ def convert_ext_font_sizes(document):
      else:
          del document.header[i]
  
+
  def revert_separator_layout(document):
      r'''Revert --Separator-- to a lyx note
  From
@@ -1803,11 +1823,11 @@ something
          if j == -1:
              # this should not happen
              break
-        document.body[i : j + 1] = [r'\begin_layout Standard',
+        document.body[i : j + 1] = [r'\begin_layout %s' % document.default_layout,
                                      r'\begin_inset Note Note',
                                      'status open',
                                      '',
-                                    r'\begin_layout Standard',
+                                    r'\begin_layout %s' % document.default_layout,
                                      'Separate Environment',
                                      r'\end_layout',
                                      '',
@@ -1817,6 +1837,7 @@ something
                                      r'\end_layout'
                                      ]
  
+
  def convert_arabic (document):
      if document.language == "arabic":
          document.language = "arabic_arabtex"
@@ -1830,7 +1851,8 @@ def convert_arabic (document):
              # change the language name
              document.body[i] = '\lang arabic_arabtex'
          i = i + 1
-       
+
+
  def revert_arabic (document):
      if document.language == "arabic_arabtex":
          document.language = "arabic"
@@ -1845,13 +1867,11 @@ def revert_arabic (document):
              document.body[i] = '\lang arabic'
          i = i + 1
  
-def revert_unicode(document):
-    '''Transform unicode symbols according to the unicode list.
-Preamble flags are not implemented.
-Combination characters are currently ignored.
-Forced output is currently not enforced'''
-    pathname = os.path.dirname(sys.argv[0])
-    fp = open(pathname.strip('lyx2lyx') + 'unicodesymbols','r')
+
+def read_unicodesymbols():
+    " Read the unicodesymbols list of unicode characters and corresponding commands."
+    pathname = os.path.abspath(os.path.dirname(sys.argv[0]))
+    fp = open(os.path.join(pathname.strip('lyx2lyx'), 'unicodesymbols'))
      spec_chars = {}
      for line in fp.readlines():
          if line[0] != '#':
@@ -1859,82 +1879,133 @@ Forced output is currently not enforced'''
              line=line.replace('" ',' ') # remove all quotation marks with spaces after
              line=line.replace(r'\"','"') # replace \" by " (for characters with diaeresis)
              try:
-                # flag1 and flag2 are preamble & flags
-                # currently NOT implemented
+                # flag1 and flag2 are preamble and other flags
                  [ucs4,command,flag1,flag2] =line.split(None,3)
                  spec_chars[unichr(eval(ucs4))] = [command, flag1, flag2]
              except:
                  pass
      fp.close()
+
+    return spec_chars
+
+
+def revert_unicode(document):
+    '''Transform unicode characters that can not be written using the
+document encoding to commands according to the unicodesymbols
+file. Characters that can not be replaced by commands are replaced by
+an replacement string.  Flags other than 'combined' are currently not
+implemented.'''
+
+    replacement_character = '???'
+    spec_chars = read_unicodesymbols()
+
      # Define strings to start and end ERT and math insets
-    ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout Standard\n\\backslash\n'
-    ert_outro='\n\\end_layout\n\n\\end_inset\n\n'
+    ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s\n\\backslash' % document.default_layout
+    ert_outro='\n\\end_layout\n\n\\end_inset\n'
      math_intro='\n\\begin_inset Formula $'
-    math_outro='$\n\\end_inset\n'
+    math_outro='$\n\\end_inset'
      # Find unicode characters and replace them
-    in_ert = 0 # flag set to 1 if in ERT inset
-    in_math = 0 # flag set to 1 if in math inset
+    in_ert = False # flag set to 1 if in ERT inset
+    in_math = False # flag set to 1 if in math inset
      insets = [] # list of active insets
-    for i, current_line in enumerate(document.body):
-        if current_line.find('\\begin_inset') > -1:
+
+    # Go through the file to capture all combining characters
+    last_char = '' # to store the previous character
+
+    i = 0
+    while i < len(document.body):
+        line = document.body[i]
+        # Check for insets
+        if line.find('\\begin_inset') > -1:
              # check which inset to start
-            if current_line.find('\\begin_inset ERT') > -1:
-                in_ert = 1
+            if line.find('\\begin_inset ERT') > -1:
+                in_ert = True
                  insets.append('ert')
-            elif current_line.find('\\begin_inset Formula') > -1:
-                in_math = 1
+            elif line.find('\\begin_inset Formula') > -1:
+                in_math = True
                  insets.append('math')
              else:
                  insets.append('other')
-        if current_line.find('\\end_inset') > -1:
+        if line.find('\\end_inset') > -1:
              # check which inset to end
              try:
                  cur_inset = insets.pop()
                  if cur_inset == 'ert':
-                    in_ert = 0
+                    in_ert = False
                  elif cur_inset == 'math':
-                    in_math = 0
+                    in_math = False
                  else:
                      pass # end of other inset
              except:
                  pass # inset list was empty (for some reason)
-        current_line=''; # clear to have as container for modified line
-        for j in range(len(document.body[i])):
-            if spec_chars.has_key(document.body[i][j]):
-                flags = spec_chars[document.body[i][j]][1] + spec_chars[document.body[i][j]][2]
-                if flags.find('combining') > -1:
-                    command = ''
-                else:
-                    command = spec_chars[document.body[i][j]][0]; # the command to replace unicode
-                    if command[0:2] == '\\\\':
-                        if command[2:12]=='ensuremath':
-                            if in_ert == 1:
-                                # math in ERT
-                                command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash\n')
-                                command = command.replace('}', '$\n')
-                            elif in_math == 0:
-                                # add a math inset with the replacement character
-                                command = command.replace('\\\\ensuremath{\\', math_intro)
-                                command = command.replace('}', math_outro)
+        
+        # Try to write the line
+        try:
+            # If all goes well the line is written here
+            dummy = line.encode(document.encoding)
+            last_char = line[-1]
+            i += 1
+        except:
+            # Error, some character(s) in the line need to be replaced
+            mod_line = u''
+            for character in line:
+                try:
+                    # Try to write the character
+                    dummy = character.encode(document.encoding)
+                    mod_line += character
+                    last_char = character
+                except:
+                    # Try to replace with ERT/math inset
+                    if spec_chars.has_key(character):
+                        command = spec_chars[character][0] # the command to replace unicode
+                        flag1 = spec_chars[character][1]
+                        flag2 = spec_chars[character][2]
+                        if flag1.find('combining') > -1 or flag2.find('combining') > -1:
+                            # We have a character that should be combined with the previous
+                            command += '{' + last_char + '}'
+                            # Remove the last character. Ignore if it is whitespace
+                            if len(last_char.rstrip()):
+                                # last_char was found and is not whitespace
+                                if mod_line:
+                                    mod_line = mod_line[:-1]
+                                else: # last_char belongs to the last line
+                                    document.body[i-1] = document.body[i-1][:-1]
                              else:
-                                # we are already in a math inset
-                                command = command.replace('\\\\ensuremath{\\', '')
-                                command = command.replace('}', '')
-                        else:
-                            if in_math == 1:
-                                # avoid putting an ERT in a math; instead put command as text
-                                command = command.replace('\\\\', '\mathrm{')
-                                command = command + '}'
-                            elif in_ert == 0:
-                                # add an ERT inset with the replacement character
-                                command = command.replace('\\\\', ert_intro)
-                                command = command + ert_outro
+                                # The last character was replaced by a command. For now it is
+                                # ignored. This could be handled better.
+                                pass
+                        if command[0:2] == '\\\\':
+                            if command[2:12]=='ensuremath':
+                                if in_ert:
+                                    # math in ERT
+                                    command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash')
+                                    command = command.replace('}', '$\n')
+                                elif not in_math:
+                                    # add a math inset with the replacement character
+                                    command = command.replace('\\\\ensuremath{\\', math_intro)
+                                    command = command.replace('}', math_outro)
+                                else:
+                                    # we are already in a math inset
+                                    command = command.replace('\\\\ensuremath{\\', '')
+                                    command = command.replace('}', '')
                              else:
-                                command = command.replace('\\\\', '\n\\backslash\n')
-                current_line = current_line + command
-            else:
-                current_line = current_line + document.body[i][j]
-        document.body[i] = current_line
+                                if in_math:
+                                    # avoid putting an ERT in a math; instead put command as text
+                                    command = command.replace('\\\\', '\mathrm{')
+                                    command = command + '}'
+                                elif not in_ert:
+                                    # add an ERT inset with the replacement character
+                                    command = command.replace('\\\\', ert_intro)
+                                    command = command + ert_outro
+                                else:
+                                    command = command.replace('\\\\', '\n\\backslash')
+                            last_char = '' # indicate that the character should not be removed
+                        mod_line += command
+                    else:
+                        # Replace with replacement string
+                        mod_line += replacement_character
+            document.body[i:i+1] = mod_line.split('\n')
+            i += len(mod_line.split('\n'))
  
  
  ##