Remove the FIXME.

[lyx.git] / lib / lyx2lyx / lyx_1_5.py
diff --git a/lib/lyx2lyx/lyx_1_5.py b/lib/lyx2lyx/lyx_1_5.py

index d874ef3101bb27a4b9c574833ecb1aaeecec3b34..05fd93fdc23688b55898a09e5aa18a2d13812188 100644 (file)
--- a/lib/lyx2lyx/lyx_1_5.py
+++ b/lib/lyx2lyx/lyx_1_5.py
@@ -250,11 +250,15 @@ necessary parsing in modern formats than in ancient ones.
      if document.cjk_encoding != '':
          return
      encoding_stack = [document.encoding]
-    inset_stack = []
+    insets = []
      lang_re = re.compile(r"^\\lang\s(\S+)")
      inset_re = re.compile(r"^\\begin_inset\s(\S+)")
+    if not forward: # no need to read file unless we are reverting
+        spec_chars = read_unicodesymbols()
+
      if document.inputencoding == "auto" or document.inputencoding == "default":
-        for i in range(len(document.body)):
+        i = 0
+        while i < len(document.body):
              result = lang_re.match(document.body[i])
              if result:
                  language = result.group(1)
@@ -267,7 +271,7 @@ necessary parsing in modern formats than in ancient ones.
                      encoding_stack[-1] = lang[language][3]
              elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
                  document.warning("Adding nested encoding %s." % encoding_stack[-1], 3)
-                if len(inset_stack) > 0 and inset_stack[-1] in inset_types:
+                if len(insets) > 0 and insets[-1] in inset_types:
                      from lyx2lyx_lang import lang
                      encoding_stack.append(lang[document.language][3])
                  else:
@@ -282,12 +286,11 @@ necessary parsing in modern formats than in ancient ones.
              elif find_token(document.body, "\\begin_inset", i, i + 1) == i:
                  inset_result = inset_re.match(document.body[i])
                  if inset_result:
-                    inset_type = inset_result.group(1)
-                    inset_stack.append(inset_type)
+                    insets.append(inset_result.group(1))
                  else: 
-                    inset_stack.append("")
+                    insets.append("")
              elif find_token(document.body, "\\end_inset", i, i + 1) == i:
-                del inset_stack[-1]
+                del insets[-1]
              if encoding_stack[-1] != document.encoding:
                  if forward:
                      # This line has been incorrectly interpreted as if it was
@@ -298,13 +301,19 @@ necessary parsing in modern formats than in ancient ones.
                      # with the correct encoding.
                      document.body[i] = orig.decode(encoding_stack[-1])
                  else:
-                    # Convert unicode to the 8bit string that will be written
-                    # to the file with the correct encoding.
-                    orig = document.body[i].encode(encoding_stack[-1])
-                    # Convert the 8bit string that will be written to the
-                    # file to fake unicode with the encoding that will later
-                    # be used when writing to the file.
-                    document.body[i] = orig.decode(document.encoding)
+                    try:
+                        # Convert unicode to the 8bit string that will be written
+                        # to the file with the correct encoding.
+                        orig = document.body[i].encode(encoding_stack[-1])
+                        # Convert the 8bit string that will be written to the
+                        # file to fake unicode with the encoding that will later
+                        # be used when writing to the file.
+                        document.body[i] = orig.decode(document.encoding)
+                    except:
+                        mod_line = revert_unicode_line(document, i, insets, spec_chars)
+                        document.body[i:i+1] = mod_line.split('\n')
+                        i += len(mod_line.split('\n')) - 1
+            i += 1
  
  
  def convert_utf8(document):
@@ -325,6 +334,130 @@ def revert_utf8(document):
      convert_multiencoding(document, False)
  
  
+def read_unicodesymbols():
+    " Read the unicodesymbols list of unicode characters and corresponding commands."
+    pathname = os.path.abspath(os.path.dirname(sys.argv[0]))
+    fp = open(os.path.join(pathname.strip('lyx2lyx'), 'unicodesymbols'))
+    spec_chars = {}
+    for line in fp.readlines():
+        if line[0] != '#':
+            line=line.replace(' "',' ') # remove all quotation marks with spaces before
+            line=line.replace('" ',' ') # remove all quotation marks with spaces after
+            line=line.replace(r'\"','"') # replace \" by " (for characters with diaeresis)
+            try:
+                # flag1 and flag2 are preamble and other flags
+                [ucs4,command,flag1,flag2] =line.split(None,3)
+                spec_chars[unichr(eval(ucs4))] = [command, flag1, flag2]
+            except:
+                pass
+    fp.close()
+    return spec_chars
+
+
+def revert_unicode_line(document, i, insets, spec_chars, replacement_character = '???'):
+    # Define strings to start and end ERT and math insets
+    ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s\n\\backslash\n' % document.default_layout
+    ert_outro='\n\\end_layout\n\n\\end_inset\n'
+    math_intro='\n\\begin_inset Formula $'
+    math_outro='$\n\\end_inset'
+
+    mod_line = u''
+    if i and not is_inset_line(document, i-1):
+        last_char = document.body[i - 1][-1:]
+    else:
+        last_char = ''
+
+    line = document.body[i]
+    for character in line:
+        try:
+            # Try to write the character
+            dummy = character.encode(document.encoding)
+            mod_line += character
+            last_char = character
+        except:
+            # Try to replace with ERT/math inset
+            if spec_chars.has_key(character):
+                command = spec_chars[character][0] # the command to replace unicode
+                flag1 = spec_chars[character][1]
+                flag2 = spec_chars[character][2]
+                if flag1.find('combining') > -1 or flag2.find('combining') > -1:
+                    # We have a character that should be combined with the previous
+                    command += '{' + last_char + '}'
+                    # Remove the last character. Ignore if it is whitespace
+                    if len(last_char.rstrip()):
+                        # last_char was found and is not whitespace
+                        if mod_line:
+                            mod_line = mod_line[:-1]
+                        else: # last_char belongs to the last line
+                            document.body[i-1] = document.body[i-1][:-1]
+                    else:
+                        # The last character was replaced by a command. For now it is
+                        # ignored. This could be handled better.
+                        pass
+                if command[0:2] == '\\\\':
+                    if command[2:12]=='ensuremath':
+                        if insets and insets[-1] == "ERT":
+                            # math in ERT
+                            command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash\n')
+                            command = command.replace('}', '$\n')
+                        elif not insets or insets[-1] != "Formula":
+                            # add a math inset with the replacement character
+                            command = command.replace('\\\\ensuremath{\\', math_intro)
+                            command = command.replace('}', math_outro)
+                        else:
+                            # we are already in a math inset
+                            command = command.replace('\\\\ensuremath{\\', '')
+                            command = command.replace('}', '')
+                    else:
+                        if insets and insets[-1] == "Formula":
+                            # avoid putting an ERT in a math; instead put command as text
+                            command = command.replace('\\\\', '\mathrm{')
+                            command = command + '}'
+                        elif not insets or insets[-1] != "ERT":
+                            # add an ERT inset with the replacement character
+                            command = command.replace('\\\\', ert_intro)
+                            command = command + ert_outro
+                        else:
+                            command = command.replace('\\\\', '\n\\backslash\n')
+                    last_char = '' # indicate that the character should not be removed
+                mod_line += command
+            else:
+                # Replace with replacement string
+                mod_line += replacement_character
+    return mod_line
+
+
+def revert_unicode(document):
+    '''Transform unicode characters that can not be written using the
+document encoding to commands according to the unicodesymbols
+file. Characters that can not be replaced by commands are replaced by
+an replacement string.  Flags other than 'combined' are currently not
+implemented.'''
+    spec_chars = read_unicodesymbols()
+    insets = [] # list of active insets
+
+    # Go through the document to capture all combining characters
+    i = 0
+    while i < len(document.body):
+        line = document.body[i]
+        # Check for insets
+        if line.find('\\begin_inset') > -1:
+            insets.append(line[13:].split()[0])
+        if line.find('\\end_inset') > -1:
+            del insets[-1]
+        
+        # Try to write the line
+        try:
+            # If all goes well the line is written here
+            dummy = line.encode(document.encoding)
+            i += 1
+        except:
+            # Error, some character(s) in the line need to be replaced
+            mod_line = revert_unicode_line(document, i, insets, spec_chars)
+            document.body[i:i+1] = mod_line.split('\n')
+            i += len(mod_line.split('\n'))
+
+
  def revert_cs_label(document):
      " Remove status flag of charstyle label. "
      i = 0
@@ -520,17 +653,17 @@ def convert_commandparams(document):
              if commandparams_info[name][0] == "":
                  document.warning("Ignoring invalid option `%s' of command `%s'." % (option1, name))
              else:
-                lines.append('%s "%s"' % (commandparams_info[name][0], option1.replace('"', '\\"')))
+                lines.append('%s "%s"' % (commandparams_info[name][0], option1.replace('\\', '\\\\').replace('"', '\\"')))
          if option2 != "":
              if commandparams_info[name][1] == "":
                  document.warning("Ignoring invalid second option `%s' of command `%s'." % (option2, name))
              else:
-                lines.append('%s "%s"' % (commandparams_info[name][1], option2.replace('"', '\\"')))
+                lines.append('%s "%s"' % (commandparams_info[name][1], option2.replace('\\', '\\\\').replace('"', '\\"')))
          if argument != "":
              if commandparams_info[name][2] == "":
                  document.warning("Ignoring invalid argument `%s' of command `%s'." % (argument, name))
              else:
-                lines.append('%s "%s"' % (commandparams_info[name][2], argument.replace('"', '\\"')))
+                lines.append('%s "%s"' % (commandparams_info[name][2], argument.replace('\\', '\\\\').replace('"', '\\"')))
          document.body[i:i+1] = lines
          i = i + 1
  
@@ -557,13 +690,13 @@ def revert_commandparams(document):
                      preview_line = document.body[k]
                  elif (commandparams_info[name][0] != "" and
                        pname == commandparams_info[name][0]):
-                    option1 = pvalue.strip('"').replace('\\"', '"')
+                    option1 = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
                  elif (commandparams_info[name][1] != "" and
                        pname == commandparams_info[name][1]):
-                    option2 = pvalue.strip('"').replace('\\"', '"')
+                    option2 = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
                  elif (commandparams_info[name][2] != "" and
                        pname == commandparams_info[name][2]):
-                    argument = pvalue.strip('"').replace('\\"', '"')
+                    argument = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
              elif document.body[k].strip() != "":
                  document.warning("Ignoring unknown contents `%s' in command inset %s." % (document.body[k], name))
          if name == "bibitem":
@@ -1017,6 +1150,33 @@ def convert_accent(document):
          i += 3
  
  
+def is_inset_line(document, i):
+    """ Line i of body has an inset """
+    if document.body[i][:1] == '\\':
+        return True
+    last_tokens = "".join(document.body[i].split()[-2:])
+    return last_tokens.find('\\') != -1
+
+
+# A wrapper around normalize that handles special cases (cf. bug 3313)
+def normalize(form, text):
+    # do not normalize OHM, ANGSTROM
+    keep_characters = [0x2126,0x212b]
+    result = ''
+    convert = ''
+    for i in text:
+        if ord(i) in keep_characters:
+            if len(convert) > 0:
+                result = result + unicodedata.normalize(form, convert)
+                convert = ''
+            result = result + i
+        else:
+            convert = convert + i
+    if len(convert) > 0:
+        result = result + unicodedata.normalize(form, convert)
+    return result
+
+
  def revert_accent(document):
      inverse_accent_map = {}
      for k in accent_map:
@@ -1032,36 +1192,35 @@ def revert_accent(document):
      # words before unicode normalization.
      # We do this only if the next line starts with an accent, otherwise we
      # would create things like '\begin_inset ERTstatus'.
-    numberoflines = len(document.body)
-    for i in range(numberoflines-1):
+    for i in range(len(document.body) - 1):
          if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
              continue
-        if (document.body[i+1][0] in inverse_accent_map):
+        if (document.body[i+1][0] in inverse_accent_map and not is_inset_line(document, i)):
              # the last character of this line and the first of the next line
-            # form probably a surrogate pair.
+            # form probably a surrogate pair, inline insets are excluded (second part of the test)
              while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '):
                  document.body[i] += document.body[i+1][0]
                  document.body[i+1] = document.body[i+1][1:]
  
      # Normalize to "Normal form D" (NFD, also known as canonical decomposition).
      # This is needed to catch all accented characters.
-    for i in range(numberoflines):
+    for i in range(len(document.body)):
          # Unfortunately we have a mixture of unicode strings and plain strings,
          # because we never use u'xxx' for string literals, but 'xxx'.
          # Therefore we may have to try two times to normalize the data.
          try:
-            document.body[i] = unicodedata.normalize("NFD", document.body[i])
+            document.body[i] = normalize("NFD", document.body[i])
          except TypeError:
-            document.body[i] = unicodedata.normalize("NFD", unicode(document.body[i], 'utf-8'))
+            document.body[i] = normalize("NFD", unicode(document.body[i], 'utf-8'))
  
      # Replace accented characters with InsetLaTeXAccent
      # Do not convert characters that can be represented in the chosen
      # encoding.
      encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
      lang_re = re.compile(r"^\\lang\s(\S+)")
+
      i = 0
      while i < len(document.body):
-
          if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '':
              # Track the encoding of the current line
              result = lang_re.match(document.body[i])
@@ -1094,10 +1253,7 @@ def revert_accent(document):
                      if j < len(document.body[i]) - 1:
                          document.body.insert(i+1, document.body[i][j+1:])
                      # Delete the accented character
-                    if j > 0:
-                        document.body[i] = document.body[i][:j-1]
-                    else:
-                        document.body[i] = u''
+                    document.body[i] = document.body[i][:j]
                      # Finally add the InsetLaTeXAccent
                      document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
                      break
@@ -1110,24 +1266,21 @@ def revert_accent(document):
                      accented_char = inverse_accented_map[accented_char]
                  accent = document.body[i][j]
                  try:
-                    dummy = unicodedata.normalize("NFC", accented_char + accent).encode(encoding_stack[-1])
+                    dummy = normalize("NFC", accented_char + accent).encode(encoding_stack[-1])
                  except UnicodeEncodeError:
                      # Insert the rest of the line as new line
                      if j < len(document.body[i]) - 1:
                          document.body.insert(i+1, document.body[i][j+1:])
                      # Delete the accented characters
-                    if j > 1:
-                        document.body[i] = document.body[i][:j-2]
-                    else:
-                        document.body[i] = u''
+                    document.body[i] = document.body[i][:j-1]
                      # Finally add the InsetLaTeXAccent
                      document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
                      break
          i = i + 1
  
      # Normalize to "Normal form C" (NFC, pre-composed characters) again
-    for i in range(numberoflines):
-        document.body[i] = unicodedata.normalize("NFC", document.body[i])
+    for i in range(len(document.body)):
+        document.body[i] = normalize("NFC", document.body[i])
  
  
  def normalize_font_whitespace_259(document):
@@ -1669,7 +1822,9 @@ after label
                                        '',
                                        r'\backslash',
                                        r'begin{lstlisting}%s' % params,
-                                      r'\end_layout'
+                                      r'\end_layout',
+                                      '',
+                                      r'\begin_layout %s' % document.default_layout,
                                      ] + document.body[k : j - 1] + \
                                       ['',
                                        r'\begin_layout %s' % document.default_layout,
@@ -1868,146 +2023,6 @@ def revert_arabic (document):
          i = i + 1
  
  
-def read_unicodesymbols():
-    " Read the unicodesymbols list of unicode characters and corresponding commands."
-    pathname = os.path.abspath(os.path.dirname(sys.argv[0]))
-    fp = open(os.path.join(pathname.strip('lyx2lyx'), 'unicodesymbols'))
-    spec_chars = {}
-    for line in fp.readlines():
-        if line[0] != '#':
-            line=line.replace(' "',' ') # remove all quotation marks with spaces before
-            line=line.replace('" ',' ') # remove all quotation marks with spaces after
-            line=line.replace(r'\"','"') # replace \" by " (for characters with diaeresis)
-            try:
-                # flag1 and flag2 are preamble and other flags
-                [ucs4,command,flag1,flag2] =line.split(None,3)
-                spec_chars[unichr(eval(ucs4))] = [command, flag1, flag2]
-            except:
-                pass
-    fp.close()
-
-    return spec_chars
-
-
-def revert_unicode(document):
-    '''Transform unicode characters that can not be written using the
-document encoding to commands according to the unicodesymbols
-file. Characters that can not be replaced by commands are replaced by
-an replacement string.  Flags other than 'combined' are currently not
-implemented.'''
-
-    replacement_character = '???'
-    spec_chars = read_unicodesymbols()
-
-    # Define strings to start and end ERT and math insets
-    ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s\n\\backslash' % document.default_layout
-    ert_outro='\n\\end_layout\n\n\\end_inset\n'
-    math_intro='\n\\begin_inset Formula $'
-    math_outro='$\n\\end_inset'
-    # Find unicode characters and replace them
-    in_ert = False # flag set to 1 if in ERT inset
-    in_math = False # flag set to 1 if in math inset
-    insets = [] # list of active insets
-
-    # Go through the file to capture all combining characters
-    last_char = '' # to store the previous character
-
-    i = 0
-    while i < len(document.body):
-        line = document.body[i]
-        # Check for insets
-        if line.find('\\begin_inset') > -1:
-            # check which inset to start
-            if line.find('\\begin_inset ERT') > -1:
-                in_ert = True
-                insets.append('ert')
-            elif line.find('\\begin_inset Formula') > -1:
-                in_math = True
-                insets.append('math')
-            else:
-                insets.append('other')
-        if line.find('\\end_inset') > -1:
-            # check which inset to end
-            try:
-                cur_inset = insets.pop()
-                if cur_inset == 'ert':
-                    in_ert = False
-                elif cur_inset == 'math':
-                    in_math = False
-                else:
-                    pass # end of other inset
-            except:
-                pass # inset list was empty (for some reason)
-        
-        # Try to write the line
-        try:
-            # If all goes well the line is written here
-            dummy = line.encode(document.encoding)
-            last_char = line[-1]
-            i += 1
-        except:
-            # Error, some character(s) in the line need to be replaced
-            mod_line = u''
-            for character in line:
-                try:
-                    # Try to write the character
-                    dummy = character.encode(document.encoding)
-                    mod_line += character
-                    last_char = character
-                except:
-                    # Try to replace with ERT/math inset
-                    if spec_chars.has_key(character):
-                        command = spec_chars[character][0] # the command to replace unicode
-                        flag1 = spec_chars[character][1]
-                        flag2 = spec_chars[character][2]
-                        if flag1.find('combining') > -1 or flag2.find('combining') > -1:
-                            # We have a character that should be combined with the previous
-                            command += '{' + last_char + '}'
-                            # Remove the last character. Ignore if it is whitespace
-                            if len(last_char.rstrip()):
-                                # last_char was found and is not whitespace
-                                if mod_line:
-                                    mod_line = mod_line[:-1]
-                                else: # last_char belongs to the last line
-                                    document.body[i-1] = document.body[i-1][:-1]
-                            else:
-                                # The last character was replaced by a command. For now it is
-                                # ignored. This could be handled better.
-                                pass
-                        if command[0:2] == '\\\\':
-                            if command[2:12]=='ensuremath':
-                                if in_ert:
-                                    # math in ERT
-                                    command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash')
-                                    command = command.replace('}', '$\n')
-                                elif not in_math:
-                                    # add a math inset with the replacement character
-                                    command = command.replace('\\\\ensuremath{\\', math_intro)
-                                    command = command.replace('}', math_outro)
-                                else:
-                                    # we are already in a math inset
-                                    command = command.replace('\\\\ensuremath{\\', '')
-                                    command = command.replace('}', '')
-                            else:
-                                if in_math:
-                                    # avoid putting an ERT in a math; instead put command as text
-                                    command = command.replace('\\\\', '\mathrm{')
-                                    command = command + '}'
-                                elif not in_ert:
-                                    # add an ERT inset with the replacement character
-                                    command = command.replace('\\\\', ert_intro)
-                                    command = command + ert_outro
-                                else:
-                                    command = command.replace('\\\\', '\n\\backslash')
-                            last_char = '' # indicate that the character should not be removed
-                        mod_line += command
-                    else:
-                        # Replace with replacement string
-                        mod_line += replacement_character
-            document.body[i:i+1] = mod_line.split('\n')
-            i += len(mod_line.split('\n'))
-
-
  ##
  # Conversion hub
  #
@@ -2082,5 +2097,3 @@ revert =  [
  
  if __name__ == "__main__":
      pass
-
-