inset_result = inset_re.match(document.body[i])
if inset_result:
insets.append(inset_result.group(1))
- else:
+ else:
insets.append("")
elif find_token(document.body, "\\end_inset", i, i + 1) == i:
del insets[-1]
# be used when writing to the file.
document.body[i] = orig.decode(document.encoding)
except:
- last_char = document.body[i-1][-1]
- mod_line, last_char = revert_unicode_line(document, i, last_char, insets, spec_chars)
+ mod_line = revert_unicode_line(document, i, insets, spec_chars)
document.body[i:i+1] = mod_line.split('\n')
i += len(mod_line.split('\n')) - 1
i += 1
return spec_chars
-def revert_unicode_line(document, i, last_char, insets, spec_chars, replacement_character = '???'):
+def revert_unicode_line(document, i, insets, spec_chars, replacement_character = '???'):
# Define strings to start and end ERT and math insets
ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s\n\\backslash\n' % document.default_layout
ert_outro='\n\\end_layout\n\n\\end_inset\n'
math_outro='$\n\\end_inset'
mod_line = u''
+ if i and not is_inset_line(document, i-1):
+ last_char = document.body[i - 1][-1:]
+ else:
+ last_char = ''
+
line = document.body[i]
for character in line:
try:
else:
# Replace with replacement string
mod_line += replacement_character
- return mod_line, last_char
+ return mod_line
def revert_unicode(document):
implemented.'''
spec_chars = read_unicodesymbols()
insets = [] # list of active insets
- last_char = '' # to store the previous character
# Go through the document to capture all combining characters
i = 0
insets.append(line[13:].split()[0])
if line.find('\\end_inset') > -1:
del insets[-1]
-
+
# Try to write the line
try:
# If all goes well the line is written here
dummy = line.encode(document.encoding)
- last_char = line[-1]
i += 1
except:
# Error, some character(s) in the line need to be replaced
- mod_line, last_char = revert_unicode_line(document, i, last_char, insets, spec_chars)
+ mod_line = revert_unicode_line(document, i, insets, spec_chars)
document.body[i:i+1] = mod_line.split('\n')
i += len(mod_line.split('\n'))
if commandparams_info[name][0] == "":
document.warning("Ignoring invalid option `%s' of command `%s'." % (option1, name))
else:
- lines.append('%s "%s"' % (commandparams_info[name][0], option1.replace('"', '\\"')))
+ lines.append('%s "%s"' % (commandparams_info[name][0], option1.replace('\\', '\\\\').replace('"', '\\"')))
if option2 != "":
if commandparams_info[name][1] == "":
document.warning("Ignoring invalid second option `%s' of command `%s'." % (option2, name))
else:
- lines.append('%s "%s"' % (commandparams_info[name][1], option2.replace('"', '\\"')))
+ lines.append('%s "%s"' % (commandparams_info[name][1], option2.replace('\\', '\\\\').replace('"', '\\"')))
if argument != "":
if commandparams_info[name][2] == "":
document.warning("Ignoring invalid argument `%s' of command `%s'." % (argument, name))
else:
- lines.append('%s "%s"' % (commandparams_info[name][2], argument.replace('"', '\\"')))
+ lines.append('%s "%s"' % (commandparams_info[name][2], argument.replace('\\', '\\\\').replace('"', '\\"')))
document.body[i:i+1] = lines
i = i + 1
if i == -1:
break
name = document.body[i].split()[2]
- j = find_end_of_inset(document.body, i + 1)
+ j = find_end_of_inset(document.body, i)
preview_line = ""
option1 = ""
option2 = ""
preview_line = document.body[k]
elif (commandparams_info[name][0] != "" and
pname == commandparams_info[name][0]):
- option1 = pvalue.strip('"').replace('\\"', '"')
+ option1 = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
elif (commandparams_info[name][1] != "" and
pname == commandparams_info[name][1]):
- option2 = pvalue.strip('"').replace('\\"', '"')
+ option2 = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
elif (commandparams_info[name][2] != "" and
pname == commandparams_info[name][2]):
- argument = pvalue.strip('"').replace('\\"', '"')
+ argument = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
elif document.body[k].strip() != "":
document.warning("Ignoring unknown contents `%s' in command inset %s." % (document.body[k], name))
if name == "bibitem":
lines.append('')
lines.append('\\end_inset')
document.body[i:j+1] = lines
- i = j + 1
+ i += len(lines) + 1
def revert_nomenclature(document):
i += 3
+def is_inset_line(document, i):
+    """ Return True if line i of document.body looks like it carries an inset.
+
+    Two cases are recognized: the line starts with a backslash (an inset or
+    command token), or one of the last two whitespace-separated tokens of the
+    line contains a backslash (presumably an inline inset at the line end).
+    """
+    # Leading backslash: the line itself is a command/inset token.
+    if document.body[i][:1] == '\\':
+        return True
+    # Join the last two tokens and look for a backslash anywhere in them.
+    last_tokens = "".join(document.body[i].split()[-2:])
+    return last_tokens.find('\\') != -1
+
+
+# A wrapper around unicodedata.normalize that handles special cases (cf. bug 3313)
+def normalize(form, text):
+    """ Normalize 'text' to the Unicode normal form 'form', but leave the
+    OHM SIGN (U+2126) and ANGSTROM SIGN (U+212B) characters untouched,
+    because normalizing them would replace them (cf. bug 3313).
+    """
+    # do not normalize OHM, ANGSTROM
+    keep_characters = [0x2126,0x212b]
+    result = ''
+    convert = ''
+    for i in text:
+        if ord(i) in keep_characters:
+            # Flush the pending run of ordinary characters through
+            # unicodedata.normalize, then append the protected char verbatim.
+            if len(convert) > 0:
+                result = result + unicodedata.normalize(form, convert)
+                convert = ''
+            result = result + i
+        else:
+            # Accumulate ordinary characters into the current run.
+            convert = convert + i
+    # Flush any trailing run of ordinary characters.
+    if len(convert) > 0:
+        result = result + unicodedata.normalize(form, convert)
+    return result
+
+
def revert_accent(document):
inverse_accent_map = {}
for k in accent_map:
# words before unicode normalization.
# We do this only if the next line starts with an accent, otherwise we
# would create things like '\begin_inset ERTstatus'.
- numberoflines = len(document.body)
- for i in range(numberoflines-1):
+ for i in range(len(document.body) - 1):
if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
continue
- if (document.body[i+1][0] in inverse_accent_map):
+ if (document.body[i+1][0] in inverse_accent_map and not is_inset_line(document, i)):
# the last character of this line and the first of the next line
- # form probably a surrogate pair.
+    # probably form a surrogate pair; inline insets are excluded (second part of the test)
while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '):
document.body[i] += document.body[i+1][0]
document.body[i+1] = document.body[i+1][1:]
# Normalize to "Normal form D" (NFD, also known as canonical decomposition).
# This is needed to catch all accented characters.
- for i in range(numberoflines):
+ for i in range(len(document.body)):
# Unfortunately we have a mixture of unicode strings and plain strings,
# because we never use u'xxx' for string literals, but 'xxx'.
# Therefore we may have to try two times to normalize the data.
try:
- document.body[i] = unicodedata.normalize("NFD", document.body[i])
+ document.body[i] = normalize("NFD", document.body[i])
except TypeError:
- document.body[i] = unicodedata.normalize("NFD", unicode(document.body[i], 'utf-8'))
+ document.body[i] = normalize("NFD", unicode(document.body[i], 'utf-8'))
# Replace accented characters with InsetLaTeXAccent
# Do not convert characters that can be represented in the chosen
# encoding.
encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
lang_re = re.compile(r"^\\lang\s(\S+)")
+
i = 0
while i < len(document.body):
-
if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '':
# Track the encoding of the current line
result = lang_re.match(document.body[i])
if j < len(document.body[i]) - 1:
document.body.insert(i+1, document.body[i][j+1:])
# Delete the accented character
- if j > 0:
- document.body[i] = document.body[i][:j-1]
- else:
- document.body[i] = u''
+ document.body[i] = document.body[i][:j]
# Finally add the InsetLaTeXAccent
document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
break
accented_char = inverse_accented_map[accented_char]
accent = document.body[i][j]
try:
- dummy = unicodedata.normalize("NFC", accented_char + accent).encode(encoding_stack[-1])
+ dummy = normalize("NFC", accented_char + accent).encode(encoding_stack[-1])
except UnicodeEncodeError:
# Insert the rest of the line as new line
if j < len(document.body[i]) - 1:
document.body.insert(i+1, document.body[i][j+1:])
# Delete the accented characters
- if j > 1:
- document.body[i] = document.body[i][:j-2]
- else:
- document.body[i] = u''
+ document.body[i] = document.body[i][:j-1]
# Finally add the InsetLaTeXAccent
document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
break
i = i + 1
# Normalize to "Normal form C" (NFC, pre-composed characters) again
- for i in range(numberoflines):
- document.body[i] = unicodedata.normalize("NFC", document.body[i])
+ for i in range(len(document.body)):
+ document.body[i] = normalize("NFC", document.body[i])
def normalize_font_whitespace_259(document):
""" Before format 259 the font changes were ignored if a
whitespace was the first or last character in the sequence, this function
transfers the whitespace outside."""
-
+
char_properties = {"\\series": "default",
"\\emph": "default",
"\\color": "none",
def normalize_font_whitespace_274(document):
""" Before format 259 (sic) the font changes were ignored if a
- whitespace was the first or last character in the sequence. This was
- corrected for most font properties in format 259, but the language
+ whitespace was the first or last character in the sequence. This was
+ corrected for most font properties in format 259, but the language
was forgotten then. This function applies the same conversion done
there (namely, transfers the whitespace outside) for font language
changes, as well."""
def get_paragraph_language(document, i):
""" Return the language of the paragraph in which line i of the document
body is. If the first thing in the paragraph is a \\lang command, that
- is the paragraph's langauge; otherwise, the paragraph's language is the
+    is the paragraph's language; otherwise, the paragraph's language is the
document's language."""
lines = document.body
-
+
first_nonempty_line = \
find_nonempty_line(lines, find_beginning_of_layout(lines, i) + 1)
return words[1]
else:
return document.language
-
+
def normalize_font_whitespace(document, char_properties):
""" Before format 259 the font changes were ignored if a
whitespace was the first or last character in the sequence, this function
def convert_tableborder(document):
- # The problematic is: LyX double the table cell border as it ignores the "|" character in
+ # The problem is: LyX doubles the table cell border as it ignores the "|" character in
# the cell arguments. A fix takes care of this and therefore the "|" has to be removed
i = 0
while i < len(document.body):
# the two tokens have to be in one line
if (h != -1 and k != -1):
# delete the "|"
- document.body[i] = document.body[i][:k] + document.body[i][k+1:len(document.body[i])-1]
+ document.body[i] = document.body[i][:k] + document.body[i][k+1:len(document.body[i])]
i = i + 1
def revert_armenian(document):
-
- # set inputencoding from armscii8 to auto
+
+ # set inputencoding from armscii8 to auto
if document.inputencoding == "armscii8":
i = find_token(document.header, "\\inputencoding", 0)
if i != -1:
document.header[i] = "\\inputencoding auto"
- # check if preamble exists, if not k is set to -1
+ # check if preamble exists, if not k is set to -1
i = 0
k = -1
while i < len(document.preamble):
# create the preamble when it doesn't exist
else:
document.preamble.append('\\usepackage{armtex}')
- # Set document language from armenian to english
+ # Set document language from armenian to english
if document.language == "armenian":
document.language = "english"
i = find_token(document.header, "\\language", 0)
def revert_listings_inset(document):
- r''' Revert listings inset to \lstinline or \begin, \end lstlisting, translate
+ r''' Revert listings inset to \lstinline or \begin, \end lstlisting, translate
FROM
-\begin_inset
+\begin_inset
lstparams "language=Delphi"
inline true
status open
k = cap_end + 1
inlinecode = ''
# looking for the oneline code for lstinline
- inlinecode = document.body[find_end_of_layout(document.body,
+ inlinecode = document.body[find_end_of_layout(document.body,
find_token(document.body, '\\begin_layout %s' % document.default_layout, i + 1) +1 ) - 1]
if len(caption) > 0:
if len(params) == 0:
document.body[i:(j+1)] = [r'\begin_inset ERT',
'status %s' % status,
r'\begin_layout %s' % document.default_layout,
- '',
+ '',
'',
r'\backslash',
'lstinline%s{%s}' % (params, inlinecode),
'',
r'\backslash',
r'begin{lstlisting}%s' % params,
- r'\end_layout'
+ r'\end_layout',
+ '',
+ r'\begin_layout %s' % document.default_layout,
] + document.body[k : j - 1] + \
['',
r'\begin_layout %s' % document.default_layout,
r'\end_layout',
'',
r'\end_inset']
-
+
def revert_include_listings(document):
r''' Revert lstinputlisting Include option , translate
# find command line lstinputlisting{file}[options]
cmd, file, option = '', '', ''
if re.match(r'\\(lstinputlisting){([.\w]*)}(.*)', document.body[i].split()[2]):
- cmd, file, option = re.match(r'\\(lstinputlisting){([.\w]*)}(.*)', document.body[i].split()[2]).groups()
+ cmd, file, option = re.match(r'\\(lstinputlisting){([.\w]*)}(.*)', document.body[i].split()[2]).groups()
option = option.replace('\\', '\\backslash\n')
document.body[i : j + 1] = [r'\begin_inset ERT',
'status open',