Add support for CALS tables in DocBook.

[lyx.git] / lib / lyx2lyx / lyx_1_5.py
diff --git a/lib/lyx2lyx/lyx_1_5.py b/lib/lyx2lyx/lyx_1_5.py

index f8bcd6e4d2fc1b72162b6b35e9207e1b3550b23e..76d23b6b5de831abb0d430479a46918b49664b8d 100644 (file)
--- a/lib/lyx2lyx/lyx_1_5.py
+++ b/lib/lyx2lyx/lyx_1_5.py
@@ -15,7 +15,7 @@
  #
  # You should have received a copy of the GNU General Public License
  # along with this program; if not, write to the Free Software
  #
  # You should have received a copy of the GNU General Public License
  # along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  
  """ Convert files to the file format generated by lyx 1.5"""
  
  
  """ Convert files to the file format generated by lyx 1.5"""
  
@@ -24,8 +24,17 @@ import unicodedata
  import sys, os
  
  from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value, find_beginning_of, find_nonempty_line
  import sys, os
  
  from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value, find_beginning_of, find_nonempty_line
+from lyx2lyx_tools import insert_document_option
  from LyX import get_encoding
  
  from LyX import get_encoding
  
+# Provide support for both python 2 and 3
+PY2 = sys.version_info[0] == 2
+if not PY2:
+    text_type = str
+    unichr = chr
+else:
+    text_type = unicode
+# End of code to support both python 2 and 3
  
  ####################################################################
  # Private helper functions
  
  ####################################################################
  # Private helper functions
@@ -53,7 +62,7 @@ def find_beginning_of_layout(lines, i):
  def revert_framed(document):
      "Revert framed notes. "
      i = 0
  def revert_framed(document):
      "Revert framed notes. "
      i = 0
-    while 1:
+    while True:
          i = find_tokens(document.body, ["\\begin_inset Note Framed", "\\begin_inset Note Shaded"], i)
  
          if i == -1:
          i = find_tokens(document.body, ["\\begin_inset Note Framed", "\\begin_inset Note Shaded"], i)
  
          if i == -1:
@@ -93,7 +102,7 @@ def convert_font_settings(document):
      if font_scheme == '':
          document.warning("Malformed LyX document: Empty `\\fontscheme'.")
          font_scheme = 'default'
      if font_scheme == '':
          document.warning("Malformed LyX document: Empty `\\fontscheme'.")
          font_scheme = 'default'
-    if not font_scheme in roman_fonts.keys():
+    if not font_scheme in list(roman_fonts.keys()):
          document.warning("Malformed LyX document: Unknown `\\fontscheme' `%s'." % font_scheme)
          font_scheme = 'default'
      document.header[i:i+1] = ['\\font_roman %s' % roman_fonts[font_scheme],
          document.warning("Malformed LyX document: Unknown `\\fontscheme' `%s'." % font_scheme)
          font_scheme = 'default'
      document.header[i:i+1] = ['\\font_roman %s' % roman_fonts[font_scheme],
@@ -163,7 +172,7 @@ def revert_font_settings(document):
          del document.header[i]
      if font_tt_scale != '100':
          document.warning("Conversion of '\\font_tt_scale' not yet implemented.")
          del document.header[i]
      if font_tt_scale != '100':
          document.warning("Conversion of '\\font_tt_scale' not yet implemented.")
-    for font_scheme in roman_fonts.keys():
+    for font_scheme in list(roman_fonts.keys()):
          if (roman_fonts[font_scheme] == fonts['roman'] and
              sans_fonts[font_scheme] == fonts['sans'] and
              typewriter_fonts[font_scheme] == fonts['typewriter']):
          if (roman_fonts[font_scheme] == fonts['roman'] and
              sans_fonts[font_scheme] == fonts['sans'] and
              typewriter_fonts[font_scheme] == fonts['typewriter']):
@@ -208,7 +217,7 @@ def revert_booktabs(document):
      re_bspace = re.compile(r'\s+bottomspace="[^"]+"')
      re_ispace = re.compile(r'\s+interlinespace="[^"]+"')
      i = 0
      re_bspace = re.compile(r'\s+bottomspace="[^"]+"')
      re_ispace = re.compile(r'\s+interlinespace="[^"]+"')
      i = 0
-    while 1:
+    while True:
          i = find_token(document.body, "\\begin_inset Tabular", i)
          if i == -1:
              return
          i = find_token(document.body, "\\begin_inset Tabular", i)
          if i == -1:
              return
@@ -287,7 +296,7 @@ necessary parsing in modern formats than in ancient ones.
                  inset_result = inset_re.match(document.body[i])
                  if inset_result:
                      insets.append(inset_result.group(1))
                  inset_result = inset_re.match(document.body[i])
                  if inset_result:
                      insets.append(inset_result.group(1))
-                else: 
+                else:
                      insets.append("")
              elif find_token(document.body, "\\end_inset", i, i + 1) == i:
                  del insets[-1]
                      insets.append("")
              elif find_token(document.body, "\\end_inset", i, i + 1) == i:
                  del insets[-1]
@@ -310,8 +319,7 @@ necessary parsing in modern formats than in ancient ones.
                          # be used when writing to the file.
                          document.body[i] = orig.decode(document.encoding)
                      except:
                          # be used when writing to the file.
                          document.body[i] = orig.decode(document.encoding)
                      except:
-                        last_char = document.body[i-1][-1]
-                        mod_line, last_char = revert_unicode_line(document, i, last_char, insets, spec_chars)
+                        mod_line = revert_unicode_line(document, i, insets, spec_chars)
                          document.body[i:i+1] = mod_line.split('\n')
                          i += len(mod_line.split('\n')) - 1
              i += 1
                          document.body[i:i+1] = mod_line.split('\n')
                          i += len(mod_line.split('\n')) - 1
              i += 1
@@ -335,6 +343,7 @@ def revert_utf8(document):
      convert_multiencoding(document, False)
  
  
      convert_multiencoding(document, False)
  
  
+# FIXME: Use the version in unicode_symbols.py which has some bug fixes
  def read_unicodesymbols():
      " Read the unicodesymbols list of unicode characters and corresponding commands."
      pathname = os.path.abspath(os.path.dirname(sys.argv[0]))
  def read_unicodesymbols():
      " Read the unicodesymbols list of unicode characters and corresponding commands."
      pathname = os.path.abspath(os.path.dirname(sys.argv[0]))
@@ -355,14 +364,19 @@ def read_unicodesymbols():
      return spec_chars
  
  
      return spec_chars
  
  
-def revert_unicode_line(document, i, last_char, insets, spec_chars, replacement_character = '???'):
+def revert_unicode_line(document, i, insets, spec_chars, replacement_character = '???'):
      # Define strings to start and end ERT and math insets
      # Define strings to start and end ERT and math insets
-    ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s\n\\backslash\n' % document.default_layout
+    ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s' % document.default_layout
      ert_outro='\n\\end_layout\n\n\\end_inset\n'
      math_intro='\n\\begin_inset Formula $'
      math_outro='$\n\\end_inset'
  
      mod_line = u''
      ert_outro='\n\\end_layout\n\n\\end_inset\n'
      math_intro='\n\\begin_inset Formula $'
      math_outro='$\n\\end_inset'
  
      mod_line = u''
+    if i and not is_inset_line(document, i-1):
+        last_char = document.body[i - 1][-1:]
+    else:
+        last_char = ''
+
      line = document.body[i]
      for character in line:
          try:
      line = document.body[i]
      for character in line:
          try:
@@ -372,7 +386,7 @@ def revert_unicode_line(document, i, last_char, insets, spec_chars, replacement_
              last_char = character
          except:
              # Try to replace with ERT/math inset
              last_char = character
          except:
              # Try to replace with ERT/math inset
-            if spec_chars.has_key(character):
+            if character in spec_chars:
                  command = spec_chars[character][0] # the command to replace unicode
                  flag1 = spec_chars[character][1]
                  flag2 = spec_chars[character][2]
                  command = spec_chars[character][0] # the command to replace unicode
                  flag1 = spec_chars[character][1]
                  flag2 = spec_chars[character][2]
@@ -411,8 +425,8 @@ def revert_unicode_line(document, i, last_char, insets, spec_chars, replacement_
                              command = command + '}'
                          elif not insets or insets[-1] != "ERT":
                              # add an ERT inset with the replacement character
                              command = command + '}'
                          elif not insets or insets[-1] != "ERT":
                              # add an ERT inset with the replacement character
-                            command = command.replace('\\\\', ert_intro)
-                            command = command + ert_outro
+                            command = command.replace('\\\\', '\n\\backslash\n')
+                            command = ert_intro + command + ert_outro
                          else:
                              command = command.replace('\\\\', '\n\\backslash\n')
                      last_char = '' # indicate that the character should not be removed
                          else:
                              command = command.replace('\\\\', '\n\\backslash\n')
                      last_char = '' # indicate that the character should not be removed
@@ -420,7 +434,7 @@ def revert_unicode_line(document, i, last_char, insets, spec_chars, replacement_
              else:
                  # Replace with replacement string
                  mod_line += replacement_character
              else:
                  # Replace with replacement string
                  mod_line += replacement_character
-    return mod_line, last_char
+    return mod_line
  
  
  def revert_unicode(document):
  
  
  def revert_unicode(document):
@@ -431,7 +445,6 @@ an replacement string.  Flags other than 'combined' are currently not
  implemented.'''
      spec_chars = read_unicodesymbols()
      insets = [] # list of active insets
  implemented.'''
      spec_chars = read_unicodesymbols()
      insets = [] # list of active insets
-    last_char = '' # to store the previous character
  
      # Go through the document to capture all combining characters
      i = 0
  
      # Go through the document to capture all combining characters
      i = 0
@@ -442,16 +455,15 @@ implemented.'''
              insets.append(line[13:].split()[0])
          if line.find('\\end_inset') > -1:
              del insets[-1]
              insets.append(line[13:].split()[0])
          if line.find('\\end_inset') > -1:
              del insets[-1]
-        
+
          # Try to write the line
          try:
              # If all goes well the line is written here
              dummy = line.encode(document.encoding)
          # Try to write the line
          try:
              # If all goes well the line is written here
              dummy = line.encode(document.encoding)
-            last_char = line[-1]
              i += 1
          except:
              # Error, some character(s) in the line need to be replaced
              i += 1
          except:
              # Error, some character(s) in the line need to be replaced
-            mod_line, last_char = revert_unicode_line(document, i, last_char, insets, spec_chars)
+            mod_line = revert_unicode_line(document, i, insets, spec_chars)
              document.body[i:i+1] = mod_line.split('\n')
              i += len(mod_line.split('\n'))
  
              document.body[i:i+1] = mod_line.split('\n')
              i += len(mod_line.split('\n'))
  
@@ -459,14 +471,14 @@ implemented.'''
  def revert_cs_label(document):
      " Remove status flag of charstyle label. "
      i = 0
  def revert_cs_label(document):
      " Remove status flag of charstyle label. "
      i = 0
-    while 1:
+    while True:
          i = find_token(document.body, "\\begin_inset CharStyle", i)
          if i == -1:
              return
          # Search for a line starting 'show_label'
          # If it is not there, break with a warning message
          i = i + 1
          i = find_token(document.body, "\\begin_inset CharStyle", i)
          if i == -1:
              return
          # Search for a line starting 'show_label'
          # If it is not there, break with a warning message
          i = i + 1
-        while 1:
+        while True:
              if (document.body[i][:10] == "show_label"):
                  del document.body[i]
                  break
              if (document.body[i][:10] == "show_label"):
                  del document.body[i]
                  break
@@ -493,7 +505,7 @@ key "argument"
  This must be called after convert_commandparams.
  """
      i = 0
  This must be called after convert_commandparams.
  """
      i = 0
-    while 1:
+    while True:
          i = find_token(document.body, "\\bibitem", i)
          if i == -1:
              break
          i = find_token(document.body, "\\bibitem", i)
          if i == -1:
              break
@@ -581,7 +593,7 @@ def convert_commandparams(document):
      # convert_bibitem()), but could be read in, so we convert it here, too.
  
      i = 0
      # convert_bibitem()), but could be read in, so we convert it here, too.
  
      i = 0
-    while 1:
+    while True:
          i = find_token(document.body, "\\begin_inset LatexCommand", i)
          if i == -1:
              break
          i = find_token(document.body, "\\begin_inset LatexCommand", i)
          if i == -1:
              break
@@ -651,17 +663,17 @@ def convert_commandparams(document):
              if commandparams_info[name][0] == "":
                  document.warning("Ignoring invalid option `%s' of command `%s'." % (option1, name))
              else:
              if commandparams_info[name][0] == "":
                  document.warning("Ignoring invalid option `%s' of command `%s'." % (option1, name))
              else:
-                lines.append('%s "%s"' % (commandparams_info[name][0], option1.replace('"', '\\"')))
+                lines.append('%s "%s"' % (commandparams_info[name][0], option1.replace('\\', '\\\\').replace('"', '\\"')))
          if option2 != "":
              if commandparams_info[name][1] == "":
                  document.warning("Ignoring invalid second option `%s' of command `%s'." % (option2, name))
              else:
          if option2 != "":
              if commandparams_info[name][1] == "":
                  document.warning("Ignoring invalid second option `%s' of command `%s'." % (option2, name))
              else:
-                lines.append('%s "%s"' % (commandparams_info[name][1], option2.replace('"', '\\"')))
+                lines.append('%s "%s"' % (commandparams_info[name][1], option2.replace('\\', '\\\\').replace('"', '\\"')))
          if argument != "":
              if commandparams_info[name][2] == "":
                  document.warning("Ignoring invalid argument `%s' of command `%s'." % (argument, name))
              else:
          if argument != "":
              if commandparams_info[name][2] == "":
                  document.warning("Ignoring invalid argument `%s' of command `%s'." % (argument, name))
              else:
-                lines.append('%s "%s"' % (commandparams_info[name][2], argument.replace('"', '\\"')))
+                lines.append('%s "%s"' % (commandparams_info[name][2], argument.replace('\\', '\\\\').replace('"', '\\"')))
          document.body[i:i+1] = lines
          i = i + 1
  
          document.body[i:i+1] = lines
          i = i + 1
  
@@ -669,12 +681,12 @@ def convert_commandparams(document):
  def revert_commandparams(document):
      regex = re.compile(r'(\S+)\s+(.+)')
      i = 0
  def revert_commandparams(document):
      regex = re.compile(r'(\S+)\s+(.+)')
      i = 0
-    while 1:
+    while True:
          i = find_token(document.body, "\\begin_inset LatexCommand", i)
          if i == -1:
              break
          name = document.body[i].split()[2]
          i = find_token(document.body, "\\begin_inset LatexCommand", i)
          if i == -1:
              break
          name = document.body[i].split()[2]
-        j = find_end_of_inset(document.body, i + 1)
+        j = find_end_of_inset(document.body, i)
          preview_line = ""
          option1 = ""
          option2 = ""
          preview_line = ""
          option1 = ""
          option2 = ""
@@ -688,13 +700,13 @@ def revert_commandparams(document):
                      preview_line = document.body[k]
                  elif (commandparams_info[name][0] != "" and
                        pname == commandparams_info[name][0]):
                      preview_line = document.body[k]
                  elif (commandparams_info[name][0] != "" and
                        pname == commandparams_info[name][0]):
-                    option1 = pvalue.strip('"').replace('\\"', '"')
+                    option1 = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
                  elif (commandparams_info[name][1] != "" and
                        pname == commandparams_info[name][1]):
                  elif (commandparams_info[name][1] != "" and
                        pname == commandparams_info[name][1]):
-                    option2 = pvalue.strip('"').replace('\\"', '"')
+                    option2 = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
                  elif (commandparams_info[name][2] != "" and
                        pname == commandparams_info[name][2]):
                  elif (commandparams_info[name][2] != "" and
                        pname == commandparams_info[name][2]):
-                    argument = pvalue.strip('"').replace('\\"', '"')
+                    argument = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
              elif document.body[k].strip() != "":
                  document.warning("Ignoring unknown contents `%s' in command inset %s." % (document.body[k], name))
          if name == "bibitem":
              elif document.body[k].strip() != "":
                  document.warning("Ignoring unknown contents `%s' in command inset %s." % (document.body[k], name))
          if name == "bibitem":
@@ -719,7 +731,7 @@ def revert_commandparams(document):
              lines.append('')
              lines.append('\\end_inset')
          document.body[i:j+1] = lines
              lines.append('')
              lines.append('\\end_inset')
          document.body[i:j+1] = lines
-        i = j + 1
+        i += len(lines) + 1
  
  
  def revert_nomenclature(document):
  
  
  def revert_nomenclature(document):
@@ -727,7 +739,7 @@ def revert_nomenclature(document):
      regex = re.compile(r'(\S+)\s+(.+)')
      i = 0
      use_nomencl = 0
      regex = re.compile(r'(\S+)\s+(.+)')
      i = 0
      use_nomencl = 0
-    while 1:
+    while True:
          i = find_token(document.body, "\\begin_inset LatexCommand nomenclature", i)
          if i == -1:
              break
          i = find_token(document.body, "\\begin_inset LatexCommand nomenclature", i)
          if i == -1:
              break
@@ -778,7 +790,7 @@ def revert_printnomenclature(document):
      regex = re.compile(r'(\S+)\s+(.+)')
      i = 0
      use_nomencl = 0
      regex = re.compile(r'(\S+)\s+(.+)')
      i = 0
      use_nomencl = 0
-    while 1:
+    while True:
          i = find_token(document.body, "\\begin_inset LatexCommand printnomenclature", i)
          if i == -1:
              break
          i = find_token(document.body, "\\begin_inset LatexCommand printnomenclature", i)
          if i == -1:
              break
@@ -844,7 +856,7 @@ def revert_esint(document):
  def revert_clearpage(document):
      " clearpage -> ERT "
      i = 0
  def revert_clearpage(document):
      " clearpage -> ERT "
      i = 0
-    while 1:
+    while True:
          i = find_token(document.body, "\\clearpage", i)
          if i == -1:
              break
          i = find_token(document.body, "\\clearpage", i)
          if i == -1:
              break
@@ -865,7 +877,7 @@ def revert_clearpage(document):
  def revert_cleardoublepage(document):
      " cleardoublepage -> ERT "
      i = 0
  def revert_cleardoublepage(document):
      " cleardoublepage -> ERT "
      i = 0
-    while 1:
+    while True:
          i = find_token(document.body, "\\cleardoublepage", i)
          if i == -1:
              break
          i = find_token(document.body, "\\cleardoublepage", i)
          if i == -1:
              break
@@ -923,7 +935,7 @@ def revert_encodings(document):
  def convert_caption(document):
      " Convert caption layouts to caption insets. "
      i = 0
  def convert_caption(document):
      " Convert caption layouts to caption insets. "
      i = 0
-    while 1:
+    while True:
          i = find_token(document.body, "\\begin_layout Caption", i)
          if i == -1:
              return
          i = find_token(document.body, "\\begin_layout Caption", i)
          if i == -1:
              return
@@ -943,7 +955,7 @@ def revert_caption(document):
      " Convert caption insets to caption layouts. "
      " This assumes that the text class has a caption style. "
      i = 0
      " Convert caption insets to caption layouts. "
      " This assumes that the text class has a caption style. "
      i = 0
-    while 1:
+    while True:
          i = find_token(document.body, "\\begin_inset Caption", i)
          if i == -1:
              return
          i = find_token(document.body, "\\begin_inset Caption", i)
          if i == -1:
              return
@@ -1108,7 +1120,7 @@ def convert_accent(document):
      re_contents = re.compile(r'^([^\s{]+)(.*)$')
      re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$')
      i = 0
      re_contents = re.compile(r'^([^\s{]+)(.*)$')
      re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$')
      i = 0
-    while 1:
+    while True:
          i = find_re(document.body, re_wholeinset, i)
          if i == -1:
              return
          i = find_re(document.body, re_wholeinset, i)
          if i == -1:
              return
@@ -1148,6 +1160,33 @@ def convert_accent(document):
          i += 3
  
  
          i += 3
  
  
+def is_inset_line(document, i):
+    """ Line i of body has an inset
+
+    Returns True when document.body[i] starts with a backslash, or when a
+    backslash occurs in the last two whitespace-separated tokens of that
+    line; returns False otherwise.
+    """
+    # The slice [:1] (rather than [0]) yields '' for an empty line instead
+    # of raising IndexError.
+    if document.body[i][:1] == '\\':
+        return True
+    # Only the tail of the line is inspected: at most its last two tokens,
+    # joined without a separator.
+    last_tokens = "".join(document.body[i].split()[-2:])
+    return last_tokens.find('\\') != -1
+
+
+# A wrapper around normalize that handles special cases (cf. bug 3313)
+def normalize(form, text):
+    """ Normalize text with unicodedata.normalize, keeping U+2126 and U+212B.
+
+    form is passed unchanged to unicodedata.normalize.  text is walked
+    character by character: the two protected code points are copied to
+    the output as they are, and each run of other characters between them
+    is normalized on its own.  Returns the concatenated result.
+    """
+    # do not normalize OHM, ANGSTROM
+    keep_characters = [0x2126,0x212b]
+    result = ''
+    # Pending run of ordinary characters that still has to be normalized.
+    convert = ''
+    for i in text:
+        if ord(i) in keep_characters:
+            # Flush the pending run first so the output keeps the input order.
+            if len(convert) > 0:
+                result = result + unicodedata.normalize(form, convert)
+                convert = ''
+            # Protected character: copied through without normalization.
+            result = result + i
+        else:
+            convert = convert + i
+    # Normalize whatever run is left after the last protected character.
+    if len(convert) > 0:
+        result = result + unicodedata.normalize(form, convert)
+    return result
+
+
  def revert_accent(document):
      inverse_accent_map = {}
      for k in accent_map:
  def revert_accent(document):
      inverse_accent_map = {}
      for k in accent_map:
@@ -1163,36 +1202,35 @@ def revert_accent(document):
      # words before unicode normalization.
      # We do this only if the next line starts with an accent, otherwise we
      # would create things like '\begin_inset ERTstatus'.
      # words before unicode normalization.
      # We do this only if the next line starts with an accent, otherwise we
      # would create things like '\begin_inset ERTstatus'.
-    numberoflines = len(document.body)
-    for i in range(numberoflines-1):
+    for i in range(len(document.body) - 1):
          if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
              continue
          if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
              continue
-        if (document.body[i+1][0] in inverse_accent_map):
+        if (document.body[i+1][0] in inverse_accent_map and not is_inset_line(document, i)):
              # the last character of this line and the first of the next line
              # the last character of this line and the first of the next line
-            # form probably a surrogate pair.
+            # probably form a surrogate pair; inline insets are excluded (second part of the test)
              while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '):
                  document.body[i] += document.body[i+1][0]
                  document.body[i+1] = document.body[i+1][1:]
  
      # Normalize to "Normal form D" (NFD, also known as canonical decomposition).
      # This is needed to catch all accented characters.
              while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '):
                  document.body[i] += document.body[i+1][0]
                  document.body[i+1] = document.body[i+1][1:]
  
      # Normalize to "Normal form D" (NFD, also known as canonical decomposition).
      # This is needed to catch all accented characters.
-    for i in range(numberoflines):
+    for i in range(len(document.body)):
          # Unfortunately we have a mixture of unicode strings and plain strings,
          # because we never use u'xxx' for string literals, but 'xxx'.
          # Therefore we may have to try two times to normalize the data.
          try:
          # Unfortunately we have a mixture of unicode strings and plain strings,
          # because we never use u'xxx' for string literals, but 'xxx'.
          # Therefore we may have to try two times to normalize the data.
          try:
-            document.body[i] = unicodedata.normalize("NFD", document.body[i])
+            document.body[i] = normalize("NFD", document.body[i])
          except TypeError:
          except TypeError:
-            document.body[i] = unicodedata.normalize("NFD", unicode(document.body[i], 'utf-8'))
+            document.body[i] = normalize("NFD", text_type(document.body[i], 'utf-8'))
  
      # Replace accented characters with InsetLaTeXAccent
      # Do not convert characters that can be represented in the chosen
      # encoding.
      encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
      lang_re = re.compile(r"^\\lang\s(\S+)")
  
      # Replace accented characters with InsetLaTeXAccent
      # Do not convert characters that can be represented in the chosen
      # encoding.
      encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
      lang_re = re.compile(r"^\\lang\s(\S+)")
+
      i = 0
      while i < len(document.body):
      i = 0
      while i < len(document.body):
-
          if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '':
              # Track the encoding of the current line
              result = lang_re.match(document.body[i])
          if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '':
              # Track the encoding of the current line
              result = lang_re.match(document.body[i])
@@ -1225,10 +1263,7 @@ def revert_accent(document):
                      if j < len(document.body[i]) - 1:
                          document.body.insert(i+1, document.body[i][j+1:])
                      # Delete the accented character
                      if j < len(document.body[i]) - 1:
                          document.body.insert(i+1, document.body[i][j+1:])
                      # Delete the accented character
-                    if j > 0:
-                        document.body[i] = document.body[i][:j-1]
-                    else:
-                        document.body[i] = u''
+                    document.body[i] = document.body[i][:j]
                      # Finally add the InsetLaTeXAccent
                      document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
                      break
                      # Finally add the InsetLaTeXAccent
                      document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
                      break
@@ -1241,31 +1276,28 @@ def revert_accent(document):
                      accented_char = inverse_accented_map[accented_char]
                  accent = document.body[i][j]
                  try:
                      accented_char = inverse_accented_map[accented_char]
                  accent = document.body[i][j]
                  try:
-                    dummy = unicodedata.normalize("NFC", accented_char + accent).encode(encoding_stack[-1])
+                    dummy = normalize("NFC", accented_char + accent).encode(encoding_stack[-1])
                  except UnicodeEncodeError:
                      # Insert the rest of the line as new line
                      if j < len(document.body[i]) - 1:
                          document.body.insert(i+1, document.body[i][j+1:])
                      # Delete the accented characters
                  except UnicodeEncodeError:
                      # Insert the rest of the line as new line
                      if j < len(document.body[i]) - 1:
                          document.body.insert(i+1, document.body[i][j+1:])
                      # Delete the accented characters
-                    if j > 1:
-                        document.body[i] = document.body[i][:j-2]
-                    else:
-                        document.body[i] = u''
+                    document.body[i] = document.body[i][:j-1]
                      # Finally add the InsetLaTeXAccent
                      document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
                      break
          i = i + 1
  
      # Normalize to "Normal form C" (NFC, pre-composed characters) again
                      # Finally add the InsetLaTeXAccent
                      document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
                      break
          i = i + 1
  
      # Normalize to "Normal form C" (NFC, pre-composed characters) again
-    for i in range(numberoflines):
-        document.body[i] = unicodedata.normalize("NFC", document.body[i])
+    for i in range(len(document.body)):
+        document.body[i] = normalize("NFC", document.body[i])
  
  
  def normalize_font_whitespace_259(document):
      """ Before format 259 the font changes were ignored if a
      whitespace was the first or last character in the sequence, this function
      transfers the whitespace outside."""
  
  
  def normalize_font_whitespace_259(document):
      """ Before format 259 the font changes were ignored if a
      whitespace was the first or last character in the sequence, this function
      transfers the whitespace outside."""
-       
+
      char_properties = {"\\series": "default",
                         "\\emph": "default",
                         "\\color": "none",
      char_properties = {"\\series": "default",
                         "\\emph": "default",
                         "\\color": "none",
@@ -1276,8 +1308,8 @@ def normalize_font_whitespace_259(document):
  
  def normalize_font_whitespace_274(document):
      """ Before format 259 (sic) the font changes were ignored if a
  
  def normalize_font_whitespace_274(document):
      """ Before format 259 (sic) the font changes were ignored if a
-    whitespace was the first or last character in the sequence. This was 
-    corrected for most font properties in format 259, but the language 
+    whitespace was the first or last character in the sequence. This was
+    corrected for most font properties in format 259, but the language
      was forgotten then. This function applies the same conversion done
      there (namely, transfers the whitespace outside) for font language
      changes, as well."""
      was forgotten then. This function applies the same conversion done
      there (namely, transfers the whitespace outside) for font language
      changes, as well."""
@@ -1288,11 +1320,11 @@ def normalize_font_whitespace_274(document):
  def get_paragraph_language(document, i):
      """ Return the language of the paragraph in which line i of the document
      body is. If the first thing in the paragraph is a \\lang command, that
  def get_paragraph_language(document, i):
      """ Return the language of the paragraph in which line i of the document
      body is. If the first thing in the paragraph is a \\lang command, that
-    is the paragraph's langauge; otherwise, the paragraph's language is the 
+    is the paragraph's language; otherwise, the paragraph's language is the
      document's language."""
  
      lines = document.body
      document's language."""
  
      lines = document.body
-       
+
      first_nonempty_line = \
          find_nonempty_line(lines, find_beginning_of_layout(lines, i) + 1)
  
      first_nonempty_line = \
          find_nonempty_line(lines, find_beginning_of_layout(lines, i) + 1)
  
@@ -1302,7 +1334,7 @@ def get_paragraph_language(document, i):
          return words[1]
      else:
          return document.language
          return words[1]
      else:
          return document.language
-       
+
  def normalize_font_whitespace(document, char_properties):
      """ Before format 259 the font changes were ignored if a
      whitespace was the first or last character in the sequence, this function
  def normalize_font_whitespace(document, char_properties):
      """ Before format 259 the font changes were ignored if a
      whitespace was the first or last character in the sequence, this function
@@ -1324,15 +1356,15 @@ def normalize_font_whitespace(document, char_properties):
              # a new paragraph resets all font changes
              changes.clear()
              # also reset the default language to be the paragraph's language
              # a new paragraph resets all font changes
              changes.clear()
              # also reset the default language to be the paragraph's language
-            if "\\lang" in char_properties.keys():
+            if "\\lang" in list(char_properties.keys()):
                  char_properties["\\lang"] = \
                      get_paragraph_language(document, i + 1)
  
                  char_properties["\\lang"] = \
                      get_paragraph_language(document, i + 1)
  
-        elif len(words) > 1 and words[0] in char_properties.keys():
+        elif len(words) > 1 and words[0] in list(char_properties.keys()):
              # we have a font change
              if char_properties[words[0]] == words[1]:
                  # property gets reset
              # we have a font change
              if char_properties[words[0]] == words[1]:
                  # property gets reset
-                if words[0] in changes.keys():
+                if words[0] in list(changes.keys()):
                      del changes[words[0]]
                  defaultproperty = True
              else:
                      del changes[words[0]]
                  defaultproperty = True
              else:
@@ -1350,11 +1382,11 @@ def normalize_font_whitespace(document, char_properties):
                  lines[i-1] = lines[i-1][:-1]
                  # a space before the font change
                  added_lines = [" "]
                  lines[i-1] = lines[i-1][:-1]
                  # a space before the font change
                  added_lines = [" "]
-                for k in changes.keys():
+                for k in list(changes.keys()):
                      # exclude property k because that is already in lines[i]
                      if k != words[0]:
                          added_lines[1:1] = ["%s %s" % (k, changes[k])]
                      # exclude property k because that is already in lines[i]
                      if k != words[0]:
                          added_lines[1:1] = ["%s %s" % (k, changes[k])]
-                for k in changes.keys():
+                for k in list(changes.keys()):
                      # exclude property k because that must be added below anyway
                      if k != words[0]:
                          added_lines[0:0] = ["%s %s" % (k, char_properties[k])]
                      # exclude property k because that must be added below anyway
                      if k != words[0]:
                          added_lines[0:0] = ["%s %s" % (k, char_properties[k])]
@@ -1378,11 +1410,11 @@ def normalize_font_whitespace(document, char_properties):
                          continue
                  lines[i+1] = lines[i+1][1:]
                  added_lines = [" "]
                          continue
                  lines[i+1] = lines[i+1][1:]
                  added_lines = [" "]
-                for k in changes.keys():
+                for k in list(changes.keys()):
                      # exclude property k because that is already in lines[i]
                      if k != words[0]:
                          added_lines[1:1] = ["%s %s" % (k, changes[k])]
                      # exclude property k because that is already in lines[i]
                      if k != words[0]:
                          added_lines[1:1] = ["%s %s" % (k, changes[k])]
-                for k in changes.keys():
+                for k in list(changes.keys()):
                      # exclude property k because that must be added below anyway
                      if k != words[0]:
                          added_lines[0:0] = ["%s %s" % (k, char_properties[k])]
                      # exclude property k because that must be added below anyway
                      if k != words[0]:
                          added_lines[0:0] = ["%s %s" % (k, char_properties[k])]
@@ -1421,13 +1453,13 @@ def revert_utf8plain(document):
  def revert_beamer_alert(document):
      " Revert beamer's \\alert inset back to ERT. "
      i = 0
  def revert_beamer_alert(document):
      " Revert beamer's \\alert inset back to ERT. "
      i = 0
-    while 1:
+    while True:
          i = find_token(document.body, "\\begin_inset CharStyle Alert", i)
          if i == -1:
              return
          document.body[i] = "\\begin_inset ERT"
          i = i + 1
          i = find_token(document.body, "\\begin_inset CharStyle Alert", i)
          if i == -1:
              return
          document.body[i] = "\\begin_inset ERT"
          i = i + 1
-        while 1:
+        while True:
              if (document.body[i][:13] == "\\begin_layout"):
                  # Insert the \alert command
                  document.body[i + 1] = "\\alert{" + document.body[i + 1] + '}'
              if (document.body[i][:13] == "\\begin_layout"):
                  # Insert the \alert command
                  document.body[i + 1] = "\\alert{" + document.body[i + 1] + '}'
@@ -1440,13 +1472,13 @@ def revert_beamer_alert(document):
  def revert_beamer_structure(document):
      " Revert beamer's \\structure inset back to ERT. "
      i = 0
  def revert_beamer_structure(document):
      " Revert beamer's \\structure inset back to ERT. "
      i = 0
-    while 1:
+    while True:
          i = find_token(document.body, "\\begin_inset CharStyle Structure", i)
          if i == -1:
              return
          document.body[i] = "\\begin_inset ERT"
          i = i + 1
          i = find_token(document.body, "\\begin_inset CharStyle Structure", i)
          if i == -1:
              return
          document.body[i] = "\\begin_inset ERT"
          i = i + 1
-        while 1:
+        while True:
              if (document.body[i][:13] == "\\begin_layout"):
                  document.body[i + 1] = "\\structure{" + document.body[i + 1] + '}'
                  break
              if (document.body[i][:13] == "\\begin_layout"):
                  document.body[i + 1] = "\\structure{" + document.body[i + 1] + '}'
                  break
@@ -1519,7 +1551,7 @@ def revert_cv_textclass(document):
  def convert_graphics_rotation(document):
      " add scaleBeforeRotation graphics parameter. "
      i = 0
  def convert_graphics_rotation(document):
      " add scaleBeforeRotation graphics parameter. "
      i = 0
-    while 1:
+    while True:
          i = find_token(document.body, "\\begin_inset Graphics", i)
          if i == -1:
              return
          i = find_token(document.body, "\\begin_inset Graphics", i)
          if i == -1:
              return
@@ -1541,7 +1573,7 @@ def convert_graphics_rotation(document):
  def revert_graphics_rotation(document):
      " remove scaleBeforeRotation graphics parameter. "
      i = 0
  def revert_graphics_rotation(document):
      " remove scaleBeforeRotation graphics parameter. "
      i = 0
-    while 1:
+    while True:
          i = find_token(document.body, "\\begin_inset Graphics", i)
          if i == -1:
              return
          i = find_token(document.body, "\\begin_inset Graphics", i)
          if i == -1:
              return
@@ -1575,7 +1607,7 @@ def revert_graphics_rotation(document):
  
  
  def convert_tableborder(document):
  
  
  def convert_tableborder(document):
-    # The problematic is: LyX double the table cell border as it ignores the "|" character in
+    # The problem is: LyX doubles the table cell border as it ignores the "|" character in
      # the cell arguments. A fix takes care of this and therefore the "|" has to be removed
      i = 0
      while i < len(document.body):
      # the cell arguments. A fix takes care of this and therefore the "|" has to be removed
      i = 0
      while i < len(document.body):
@@ -1584,7 +1616,7 @@ def convert_tableborder(document):
          # the two tokens have to be in one line
          if (h != -1 and k != -1):
              # delete the "|"
          # the two tokens have to be in one line
          if (h != -1 and k != -1):
              # delete the "|"
-            document.body[i] = document.body[i][:k] + document.body[i][k+1:len(document.body[i])-1]
+            document.body[i] = document.body[i][:k] + document.body[i][k+1:len(document.body[i])]
          i = i + 1
  
  
          i = i + 1
  
  
@@ -1601,13 +1633,13 @@ def revert_tableborder(document):
  
  
  def revert_armenian(document):
  
  
  def revert_armenian(document):
-    
-    # set inputencoding from armscii8 to auto 
+
+    # set inputencoding from armscii8 to auto
      if document.inputencoding == "armscii8":
          i = find_token(document.header, "\\inputencoding", 0)
          if i != -1:
              document.header[i] = "\\inputencoding auto"
      if document.inputencoding == "armscii8":
          i = find_token(document.header, "\\inputencoding", 0)
          if i != -1:
              document.header[i] = "\\inputencoding auto"
-    # check if preamble exists, if not k is set to -1 
+    # check if preamble exists, if not k is set to -1
      i = 0
      k = -1
      while i < len(document.preamble):
      i = 0
      k = -1
      while i < len(document.preamble):
@@ -1624,7 +1656,7 @@ def revert_armenian(document):
          # create the preamble when it doesn't exist
          else:
              document.preamble.append('\\usepackage{armtex}')
          # create the preamble when it doesn't exist
          else:
              document.preamble.append('\\usepackage{armtex}')
-    # Set document language from armenian to english 
+    # Set document language from armenian to english
      if document.language == "armenian":
          document.language = "english"
          i = find_token(document.header, "\\language", 0)
      if document.language == "armenian":
          document.language = "english"
          i = find_token(document.header, "\\language", 0)
@@ -1664,10 +1696,10 @@ def revert_preamble_listings_params(document):
  
  
  def revert_listings_inset(document):
  
  
  def revert_listings_inset(document):
-    r''' Revert listings inset to \lstinline or \begin, \end lstlisting, translate 
+    r''' Revert listings inset to \lstinline or \begin, \end lstlisting, translate
  FROM
  
  FROM
  
-\begin_inset 
+\begin_inset
  lstparams "language=Delphi"
  inline true
  status open
  lstparams "language=Delphi"
  inline true
  status open
@@ -1765,7 +1797,7 @@ after label
              k = cap_end + 1
          inlinecode = ''
          # looking for the oneline code for lstinline
              k = cap_end + 1
          inlinecode = ''
          # looking for the oneline code for lstinline
-        inlinecode = document.body[find_end_of_layout(document.body, 
+        inlinecode = document.body[find_end_of_layout(document.body,
              find_token(document.body,  '\\begin_layout %s' % document.default_layout, i + 1) +1 ) - 1]
          if len(caption) > 0:
              if len(params) == 0:
              find_token(document.body,  '\\begin_layout %s' % document.default_layout, i + 1) +1 ) - 1]
          if len(caption) > 0:
              if len(params) == 0:
@@ -1784,7 +1816,7 @@ after label
              document.body[i:(j+1)] = [r'\begin_inset ERT',
                                        'status %s' % status,
                                        r'\begin_layout %s' % document.default_layout,
              document.body[i:(j+1)] = [r'\begin_inset ERT',
                                        'status %s' % status,
                                        r'\begin_layout %s' % document.default_layout,
-                                      '', 
+                                      '',
                                        '',
                                        r'\backslash',
                                        'lstinline%s{%s}' % (params, inlinecode),
                                        '',
                                        r'\backslash',
                                        'lstinline%s{%s}' % (params, inlinecode),
@@ -1812,7 +1844,7 @@ after label
                                        r'\end_layout',
                                        '',
                                        r'\end_inset']
                                        r'\end_layout',
                                        '',
                                        r'\end_inset']
-            
+
  
  def revert_include_listings(document):
      r''' Revert lstinputlisting Include option , translate
  
  def revert_include_listings(document):
      r''' Revert lstinputlisting Include option , translate
@@ -1851,7 +1883,7 @@ lstinputlisting{file}[opt]
          # find command line lstinputlisting{file}[options]
          cmd, file, option = '', '', ''
          if re.match(r'\\(lstinputlisting){([.\w]*)}(.*)', document.body[i].split()[2]):
          # find command line lstinputlisting{file}[options]
          cmd, file, option = '', '', ''
          if re.match(r'\\(lstinputlisting){([.\w]*)}(.*)', document.body[i].split()[2]):
-            cmd, file, option = re.match(r'\\(lstinputlisting){([.\w]*)}(.*)', document.body[i].split()[2]).groups()            
+            cmd, file, option = re.match(r'\\(lstinputlisting){([.\w]*)}(.*)', document.body[i].split()[2]).groups()
          option = option.replace('\\', '\\backslash\n')
          document.body[i : j + 1] = [r'\begin_inset ERT',
                                      'status open',
          option = option.replace('\\', '\\backslash\n')
          document.body[i : j + 1] = [r'\begin_inset ERT',
                                      'status open',
@@ -1876,13 +1908,7 @@ def revert_ext_font_sizes(document):
  
      i = find_token(document.header, '\\paperfontsize', 0)
      document.header[i] = '\\paperfontsize default'
  
      i = find_token(document.header, '\\paperfontsize', 0)
      document.header[i] = '\\paperfontsize default'
-
-    i = find_token(document.header, '\\options', 0)
-    if i == -1:
-        i = find_token(document.header, '\\textclass', 0) + 1
-        document.header[i:i] = ['\\options %s' % fontsize]
-    else:
-        document.header[i] += ',%s' % fontsize
+    insert_document_option(document, fontsize)
  
  
  def convert_ext_font_sizes(document):
  
  
  def convert_ext_font_sizes(document):