Fix a few edge-cases which in the lyx2lyx conversion to format 249

[lyx.git] / lib / lyx2lyx / lyx_1_5.py
diff --git a/lib/lyx2lyx/lyx_1_5.py b/lib/lyx2lyx/lyx_1_5.py

index 3b1bc7afc3a8f76150127799bd0cca82e2c3ba0b..d874ef3101bb27a4b9c574833ecb1aaeecec3b34 100644 (file)
--- a/lib/lyx2lyx/lyx_1_5.py
+++ b/lib/lyx2lyx/lyx_1_5.py
@@ -21,8 +21,9 @@
  
  import re
  import unicodedata
+import sys, os
  
-from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value
+from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value, find_beginning_of, find_nonempty_line
  from LyX import get_encoding
  
  
@@ -37,6 +38,10 @@ def find_end_of_layout(lines, i):
      " Find end of layout, where lines[i] is included."
      return find_end_of(lines, i, "\\begin_layout", "\\end_layout")
  
+def find_beginning_of_layout(lines, i):
+    "Find beginning of layout, where lines[i] is included."
+    return find_beginning_of(lines, i, "\\begin_layout", "\\end_layout")
+
  # End of helper functions
  ####################################################################
  
@@ -241,10 +246,13 @@ document.encoding must be set to the old value (format 248) in both cases.
  We do this here and not in LyX.py because it is far easier to do the
  necessary parsing in modern formats than in ancient ones.
  """
+    inset_types = ["Foot", "Note"]
      if document.cjk_encoding != '':
          return
      encoding_stack = [document.encoding]
+    inset_stack = []
      lang_re = re.compile(r"^\\lang\s(\S+)")
+    inset_re = re.compile(r"^\\begin_inset\s(\S+)")
      if document.inputencoding == "auto" or document.inputencoding == "default":
          for i in range(len(document.body)):
              result = lang_re.match(document.body[i])
@@ -259,7 +267,11 @@ necessary parsing in modern formats than in ancient ones.
                      encoding_stack[-1] = lang[language][3]
              elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
                  document.warning("Adding nested encoding %s." % encoding_stack[-1], 3)
-                encoding_stack.append(encoding_stack[-1])
+                if len(inset_stack) > 0 and inset_stack[-1] in inset_types:
+                    from lyx2lyx_lang import lang
+                    encoding_stack.append(lang[document.language][3])
+                else:
+                    encoding_stack.append(encoding_stack[-1])
              elif find_token(document.body, "\\end_layout", i, i + 1) == i:
                  document.warning("Removing nested encoding %s." % encoding_stack[-1], 3)
                  if len(encoding_stack) == 1:
@@ -267,6 +279,15 @@ necessary parsing in modern formats than in ancient ones.
                      document.warning("Malformed LyX document: Unexpected `\\end_layout'.")
                  else:
                      del encoding_stack[-1]
+            elif find_token(document.body, "\\begin_inset", i, i + 1) == i:
+                inset_result = inset_re.match(document.body[i])
+                if inset_result:
+                    inset_type = inset_result.group(1)
+                    inset_stack.append(inset_type)
+                else: 
+                    inset_stack.append("")
+            elif find_token(document.body, "\\end_inset", i, i + 1) == i:
+                del inset_stack[-1]
              if encoding_stack[-1] != document.encoding:
                  if forward:
                      # This line has been incorrectly interpreted as if it was
@@ -340,19 +361,24 @@ key "argument"
  
  This must be called after convert_commandparams.
  """
-    regex = re.compile(r'\S+\s*(\[[^\[\{]*\])?(\{[^}]*\})')
      i = 0
      while 1:
          i = find_token(document.body, "\\bibitem", i)
          if i == -1:
              break
-        match = re.match(regex, document.body[i])
-        option = match.group(1)
-        argument = match.group(2)
+        j = document.body[i].find('[') + 1
+        k = document.body[i].rfind(']')
+        if j == 0: # No optional argument found
+            option = None
+        else:
+            option = document.body[i][j:k]
+        j = document.body[i].rfind('{') + 1
+        k = document.body[i].rfind('}')
+        argument = document.body[i][j:k]
          lines = ['\\begin_inset LatexCommand bibitem']
          if option != None:
-            lines.append('label "%s"' % option[1:-1].replace('"', '\\"'))
-        lines.append('key "%s"' % argument[1:-1].replace('"', '\\"'))
+            lines.append('label "%s"' % option.replace('"', '\\"'))
+        lines.append('key "%s"' % argument.replace('"', '\\"'))
          lines.append('')
          lines.append('\\end_inset')
          document.body[i:i+1] = lines
@@ -434,6 +460,13 @@ def convert_commandparams(document):
              i = i + 1
              continue
  
+        j = find_token(document.body, "\\end_inset", i + 1)
+        if j == -1:
+            document.warning("Malformed document")
+        else:
+            command += "".join(document.body[i+1:j])
+            document.body[i+1:j] = []
+
          # The following parser is taken from the original InsetCommandParams::scanCommand
          name = ""
          option1 = ""
@@ -732,7 +765,7 @@ def convert_lyxline(document):
          k = 0
          while i < len(document.body):
              i = find_token(document.body, "\\size " + fontsizes[n], i)
-            k = find_token(document.body, "\\lyxline",i)
+            k = find_token(document.body, "\\lyxline", i)
              # the corresponding fontsize command is always 2 lines before the \lyxline
              if (i != -1 and k == i+2):
                  document.body[i:i+1] = []
@@ -858,7 +891,7 @@ accent_map = {
      "=" : u'\u0304', # macron
      "u" : u'\u0306', # breve
      "." : u'\u0307', # dot above
-    "\"": u'\u0308', # diaresis
+    "\"": u'\u0308', # diaeresis
      "r" : u'\u030a', # ring above
      "H" : u'\u030b', # double acute
      "v" : u'\u030c', # caron
@@ -911,7 +944,7 @@ def _convert_accent(accent, accented_char):
          return ''
      a = accent_map.get(type)
      if a:
-        return unicodedata.normalize("NFKC", "%s%s" % (char, a))
+        return unicodedata.normalize("NFC", "%s%s" % (char, a))
      return ''
  
  
@@ -1017,16 +1050,17 @@ def revert_accent(document):
          # because we never use u'xxx' for string literals, but 'xxx'.
          # Therefore we may have to try two times to normalize the data.
          try:
-            document.body[i] = unicodedata.normalize("NFKD", document.body[i])
+            document.body[i] = unicodedata.normalize("NFD", document.body[i])
          except TypeError:
-            document.body[i] = unicodedata.normalize("NFKD", unicode(document.body[i], 'utf-8'))
+            document.body[i] = unicodedata.normalize("NFD", unicode(document.body[i], 'utf-8'))
  
      # Replace accented characters with InsetLaTeXAccent
      # Do not convert characters that can be represented in the chosen
      # encoding.
      encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
      lang_re = re.compile(r"^\\lang\s(\S+)")
-    for i in range(len(document.body)):
+    i = 0
+    while i < len(document.body):
  
          if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '':
              # Track the encoding of the current line
@@ -1058,7 +1092,7 @@ def revert_accent(document):
                  except UnicodeEncodeError:
                      # Insert the rest of the line as new line
                      if j < len(document.body[i]) - 1:
-                        document.body[i+1:i+1] = document.body[i][j+1:]
+                        document.body.insert(i+1, document.body[i][j+1:])
                      # Delete the accented character
                      if j > 0:
                          document.body[i] = document.body[i][:j-1]
@@ -1076,11 +1110,11 @@ def revert_accent(document):
                      accented_char = inverse_accented_map[accented_char]
                  accent = document.body[i][j]
                  try:
-                    dummy = unicodedata.normalize("NFKC", accented_char + accent).encode(encoding_stack[-1])
+                    dummy = unicodedata.normalize("NFC", accented_char + accent).encode(encoding_stack[-1])
                  except UnicodeEncodeError:
                      # Insert the rest of the line as new line
                      if j < len(document.body[i]) - 1:
-                        document.body[i+1:i+1] = document.body[i][j+1:]
+                        document.body.insert(i+1, document.body[i][j+1:])
                      # Delete the accented characters
                      if j > 1:
                          document.body[i] = document.body[i][:j-2]
@@ -1089,27 +1123,66 @@ def revert_accent(document):
                      # Finally add the InsetLaTeXAccent
                      document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
                      break
+        i = i + 1
+
      # Normalize to "Normal form C" (NFC, pre-composed characters) again
      for i in range(numberoflines):
-        document.body[i] = unicodedata.normalize("NFKC", document.body[i])
+        document.body[i] = unicodedata.normalize("NFC", document.body[i])
  
  
-def normalize_font_whitespace(document):
+def normalize_font_whitespace_259(document):
      """ Before format 259 the font changes were ignored if a
      whitespace was the first or last character in the sequence, this function
      transfers the whitespace outside."""
-
-    if document.backend != "latex":
-        return
-
-    lines = document.body
-
+       
      char_properties = {"\\series": "default",
                         "\\emph": "default",
                         "\\color": "none",
                         "\\shape": "default",
                         "\\bar": "default",
                         "\\family": "default"}
+    return normalize_font_whitespace(document, char_properties)
+
+def normalize_font_whitespace_274(document):
+    """ Before format 259 (sic) the font changes were ignored if a
+    whitespace was the first or last character in the sequence. This was 
+    corrected for most font properties in format 259, but the language 
+    was forgotten then. This function applies the same conversion done
+    there (namely, transfers the whitespace outside) for font language
+    changes, as well."""
+
+    char_properties = {"\\lang": "default"}
+    return normalize_font_whitespace(document, char_properties)
+
+def get_paragraph_language(document, i):
+    """ Return the language of the paragraph in which line i of the document
+    body is. If the first thing in the paragraph is a \\lang command, that
+    is the paragraph's langauge; otherwise, the paragraph's language is the 
+    document's language."""
+
+    lines = document.body
+       
+    first_nonempty_line = \
+        find_nonempty_line(lines, find_beginning_of_layout(lines, i) + 1)
+
+    words = lines[first_nonempty_line].split()
+
+    if len(words) > 1 and words[0] == "\\lang":
+        return words[1]
+    else:
+        return document.language
+       
+def normalize_font_whitespace(document, char_properties):
+    """ Before format 259 the font changes were ignored if a
+    whitespace was the first or last character in the sequence, this function
+    transfers the whitespace outside. Only a change in one of the properties
+    in the provided    char_properties is handled by this function."""
+
+    if document.backend != "latex":
+        return
+
+    lines = document.body
+
      changes = {}
  
      i = 0
@@ -1119,6 +1192,10 @@ def normalize_font_whitespace(document):
          if len(words) > 0 and words[0] == "\\begin_layout":
              # a new paragraph resets all font changes
              changes.clear()
+            # also reset the default language to be the paragraph's language
+            if "\\lang" in char_properties.keys():
+                char_properties["\\lang"] = \
+                    get_paragraph_language(document, i + 1)
  
          elif len(words) > 1 and words[0] in char_properties.keys():
              # we have a font change
@@ -1198,6 +1275,55 @@ def revert_utf8x(document):
      document.inputencoding = get_value(document.header, "\\inputencoding", 0)
  
  
+def revert_utf8plain(document):
+    " Set utf8plain encoding to utf8. "
+    i = find_token(document.header, "\\inputencoding", 0)
+    if i == -1:
+        document.header.append("\\inputencoding auto")
+    else:
+        inputenc = get_value(document.header, "\\inputencoding", i)
+        if inputenc == "utf8-plain":
+            document.header[i] = "\\inputencoding utf8"
+    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
+
+
+def revert_beamer_alert(document):
+    " Revert beamer's \\alert inset back to ERT. "
+    i = 0
+    while 1:
+        i = find_token(document.body, "\\begin_inset CharStyle Alert", i)
+        if i == -1:
+            return
+        document.body[i] = "\\begin_inset ERT"
+        i = i + 1
+        while 1:
+            if (document.body[i][:13] == "\\begin_layout"):
+                # Insert the \alert command
+                document.body[i + 1] = "\\alert{" + document.body[i + 1] + '}'
+                break
+            i = i + 1
+
+        i = i + 1
+
+
+def revert_beamer_structure(document):
+    " Revert beamer's \\structure inset back to ERT. "
+    i = 0
+    while 1:
+        i = find_token(document.body, "\\begin_inset CharStyle Structure", i)
+        if i == -1:
+            return
+        document.body[i] = "\\begin_inset ERT"
+        i = i + 1
+        while 1:
+            if (document.body[i][:13] == "\\begin_layout"):
+                document.body[i + 1] = "\\structure{" + document.body[i + 1] + '}'
+                break
+            i = i + 1
+
+        i = i + 1
+
+
  def convert_changes(document):
      " Switch output_changes off if tracking_changes is off. "
      i = find_token(document.header, '\\tracking_changes', 0)
@@ -1257,6 +1383,631 @@ def revert_cv_textclass(document):
          document.textclass = "cv"
  
  
+#
+# add scaleBeforeRotation graphics param
+def convert_graphics_rotation(document):
+    " add scaleBeforeRotation graphics parameter. "
+    i = 0
+    while 1:
+        i = find_token(document.body, "\\begin_inset Graphics", i)
+        if i == -1:
+            return
+        j = find_end_of_inset(document.body, i+1)
+        if j == -1:
+            # should not happen
+            document.warning("Malformed LyX document: Could not find end of graphics inset.")
+        # Seach for rotateAngle and width or height or scale
+        # If these params are not there, nothing needs to be done.
+        k = find_token(document.body, "\trotateAngle", i + 1, j)
+        l = find_tokens(document.body, ["\twidth", "\theight", "\tscale"], i + 1, j)
+        if (k != -1 and l != -1):
+            document.body.insert(j, 'scaleBeforeRotation')
+        i = i + 1
+
+
+#
+# remove scaleBeforeRotation graphics param
+def revert_graphics_rotation(document):
+    " remove scaleBeforeRotation graphics parameter. "
+    i = 0
+    while 1:
+        i = find_token(document.body, "\\begin_inset Graphics", i)
+        if i == -1:
+            return
+        j = find_end_of_inset(document.body, i + 1)
+        if j == -1:
+            # should not happen
+            document.warning("Malformed LyX document: Could not find end of graphics inset.")
+        # If there's a scaleBeforeRotation param, just remove that
+        k = find_token(document.body, "\tscaleBeforeRotation", i + 1, j)
+        if k != -1:
+            del document.body[k]
+        else:
+            # if not, and if we have rotateAngle and width or height or scale,
+            # we have to put the rotateAngle value to special
+            rotateAngle = get_value(document.body, 'rotateAngle', i + 1, j)
+            special = get_value(document.body, 'special', i + 1, j)
+            if rotateAngle != "":
+                k = find_tokens(document.body, ["\twidth", "\theight", "\tscale"], i + 1, j)
+                if k == -1:
+                    break
+                if special == "":
+                    document.body.insert(j-1, '\tspecial angle=%s' % rotateAngle)
+                else:
+                    l = find_token(document.body, "\tspecial", i + 1, j)
+                    document.body[l] = document.body[l].replace(special, 'angle=%s,%s' % (rotateAngle, special))
+                k = find_token(document.body, "\trotateAngle", i + 1, j)
+                if k != -1:
+                    del document.body[k]
+        i = i + 1
+
+
+
+def convert_tableborder(document):
+    # The problematic is: LyX double the table cell border as it ignores the "|" character in
+    # the cell arguments. A fix takes care of this and therefore the "|" has to be removed
+    i = 0
+    while i < len(document.body):
+        h = document.body[i].find("leftline=\"true\"", 0, len(document.body[i]))
+        k = document.body[i].find("|>{", 0, len(document.body[i]))
+        # the two tokens have to be in one line
+        if (h != -1 and k != -1):
+            # delete the "|"
+            document.body[i] = document.body[i][:k] + document.body[i][k+1:len(document.body[i])-1]
+        i = i + 1
+
+
+def revert_tableborder(document):
+    i = 0
+    while i < len(document.body):
+        h = document.body[i].find("leftline=\"true\"", 0, len(document.body[i]))
+        k = document.body[i].find(">{", 0, len(document.body[i]))
+        # the two tokens have to be in one line
+        if (h != -1 and k != -1):
+            # add the "|"
+            document.body[i] = document.body[i][:k] + '|' + document.body[i][k:]
+        i = i + 1
+
+
+def revert_armenian(document):
+    
+    # set inputencoding from armscii8 to auto 
+    if document.inputencoding == "armscii8":
+        i = find_token(document.header, "\\inputencoding", 0)
+        if i != -1:
+            document.header[i] = "\\inputencoding auto"
+    # check if preamble exists, if not k is set to -1 
+    i = 0
+    k = -1
+    while i < len(document.preamble):
+        if k == -1:
+            k = document.preamble[i].find("\\", 0, len(document.preamble[i]))
+        if k == -1:
+            k = document.preamble[i].find("%", 0, len(document.preamble[i]))
+        i = i + 1
+    # add the entry \usepackage{armtex} to the document preamble
+    if document.language == "armenian":
+        # set the armtex entry as the first preamble line
+        if k != -1:
+            document.preamble[0:0] = ["\\usepackage{armtex}"]
+        # create the preamble when it doesn't exist
+        else:
+            document.preamble.append('\\usepackage{armtex}')
+    # Set document language from armenian to english 
+    if document.language == "armenian":
+        document.language = "english"
+        i = find_token(document.header, "\\language", 0)
+        if i != -1:
+            document.header[i] = "\\language english"
+
+
+def revert_CJK(document):
+    " Set CJK encodings to default and languages chinese, japanese and korean to english. "
+    encodings = ["Bg5", "Bg5+", "GB", "GBt", "GBK", "JIS",
+                 "KS", "SJIS", "UTF8", "EUC-TW", "EUC-JP"]
+    i = find_token(document.header, "\\inputencoding", 0)
+    if i == -1:
+        document.header.append("\\inputencoding auto")
+    else:
+        inputenc = get_value(document.header, "\\inputencoding", i)
+        if inputenc in encodings:
+            document.header[i] = "\\inputencoding default"
+    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
+
+    if document.language == "chinese-simplified" or \
+       document.language == "chinese-traditional" or \
+       document.language == "japanese" or document.language == "korean":
+        document.language = "english"
+        i = find_token(document.header, "\\language", 0)
+        if i != -1:
+            document.header[i] = "\\language english"
+
+
+def revert_preamble_listings_params(document):
+    " Revert preamble option \listings_params "
+    i = find_token(document.header, "\\listings_params", 0)
+    if i != -1:
+        document.preamble.append('\\usepackage{listings}')
+        document.preamble.append('\\lstset{%s}' % document.header[i].split()[1].strip('"'))
+        document.header.pop(i);
+
+
+def revert_listings_inset(document):
+    r''' Revert listings inset to \lstinline or \begin, \end lstlisting, translate 
+FROM
+
+\begin_inset 
+lstparams "language=Delphi"
+inline true
+status open
+
+\begin_layout Standard
+var i = 10;
+\end_layout
+
+\end_inset
+
+TO
+
+\begin_inset ERT
+status open
+\begin_layout Standard
+
+
+\backslash
+lstinline[language=Delphi]{var i = 10;}
+\end_layout
+
+\end_inset
+
+There can be an caption inset in this inset
+
+\begin_layout Standard
+\begin_inset Caption
+
+\begin_layout Standard
+before label
+\begin_inset LatexCommand label
+name "lst:caption"
+
+\end_inset
+
+after label
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+'''
+    i = 0
+    while True:
+        i = find_token(document.body, '\\begin_inset listings', i)
+        if i == -1:
+            break
+        else:
+            if not '\\usepackage{listings}' in document.preamble:
+                document.preamble.append('\\usepackage{listings}')
+        j = find_end_of_inset(document.body, i + 1)
+        if j == -1:
+            # this should not happen
+            break
+        inline = 'false'
+        params = ''
+        status = 'open'
+        # first three lines
+        for line in range(i + 1, i + 4):
+            if document.body[line].startswith('inline'):
+                inline = document.body[line].split()[1]
+            if document.body[line].startswith('lstparams'):
+                params = document.body[line].split()[1].strip('"')
+            if document.body[line].startswith('status'):
+                status = document.body[line].split()[1].strip()
+                k = line + 1
+        # caption?
+        caption = ''
+        label = ''
+        cap = find_token(document.body, '\\begin_inset Caption', i)
+        if cap != -1:
+            cap_end = find_end_of_inset(document.body, cap + 1)
+            if cap_end == -1:
+                # this should not happen
+                break
+            # label?
+            lbl = find_token(document.body, '\\begin_inset LatexCommand label', cap + 1)
+            if lbl != -1:
+                lbl_end = find_end_of_inset(document.body, lbl + 1)
+                if lbl_end == -1:
+                    # this should not happen
+                    break
+            else:
+                lbl = cap_end
+                lbl_end = cap_end
+            for line in document.body[lbl : lbl_end + 1]:
+                if line.startswith('name '):
+                    label = line.split()[1].strip('"')
+                    break
+            for line in document.body[cap : lbl ] + document.body[lbl_end + 1 : cap_end + 1]:
+                if not line.startswith('\\'):
+                    caption += line.strip()
+            k = cap_end + 1
+        inlinecode = ''
+        # looking for the oneline code for lstinline
+        inlinecode = document.body[find_end_of_layout(document.body, 
+            find_token(document.body,  '\\begin_layout %s' % document.default_layout, i + 1) +1 ) - 1]
+        if len(caption) > 0:
+            if len(params) == 0:
+                params = 'caption={%s}' % caption
+            else:
+                params += ',caption={%s}' % caption
+        if len(label) > 0:
+            if len(params) == 0:
+                params = 'label={%s}' % label
+            else:
+                params += ',label={%s}' % label
+        if len(params) > 0:
+            params = '[%s]' % params
+            params = params.replace('\\', '\\backslash\n')
+        if inline == 'true':
+            document.body[i:(j+1)] = [r'\begin_inset ERT',
+                                      'status %s' % status,
+                                      r'\begin_layout %s' % document.default_layout,
+                                      '', 
+                                      '',
+                                      r'\backslash',
+                                      'lstinline%s{%s}' % (params, inlinecode),
+                                      r'\end_layout',
+                                      '',
+                                      r'\end_inset']
+        else:
+            document.body[i: j+1] =  [r'\begin_inset ERT',
+                                      'status %s' % status,
+                                      '',
+                                      r'\begin_layout %s' % document.default_layout,
+                                      '',
+                                      '',
+                                      r'\backslash',
+                                      r'begin{lstlisting}%s' % params,
+                                      r'\end_layout'
+                                    ] + document.body[k : j - 1] + \
+                                     ['',
+                                      r'\begin_layout %s' % document.default_layout,
+                                      '',
+                                      r'\backslash',
+                                      'end{lstlisting}',
+                                      r'\end_layout',
+                                      '',
+                                      r'\end_inset']
+            
+
+def revert_include_listings(document):
+    r''' Revert lstinputlisting Include option , translate
+\begin_inset Include \lstinputlisting{file}[opt]
+preview false
+
+\end_inset
+
+TO
+
+\begin_inset ERT
+status open
+
+\begin_layout Standard
+
+
+\backslash
+lstinputlisting{file}[opt]
+\end_layout
+
+\end_inset
+    '''
+
+    i = 0
+    while True:
+        i = find_token(document.body, r'\begin_inset Include \lstinputlisting', i)
+        if i == -1:
+            break
+        else:
+            if not '\\usepackage{listings}' in document.preamble:
+                document.preamble.append('\\usepackage{listings}')
+        j = find_end_of_inset(document.body, i + 1)
+        if j == -1:
+            # this should not happen
+            break
+        # find command line lstinputlisting{file}[options]
+        cmd, file, option = '', '', ''
+        if re.match(r'\\(lstinputlisting){([.\w]*)}(.*)', document.body[i].split()[2]):
+            cmd, file, option = re.match(r'\\(lstinputlisting){([.\w]*)}(.*)', document.body[i].split()[2]).groups()            
+        option = option.replace('\\', '\\backslash\n')
+        document.body[i : j + 1] = [r'\begin_inset ERT',
+                                    'status open',
+                                    '',
+                                    r'\begin_layout %s' % document.default_layout,
+                                    '',
+                                    '',
+                                    r'\backslash',
+                                    '%s%s{%s}' % (cmd, option, file),
+                                    r'\end_layout',
+                                    '',
+                                    r'\end_inset']
+
+
+def revert_ext_font_sizes(document):
+    if document.backend != "latex": return
+    if not document.textclass.startswith("ext"): return
+
+    fontsize = get_value(document.header, '\\paperfontsize', 0)
+    if fontsize not in ('10', '11', '12'): return
+    fontsize += 'pt'
+
+    i = find_token(document.header, '\\paperfontsize', 0)
+    document.header[i] = '\\paperfontsize default'
+
+    i = find_token(document.header, '\\options', 0)
+    if i == -1:
+        i = find_token(document.header, '\\textclass', 0) + 1
+        document.header[i:i] = ['\\options %s' % fontsize]
+    else:
+        document.header[i] += ',%s' % fontsize
+
+
+def convert_ext_font_sizes(document):
+    if document.backend != "latex": return
+    if not document.textclass.startswith("ext"): return
+
+    fontsize = get_value(document.header, '\\paperfontsize', 0)
+    if fontsize != 'default': return
+
+    i = find_token(document.header, '\\options', 0)
+    if i == -1: return
+
+    options = get_value(document.header, '\\options', i)
+
+    fontsizes = '10pt', '11pt', '12pt'
+    for fs in fontsizes:
+        if options.find(fs) != -1:
+            break
+    else: # this else will only be attained if the for cycle had no match
+        return
+
+    options = options.split(',')
+    for j, opt in enumerate(options):
+        if opt in fontsizes:
+            fontsize = opt[:-2]
+            del options[j]
+            break
+    else:
+        return
+
+    k = find_token(document.header, '\\paperfontsize', 0)
+    document.header[k] = '\\paperfontsize %s' % fontsize
+
+    if options:
+        document.header[i] = '\\options %s' % ','.join(options)
+    else:
+        del document.header[i]
+
+
+def revert_separator_layout(document):
+    r'''Revert --Separator-- to a lyx note
+From
+
+\begin_layout --Separator--
+something
+\end_layout
+
+to
+
+\begin_layout Standard
+\begin_inset Note Note
+status open
+
+\begin_layout Standard
+Separate Evironment
+\end_layout
+
+\end_inset
+something
+
+\end_layout
+
+    '''
+
+    i = 0
+    while True:
+        i = find_token(document.body, r'\begin_layout --Separator--', i)
+        if i == -1:
+            break
+        j = find_end_of_layout(document.body, i + 1)
+        if j == -1:
+            # this should not happen
+            break
+        document.body[i : j + 1] = [r'\begin_layout %s' % document.default_layout,
+                                    r'\begin_inset Note Note',
+                                    'status open',
+                                    '',
+                                    r'\begin_layout %s' % document.default_layout,
+                                    'Separate Environment',
+                                    r'\end_layout',
+                                    '',
+                                    r'\end_inset'] + \
+                                    document.body[ i + 1 : j] + \
+                                    ['',
+                                    r'\end_layout'
+                                    ]
+
+
+def convert_arabic (document):
+    if document.language == "arabic":
+        document.language = "arabic_arabtex"
+        i = find_token(document.header, "\\language", 0)
+        if i != -1:
+            document.header[i] = "\\language arabic_arabtex"
+    i = 0
+    while i < len(document.body):
+        h = document.body[i].find("\lang arabic", 0, len(document.body[i]))
+        if (h != -1):
+            # change the language name
+            document.body[i] = '\lang arabic_arabtex'
+        i = i + 1
+
+
+def revert_arabic (document):
+    if document.language == "arabic_arabtex":
+        document.language = "arabic"
+        i = find_token(document.header, "\\language", 0)
+        if i != -1:
+            document.header[i] = "\\language arabic"
+    i = 0
+    while i < len(document.body):
+        h = document.body[i].find("\lang arabic_arabtex", 0, len(document.body[i]))
+        if (h != -1):
+            # change the language name
+            document.body[i] = '\lang arabic'
+        i = i + 1
+
+
+def read_unicodesymbols():
+    " Read the unicodesymbols list of unicode characters and corresponding commands."
+    pathname = os.path.abspath(os.path.dirname(sys.argv[0]))
+    fp = open(os.path.join(pathname.strip('lyx2lyx'), 'unicodesymbols'))
+    spec_chars = {}
+    for line in fp.readlines():
+        if line[0] != '#':
+            line=line.replace(' "',' ') # remove all quotation marks with spaces before
+            line=line.replace('" ',' ') # remove all quotation marks with spaces after
+            line=line.replace(r'\"','"') # replace \" by " (for characters with diaeresis)
+            try:
+                # flag1 and flag2 are preamble and other flags
+                [ucs4,command,flag1,flag2] =line.split(None,3)
+                spec_chars[unichr(eval(ucs4))] = [command, flag1, flag2]
+            except:
+                pass
+    fp.close()
+
+    return spec_chars
+
+
+def revert_unicode(document):
+    '''Transform unicode characters that can not be written using the
+document encoding to commands according to the unicodesymbols
+file. Characters that can not be replaced by commands are replaced by
+an replacement string.  Flags other than 'combined' are currently not
+implemented.'''
+
+    replacement_character = '???'
+    spec_chars = read_unicodesymbols()
+
+    # Define strings to start and end ERT and math insets
+    ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s\n\\backslash' % document.default_layout
+    ert_outro='\n\\end_layout\n\n\\end_inset\n'
+    math_intro='\n\\begin_inset Formula $'
+    math_outro='$\n\\end_inset'
+    # Find unicode characters and replace them
+    in_ert = False # flag set to 1 if in ERT inset
+    in_math = False # flag set to 1 if in math inset
+    insets = [] # list of active insets
+
+    # Go through the file to capture all combining characters
+    last_char = '' # to store the previous character
+
+    i = 0
+    while i < len(document.body):
+        line = document.body[i]
+        # Check for insets
+        if line.find('\\begin_inset') > -1:
+            # check which inset to start
+            if line.find('\\begin_inset ERT') > -1:
+                in_ert = True
+                insets.append('ert')
+            elif line.find('\\begin_inset Formula') > -1:
+                in_math = True
+                insets.append('math')
+            else:
+                insets.append('other')
+        if line.find('\\end_inset') > -1:
+            # check which inset to end
+            try:
+                cur_inset = insets.pop()
+                if cur_inset == 'ert':
+                    in_ert = False
+                elif cur_inset == 'math':
+                    in_math = False
+                else:
+                    pass # end of other inset
+            except:
+                pass # inset list was empty (for some reason)
+        
+        # Try to write the line
+        try:
+            # If all goes well the line is written here
+            dummy = line.encode(document.encoding)
+            last_char = line[-1]
+            i += 1
+        except:
+            # Error, some character(s) in the line need to be replaced
+            mod_line = u''
+            for character in line:
+                try:
+                    # Try to write the character
+                    dummy = character.encode(document.encoding)
+                    mod_line += character
+                    last_char = character
+                except:
+                    # Try to replace with ERT/math inset
+                    if spec_chars.has_key(character):
+                        command = spec_chars[character][0] # the command to replace unicode
+                        flag1 = spec_chars[character][1]
+                        flag2 = spec_chars[character][2]
+                        if flag1.find('combining') > -1 or flag2.find('combining') > -1:
+                            # We have a character that should be combined with the previous
+                            command += '{' + last_char + '}'
+                            # Remove the last character. Ignore if it is whitespace
+                            if len(last_char.rstrip()):
+                                # last_char was found and is not whitespace
+                                if mod_line:
+                                    mod_line = mod_line[:-1]
+                                else: # last_char belongs to the last line
+                                    document.body[i-1] = document.body[i-1][:-1]
+                            else:
+                                # The last character was replaced by a command. For now it is
+                                # ignored. This could be handled better.
+                                pass
+                        if command[0:2] == '\\\\':
+                            if command[2:12]=='ensuremath':
+                                if in_ert:
+                                    # math in ERT
+                                    command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash')
+                                    command = command.replace('}', '$\n')
+                                elif not in_math:
+                                    # add a math inset with the replacement character
+                                    command = command.replace('\\\\ensuremath{\\', math_intro)
+                                    command = command.replace('}', math_outro)
+                                else:
+                                    # we are already in a math inset
+                                    command = command.replace('\\\\ensuremath{\\', '')
+                                    command = command.replace('}', '')
+                            else:
+                                if in_math:
+                                    # avoid putting an ERT in a math; instead put command as text
+                                    command = command.replace('\\\\', '\mathrm{')
+                                    command = command + '}'
+                                elif not in_ert:
+                                    # add an ERT inset with the replacement character
+                                    command = command.replace('\\\\', ert_intro)
+                                    command = command + ert_outro
+                                else:
+                                    command = command.replace('\\\\', '\n\\backslash')
+                            last_char = '' # indicate that the character should not be removed
+                        mod_line += command
+                    else:
+                        # Replace with replacement string
+                        mod_line += replacement_character
+            document.body[i:i+1] = mod_line.split('\n')
+            i += len(mod_line.split('\n'))
+
+
  ##
  # Conversion hub
  #
@@ -1275,14 +2026,40 @@ convert = [[246, []],
             [256, []],
             [257, [convert_caption]],
             [258, [convert_lyxline]],
-           [259, [convert_accent, normalize_font_whitespace]],
+           [259, [convert_accent, normalize_font_whitespace_259]],
             [260, []],
             [261, [convert_changes]],
             [262, []],
             [263, [normalize_language_name]],
-           [264, [convert_cv_textclass]]]
-
-revert =  [[263, [revert_cv_textclass]],
+           [264, [convert_cv_textclass]],
+           [265, [convert_tableborder]],
+           [266, []],
+           [267, []],
+           [268, []],
+           [269, []],
+           [270, []],
+           [271, [convert_ext_font_sizes]],
+           [272, []],
+           [273, []],
+           [274, [normalize_font_whitespace_274]],
+           [275, [convert_graphics_rotation]],
+           [276, [convert_arabic]]
+          ]
+
+revert =  [
+           [275, [revert_arabic]],
+           [274, [revert_graphics_rotation]],
+           [273, []],
+           [272, [revert_separator_layout]],
+           [271, [revert_preamble_listings_params, revert_listings_inset, revert_include_listings]],
+           [270, [revert_ext_font_sizes]],
+           [269, [revert_beamer_alert, revert_beamer_structure]],
+           [268, [revert_preamble_listings_params, revert_listings_inset, revert_include_listings]],
+           [267, [revert_CJK]],
+           [266, [revert_utf8plain]],
+           [265, [revert_armenian]],
+           [264, [revert_tableborder]],
+           [263, [revert_cv_textclass]],
             [262, [revert_language_name]],
             [261, [revert_ascii]],
             [260, []],
@@ -1297,7 +2074,7 @@ revert =  [[263, [revert_cv_textclass]],
             [251, [revert_commandparams]],
             [250, [revert_cs_label]],
             [249, []],
-           [248, [revert_accent, revert_utf8]],
+           [248, [revert_accent, revert_utf8, revert_unicode]],
             [247, [revert_booktabs]],
             [246, [revert_font_settings]],
             [245, [revert_framed]]]