unicodesymbols: add some missing punctuation characters again

[lyx.git] / lib / lyx2lyx / lyx_1_5.py
diff --git a/lib/lyx2lyx/lyx_1_5.py b/lib/lyx2lyx/lyx_1_5.py

index d7cc7cbef9bd849fdbc9fb7495a75bd7614c32f7..898b0b90e634fb45d88278c2f58dc0cfb928dfa7 100644 (file)
--- a/lib/lyx2lyx/lyx_1_5.py
+++ b/lib/lyx2lyx/lyx_1_5.py
@@ -20,7 +20,9 @@
  """ Convert files to the file format generated by lyx 1.5"""
  
  import re
-from parser_tools import find_token, find_token_exact, find_tokens, find_end_of, get_value
+import unicodedata
+
+from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value
  from LyX import get_encoding
  
  
@@ -28,9 +30,13 @@ from LyX import get_encoding
  # Private helper functions
  
  def find_end_of_inset(lines, i):
-    " Find beginning of inset, where lines[i] is included."
+    " Find end of inset, where lines[i] is included."
      return find_end_of(lines, i, "\\begin_inset", "\\end_inset")
  
+def find_end_of_layout(lines, i):
+    " Find end of layout, where lines[i] is included."
+    return find_end_of(lines, i, "\\begin_layout", "\\end_layout")
+
  # End of helper functions
  ####################################################################
  
@@ -217,18 +223,85 @@ def revert_booktabs(document):
          i = i + 1
  
  
+def convert_multiencoding(document, forward):
+    """ Fix files with multiple encodings.
+Files with an inputencoding of "auto" or "default" and multiple languages
+where at least two languages have different default encodings are encoded
+in multiple encodings for file formats < 249. These files are incorrectly
+read and written (as if the whole file was in the encoding of the main
+language).
+This is not true for files written by CJK-LyX, they are always in the locale
+encoding.
+
+This function
+- converts from fake unicode values to true unicode if forward is true, and
+- converts from true unicode values to fake unicode if forward is false.
+document.encoding must be set to the old value (format 248) in both cases.
+
+We do this here and not in LyX.py because it is far easier to do the
+necessary parsing in modern formats than in ancient ones.
+"""
+    if document.cjk_encoding != '':
+        return
+    encoding_stack = [document.encoding]
+    lang_re = re.compile(r"^\\lang\s(\S+)")
+    if document.inputencoding == "auto" or document.inputencoding == "default":
+        for i in range(len(document.body)):
+            result = lang_re.match(document.body[i])
+            if result:
+                language = result.group(1)
+                if language == "default":
+                    document.warning("Resetting encoding from %s to %s." % (encoding_stack[-1], document.encoding), 3)
+                    encoding_stack[-1] = document.encoding
+                else:
+                    from lyx2lyx_lang import lang
+                    document.warning("Setting encoding from %s to %s." % (encoding_stack[-1], lang[language][3]), 3)
+                    encoding_stack[-1] = lang[language][3]
+            elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
+                document.warning("Adding nested encoding %s." % encoding_stack[-1], 3)
+                encoding_stack.append(encoding_stack[-1])
+            elif find_token(document.body, "\\end_layout", i, i + 1) == i:
+                document.warning("Removing nested encoding %s." % encoding_stack[-1], 3)
+                if len(encoding_stack) == 1:
+                    # Don't remove the document encoding from the stack
+                    document.warning("Malformed LyX document: Unexpected `\\end_layout'.")
+                else:
+                    del encoding_stack[-1]
+            if encoding_stack[-1] != document.encoding:
+                if forward:
+                    # This line has been incorrectly interpreted as if it was
+                    # encoded in 'encoding'.
+                    # Convert back to the 8bit string that was in the file.
+                    orig = document.body[i].encode(document.encoding)
+                    # Convert the 8bit string that was in the file to unicode
+                    # with the correct encoding.
+                    document.body[i] = orig.decode(encoding_stack[-1])
+                else:
+                    # Convert unicode to the 8bit string that will be written
+                    # to the file with the correct encoding.
+                    orig = document.body[i].encode(encoding_stack[-1])
+                    # Convert the 8bit string that will be written to the
+                    # file to fake unicode with the encoding that will later
+                    # be used when writing to the file.
+                    document.body[i] = orig.decode(document.encoding)
+
+
  def convert_utf8(document):
+    " Set document encoding to UTF-8. "
+    convert_multiencoding(document, True)
      document.encoding = "utf8"
  
  
  def revert_utf8(document):
+    " Set document encoding to the value corresponding to inputencoding. "
      i = find_token(document.header, "\\inputencoding", 0)
      if i == -1:
          document.header.append("\\inputencoding auto")
      elif get_value(document.header, "\\inputencoding", i) == "utf8":
          document.header[i] = "\\inputencoding auto"
      document.inputencoding = get_value(document.header, "\\inputencoding", 0)
-    document.encoding = get_encoding(document.language, document.inputencoding, 248)
+    document.encoding = get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)
+    convert_multiencoding(document, False)
  
  
  def revert_cs_label(document):
@@ -267,19 +340,24 @@ key "argument"
  
  This must be called after convert_commandparams.
  """
-    regex = re.compile(r'\S+\s*(\[[^\[\{]*\])?(\{[^}]*\})')
      i = 0
      while 1:
          i = find_token(document.body, "\\bibitem", i)
          if i == -1:
              break
-        match = re.match(regex, document.body[i])
-        option = match.group(1)
-        argument = match.group(2)
+        j = document.body[i].find('[') + 1
+        k = document.body[i].rfind(']')
+        if j == 0: # No optional argument found
+            option = None
+        else:
+            option = document.body[i][j:k]
+        j = document.body[i].rfind('{') + 1
+        k = document.body[i].rfind('}')
+        argument = document.body[i][j:k]
          lines = ['\\begin_inset LatexCommand bibitem']
          if option != None:
-            lines.append('label "%s"' % option[1:-1].replace('"', '\\"'))
-        lines.append('key "%s"' % argument[1:-1].replace('"', '\\"'))
+            lines.append('label "%s"' % option.replace('"', '\\"'))
+        lines.append('key "%s"' % argument.replace('"', '\\"'))
          lines.append('')
          lines.append('\\end_inset')
          document.body[i:i+1] = lines
@@ -350,9 +428,6 @@ def convert_commandparams(document):
      # \begin_inset LatexCommand bibitem was not the official version (see
      # convert_bibitem()), but could be read in, so we convert it here, too.
  
-    # FIXME: Handle things like \command[foo[bar]]{foo{bar}}
-    # we need a real parser here.
-    regex = re.compile(r'\\([^\[\{]+)(\[[^\[\{]*\])?(\[[^\[\{]*\])?(\{[^}]*\})?')
      i = 0
      while 1:
          i = find_token(document.body, "\\begin_inset LatexCommand", i)
@@ -384,11 +459,11 @@ def convert_commandparams(document):
                  if nestdepth == 0:
                      state = "WS"
                  else:
-                    --nestdepth
+                    nestdepth = nestdepth - 1
              if ((state == "OPTION" and c == '[') or
                  (state == "SECOPTION" and c == '[') or
                  (state == "CONTENT" and c == '{')):
-                ++nestdepth
+                nestdepth = nestdepth + 1
              if state == "CMDNAME":
                      name += c
              elif state == "OPTION":
@@ -584,6 +659,1004 @@ def revert_printnomenclature(document):
          document.preamble.append('\\makenomenclature')
  
  
+def convert_esint(document):
+    " Add \\use_esint setting to header. "
+    i = find_token(document.header, "\\cite_engine", 0)
+    if i == -1:
+        document.warning("Malformed LyX document: Missing `\\cite_engine'.")
+        return
+    # 0 is off, 1 is auto, 2 is on.
+    document.header.insert(i, '\\use_esint 0')
+
+
+def revert_esint(document):
+    " Remove \\use_esint setting from header. "
+    i = find_token(document.header, "\\use_esint", 0)
+    if i == -1:
+        document.warning("Malformed LyX document: Missing `\\use_esint'.")
+        return
+    use_esint = document.header[i].split()[1]
+    del document.header[i]
+    # 0 is off, 1 is auto, 2 is on.
+    if (use_esint == 2):
+        document.preamble.append('\\usepackage{esint}')
+
+
+def revert_clearpage(document):
+    " clearpage -> ERT "
+    i = 0
+    while 1:
+        i = find_token(document.body, "\\clearpage", i)
+        if i == -1:
+            break
+        document.body[i:i+1] =  ['\\begin_inset ERT',
+                                'status collapsed',
+                                '',
+                                '\\begin_layout %s' % document.default_layout,
+                                '',
+                                '',
+                                '\\backslash',
+                                'clearpage',
+                                '\\end_layout',
+                                '',
+                                '\\end_inset']
+    i = i + 1
+
+
+def revert_cleardoublepage(document):
+    " cleardoublepage -> ERT "
+    i = 0
+    while 1:
+        i = find_token(document.body, "\\cleardoublepage", i)
+        if i == -1:
+            break
+        document.body[i:i+1] =  ['\\begin_inset ERT',
+                                'status collapsed',
+                                '',
+                                '\\begin_layout %s' % document.default_layout,
+                                '',
+                                '',
+                                '\\backslash',
+                                'cleardoublepage',
+                                '\\end_layout',
+                                '',
+                                '\\end_inset']
+    i = i + 1
+
+
+def convert_lyxline(document):
+    " remove fontsize commands for \lyxline "
+    # The problematic is: The old \lyxline definition doesn't handle the fontsize
+    # to change the line thickness. The new definiton does this so that imported
+    # \lyxlines would have a different line thickness. The eventual fontsize command
+    # before \lyxline is therefore removed to get the same output.
+    fontsizes = ["tiny", "scriptsize", "footnotesize", "small", "normalsize",
+                 "large", "Large", "LARGE", "huge", "Huge"]
+    for n in range(0, len(fontsizes)):
+        i = 0
+        k = 0
+        while i < len(document.body):
+            i = find_token(document.body, "\\size " + fontsizes[n], i)
+            k = find_token(document.body, "\\lyxline", i)
+            # the corresponding fontsize command is always 2 lines before the \lyxline
+            if (i != -1 and k == i+2):
+                document.body[i:i+1] = []
+            else:
+                break
+        i = i + 1
+
+
+def revert_encodings(document):
+    " Set new encodings to auto. "
+    encodings = ["8859-6", "8859-8", "cp437", "cp437de", "cp850", "cp852",
+                 "cp855", "cp858", "cp862", "cp865", "cp866", "cp1250",
+                 "cp1252", "cp1256", "cp1257", "latin10", "pt254", "tis620-0"]
+    i = find_token(document.header, "\\inputencoding", 0)
+    if i == -1:
+        document.header.append("\\inputencoding auto")
+    else:
+        inputenc = get_value(document.header, "\\inputencoding", i)
+        if inputenc in encodings:
+            document.header[i] = "\\inputencoding auto"
+    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
+
+
+def convert_caption(document):
+    " Convert caption layouts to caption insets. "
+    i = 0
+    while 1:
+        i = find_token(document.body, "\\begin_layout Caption", i)
+        if i == -1:
+            return
+        j = find_end_of_layout(document.body, i)
+        if j == -1:
+            document.warning("Malformed LyX document: Missing `\\end_layout'.")
+            return
+
+        document.body[j:j] = ["\\end_layout", "", "\\end_inset", "", ""]
+        document.body[i:i+1] = ["\\begin_layout %s" % document.default_layout,
+                            "\\begin_inset Caption", "",
+                            "\\begin_layout %s" % document.default_layout]
+        i = i + 1
+
+
+def revert_caption(document):
+    " Convert caption insets to caption layouts. "
+    " This assumes that the text class has a caption style. "
+    i = 0
+    while 1:
+        i = find_token(document.body, "\\begin_inset Caption", i)
+        if i == -1:
+            return
+
+        # We either need to delete the previous \begin_layout line, or we
+        # need to end the previous layout if this inset is not in the first
+        # position of the paragraph.
+        layout_before = find_token_backwards(document.body, "\\begin_layout", i)
+        if layout_before == -1:
+            document.warning("Malformed LyX document: Missing `\\begin_layout'.")
+            return
+        layout_line = document.body[layout_before]
+        del_layout_before = True
+        l = layout_before + 1
+        while l < i:
+            if document.body[l] != "":
+                del_layout_before = False
+                break
+            l = l + 1
+        if del_layout_before:
+            del document.body[layout_before:i]
+            i = layout_before
+        else:
+            document.body[i:i] = ["\\end_layout", ""]
+            i = i + 2
+
+        # Find start of layout in the inset and end of inset
+        j = find_token(document.body, "\\begin_layout", i)
+        if j == -1:
+            document.warning("Malformed LyX document: Missing `\\begin_layout'.")
+            return
+        k = find_end_of_inset(document.body, i)
+        if k == -1:
+            document.warning("Malformed LyX document: Missing `\\end_inset'.")
+            return
+
+        # We either need to delete the following \end_layout line, or we need
+        # to restart the old layout if this inset is not at the paragraph end.
+        layout_after = find_token(document.body, "\\end_layout", k)
+        if layout_after == -1:
+            document.warning("Malformed LyX document: Missing `\\end_layout'.")
+            return
+        del_layout_after = True
+        l = k + 1
+        while l < layout_after:
+            if document.body[l] != "":
+                del_layout_after = False
+                break
+            l = l + 1
+        if del_layout_after:
+            del document.body[k+1:layout_after+1]
+        else:
+            document.body[k+1:k+1] = [layout_line, ""]
+
+        # delete \begin_layout and \end_inset and replace \begin_inset with
+        # "\begin_layout Caption". This works because we can only have one
+        # paragraph in the caption inset: The old \end_layout will be recycled.
+        del document.body[k]
+        if document.body[k] == "":
+            del document.body[k]
+        del document.body[j]
+        if document.body[j] == "":
+            del document.body[j]
+        document.body[i] = "\\begin_layout Caption"
+        if document.body[i+1] == "":
+            del document.body[i+1]
+        i = i + 1
+
+
+# Accents of InsetLaTeXAccent
+accent_map = {
+    "`" : u'\u0300', # grave
+    "'" : u'\u0301', # acute
+    "^" : u'\u0302', # circumflex
+    "~" : u'\u0303', # tilde
+    "=" : u'\u0304', # macron
+    "u" : u'\u0306', # breve
+    "." : u'\u0307', # dot above
+    "\"": u'\u0308', # diaresis
+    "r" : u'\u030a', # ring above
+    "H" : u'\u030b', # double acute
+    "v" : u'\u030c', # caron
+    "b" : u'\u0320', # minus sign below
+    "d" : u'\u0323', # dot below
+    "c" : u'\u0327', # cedilla
+    "k" : u'\u0328', # ogonek
+    "t" : u'\u0361'  # tie. This is special: It spans two characters, but
+                     # only one is given as argument, so we don't need to
+                     # treat it differently.
+}
+
+
+# special accents of InsetLaTeXAccent without argument
+special_accent_map = {
+    'i' : u'\u0131', # dotless i
+    'j' : u'\u0237', # dotless j
+    'l' : u'\u0142', # l with stroke
+    'L' : u'\u0141'  # L with stroke
+}
+
+
+# special accent arguments of InsetLaTeXAccent
+accented_map = {
+    '\\i' : u'\u0131', # dotless i
+    '\\j' : u'\u0237'  # dotless j
+}
+
+
+def _convert_accent(accent, accented_char):
+    type = accent
+    char = accented_char
+    if char == '':
+        if type in special_accent_map:
+            return special_accent_map[type]
+        # a missing char is treated as space by LyX
+        char = ' '
+    elif type == 'q' and char in ['t', 'd', 'l', 'L']:
+        # Special caron, only used with t, d, l and L.
+        # It is not in the map because we convert it to the same unicode
+        # character as the normal caron: \q{} is only defined if babel with
+        # the czech or slovak language is used, and the normal caron
+        # produces the correct output if the T1 font encoding is used.
+        # For the same reason we never convert to \q{} in the other direction.
+        type = 'v'
+    elif char in accented_map:
+        char = accented_map[char]
+    elif (len(char) > 1):
+        # We can only convert accents on a single char
+        return ''
+    a = accent_map.get(type)
+    if a:
+        return unicodedata.normalize("NFKC", "%s%s" % (char, a))
+    return ''
+
+
+def convert_ertbackslash(body, i, ert, default_layout):
+    r""" -------------------------------------------------------------------------------------------
+    Convert backslashes and '\n' into valid ERT code, append the converted
+    text to body[i] and return the (maybe incremented) line index i"""
+
+    for c in ert:
+        if c == '\\':
+            body[i] = body[i] + '\\backslash '
+            i = i + 1
+            body.insert(i, '')
+        elif c == '\n':
+            body[i+1:i+1] = ['\\end_layout', '', '\\begin_layout %s' % default_layout, '']
+            i = i + 4
+        else:
+            body[i] = body[i] + c
+    return i
+
+
+def convert_accent(document):
+    # The following forms are supported by LyX:
+    # '\i \"{a}' (standard form, as written by LyX)
+    # '\i \"{}' (standard form, as written by LyX if the accented char is a space)
+    # '\i \"{ }' (also accepted if the accented char is a space)
+    # '\i \" a'  (also accepted)
+    # '\i \"'    (also accepted)
+    re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$')
+    re_contents = re.compile(r'^([^\s{]+)(.*)$')
+    re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$')
+    i = 0
+    while 1:
+        i = find_re(document.body, re_wholeinset, i)
+        if i == -1:
+            return
+        match = re_wholeinset.match(document.body[i])
+        prefix = match.group(1)
+        contents = match.group(3).strip()
+        match = re_contents.match(contents)
+        if match:
+            # Strip first char (always \)
+            accent = match.group(1)[1:]
+            accented_contents = match.group(2).strip()
+            match = re_accentedcontents.match(accented_contents)
+            accented_char = match.group(1)
+            converted = _convert_accent(accent, accented_char)
+            if converted == '':
+                # Normalize contents
+                contents = '%s{%s}' % (accent, accented_char),
+            else:
+                document.body[i] = '%s%s' % (prefix, converted)
+                i += 1
+                continue
+        document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents)
+        document.body[i] = prefix
+        document.body[i+1:i+1] = ['\\begin_inset ERT',
+                                  'status collapsed',
+                                  '',
+                                  '\\begin_layout %s' % document.default_layout,
+                                  '',
+                                  '',
+                                  '']
+        i = convert_ertbackslash(document.body, i + 7,
+                                 '\\%s' % contents,
+                                 document.default_layout)
+        document.body[i+1:i+1] = ['\\end_layout',
+                                  '',
+                                  '\\end_inset']
+        i += 3
+
+
+def revert_accent(document):
+    inverse_accent_map = {}
+    for k in accent_map:
+        inverse_accent_map[accent_map[k]] = k
+    inverse_special_accent_map = {}
+    for k in special_accent_map:
+        inverse_special_accent_map[special_accent_map[k]] = k
+    inverse_accented_map = {}
+    for k in accented_map:
+        inverse_accented_map[accented_map[k]] = k
+
+    # Since LyX may insert a line break within a word we must combine all
+    # words before unicode normalization.
+    # We do this only if the next line starts with an accent, otherwise we
+    # would create things like '\begin_inset ERTstatus'.
+    numberoflines = len(document.body)
+    for i in range(numberoflines-1):
+        if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
+            continue
+        if (document.body[i+1][0] in inverse_accent_map):
+            # the last character of this line and the first of the next line
+            # form probably a surrogate pair.
+            while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '):
+                document.body[i] += document.body[i+1][0]
+                document.body[i+1] = document.body[i+1][1:]
+
+    # Normalize to "Normal form D" (NFD, also known as canonical decomposition).
+    # This is needed to catch all accented characters.
+    for i in range(numberoflines):
+        # Unfortunately we have a mixture of unicode strings and plain strings,
+        # because we never use u'xxx' for string literals, but 'xxx'.
+        # Therefore we may have to try two times to normalize the data.
+        try:
+            document.body[i] = unicodedata.normalize("NFKD", document.body[i])
+        except TypeError:
+            document.body[i] = unicodedata.normalize("NFKD", unicode(document.body[i], 'utf-8'))
+
+    # Replace accented characters with InsetLaTeXAccent
+    # Do not convert characters that can be represented in the chosen
+    # encoding.
+    encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
+    lang_re = re.compile(r"^\\lang\s(\S+)")
+    for i in range(len(document.body)):
+
+        if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '':
+            # Track the encoding of the current line
+            result = lang_re.match(document.body[i])
+            if result:
+                language = result.group(1)
+                if language == "default":
+                    encoding_stack[-1] = document.encoding
+                else:
+                    from lyx2lyx_lang import lang
+                    encoding_stack[-1] = lang[language][3]
+                continue
+            elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
+                encoding_stack.append(encoding_stack[-1])
+                continue
+            elif find_token(document.body, "\\end_layout", i, i + 1) == i:
+                del encoding_stack[-1]
+                continue
+
+        for j in range(len(document.body[i])):
+            # dotless i and dotless j are both in special_accent_map and can
+            # occur as an accented character, so we need to test that the
+            # following character is no accent
+            if (document.body[i][j] in inverse_special_accent_map and
+                (j == len(document.body[i]) - 1 or document.body[i][j+1] not in inverse_accent_map)):
+                accent = document.body[i][j]
+                try:
+                    dummy = accent.encode(encoding_stack[-1])
+                except UnicodeEncodeError:
+                    # Insert the rest of the line as new line
+                    if j < len(document.body[i]) - 1:
+                        document.body[i+1:i+1] = document.body[i][j+1:]
+                    # Delete the accented character
+                    if j > 0:
+                        document.body[i] = document.body[i][:j-1]
+                    else:
+                        document.body[i] = u''
+                    # Finally add the InsetLaTeXAccent
+                    document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
+                    break
+            elif j > 0 and document.body[i][j] in inverse_accent_map:
+                accented_char = document.body[i][j-1]
+                if accented_char == ' ':
+                    # Conform to LyX output
+                    accented_char = ''
+                elif accented_char in inverse_accented_map:
+                    accented_char = inverse_accented_map[accented_char]
+                accent = document.body[i][j]
+                try:
+                    dummy = unicodedata.normalize("NFKC", accented_char + accent).encode(encoding_stack[-1])
+                except UnicodeEncodeError:
+                    # Insert the rest of the line as new line
+                    if j < len(document.body[i]) - 1:
+                        document.body[i+1:i+1] = document.body[i][j+1:]
+                    # Delete the accented characters
+                    if j > 1:
+                        document.body[i] = document.body[i][:j-2]
+                    else:
+                        document.body[i] = u''
+                    # Finally add the InsetLaTeXAccent
+                    document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
+                    break
+    # Normalize to "Normal form C" (NFC, pre-composed characters) again
+    for i in range(numberoflines):
+        document.body[i] = unicodedata.normalize("NFKC", document.body[i])
+
+
+def normalize_font_whitespace(document):
+    """ Before format 259 the font changes were ignored if a
+    whitespace was the first or last character in the sequence, this function
+    transfers the whitespace outside."""
+
+    if document.backend != "latex":
+        return
+
+    lines = document.body
+
+    char_properties = {"\\series": "default",
+                       "\\emph": "default",
+                       "\\color": "none",
+                       "\\shape": "default",
+                       "\\bar": "default",
+                       "\\family": "default"}
+    changes = {}
+
+    i = 0
+    while i < len(lines):
+        words = lines[i].split()
+
+        if len(words) > 0 and words[0] == "\\begin_layout":
+            # a new paragraph resets all font changes
+            changes.clear()
+
+        elif len(words) > 1 and words[0] in char_properties.keys():
+            # we have a font change
+            if char_properties[words[0]] == words[1]:
+                # property gets reset
+                if words[0] in changes.keys():
+                    del changes[words[0]]
+                defaultproperty = True
+            else:
+                # property gets set
+                changes[words[0]] = words[1]
+                defaultproperty = False
+
+            # We need to explicitly reset all changed properties if we find
+            # a space below, because LyX 1.4 would output the space after
+            # closing the previous change and before starting the new one,
+            # and closing a font change means to close all properties, not
+            # just the changed one.
+
+            if lines[i-1] and lines[i-1][-1] == " ":
+                lines[i-1] = lines[i-1][:-1]
+                # a space before the font change
+                added_lines = [" "]
+                for k in changes.keys():
+                    # exclude property k because that is already in lines[i]
+                    if k != words[0]:
+                        added_lines[1:1] = ["%s %s" % (k, changes[k])]
+                for k in changes.keys():
+                    # exclude property k because that must be added below anyway
+                    if k != words[0]:
+                        added_lines[0:0] = ["%s %s" % (k, char_properties[k])]
+                if defaultproperty:
+                    # Property is reset in lines[i], so add the new stuff afterwards
+                    lines[i+1:i+1] = added_lines
+                else:
+                    # Reset property for the space
+                    added_lines[0:0] = ["%s %s" % (words[0], char_properties[words[0]])]
+                    lines[i:i] = added_lines
+                i = i + len(added_lines)
+
+            elif lines[i+1] and lines[i+1][0] == " " and (len(changes) > 0 or not defaultproperty):
+                # a space after the font change
+                if (lines[i+1] == " " and lines[i+2]):
+                    next_words = lines[i+2].split()
+                    if len(next_words) > 0 and next_words[0] == words[0]:
+                        # a single blank with a property different from the
+                        # previous and the next line must not be changed
+                        i = i + 2
+                        continue
+                lines[i+1] = lines[i+1][1:]
+                added_lines = [" "]
+                for k in changes.keys():
+                    # exclude property k because that is already in lines[i]
+                    if k != words[0]:
+                        added_lines[1:1] = ["%s %s" % (k, changes[k])]
+                for k in changes.keys():
+                    # exclude property k because that must be added below anyway
+                    if k != words[0]:
+                        added_lines[0:0] = ["%s %s" % (k, char_properties[k])]
+                # Reset property for the space
+                added_lines[0:0] = ["%s %s" % (words[0], char_properties[words[0]])]
+                lines[i:i] = added_lines
+                i = i + len(added_lines)
+
+        i = i + 1
+
+
+def revert_utf8x(document):
+    " Set utf8x encoding to utf8. "
+    i = find_token(document.header, "\\inputencoding", 0)
+    if i == -1:
+        document.header.append("\\inputencoding auto")
+    else:
+        inputenc = get_value(document.header, "\\inputencoding", i)
+        if inputenc == "utf8x":
+            document.header[i] = "\\inputencoding utf8"
+    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
+
+
+def revert_utf8plain(document):
+    " Set utf8plain encoding to utf8. "
+    i = find_token(document.header, "\\inputencoding", 0)
+    if i == -1:
+        document.header.append("\\inputencoding auto")
+    else:
+        inputenc = get_value(document.header, "\\inputencoding", i)
+        if inputenc == "utf8-plain":
+            document.header[i] = "\\inputencoding utf8"
+    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
+
+
+def revert_beamer_alert(document):
+    " Revert beamer's \\alert inset back to ERT. "
+    i = 0
+    while 1:
+        i = find_token(document.body, "\\begin_inset CharStyle Alert", i)
+        if i == -1:
+            return
+        document.body[i] = "\\begin_inset ERT"
+        i = i + 1
+        while 1:
+            if (document.body[i][:13] == "\\begin_layout"):
+                # Insert the \alert command
+                document.body[i + 1] = "\\alert{" + document.body[i + 1] + '}'
+                break
+            i = i + 1
+
+        i = i + 1
+
+
+def revert_beamer_structure(document):
+    " Revert beamer's \\structure inset back to ERT. "
+    i = 0
+    while 1:
+        i = find_token(document.body, "\\begin_inset CharStyle Structure", i)
+        if i == -1:
+            return
+        document.body[i] = "\\begin_inset ERT"
+        i = i + 1
+        while 1:
+            if (document.body[i][:13] == "\\begin_layout"):
+                document.body[i + 1] = "\\structure{" + document.body[i + 1] + '}'
+                break
+            i = i + 1
+
+        i = i + 1
+
+
+def convert_changes(document):
+    " Switch output_changes off if tracking_changes is off. "
+    i = find_token(document.header, '\\tracking_changes', 0)
+    if i == -1:
+        document.warning("Malformed lyx document: Missing '\\tracking_changes'.")
+        return
+    j = find_token(document.header, '\\output_changes', 0)
+    if j == -1:
+        document.warning("Malformed lyx document: Missing '\\output_changes'.")
+        return
+    tracking_changes = get_value(document.header, "\\tracking_changes", i)
+    output_changes = get_value(document.header, "\\output_changes", j)
+    if tracking_changes == "false" and output_changes == "true":
+        document.header[j] = "\\output_changes false"
+
+
+def revert_ascii(document):
+    " Set ascii encoding to auto. "
+    i = find_token(document.header, "\\inputencoding", 0)
+    if i == -1:
+        document.header.append("\\inputencoding auto")
+    else:
+        inputenc = get_value(document.header, "\\inputencoding", i)
+        if inputenc == "ascii":
+            document.header[i] = "\\inputencoding auto"
+    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
+
+
+def normalize_language_name(document):
+    lang = { "brazil": "brazilian",
+             "portuges": "portuguese"}
+
+    if document.language in lang:
+        document.language = lang[document.language]
+        i = find_token(document.header, "\\language", 0)
+        document.header[i] = "\\language %s" % document.language
+
+
+def revert_language_name(document):
+    lang = { "brazilian": "brazil",
+             "portuguese": "portuges"}
+
+    if document.language in lang:
+        document.language = lang[document.language]
+        i = find_token(document.header, "\\language", 0)
+        document.header[i] = "\\language %s" % document.language
+
+#
+#  \textclass cv -> \textclass simplecv
+def convert_cv_textclass(document):
+    if document.textclass == "cv":
+        document.textclass = "simplecv"
+
+
+def revert_cv_textclass(document):
+    if document.textclass == "simplecv":
+        document.textclass = "cv"
+
+
+def convert_tableborder(document):
+    # The problematic is: LyX double the table cell border as it ignores the "|" character in
+    # the cell arguments. A fix takes care of this and therefore the "|" has to be removed
+    i = 0
+    while i < len(document.body):
+        h = document.body[i].find("leftline=\"true\"", 0, len(document.body[i]))
+        k = document.body[i].find("|>{", 0, len(document.body[i]))
+        # the two tokens have to be in one line
+        if (h != -1 and k != -1):
+            # delete the "|"
+            document.body[i] = document.body[i][:k] + document.body[i][k+1:len(document.body[i])-1]
+        i = i + 1
+
+
+def revert_tableborder(document):
+    i = 0
+    while i < len(document.body):
+        h = document.body[i].find("leftline=\"true\"", 0, len(document.body[i]))
+        k = document.body[i].find(">{", 0, len(document.body[i]))
+        # the two tokens have to be in one line
+        if (h != -1 and k != -1):
+            # add the "|"
+            document.body[i] = document.body[i][:k] + '|' + document.body[i][k:]
+        i = i + 1
+
+
+def revert_armenian(document):
+    
+    # set inputencoding from armscii8 to auto 
+    if document.inputencoding == "armscii8":
+        i = find_token(document.header, "\\inputencoding", 0)
+        if i != -1:
+            document.header[i] = "\\inputencoding auto"
+    # check if preamble exists, if not k is set to -1 
+    i = 0
+    k = -1
+    while i < len(document.preamble):
+        if k == -1:
+            k = document.preamble[i].find("\\", 0, len(document.preamble[i]))
+        if k == -1:
+            k = document.preamble[i].find("%", 0, len(document.preamble[i]))
+        i = i + 1
+    # add the entry \usepackage{armtex} to the document preamble
+    if document.language == "armenian":
+        # set the armtex entry as the first preamble line
+        if k != -1:
+            document.preamble[0:0] = ["\\usepackage{armtex}"]
+        # create the preamble when it doesn't exist
+        else:
+            document.preamble.append('\\usepackage{armtex}')
+    # Set document language from armenian to english 
+    if document.language == "armenian":
+        document.language = "english"
+        i = find_token(document.header, "\\language", 0)
+        if i != -1:
+            document.header[i] = "\\language english"
+
+
+def revert_CJK(document):
+    " Set CJK encodings to default and languages chinese, japanese and korean to english. "
+    encodings = ["Bg5", "Bg5+", "GB", "GBt", "GBK", "JIS",
+                 "KS", "SJIS", "UTF8", "EUC-TW", "EUC-JP"]
+    i = find_token(document.header, "\\inputencoding", 0)
+    if i == -1:
+        document.header.append("\\inputencoding auto")
+    else:
+        inputenc = get_value(document.header, "\\inputencoding", i)
+        if inputenc in encodings:
+            document.header[i] = "\\inputencoding default"
+    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
+
+    if document.language == "chinese-simplified" or \
+       document.language == "chinese-traditional" or \
+       document.language == "japanese" or document.language == "korean":
+        document.language = "english"
+        i = find_token(document.header, "\\language", 0)
+        if i != -1:
+            document.header[i] = "\\language english"
+
+
+def revert_preamble_listings_params(document):
+    " Revert preamble option \listings_params "
+    i = find_token(document.header, "\\listings_params", 0)
+    if i != -1:
+        document.preamble.append('\\usepackage{listings}')
+        document.preamble.append('\\lstset{%s}' % document.header[i].split()[1].strip('"'))
+        document.header.pop(i);
+
+
+def revert_listings_inset(document):
+    r''' Revert listings inset to \lstinline or \begin, \end lstlisting, translate 
+FROM
+
+\begin_inset 
+lstparams "language=Delphi"
+inline true
+status open
+
+\begin_layout Standard
+var i = 10;
+\end_layout
+
+\end_inset
+
+TO
+
+\begin_inset ERT
+status open
+\begin_layout Standard
+
+
+\backslash
+lstinline[language=Delphi]{var i = 10;}
+\end_layout
+
+\end_inset
+
+There can be an caption inset in this inset
+
+\begin_layout Standard
+\begin_inset Caption
+
+\begin_layout Standard
+before label
+\begin_inset LatexCommand label
+name "lst:caption"
+
+\end_inset
+
+after label
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+'''
+    i = 0
+    while True:
+        i = find_token(document.body, '\\begin_inset listings', i)
+        if i == -1:
+            break
+        else:
+            if not '\\usepackage{listings}' in document.preamble:
+                document.preamble.append('\\usepackage{listings}')
+        j = find_end_of_inset(document.body, i + 1)
+        if j == -1:
+            # this should not happen
+            break
+        inline = 'false'
+        params = ''
+        status = 'open'
+        # first three lines
+        for line in range(i + 1, i + 4):
+            if document.body[line].startswith('inline'):
+                inline = document.body[line].split()[1]
+            if document.body[line].startswith('lstparams'):
+                params = document.body[line].split()[1].strip('"')
+            if document.body[line].startswith('status'):
+                status = document.body[line].split()[1].strip()
+                k = line + 1
+        # caption?
+        caption = ''
+        label = ''
+        cap = find_token(document.body, '\\begin_inset Caption', i)
+        if cap != -1:
+            cap_end = find_end_of_inset(document.body, cap + 1)
+            if cap_end == -1:
+                # this should not happen
+                break
+            # label?
+            lbl = find_token(document.body, '\\begin_inset LatexCommand label', cap + 1)
+            if lbl != -1:
+                lbl_end = find_end_of_inset(document.body, lbl + 1)
+                if lbl_end == -1:
+                    # this should not happen
+                    break
+            else:
+                lbl = cap_end
+                lbl_end = cap_end
+            for line in document.body[lbl : lbl_end + 1]:
+                if line.startswith('name '):
+                    label = line.split()[1].strip('"')
+                    break
+            for line in document.body[cap : lbl ] + document.body[lbl_end + 1 : cap_end + 1]:
+                if not line.startswith('\\'):
+                    caption += line.strip()
+            k = cap_end + 1
+        inlinecode = ''
+        # looking for the oneline code for lstinline
+        inlinecode = document.body[find_end_of_layout(document.body, 
+            find_token(document.body, '\\begin_layout Standard', i + 1) +1 ) - 1]
+        if len(caption) > 0:
+            if len(params) == 0:
+                params = 'caption={%s}' % caption
+            else:
+                params += ',caption={%s}' % caption
+        if len(label) > 0:
+            if len(params) == 0:
+                params = 'label={%s}' % label
+            else:
+                params += ',label={%s}' % label
+        if len(params) > 0:
+            params = '[%s]' % params
+            params = params.replace('\\', '\\backslash\n')
+        if inline == 'true':
+            document.body[i:(j+1)] = [r'\begin_inset ERT',
+                                      'status %s' % status,
+                                      r'\begin_layout Standard',
+                                      '', 
+                                      '',
+                                      r'\backslash',
+                                      'lstinline%s{%s}' % (params, inlinecode),
+                                      r'\end_layout',
+                                      '',
+                                      r'\end_inset']
+        else:
+            document.body[i: j+1] =  [r'\begin_inset ERT',
+                                      'status %s' % status,
+                                      '',
+                                      r'\begin_layout Standard',
+                                      '',
+                                      '',
+                                      r'\backslash',
+                                      r'begin{lstlisting}%s' % params,
+                                      r'\end_layout'
+                                    ] + document.body[k : j - 1] + \
+                                     ['',
+                                      r'\begin_layout Standard',
+                                      '',
+                                      r'\backslash',
+                                      'end{lstlisting}',
+                                      r'\end_layout',
+                                      '',
+                                      r'\end_inset']
+            
+
+def revert_include_listings(document):
+    r''' Revert lstinputlisting Include option , translate
+\begin_inset Include \lstinputlisting{file}[opt]
+preview false
+
+\end_inset
+
+TO
+
+\begin_inset ERT
+status open
+
+\begin_layout Standard
+
+
+\backslash
+lstinputlisting{file}[opt]
+\end_layout
+
+\end_inset
+    '''
+
+    i = 0
+    while True:
+        i = find_token(document.body, r'\begin_inset Include \lstinputlisting', i)
+        if i == -1:
+            break
+        else:
+            if not '\\usepackage{listings}' in document.preamble:
+                document.preamble.append('\\usepackage{listings}')
+        j = find_end_of_inset(document.body, i + 1)
+        if j == -1:
+            # this should not happen
+            break
+        # find command line lstinputlisting{file}[options]
+        cmd, file, option = '', '', ''
+        if re.match(r'\\(lstinputlisting){([.\w]*)}(.*)', document.body[i].split()[2]):
+            cmd, file, option = re.match(r'\\(lstinputlisting){([.\w]*)}(.*)', document.body[i].split()[2]).groups()            
+        option = option.replace('\\', '\\backslash\n')
+        document.body[i : j + 1] = [r'\begin_inset ERT',
+                                    'status open',
+                                    '',
+                                    r'\begin_layout Standard',
+                                    '',
+                                    '',
+                                    r'\backslash',
+                                    '%s%s{%s}' % (cmd, option, file),
+                                    r'\end_layout',
+                                    '',
+                                    r'\end_inset']
+
+
+def revert_ext_font_sizes(document):
+    if document.backend != "latex": return
+    if not document.textclass.startswith("ext"): return
+
+    fontsize = get_value(document.header, '\\paperfontsize', 0)
+    if fontsize not in ('10', '11', '12'): return
+    fontsize += 'pt'
+
+    i = find_token(document.header, '\\paperfontsize', 0)
+    document.header[i] = '\\paperfontsize default'
+
+    i = find_token(document.header, '\\options', 0)
+    if i == -1:
+        i = find_token(document.header, '\\textclass', 0) + 1
+        document.header[i:i] = ['\\options %s' % fontsize]
+    else:
+        document.header[i] += ',%s' % fontsize
+
+
+def convert_ext_font_sizes(document):
+    if document.backend != "latex": return
+    if not document.textclass.startswith("ext"): return
+
+    fontsize = get_value(document.header, '\\paperfontsize', 0)
+    if fontsize != 'default': return
+
+    i = find_token(document.header, '\\options', 0)
+    if i == -1: return
+
+    options = get_value(document.header, '\\options', i)
+
+    fontsizes = '10pt', '11pt', '12pt'
+    for fs in fontsizes:
+        if options.find(fs) != -1:
+            break
+    else: # this else will only be attained if the for cycle had no match
+        return
+
+    options = options.split(',')
+    for j, opt in enumerate(options):
+        if opt in fontsizes:
+            fontsize = opt[:-2]
+            del options[j]
+            break
+    else:
+        return
+
+    k = find_token(document.header, '\\paperfontsize', 0)
+    document.header[k] = '\\paperfontsize %s' % fontsize
+
+    if options:
+        document.header[i] = '\\options %s' % ','.join(options)
+    else:
+        del document.header[i]
+
+
  ##
  # Conversion hub
  #
@@ -596,13 +1669,51 @@ convert = [[246, []],
             [250, []],
             [251, []],
             [252, [convert_commandparams, convert_bibitem]],
-           [253, []]]
+           [253, []],
+           [254, [convert_esint]],
+           [255, []],
+           [256, []],
+           [257, [convert_caption]],
+           [258, [convert_lyxline]],
+           [259, [convert_accent, normalize_font_whitespace]],
+           [260, []],
+           [261, [convert_changes]],
+           [262, []],
+           [263, [normalize_language_name]],
+           [264, [convert_cv_textclass]],
+           [265, [convert_tableborder]],
+           [266, []],
+           [267, []],
+           [268, []],
+           [269, []],
+           [270, []],
+           [271, [convert_ext_font_sizes]]
+          ]
  
-revert =  [[252, [revert_nomenclature, revert_printnomenclature]],
+revert =  [
+           [270, [revert_ext_font_sizes]],
+           [269, [revert_beamer_alert, revert_beamer_structure]],
+           [268, [revert_preamble_listings_params, revert_listings_inset, revert_include_listings]],
+           [267, [revert_CJK]],
+           [266, [revert_utf8plain]],
+           [265, [revert_armenian]],
+           [264, [revert_tableborder]],
+           [263, [revert_cv_textclass]],
+           [262, [revert_language_name]],
+           [261, [revert_ascii]],
+           [260, []],
+           [259, [revert_utf8x]],
+           [258, []],
+           [257, []],
+           [256, [revert_caption]],
+           [255, [revert_encodings]],
+           [254, [revert_clearpage, revert_cleardoublepage]],
+           [253, [revert_esint]],
+           [252, [revert_nomenclature, revert_printnomenclature]],
             [251, [revert_commandparams]],
             [250, [revert_cs_label]],
             [249, []],
-           [248, [revert_utf8]],
+           [248, [revert_accent, revert_utf8]],
             [247, [revert_booktabs]],
             [246, [revert_font_settings]],
             [245, [revert_framed]]]
@@ -611,3 +1722,4 @@ revert =  [[252, [revert_nomenclature, revert_printnomenclature]],
  if __name__ == "__main__":
      pass
  
+