]> git.lyx.org Git - lyx.git/blobdiff - lib/lyx2lyx/lyx_1_5.py
Allow utf8x \inputencoding
[lyx.git] / lib / lyx2lyx / lyx_1_5.py
index 70ec731ee691e5c256fcc7e0aad415769d91a3c2..a57282b2561d669f6c60844636e9a0f039e58f33 100644 (file)
@@ -20,7 +20,9 @@
 """ Convert files to the file format generated by lyx 1.5"""
 
 import re
-from parser_tools import find_token, find_token_exact, find_tokens, find_end_of, get_value
+import unicodedata
+
+from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value
 from LyX import get_encoding
 
 
@@ -28,9 +30,13 @@ from LyX import get_encoding
 # Private helper functions
 
def find_end_of_inset(lines, i):
    " Find end of inset, where lines[i] is included."
    # Delegate to the generic begin/end matcher with the inset delimiters.
    begin_token, end_token = "\\begin_inset", "\\end_inset"
    return find_end_of(lines, i, begin_token, end_token)
 
def find_end_of_layout(lines, i):
    " Find end of layout, where lines[i] is included."
    # Delegate to the generic begin/end matcher with the layout delimiters.
    begin_token, end_token = "\\begin_layout", "\\end_layout"
    return find_end_of(lines, i, begin_token, end_token)
 # End of helper functions
 ####################################################################
 
@@ -217,18 +223,81 @@ def revert_booktabs(document):
         i = i + 1
 
 
def convert_multiencoding(document, forward):
    """ Fix files with multiple encodings.
Files with an inputencoding of "auto" or "default" and multiple languages
where at least two languages have different default encodings are encoded
in multiple encodings for file formats < 249. These files are incorrectly
read and written (as if the whole file was in the encoding of the main
language).
This is not true for files written by CJK-LyX, they are always in the locale
encoding.

This function
- converts from fake unicode values to true unicode if forward is true, and
- converts from true unicode values to fake unicode if forward is false.
document.encoding must be set to the old value (format 248) in both cases.

We do this here and not in LyX.py because it is far easier to do the
necessary parsing in modern formats than in ancient ones.
"""
    # CJK-LyX files are in one single (locale) encoding: nothing to fix.
    if document.cjk_encoding != '':
        return
    # One entry per nesting level of layouts; the encoding in effect for the
    # current line is always at the top of the stack.
    encoding_stack = [document.encoding]
    lang_re = re.compile(r"^\\lang\s(\S+)")
    if document.inputencoding == "auto" or document.inputencoding == "default":
        for i in range(len(document.body)):
            result = lang_re.match(document.body[i])
            if result:
                language = result.group(1)
                if language == "default":
                    document.warning("Resetting encoding from %s to %s." % (encoding_stack[-1], document.encoding))
                    encoding_stack[-1] = document.encoding
                else:
                    # lang[language][3] is the default encoding of that language.
                    from lyx2lyx_lang import lang
                    document.warning("Setting encoding from %s to %s." % (encoding_stack[-1], lang[language][3]))
                    encoding_stack[-1] = lang[language][3]
            elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
                document.warning("Adding nested encoding %s." % encoding_stack[-1])
                encoding_stack.append(encoding_stack[-1])
            elif find_token(document.body, "\\end_layout", i, i + 1) == i:
                document.warning("Removing nested encoding %s." % encoding_stack[-1])
                del encoding_stack[-1]
            # Only re-code lines whose effective encoding differs from the
            # main document encoding; the rest is already correct.
            if encoding_stack[-1] != document.encoding:
                if forward:
                    # This line has been incorrectly interpreted as if it was
                    # encoded in 'encoding'.
                    # Convert back to the 8bit string that was in the file.
                    orig = document.body[i].encode(document.encoding)
                    # Convert the 8bit string that was in the file to unicode
                    # with the correct encoding.
                    document.body[i] = orig.decode(encoding_stack[-1])
                else:
                    # Convert unicode to the 8bit string that will be written
                    # to the file with the correct encoding.
                    orig = document.body[i].encode(encoding_stack[-1])
                    # Convert the 8bit string that will be written to the
                    # file to fake unicode with the encoding that will later
                    # be used when writing to the file.
                    document.body[i] = orig.decode(document.encoding)
+
+
def convert_utf8(document):
    " Set document encoding to UTF-8. "
    # Translate the per-language "fake unicode" body contents to real unicode
    # before declaring the whole document utf8.
    convert_multiencoding(document, True)
    document.encoding = "utf8"
 
 
def revert_utf8(document):
    " Set document encoding to the value corresponding to inputencoding. "
    i = find_token(document.header, "\\inputencoding", 0)
    if i == -1:
        # No \inputencoding entry at all: fall back to "auto".
        document.header.append("\\inputencoding auto")
    elif get_value(document.header, "\\inputencoding", i) == "utf8":
        # utf8 does not exist in the old format; "auto" is the closest match.
        document.header[i] = "\\inputencoding auto"
    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
    # 248 is the target (old) file format of this reversion step.
    document.encoding = get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)
    # Re-encode the body per language ("fake unicode"), as old readers expect.
    convert_multiencoding(document, False)
 
 
 def revert_cs_label(document):
@@ -350,9 +419,6 @@ def convert_commandparams(document):
     # \begin_inset LatexCommand bibitem was not the official version (see
     # convert_bibitem()), but could be read in, so we convert it here, too.
 
-    # FIXME: Handle things like \command[foo[bar]]{foo{bar}}
-    # we need a real parser here.
-    regex = re.compile(r'\\([^\[\{]+)(\[[^\[\{]*\])?(\[[^\[\{]*\])?(\{[^}]*\})?')
     i = 0
     while 1:
         i = find_token(document.body, "\\begin_inset LatexCommand", i)
@@ -372,8 +438,8 @@ def convert_commandparams(document):
         state = "WS"
         # Used to handle things like \command[foo[bar]]{foo{bar}}
         nestdepth = 0
-        for j in range(len(command)):
-            c = command[j]
+        b = 0
+        for c in command:
             if ((state == "CMDNAME" and c == ' ') or
                 (state == "CMDNAME" and c == '[') or
                 (state == "CMDNAME" and c == '{')):
@@ -384,11 +450,11 @@ def convert_commandparams(document):
                 if nestdepth == 0:
                     state = "WS"
                 else:
-                    --nestdepth
+                    nestdepth = nestdepth - 1
             if ((state == "OPTION" and c == '[') or
                 (state == "SECOPTION" and c == '[') or
                 (state == "CONTENT" and c == '{')):
-                ++nestdepth
+                nestdepth = nestdepth + 1
             if state == "CMDNAME":
                     name += c
             elif state == "OPTION":
@@ -398,10 +464,6 @@ def convert_commandparams(document):
             elif state == "CONTENT":
                     argument += c
             elif state == "WS":
-                if j > 0:
-                    b = command[j-1]
-                else:
-                    b = 0
                 if c == '\\':
                     state = "CMDNAME"
                 elif c == '[' and b != ']':
@@ -413,6 +475,7 @@ def convert_commandparams(document):
                 elif c == '{':
                     state = "CONTENT"
                     nestdepth = 0 # Just to be sure
+            b = c
 
         # Now we have parsed the command, output the parameters
         lines = ["\\begin_inset LatexCommand %s" % name]
@@ -491,6 +554,646 @@ def revert_commandparams(document):
         i = j + 1
 
 
def revert_nomenclature(document):
    " Convert nomenclature entry to ERT. "
    param_re = re.compile(r'(\S+)\s+(.+)')
    i = 0
    use_nomencl = 0
    while 1:
        i = find_token(document.body, "\\begin_inset LatexCommand nomenclature", i)
        if i == -1:
            break
        use_nomencl = 1
        j = find_end_of_inset(document.body, i + 1)
        # Collect the inset parameters.
        preview_line = ""
        params = {"symbol": "", "description": "", "prefix": ""}
        for k in range(i + 1, j):
            match = param_re.match(document.body[k])
            if match:
                name = match.group(1)
                value = match.group(2)
                if name == "preview":
                    preview_line = document.body[k]
                elif name in params:
                    params[name] = value.strip('"').replace('\\"', '"')
            elif document.body[k].strip() != "":
                document.warning("Ignoring unknown contents `%s' in nomenclature inset." % document.body[k])
        # Build the LaTeX command; the optional prefix argument only appears
        # when it is non-empty.
        if params["prefix"]:
            command = 'nomenclature[%s]{%s}{%s}' % (params["prefix"], params["symbol"], params["description"])
        else:
            command = 'nomenclature{%s}{%s}' % (params["symbol"], params["description"])
        ert = ['\\begin_inset ERT',
               'status collapsed',
               '',
               '\\begin_layout %s' % document.default_layout,
               '',
               '',
               '\\backslash',
               command,
               '\\end_layout',
               '',
               '\\end_inset']
        document.body[i:j+1] = ert
        # Skip the freshly inserted ERT inset.
        i = i + 11
    if use_nomencl and find_token(document.preamble, '\\usepackage{nomencl}[2005/09/22]', 0) == -1:
        document.preamble.append('\\usepackage{nomencl}[2005/09/22]')
        document.preamble.append('\\makenomenclature')
+
+
def revert_printnomenclature(document):
    " Convert printnomenclature to ERT. "
    param_re = re.compile(r'(\S+)\s+(.+)')
    i = 0
    use_nomencl = 0
    while 1:
        i = find_token(document.body, "\\begin_inset LatexCommand printnomenclature", i)
        if i == -1:
            break
        use_nomencl = 1
        j = find_end_of_inset(document.body, i + 1)
        # Collect the inset parameters.
        preview_line = ""
        labelwidth = ""
        for k in range(i + 1, j):
            match = param_re.match(document.body[k])
            if not match:
                if document.body[k].strip() != "":
                    document.warning("Ignoring unknown contents `%s' in printnomenclature inset." % document.body[k])
                continue
            name = match.group(1)
            value = match.group(2)
            if name == "preview":
                preview_line = document.body[k]
            elif name == "labelwidth":
                labelwidth = value.strip('"').replace('\\"', '"')
        # The optional labelwidth argument only appears when non-empty.
        if labelwidth == "":
            command = 'nomenclature{}'
        else:
            command = 'nomenclature[%s]' % labelwidth
        ert = ['\\begin_inset ERT',
               'status collapsed',
               '',
               '\\begin_layout %s' % document.default_layout,
               '',
               '',
               '\\backslash',
               command,
               '\\end_layout',
               '',
               '\\end_inset']
        document.body[i:j+1] = ert
        # Skip the freshly inserted ERT inset.
        i = i + 11
    if use_nomencl and find_token(document.preamble, '\\usepackage{nomencl}[2005/09/22]', 0) == -1:
        document.preamble.append('\\usepackage{nomencl}[2005/09/22]')
        document.preamble.append('\\makenomenclature')
+
+
def convert_esint(document):
    " Add \\use_esint setting to header. "
    # Insert just before \cite_engine to keep the header ordering stable.
    i = find_token(document.header, "\\cite_engine", 0)
    if i < 0:
        document.warning("Malformed LyX document: Missing `\\cite_engine'.")
        return
    # 0 is off, 1 is auto, 2 is on.
    document.header.insert(i, '\\use_esint 0')
+
+
def revert_esint(document):
    " Remove \\use_esint setting from header. "
    i = find_token(document.header, "\\use_esint", 0)
    if i == -1:
        document.warning("Malformed LyX document: Missing `\\use_esint'.")
        return
    # The header value is a *string*; the old code compared it against the
    # int 2, which was never true, so \usepackage{esint} was never emitted.
    use_esint = document.header[i].split()[1]
    del document.header[i]
    # 0 is off, 1 is auto, 2 is on.
    if use_esint == '2':
        document.preamble.append('\\usepackage{esint}')
+
+
def revert_clearpage(document):
    " clearpage -> ERT "
    i = 0
    while 1:
        i = find_token(document.body, "\\clearpage", i)
        if i == -1:
            break
        document.body[i:i+1] = ['\\begin_inset ERT',
                                'status collapsed',
                                '',
                                '\\begin_layout %s' % document.default_layout,
                                '',
                                '',
                                '\\backslash',
                                'clearpage',
                                '\\end_layout',
                                '',
                                '\\end_inset']
        # Continue the search after the 11 inserted ERT lines.  (The old code
        # had this increment mis-indented after the loop, where it was
        # useless; the loop only terminated because none of the inserted
        # lines starts with "\clearpage".)
        i = i + 11
+
+
def revert_cleardoublepage(document):
    " cleardoublepage -> ERT "
    i = 0
    while 1:
        i = find_token(document.body, "\\cleardoublepage", i)
        if i == -1:
            break
        document.body[i:i+1] = ['\\begin_inset ERT',
                                'status collapsed',
                                '',
                                '\\begin_layout %s' % document.default_layout,
                                '',
                                '',
                                '\\backslash',
                                'cleardoublepage',
                                '\\end_layout',
                                '',
                                '\\end_inset']
        # Continue the search after the 11 inserted ERT lines.  (The old code
        # had this increment mis-indented after the loop, where it was
        # useless; the loop only terminated because none of the inserted
        # lines starts with "\cleardoublepage".)
        i = i + 11
+
+
def convert_lyxline(document):
    " remove fontsize commands for \lyxline "
    # The problematic is: The old \lyxline definition doesn't handle the fontsize
    # to change the line thickness. The new definiton does this so that imported
    # \lyxlines would have a different line thickness. The eventual fontsize command
    # before \lyxline is therefore removed to get the same output.
    fontsizes = ["tiny", "scriptsize", "footnotesize", "small", "normalsize",
                 "large", "Large", "LARGE", "huge", "Huge"]
    for n in range(0, len(fontsizes)):
        i = 0
        k = 0
        while i < len(document.body):
            i = find_token(document.body, "\\size " + fontsizes[n], i)
            k = find_token(document.body, "\\lyxline",i)
            # the corresponding fontsize command is always 2 lines before the \lyxline
            if (i != -1 and k == i+2):
                document.body[i:i+1] = []
            else:
                # NOTE(review): this leaves the loop as soon as ONE occurrence
                # of this fontsize is not followed by \lyxline, so later
                # occurrences of the same size are never examined -- confirm
                # that this early exit is intended.
                break
        # NOTE(review): this increment has no effect: i is reset to 0 at the
        # start of the next fontsize iteration.
        i = i + 1
+
+
def revert_encodings(document):
    " Set new encodings to auto. "
    # Input encodings that do not exist in the older file format.
    new_encodings = ["8859-6", "8859-8", "cp437", "cp437de", "cp850", "cp852",
                     "cp855", "cp858", "cp862", "cp865", "cp866", "cp1250",
                     "cp1252", "cp1256", "cp1257", "latin10", "pt254", "tis620-0"]
    i = find_token(document.header, "\\inputencoding", 0)
    if i == -1:
        document.header.append("\\inputencoding auto")
    elif get_value(document.header, "\\inputencoding", i) in new_encodings:
        document.header[i] = "\\inputencoding auto"
    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
+
+
def convert_caption(document):
    " Convert caption layouts to caption insets. "
    i = 0
    while 1:
        i = find_token(document.body, "\\begin_layout Caption", i)
        if i == -1:
            return
        j = find_end_of_layout(document.body, i)
        if j == -1:
            document.warning("Malformed LyX document: Missing `\\end_layout'.")
            return

        # Close the inner layout and the caption inset at the old layout end,
        # then turn the old layout start into a default-layout paragraph that
        # opens a caption inset.  The insertion at j is done first so that i
        # stays valid.
        tail = ["\\end_layout", "", "\\end_inset", "", ""]
        head = ["\\begin_layout %s" % document.default_layout,
                "\\begin_inset Caption", "",
                "\\begin_layout %s" % document.default_layout]
        document.body[j:j] = tail
        document.body[i:i+1] = head
        i = i + 1
+
+
def revert_caption(document):
    " Convert caption insets to caption layouts. "
    " This assumes that the text class has a caption style. "
    i = 0
    while 1:
        i = find_token(document.body, "\\begin_inset Caption", i)
        if i == -1:
            return

        # We either need to delete the previous \begin_layout line, or we
        # need to end the previous layout if this inset is not in the first
        # position of the paragraph.
        layout_before = find_token_backwards(document.body, "\\begin_layout", i)
        if layout_before == -1:
            document.warning("Malformed LyX document: Missing `\\begin_layout'.")
            return
        layout_line = document.body[layout_before]
        # The inset starts the paragraph iff only empty lines separate it
        # from the enclosing \begin_layout.
        del_layout_before = True
        l = layout_before + 1
        while l < i:
            if document.body[l] != "":
                del_layout_before = False
                break
            l = l + 1
        if del_layout_before:
            del document.body[layout_before:i]
            i = layout_before
        else:
            document.body[i:i] = ["\\end_layout", ""]
            i = i + 2

        # Find start of layout in the inset and end of inset
        j = find_token(document.body, "\\begin_layout", i)
        if j == -1:
            document.warning("Malformed LyX document: Missing `\\begin_layout'.")
            return
        k = find_end_of_inset(document.body, i)
        if k == -1:
            document.warning("Malformed LyX document: Missing `\\end_inset'.")
            return

        # We either need to delete the following \end_layout line, or we need
        # to restart the old layout if this inset is not at the paragraph end.
        layout_after = find_token(document.body, "\\end_layout", k)
        if layout_after == -1:
            document.warning("Malformed LyX document: Missing `\\end_layout'.")
            return
        # The inset ends the paragraph iff only empty lines separate it from
        # the following \end_layout.
        del_layout_after = True
        l = k + 1
        while l < layout_after:
            if document.body[l] != "":
                del_layout_after = False
                break
            l = l + 1
        if del_layout_after:
            del document.body[k+1:layout_after+1]
        else:
            # Re-open the interrupted outer layout after the inset.
            document.body[k+1:k+1] = [layout_line, ""]

        # delete \begin_layout and \end_inset and replace \begin_inset with
        # "\begin_layout Caption". This works because we can only have one
        # paragraph in the caption inset: The old \end_layout will be recycled.
        del document.body[k]
        if document.body[k] == "":
            del document.body[k]
        del document.body[j]
        if document.body[j] == "":
            del document.body[j]
        document.body[i] = "\\begin_layout Caption"
        if document.body[i+1] == "":
            del document.body[i+1]
        i = i + 1
+
+
+# Accents of InsetLaTeXAccent
+accent_map = {
+    "`" : u'\u0300', # grave
+    "'" : u'\u0301', # acute
+    "^" : u'\u0302', # circumflex
+    "~" : u'\u0303', # tilde
+    "=" : u'\u0304', # macron
+    "u" : u'\u0306', # breve
+    "." : u'\u0307', # dot above
+    "\"": u'\u0308', # diaresis
+    "r" : u'\u030a', # ring above
+    "H" : u'\u030b', # double acute
+    "v" : u'\u030c', # caron
+    "b" : u'\u0320', # minus sign below
+    "d" : u'\u0323', # dot below
+    "c" : u'\u0327', # cedilla
+    "k" : u'\u0328', # ogonek
+    "t" : u'\u0361'  # tie. This is special: It spans two characters, but
+                     # only one is given as argument, so we don't need to
+                     # treat it differently.
+}
+
+
+# special accents of InsetLaTeXAccent without argument
+special_accent_map = {
+    'i' : u'\u0131', # dotless i
+    'j' : u'\u0237', # dotless j
+    'l' : u'\u0142', # l with stroke
+    'L' : u'\u0141'  # L with stroke
+}
+
+
+# special accent arguments of InsetLaTeXAccent
+accented_map = {
+    '\\i' : u'\u0131', # dotless i
+    '\\j' : u'\u0237'  # dotless j
+}
+
+
+def _convert_accent(accent, accented_char):
+    type = accent
+    char = accented_char
+    if char == '':
+        if type in special_accent_map:
+            return special_accent_map[type]
+        # a missing char is treated as space by LyX
+        char = ' '
+    elif type == 'q' and char in ['t', 'd', 'l', 'L']:
+        # Special caron, only used with t, d, l and L.
+        # It is not in the map because we convert it to the same unicode
+        # character as the normal caron: \q{} is only defined if babel with
+        # the czech or slovak language is used, and the normal caron
+        # produces the correct output if the T1 font encoding is used.
+        # For the same reason we never convert to \q{} in the other direction.
+        type = 'v'
+    elif char in accented_map:
+        char = accented_map[char]
+    elif (len(char) > 1):
+        # We can only convert accents on a single char
+        return ''
+    a = accent_map.get(type)
+    if a:
+        return unicodedata.normalize("NFKC", "%s%s" % (char, a))
+    return ''
+
+
def convert_ertbackslash(body, i, ert, default_layout):
    r"""Translate backslashes and '\n' in `ert' into valid ERT code.

    The converted text is appended to body[i], inserting new lines into
    `body' as needed; the resulting (possibly advanced) line index is
    returned."""

    for ch in ert:
        if ch == '\\':
            # A backslash becomes the \backslash token and starts a new line.
            body[i] += '\\backslash '
            i += 1
            body.insert(i, '')
        elif ch == '\n':
            # A newline closes the current layout and opens a fresh one.
            body[i+1:i+1] = ['\\end_layout', '', '\\begin_layout %s' % default_layout, '']
            i += 4
        else:
            body[i] += ch
    return i
+
+
def convert_accent(document):
    """Convert InsetLaTeXAccent insets to unicode characters where possible,
    and to ERT otherwise."""
    # The following forms are supported by LyX:
    # '\i \"{a}' (standard form, as written by LyX)
    # '\i \"{}' (standard form, as written by LyX if the accented char is a space)
    # '\i \"{ }' (also accepted if the accented char is a space)
    # '\i \" a'  (also accepted)
    # '\i \"'    (also accepted)
    re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$')
    re_contents = re.compile(r'^([^\s{]+)(.*)$')
    re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$')
    i = 0
    while 1:
        i = find_re(document.body, re_wholeinset, i)
        if i == -1:
            return
        match = re_wholeinset.match(document.body[i])
        prefix = match.group(1)
        contents = match.group(3).strip()
        match = re_contents.match(contents)
        if match:
            # Strip first char (always \)
            accent = match.group(1)[1:]
            accented_contents = match.group(2).strip()
            match = re_accentedcontents.match(accented_contents)
            accented_char = match.group(1)
            converted = _convert_accent(accent, accented_char)
            if converted == '':
                # Normalize contents for the ERT fallback below.  (The old
                # code had a stray trailing comma here, which silently turned
                # `contents' into a 1-tuple.)
                contents = '%s{%s}' % (accent, accented_char)
            else:
                document.body[i] = '%s%s' % (prefix, converted)
                i += 1
                continue
        document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents)
        document.body[i] = prefix
        document.body[i+1:i+1] = ['\\begin_inset ERT',
                                  'status collapsed',
                                  '',
                                  '\\begin_layout %s' % document.default_layout,
                                  '',
                                  '',
                                  '']
        i = convert_ertbackslash(document.body, i + 7,
                                 '\\%s' % contents,
                                 document.default_layout)
        document.body[i+1:i+1] = ['\\end_layout',
                                  '',
                                  '\\end_inset']
        i += 3
+
+
def revert_accent(document):
    """Replace unicode accented characters with InsetLaTeXAccent insets.

    Only characters that cannot be represented in the effective encoding are
    converted; the rest of the body is merely unicode-normalized."""
    # Build the inverse maps of convert_accent().
    inverse_accent_map = {}
    for k in accent_map:
        inverse_accent_map[accent_map[k]] = k
    inverse_special_accent_map = {}
    for k in special_accent_map:
        inverse_special_accent_map[special_accent_map[k]] = k
    inverse_accented_map = {}
    for k in accented_map:
        inverse_accented_map[accented_map[k]] = k

    # Since LyX may insert a line break within a word we must combine all
    # words before unicode normalization.
    # We do this only if the next line starts with an accent, otherwise we
    # would create things like '\begin_inset ERTstatus'.
    numberoflines = len(document.body)
    for i in range(numberoflines-1):
        if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
            continue
        if (document.body[i+1][0] in inverse_accent_map):
            # the last character of this line and the first of the next line
            # form probably a surrogate pair.
            while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '):
                document.body[i] += document.body[i+1][0]
                document.body[i+1] = document.body[i+1][1:]

    # Normalize to "Normal form D" (NFD, also known as canonical decomposition).
    # This is needed to catch all accented characters.
    for i in range(numberoflines):
        # Unfortunately we have a mixture of unicode strings and plain strings,
        # because we never use u'xxx' for string literals, but 'xxx'.
        # Therefore we may have to try two times to normalize the data.
        try:
            document.body[i] = unicodedata.normalize("NFKD", document.body[i])
        except TypeError:
            document.body[i] = unicodedata.normalize("NFKD", unicode(document.body[i], 'utf-8'))

    # Replace accented characters with InsetLaTeXAccent
    # Do not convert characters that can be represented in the chosen
    # encoding.
    encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
    lang_re = re.compile(r"^\\lang\s(\S+)")
    # NOTE(review): this range is evaluated once, so lines inserted below
    # shift the tail of the body partly out of the loop, and the inserted
    # lines miss the final NFKC pass -- confirm this is acceptable.
    for i in range(len(document.body)):

        # NOTE(review): convert_multiencoding() skips CJK files
        # (cjk_encoding != ''); tracking per-language encodings *only* for
        # CJK files here looks inverted -- confirm the `!= ''' test.
        if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '':
            # Track the encoding of the current line
            result = lang_re.match(document.body[i])
            if result:
                language = result.group(1)
                if language == "default":
                    encoding_stack[-1] = document.encoding
                else:
                    from lyx2lyx_lang import lang
                    encoding_stack[-1] = lang[language][3]
                continue
            elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
                encoding_stack.append(encoding_stack[-1])
                continue
            elif find_token(document.body, "\\end_layout", i, i + 1) == i:
                del encoding_stack[-1]
                continue

        for j in range(len(document.body[i])):
            # dotless i and dotless j are both in special_accent_map and can
            # occur as an accented character, so we need to test that the
            # following character is no accent
            if (document.body[i][j] in inverse_special_accent_map and
                (j == len(document.body[i]) - 1 or document.body[i][j+1] not in inverse_accent_map)):
                accent = document.body[i][j]
                try:
                    dummy = accent.encode(encoding_stack[-1])
                except UnicodeEncodeError:
                    # Insert the rest of the line as one new line.  (The old
                    # code assigned the bare string to the list slice, which
                    # spliced it in character by character.)
                    if j < len(document.body[i]) - 1:
                        document.body[i+1:i+1] = [document.body[i][j+1:]]
                    # Delete the accented character.  (The old code used
                    # [:j-1], dropping the preceding character as well.)
                    document.body[i] = document.body[i][:j]
                    # Finally add the InsetLaTeXAccent
                    document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
                    break
            elif j > 0 and document.body[i][j] in inverse_accent_map:
                accented_char = document.body[i][j-1]
                if accented_char == ' ':
                    # Conform to LyX output
                    accented_char = ''
                elif accented_char in inverse_accented_map:
                    # NOTE(review): after this mapping the encodability test
                    # below runs on the ASCII '\i'/'\j' replacement, which
                    # always encodes -- confirm dotless i/j + accent is meant
                    # to stay unconverted.
                    accented_char = inverse_accented_map[accented_char]
                accent = document.body[i][j]
                try:
                    dummy = unicodedata.normalize("NFKC", accented_char + accent).encode(encoding_stack[-1])
                except UnicodeEncodeError:
                    # Insert the rest of the line as one new line (see above).
                    if j < len(document.body[i]) - 1:
                        document.body[i+1:i+1] = [document.body[i][j+1:]]
                    # Delete the accented character and the accent.  (The old
                    # code used [:j-2], dropping one character too many.)
                    document.body[i] = document.body[i][:j-1]
                    # Finally add the InsetLaTeXAccent
                    document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
                    break
    # Normalize to "Normal form C" (NFC, pre-composed characters) again
    for i in range(numberoflines):
        document.body[i] = unicodedata.normalize("NFKC", document.body[i])
+
+
def normalize_font_whitespace(document):
    """ Before format 259 the font changes were ignored if a
    whitespace was the first or last character in the sequence, this function
    transfers the whitespace outside."""

    # Only the latex backend is affected.
    if document.backend != "latex":
        return

    lines = document.body

    # Font property token -> its neutral ("no change") value.
    char_properties = {"\\series": "default",
                       "\\emph": "default",
                       "\\color": "none",
                       "\\shape": "default",
                       "\\bar": "default",
                       "\\family": "default"}
    # Currently open (non-default) font changes: property -> value.
    changes = {}

    i = 0
    while i < len(lines):
        words = lines[i].split()

        if len(words) > 0 and words[0] == "\\begin_layout":
            # a new paragraph resets all font changes
            changes.clear()

        elif len(words) > 1 and words[0] in char_properties.keys():
            # we have a font change
            if char_properties[words[0]] == words[1]:
                # property gets reset
                if words[0] in changes.keys():
                    del changes[words[0]]
                defaultproperty = True
            else:
                # property gets set
                changes[words[0]] = words[1]
                defaultproperty = False

            # We need to explicitly reset all changed properties if we find
            # a space below, because LyX 1.4 would output the space after
            # closing the previous change and before starting the new one,
            # and closing a font change means to close all properties, not
            # just the changed one.

            # NOTE(review): when i == 0 this reads lines[-1], i.e. the last
            # body line -- confirm a font change can never be the first line.
            if lines[i-1] and lines[i-1][-1] == " ":
                lines[i-1] = lines[i-1][:-1]
                # a space before the font change
                added_lines = [" "]
                for k in changes.keys():
                    # exclude property k because that is already in lines[i]
                    if k != words[0]:
                        added_lines[1:1] = ["%s %s" % (k, changes[k])]
                for k in changes.keys():
                    # exclude property k because that must be added below anyway
                    if k != words[0]:
                        added_lines[0:0] = ["%s %s" % (k, char_properties[k])]
                if defaultproperty:
                    # Property is reset in lines[i], so add the new stuff afterwards
                    lines[i+1:i+1] = added_lines
                else:
                    # Reset property for the space
                    added_lines[0:0] = ["%s %s" % (words[0], char_properties[words[0]])]
                    lines[i:i] = added_lines
                i = i + len(added_lines)

            # NOTE(review): lines[i+1] raises IndexError if the font change
            # is the very last body line -- confirm the body always ends with
            # \end_layout.
            elif lines[i+1] and lines[i+1][0] == " " and (len(changes) > 0 or not defaultproperty):
                # a space after the font change
                if (lines[i+1] == " " and lines[i+2]):
                    next_words = lines[i+2].split()
                    if len(next_words) > 0 and next_words[0] == words[0]:
                        # a single blank with a property different from the
                        # previous and the next line must not be changed
                        i = i + 2
                        continue
                lines[i+1] = lines[i+1][1:]
                added_lines = [" "]
                for k in changes.keys():
                    # exclude property k because that is already in lines[i]
                    if k != words[0]:
                        added_lines[1:1] = ["%s %s" % (k, changes[k])]
                for k in changes.keys():
                    # exclude property k because that must be added below anyway
                    if k != words[0]:
                        added_lines[0:0] = ["%s %s" % (k, char_properties[k])]
                # Reset property for the space
                added_lines[0:0] = ["%s %s" % (words[0], char_properties[words[0]])]
                lines[i:i] = added_lines
                i = i + len(added_lines)

        i = i + 1
+
+
def revert_utf8x(document):
    " Set utf8x encoding to utf8. "
    # utf8x is unknown to the old format; plain utf8 is the closest match.
    i = find_token(document.header, "\\inputencoding", 0)
    if i == -1:
        document.header.append("\\inputencoding auto")
    elif get_value(document.header, "\\inputencoding", i) == "utf8x":
        document.header[i] = "\\inputencoding utf8"
    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
+
+
 ##
 # Conversion hub
 #
@@ -502,12 +1205,28 @@ convert = [[246, []],
            [249, [convert_utf8]],
            [250, []],
            [251, []],
-           [252, [convert_commandparams, convert_bibitem]]]
+           [252, [convert_commandparams, convert_bibitem]],
+           [253, []],
+           [254, [convert_esint]],
+           [255, []],
+           [256, []],
+           [257, [convert_caption]],
+           [258, [convert_lyxline]],
+           [259, [convert_accent, normalize_font_whitespace]],
+           [260, []]]
 
# Reversion chain: each entry is [resulting file format, [reversion
# functions]], applied from the newest format down to the requested target.
revert =  [[259, [revert_utf8x]],
           [258, []],
           [257, []],
           [256, [revert_caption]],
           [255, [revert_encodings]],
           [254, [revert_clearpage, revert_cleardoublepage]],
           [253, [revert_esint]],
           [252, [revert_nomenclature, revert_printnomenclature]],
           [251, [revert_commandparams]],
           [250, [revert_cs_label]],
           [249, []],
           [248, [revert_accent, revert_utf8]],
           [247, [revert_booktabs]],
           [246, [revert_font_settings]],
           [245, [revert_framed]]]
@@ -516,3 +1235,4 @@ revert =  [[251, [revert_commandparams]],
 if __name__ == "__main__":
     pass
 
+