Allow utf8x \inputencoding

[lyx.git] / lib / lyx2lyx / lyx_1_5.py
diff --git a/lib/lyx2lyx/lyx_1_5.py b/lib/lyx2lyx/lyx_1_5.py

index dcd600d00dd94eee30572fffd26a26a949d6ead8..a57282b2561d669f6c60844636e9a0f039e58f33 100644 (file)
--- a/lib/lyx2lyx/lyx_1_5.py
+++ b/lib/lyx2lyx/lyx_1_5.py
@@ -20,7 +20,9 @@
  """ Convert files to the file format generated by lyx 1.5"""
  
  import re
-from parser_tools import find_token, find_token_exact, find_tokens, find_end_of, get_value
+import unicodedata
+
+from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value
  from LyX import get_encoding
  
  
@@ -28,9 +30,13 @@ from LyX import get_encoding
  # Private helper functions
  
  def find_end_of_inset(lines, i):
-    " Find beginning of inset, where lines[i] is included."
+    " Find end of inset, where lines[i] is included."
      return find_end_of(lines, i, "\\begin_inset", "\\end_inset")
  
+def find_end_of_layout(lines, i):
+    " Find end of layout, where lines[i] is included."
+    return find_end_of(lines, i, "\\begin_layout", "\\end_layout")
+
  # End of helper functions
  ####################################################################
  
@@ -219,10 +225,13 @@ def revert_booktabs(document):
  
  def convert_multiencoding(document, forward):
      """ Fix files with multiple encodings.
-Files with an inputencoding of "auto" and multiple languages where at least
-two languages have different default encodings are encoded in multiple
-encodings for file formats < 249. These files are incorrectly read and
-written (as if the whole file was in the encoding of the main language).
+Files with an inputencoding of "auto" or "default" and multiple languages
+where at least two languages have different default encodings are encoded
+in multiple encodings for file formats < 249. These files are incorrectly
+read and written (as if the whole file was in the encoding of the main
+language).
+This is not true for files written by CJK-LyX, they are always in the locale
+encoding.
  
  This function
  - converts from fake unicode values to true unicode if forward is true, and
@@ -232,9 +241,11 @@ document.encoding must be set to the old value (format 248) in both cases.
  We do this here and not in LyX.py because it is far easier to do the
  necessary parsing in modern formats than in ancient ones.
  """
+    if document.cjk_encoding != '':
+        return
      encoding_stack = [document.encoding]
      lang_re = re.compile(r"^\\lang\s(\S+)")
-    if document.inputencoding == "auto":
+    if document.inputencoding == "auto" or document.inputencoding == "default":
          for i in range(len(document.body)):
              result = lang_re.match(document.body[i])
              if result:
@@ -285,7 +296,7 @@ def revert_utf8(document):
      elif get_value(document.header, "\\inputencoding", i) == "utf8":
          document.header[i] = "\\inputencoding auto"
      document.inputencoding = get_value(document.header, "\\inputencoding", 0)
-    document.encoding = get_encoding(document.language, document.inputencoding, 248)
+    document.encoding = get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)
      convert_multiencoding(document, False)
  
  
@@ -663,7 +674,7 @@ def revert_esint(document):
  
  
  def revert_clearpage(document):
-    " clearpage -> ERT"
+    " clearpage -> ERT "
      i = 0
      while 1:
          i = find_token(document.body, "\\clearpage", i)
@@ -684,7 +695,7 @@ def revert_clearpage(document):
  
  
  def revert_cleardoublepage(document):
-    " cleardoublepage -> ERT"
+    " cleardoublepage -> ERT "
      i = 0
      while 1:
          i = find_token(document.body, "\\cleardoublepage", i)
@@ -704,6 +715,28 @@ def revert_cleardoublepage(document):
      i = i + 1
  
  
+def convert_lyxline(document):
+    " remove fontsize commands for \lyxline "
+    # The problematic is: The old \lyxline definition doesn't handle the fontsize
+    # to change the line thickness. The new definiton does this so that imported
+    # \lyxlines would have a different line thickness. The eventual fontsize command
+    # before \lyxline is therefore removed to get the same output.
+    fontsizes = ["tiny", "scriptsize", "footnotesize", "small", "normalsize",
+                 "large", "Large", "LARGE", "huge", "Huge"]
+    for n in range(0, len(fontsizes)):
+        i = 0
+        k = 0
+        while i < len(document.body):
+            i = find_token(document.body, "\\size " + fontsizes[n], i)
+            k = find_token(document.body, "\\lyxline",i)
+            # the corresponding fontsize command is always 2 lines before the \lyxline
+            if (i != -1 and k == i+2):
+                document.body[i:i+1] = []
+            else:
+                break
+        i = i + 1
+
+
  def revert_encodings(document):
      " Set new encodings to auto. "
      encodings = ["8859-6", "8859-8", "cp437", "cp437de", "cp850", "cp852",
@@ -719,6 +752,448 @@ def revert_encodings(document):
      document.inputencoding = get_value(document.header, "\\inputencoding", 0)
  
  
+def convert_caption(document):
+    " Convert caption layouts to caption insets. "
+    i = 0
+    while 1:
+        i = find_token(document.body, "\\begin_layout Caption", i)
+        if i == -1:
+            return
+        j = find_end_of_layout(document.body, i)
+        if j == -1:
+            document.warning("Malformed LyX document: Missing `\\end_layout'.")
+            return
+
+        document.body[j:j] = ["\\end_layout", "", "\\end_inset", "", ""]
+        document.body[i:i+1] = ["\\begin_layout %s" % document.default_layout,
+                            "\\begin_inset Caption", "",
+                            "\\begin_layout %s" % document.default_layout]
+        i = i + 1
+
+
+def revert_caption(document):
+    " Convert caption insets to caption layouts. "
+    " This assumes that the text class has a caption style. "
+    i = 0
+    while 1:
+        i = find_token(document.body, "\\begin_inset Caption", i)
+        if i == -1:
+            return
+
+        # We either need to delete the previous \begin_layout line, or we
+        # need to end the previous layout if this inset is not in the first
+        # position of the paragraph.
+        layout_before = find_token_backwards(document.body, "\\begin_layout", i)
+        if layout_before == -1:
+            document.warning("Malformed LyX document: Missing `\\begin_layout'.")
+            return
+        layout_line = document.body[layout_before]
+        del_layout_before = True
+        l = layout_before + 1
+        while l < i:
+            if document.body[l] != "":
+                del_layout_before = False
+                break
+            l = l + 1
+        if del_layout_before:
+            del document.body[layout_before:i]
+            i = layout_before
+        else:
+            document.body[i:i] = ["\\end_layout", ""]
+            i = i + 2
+
+        # Find start of layout in the inset and end of inset
+        j = find_token(document.body, "\\begin_layout", i)
+        if j == -1:
+            document.warning("Malformed LyX document: Missing `\\begin_layout'.")
+            return
+        k = find_end_of_inset(document.body, i)
+        if k == -1:
+            document.warning("Malformed LyX document: Missing `\\end_inset'.")
+            return
+
+        # We either need to delete the following \end_layout line, or we need
+        # to restart the old layout if this inset is not at the paragraph end.
+        layout_after = find_token(document.body, "\\end_layout", k)
+        if layout_after == -1:
+            document.warning("Malformed LyX document: Missing `\\end_layout'.")
+            return
+        del_layout_after = True
+        l = k + 1
+        while l < layout_after:
+            if document.body[l] != "":
+                del_layout_after = False
+                break
+            l = l + 1
+        if del_layout_after:
+            del document.body[k+1:layout_after+1]
+        else:
+            document.body[k+1:k+1] = [layout_line, ""]
+
+        # delete \begin_layout and \end_inset and replace \begin_inset with
+        # "\begin_layout Caption". This works because we can only have one
+        # paragraph in the caption inset: The old \end_layout will be recycled.
+        del document.body[k]
+        if document.body[k] == "":
+            del document.body[k]
+        del document.body[j]
+        if document.body[j] == "":
+            del document.body[j]
+        document.body[i] = "\\begin_layout Caption"
+        if document.body[i+1] == "":
+            del document.body[i+1]
+        i = i + 1
+
+
+# Accents of InsetLaTeXAccent
+accent_map = {
+    "`" : u'\u0300', # grave
+    "'" : u'\u0301', # acute
+    "^" : u'\u0302', # circumflex
+    "~" : u'\u0303', # tilde
+    "=" : u'\u0304', # macron
+    "u" : u'\u0306', # breve
+    "." : u'\u0307', # dot above
+    "\"": u'\u0308', # diaresis
+    "r" : u'\u030a', # ring above
+    "H" : u'\u030b', # double acute
+    "v" : u'\u030c', # caron
+    "b" : u'\u0320', # minus sign below
+    "d" : u'\u0323', # dot below
+    "c" : u'\u0327', # cedilla
+    "k" : u'\u0328', # ogonek
+    "t" : u'\u0361'  # tie. This is special: It spans two characters, but
+                     # only one is given as argument, so we don't need to
+                     # treat it differently.
+}
+
+
+# special accents of InsetLaTeXAccent without argument
+special_accent_map = {
+    'i' : u'\u0131', # dotless i
+    'j' : u'\u0237', # dotless j
+    'l' : u'\u0142', # l with stroke
+    'L' : u'\u0141'  # L with stroke
+}
+
+
+# special accent arguments of InsetLaTeXAccent
+accented_map = {
+    '\\i' : u'\u0131', # dotless i
+    '\\j' : u'\u0237'  # dotless j
+}
+
+
+def _convert_accent(accent, accented_char):
+    type = accent
+    char = accented_char
+    if char == '':
+        if type in special_accent_map:
+            return special_accent_map[type]
+        # a missing char is treated as space by LyX
+        char = ' '
+    elif type == 'q' and char in ['t', 'd', 'l', 'L']:
+        # Special caron, only used with t, d, l and L.
+        # It is not in the map because we convert it to the same unicode
+        # character as the normal caron: \q{} is only defined if babel with
+        # the czech or slovak language is used, and the normal caron
+        # produces the correct output if the T1 font encoding is used.
+        # For the same reason we never convert to \q{} in the other direction.
+        type = 'v'
+    elif char in accented_map:
+        char = accented_map[char]
+    elif (len(char) > 1):
+        # We can only convert accents on a single char
+        return ''
+    a = accent_map.get(type)
+    if a:
+        return unicodedata.normalize("NFKC", "%s%s" % (char, a))
+    return ''
+
+
+def convert_ertbackslash(body, i, ert, default_layout):
+    r""" -------------------------------------------------------------------------------------------
+    Convert backslashes and '\n' into valid ERT code, append the converted
+    text to body[i] and return the (maybe incremented) line index i"""
+
+    for c in ert:
+        if c == '\\':
+            body[i] = body[i] + '\\backslash '
+            i = i + 1
+            body.insert(i, '')
+        elif c == '\n':
+            body[i+1:i+1] = ['\\end_layout', '', '\\begin_layout %s' % default_layout, '']
+            i = i + 4
+        else:
+            body[i] = body[i] + c
+    return i
+
+
+def convert_accent(document):
+    # The following forms are supported by LyX:
+    # '\i \"{a}' (standard form, as written by LyX)
+    # '\i \"{}' (standard form, as written by LyX if the accented char is a space)
+    # '\i \"{ }' (also accepted if the accented char is a space)
+    # '\i \" a'  (also accepted)
+    # '\i \"'    (also accepted)
+    re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$')
+    re_contents = re.compile(r'^([^\s{]+)(.*)$')
+    re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$')
+    i = 0
+    while 1:
+        i = find_re(document.body, re_wholeinset, i)
+        if i == -1:
+            return
+        match = re_wholeinset.match(document.body[i])
+        prefix = match.group(1)
+        contents = match.group(3).strip()
+        match = re_contents.match(contents)
+        if match:
+            # Strip first char (always \)
+            accent = match.group(1)[1:]
+            accented_contents = match.group(2).strip()
+            match = re_accentedcontents.match(accented_contents)
+            accented_char = match.group(1)
+            converted = _convert_accent(accent, accented_char)
+            if converted == '':
+                # Normalize contents
+                contents = '%s{%s}' % (accent, accented_char),
+            else:
+                document.body[i] = '%s%s' % (prefix, converted)
+                i += 1
+                continue
+        document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents)
+        document.body[i] = prefix
+        document.body[i+1:i+1] = ['\\begin_inset ERT',
+                                  'status collapsed',
+                                  '',
+                                  '\\begin_layout %s' % document.default_layout,
+                                  '',
+                                  '',
+                                  '']
+        i = convert_ertbackslash(document.body, i + 7,
+                                 '\\%s' % contents,
+                                 document.default_layout)
+        document.body[i+1:i+1] = ['\\end_layout',
+                                  '',
+                                  '\\end_inset']
+        i += 3
+
+
+def revert_accent(document):
+    inverse_accent_map = {}
+    for k in accent_map:
+        inverse_accent_map[accent_map[k]] = k
+    inverse_special_accent_map = {}
+    for k in special_accent_map:
+        inverse_special_accent_map[special_accent_map[k]] = k
+    inverse_accented_map = {}
+    for k in accented_map:
+        inverse_accented_map[accented_map[k]] = k
+
+    # Since LyX may insert a line break within a word we must combine all
+    # words before unicode normalization.
+    # We do this only if the next line starts with an accent, otherwise we
+    # would create things like '\begin_inset ERTstatus'.
+    numberoflines = len(document.body)
+    for i in range(numberoflines-1):
+        if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
+            continue
+        if (document.body[i+1][0] in inverse_accent_map):
+            # the last character of this line and the first of the next line
+            # form probably a surrogate pair.
+            while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '):
+                document.body[i] += document.body[i+1][0]
+                document.body[i+1] = document.body[i+1][1:]
+
+    # Normalize to "Normal form D" (NFD, also known as canonical decomposition).
+    # This is needed to catch all accented characters.
+    for i in range(numberoflines):
+        # Unfortunately we have a mixture of unicode strings and plain strings,
+        # because we never use u'xxx' for string literals, but 'xxx'.
+        # Therefore we may have to try two times to normalize the data.
+        try:
+            document.body[i] = unicodedata.normalize("NFKD", document.body[i])
+        except TypeError:
+            document.body[i] = unicodedata.normalize("NFKD", unicode(document.body[i], 'utf-8'))
+
+    # Replace accented characters with InsetLaTeXAccent
+    # Do not convert characters that can be represented in the chosen
+    # encoding.
+    encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
+    lang_re = re.compile(r"^\\lang\s(\S+)")
+    for i in range(len(document.body)):
+
+        if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '':
+            # Track the encoding of the current line
+            result = lang_re.match(document.body[i])
+            if result:
+                language = result.group(1)
+                if language == "default":
+                    encoding_stack[-1] = document.encoding
+                else:
+                    from lyx2lyx_lang import lang
+                    encoding_stack[-1] = lang[language][3]
+                continue
+            elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
+                encoding_stack.append(encoding_stack[-1])
+                continue
+            elif find_token(document.body, "\\end_layout", i, i + 1) == i:
+                del encoding_stack[-1]
+                continue
+
+        for j in range(len(document.body[i])):
+            # dotless i and dotless j are both in special_accent_map and can
+            # occur as an accented character, so we need to test that the
+            # following character is no accent
+            if (document.body[i][j] in inverse_special_accent_map and
+                (j == len(document.body[i]) - 1 or document.body[i][j+1] not in inverse_accent_map)):
+                accent = document.body[i][j]
+                try:
+                    dummy = accent.encode(encoding_stack[-1])
+                except UnicodeEncodeError:
+                    # Insert the rest of the line as new line
+                    if j < len(document.body[i]) - 1:
+                        document.body[i+1:i+1] = document.body[i][j+1:]
+                    # Delete the accented character
+                    if j > 0:
+                        document.body[i] = document.body[i][:j-1]
+                    else:
+                        document.body[i] = u''
+                    # Finally add the InsetLaTeXAccent
+                    document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
+                    break
+            elif j > 0 and document.body[i][j] in inverse_accent_map:
+                accented_char = document.body[i][j-1]
+                if accented_char == ' ':
+                    # Conform to LyX output
+                    accented_char = ''
+                elif accented_char in inverse_accented_map:
+                    accented_char = inverse_accented_map[accented_char]
+                accent = document.body[i][j]
+                try:
+                    dummy = unicodedata.normalize("NFKC", accented_char + accent).encode(encoding_stack[-1])
+                except UnicodeEncodeError:
+                    # Insert the rest of the line as new line
+                    if j < len(document.body[i]) - 1:
+                        document.body[i+1:i+1] = document.body[i][j+1:]
+                    # Delete the accented characters
+                    if j > 1:
+                        document.body[i] = document.body[i][:j-2]
+                    else:
+                        document.body[i] = u''
+                    # Finally add the InsetLaTeXAccent
+                    document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
+                    break
+    # Normalize to "Normal form C" (NFC, pre-composed characters) again
+    for i in range(numberoflines):
+        document.body[i] = unicodedata.normalize("NFKC", document.body[i])
+
+
+def normalize_font_whitespace(document):
+    """ Before format 259 the font changes were ignored if a
+    whitespace was the first or last character in the sequence, this function
+    transfers the whitespace outside."""
+
+    if document.backend != "latex":
+        return
+
+    lines = document.body
+
+    char_properties = {"\\series": "default",
+                       "\\emph": "default",
+                       "\\color": "none",
+                       "\\shape": "default",
+                       "\\bar": "default",
+                       "\\family": "default"}
+    changes = {}
+
+    i = 0
+    while i < len(lines):
+        words = lines[i].split()
+
+        if len(words) > 0 and words[0] == "\\begin_layout":
+            # a new paragraph resets all font changes
+            changes.clear()
+
+        elif len(words) > 1 and words[0] in char_properties.keys():
+            # we have a font change
+            if char_properties[words[0]] == words[1]:
+                # property gets reset
+                if words[0] in changes.keys():
+                    del changes[words[0]]
+                defaultproperty = True
+            else:
+                # property gets set
+                changes[words[0]] = words[1]
+                defaultproperty = False
+
+            # We need to explicitly reset all changed properties if we find
+            # a space below, because LyX 1.4 would output the space after
+            # closing the previous change and before starting the new one,
+            # and closing a font change means to close all properties, not
+            # just the changed one.
+
+            if lines[i-1] and lines[i-1][-1] == " ":
+                lines[i-1] = lines[i-1][:-1]
+                # a space before the font change
+                added_lines = [" "]
+                for k in changes.keys():
+                    # exclude property k because that is already in lines[i]
+                    if k != words[0]:
+                        added_lines[1:1] = ["%s %s" % (k, changes[k])]
+                for k in changes.keys():
+                    # exclude property k because that must be added below anyway
+                    if k != words[0]:
+                        added_lines[0:0] = ["%s %s" % (k, char_properties[k])]
+                if defaultproperty:
+                    # Property is reset in lines[i], so add the new stuff afterwards
+                    lines[i+1:i+1] = added_lines
+                else:
+                    # Reset property for the space
+                    added_lines[0:0] = ["%s %s" % (words[0], char_properties[words[0]])]
+                    lines[i:i] = added_lines
+                i = i + len(added_lines)
+
+            elif lines[i+1] and lines[i+1][0] == " " and (len(changes) > 0 or not defaultproperty):
+                # a space after the font change
+                if (lines[i+1] == " " and lines[i+2]):
+                    next_words = lines[i+2].split()
+                    if len(next_words) > 0 and next_words[0] == words[0]:
+                        # a single blank with a property different from the
+                        # previous and the next line must not be changed
+                        i = i + 2
+                        continue
+                lines[i+1] = lines[i+1][1:]
+                added_lines = [" "]
+                for k in changes.keys():
+                    # exclude property k because that is already in lines[i]
+                    if k != words[0]:
+                        added_lines[1:1] = ["%s %s" % (k, changes[k])]
+                for k in changes.keys():
+                    # exclude property k because that must be added below anyway
+                    if k != words[0]:
+                        added_lines[0:0] = ["%s %s" % (k, char_properties[k])]
+                # Reset property for the space
+                added_lines[0:0] = ["%s %s" % (words[0], char_properties[words[0]])]
+                lines[i:i] = added_lines
+                i = i + len(added_lines)
+
+        i = i + 1
+
+
+def revert_utf8x(document):
+    " Set utf8x encoding to utf8. "
+    i = find_token(document.header, "\\inputencoding", 0)
+    if i == -1:
+        document.header.append("\\inputencoding auto")
+    else:
+        inputenc = get_value(document.header, "\\inputencoding", i)
+        if inputenc == "utf8x":
+            document.header[i] = "\\inputencoding utf8"
+    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
+
+
  ##
  # Conversion hub
  #
@@ -734,16 +1209,24 @@ convert = [[246, []],
             [253, []],
             [254, [convert_esint]],
             [255, []],
-           [256, []]]
-
-revert =  [[255, [revert_encodings]],
+           [256, []],
+           [257, [convert_caption]],
+           [258, [convert_lyxline]],
+           [259, [convert_accent, normalize_font_whitespace]],
+           [260, []]]
+
+revert =  [[259, [revert_utf8x]],
+           [258, []],
+           [257, []],
+           [256, [revert_caption]],
+           [255, [revert_encodings]],
             [254, [revert_clearpage, revert_cleardoublepage]],
             [253, [revert_esint]],
             [252, [revert_nomenclature, revert_printnomenclature]],
             [251, [revert_commandparams]],
             [250, [revert_cs_label]],
             [249, []],
-           [248, [revert_utf8]],
+           [248, [revert_accent, revert_utf8]],
             [247, [revert_booktabs]],
             [246, [revert_font_settings]],
             [245, [revert_framed]]]
@@ -752,3 +1235,4 @@ revert =  [[255, [revert_encodings]],
  if __name__ == "__main__":
      pass
  
+