Allow utf8x \inputencoding

[lyx.git] / lib / lyx2lyx / lyx_1_5.py
diff --git a/lib/lyx2lyx/lyx_1_5.py b/lib/lyx2lyx/lyx_1_5.py

index 2014f26d25709ead6bba32f4a9f62bbbac256f8d..a57282b2561d669f6c60844636e9a0f039e58f33 100644 (file)
--- a/lib/lyx2lyx/lyx_1_5.py
+++ b/lib/lyx2lyx/lyx_1_5.py
@@ -230,6 +230,8 @@ where at least two languages have different default encodings are encoded
  in multiple encodings for file formats < 249. These files are incorrectly
  read and written (as if the whole file was in the encoding of the main
  language).
+This is not true for files written by CJK-LyX; they are always in the locale
+encoding.
  
  This function
  - converts from fake unicode values to true unicode if forward is true, and
@@ -239,6 +241,8 @@ document.encoding must be set to the old value (format 248) in both cases.
  We do this here and not in LyX.py because it is far easier to do the
  necessary parsing in modern formats than in ancient ones.
  """
+    if document.cjk_encoding != '':
+        return
      encoding_stack = [document.encoding]
      lang_re = re.compile(r"^\\lang\s(\S+)")
      if document.inputencoding == "auto" or document.inputencoding == "default":
@@ -292,7 +296,7 @@ def revert_utf8(document):
      elif get_value(document.header, "\\inputencoding", i) == "utf8":
          document.header[i] = "\\inputencoding auto"
      document.inputencoding = get_value(document.header, "\\inputencoding", 0)
-    document.encoding = get_encoding(document.language, document.inputencoding, 248)
+    document.encoding = get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)
      convert_multiencoding(document, False)
  
  
@@ -1016,11 +1020,11 @@ def revert_accent(document):
      # Replace accented characters with InsetLaTeXAccent
      # Do not convert characters that can be represented in the chosen
      # encoding.
-    encoding_stack = [get_encoding(document.language, document.inputencoding, 248)]
+    encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
      lang_re = re.compile(r"^\\lang\s(\S+)")
      for i in range(len(document.body)):
  
-        if document.inputencoding == "auto" or document.inputencoding == "default":
+        if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '':
              # Track the encoding of the current line
              result = lang_re.match(document.body[i])
              if result:
@@ -1088,7 +1092,7 @@ def revert_accent(document):
  
  def normalize_font_whitespace(document):
      """ Before format 259 the font changes were ignored if a
-    whitespace was the last character in the sequence, this function
+    whitespace was the first or last character in the sequence; this function
      transfers the whitespace outside."""
  
      if document.backend != "latex":
@@ -1100,16 +1104,95 @@ def normalize_font_whitespace(document):
                         "\\emph": "default",
                         "\\color": "none",
                         "\\shape": "default",
+                       "\\bar": "default",
                         "\\family": "default"}
+    changes = {}
  
-    for i in range(len(lines)):
+    i = 0
+    while i < len(lines):
          words = lines[i].split()
  
-        if len(words) > 1 and words[0] in char_properties.keys() \
-               and words[1] == char_properties[words[0]] \
-               and lines[i-1] and lines[i-1][-1] == " ":
-            lines[i-1] = lines[i-1][:-1]
-            lines[i+1] = " " + lines[i+1]
+        if len(words) > 0 and words[0] == "\\begin_layout":
+            # a new paragraph resets all font changes
+            changes.clear()
+
+        elif len(words) > 1 and words[0] in char_properties.keys():
+            # we have a font change
+            if char_properties[words[0]] == words[1]:
+                # property gets reset
+                if words[0] in changes.keys():
+                    del changes[words[0]]
+                defaultproperty = True
+            else:
+                # property gets set
+                changes[words[0]] = words[1]
+                defaultproperty = False
+
+            # We need to explicitly reset all changed properties if we find
+            # a space below, because LyX 1.4 would output the space after
+            # closing the previous change and before starting the new one,
+            # and closing a font change means to close all properties, not
+            # just the changed one.
+
+            if lines[i-1] and lines[i-1][-1] == " ":
+                lines[i-1] = lines[i-1][:-1]
+                # a space before the font change
+                added_lines = [" "]
+                for k in changes.keys():
+                    # exclude property k because that is already in lines[i]
+                    if k != words[0]:
+                        added_lines[1:1] = ["%s %s" % (k, changes[k])]
+                for k in changes.keys():
+                    # exclude property k because that must be added below anyway
+                    if k != words[0]:
+                        added_lines[0:0] = ["%s %s" % (k, char_properties[k])]
+                if defaultproperty:
+                    # Property is reset in lines[i], so add the new stuff afterwards
+                    lines[i+1:i+1] = added_lines
+                else:
+                    # Reset property for the space
+                    added_lines[0:0] = ["%s %s" % (words[0], char_properties[words[0]])]
+                    lines[i:i] = added_lines
+                i = i + len(added_lines)
+
+            elif lines[i+1] and lines[i+1][0] == " " and (len(changes) > 0 or not defaultproperty):
+                # a space after the font change
+                if (lines[i+1] == " " and lines[i+2]):
+                    next_words = lines[i+2].split()
+                    if len(next_words) > 0 and next_words[0] == words[0]:
+                        # a single blank with a property different from the
+                        # previous and the next line must not be changed
+                        i = i + 2
+                        continue
+                lines[i+1] = lines[i+1][1:]
+                added_lines = [" "]
+                for k in changes.keys():
+                    # exclude property k because that is already in lines[i]
+                    if k != words[0]:
+                        added_lines[1:1] = ["%s %s" % (k, changes[k])]
+                for k in changes.keys():
+                    # exclude property k because that must be added below anyway
+                    if k != words[0]:
+                        added_lines[0:0] = ["%s %s" % (k, char_properties[k])]
+                # Reset property for the space
+                added_lines[0:0] = ["%s %s" % (words[0], char_properties[words[0]])]
+                lines[i:i] = added_lines
+                i = i + len(added_lines)
+
+        i = i + 1
+
+
+def revert_utf8x(document):
+    " Set utf8x encoding to utf8; add 'auto' if the header sets no inputencoding. "
+    i = find_token(document.header, "\\inputencoding", 0)
+    if i == -1:
+        document.header.append("\\inputencoding auto")
+    else:
+        inputenc = get_value(document.header, "\\inputencoding", i)
+        if inputenc == "utf8x":
+            document.header[i] = "\\inputencoding utf8"
+    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
+
  
  ##
  # Conversion hub
@@ -1129,9 +1212,11 @@ convert = [[246, []],
             [256, []],
             [257, [convert_caption]],
             [258, [convert_lyxline]],
-           [259, [convert_accent, normalize_font_whitespace]]]
+           [259, [convert_accent, normalize_font_whitespace]],
+           [260, []]]
  
-revert =  [[258, []],
+revert =  [[259, [revert_utf8x]],
+           [258, []],
             [257, []],
             [256, [revert_caption]],
             [255, [revert_encodings]],