]> git.lyx.org Git - lyx.git/blobdiff - lib/lyx2lyx/lyx_1_5.py
Fix a few edge cases in the lyx2lyx conversion to format 249
[lyx.git] / lib / lyx2lyx / lyx_1_5.py
index 3374d8567f229dd571c19a0476fa62feb0742c7e..d874ef3101bb27a4b9c574833ecb1aaeecec3b34 100644 (file)
@@ -246,10 +246,13 @@ document.encoding must be set to the old value (format 248) in both cases.
 We do this here and not in LyX.py because it is far easier to do the
 necessary parsing in modern formats than in ancient ones.
 """
+    inset_types = ["Foot", "Note"]
     if document.cjk_encoding != '':
         return
     encoding_stack = [document.encoding]
+    inset_stack = []
     lang_re = re.compile(r"^\\lang\s(\S+)")
+    inset_re = re.compile(r"^\\begin_inset\s(\S+)")
     if document.inputencoding == "auto" or document.inputencoding == "default":
         for i in range(len(document.body)):
             result = lang_re.match(document.body[i])
@@ -264,7 +267,11 @@ necessary parsing in modern formats than in ancient ones.
                     encoding_stack[-1] = lang[language][3]
             elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
                 document.warning("Adding nested encoding %s." % encoding_stack[-1], 3)
-                encoding_stack.append(encoding_stack[-1])
+                if len(inset_stack) > 0 and inset_stack[-1] in inset_types:
+                    from lyx2lyx_lang import lang
+                    encoding_stack.append(lang[document.language][3])
+                else:
+                    encoding_stack.append(encoding_stack[-1])
             elif find_token(document.body, "\\end_layout", i, i + 1) == i:
                 document.warning("Removing nested encoding %s." % encoding_stack[-1], 3)
                 if len(encoding_stack) == 1:
@@ -272,6 +279,15 @@ necessary parsing in modern formats than in ancient ones.
                     document.warning("Malformed LyX document: Unexpected `\\end_layout'.")
                 else:
                     del encoding_stack[-1]
+            elif find_token(document.body, "\\begin_inset", i, i + 1) == i:
+                inset_result = inset_re.match(document.body[i])
+                if inset_result:
+                    inset_type = inset_result.group(1)
+                    inset_stack.append(inset_type)
+                else: 
+                    inset_stack.append("")
+            elif find_token(document.body, "\\end_inset", i, i + 1) == i:
+                del inset_stack[-1]
             if encoding_stack[-1] != document.encoding:
                 if forward:
                     # This line has been incorrectly interpreted as if it was
@@ -1043,7 +1059,8 @@ def revert_accent(document):
     # encoding.
     encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
     lang_re = re.compile(r"^\\lang\s(\S+)")
-    for i in range(len(document.body)):
+    i = 0
+    while i < len(document.body):
 
         if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '':
             # Track the encoding of the current line
@@ -1075,7 +1092,7 @@ def revert_accent(document):
                 except UnicodeEncodeError:
                     # Insert the rest of the line as new line
                     if j < len(document.body[i]) - 1:
-                        document.body[i+1:i+1] = document.body[i][j+1:]
+                        document.body.insert(i+1, document.body[i][j+1:])
                     # Delete the accented character
                     if j > 0:
                         document.body[i] = document.body[i][:j-1]
@@ -1097,7 +1114,7 @@ def revert_accent(document):
                 except UnicodeEncodeError:
                     # Insert the rest of the line as new line
                     if j < len(document.body[i]) - 1:
-                        document.body[i+1:i+1] = document.body[i][j+1:]
+                        document.body.insert(i+1, document.body[i][j+1:])
                     # Delete the accented characters
                     if j > 1:
                         document.body[i] = document.body[i][:j-2]
@@ -1106,6 +1123,8 @@ def revert_accent(document):
                     # Finally add the InsetLaTeXAccent
                     document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
                     break
+        i = i + 1
+
     # Normalize to "Normal form C" (NFC, pre-composed characters) again
     for i in range(numberoflines):
         document.body[i] = unicodedata.normalize("NFC", document.body[i])
@@ -1616,7 +1635,7 @@ after label
         inlinecode = ''
         # looking for the oneline code for lstinline
         inlinecode = document.body[find_end_of_layout(document.body, 
-            find_token(document.body, '\\begin_layout Standard', i + 1) +1 ) - 1]
+            find_token(document.body,  '\\begin_layout %s' % document.default_layout, i + 1) +1 ) - 1]
         if len(caption) > 0:
             if len(params) == 0:
                 params = 'caption={%s}' % caption
@@ -1633,7 +1652,7 @@ after label
         if inline == 'true':
             document.body[i:(j+1)] = [r'\begin_inset ERT',
                                       'status %s' % status,
-                                      r'\begin_layout Standard',
+                                      r'\begin_layout %s' % document.default_layout,
                                       '', 
                                       '',
                                       r'\backslash',
@@ -1645,7 +1664,7 @@ after label
             document.body[i: j+1] =  [r'\begin_inset ERT',
                                       'status %s' % status,
                                       '',
-                                      r'\begin_layout Standard',
+                                      r'\begin_layout %s' % document.default_layout,
                                       '',
                                       '',
                                       r'\backslash',
@@ -1653,7 +1672,7 @@ after label
                                       r'\end_layout'
                                     ] + document.body[k : j - 1] + \
                                      ['',
-                                      r'\begin_layout Standard',
+                                      r'\begin_layout %s' % document.default_layout,
                                       '',
                                       r'\backslash',
                                       'end{lstlisting}',
@@ -1704,7 +1723,7 @@ lstinputlisting{file}[opt]
         document.body[i : j + 1] = [r'\begin_inset ERT',
                                     'status open',
                                     '',
-                                    r'\begin_layout Standard',
+                                    r'\begin_layout %s' % document.default_layout,
                                     '',
                                     '',
                                     r'\backslash',
@@ -1769,6 +1788,7 @@ def convert_ext_font_sizes(document):
     else:
         del document.header[i]
 
+
 def revert_separator_layout(document):
     r'''Revert --Separator-- to a lyx note
 From
@@ -1803,11 +1823,11 @@ something
         if j == -1:
             # this should not happen
             break
-        document.body[i : j + 1] = [r'\begin_layout Standard',
+        document.body[i : j + 1] = [r'\begin_layout %s' % document.default_layout,
                                     r'\begin_inset Note Note',
                                     'status open',
                                     '',
-                                    r'\begin_layout Standard',
+                                    r'\begin_layout %s' % document.default_layout,
                                     'Separate Environment',
                                     r'\end_layout',
                                     '',
@@ -1817,6 +1837,7 @@ something
                                     r'\end_layout'
                                     ]
 
+
 def convert_arabic (document):
     if document.language == "arabic":
         document.language = "arabic_arabtex"
@@ -1830,7 +1851,8 @@ def convert_arabic (document):
             # change the language name
             document.body[i] = '\lang arabic_arabtex'
         i = i + 1
-       
+
+
 def revert_arabic (document):
     if document.language == "arabic_arabtex":
         document.language = "arabic"
@@ -1845,13 +1867,11 @@ def revert_arabic (document):
             document.body[i] = '\lang arabic'
         i = i + 1
 
-def revert_unicode(document):
-    '''Transform unicode symbols according to the unicode list.
-Preamble flags are not implemented.
-Combination characters are currently ignored.
-Forced output is currently not enforced'''
-    pathname = os.path.dirname(sys.argv[0])
-    fp = open(pathname.strip('lyx2lyx') + 'unicodesymbols','r')
+
+def read_unicodesymbols():
+    " Read the unicodesymbols list of unicode characters and corresponding commands."
+    pathname = os.path.abspath(os.path.dirname(sys.argv[0]))
+    fp = open(os.path.join(pathname.strip('lyx2lyx'), 'unicodesymbols'))
     spec_chars = {}
     for line in fp.readlines():
         if line[0] != '#':
@@ -1859,82 +1879,133 @@ Forced output is currently not enforced'''
             line=line.replace('" ',' ') # remove all quotation marks with spaces after
             line=line.replace(r'\"','"') # replace \" by " (for characters with diaeresis)
             try:
-                # flag1 and flag2 are preamble & flags
-                # currently NOT implemented
+                # flag1 and flag2 are preamble and other flags
                 [ucs4,command,flag1,flag2] =line.split(None,3)
                 spec_chars[unichr(eval(ucs4))] = [command, flag1, flag2]
             except:
                 pass
     fp.close()
+
+    return spec_chars
+
+
+def revert_unicode(document):
+    '''Transform unicode characters that can not be written using the
+document encoding to commands according to the unicodesymbols
+file. Characters that can not be replaced by commands are replaced by
+a replacement string.  Flags other than 'combining' are currently not
+implemented.'''
+
+    replacement_character = '???'
+    spec_chars = read_unicodesymbols()
+
     # Define strings to start and end ERT and math insets
-    ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout Standard\n\\backslash\n'
-    ert_outro='\n\\end_layout\n\n\\end_inset\n\n'
+    ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s\n\\backslash' % document.default_layout
+    ert_outro='\n\\end_layout\n\n\\end_inset\n'
     math_intro='\n\\begin_inset Formula $'
-    math_outro='$\n\\end_inset\n'
+    math_outro='$\n\\end_inset'
     # Find unicode characters and replace them
-    in_ert = 0 # flag set to 1 if in ERT inset
-    in_math = 0 # flag set to 1 if in math inset
+    in_ert = False # flag set to True if in ERT inset
+    in_math = False # flag set to True if in math inset
     insets = [] # list of active insets
-    for i, current_line in enumerate(document.body):
-        if current_line.find('\\begin_inset') > -1:
+
+    # Go through the file to capture all combining characters
+    last_char = '' # to store the previous character
+
+    i = 0
+    while i < len(document.body):
+        line = document.body[i]
+        # Check for insets
+        if line.find('\\begin_inset') > -1:
             # check which inset to start
-            if current_line.find('\\begin_inset ERT') > -1:
-                in_ert = 1
+            if line.find('\\begin_inset ERT') > -1:
+                in_ert = True
                 insets.append('ert')
-            elif current_line.find('\\begin_inset Formula') > -1:
-                in_math = 1
+            elif line.find('\\begin_inset Formula') > -1:
+                in_math = True
                 insets.append('math')
             else:
                 insets.append('other')
-        if current_line.find('\\end_inset') > -1:
+        if line.find('\\end_inset') > -1:
             # check which inset to end
             try:
                 cur_inset = insets.pop()
                 if cur_inset == 'ert':
-                    in_ert = 0
+                    in_ert = False
                 elif cur_inset == 'math':
-                    in_math = 0
+                    in_math = False
                 else:
                     pass # end of other inset
             except:
                 pass # inset list was empty (for some reason)
-        current_line=''; # clear to have as container for modified line
-        for j in range(len(document.body[i])):
-            if spec_chars.has_key(document.body[i][j]):
-                flags = spec_chars[document.body[i][j]][1] + spec_chars[document.body[i][j]][2]
-                if flags.find('combining') > -1:
-                    command = ''
-                else:
-                    command = spec_chars[document.body[i][j]][0]; # the command to replace unicode
-                    if command[0:2] == '\\\\':
-                        if command[2:12]=='ensuremath':
-                            if in_ert == 1:
-                                # math in ERT
-                                command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash\n')
-                                command = command.replace('}', '$\n')
-                            elif in_math == 0:
-                                # add a math inset with the replacement character
-                                command = command.replace('\\\\ensuremath{\\', math_intro)
-                                command = command.replace('}', math_outro)
+        
+        # Try to write the line
+        try:
+            # If all goes well the line is written here
+            dummy = line.encode(document.encoding)
+            last_char = line[-1]
+            i += 1
+        except:
+            # Error, some character(s) in the line need to be replaced
+            mod_line = u''
+            for character in line:
+                try:
+                    # Try to write the character
+                    dummy = character.encode(document.encoding)
+                    mod_line += character
+                    last_char = character
+                except:
+                    # Try to replace with ERT/math inset
+                    if spec_chars.has_key(character):
+                        command = spec_chars[character][0] # the command to replace unicode
+                        flag1 = spec_chars[character][1]
+                        flag2 = spec_chars[character][2]
+                        if flag1.find('combining') > -1 or flag2.find('combining') > -1:
+                            # We have a character that should be combined with the previous
+                            command += '{' + last_char + '}'
+                            # Remove the last character. Ignore if it is whitespace
+                            if len(last_char.rstrip()):
+                                # last_char was found and is not whitespace
+                                if mod_line:
+                                    mod_line = mod_line[:-1]
+                                else: # last_char belongs to the previous line
+                                    document.body[i-1] = document.body[i-1][:-1]
                             else:
-                                # we are already in a math inset
-                                command = command.replace('\\\\ensuremath{\\', '')
-                                command = command.replace('}', '')
-                        else:
-                            if in_math == 1:
-                                # avoid putting an ERT in a math; instead put command as text
-                                command = command.replace('\\\\', '\mathrm{')
-                                command = command + '}'
-                            elif in_ert == 0:
-                                # add an ERT inset with the replacement character
-                                command = command.replace('\\\\', ert_intro)
-                                command = command + ert_outro
+                                # The last character was replaced by a command. For now it is
+                                # ignored. This could be handled better.
+                                pass
+                        if command[0:2] == '\\\\':
+                            if command[2:12]=='ensuremath':
+                                if in_ert:
+                                    # math in ERT
+                                    command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash')
+                                    command = command.replace('}', '$\n')
+                                elif not in_math:
+                                    # add a math inset with the replacement character
+                                    command = command.replace('\\\\ensuremath{\\', math_intro)
+                                    command = command.replace('}', math_outro)
+                                else:
+                                    # we are already in a math inset
+                                    command = command.replace('\\\\ensuremath{\\', '')
+                                    command = command.replace('}', '')
                             else:
-                                command = command.replace('\\\\', '\n\\backslash\n')
-                current_line = current_line + command
-            else:
-                current_line = current_line + document.body[i][j]
-        document.body[i] = current_line
+                                if in_math:
+                                    # avoid putting an ERT in a math; instead put command as text
+                                    command = command.replace('\\\\', '\mathrm{')
+                                    command = command + '}'
+                                elif not in_ert:
+                                    # add an ERT inset with the replacement character
+                                    command = command.replace('\\\\', ert_intro)
+                                    command = command + ert_outro
+                                else:
+                                    command = command.replace('\\\\', '\n\\backslash')
+                            last_char = '' # indicate that the character should not be removed
+                        mod_line += command
+                    else:
+                        # Replace with replacement string
+                        mod_line += replacement_character
+            document.body[i:i+1] = mod_line.split('\n')
+            i += len(mod_line.split('\n'))
 
 
 ##