We do this here and not in LyX.py because it is far easier to do the
necessary parsing in modern formats than in ancient ones.
"""
+ inset_types = ["Foot", "Note"]
if document.cjk_encoding != '':
return
encoding_stack = [document.encoding]
+ inset_stack = []
lang_re = re.compile(r"^\\lang\s(\S+)")
+ inset_re = re.compile(r"^\\begin_inset\s(\S+)")
if document.inputencoding == "auto" or document.inputencoding == "default":
for i in range(len(document.body)):
result = lang_re.match(document.body[i])
encoding_stack[-1] = lang[language][3]
elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
document.warning("Adding nested encoding %s." % encoding_stack[-1], 3)
- encoding_stack.append(encoding_stack[-1])
+ if len(inset_stack) > 0 and inset_stack[-1] in inset_types:
+ from lyx2lyx_lang import lang
+ encoding_stack.append(lang[document.language][3])
+ else:
+ encoding_stack.append(encoding_stack[-1])
elif find_token(document.body, "\\end_layout", i, i + 1) == i:
document.warning("Removing nested encoding %s." % encoding_stack[-1], 3)
if len(encoding_stack) == 1:
document.warning("Malformed LyX document: Unexpected `\\end_layout'.")
else:
del encoding_stack[-1]
+ elif find_token(document.body, "\\begin_inset", i, i + 1) == i:
+ inset_result = inset_re.match(document.body[i])
+ if inset_result:
+ inset_type = inset_result.group(1)
+ inset_stack.append(inset_type)
+ else:
+ inset_stack.append("")
+ elif find_token(document.body, "\\end_inset", i, i + 1) == i:
+ del inset_stack[-1]
if encoding_stack[-1] != document.encoding:
if forward:
# This line has been incorrectly interpreted as if it was
# encoding.
encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
lang_re = re.compile(r"^\\lang\s(\S+)")
- for i in range(len(document.body)):
+ i = 0
+ while i < len(document.body):
if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '':
# Track the encoding of the current line
except UnicodeEncodeError:
# Insert the rest of the line as new line
if j < len(document.body[i]) - 1:
- document.body[i+1:i+1] = document.body[i][j+1:]
+ document.body.insert(i+1, document.body[i][j+1:])
# Delete the accented character
if j > 0:
document.body[i] = document.body[i][:j-1]
except UnicodeEncodeError:
# Insert the rest of the line as new line
if j < len(document.body[i]) - 1:
- document.body[i+1:i+1] = document.body[i][j+1:]
+ document.body.insert(i+1, document.body[i][j+1:])
# Delete the accented characters
if j > 1:
document.body[i] = document.body[i][:j-2]
# Finally add the InsetLaTeXAccent
document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
break
+ i = i + 1
+
# Normalize to "Normal form C" (NFC, pre-composed characters) again
for i in range(numberoflines):
document.body[i] = unicodedata.normalize("NFC", document.body[i])
inlinecode = ''
# looking for the oneline code for lstinline
inlinecode = document.body[find_end_of_layout(document.body,
- find_token(document.body, '\\begin_layout Standard', i + 1) +1 ) - 1]
+ find_token(document.body, '\\begin_layout %s' % document.default_layout, i + 1) +1 ) - 1]
if len(caption) > 0:
if len(params) == 0:
params = 'caption={%s}' % caption
if inline == 'true':
document.body[i:(j+1)] = [r'\begin_inset ERT',
'status %s' % status,
- r'\begin_layout Standard',
+ r'\begin_layout %s' % document.default_layout,
'',
'',
r'\backslash',
document.body[i: j+1] = [r'\begin_inset ERT',
'status %s' % status,
'',
- r'\begin_layout Standard',
+ r'\begin_layout %s' % document.default_layout,
'',
'',
r'\backslash',
r'\end_layout'
] + document.body[k : j - 1] + \
['',
- r'\begin_layout Standard',
+ r'\begin_layout %s' % document.default_layout,
'',
r'\backslash',
'end{lstlisting}',
document.body[i : j + 1] = [r'\begin_inset ERT',
'status open',
'',
- r'\begin_layout Standard',
+ r'\begin_layout %s' % document.default_layout,
'',
'',
r'\backslash',
else:
del document.header[i]
+
def revert_separator_layout(document):
r'''Revert --Separator-- to a lyx note
From
if j == -1:
# this should not happen
break
- document.body[i : j + 1] = [r'\begin_layout Standard',
+ document.body[i : j + 1] = [r'\begin_layout %s' % document.default_layout,
r'\begin_inset Note Note',
'status open',
'',
- r'\begin_layout Standard',
+ r'\begin_layout %s' % document.default_layout,
'Separate Environment',
r'\end_layout',
'',
r'\end_layout'
]
+
def convert_arabic (document):
if document.language == "arabic":
document.language = "arabic_arabtex"
# change the language name
document.body[i] = '\lang arabic_arabtex'
i = i + 1
-
+
+
def revert_arabic (document):
if document.language == "arabic_arabtex":
document.language = "arabic"
document.body[i] = '\lang arabic'
i = i + 1
-def revert_unicode(document):
- '''Transform unicode symbols according to the unicode list.
-Preamble flags are not implemented.
-Combination characters are currently ignored.
-Forced output is currently not enforced'''
- pathname = os.path.dirname(sys.argv[0])
- fp = open(pathname.strip('lyx2lyx') + 'unicodesymbols','r')
+
+def read_unicodesymbols():
+ " Read the unicodesymbols list of unicode characters and corresponding commands."
+ pathname = os.path.abspath(os.path.dirname(sys.argv[0]))
+ fp = open(os.path.join(pathname.strip('lyx2lyx'), 'unicodesymbols'))
spec_chars = {}
for line in fp.readlines():
if line[0] != '#':
line=line.replace('" ',' ') # remove all quotation marks with spaces after
line=line.replace(r'\"','"') # replace \" by " (for characters with diaeresis)
try:
- # flag1 and flag2 are preamble & flags
- # currently NOT implemented
+ # flag1 and flag2 are preamble and other flags
[ucs4,command,flag1,flag2] =line.split(None,3)
spec_chars[unichr(eval(ucs4))] = [command, flag1, flag2]
except:
pass
fp.close()
+
+ return spec_chars
+
+
+def revert_unicode(document):
+ '''Transform unicode characters that cannot be written using the
+document encoding to commands according to the unicodesymbols
+file. Characters that cannot be replaced by commands are replaced by
+a replacement string. Flags other than 'combining' are currently not
+implemented.'''
+
+ replacement_character = '???'
+ spec_chars = read_unicodesymbols()
+
# Define strings to start and end ERT and math insets
- ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout Standard\n\\backslash\n'
- ert_outro='\n\\end_layout\n\n\\end_inset\n\n'
+ ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s\n\\backslash' % document.default_layout
+ ert_outro='\n\\end_layout\n\n\\end_inset\n'
math_intro='\n\\begin_inset Formula $'
- math_outro='$\n\\end_inset\n'
+ math_outro='$\n\\end_inset'
# Find unicode characters and replace them
- in_ert = 0 # flag set to 1 if in ERT inset
- in_math = 0 # flag set to 1 if in math inset
+ in_ert = False # flag set to True while inside an ERT inset
+ in_math = False # flag set to True while inside a math inset
insets = [] # list of active insets
- for i, current_line in enumerate(document.body):
- if current_line.find('\\begin_inset') > -1:
+
+ # Go through the file to capture all combining characters
+ last_char = '' # to store the previous character
+
+ i = 0
+ while i < len(document.body):
+ line = document.body[i]
+ # Check for insets
+ if line.find('\\begin_inset') > -1:
# check which inset to start
- if current_line.find('\\begin_inset ERT') > -1:
- in_ert = 1
+ if line.find('\\begin_inset ERT') > -1:
+ in_ert = True
insets.append('ert')
- elif current_line.find('\\begin_inset Formula') > -1:
- in_math = 1
+ elif line.find('\\begin_inset Formula') > -1:
+ in_math = True
insets.append('math')
else:
insets.append('other')
- if current_line.find('\\end_inset') > -1:
+ if line.find('\\end_inset') > -1:
# check which inset to end
try:
cur_inset = insets.pop()
if cur_inset == 'ert':
- in_ert = 0
+ in_ert = False
elif cur_inset == 'math':
- in_math = 0
+ in_math = False
else:
pass # end of other inset
except:
pass # inset list was empty (for some reason)
- current_line=''; # clear to have as container for modified line
- for j in range(len(document.body[i])):
- if spec_chars.has_key(document.body[i][j]):
- flags = spec_chars[document.body[i][j]][1] + spec_chars[document.body[i][j]][2]
- if flags.find('combining') > -1:
- command = ''
- else:
- command = spec_chars[document.body[i][j]][0]; # the command to replace unicode
- if command[0:2] == '\\\\':
- if command[2:12]=='ensuremath':
- if in_ert == 1:
- # math in ERT
- command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash\n')
- command = command.replace('}', '$\n')
- elif in_math == 0:
- # add a math inset with the replacement character
- command = command.replace('\\\\ensuremath{\\', math_intro)
- command = command.replace('}', math_outro)
+
+ # Try to write the line
+ try:
+ # If all goes well the line is written here
+ dummy = line.encode(document.encoding)
+ last_char = line[-1]
+ i += 1
+ except:
+ # Error, some character(s) in the line need to be replaced
+ mod_line = u''
+ for character in line:
+ try:
+ # Try to write the character
+ dummy = character.encode(document.encoding)
+ mod_line += character
+ last_char = character
+ except:
+ # Try to replace with ERT/math inset
+ if spec_chars.has_key(character):
+ command = spec_chars[character][0] # the command to replace unicode
+ flag1 = spec_chars[character][1]
+ flag2 = spec_chars[character][2]
+ if flag1.find('combining') > -1 or flag2.find('combining') > -1:
+ # We have a character that should be combined with the previous
+ command += '{' + last_char + '}'
+ # Remove the last character. Ignore if it is whitespace
+ if len(last_char.rstrip()):
+ # last_char was found and is not whitespace
+ if mod_line:
+ mod_line = mod_line[:-1]
+ else: # last_char belongs to the previous line
+ document.body[i-1] = document.body[i-1][:-1]
else:
- # we are already in a math inset
- command = command.replace('\\\\ensuremath{\\', '')
- command = command.replace('}', '')
- else:
- if in_math == 1:
- # avoid putting an ERT in a math; instead put command as text
- command = command.replace('\\\\', '\mathrm{')
- command = command + '}'
- elif in_ert == 0:
- # add an ERT inset with the replacement character
- command = command.replace('\\\\', ert_intro)
- command = command + ert_outro
+ # The last character was replaced by a command. For now it is
+ # ignored. This could be handled better.
+ pass
+ if command[0:2] == '\\\\':
+ if command[2:12]=='ensuremath':
+ if in_ert:
+ # math in ERT
+ command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash')
+ command = command.replace('}', '$\n')
+ elif not in_math:
+ # add a math inset with the replacement character
+ command = command.replace('\\\\ensuremath{\\', math_intro)
+ command = command.replace('}', math_outro)
+ else:
+ # we are already in a math inset
+ command = command.replace('\\\\ensuremath{\\', '')
+ command = command.replace('}', '')
else:
- command = command.replace('\\\\', '\n\\backslash\n')
- current_line = current_line + command
- else:
- current_line = current_line + document.body[i][j]
- document.body[i] = current_line
+ if in_math:
+ # avoid putting an ERT in a math; instead put command as text
+ command = command.replace('\\\\', '\mathrm{')
+ command = command + '}'
+ elif not in_ert:
+ # add an ERT inset with the replacement character
+ command = command.replace('\\\\', ert_intro)
+ command = command + ert_outro
+ else:
+ command = command.replace('\\\\', '\n\\backslash')
+ last_char = '' # indicate that the character should not be removed
+ mod_line += command
+ else:
+ # Replace with replacement string
+ mod_line += replacement_character
+ document.body[i:i+1] = mod_line.split('\n')
+ i += len(mod_line.split('\n'))
##