lib/lyx2lyx/lyx_1_5.py

   1 # This file is part of lyx2lyx
   2 # -*- coding: utf-8 -*-
   3 # Copyright (C) 2006 José Matos <jamatos@lyx.org>
   4 # Copyright (C) 2004-2006 Georg Baum <Georg.Baum@post.rwth-aachen.de>
   5 #
   6 # This program is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU General Public License
   8 # as published by the Free Software Foundation; either version 2
   9 # of the License, or (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program; if not, write to the Free Software
  18 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  19
  20 """ Convert files to the file format generated by lyx 1.5"""
  21
  22 import re
  23 import unicodedata
  24 import sys, os
  25
  26 from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value, find_beginning_of, find_nonempty_line
  27 from LyX import get_encoding
  28
  29
  30 ####################################################################
  31 # Private helper functions
  32
  33 def find_end_of_inset(lines, i):
  34     " Find end of inset, where lines[i] is included."
  35     return find_end_of(lines, i, "\\begin_inset", "\\end_inset")
  36
  37 def find_end_of_layout(lines, i):
  38     " Find end of layout, where lines[i] is included."
  39     return find_end_of(lines, i, "\\begin_layout", "\\end_layout")
  40
  41 def find_beginning_of_layout(lines, i):
  42     "Find beginning of layout, where lines[i] is included."
  43     return find_beginning_of(lines, i, "\\begin_layout", "\\end_layout")
  44
  45 # End of helper functions
  46 ####################################################################
  47
  48
  49 ##
  50 #  Notes: Framed/Shaded
  51 #
  52
  53 def revert_framed(document):
  54     "Revert framed notes. "
  55     i = 0
  56     while 1:
  57         i = find_tokens(document.body, ["\\begin_inset Note Framed", "\\begin_inset Note Shaded"], i)
  58
  59         if i == -1:
  60             return
  61         document.body[i] = "\\begin_inset Note"
  62         i = i + 1
  63
  64
  65 ##
  66 #  Fonts
  67 #
  68
  69 roman_fonts      = {'default' : 'default', 'ae'       : 'ae',
  70                     'times'   : 'times',   'palatino' : 'palatino',
  71                     'helvet'  : 'default', 'avant'    : 'default',
  72                     'newcent' : 'newcent', 'bookman'  : 'bookman',
  73                     'pslatex' : 'times'}
  74 sans_fonts       = {'default' : 'default', 'ae'       : 'default',
  75                     'times'   : 'default', 'palatino' : 'default',
  76                     'helvet'  : 'helvet',  'avant'    : 'avant',
  77                     'newcent' : 'default', 'bookman'  : 'default',
  78                     'pslatex' : 'helvet'}
  79 typewriter_fonts = {'default' : 'default', 'ae'       : 'default',
  80                     'times'   : 'default', 'palatino' : 'default',
  81                     'helvet'  : 'default', 'avant'    : 'default',
  82                     'newcent' : 'default', 'bookman'  : 'default',
  83                     'pslatex' : 'courier'}
  84
  85 def convert_font_settings(document):
  86     " Convert font settings. "
  87     i = 0
  88     i = find_token_exact(document.header, "\\fontscheme", i)
  89     if i == -1:
  90         document.warning("Malformed LyX document: Missing `\\fontscheme'.")
  91         return
  92     font_scheme = get_value(document.header, "\\fontscheme", i, i + 1)
  93     if font_scheme == '':
  94         document.warning("Malformed LyX document: Empty `\\fontscheme'.")
  95         font_scheme = 'default'
  96     if not font_scheme in roman_fonts.keys():
  97         document.warning("Malformed LyX document: Unknown `\\fontscheme' `%s'." % font_scheme)
  98         font_scheme = 'default'
  99     document.header[i:i+1] = ['\\font_roman %s' % roman_fonts[font_scheme],
 100                           '\\font_sans %s' % sans_fonts[font_scheme],
 101                           '\\font_typewriter %s' % typewriter_fonts[font_scheme],
 102                           '\\font_default_family default',
 103                           '\\font_sc false',
 104                           '\\font_osf false',
 105                           '\\font_sf_scale 100',
 106                           '\\font_tt_scale 100']
 107
 108
 109 def revert_font_settings(document):
 110     " Revert font settings. "
 111     i = 0
 112     insert_line = -1
 113     fonts = {'roman' : 'default', 'sans' : 'default', 'typewriter' : 'default'}
 114     for family in 'roman', 'sans', 'typewriter':
 115         name = '\\font_%s' % family
 116         i = find_token_exact(document.header, name, i)
 117         if i == -1:
 118             document.warning("Malformed LyX document: Missing `%s'." % name)
 119             i = 0
 120         else:
 121             if (insert_line < 0):
 122                 insert_line = i
 123             fonts[family] = get_value(document.header, name, i, i + 1)
 124             del document.header[i]
 125     i = find_token_exact(document.header, '\\font_default_family', i)
 126     if i == -1:
 127         document.warning("Malformed LyX document: Missing `\\font_default_family'.")
 128         font_default_family = 'default'
 129     else:
 130         font_default_family = get_value(document.header, "\\font_default_family", i, i + 1)
 131         del document.header[i]
 132     i = find_token_exact(document.header, '\\font_sc', i)
 133     if i == -1:
 134         document.warning("Malformed LyX document: Missing `\\font_sc'.")
 135         font_sc = 'false'
 136     else:
 137         font_sc = get_value(document.header, '\\font_sc', i, i + 1)
 138         del document.header[i]
 139     if font_sc != 'false':
 140         document.warning("Conversion of '\\font_sc' not yet implemented.")
 141     i = find_token_exact(document.header, '\\font_osf', i)
 142     if i == -1:
 143         document.warning("Malformed LyX document: Missing `\\font_osf'.")
 144         font_osf = 'false'
 145     else:
 146         font_osf = get_value(document.header, '\\font_osf', i, i + 1)
 147         del document.header[i]
 148     i = find_token_exact(document.header, '\\font_sf_scale', i)
 149     if i == -1:
 150         document.warning("Malformed LyX document: Missing `\\font_sf_scale'.")
 151         font_sf_scale = '100'
 152     else:
 153         font_sf_scale = get_value(document.header, '\\font_sf_scale', i, i + 1)
 154         del document.header[i]
 155     if font_sf_scale != '100':
 156         document.warning("Conversion of '\\font_sf_scale' not yet implemented.")
 157     i = find_token_exact(document.header, '\\font_tt_scale', i)
 158     if i == -1:
 159         document.warning("Malformed LyX document: Missing `\\font_tt_scale'.")
 160         font_tt_scale = '100'
 161     else:
 162         font_tt_scale = get_value(document.header, '\\font_tt_scale', i, i + 1)
 163         del document.header[i]
 164     if font_tt_scale != '100':
 165         document.warning("Conversion of '\\font_tt_scale' not yet implemented.")
 166     for font_scheme in roman_fonts.keys():
 167         if (roman_fonts[font_scheme] == fonts['roman'] and
 168             sans_fonts[font_scheme] == fonts['sans'] and
 169             typewriter_fonts[font_scheme] == fonts['typewriter']):
 170             document.header.insert(insert_line, '\\fontscheme %s' % font_scheme)
 171             if font_default_family != 'default':
 172                 document.preamble.append('\\renewcommand{\\familydefault}{\\%s}' % font_default_family)
 173             if font_osf == 'true':
 174                 document.warning("Ignoring `\\font_osf = true'")
 175             return
 176     font_scheme = 'default'
 177     document.header.insert(insert_line, '\\fontscheme %s' % font_scheme)
 178     if fonts['roman'] == 'cmr':
 179         document.preamble.append('\\renewcommand{\\rmdefault}{cmr}')
 180         if font_osf == 'true':
 181             document.preamble.append('\\usepackage{eco}')
 182             font_osf = 'false'
 183     for font in 'lmodern', 'charter', 'utopia', 'beraserif', 'ccfonts', 'chancery':
 184         if fonts['roman'] == font:
 185             document.preamble.append('\\usepackage{%s}' % font)
 186     for font in 'cmss', 'lmss', 'cmbr':
 187         if fonts['sans'] == font:
 188             document.preamble.append('\\renewcommand{\\sfdefault}{%s}' % font)
 189     for font in 'berasans':
 190         if fonts['sans'] == font:
 191             document.preamble.append('\\usepackage{%s}' % font)
 192     for font in 'cmtt', 'lmtt', 'cmtl':
 193         if fonts['typewriter'] == font:
 194             document.preamble.append('\\renewcommand{\\ttdefault}{%s}' % font)
 195     for font in 'courier', 'beramono', 'luximono':
 196         if fonts['typewriter'] == font:
 197             document.preamble.append('\\usepackage{%s}' % font)
 198     if font_default_family != 'default':
 199         document.preamble.append('\\renewcommand{\\familydefault}{\\%s}' % font_default_family)
 200     if font_osf == 'true':
 201         document.warning("Ignoring `\\font_osf = true'")
 202
 203
 204 def revert_booktabs(document):
 205     " We remove the booktabs flag or everything else will become a mess. "
 206     re_row = re.compile(r'^<row.*space="[^"]+".*>$')
 207     re_tspace = re.compile(r'\s+topspace="[^"]+"')
 208     re_bspace = re.compile(r'\s+bottomspace="[^"]+"')
 209     re_ispace = re.compile(r'\s+interlinespace="[^"]+"')
 210     i = 0
 211     while 1:
 212         i = find_token(document.body, "\\begin_inset Tabular", i)
 213         if i == -1:
 214             return
 215         j = find_end_of_inset(document.body, i + 1)
 216         if j == -1:
 217             document.warning("Malformed LyX document: Could not find end of tabular.")
 218             continue
 219         for k in range(i, j):
 220             if re.search('^<features.* booktabs="true".*>$', document.body[k]):
 221                 document.warning("Converting 'booktabs' table to normal table.")
 222                 document.body[k] = document.body[k].replace(' booktabs="true"', '')
 223             if re.search(re_row, document.body[k]):
 224                 document.warning("Removing extra row space.")
 225                 document.body[k] = re_tspace.sub('', document.body[k])
 226                 document.body[k] = re_bspace.sub('', document.body[k])
 227                 document.body[k] = re_ispace.sub('', document.body[k])
 228         i = i + 1
 229
 230
 231 def convert_multiencoding(document, forward):
 232     """ Fix files with multiple encodings.
 233 Files with an inputencoding of "auto" or "default" and multiple languages
 234 where at least two languages have different default encodings are encoded
 235 in multiple encodings for file formats < 249. These files are incorrectly
 236 read and written (as if the whole file was in the encoding of the main
 237 language).
 238 This is not true for files written by CJK-LyX, they are always in the locale
 239 encoding.
 240
 241 This function
 242 - converts from fake unicode values to true unicode if forward is true, and
 243 - converts from true unicode values to fake unicode if forward is false.
 244 document.encoding must be set to the old value (format 248) in both cases.
 245
 246 We do this here and not in LyX.py because it is far easier to do the
 247 necessary parsing in modern formats than in ancient ones.
 248 """
 249     inset_types = ["Foot", "Note"]
 250     if document.cjk_encoding != '':
 251         return
 252     encoding_stack = [document.encoding]
 253     inset_stack = []
 254     lang_re = re.compile(r"^\\lang\s(\S+)")
 255     inset_re = re.compile(r"^\\begin_inset\s(\S+)")
 256     if document.inputencoding == "auto" or document.inputencoding == "default":
 257         for i in range(len(document.body)):
 258             result = lang_re.match(document.body[i])
 259             if result:
 260                 language = result.group(1)
 261                 if language == "default":
 262                     document.warning("Resetting encoding from %s to %s." % (encoding_stack[-1], document.encoding), 3)
 263                     encoding_stack[-1] = document.encoding
 264                 else:
 265                     from lyx2lyx_lang import lang
 266                     document.warning("Setting encoding from %s to %s." % (encoding_stack[-1], lang[language][3]), 3)
 267                     encoding_stack[-1] = lang[language][3]
 268             elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
 269                 document.warning("Adding nested encoding %s." % encoding_stack[-1], 3)
 270                 if len(inset_stack) > 0 and inset_stack[-1] in inset_types:
 271                     from lyx2lyx_lang import lang
 272                     encoding_stack.append(lang[document.language][3])
 273                 else:
 274                     encoding_stack.append(encoding_stack[-1])
 275             elif find_token(document.body, "\\end_layout", i, i + 1) == i:
 276                 document.warning("Removing nested encoding %s." % encoding_stack[-1], 3)
 277                 if len(encoding_stack) == 1:
 278                     # Don't remove the document encoding from the stack
 279                     document.warning("Malformed LyX document: Unexpected `\\end_layout'.")
 280                 else:
 281                     del encoding_stack[-1]
 282             elif find_token(document.body, "\\begin_inset", i, i + 1) == i:
 283                 inset_result = inset_re.match(document.body[i])
 284                 if inset_result:
 285                     inset_type = inset_result.group(1)
 286                     inset_stack.append(inset_type)
 287                 else:
 288                     inset_stack.append("")
 289             elif find_token(document.body, "\\end_inset", i, i + 1) == i:
 290                 del inset_stack[-1]
 291             if encoding_stack[-1] != document.encoding:
 292                 if forward:
 293                     # This line has been incorrectly interpreted as if it was
 294                     # encoded in 'encoding'.
 295                     # Convert back to the 8bit string that was in the file.
 296                     orig = document.body[i].encode(document.encoding)
 297                     # Convert the 8bit string that was in the file to unicode
 298                     # with the correct encoding.
 299                     document.body[i] = orig.decode(encoding_stack[-1])
 300                 else:
 301                     # Convert unicode to the 8bit string that will be written
 302                     # to the file with the correct encoding.
 303                     orig = document.body[i].encode(encoding_stack[-1])
 304                     # Convert the 8bit string that will be written to the
 305                     # file to fake unicode with the encoding that will later
 306                     # be used when writing to the file.
 307                     document.body[i] = orig.decode(document.encoding)
 308
 309
 310 def convert_utf8(document):
 311     " Set document encoding to UTF-8. "
 312     convert_multiencoding(document, True)
 313     document.encoding = "utf8"
 314
 315
 316 def revert_utf8(document):
 317     " Set document encoding to the value corresponding to inputencoding. "
 318     i = find_token(document.header, "\\inputencoding", 0)
 319     if i == -1:
 320         document.header.append("\\inputencoding auto")
 321     elif get_value(document.header, "\\inputencoding", i) == "utf8":
 322         document.header[i] = "\\inputencoding auto"
 323     document.inputencoding = get_value(document.header, "\\inputencoding", 0)
 324     document.encoding = get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)
 325     convert_multiencoding(document, False)
 326
 327
 328 def revert_cs_label(document):
 329     " Remove status flag of charstyle label. "
 330     i = 0
 331     while 1:
 332         i = find_token(document.body, "\\begin_inset CharStyle", i)
 333         if i == -1:
 334             return
 335         # Seach for a line starting 'show_label'
 336         # If it is not there, break with a warning message
 337         i = i + 1
 338         while 1:
 339             if (document.body[i][:10] == "show_label"):
 340                 del document.body[i]
 341                 break
 342             elif (document.body[i][:13] == "\\begin_layout"):
 343                 document.warning("Malformed LyX document: Missing 'show_label'.")
 344                 break
 345             i = i + 1
 346
 347         i = i + 1
 348
 349
 350 def convert_bibitem(document):
 351     """ Convert
 352 \bibitem [option]{argument}
 353
 354 to
 355
 356 \begin_inset LatexCommand bibitem
 357 label "option"
 358 key "argument"
 359
 360 \end_inset
 361
 362 This must be called after convert_commandparams.
 363 """
 364     i = 0
 365     while 1:
 366         i = find_token(document.body, "\\bibitem", i)
 367         if i == -1:
 368             break
 369         j = document.body[i].find('[') + 1
 370         k = document.body[i].rfind(']')
 371         if j == 0: # No optional argument found
 372             option = None
 373         else:
 374             option = document.body[i][j:k]
 375         j = document.body[i].rfind('{') + 1
 376         k = document.body[i].rfind('}')
 377         argument = document.body[i][j:k]
 378         lines = ['\\begin_inset LatexCommand bibitem']
 379         if option != None:
 380             lines.append('label "%s"' % option.replace('"', '\\"'))
 381         lines.append('key "%s"' % argument.replace('"', '\\"'))
 382         lines.append('')
 383         lines.append('\\end_inset')
 384         document.body[i:i+1] = lines
 385         i = i + 1
 386
 387
 388 commandparams_info = {
 389     # command : [option1, option2, argument]
 390     "bibitem" : ["label", "", "key"],
 391     "bibtex" : ["options", "btprint", "bibfiles"],
 392     "cite"        : ["after", "before", "key"],
 393     "citet"       : ["after", "before", "key"],
 394     "citep"       : ["after", "before", "key"],
 395     "citealt"     : ["after", "before", "key"],
 396     "citealp"     : ["after", "before", "key"],
 397     "citeauthor"  : ["after", "before", "key"],
 398     "citeyear"    : ["after", "before", "key"],
 399     "citeyearpar" : ["after", "before", "key"],
 400     "citet*"      : ["after", "before", "key"],
 401     "citep*"      : ["after", "before", "key"],
 402     "citealt*"    : ["after", "before", "key"],
 403     "citealp*"    : ["after", "before", "key"],
 404     "citeauthor*" : ["after", "before", "key"],
 405     "Citet"       : ["after", "before", "key"],
 406     "Citep"       : ["after", "before", "key"],
 407     "Citealt"     : ["after", "before", "key"],
 408     "Citealp"     : ["after", "before", "key"],
 409     "Citeauthor"  : ["after", "before", "key"],
 410     "Citet*"      : ["after", "before", "key"],
 411     "Citep*"      : ["after", "before", "key"],
 412     "Citealt*"    : ["after", "before", "key"],
 413     "Citealp*"    : ["after", "before", "key"],
 414     "Citeauthor*" : ["after", "before", "key"],
 415     "citefield"   : ["after", "before", "key"],
 416     "citetitle"   : ["after", "before", "key"],
 417     "cite*"       : ["after", "before", "key"],
 418     "hfill" : ["", "", ""],
 419     "index"      : ["", "", "name"],
 420     "printindex" : ["", "", "name"],
 421     "label" : ["", "", "name"],
 422     "eqref"     : ["name", "", "reference"],
 423     "pageref"   : ["name", "", "reference"],
 424     "prettyref" : ["name", "", "reference"],
 425     "ref"       : ["name", "", "reference"],
 426     "vpageref"  : ["name", "", "reference"],
 427     "vref"      : ["name", "", "reference"],
 428     "tableofcontents" : ["", "", "type"],
 429     "htmlurl" : ["name", "", "target"],
 430     "url"     : ["name", "", "target"]}
 431
 432
 433 def convert_commandparams(document):
 434     """ Convert
 435
 436  \begin_inset LatexCommand \cmdname[opt1][opt2]{arg}
 437  \end_inset
 438
 439  to
 440
 441  \begin_inset LatexCommand cmdname
 442  name1 "opt1"
 443  name2 "opt2"
 444  name3 "arg"
 445  \end_inset
 446
 447  name1, name2 and name3 can be different for each command.
 448 """
 449     # \begin_inset LatexCommand bibitem was not the official version (see
 450     # convert_bibitem()), but could be read in, so we convert it here, too.
 451
 452     i = 0
 453     while 1:
 454         i = find_token(document.body, "\\begin_inset LatexCommand", i)
 455         if i == -1:
 456             break
 457         command = document.body[i][26:].strip()
 458         if command == "":
 459             document.warning("Malformed LyX document: Missing LatexCommand name.")
 460             i = i + 1
 461             continue
 462
 463         j = find_token(document.body, "\\end_inset", i + 1)
 464         if j == -1:
 465             document.warning("Malformed document")
 466         else:
 467             command += "".join(document.body[i+1:j])
 468             document.body[i+1:j] = []
 469
 470         # The following parser is taken from the original InsetCommandParams::scanCommand
 471         name = ""
 472         option1 = ""
 473         option2 = ""
 474         argument = ""
 475         state = "WS"
 476         # Used to handle things like \command[foo[bar]]{foo{bar}}
 477         nestdepth = 0
 478         b = 0
 479         for c in command:
 480             if ((state == "CMDNAME" and c == ' ') or
 481                 (state == "CMDNAME" and c == '[') or
 482                 (state == "CMDNAME" and c == '{')):
 483                 state = "WS"
 484             if ((state == "OPTION" and c == ']') or
 485                 (state == "SECOPTION" and c == ']') or
 486                 (state == "CONTENT" and c == '}')):
 487                 if nestdepth == 0:
 488                     state = "WS"
 489                 else:
 490                     nestdepth = nestdepth - 1
 491             if ((state == "OPTION" and c == '[') or
 492                 (state == "SECOPTION" and c == '[') or
 493                 (state == "CONTENT" and c == '{')):
 494                 nestdepth = nestdepth + 1
 495             if state == "CMDNAME":
 496                     name += c
 497             elif state == "OPTION":
 498                     option1 += c
 499             elif state == "SECOPTION":
 500                     option2 += c
 501             elif state == "CONTENT":
 502                     argument += c
 503             elif state == "WS":
 504                 if c == '\\':
 505                     state = "CMDNAME"
 506                 elif c == '[' and b != ']':
 507                     state = "OPTION"
 508                     nestdepth = 0 # Just to be sure
 509                 elif c == '[' and b == ']':
 510                     state = "SECOPTION"
 511                     nestdepth = 0 # Just to be sure
 512                 elif c == '{':
 513                     state = "CONTENT"
 514                     nestdepth = 0 # Just to be sure
 515             b = c
 516
 517         # Now we have parsed the command, output the parameters
 518         lines = ["\\begin_inset LatexCommand %s" % name]
 519         if option1 != "":
 520             if commandparams_info[name][0] == "":
 521                 document.warning("Ignoring invalid option `%s' of command `%s'." % (option1, name))
 522             else:
 523                 lines.append('%s "%s"' % (commandparams_info[name][0], option1.replace('"', '\\"')))
 524         if option2 != "":
 525             if commandparams_info[name][1] == "":
 526                 document.warning("Ignoring invalid second option `%s' of command `%s'." % (option2, name))
 527             else:
 528                 lines.append('%s "%s"' % (commandparams_info[name][1], option2.replace('"', '\\"')))
 529         if argument != "":
 530             if commandparams_info[name][2] == "":
 531                 document.warning("Ignoring invalid argument `%s' of command `%s'." % (argument, name))
 532             else:
 533                 lines.append('%s "%s"' % (commandparams_info[name][2], argument.replace('"', '\\"')))
 534         document.body[i:i+1] = lines
 535         i = i + 1
 536
 537
 538 def revert_commandparams(document):
 539     regex = re.compile(r'(\S+)\s+(.+)')
 540     i = 0
 541     while 1:
 542         i = find_token(document.body, "\\begin_inset LatexCommand", i)
 543         if i == -1:
 544             break
 545         name = document.body[i].split()[2]
 546         j = find_end_of_inset(document.body, i + 1)
 547         preview_line = ""
 548         option1 = ""
 549         option2 = ""
 550         argument = ""
 551         for k in range(i + 1, j):
 552             match = re.match(regex, document.body[k])
 553             if match:
 554                 pname = match.group(1)
 555                 pvalue = match.group(2)
 556                 if pname == "preview":
 557                     preview_line = document.body[k]
 558                 elif (commandparams_info[name][0] != "" and
 559                       pname == commandparams_info[name][0]):
 560                     option1 = pvalue.strip('"').replace('\\"', '"')
 561                 elif (commandparams_info[name][1] != "" and
 562                       pname == commandparams_info[name][1]):
 563                     option2 = pvalue.strip('"').replace('\\"', '"')
 564                 elif (commandparams_info[name][2] != "" and
 565                       pname == commandparams_info[name][2]):
 566                     argument = pvalue.strip('"').replace('\\"', '"')
 567             elif document.body[k].strip() != "":
 568                 document.warning("Ignoring unknown contents `%s' in command inset %s." % (document.body[k], name))
 569         if name == "bibitem":
 570             if option1 == "":
 571                 lines = ["\\bibitem {%s}" % argument]
 572             else:
 573                 lines = ["\\bibitem [%s]{%s}" % (option1, argument)]
 574         else:
 575             if option1 == "":
 576                 if option2 == "":
 577                     lines = ["\\begin_inset LatexCommand \\%s{%s}" % (name, argument)]
 578                 else:
 579                     lines = ["\\begin_inset LatexCommand \\%s[][%s]{%s}" % (name, option2, argument)]
 580             else:
 581                 if option2 == "":
 582                     lines = ["\\begin_inset LatexCommand \\%s[%s]{%s}" % (name, option1, argument)]
 583                 else:
 584                     lines = ["\\begin_inset LatexCommand \\%s[%s][%s]{%s}" % (name, option1, option2, argument)]
 585         if name != "bibitem":
 586             if preview_line != "":
 587                 lines.append(preview_line)
 588             lines.append('')
 589             lines.append('\\end_inset')
 590         document.body[i:j+1] = lines
 591         i = j + 1
 592
 593
 594 def revert_nomenclature(document):
 595     " Convert nomenclature entry to ERT. "
 596     regex = re.compile(r'(\S+)\s+(.+)')
 597     i = 0
 598     use_nomencl = 0
 599     while 1:
 600         i = find_token(document.body, "\\begin_inset LatexCommand nomenclature", i)
 601         if i == -1:
 602             break
 603         use_nomencl = 1
 604         j = find_end_of_inset(document.body, i + 1)
 605         preview_line = ""
 606         symbol = ""
 607         description = ""
 608         prefix = ""
 609         for k in range(i + 1, j):
 610             match = re.match(regex, document.body[k])
 611             if match:
 612                 name = match.group(1)
 613                 value = match.group(2)
 614                 if name == "preview":
 615                     preview_line = document.body[k]
 616                 elif name == "symbol":
 617                     symbol = value.strip('"').replace('\\"', '"')
 618                 elif name == "description":
 619                     description = value.strip('"').replace('\\"', '"')
 620                 elif name == "prefix":
 621                     prefix = value.strip('"').replace('\\"', '"')
 622             elif document.body[k].strip() != "":
 623                 document.warning("Ignoring unknown contents `%s' in nomenclature inset." % document.body[k])
 624         if prefix == "":
 625             command = 'nomenclature{%s}{%s}' % (symbol, description)
 626         else:
 627             command = 'nomenclature[%s]{%s}{%s}' % (prefix, symbol, description)
 628         document.body[i:j+1] = ['\\begin_inset ERT',
 629                                 'status collapsed',
 630                                 '',
 631                                 '\\begin_layout %s' % document.default_layout,
 632                                 '',
 633                                 '',
 634                                 '\\backslash',
 635                                 command,
 636                                 '\\end_layout',
 637                                 '',
 638                                 '\\end_inset']
 639         i = i + 11
 640     if use_nomencl and find_token(document.preamble, '\\usepackage{nomencl}[2005/09/22]', 0) == -1:
 641         document.preamble.append('\\usepackage{nomencl}[2005/09/22]')
 642         document.preamble.append('\\makenomenclature')
 643
 644
 645 def revert_printnomenclature(document):
 646     " Convert printnomenclature to ERT. "
 647     regex = re.compile(r'(\S+)\s+(.+)')
 648     i = 0
 649     use_nomencl = 0
 650     while 1:
 651         i = find_token(document.body, "\\begin_inset LatexCommand printnomenclature", i)
 652         if i == -1:
 653             break
 654         use_nomencl = 1
 655         j = find_end_of_inset(document.body, i + 1)
 656         preview_line = ""
 657         labelwidth = ""
 658         for k in range(i + 1, j):
 659             match = re.match(regex, document.body[k])
 660             if match:
 661                 name = match.group(1)
 662                 value = match.group(2)
 663                 if name == "preview":
 664                     preview_line = document.body[k]
 665                 elif name == "labelwidth":
 666                     labelwidth = value.strip('"').replace('\\"', '"')
 667             elif document.body[k].strip() != "":
 668                 document.warning("Ignoring unknown contents `%s' in printnomenclature inset." % document.body[k])
 669         if labelwidth == "":
 670             command = 'nomenclature{}'
 671         else:
 672             command = 'nomenclature[%s]' % labelwidth
 673         document.body[i:j+1] = ['\\begin_inset ERT',
 674                                 'status collapsed',
 675                                 '',
 676                                 '\\begin_layout %s' % document.default_layout,
 677                                 '',
 678                                 '',
 679                                 '\\backslash',
 680                                 command,
 681                                 '\\end_layout',
 682                                 '',
 683                                 '\\end_inset']
 684         i = i + 11
 685     if use_nomencl and find_token(document.preamble, '\\usepackage{nomencl}[2005/09/22]', 0) == -1:
 686         document.preamble.append('\\usepackage{nomencl}[2005/09/22]')
 687         document.preamble.append('\\makenomenclature')
 688
 689
 690 def convert_esint(document):
 691     " Add \\use_esint setting to header. "
 692     i = find_token(document.header, "\\cite_engine", 0)
 693     if i == -1:
 694         document.warning("Malformed LyX document: Missing `\\cite_engine'.")
 695         return
 696     # 0 is off, 1 is auto, 2 is on.
 697     document.header.insert(i, '\\use_esint 0')
 698
 699
 700 def revert_esint(document):
 701     " Remove \\use_esint setting from header. "
 702     i = find_token(document.header, "\\use_esint", 0)
 703     if i == -1:
 704         document.warning("Malformed LyX document: Missing `\\use_esint'.")
 705         return
 706     use_esint = document.header[i].split()[1]
 707     del document.header[i]
 708     # 0 is off, 1 is auto, 2 is on.
 709     if (use_esint == 2):
 710         document.preamble.append('\\usepackage{esint}')
 711
 712
 713 def revert_clearpage(document):
 714     " clearpage -> ERT "
 715     i = 0
 716     while 1:
 717         i = find_token(document.body, "\\clearpage", i)
 718         if i == -1:
 719             break
 720         document.body[i:i+1] =  ['\\begin_inset ERT',
 721                                 'status collapsed',
 722                                 '',
 723                                 '\\begin_layout %s' % document.default_layout,
 724                                 '',
 725                                 '',
 726                                 '\\backslash',
 727                                 'clearpage',
 728                                 '\\end_layout',
 729                                 '',
 730                                 '\\end_inset']
 731     i = i + 1
 732
 733
 734 def revert_cleardoublepage(document):
 735     " cleardoublepage -> ERT "
 736     i = 0
 737     while 1:
 738         i = find_token(document.body, "\\cleardoublepage", i)
 739         if i == -1:
 740             break
 741         document.body[i:i+1] =  ['\\begin_inset ERT',
 742                                 'status collapsed',
 743                                 '',
 744                                 '\\begin_layout %s' % document.default_layout,
 745                                 '',
 746                                 '',
 747                                 '\\backslash',
 748                                 'cleardoublepage',
 749                                 '\\end_layout',
 750                                 '',
 751                                 '\\end_inset']
 752     i = i + 1
 753
 754
 755 def convert_lyxline(document):
 756     " remove fontsize commands for \lyxline "
 757     # The problematic is: The old \lyxline definition doesn't handle the fontsize
 758     # to change the line thickness. The new definiton does this so that imported
 759     # \lyxlines would have a different line thickness. The eventual fontsize command
 760     # before \lyxline is therefore removed to get the same output.
 761     fontsizes = ["tiny", "scriptsize", "footnotesize", "small", "normalsize",
 762                  "large", "Large", "LARGE", "huge", "Huge"]
 763     for n in range(0, len(fontsizes)):
 764         i = 0
 765         k = 0
 766         while i < len(document.body):
 767             i = find_token(document.body, "\\size " + fontsizes[n], i)
 768             k = find_token(document.body, "\\lyxline", i)
 769             # the corresponding fontsize command is always 2 lines before the \lyxline
 770             if (i != -1 and k == i+2):
 771                 document.body[i:i+1] = []
 772             else:
 773                 break
 774         i = i + 1
 775
 776
 777 def revert_encodings(document):
 778     " Set new encodings to auto. "
 779     encodings = ["8859-6", "8859-8", "cp437", "cp437de", "cp850", "cp852",
 780                  "cp855", "cp858", "cp862", "cp865", "cp866", "cp1250",
 781                  "cp1252", "cp1256", "cp1257", "latin10", "pt254", "tis620-0"]
 782     i = find_token(document.header, "\\inputencoding", 0)
 783     if i == -1:
 784         document.header.append("\\inputencoding auto")
 785     else:
 786         inputenc = get_value(document.header, "\\inputencoding", i)
 787         if inputenc in encodings:
 788             document.header[i] = "\\inputencoding auto"
 789     document.inputencoding = get_value(document.header, "\\inputencoding", 0)
 790
 791
 792 def convert_caption(document):
 793     " Convert caption layouts to caption insets. "
 794     i = 0
 795     while 1:
 796         i = find_token(document.body, "\\begin_layout Caption", i)
 797         if i == -1:
 798             return
 799         j = find_end_of_layout(document.body, i)
 800         if j == -1:
 801             document.warning("Malformed LyX document: Missing `\\end_layout'.")
 802             return
 803
 804         document.body[j:j] = ["\\end_layout", "", "\\end_inset", "", ""]
 805         document.body[i:i+1] = ["\\begin_layout %s" % document.default_layout,
 806                             "\\begin_inset Caption", "",
 807                             "\\begin_layout %s" % document.default_layout]
 808         i = i + 1
 809
 810
def revert_caption(document):
    " Convert caption insets to caption layouts. "
    " This assumes that the text class has a caption style. "
    # Overview of one loop iteration:
    #   1. detach the inset from the paragraph it lives in (either drop the
    #      \begin_layout line in front of it or close that layout),
    #   2. locate the first \begin_layout after the inset start (j) and the
    #      end of the inset (k),
    #   3. either drop the \end_layout that follows the inset or reopen the
    #      surrounding layout after it,
    #   4. remove the inner \begin_layout and the \end_inset and turn the
    #      \begin_inset line into "\begin_layout Caption".
    # The function returns as soon as no caption inset is left or the
    # document turns out to be malformed.
    i = 0
    while 1:
        i = find_token(document.body, "\\begin_inset Caption", i)
        if i == -1:
            return

        # We either need to delete the previous \begin_layout line, or we
        # need to end the previous layout if this inset is not in the first
        # position of the paragraph.
        layout_before = find_token_backwards(document.body, "\\begin_layout", i)
        if layout_before == -1:
            document.warning("Malformed LyX document: Missing `\\begin_layout'.")
            return
        # Remembered so that the same layout can be reopened after the
        # caption if the paragraph continues behind the inset.
        layout_line = document.body[layout_before]
        # The layout line may only be deleted if nothing but empty lines
        # lies between it and the inset.
        del_layout_before = True
        l = layout_before + 1
        while l < i:
            if document.body[l] != "":
                del_layout_before = False
                break
            l = l + 1
        if del_layout_before:
            # The inset line moves up to the position of the deleted layout
            del document.body[layout_before:i]
            i = layout_before
        else:
            # Close the paragraph in front of the inset; the inset line
            # moves down by the two inserted lines.
            document.body[i:i] = ["\\end_layout", ""]
            i = i + 2

        # Find start of layout in the inset and end of inset
        j = find_token(document.body, "\\begin_layout", i)
        if j == -1:
            document.warning("Malformed LyX document: Missing `\\begin_layout'.")
            return
        k = find_end_of_inset(document.body, i)
        if k == -1:
            document.warning("Malformed LyX document: Missing `\\end_inset'.")
            return

        # We either need to delete the following \end_layout line, or we need
        # to restart the old layout if this inset is not at the paragraph end.
        layout_after = find_token(document.body, "\\end_layout", k)
        if layout_after == -1:
            document.warning("Malformed LyX document: Missing `\\end_layout'.")
            return
        # Same test as above, this time for the lines between the end of
        # the inset and the following \end_layout.
        del_layout_after = True
        l = k + 1
        while l < layout_after:
            if document.body[l] != "":
                del_layout_after = False
                break
            l = l + 1
        if del_layout_after:
            del document.body[k+1:layout_after+1]
        else:
            document.body[k+1:k+1] = [layout_line, ""]

        # delete \begin_layout and \end_inset and replace \begin_inset with
        # "\begin_layout Caption". This works because we can only have one
        # paragraph in the caption inset: The old \end_layout will be recycled.
        # Line k is removed before line j; presumably j < k in a well-formed
        # document, so index j is not shifted by the first deletion.
        del document.body[k]
        if document.body[k] == "":
            del document.body[k]
        del document.body[j]
        if document.body[j] == "":
            del document.body[j]
        document.body[i] = "\\begin_layout Caption"
        if document.body[i+1] == "":
            del document.body[i+1]
        i = i + 1
 883
 884
 885 # Accents of InsetLaTeXAccent
 886 accent_map = {
 887     "`" : u'\u0300', # grave
 888     "'" : u'\u0301', # acute
 889     "^" : u'\u0302', # circumflex
 890     "~" : u'\u0303', # tilde
 891     "=" : u'\u0304', # macron
 892     "u" : u'\u0306', # breve
 893     "." : u'\u0307', # dot above
 894     "\"": u'\u0308', # diaeresis
 895     "r" : u'\u030a', # ring above
 896     "H" : u'\u030b', # double acute
 897     "v" : u'\u030c', # caron
 898     "b" : u'\u0320', # minus sign below
 899     "d" : u'\u0323', # dot below
 900     "c" : u'\u0327', # cedilla
 901     "k" : u'\u0328', # ogonek
 902     "t" : u'\u0361'  # tie. This is special: It spans two characters, but
 903                      # only one is given as argument, so we don't need to
 904                      # treat it differently.
 905 }
 906
 907
 908 # special accents of InsetLaTeXAccent without argument
 909 special_accent_map = {
 910     'i' : u'\u0131', # dotless i
 911     'j' : u'\u0237', # dotless j
 912     'l' : u'\u0142', # l with stroke
 913     'L' : u'\u0141'  # L with stroke
 914 }
 915
 916
 917 # special accent arguments of InsetLaTeXAccent
 918 accented_map = {
 919     '\\i' : u'\u0131', # dotless i
 920     '\\j' : u'\u0237'  # dotless j
 921 }
 922
 923
 924 def _convert_accent(accent, accented_char):
 925     type = accent
 926     char = accented_char
 927     if char == '':
 928         if type in special_accent_map:
 929             return special_accent_map[type]
 930         # a missing char is treated as space by LyX
 931         char = ' '
 932     elif type == 'q' and char in ['t', 'd', 'l', 'L']:
 933         # Special caron, only used with t, d, l and L.
 934         # It is not in the map because we convert it to the same unicode
 935         # character as the normal caron: \q{} is only defined if babel with
 936         # the czech or slovak language is used, and the normal caron
 937         # produces the correct output if the T1 font encoding is used.
 938         # For the same reason we never convert to \q{} in the other direction.
 939         type = 'v'
 940     elif char in accented_map:
 941         char = accented_map[char]
 942     elif (len(char) > 1):
 943         # We can only convert accents on a single char
 944         return ''
 945     a = accent_map.get(type)
 946     if a:
 947         return unicodedata.normalize("NFC", "%s%s" % (char, a))
 948     return ''
 949
 950
 951 def convert_ertbackslash(body, i, ert, default_layout):
 952     r""" -------------------------------------------------------------------------------------------
 953     Convert backslashes and '\n' into valid ERT code, append the converted
 954     text to body[i] and return the (maybe incremented) line index i"""
 955
 956     for c in ert:
 957         if c == '\\':
 958             body[i] = body[i] + '\\backslash '
 959             i = i + 1
 960             body.insert(i, '')
 961         elif c == '\n':
 962             body[i+1:i+1] = ['\\end_layout', '', '\\begin_layout %s' % default_layout, '']
 963             i = i + 4
 964         else:
 965             body[i] = body[i] + c
 966     return i
 967
 968
 969 def convert_accent(document):
 970     # The following forms are supported by LyX:
 971     # '\i \"{a}' (standard form, as written by LyX)
 972     # '\i \"{}' (standard form, as written by LyX if the accented char is a space)
 973     # '\i \"{ }' (also accepted if the accented char is a space)
 974     # '\i \" a'  (also accepted)
 975     # '\i \"'    (also accepted)
 976     re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$')
 977     re_contents = re.compile(r'^([^\s{]+)(.*)$')
 978     re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$')
 979     i = 0
 980     while 1:
 981         i = find_re(document.body, re_wholeinset, i)
 982         if i == -1:
 983             return
 984         match = re_wholeinset.match(document.body[i])
 985         prefix = match.group(1)
 986         contents = match.group(3).strip()
 987         match = re_contents.match(contents)
 988         if match:
 989             # Strip first char (always \)
 990             accent = match.group(1)[1:]
 991             accented_contents = match.group(2).strip()
 992             match = re_accentedcontents.match(accented_contents)
 993             accented_char = match.group(1)
 994             converted = _convert_accent(accent, accented_char)
 995             if converted == '':
 996                 # Normalize contents
 997                 contents = '%s{%s}' % (accent, accented_char),
 998             else:
 999                 document.body[i] = '%s%s' % (prefix, converted)
1000                 i += 1
1001                 continue
1002         document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents)
1003         document.body[i] = prefix
1004         document.body[i+1:i+1] = ['\\begin_inset ERT',
1005                                   'status collapsed',
1006                                   '',
1007                                   '\\begin_layout %s' % document.default_layout,
1008                                   '',
1009                                   '',
1010                                   '']
1011         i = convert_ertbackslash(document.body, i + 7,
1012                                  '\\%s' % contents,
1013                                  document.default_layout)
1014         document.body[i+1:i+1] = ['\\end_layout',
1015                                   '',
1016                                   '\\end_inset']
1017         i += 3
1018
1019
def revert_accent(document):
    """Convert accented characters to InsetLaTeXAccent constructs.

    Only characters that can't be represented in the encoding in effect
    are converted. The body is normalized to NFD first, so that every
    accent shows up as a separate combining character; a line is split
    behind each converted character, and at the end the whole body is
    normalized to NFC again.
    """
    inverse_accent_map = dict((accent_map[k], k) for k in accent_map)
    inverse_special_accent_map = dict((special_accent_map[k], k) for k in special_accent_map)
    inverse_accented_map = dict((accented_map[k], k) for k in accented_map)

    # Since LyX may insert a line break within a word we must combine all
    # words before unicode normalization.
    # We do this only if the next line starts with an accent, otherwise we
    # would create things like '\begin_inset ERTstatus'.
    for i in range(len(document.body) - 1):
        if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
            continue
        if document.body[i+1][0] in inverse_accent_map:
            # the last character of this line and the first of the next line
            # form probably a surrogate pair.
            while len(document.body[i+1]) > 0 and document.body[i+1][0] != ' ':
                document.body[i] += document.body[i+1][0]
                document.body[i+1] = document.body[i+1][1:]

    # Normalize to "Normal form D" (NFD, also known as canonical decomposition).
    # This is needed to catch all accented characters.
    for i in range(len(document.body)):
        # Unfortunately we have a mixture of unicode strings and plain strings,
        # because we never use u'xxx' for string literals, but 'xxx'.
        # Therefore we may have to try two times to normalize the data.
        try:
            document.body[i] = unicodedata.normalize("NFD", document.body[i])
        except TypeError:
            document.body[i] = unicodedata.normalize("NFD", unicode(document.body[i], 'utf-8'))

    # Replace accented characters with InsetLaTeXAccent
    # Do not convert characters that can be represented in the chosen
    # encoding.
    encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
    lang_re = re.compile(r"^\\lang\s(\S+)")
    i = 0
    while i < len(document.body):

        if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '':
            # Track the encoding of the current line.
            # Every branch has to advance i before continuing, otherwise
            # the same line would be examined forever.
            result = lang_re.match(document.body[i])
            if result:
                language = result.group(1)
                if language == "default":
                    encoding_stack[-1] = document.encoding
                else:
                    from lyx2lyx_lang import lang
                    encoding_stack[-1] = lang[language][3]
                i = i + 1
                continue
            elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
                encoding_stack.append(encoding_stack[-1])
                i = i + 1
                continue
            elif find_token(document.body, "\\end_layout", i, i + 1) == i:
                del encoding_stack[-1]
                i = i + 1
                continue

        for j in range(len(document.body[i])):
            # dotless i and dotless j are both in special_accent_map and can
            # occur as an accented character, so we need to test that the
            # following character is no accent
            if (document.body[i][j] in inverse_special_accent_map and
                (j == len(document.body[i]) - 1 or document.body[i][j+1] not in inverse_accent_map)):
                accent = document.body[i][j]
                try:
                    accent.encode(encoding_stack[-1])
                except UnicodeEncodeError:
                    # Insert the rest of the line as new line
                    if j < len(document.body[i]) - 1:
                        document.body.insert(i+1, document.body[i][j+1:])
                    # Delete the special character (position j) and
                    # everything behind it, but keep the text in front of it
                    document.body[i] = document.body[i][:j]
                    # Finally add the InsetLaTeXAccent
                    document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
                    # The rest of the line is handled as line i + 1
                    break
            elif j > 0 and document.body[i][j] in inverse_accent_map:
                accented_char = document.body[i][j-1]
                if accented_char == ' ':
                    # Conform to LyX output
                    accented_char = ''
                elif accented_char in inverse_accented_map:
                    accented_char = inverse_accented_map[accented_char]
                accent = document.body[i][j]
                try:
                    unicodedata.normalize("NFC", accented_char + accent).encode(encoding_stack[-1])
                except UnicodeEncodeError:
                    # Insert the rest of the line as new line
                    if j < len(document.body[i]) - 1:
                        document.body.insert(i+1, document.body[i][j+1:])
                    # Delete the accented character (position j - 1), the
                    # accent (position j) and everything behind them, but
                    # keep the text in front of them
                    document.body[i] = document.body[i][:j-1]
                    # Finally add the InsetLaTeXAccent
                    document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
                    # The rest of the line is handled as line i + 1
                    break
        i = i + 1

    # Normalize to "Normal form C" (NFC, pre-composed characters) again.
    # The body may have grown above, so its current length has to be used.
    for i in range(len(document.body)):
        document.body[i] = unicodedata.normalize("NFC", document.body[i])
1131
1132
def normalize_font_whitespace_259(document):
    """ Before format 259 the font changes were ignored if a
    whitespace was the first or last character in the sequence, this function
    transfers the whitespace outside.
    The properties handled here are series, emph, color, shape, bar and
    family."""

    # font property -> value that switches the property off
    defaults = dict([("\\series", "default"),
                     ("\\emph", "default"),
                     ("\\color", "none"),
                     ("\\shape", "default"),
                     ("\\bar", "default"),
                     ("\\family", "default")])
    return normalize_font_whitespace(document, defaults)
1145
def normalize_font_whitespace_274(document):
    """ Before format 259 (sic) the font changes were ignored if a
    whitespace was the first or last character in the sequence. This was
    corrected for most font properties in format 259, but the language
    was forgotten then. This function applies the same conversion done
    there (namely, transfers the whitespace outside) for font language
    changes, as well."""

    # Only the language is handled here
    language_only = dict([("\\lang", "default")])
    return normalize_font_whitespace(document, language_only)
1156
def get_paragraph_language(document, i):
    """ Return the language of the paragraph in which line i of the document
    body is. If the first thing in the paragraph is a \\lang command, that
    is the paragraph's language; otherwise, the paragraph's language is the
    document's language."""

    body = document.body
    layout_start = find_beginning_of_layout(body, i)
    # The first non-empty line after the \begin_layout line decides
    first_content = find_nonempty_line(body, layout_start + 1)
    tokens = body[first_content].split()

    if len(tokens) <= 1 or tokens[0] != "\\lang":
        return document.language
    return tokens[1]
1174
def normalize_font_whitespace(document, char_properties):
    """ Before format 259 the font changes were ignored if a
    whitespace was the first or last character in the sequence, this function
    transfers the whitespace outside. Only a change in one of the properties
    in the provided char_properties is handled by this function.

    char_properties maps a font property token (e.g. "\\series") to the
    value that resets it. If it contains "\\lang", that entry is
    overwritten with the paragraph language at every \\begin_layout, i.e.
    the dictionary of the caller is modified.
    Nothing is done unless the backend of the document is latex. The
    document body is changed in place, nothing is returned."""

    if document.backend != "latex":
        return

    lines = document.body

    # Properties that currently differ from their reset value:
    # property token -> value in effect
    changes = {}

    i = 0
    while i < len(lines):
        words = lines[i].split()

        if len(words) > 0 and words[0] == "\\begin_layout":
            # a new paragraph resets all font changes
            changes.clear()
            # also reset the default language to be the paragraph's language
            if "\\lang" in char_properties.keys():
                char_properties["\\lang"] = \
                    get_paragraph_language(document, i + 1)

        elif len(words) > 1 and words[0] in char_properties.keys():
            # we have a font change
            if char_properties[words[0]] == words[1]:
                # property gets reset
                if words[0] in changes.keys():
                    del changes[words[0]]
                defaultproperty = True
            else:
                # property gets set
                changes[words[0]] = words[1]
                defaultproperty = False

            # We need to explicitly reset all changed properties if we find
            # a space below, because LyX 1.4 would output the space after
            # closing the previous change and before starting the new one,
            # and closing a font change means to close all properties, not
            # just the changed one.

            if lines[i-1] and lines[i-1][-1] == " ":
                # The space is cut off the previous line and gets a line
                # of its own in added_lines.
                lines[i-1] = lines[i-1][:-1]
                # a space before the font change
                added_lines = [" "]
                # added_lines is built as
                #   [resets of the other changed properties, " ",
                #    the other changed properties set again]
                for k in changes.keys():
                    # exclude property k because that is already in lines[i]
                    if k != words[0]:
                        added_lines[1:1] = ["%s %s" % (k, changes[k])]
                for k in changes.keys():
                    # exclude property k because that must be added below anyway
                    if k != words[0]:
                        added_lines[0:0] = ["%s %s" % (k, char_properties[k])]
                if defaultproperty:
                    # Property is reset in lines[i], so add the new stuff afterwards
                    lines[i+1:i+1] = added_lines
                else:
                    # Reset property for the space
                    added_lines[0:0] = ["%s %s" % (words[0], char_properties[words[0]])]
                    lines[i:i] = added_lines
                # skip the lines that have just been inserted
                i = i + len(added_lines)

            elif lines[i+1] and lines[i+1][0] == " " and (len(changes) > 0 or not defaultproperty):
                # a space after the font change
                if (lines[i+1] == " " and lines[i+2]):
                    next_words = lines[i+2].split()
                    if len(next_words) > 0 and next_words[0] == words[0]:
                        # a single blank with a property different from the
                        # previous and the next line must not be changed
                        i = i + 2
                        continue
                # The space is cut off the following line and put in front
                # of the font change, with the same resets and settings
                # around it as in the branch above.
                lines[i+1] = lines[i+1][1:]
                added_lines = [" "]
                for k in changes.keys():
                    # exclude property k because that is already in lines[i]
                    if k != words[0]:
                        added_lines[1:1] = ["%s %s" % (k, changes[k])]
                for k in changes.keys():
                    # exclude property k because that must be added below anyway
                    if k != words[0]:
                        added_lines[0:0] = ["%s %s" % (k, char_properties[k])]
                # Reset property for the space
                added_lines[0:0] = ["%s %s" % (words[0], char_properties[words[0]])]
                lines[i:i] = added_lines
                # the font change line itself is now at i + len(added_lines)
                i = i + len(added_lines)

        i = i + 1
1264
1265
def revert_utf8x(document):
    " Set utf8x encoding to utf8. "
    pos = find_token(document.header, "\\inputencoding", 0)
    if pos != -1:
        if get_value(document.header, "\\inputencoding", pos) == "utf8x":
            document.header[pos] = "\\inputencoding utf8"
    else:
        document.header.append("\\inputencoding auto")
    # Keep the cached value in sync with the header
    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
1276
1277
def revert_utf8plain(document):
    " Set utf8plain encoding to utf8. "
    pos = find_token(document.header, "\\inputencoding", 0)
    if pos != -1:
        if get_value(document.header, "\\inputencoding", pos) == "utf8-plain":
            document.header[pos] = "\\inputencoding utf8"
    else:
        document.header.append("\\inputencoding auto")
    # Keep the cached value in sync with the header
    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
1288
1289
def revert_beamer_alert(document):
    " Revert beamer's \\alert inset back to ERT. "
    pos = find_token(document.body, "\\begin_inset CharStyle Alert", 0)
    while pos != -1:
        document.body[pos] = "\\begin_inset ERT"
        pos = pos + 1
        # Look for the layout line inside the inset
        while document.body[pos][:13] != "\\begin_layout":
            pos = pos + 1
        # Insert the \alert command around the line after the layout line
        document.body[pos + 1] = "\\alert{" + document.body[pos + 1] + '}'
        pos = find_token(document.body, "\\begin_inset CharStyle Alert", pos + 1)
1307
1308
def revert_beamer_structure(document):
    " Revert beamer's \\structure inset back to ERT. "
    pos = find_token(document.body, "\\begin_inset CharStyle Structure", 0)
    while pos != -1:
        document.body[pos] = "\\begin_inset ERT"
        pos = pos + 1
        # Look for the layout line inside the inset
        while document.body[pos][:13] != "\\begin_layout":
            pos = pos + 1
        # Insert the \structure command around the line after the layout line
        document.body[pos + 1] = "\\structure{" + document.body[pos + 1] + '}'
        pos = find_token(document.body, "\\begin_inset CharStyle Structure", pos + 1)
1325
1326
def convert_changes(document):
    " Switch output_changes off if tracking_changes is off. "
    header = document.header
    tracking_pos = find_token(header, '\\tracking_changes', 0)
    if tracking_pos == -1:
        document.warning("Malformed lyx document: Missing '\\tracking_changes'.")
        return
    output_pos = find_token(header, '\\output_changes', 0)
    if output_pos == -1:
        document.warning("Malformed lyx document: Missing '\\output_changes'.")
        return
    tracking = get_value(header, "\\tracking_changes", tracking_pos)
    output = get_value(header, "\\output_changes", output_pos)
    # Changes can't be output if they are not tracked
    if (tracking, output) == ("false", "true"):
        header[output_pos] = "\\output_changes false"
1341
1342
def revert_ascii(document):
    " Set ascii encoding to auto. "
    pos = find_token(document.header, "\\inputencoding", 0)
    if pos != -1:
        if get_value(document.header, "\\inputencoding", pos) == "ascii":
            document.header[pos] = "\\inputencoding auto"
    else:
        document.header.append("\\inputencoding auto")
    # Keep the cached value in sync with the header
    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
1353
1354
def normalize_language_name(document):
    " Replace the language names brazil and portuges by the new ones. "
    new_names = {"brazil": "brazilian",
                 "portuges": "portuguese"}

    if document.language not in new_names:
        return
    document.language = new_names[document.language]
    # Write the new name to the header as well
    pos = find_token(document.header, "\\language", 0)
    document.header[pos] = "\\language %s" % document.language
1363
1364
def revert_language_name(document):
    " Replace the language names brazilian and portuguese by the old ones. "
    old_names = {"brazilian": "brazil",
                 "portuguese": "portuges"}

    if document.language not in old_names:
        return
    document.language = old_names[document.language]
    # Write the old name to the header as well
    pos = find_token(document.header, "\\language", 0)
    document.header[pos] = "\\language %s" % document.language
1373
1374 #
1375 #  \textclass cv -> \textclass simplecv
def convert_cv_textclass(document):
    " Rename the text class cv to simplecv. "
    if document.textclass != "cv":
        return
    document.textclass = "simplecv"
1379
1380
def revert_cv_textclass(document):
    " Rename the text class simplecv back to cv. "
    if document.textclass != "simplecv":
        return
    document.textclass = "cv"
1384
1385
1386 #
1387 # add scaleBeforeRotation graphics param
def convert_graphics_rotation(document):
    """add scaleBeforeRotation graphics parameter.

    The parameter is added to every graphics inset that has a rotateAngle
    and at least one of width, height or scale.
    """
    i = 0
    while True:
        i = find_token(document.body, "\\begin_inset Graphics", i)
        if i == -1:
            return
        j = find_end_of_inset(document.body, i+1)
        if j == -1:
            # should not happen
            document.warning("Malformed LyX document: Could not find end of graphics inset.")
            # The extent of this inset is unknown, so leave it alone
            i = i + 1
            continue
        # Search for rotateAngle and width or height or scale
        # If these params are not there, nothing needs to be done.
        k = find_token(document.body, "\trotateAngle", i + 1, j)
        l = find_tokens(document.body, ["\twidth", "\theight", "\tscale"], i + 1, j)
        if k != -1 and l != -1:
            # Written with a leading tab like the other graphics params;
            # revert_graphics_rotation searches for the tabbed form.
            document.body.insert(j, '\tscaleBeforeRotation')
        i = i + 1
1406
1407
1408 #
1409 # remove scaleBeforeRotation graphics param
def revert_graphics_rotation(document):
    """ remove scaleBeforeRotation graphics parameter.

    For every Graphics inset:
      - a scaleBeforeRotation line is simply removed;
      - otherwise, if the inset has rotateAngle together with width,
        height or scale, the angle is moved into the special parameter
        (as angle=...) and the rotateAngle line is removed.
    Insets whose end can not be found are reported through
    document.warning and skipped. """
    i = 0
    while True:
        i = find_token(document.body, "\\begin_inset Graphics", i)
        if i == -1:
            return
        j = find_end_of_inset(document.body, i + 1)
        if j == -1:
            # should not happen
            document.warning("Malformed LyX document: Could not find end of graphics inset.")
            i = i + 1
            continue
        # If there's a scaleBeforeRotation param, just remove that
        k = find_token(document.body, "\tscaleBeforeRotation", i + 1, j)
        if k != -1:
            del document.body[k]
            i = i + 1
            continue
        # if not, and if we have rotateAngle and width or height or scale,
        # we have to put the rotateAngle value to special
        rotateAngle = get_value(document.body, 'rotateAngle', i + 1, j)
        special = get_value(document.body, 'special', i + 1, j)
        if rotateAngle == "" or \
           find_tokens(document.body, ["\twidth", "\theight", "\tscale"], i + 1, j) == -1:
            # Nothing to do for this inset. Go on with the next one instead
            # of leaving the loop, otherwise all following insets would
            # stay unconverted.
            i = i + 1
            continue
        if special == "":
            document.body.insert(j-1, '\tspecial angle=%s' % rotateAngle)
            # the inset grew by one line, keep j on its \end_inset
            j = j + 1
        else:
            l = find_token(document.body, "\tspecial", i + 1, j)
            # l == -1 would address the last line of the document
            if l != -1:
                document.body[l] = document.body[l].replace(special, 'angle=%s,%s' % (rotateAngle, special))
        k = find_token(document.body, "\trotateAngle", i + 1, j)
        if k != -1:
            del document.body[k]
        i = i + 1
1443
1444
1445
def convert_tableborder(document):
    """ Remove the "|" in front of ">{" from table cell arguments.

    Only lines that contain both leftline="true" and "|>{" are changed,
    and only the first "|>{" of such a line. """
    # The problem is: LyX doubles the table cell border as it ignores the "|" character in
    # the cell arguments. A fix takes care of this and therefore the "|" has to be removed
    for i, line in enumerate(document.body):
        # the two tokens have to be in one line
        if line.find("leftline=\"true\"") == -1:
            continue
        k = line.find("|>{")
        if k == -1:
            continue
        # delete the "|" and keep the complete rest of the line (the last
        # character must not get lost)
        document.body[i] = line[:k] + line[k+1:]
1458
1459
def revert_tableborder(document):
    """ Put a "|" in front of ">{" in table cell arguments.

    Only lines that contain both leftline="true" and ">{" are changed,
    and only the first ">{" of such a line gets the "|". """
    for pos, text in enumerate(document.body):
        # the two tokens have to be in one line
        if "leftline=\"true\"" not in text:
            continue
        brace = text.find(">{")
        if brace == -1:
            continue
        # add the "|"
        document.body[pos] = text[:brace] + '|' + text[brace:]
1470
1471
def revert_armenian(document):
    """ Revert the armenian support.

    The input encoding armscii8 is replaced by auto in the header. For
    documents in armenian the line \\usepackage{armtex} is added to the
    preamble and the document language is set to english. """
    # set inputencoding from armscii8 to auto
    if document.inputencoding == "armscii8":
        enc = find_token(document.header, "\\inputencoding", 0)
        if enc != -1:
            document.header[enc] = "\\inputencoding auto"

    # everything below only concerns armenian documents
    if document.language != "armenian":
        return

    # the preamble counts as existing as soon as one of its lines holds
    # a backslash or a percent sign
    preamble_exists = False
    for entry in document.preamble:
        if entry.find("\\") != -1 or entry.find("%") != -1:
            preamble_exists = True
            break

    # add the entry \usepackage{armtex} to the document preamble
    if preamble_exists:
        # set the armtex entry as the first preamble line
        document.preamble.insert(0, "\\usepackage{armtex}")
    else:
        # create the preamble when it doesn't exist
        document.preamble.append('\\usepackage{armtex}')

    # Set document language from armenian to english
    document.language = "english"
    lang = find_token(document.header, "\\language", 0)
    if lang != -1:
        document.header[lang] = "\\language english"
1502
1503
def revert_CJK(document):
    " Set CJK encodings to default and languages chinese, japanese and korean to english. "
    cjk_encodings = ("Bg5", "Bg5+", "GB", "GBt", "GBK", "JIS",
                     "KS", "SJIS", "UTF8", "EUC-TW", "EUC-JP")
    cjk_languages = ("chinese-simplified", "chinese-traditional",
                     "japanese", "korean")

    pos = find_token(document.header, "\\inputencoding", 0)
    if pos != -1:
        if get_value(document.header, "\\inputencoding", pos) in cjk_encodings:
            document.header[pos] = "\\inputencoding default"
    else:
        # no encoding given at all: add the line with the value auto
        document.header.append("\\inputencoding auto")
    # keep the document attribute in sync with the header
    document.inputencoding = get_value(document.header, "\\inputencoding", 0)

    if document.language not in cjk_languages:
        return
    document.language = "english"
    pos = find_token(document.header, "\\language", 0)
    if pos != -1:
        document.header[pos] = "\\language english"
1524
1525
def revert_preamble_listings_params(document):
    r""" Revert preamble option \listings_params

    The header line \listings_params "..." is removed and its value is
    written to the preamble as \lstset{...}; \usepackage{listings} is
    added to the preamble unless it is already there. Nothing happens if
    the header has no \listings_params line. """
    i = find_token(document.header, "\\listings_params", 0)
    if i == -1:
        return
    if '\\usepackage{listings}' not in document.preamble:
        document.preamble.append('\\usepackage{listings}')
    # The value is everything behind the token. Splitting only once keeps
    # parameters that contain spaces complete.
    fields = document.header[i].split(None, 1)
    if len(fields) > 1:
        params = fields[1].strip().strip('"')
        document.preamble.append('\\lstset{%s}' % params)
    document.header.pop(i)
1533
1534
def revert_listings_inset(document):
    r''' Revert listings inset to \lstinline or \begin, \end lstlisting, translate
FROM

\begin_inset
lstparams "language=Delphi"
inline true
status open

\begin_layout Standard
var i = 10;
\end_layout

\end_inset

TO

\begin_inset ERT
status open
\begin_layout Standard


\backslash
lstinline[language=Delphi]{var i = 10;}
\end_layout

\end_inset

There can be a caption inset in this inset

\begin_layout Standard
\begin_inset Caption

\begin_layout Standard
before label
\begin_inset LatexCommand label
name "lst:caption"

\end_inset

after label
\end_layout

\end_inset


\end_layout

The text of the caption and the name of the label are appended to the
listings parameters as caption={...} and label={...}.
'''
    i = 0
    while True:
        i = find_token(document.body, '\\begin_inset listings', i)
        if i == -1:
            break
        if not '\\usepackage{listings}' in document.preamble:
            document.preamble.append('\\usepackage{listings}')
        j = find_end_of_inset(document.body, i + 1)
        if j == -1:
            # this should not happen
            break
        inline = 'false'
        params = ''
        status = 'open'
        # k is the first line of the listing content. It is moved behind
        # the status line and behind the caption below; the initial value
        # only matters for insets without a status line.
        k = i + 1
        # first three lines, but never beyond the end of the inset
        for line in range(i + 1, min(i + 4, j)):
            if document.body[line].startswith('inline'):
                inline = document.body[line].split()[1]
            if document.body[line].startswith('lstparams'):
                # split only once, the parameters may contain spaces
                fields = document.body[line].split(None, 1)
                if len(fields) > 1:
                    params = fields[1].strip().strip('"')
            if document.body[line].startswith('status'):
                status = document.body[line].split()[1].strip()
                k = line + 1
        # caption?
        caption = ''
        label = ''
        # only a caption inside of this inset belongs to the listing
        cap = find_token(document.body, '\\begin_inset Caption', i, j)
        if cap != -1:
            cap_end = find_end_of_inset(document.body, cap + 1)
            if cap_end == -1:
                # this should not happen
                break
            # label? (only inside of the caption)
            lbl = find_token(document.body, '\\begin_inset LatexCommand label', cap + 1, cap_end)
            if lbl != -1:
                lbl_end = find_end_of_inset(document.body, lbl + 1)
                if lbl_end == -1:
                    # this should not happen
                    break
            else:
                lbl = cap_end
                lbl_end = cap_end
            for line in document.body[lbl : lbl_end + 1]:
                if line.startswith('name '):
                    label = line.split()[1].strip('"')
                    break
            # the caption text is everything around the label inset that
            # is not a command line
            for line in document.body[cap : lbl ] + document.body[lbl_end + 1 : cap_end + 1]:
                if not line.startswith('\\'):
                    caption += line.strip()
            k = cap_end + 1
        if len(caption) > 0:
            if len(params) == 0:
                params = 'caption={%s}' % caption
            else:
                params += ',caption={%s}' % caption
        if len(label) > 0:
            if len(params) == 0:
                params = 'label={%s}' % label
            else:
                params += ',label={%s}' % label
        if len(params) > 0:
            params = '[%s]' % params
            params = params.replace('\\', '\\backslash\n')
        if inline == 'true':
            # looking for the oneline code for lstinline: the last line of
            # the first default layout inside of this inset
            inlinecode = ''
            lay = find_token(document.body, '\\begin_layout %s' % document.default_layout, i + 1, j)
            if lay != -1:
                lay_end = find_end_of_layout(document.body, lay + 1)
                if lay_end != -1:
                    inlinecode = document.body[lay_end - 1]
            document.body[i:(j+1)] = [r'\begin_inset ERT',
                                      'status %s' % status,
                                      r'\begin_layout %s' % document.default_layout,
                                      '',
                                      '',
                                      r'\backslash',
                                      'lstinline%s{%s}' % (params, inlinecode),
                                      r'\end_layout',
                                      '',
                                      r'\end_inset']
        else:
            document.body[i: j+1] =  [r'\begin_inset ERT',
                                      'status %s' % status,
                                      '',
                                      r'\begin_layout %s' % document.default_layout,
                                      '',
                                      '',
                                      r'\backslash',
                                      r'begin{lstlisting}%s' % params,
                                      r'\end_layout'
                                    ] + document.body[k : j - 1] + \
                                     ['',
                                      r'\begin_layout %s' % document.default_layout,
                                      '',
                                      r'\backslash',
                                      'end{lstlisting}',
                                      r'\end_layout',
                                      '',
                                      r'\end_inset']
1682
1683
def revert_include_listings(document):
    r''' Replace Include insets that use \lstinputlisting by ERT, translate
\begin_inset Include \lstinputlisting{file}[opt]
preview false

\end_inset

TO

\begin_inset ERT
status open

\begin_layout Standard


\backslash
lstinputlisting[opt]{file}
\end_layout

\end_inset

The package listings is added to the preamble when it is not there yet.
    '''
    pattern = r'\\(lstinputlisting){([.\w]*)}(.*)'

    pos = 0
    while True:
        pos = find_token(document.body, r'\begin_inset Include \lstinputlisting', pos)
        if pos == -1:
            break
        if '\\usepackage{listings}' not in document.preamble:
            document.preamble.append('\\usepackage{listings}')
        end = find_end_of_inset(document.body, pos + 1)
        if end == -1:
            # this should not happen
            break
        # split the command lstinputlisting{file}[options] into its parts;
        # all parts stay empty if the third word of the line does not match
        command = filename = options = ''
        found = re.match(pattern, document.body[pos].split()[2])
        if found:
            command, filename, options = found.groups()
        options = options.replace('\\', '\\backslash\n')
        ert = [r'\begin_inset ERT',
               'status open',
               '',
               r'\begin_layout %s' % document.default_layout,
               '',
               '',
               r'\backslash',
               '%s%s{%s}' % (command, options, filename),
               r'\end_layout',
               '',
               r'\end_inset']
        document.body[pos : end + 1] = ert
1734
1735
def revert_ext_font_sizes(document):
    """ Move the font sizes 10, 11 and 12 of ext* classes to the class options.

    Only documents with the latex backend and a textclass starting with
    ext are changed: \\paperfontsize becomes default and the size (with
    the unit pt) is appended to \\options, which is created behind
    \\textclass if needed. """
    if document.backend != "latex":
        return
    if not document.textclass.startswith("ext"):
        return

    size = get_value(document.header, '\\paperfontsize', 0)
    if size not in ('10', '11', '12'):
        return
    class_option = size + 'pt'

    pos = find_token(document.header, '\\paperfontsize', 0)
    document.header[pos] = '\\paperfontsize default'

    pos = find_token(document.header, '\\options', 0)
    if pos != -1:
        document.header[pos] += ',%s' % class_option
    else:
        # no options yet: add the line right behind \textclass
        pos = find_token(document.header, '\\textclass', 0) + 1
        document.header.insert(pos, '\\options %s' % class_option)
1753
1754
def convert_ext_font_sizes(document):
    """ Move the class options 10pt, 11pt and 12pt of ext* classes to \\paperfontsize.

    Only documents with the latex backend, a textclass starting with ext
    and the paper font size default are changed. The first option that is
    exactly 10pt, 11pt or 12pt is removed from \\options (the line is
    deleted if no option is left) and its number is written to
    \\paperfontsize. """
    if document.backend != "latex":
        return
    if not document.textclass.startswith("ext"):
        return
    if get_value(document.header, '\\paperfontsize', 0) != 'default':
        return

    opt_line = find_token(document.header, '\\options', 0)
    if opt_line == -1:
        return

    known_sizes = ('10pt', '11pt', '12pt')
    remaining = get_value(document.header, '\\options', opt_line).split(',')
    chosen = None
    for opt in remaining:
        if opt in known_sizes:
            chosen = opt
            break
    if chosen is None:
        # no font size among the options
        return
    # chosen is the first option with this value, so this removes exactly
    # the entry found above
    remaining.remove(chosen)

    size_line = find_token(document.header, '\\paperfontsize', 0)
    # strip the unit pt from the option
    document.header[size_line] = '\\paperfontsize %s' % chosen[:-2]

    if remaining:
        document.header[opt_line] = '\\options %s' % ','.join(remaining)
    else:
        del document.header[opt_line]
1790
1791
def revert_separator_layout(document):
    r'''Replace each --Separator-- layout by a default layout with a lyx note
From

\begin_layout --Separator--
something
\end_layout

to

\begin_layout Standard
\begin_inset Note Note
status open

\begin_layout Standard
Separate Environment
\end_layout

\end_inset
something

\end_layout

(Standard stands for document.default_layout.)
    '''

    start = 0
    while True:
        start = find_token(document.body, r'\begin_layout --Separator--', start)
        if start == -1:
            return
        stop = find_end_of_layout(document.body, start + 1)
        if stop == -1:
            # this should not happen
            return
        opening = r'\begin_layout %s' % document.default_layout
        # the note that marks the place of the former separator
        note = [r'\begin_inset Note Note',
                'status open',
                '',
                r'\begin_layout %s' % document.default_layout,
                'Separate Environment',
                r'\end_layout',
                '',
                r'\end_inset']
        # the content of the separator paragraph is kept behind the note
        content = document.body[start + 1 : stop]
        document.body[start : stop + 1] = [opening] + note + content + ['', r'\end_layout']
1839
1840
def convert_arabic(document):
    """ Rename the language arabic to arabic_arabtex.

    The document language and the \\language line of the header are
    changed if the document language is arabic. Every body line that
    contains "\\lang arabic" is replaced by "\\lang arabic_arabtex". """
    if document.language == "arabic":
        document.language = "arabic_arabtex"
        i = find_token(document.header, "\\language", 0)
        if i != -1:
            document.header[i] = "\\language arabic_arabtex"
    # The backslash is written as "\\": "\l" is not a valid escape
    # sequence and only worked because python keeps unknown escapes.
    for i, line in enumerate(document.body):
        if "\\lang arabic" in line:
            # change the language name
            document.body[i] = "\\lang arabic_arabtex"
1854
1855
def revert_arabic(document):
    """ Rename the language arabic_arabtex to arabic.

    The document language and the \\language line of the header are
    changed if the document language is arabic_arabtex. Every body line
    that contains "\\lang arabic_arabtex" is replaced by "\\lang arabic". """
    if document.language == "arabic_arabtex":
        document.language = "arabic"
        i = find_token(document.header, "\\language", 0)
        if i != -1:
            document.header[i] = "\\language arabic"
    # The backslash is written as "\\": "\l" is not a valid escape
    # sequence and only worked because python keeps unknown escapes.
    for i, line in enumerate(document.body):
        if "\\lang arabic_arabtex" in line:
            # change the language name
            document.body[i] = "\\lang arabic"
1869
1870
def read_unicodesymbols():
    """ Read the unicodesymbols list of unicode characters and corresponding commands.

    The file unicodesymbols is looked up in the parent of the directory of
    the running script if that directory is called lyx2lyx, otherwise in
    the directory of the script itself.

    Returns a dictionary that maps a unicode character to the list
    [command, flag1, flag2] of its line. Comment lines and lines that can
    not be parsed are skipped. """
    pathname = os.path.abspath(os.path.dirname(sys.argv[0]))
    # str.strip() removes a set of characters from both ends and not a
    # suffix, so the directory name is compared and removed explicitly.
    if os.path.basename(pathname) == 'lyx2lyx':
        pathname = os.path.dirname(pathname)
    fp = open(os.path.join(pathname, 'unicodesymbols'))
    spec_chars = {}
    try:
        for line in fp:
            if line.startswith('#'):
                continue
            line=line.replace(' "',' ') # remove all quotation marks with spaces before
            line=line.replace('" ',' ') # remove all quotation marks with spaces after
            line=line.replace(r'\"','"') # replace \" by " (for characters with diaeresis)
            try:
                # flag1 and flag2 are preamble and other flags
                [ucs4,command,flag1,flag2] =line.split(None,3)
                # NOTE(review): eval() runs whatever stands in the first
                # column of the file; fine for the file shipped with LyX,
                # but not for a file from an untrusted source.
                spec_chars[unichr(eval(ucs4))] = [command, flag1, flag2]
            except Exception:
                # best effort: lines without four fields or without a
                # usable character code are ignored
                pass
    finally:
        # close the file even if reading fails
        fp.close()

    return spec_chars
1890
1891
def revert_unicode(document):
    '''Transform unicode characters that can not be written using the
document encoding to commands according to the unicodesymbols
file. Characters that can not be replaced by commands are replaced by
a replacement string.  Flags other than 'combining' are currently not
implemented.

The body is processed line by line. A line that can be encoded with
document.encoding is kept as it is. Any other line is rebuilt character
by character; the commands are put into ERT or math insets depending on
the inset the line is in. The rebuilt text may contain newlines and is
split into several body lines.'''

    replacement_character = '???'
    # maps a character to [command, flag1, flag2]
    spec_chars = read_unicodesymbols()

    # Define strings to start and end ERT and math insets
    ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s\n\\backslash\n' % document.default_layout
    ert_outro='\n\\end_layout\n\n\\end_inset\n'
    math_intro='\n\\begin_inset Formula $'
    math_outro='$\n\\end_inset'
    # Find unicode characters and replace them
    in_ert = False # True while the current line is inside an ERT inset
    in_math = False # True while the current line is inside a math inset
    insets = [] # stack of the open insets ('ert', 'math' or 'other')

    # Go through the file to capture all combining characters
    last_char = '' # to store the previous character

    i = 0
    while i < len(document.body):
        line = document.body[i]
        # Check for insets
        if line.find('\\begin_inset') > -1:
            # check which inset to start
            if line.find('\\begin_inset ERT') > -1:
                in_ert = True
                insets.append('ert')
            elif line.find('\\begin_inset Formula') > -1:
                in_math = True
                insets.append('math')
            else:
                insets.append('other')
        if line.find('\\end_inset') > -1:
            # check which inset to end
            try:
                cur_inset = insets.pop()
                if cur_inset == 'ert':
                    in_ert = False
                elif cur_inset == 'math':
                    in_math = False
                else:
                    pass # end of other inset
            except:
                pass # inset list was empty (for some reason)

        # Try to write the line
        try:
            # If all goes well the line is written here
            dummy = line.encode(document.encoding)
            # NOTE(review): for an empty line line[-1] raises IndexError,
            # which also ends up in the except branch below; the empty
            # line is then written back unchanged there.
            last_char = line[-1]
            i += 1
        except:
            # Error, some character(s) in the line need to be replaced
            mod_line = u''
            for character in line:
                try:
                    # Try to write the character
                    dummy = character.encode(document.encoding)
                    mod_line += character
                    last_char = character
                except:
                    # Try to replace with ERT/math inset
                    if spec_chars.has_key(character):
                        command = spec_chars[character][0] # the command to replace unicode
                        flag1 = spec_chars[character][1]
                        flag2 = spec_chars[character][2]
                        if flag1.find('combining') > -1 or flag2.find('combining') > -1:
                            # We have a character that should be combined with the previous
                            command += '{' + last_char + '}'
                            # Remove the last character. Ignore if it is whitespace
                            if len(last_char.rstrip()):
                                # last_char was found and is not whitespace
                                if mod_line:
                                    mod_line = mod_line[:-1]
                                else: # last_char belongs to the last line
                                    document.body[i-1] = document.body[i-1][:-1]
                            else:
                                # The last character was replaced by a command. For now it is
                                # ignored. This could be handled better.
                                pass
                        # only commands that start with two backslash characters
                        # are wrapped; all other commands are inserted as they are
                        if command[0:2] == '\\\\':
                            if command[2:12]=='ensuremath':
                                if in_ert:
                                    # math in ERT
                                    # (here both backslashes behind the brace are replaced)
                                    command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash\n')
                                    command = command.replace('}', '$\n')
                                elif not in_math:
                                    # add a math inset with the replacement character
                                    # (only one backslash behind the brace is part of the
                                    # pattern, so a second one stays in the command)
                                    command = command.replace('\\\\ensuremath{\\', math_intro)
                                    command = command.replace('}', math_outro)
                                else:
                                    # we are already in a math inset
                                    command = command.replace('\\\\ensuremath{\\', '')
                                    command = command.replace('}', '')
                            else:
                                if in_math:
                                    # avoid putting an ERT in a math; instead put command as text
                                    # NOTE(review): '\m' is no escape sequence, the string
                                    # is a backslash followed by 'mathrm{'
                                    command = command.replace('\\\\', '\mathrm{')
                                    command = command + '}'
                                elif not in_ert:
                                    # add an ERT inset with the replacement character
                                    command = command.replace('\\\\', ert_intro)
                                    command = command + ert_outro
                                else:
                                    # already in ERT: only write the backslash as \backslash
                                    command = command.replace('\\\\', '\n\\backslash\n')
                            last_char = '' # indicate that the character should not be removed
                        mod_line += command
                    else:
                        # Replace with replacement string
                        mod_line += replacement_character
            # the rebuilt line may hold newlines: replace the original line by
            # all resulting lines and continue behind the last of them
            document.body[i:i+1] = mod_line.split('\n')
            i += len(mod_line.split('\n'))
2009
2010
2011 ##
2012 # Conversion hub
2013 #
2014
# Names of the LyX versions listed for this module.
supported_versions = ["1.5.0","1.5"]
# Table of the conversion steps in ascending order. Each entry is
# [number, [functions]]: the functions of this module registered for the
# number (presumably the file format reached by the step -- TODO confirm
# against the code that reads this table). An empty list means that no
# function is registered for that number.
convert = [[246, []],
           [247, [convert_font_settings]],
           [248, []],
           [249, [convert_utf8]],
           [250, []],
           [251, []],
           [252, [convert_commandparams, convert_bibitem]],
           [253, []],
           [254, [convert_esint]],
           [255, []],
           [256, []],
           [257, [convert_caption]],
           [258, [convert_lyxline]],
           [259, [convert_accent, normalize_font_whitespace_259]],
           [260, []],
           [261, [convert_changes]],
           [262, []],
           [263, [normalize_language_name]],
           [264, [convert_cv_textclass]],
           [265, [convert_tableborder]],
           [266, []],
           [267, []],
           [268, []],
           [269, []],
           [270, []],
           [271, [convert_ext_font_sizes]],
           [272, []],
           [273, []],
           [274, [normalize_font_whitespace_274]],
           [275, [convert_graphics_rotation]],
           [276, [convert_arabic]]
          ]

# Table of the reversion steps in descending order, with the same layout
# as convert. The three listings functions are registered for two numbers
# (271 and 268).
revert =  [
           [275, [revert_arabic]],
           [274, [revert_graphics_rotation]],
           [273, []],
           [272, [revert_separator_layout]],
           [271, [revert_preamble_listings_params, revert_listings_inset, revert_include_listings]],
           [270, [revert_ext_font_sizes]],
           [269, [revert_beamer_alert, revert_beamer_structure]],
           [268, [revert_preamble_listings_params, revert_listings_inset, revert_include_listings]],
           [267, [revert_CJK]],
           [266, [revert_utf8plain]],
           [265, [revert_armenian]],
           [264, [revert_tableborder]],
           [263, [revert_cv_textclass]],
           [262, [revert_language_name]],
           [261, [revert_ascii]],
           [260, []],
           [259, [revert_utf8x]],
           [258, []],
           [257, []],
           [256, [revert_caption]],
           [255, [revert_encodings]],
           [254, [revert_clearpage, revert_cleardoublepage]],
           [253, [revert_esint]],
           [252, [revert_nomenclature, revert_printnomenclature]],
           [251, [revert_commandparams]],
           [250, [revert_cs_label]],
           [249, []],
           [248, [revert_accent, revert_utf8, revert_unicode]],
           [247, [revert_booktabs]],
           [246, [revert_font_settings]],
           [245, [revert_framed]]]
2081
2082
if __name__ == "__main__":
    # Nothing is done when this module is executed directly.
    pass
2085
2086