lib/lyx2lyx/lyx_1_5.py

   1 # This file is part of lyx2lyx
   2 # -*- coding: utf-8 -*-
   3 # Copyright (C) 2006 José Matos <jamatos@lyx.org>
   4 # Copyright (C) 2004-2006 Georg Baum <Georg.Baum@post.rwth-aachen.de>
   5 #
   6 # This program is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU General Public License
   8 # as published by the Free Software Foundation; either version 2
   9 # of the License, or (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program; if not, write to the Free Software
  18 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  19
  20 """ Convert files to the file format generated by lyx 1.5"""
  21
  22 import re
  23 import unicodedata
  24
  25 from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value
  26 from LyX import get_encoding
  27
  28
  29 ####################################################################
  30 # Private helper functions
  31
  32 def find_end_of_inset(lines, i):
  33     " Find end of inset, where lines[i] is included."
  34     return find_end_of(lines, i, "\\begin_inset", "\\end_inset")
  35
  36 def find_end_of_layout(lines, i):
  37     " Find end of layout, where lines[i] is included."
  38     return find_end_of(lines, i, "\\begin_layout", "\\end_layout")
  39
  40 # End of helper functions
  41 ####################################################################
  42
  43
  44 ##
  45 #  Notes: Framed/Shaded
  46 #
  47
  48 def revert_framed(document):
  49     "Revert framed notes. "
  50     i = 0
  51     while 1:
  52         i = find_tokens(document.body, ["\\begin_inset Note Framed", "\\begin_inset Note Shaded"], i)
  53
  54         if i == -1:
  55             return
  56         document.body[i] = "\\begin_inset Note"
  57         i = i + 1
  58
  59
  60 ##
  61 #  Fonts
  62 #
  63
  64 roman_fonts      = {'default' : 'default', 'ae'       : 'ae',
  65                     'times'   : 'times',   'palatino' : 'palatino',
  66                     'helvet'  : 'default', 'avant'    : 'default',
  67                     'newcent' : 'newcent', 'bookman'  : 'bookman',
  68                     'pslatex' : 'times'}
  69 sans_fonts       = {'default' : 'default', 'ae'       : 'default',
  70                     'times'   : 'default', 'palatino' : 'default',
  71                     'helvet'  : 'helvet',  'avant'    : 'avant',
  72                     'newcent' : 'default', 'bookman'  : 'default',
  73                     'pslatex' : 'helvet'}
  74 typewriter_fonts = {'default' : 'default', 'ae'       : 'default',
  75                     'times'   : 'default', 'palatino' : 'default',
  76                     'helvet'  : 'default', 'avant'    : 'default',
  77                     'newcent' : 'default', 'bookman'  : 'default',
  78                     'pslatex' : 'courier'}
  79
  80 def convert_font_settings(document):
  81     " Convert font settings. "
  82     i = 0
  83     i = find_token_exact(document.header, "\\fontscheme", i)
  84     if i == -1:
  85         document.warning("Malformed LyX document: Missing `\\fontscheme'.")
  86         return
  87     font_scheme = get_value(document.header, "\\fontscheme", i, i + 1)
  88     if font_scheme == '':
  89         document.warning("Malformed LyX document: Empty `\\fontscheme'.")
  90         font_scheme = 'default'
  91     if not font_scheme in roman_fonts.keys():
  92         document.warning("Malformed LyX document: Unknown `\\fontscheme' `%s'." % font_scheme)
  93         font_scheme = 'default'
  94     document.header[i:i+1] = ['\\font_roman %s' % roman_fonts[font_scheme],
  95                           '\\font_sans %s' % sans_fonts[font_scheme],
  96                           '\\font_typewriter %s' % typewriter_fonts[font_scheme],
  97                           '\\font_default_family default',
  98                           '\\font_sc false',
  99                           '\\font_osf false',
 100                           '\\font_sf_scale 100',
 101                           '\\font_tt_scale 100']
 102
 103
 104 def revert_font_settings(document):
 105     " Revert font settings. "
 106     i = 0
 107     insert_line = -1
 108     fonts = {'roman' : 'default', 'sans' : 'default', 'typewriter' : 'default'}
 109     for family in 'roman', 'sans', 'typewriter':
 110         name = '\\font_%s' % family
 111         i = find_token_exact(document.header, name, i)
 112         if i == -1:
 113             document.warning("Malformed LyX document: Missing `%s'." % name)
 114             i = 0
 115         else:
 116             if (insert_line < 0):
 117                 insert_line = i
 118             fonts[family] = get_value(document.header, name, i, i + 1)
 119             del document.header[i]
 120     i = find_token_exact(document.header, '\\font_default_family', i)
 121     if i == -1:
 122         document.warning("Malformed LyX document: Missing `\\font_default_family'.")
 123         font_default_family = 'default'
 124     else:
 125         font_default_family = get_value(document.header, "\\font_default_family", i, i + 1)
 126         del document.header[i]
 127     i = find_token_exact(document.header, '\\font_sc', i)
 128     if i == -1:
 129         document.warning("Malformed LyX document: Missing `\\font_sc'.")
 130         font_sc = 'false'
 131     else:
 132         font_sc = get_value(document.header, '\\font_sc', i, i + 1)
 133         del document.header[i]
 134     if font_sc != 'false':
 135         document.warning("Conversion of '\\font_sc' not yet implemented.")
 136     i = find_token_exact(document.header, '\\font_osf', i)
 137     if i == -1:
 138         document.warning("Malformed LyX document: Missing `\\font_osf'.")
 139         font_osf = 'false'
 140     else:
 141         font_osf = get_value(document.header, '\\font_osf', i, i + 1)
 142         del document.header[i]
 143     i = find_token_exact(document.header, '\\font_sf_scale', i)
 144     if i == -1:
 145         document.warning("Malformed LyX document: Missing `\\font_sf_scale'.")
 146         font_sf_scale = '100'
 147     else:
 148         font_sf_scale = get_value(document.header, '\\font_sf_scale', i, i + 1)
 149         del document.header[i]
 150     if font_sf_scale != '100':
 151         document.warning("Conversion of '\\font_sf_scale' not yet implemented.")
 152     i = find_token_exact(document.header, '\\font_tt_scale', i)
 153     if i == -1:
 154         document.warning("Malformed LyX document: Missing `\\font_tt_scale'.")
 155         font_tt_scale = '100'
 156     else:
 157         font_tt_scale = get_value(document.header, '\\font_tt_scale', i, i + 1)
 158         del document.header[i]
 159     if font_tt_scale != '100':
 160         document.warning("Conversion of '\\font_tt_scale' not yet implemented.")
 161     for font_scheme in roman_fonts.keys():
 162         if (roman_fonts[font_scheme] == fonts['roman'] and
 163             sans_fonts[font_scheme] == fonts['sans'] and
 164             typewriter_fonts[font_scheme] == fonts['typewriter']):
 165             document.header.insert(insert_line, '\\fontscheme %s' % font_scheme)
 166             if font_default_family != 'default':
 167                 document.preamble.append('\\renewcommand{\\familydefault}{\\%s}' % font_default_family)
 168             if font_osf == 'true':
 169                 document.warning("Ignoring `\\font_osf = true'")
 170             return
 171     font_scheme = 'default'
 172     document.header.insert(insert_line, '\\fontscheme %s' % font_scheme)
 173     if fonts['roman'] == 'cmr':
 174         document.preamble.append('\\renewcommand{\\rmdefault}{cmr}')
 175         if font_osf == 'true':
 176             document.preamble.append('\\usepackage{eco}')
 177             font_osf = 'false'
 178     for font in 'lmodern', 'charter', 'utopia', 'beraserif', 'ccfonts', 'chancery':
 179         if fonts['roman'] == font:
 180             document.preamble.append('\\usepackage{%s}' % font)
 181     for font in 'cmss', 'lmss', 'cmbr':
 182         if fonts['sans'] == font:
 183             document.preamble.append('\\renewcommand{\\sfdefault}{%s}' % font)
 184     for font in 'berasans':
 185         if fonts['sans'] == font:
 186             document.preamble.append('\\usepackage{%s}' % font)
 187     for font in 'cmtt', 'lmtt', 'cmtl':
 188         if fonts['typewriter'] == font:
 189             document.preamble.append('\\renewcommand{\\ttdefault}{%s}' % font)
 190     for font in 'courier', 'beramono', 'luximono':
 191         if fonts['typewriter'] == font:
 192             document.preamble.append('\\usepackage{%s}' % font)
 193     if font_default_family != 'default':
 194         document.preamble.append('\\renewcommand{\\familydefault}{\\%s}' % font_default_family)
 195     if font_osf == 'true':
 196         document.warning("Ignoring `\\font_osf = true'")
 197
 198
 199 def revert_booktabs(document):
 200     " We remove the booktabs flag or everything else will become a mess. "
 201     re_row = re.compile(r'^<row.*space="[^"]+".*>$')
 202     re_tspace = re.compile(r'\s+topspace="[^"]+"')
 203     re_bspace = re.compile(r'\s+bottomspace="[^"]+"')
 204     re_ispace = re.compile(r'\s+interlinespace="[^"]+"')
 205     i = 0
 206     while 1:
 207         i = find_token(document.body, "\\begin_inset Tabular", i)
 208         if i == -1:
 209             return
 210         j = find_end_of_inset(document.body, i + 1)
 211         if j == -1:
 212             document.warning("Malformed LyX document: Could not find end of tabular.")
 213             continue
 214         for k in range(i, j):
 215             if re.search('^<features.* booktabs="true".*>$', document.body[k]):
 216                 document.warning("Converting 'booktabs' table to normal table.")
 217                 document.body[k] = document.body[k].replace(' booktabs="true"', '')
 218             if re.search(re_row, document.body[k]):
 219                 document.warning("Removing extra row space.")
 220                 document.body[k] = re_tspace.sub('', document.body[k])
 221                 document.body[k] = re_bspace.sub('', document.body[k])
 222                 document.body[k] = re_ispace.sub('', document.body[k])
 223         i = i + 1
 224
 225
 226 def convert_multiencoding(document, forward):
 227     """ Fix files with multiple encodings.
 228 Files with an inputencoding of "auto" or "default" and multiple languages
 229 where at least two languages have different default encodings are encoded
 230 in multiple encodings for file formats < 249. These files are incorrectly
 231 read and written (as if the whole file was in the encoding of the main
 232 language).
 233
 234 This function
 235 - converts from fake unicode values to true unicode if forward is true, and
 236 - converts from true unicode values to fake unicode if forward is false.
 237 document.encoding must be set to the old value (format 248) in both cases.
 238
 239 We do this here and not in LyX.py because it is far easier to do the
 240 necessary parsing in modern formats than in ancient ones.
 241 """
 242     encoding_stack = [document.encoding]
 243     lang_re = re.compile(r"^\\lang\s(\S+)")
 244     if document.inputencoding == "auto" or document.inputencoding == "default":
 245         for i in range(len(document.body)):
 246             result = lang_re.match(document.body[i])
 247             if result:
 248                 language = result.group(1)
 249                 if language == "default":
 250                     document.warning("Resetting encoding from %s to %s." % (encoding_stack[-1], document.encoding))
 251                     encoding_stack[-1] = document.encoding
 252                 else:
 253                     from lyx2lyx_lang import lang
 254                     document.warning("Setting encoding from %s to %s." % (encoding_stack[-1], lang[language][3]))
 255                     encoding_stack[-1] = lang[language][3]
 256             elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
 257                 document.warning("Adding nested encoding %s." % encoding_stack[-1])
 258                 encoding_stack.append(encoding_stack[-1])
 259             elif find_token(document.body, "\\end_layout", i, i + 1) == i:
 260                 document.warning("Removing nested encoding %s." % encoding_stack[-1])
 261                 del encoding_stack[-1]
 262             if encoding_stack[-1] != document.encoding:
 263                 if forward:
 264                     # This line has been incorrectly interpreted as if it was
 265                     # encoded in 'encoding'.
 266                     # Convert back to the 8bit string that was in the file.
 267                     orig = document.body[i].encode(document.encoding)
 268                     # Convert the 8bit string that was in the file to unicode
 269                     # with the correct encoding.
 270                     document.body[i] = orig.decode(encoding_stack[-1])
 271                 else:
 272                     # Convert unicode to the 8bit string that will be written
 273                     # to the file with the correct encoding.
 274                     orig = document.body[i].encode(encoding_stack[-1])
 275                     # Convert the 8bit string that will be written to the
 276                     # file to fake unicode with the encoding that will later
 277                     # be used when writing to the file.
 278                     document.body[i] = orig.decode(document.encoding)
 279
 280
 281 def convert_utf8(document):
 282     " Set document encoding to UTF-8. "
 283     convert_multiencoding(document, True)
 284     document.encoding = "utf8"
 285
 286
 287 def revert_utf8(document):
 288     " Set document encoding to the value corresponding to inputencoding. "
 289     i = find_token(document.header, "\\inputencoding", 0)
 290     if i == -1:
 291         document.header.append("\\inputencoding auto")
 292     elif get_value(document.header, "\\inputencoding", i) == "utf8":
 293         document.header[i] = "\\inputencoding auto"
 294     document.inputencoding = get_value(document.header, "\\inputencoding", 0)
 295     document.encoding = get_encoding(document.language, document.inputencoding, 248)
 296     convert_multiencoding(document, False)
 297
 298
 299 def revert_cs_label(document):
 300     " Remove status flag of charstyle label. "
 301     i = 0
 302     while 1:
 303         i = find_token(document.body, "\\begin_inset CharStyle", i)
 304         if i == -1:
 305             return
 306         # Seach for a line starting 'show_label'
 307         # If it is not there, break with a warning message
 308         i = i + 1
 309         while 1:
 310             if (document.body[i][:10] == "show_label"):
 311                 del document.body[i]
 312                 break
 313             elif (document.body[i][:13] == "\\begin_layout"):
 314                 document.warning("Malformed LyX document: Missing 'show_label'.")
 315                 break
 316             i = i + 1
 317
 318         i = i + 1
 319
 320
 321 def convert_bibitem(document):
 322     """ Convert
 323 \bibitem [option]{argument}
 324
 325 to
 326
 327 \begin_inset LatexCommand bibitem
 328 label "option"
 329 key "argument"
 330
 331 \end_inset
 332
 333 This must be called after convert_commandparams.
 334 """
 335     regex = re.compile(r'\S+\s*(\[[^\[\{]*\])?(\{[^}]*\})')
 336     i = 0
 337     while 1:
 338         i = find_token(document.body, "\\bibitem", i)
 339         if i == -1:
 340             break
 341         match = re.match(regex, document.body[i])
 342         option = match.group(1)
 343         argument = match.group(2)
 344         lines = ['\\begin_inset LatexCommand bibitem']
 345         if option != None:
 346             lines.append('label "%s"' % option[1:-1].replace('"', '\\"'))
 347         lines.append('key "%s"' % argument[1:-1].replace('"', '\\"'))
 348         lines.append('')
 349         lines.append('\\end_inset')
 350         document.body[i:i+1] = lines
 351         i = i + 1
 352
 353
 354 commandparams_info = {
 355     # command : [option1, option2, argument]
 356     "bibitem" : ["label", "", "key"],
 357     "bibtex" : ["options", "btprint", "bibfiles"],
 358     "cite"        : ["after", "before", "key"],
 359     "citet"       : ["after", "before", "key"],
 360     "citep"       : ["after", "before", "key"],
 361     "citealt"     : ["after", "before", "key"],
 362     "citealp"     : ["after", "before", "key"],
 363     "citeauthor"  : ["after", "before", "key"],
 364     "citeyear"    : ["after", "before", "key"],
 365     "citeyearpar" : ["after", "before", "key"],
 366     "citet*"      : ["after", "before", "key"],
 367     "citep*"      : ["after", "before", "key"],
 368     "citealt*"    : ["after", "before", "key"],
 369     "citealp*"    : ["after", "before", "key"],
 370     "citeauthor*" : ["after", "before", "key"],
 371     "Citet"       : ["after", "before", "key"],
 372     "Citep"       : ["after", "before", "key"],
 373     "Citealt"     : ["after", "before", "key"],
 374     "Citealp"     : ["after", "before", "key"],
 375     "Citeauthor"  : ["after", "before", "key"],
 376     "Citet*"      : ["after", "before", "key"],
 377     "Citep*"      : ["after", "before", "key"],
 378     "Citealt*"    : ["after", "before", "key"],
 379     "Citealp*"    : ["after", "before", "key"],
 380     "Citeauthor*" : ["after", "before", "key"],
 381     "citefield"   : ["after", "before", "key"],
 382     "citetitle"   : ["after", "before", "key"],
 383     "cite*"       : ["after", "before", "key"],
 384     "hfill" : ["", "", ""],
 385     "index"      : ["", "", "name"],
 386     "printindex" : ["", "", "name"],
 387     "label" : ["", "", "name"],
 388     "eqref"     : ["name", "", "reference"],
 389     "pageref"   : ["name", "", "reference"],
 390     "prettyref" : ["name", "", "reference"],
 391     "ref"       : ["name", "", "reference"],
 392     "vpageref"  : ["name", "", "reference"],
 393     "vref"      : ["name", "", "reference"],
 394     "tableofcontents" : ["", "", "type"],
 395     "htmlurl" : ["name", "", "target"],
 396     "url"     : ["name", "", "target"]}
 397
 398
 399 def convert_commandparams(document):
 400     """ Convert
 401
 402  \begin_inset LatexCommand \cmdname[opt1][opt2]{arg}
 403  \end_inset
 404
 405  to
 406
 407  \begin_inset LatexCommand cmdname
 408  name1 "opt1"
 409  name2 "opt2"
 410  name3 "arg"
 411  \end_inset
 412
 413  name1, name2 and name3 can be different for each command.
 414 """
 415     # \begin_inset LatexCommand bibitem was not the official version (see
 416     # convert_bibitem()), but could be read in, so we convert it here, too.
 417
 418     i = 0
 419     while 1:
 420         i = find_token(document.body, "\\begin_inset LatexCommand", i)
 421         if i == -1:
 422             break
 423         command = document.body[i][26:].strip()
 424         if command == "":
 425             document.warning("Malformed LyX document: Missing LatexCommand name.")
 426             i = i + 1
 427             continue
 428
 429         # The following parser is taken from the original InsetCommandParams::scanCommand
 430         name = ""
 431         option1 = ""
 432         option2 = ""
 433         argument = ""
 434         state = "WS"
 435         # Used to handle things like \command[foo[bar]]{foo{bar}}
 436         nestdepth = 0
 437         b = 0
 438         for c in command:
 439             if ((state == "CMDNAME" and c == ' ') or
 440                 (state == "CMDNAME" and c == '[') or
 441                 (state == "CMDNAME" and c == '{')):
 442                 state = "WS"
 443             if ((state == "OPTION" and c == ']') or
 444                 (state == "SECOPTION" and c == ']') or
 445                 (state == "CONTENT" and c == '}')):
 446                 if nestdepth == 0:
 447                     state = "WS"
 448                 else:
 449                     nestdepth = nestdepth - 1
 450             if ((state == "OPTION" and c == '[') or
 451                 (state == "SECOPTION" and c == '[') or
 452                 (state == "CONTENT" and c == '{')):
 453                 nestdepth = nestdepth + 1
 454             if state == "CMDNAME":
 455                     name += c
 456             elif state == "OPTION":
 457                     option1 += c
 458             elif state == "SECOPTION":
 459                     option2 += c
 460             elif state == "CONTENT":
 461                     argument += c
 462             elif state == "WS":
 463                 if c == '\\':
 464                     state = "CMDNAME"
 465                 elif c == '[' and b != ']':
 466                     state = "OPTION"
 467                     nestdepth = 0 # Just to be sure
 468                 elif c == '[' and b == ']':
 469                     state = "SECOPTION"
 470                     nestdepth = 0 # Just to be sure
 471                 elif c == '{':
 472                     state = "CONTENT"
 473                     nestdepth = 0 # Just to be sure
 474             b = c
 475
 476         # Now we have parsed the command, output the parameters
 477         lines = ["\\begin_inset LatexCommand %s" % name]
 478         if option1 != "":
 479             if commandparams_info[name][0] == "":
 480                 document.warning("Ignoring invalid option `%s' of command `%s'." % (option1, name))
 481             else:
 482                 lines.append('%s "%s"' % (commandparams_info[name][0], option1.replace('"', '\\"')))
 483         if option2 != "":
 484             if commandparams_info[name][1] == "":
 485                 document.warning("Ignoring invalid second option `%s' of command `%s'." % (option2, name))
 486             else:
 487                 lines.append('%s "%s"' % (commandparams_info[name][1], option2.replace('"', '\\"')))
 488         if argument != "":
 489             if commandparams_info[name][2] == "":
 490                 document.warning("Ignoring invalid argument `%s' of command `%s'." % (argument, name))
 491             else:
 492                 lines.append('%s "%s"' % (commandparams_info[name][2], argument.replace('"', '\\"')))
 493         document.body[i:i+1] = lines
 494         i = i + 1
 495
 496
 497 def revert_commandparams(document):
 498     regex = re.compile(r'(\S+)\s+(.+)')
 499     i = 0
 500     while 1:
 501         i = find_token(document.body, "\\begin_inset LatexCommand", i)
 502         if i == -1:
 503             break
 504         name = document.body[i].split()[2]
 505         j = find_end_of_inset(document.body, i + 1)
 506         preview_line = ""
 507         option1 = ""
 508         option2 = ""
 509         argument = ""
 510         for k in range(i + 1, j):
 511             match = re.match(regex, document.body[k])
 512             if match:
 513                 pname = match.group(1)
 514                 pvalue = match.group(2)
 515                 if pname == "preview":
 516                     preview_line = document.body[k]
 517                 elif (commandparams_info[name][0] != "" and
 518                       pname == commandparams_info[name][0]):
 519                     option1 = pvalue.strip('"').replace('\\"', '"')
 520                 elif (commandparams_info[name][1] != "" and
 521                       pname == commandparams_info[name][1]):
 522                     option2 = pvalue.strip('"').replace('\\"', '"')
 523                 elif (commandparams_info[name][2] != "" and
 524                       pname == commandparams_info[name][2]):
 525                     argument = pvalue.strip('"').replace('\\"', '"')
 526             elif document.body[k].strip() != "":
 527                 document.warning("Ignoring unknown contents `%s' in command inset %s." % (document.body[k], name))
 528         if name == "bibitem":
 529             if option1 == "":
 530                 lines = ["\\bibitem {%s}" % argument]
 531             else:
 532                 lines = ["\\bibitem [%s]{%s}" % (option1, argument)]
 533         else:
 534             if option1 == "":
 535                 if option2 == "":
 536                     lines = ["\\begin_inset LatexCommand \\%s{%s}" % (name, argument)]
 537                 else:
 538                     lines = ["\\begin_inset LatexCommand \\%s[][%s]{%s}" % (name, option2, argument)]
 539             else:
 540                 if option2 == "":
 541                     lines = ["\\begin_inset LatexCommand \\%s[%s]{%s}" % (name, option1, argument)]
 542                 else:
 543                     lines = ["\\begin_inset LatexCommand \\%s[%s][%s]{%s}" % (name, option1, option2, argument)]
 544         if name != "bibitem":
 545             if preview_line != "":
 546                 lines.append(preview_line)
 547             lines.append('')
 548             lines.append('\\end_inset')
 549         document.body[i:j+1] = lines
 550         i = j + 1
 551
 552
 553 def revert_nomenclature(document):
 554     " Convert nomenclature entry to ERT. "
 555     regex = re.compile(r'(\S+)\s+(.+)')
 556     i = 0
 557     use_nomencl = 0
 558     while 1:
 559         i = find_token(document.body, "\\begin_inset LatexCommand nomenclature", i)
 560         if i == -1:
 561             break
 562         use_nomencl = 1
 563         j = find_end_of_inset(document.body, i + 1)
 564         preview_line = ""
 565         symbol = ""
 566         description = ""
 567         prefix = ""
 568         for k in range(i + 1, j):
 569             match = re.match(regex, document.body[k])
 570             if match:
 571                 name = match.group(1)
 572                 value = match.group(2)
 573                 if name == "preview":
 574                     preview_line = document.body[k]
 575                 elif name == "symbol":
 576                     symbol = value.strip('"').replace('\\"', '"')
 577                 elif name == "description":
 578                     description = value.strip('"').replace('\\"', '"')
 579                 elif name == "prefix":
 580                     prefix = value.strip('"').replace('\\"', '"')
 581             elif document.body[k].strip() != "":
 582                 document.warning("Ignoring unknown contents `%s' in nomenclature inset." % document.body[k])
 583         if prefix == "":
 584             command = 'nomenclature{%s}{%s}' % (symbol, description)
 585         else:
 586             command = 'nomenclature[%s]{%s}{%s}' % (prefix, symbol, description)
 587         document.body[i:j+1] = ['\\begin_inset ERT',
 588                                 'status collapsed',
 589                                 '',
 590                                 '\\begin_layout %s' % document.default_layout,
 591                                 '',
 592                                 '',
 593                                 '\\backslash',
 594                                 command,
 595                                 '\\end_layout',
 596                                 '',
 597                                 '\\end_inset']
 598         i = i + 11
 599     if use_nomencl and find_token(document.preamble, '\\usepackage{nomencl}[2005/09/22]', 0) == -1:
 600         document.preamble.append('\\usepackage{nomencl}[2005/09/22]')
 601         document.preamble.append('\\makenomenclature')
 602
 603
 604 def revert_printnomenclature(document):
 605     " Convert printnomenclature to ERT. "
 606     regex = re.compile(r'(\S+)\s+(.+)')
 607     i = 0
 608     use_nomencl = 0
 609     while 1:
 610         i = find_token(document.body, "\\begin_inset LatexCommand printnomenclature", i)
 611         if i == -1:
 612             break
 613         use_nomencl = 1
 614         j = find_end_of_inset(document.body, i + 1)
 615         preview_line = ""
 616         labelwidth = ""
 617         for k in range(i + 1, j):
 618             match = re.match(regex, document.body[k])
 619             if match:
 620                 name = match.group(1)
 621                 value = match.group(2)
 622                 if name == "preview":
 623                     preview_line = document.body[k]
 624                 elif name == "labelwidth":
 625                     labelwidth = value.strip('"').replace('\\"', '"')
 626             elif document.body[k].strip() != "":
 627                 document.warning("Ignoring unknown contents `%s' in printnomenclature inset." % document.body[k])
 628         if labelwidth == "":
 629             command = 'nomenclature{}'
 630         else:
 631             command = 'nomenclature[%s]' % labelwidth
 632         document.body[i:j+1] = ['\\begin_inset ERT',
 633                                 'status collapsed',
 634                                 '',
 635                                 '\\begin_layout %s' % document.default_layout,
 636                                 '',
 637                                 '',
 638                                 '\\backslash',
 639                                 command,
 640                                 '\\end_layout',
 641                                 '',
 642                                 '\\end_inset']
 643         i = i + 11
 644     if use_nomencl and find_token(document.preamble, '\\usepackage{nomencl}[2005/09/22]', 0) == -1:
 645         document.preamble.append('\\usepackage{nomencl}[2005/09/22]')
 646         document.preamble.append('\\makenomenclature')
 647
 648
 649 def convert_esint(document):
 650     " Add \\use_esint setting to header. "
 651     i = find_token(document.header, "\\cite_engine", 0)
 652     if i == -1:
 653         document.warning("Malformed LyX document: Missing `\\cite_engine'.")
 654         return
 655     # 0 is off, 1 is auto, 2 is on.
 656     document.header.insert(i, '\\use_esint 0')
 657
 658
 659 def revert_esint(document):
 660     " Remove \\use_esint setting from header. "
 661     i = find_token(document.header, "\\use_esint", 0)
 662     if i == -1:
 663         document.warning("Malformed LyX document: Missing `\\use_esint'.")
 664         return
 665     use_esint = document.header[i].split()[1]
 666     del document.header[i]
 667     # 0 is off, 1 is auto, 2 is on.
 668     if (use_esint == 2):
 669         document.preamble.append('\\usepackage{esint}')
 670
 671
 672 def revert_clearpage(document):
 673     " clearpage -> ERT "
 674     i = 0
 675     while 1:
 676         i = find_token(document.body, "\\clearpage", i)
 677         if i == -1:
 678             break
 679         document.body[i:i+1] =  ['\\begin_inset ERT',
 680                                 'status collapsed',
 681                                 '',
 682                                 '\\begin_layout %s' % document.default_layout,
 683                                 '',
 684                                 '',
 685                                 '\\backslash',
 686                                 'clearpage',
 687                                 '\\end_layout',
 688                                 '',
 689                                 '\\end_inset']
 690     i = i + 1
 691
 692
 693 def revert_cleardoublepage(document):
 694     " cleardoublepage -> ERT "
 695     i = 0
 696     while 1:
 697         i = find_token(document.body, "\\cleardoublepage", i)
 698         if i == -1:
 699             break
 700         document.body[i:i+1] =  ['\\begin_inset ERT',
 701                                 'status collapsed',
 702                                 '',
 703                                 '\\begin_layout %s' % document.default_layout,
 704                                 '',
 705                                 '',
 706                                 '\\backslash',
 707                                 'cleardoublepage',
 708                                 '\\end_layout',
 709                                 '',
 710                                 '\\end_inset']
 711     i = i + 1
 712
 713
 714 def convert_lyxline(document):
 715     " remove fontsize commands for \lyxline "
 716     # The problematic is: The old \lyxline definition doesn't handle the fontsize
 717     # to change the line thickness. The new definiton does this so that imported
 718     # \lyxlines would have a different line thickness. The eventual fontsize command
 719     # before \lyxline is therefore removed to get the same output.
 720     fontsizes = ["tiny", "scriptsize", "footnotesize", "small", "normalsize",
 721                  "large", "Large", "LARGE", "huge", "Huge"]
 722     for n in range(0, len(fontsizes)):
 723         i = 0
 724         k = 0
 725         while i < len(document.body):
 726             i = find_token(document.body, "\\size " + fontsizes[n], i)
 727             k = find_token(document.body, "\\lyxline",i)
 728             # the corresponding fontsize command is always 2 lines before the \lyxline
 729             if (i != -1 and k == i+2):
 730                 document.body[i:i+1] = []
 731             else:
 732                 break
 733         i = i + 1
 734
 735
 736 def revert_encodings(document):
 737     " Set new encodings to auto. "
 738     encodings = ["8859-6", "8859-8", "cp437", "cp437de", "cp850", "cp852",
 739                  "cp855", "cp858", "cp862", "cp865", "cp866", "cp1250",
 740                  "cp1252", "cp1256", "cp1257", "latin10", "pt254", "tis620-0"]
 741     i = find_token(document.header, "\\inputencoding", 0)
 742     if i == -1:
 743         document.header.append("\\inputencoding auto")
 744     else:
 745         inputenc = get_value(document.header, "\\inputencoding", i)
 746         if inputenc in encodings:
 747             document.header[i] = "\\inputencoding auto"
 748     document.inputencoding = get_value(document.header, "\\inputencoding", 0)
 749
 750
 751 def convert_caption(document):
 752     " Convert caption layouts to caption insets. "
 753     i = 0
 754     while 1:
 755         i = find_token(document.body, "\\begin_layout Caption", i)
 756         if i == -1:
 757             return
 758         j = find_end_of_layout(document.body, i)
 759         if j == -1:
 760             document.warning("Malformed LyX document: Missing `\\end_layout'.")
 761             return
 762
 763         document.body[j:j] = ["\\end_layout", "", "\\end_inset", "", ""]
 764         document.body[i:i+1] = ["\\begin_layout %s" % document.default_layout,
 765                             "\\begin_inset Caption", "",
 766                             "\\begin_layout %s" % document.default_layout]
 767         i = i + 1
 768
 769
def revert_caption(document):
    " Convert caption insets to caption layouts. "
    " This assumes that the text class has a caption style. "
    # Each "\begin_inset Caption" in the body is turned into a paragraph
    # that starts with "\begin_layout Caption".  The body is edited in
    # place.  The function returns (after a warning) as soon as one of the
    # expected surrounding lines cannot be found.
    i = 0
    while 1:
        i = find_token(document.body, "\\begin_inset Caption", i)
        if i == -1:
            return

        # We either need to delete the previous \begin_layout line, or we
        # need to end the previous layout if this inset is not in the first
        # position of the paragraph.
        layout_before = find_token_backwards(document.body, "\\begin_layout", i)
        if layout_before == -1:
            document.warning("Malformed LyX document: Missing `\\begin_layout'.")
            return
        # Remember the line that opened the enclosing paragraph: it is
        # written again below if text follows the inset.
        layout_line = document.body[layout_before]
        # The inset is the first thing in the paragraph if there are only
        # empty lines between the \begin_layout line and the inset.
        del_layout_before = True
        l = layout_before + 1
        while l < i:
            if document.body[l] != "":
                del_layout_before = False
                break
            l = l + 1
        if del_layout_before:
            # Drop the \begin_layout line and the empty lines after it;
            # the inset line moves up to index layout_before.
            del document.body[layout_before:i]
            i = layout_before
        else:
            # Close the paragraph in front of the inset; the inset line
            # moves down by the two inserted lines.
            document.body[i:i] = ["\\end_layout", ""]
            i = i + 2

        # Find start of layout in the inset and end of inset
        # (j is the first \begin_layout line at or after the inset line,
        # k is the line that find_end_of_inset reports for the inset at i)
        j = find_token(document.body, "\\begin_layout", i)
        if j == -1:
            document.warning("Malformed LyX document: Missing `\\begin_layout'.")
            return
        k = find_end_of_inset(document.body, i)
        if k == -1:
            document.warning("Malformed LyX document: Missing `\\end_inset'.")
            return

        # We either need to delete the following \end_layout line, or we need
        # to restart the old layout if this inset is not at the paragraph end.
        layout_after = find_token(document.body, "\\end_layout", k)
        if layout_after == -1:
            document.warning("Malformed LyX document: Missing `\\end_layout'.")
            return
        # The inset is the last thing in the paragraph if there are only
        # empty lines between the end of the inset and the \end_layout line.
        del_layout_after = True
        l = k + 1
        while l < layout_after:
            if document.body[l] != "":
                del_layout_after = False
                break
            l = l + 1
        if del_layout_after:
            # Drop the empty lines and the \end_layout of the outer paragraph.
            del document.body[k+1:layout_after+1]
        else:
            # Text follows the inset: reopen the original layout for it.
            document.body[k+1:k+1] = [layout_line, ""]

        # delete \begin_layout and \end_inset and replace \begin_inset with
        # "\begin_layout Caption". This works because we can only have one
        # paragraph in the caption inset: The old \end_layout will be recycled.
        # Line k is deleted before line j; presumably j < k for a well
        # formed inset, so that j is still valid afterwards.
        # NOTE(review): the body[k] test after the deletion assumes that at
        # least one more line follows the inset -- confirm that the body
        # can never end here.
        del document.body[k]
        if document.body[k] == "":
            del document.body[k]
        del document.body[j]
        if document.body[j] == "":
            del document.body[j]
        document.body[i] = "\\begin_layout Caption"
        if document.body[i+1] == "":
            del document.body[i+1]
        i = i + 1
 842
 843
 844 # Accents of InsetLaTeXAccent
 845 accent_map = {
 846     "`" : u'\u0300', # grave
 847     "'" : u'\u0301', # acute
 848     "^" : u'\u0302', # circumflex
 849     "~" : u'\u0303', # tilde
 850     "=" : u'\u0304', # macron
 851     "u" : u'\u0306', # breve
 852     "." : u'\u0307', # dot above
 853     "\"": u'\u0308', # diaresis
 854     "r" : u'\u030a', # ring above
 855     "H" : u'\u030b', # double acute
 856     "v" : u'\u030c', # caron
 857     "b" : u'\u0320', # minus sign below
 858     "d" : u'\u0323', # dot below
 859     "c" : u'\u0327', # cedilla
 860     "k" : u'\u0328', # ogonek
 861     "t" : u'\u0361'  # tie. This is special: It spans two characters, but
 862                      # only one is given as argument, so we don't need to
 863                      # treat it differently.
 864 }
 865
 866
 867 # special accents of InsetLaTeXAccent without argument
 868 special_accent_map = {
 869     'i' : u'\u0131', # dotless i
 870     'j' : u'\u0237', # dotless j
 871     'l' : u'\u0142', # l with stroke
 872     'L' : u'\u0141'  # L with stroke
 873 }
 874
 875
 876 # special accent arguments of InsetLaTeXAccent
 877 accented_map = {
 878     '\\i' : u'\u0131', # dotless i
 879     '\\j' : u'\u0237'  # dotless j
 880 }
 881
 882
 883 def _convert_accent(accent, accented_char):
 884     type = accent
 885     char = accented_char
 886     if char == '':
 887         if type in special_accent_map:
 888             return special_accent_map[type]
 889         # a missing char is treated as space by LyX
 890         char = ' '
 891     elif type == 'q' and char in ['t', 'd', 'l', 'L']:
 892         # Special caron, only used with t, d, l and L.
 893         # It is not in the map because we convert it to the same unicode
 894         # character as the normal caron: \q{} is only defined if babel with
 895         # the czech or slovak language is used, and the normal caron
 896         # produces the correct output if the T1 font encoding is used.
 897         # For the same reason we never convert to \q{} in the other direction.
 898         type = 'v'
 899     elif char in accented_map:
 900         char = accented_map[char]
 901     elif (len(char) > 1):
 902         # We can only convert accents on a single char
 903         return ''
 904     a = accent_map.get(type)
 905     if a:
 906         return unicodedata.normalize("NFKC", "%s%s" % (char, a))
 907     return ''
 908
 909
 910 def convert_ertbackslash(body, i, ert, default_layout):
 911     r""" -------------------------------------------------------------------------------------------
 912     Convert backslashes and '\n' into valid ERT code, append the converted
 913     text to body[i] and return the (maybe incremented) line index i"""
 914
 915     for c in ert:
 916         if c == '\\':
 917             body[i] = body[i] + '\\backslash '
 918             i = i + 1
 919             body.insert(i, '')
 920         elif c == '\n':
 921             body[i+1:i+1] = ['\\end_layout', '', '\\begin_layout %s' % default_layout, '']
 922             i = i + 4
 923         else:
 924             body[i] = body[i] + c
 925     return i
 926
 927
 928 def convert_accent(document):
 929     # The following forms are supported by LyX:
 930     # '\i \"{a}' (standard form, as written by LyX)
 931     # '\i \"{}' (standard form, as written by LyX if the accented char is a space)
 932     # '\i \"{ }' (also accepted if the accented char is a space)
 933     # '\i \" a'  (also accepted)
 934     # '\i \"'    (also accepted)
 935     re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$')
 936     re_contents = re.compile(r'^([^\s{]+)(.*)$')
 937     re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$')
 938     i = 0
 939     while 1:
 940         i = find_re(document.body, re_wholeinset, i)
 941         if i == -1:
 942             return
 943         match = re_wholeinset.match(document.body[i])
 944         prefix = match.group(1)
 945         contents = match.group(3).strip()
 946         match = re_contents.match(contents)
 947         if match:
 948             # Strip first char (always \)
 949             accent = match.group(1)[1:]
 950             accented_contents = match.group(2).strip()
 951             match = re_accentedcontents.match(accented_contents)
 952             accented_char = match.group(1)
 953             converted = _convert_accent(accent, accented_char)
 954             if converted == '':
 955                 # Normalize contents
 956                 contents = '%s{%s}' % (accent, accented_char),
 957             else:
 958                 document.body[i] = '%s%s' % (prefix, converted)
 959                 i += 1
 960                 continue
 961         document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents)
 962         document.body[i] = prefix
 963         document.body[i+1:i+1] = ['\\begin_inset ERT',
 964                                   'status collapsed',
 965                                   '',
 966                                   '\\begin_layout %s' % document.default_layout,
 967                                   '',
 968                                   '',
 969                                   '']
 970         i = convert_ertbackslash(document.body, i + 7,
 971                                  '\\%s' % contents,
 972                                  document.default_layout)
 973         document.body[i+1:i+1] = ['\\end_layout',
 974                                   '',
 975                                   '\\end_inset']
 976         i += 3
 977
 978
 979 def revert_accent(document):
 980     inverse_accent_map = {}
 981     for k in accent_map:
 982         inverse_accent_map[accent_map[k]] = k
 983     inverse_special_accent_map = {}
 984     for k in special_accent_map:
 985         inverse_special_accent_map[special_accent_map[k]] = k
 986     inverse_accented_map = {}
 987     for k in accented_map:
 988         inverse_accented_map[accented_map[k]] = k
 989
 990     # Since LyX may insert a line break within a word we must combine all
 991     # words before unicode normalization.
 992     # We do this only if the next line starts with an accent, otherwise we
 993     # would create things like '\begin_inset ERTstatus'.
 994     numberoflines = len(document.body)
 995     for i in range(numberoflines-1):
 996         if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
 997             continue
 998         if (document.body[i+1][0] in inverse_accent_map):
 999             # the last character of this line and the first of the next line
1000             # form probably a surrogate pair.
1001             while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '):
1002                 document.body[i] += document.body[i+1][0]
1003                 document.body[i+1] = document.body[i+1][1:]
1004
1005     # Normalize to "Normal form D" (NFD, also known as canonical decomposition).
1006     # This is needed to catch all accented characters.
1007     for i in range(numberoflines):
1008         # Unfortunately we have a mixture of unicode strings and plain strings,
1009         # because we never use u'xxx' for string literals, but 'xxx'.
1010         # Therefore we may have to try two times to normalize the data.
1011         try:
1012             document.body[i] = unicodedata.normalize("NFKD", document.body[i])
1013         except TypeError:
1014             document.body[i] = unicodedata.normalize("NFKD", unicode(document.body[i], 'utf-8'))
1015
1016     # Replace accented characters with InsetLaTeXAccent
1017     # Do not convert characters that can be represented in the chosen
1018     # encoding.
1019     encoding_stack = [get_encoding(document.language, document.inputencoding, 248)]
1020     lang_re = re.compile(r"^\\lang\s(\S+)")
1021     for i in range(len(document.body)):
1022
1023         if document.inputencoding == "auto" or document.inputencoding == "default":
1024             # Track the encoding of the current line
1025             result = lang_re.match(document.body[i])
1026             if result:
1027                 language = result.group(1)
1028                 if language == "default":
1029                     encoding_stack[-1] = document.encoding
1030                 else:
1031                     from lyx2lyx_lang import lang
1032                     encoding_stack[-1] = lang[language][3]
1033                 continue
1034             elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
1035                 encoding_stack.append(encoding_stack[-1])
1036                 continue
1037             elif find_token(document.body, "\\end_layout", i, i + 1) == i:
1038                 del encoding_stack[-1]
1039                 continue
1040
1041         for j in range(len(document.body[i])):
1042             # dotless i and dotless j are both in special_accent_map and can
1043             # occur as an accented character, so we need to test that the
1044             # following character is no accent
1045             if (document.body[i][j] in inverse_special_accent_map and
1046                 (j == len(document.body[i]) - 1 or document.body[i][j+1] not in inverse_accent_map)):
1047                 accent = document.body[i][j]
1048                 try:
1049                     dummy = accent.encode(encoding_stack[-1])
1050                 except UnicodeEncodeError:
1051                     # Insert the rest of the line as new line
1052                     if j < len(document.body[i]) - 1:
1053                         document.body[i+1:i+1] = document.body[i][j+1:]
1054                     # Delete the accented character
1055                     if j > 0:
1056                         document.body[i] = document.body[i][:j-1]
1057                     else:
1058                         document.body[i] = u''
1059                     # Finally add the InsetLaTeXAccent
1060                     document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
1061                     break
1062             elif j > 0 and document.body[i][j] in inverse_accent_map:
1063                 accented_char = document.body[i][j-1]
1064                 if accented_char == ' ':
1065                     # Conform to LyX output
1066                     accented_char = ''
1067                 elif accented_char in inverse_accented_map:
1068                     accented_char = inverse_accented_map[accented_char]
1069                 accent = document.body[i][j]
1070                 try:
1071                     dummy = unicodedata.normalize("NFKC", accented_char + accent).encode(encoding_stack[-1])
1072                 except UnicodeEncodeError:
1073                     # Insert the rest of the line as new line
1074                     if j < len(document.body[i]) - 1:
1075                         document.body[i+1:i+1] = document.body[i][j+1:]
1076                     # Delete the accented characters
1077                     if j > 1:
1078                         document.body[i] = document.body[i][:j-2]
1079                     else:
1080                         document.body[i] = u''
1081                     # Finally add the InsetLaTeXAccent
1082                     document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
1083                     break
1084     # Normalize to "Normal form C" (NFC, pre-composed characters) again
1085     for i in range(numberoflines):
1086         document.body[i] = unicodedata.normalize("NFKC", document.body[i])
1087
1088
def normalize_font_whitespace(document):
    """ Before format 259 the font changes were ignored if a
    whitespace was the last character in the sequence, this function
    transfers the whitespace outside.

    If a line resets one of the character properties to "default" and the
    line before it ends with a space, that space is removed there and
    put in front of the line that follows the reset.  Nothing is done if
    the backend of the document is not "latex"."""

    if document.backend != "latex":
        return

    lines = document.body

    char_properties = ("\\series", "\\emph", "\\color", "\\shape", "\\family")

    # The first and the last line are left out: a space can only be moved
    # if there is a line before and a line after the font change.
    # (Index -1 would silently refer to the last line of the body.)
    for i in range(1, len(lines) - 1):
        words = lines[i].split()

        # endswith() is also safe if the previous line is empty.
        if len(words) > 1 and words[0] in char_properties \
               and words[1] == "default" and lines[i-1].endswith(" "):
            lines[i-1] = lines[i-1][:-1]
            lines[i+1] = " " + lines[i+1]
1108
1109 ##
1110 # Conversion hub
1111 #
1112
supported_versions = ["1.5.0", "1.5"]

# Both tables consist of [file format number, [functions]] entries.
# The functions of an entry all take the document as their only argument.
# Presumably the lyx2lyx driver runs them in the listed order to reach
# that file format -- see the caller for the exact rules.
convert = [
    [246, []],
    [247, [convert_font_settings]],
    [248, []],
    [249, [convert_utf8]],
    [250, []],
    [251, []],
    [252, [convert_commandparams, convert_bibitem]],
    [253, []],
    [254, [convert_esint]],
    [255, []],
    [256, []],
    [257, [convert_caption]],
    [258, [convert_lyxline]],
    [259, [convert_accent, normalize_font_whitespace]],
]

revert = [
    [258, []],
    [257, []],
    [256, [revert_caption]],
    [255, [revert_encodings]],
    [254, [revert_clearpage, revert_cleardoublepage]],
    [253, [revert_esint]],
    [252, [revert_nomenclature, revert_printnomenclature]],
    [251, [revert_commandparams]],
    [250, [revert_cs_label]],
    [249, []],
    [248, [revert_accent, revert_utf8]],
    [247, [revert_booktabs]],
    [246, [revert_font_settings]],
    [245, [revert_framed]],
]


# Nothing to do if this module is run as a script.
if __name__ == "__main__":
    pass
1147
1148