lib/lyx2lyx/lyx_1_5.py

   1 # This file is part of lyx2lyx
   2 # Copyright (C) 2006 José Matos <jamatos@lyx.org>
   3 # Copyright (C) 2004-2006 Georg Baum <Georg.Baum@post.rwth-aachen.de>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  18
  19 """Convert files to the file format generated by lyx 1.5"""
  20
  21 import os
  22 import re
  23 import sys
  24 import unicodedata
  25
  26 from LyX import get_encoding
  27 from lyx2lyx_tools import insert_document_option
  28 from parser_tools import (
  29     find_beginning_of,
  30     find_end_of,
  31     find_nonempty_line,
  32     find_re,
  33     find_token,
  34     find_token_backwards,
  35     find_token_exact,
  36     find_tokens,
  37     get_value,
  38 )
  39
  40 ####################################################################
  41 # Private helper functions
  42
  43
  44 def find_end_of_inset(lines, i):
  45     "Find end of inset, where lines[i] is included."
  46     return find_end_of(lines, i, "\\begin_inset", "\\end_inset")
  47
  48
  49 def find_end_of_layout(lines, i):
  50     "Find end of layout, where lines[i] is included."
  51     return find_end_of(lines, i, "\\begin_layout", "\\end_layout")
  52
  53
  54 def find_beginning_of_layout(lines, i):
  55     "Find beginning of layout, where lines[i] is included."
  56     return find_beginning_of(lines, i, "\\begin_layout", "\\end_layout")
  57
  58
  59 # End of helper functions
  60 ####################################################################
  61
  62
  63 ##
  64 #  Notes: Framed/Shaded
  65 #
  66
  67
  68 def revert_framed(document):
  69     "Revert framed notes."
  70     i = 0
  71     while True:
  72         i = find_tokens(
  73             document.body, ["\\begin_inset Note Framed", "\\begin_inset Note Shaded"], i
  74         )
  75
  76         if i == -1:
  77             return
  78         document.body[i] = "\\begin_inset Note"
  79         i = i + 1
  80
  81
  82 ##
  83 #  Fonts
  84 #
  85
  86 roman_fonts = {
  87     "default": "default",
  88     "ae": "ae",
  89     "times": "times",
  90     "palatino": "palatino",
  91     "helvet": "default",
  92     "avant": "default",
  93     "newcent": "newcent",
  94     "bookman": "bookman",
  95     "pslatex": "times",
  96 }
  97 sans_fonts = {
  98     "default": "default",
  99     "ae": "default",
 100     "times": "default",
 101     "palatino": "default",
 102     "helvet": "helvet",
 103     "avant": "avant",
 104     "newcent": "default",
 105     "bookman": "default",
 106     "pslatex": "helvet",
 107 }
 108 typewriter_fonts = {
 109     "default": "default",
 110     "ae": "default",
 111     "times": "default",
 112     "palatino": "default",
 113     "helvet": "default",
 114     "avant": "default",
 115     "newcent": "default",
 116     "bookman": "default",
 117     "pslatex": "courier",
 118 }
 119
 120
 121 def convert_font_settings(document):
 122     "Convert font settings."
 123     i = 0
 124     i = find_token_exact(document.header, "\\fontscheme", i)
 125     if i == -1:
 126         document.warning("Malformed LyX document: Missing `\\fontscheme'.")
 127         return
 128     font_scheme = get_value(document.header, "\\fontscheme", i, i + 1)
 129     if font_scheme == "":
 130         document.warning("Malformed LyX document: Empty `\\fontscheme'.")
 131         font_scheme = "default"
 132     if font_scheme not in list(roman_fonts.keys()):
 133         document.warning("Malformed LyX document: Unknown `\\fontscheme' `%s'." % font_scheme)
 134         font_scheme = "default"
 135     document.header[i : i + 1] = [
 136         "\\font_roman %s" % roman_fonts[font_scheme],
 137         "\\font_sans %s" % sans_fonts[font_scheme],
 138         "\\font_typewriter %s" % typewriter_fonts[font_scheme],
 139         "\\font_default_family default",
 140         "\\font_sc false",
 141         "\\font_osf false",
 142         "\\font_sf_scale 100",
 143         "\\font_tt_scale 100",
 144     ]
 145
 146
 147 def revert_font_settings(document):
 148     "Revert font settings."
 149     i = 0
 150     insert_line = -1
 151     fonts = {"roman": "default", "sans": "default", "typewriter": "default"}
 152     for family in "roman", "sans", "typewriter":
 153         name = "\\font_%s" % family
 154         i = find_token_exact(document.header, name, i)
 155         if i == -1:
 156             document.warning("Malformed LyX document: Missing `%s'." % name)
 157             i = 0
 158         else:
 159             if insert_line < 0:
 160                 insert_line = i
 161             fonts[family] = get_value(document.header, name, i, i + 1)
 162             del document.header[i]
 163     i = find_token_exact(document.header, "\\font_default_family", i)
 164     if i == -1:
 165         document.warning("Malformed LyX document: Missing `\\font_default_family'.")
 166         font_default_family = "default"
 167     else:
 168         font_default_family = get_value(document.header, "\\font_default_family", i, i + 1)
 169         del document.header[i]
 170     i = find_token_exact(document.header, "\\font_sc", i)
 171     if i == -1:
 172         document.warning("Malformed LyX document: Missing `\\font_sc'.")
 173         font_sc = "false"
 174     else:
 175         font_sc = get_value(document.header, "\\font_sc", i, i + 1)
 176         del document.header[i]
 177     if font_sc != "false":
 178         document.warning("Conversion of '\\font_sc' not yet implemented.")
 179     i = find_token_exact(document.header, "\\font_osf", i)
 180     if i == -1:
 181         document.warning("Malformed LyX document: Missing `\\font_osf'.")
 182         font_osf = "false"
 183     else:
 184         font_osf = get_value(document.header, "\\font_osf", i, i + 1)
 185         del document.header[i]
 186     i = find_token_exact(document.header, "\\font_sf_scale", i)
 187     if i == -1:
 188         document.warning("Malformed LyX document: Missing `\\font_sf_scale'.")
 189         font_sf_scale = "100"
 190     else:
 191         font_sf_scale = get_value(document.header, "\\font_sf_scale", i, i + 1)
 192         del document.header[i]
 193     if font_sf_scale != "100":
 194         document.warning("Conversion of '\\font_sf_scale' not yet implemented.")
 195     i = find_token_exact(document.header, "\\font_tt_scale", i)
 196     if i == -1:
 197         document.warning("Malformed LyX document: Missing `\\font_tt_scale'.")
 198         font_tt_scale = "100"
 199     else:
 200         font_tt_scale = get_value(document.header, "\\font_tt_scale", i, i + 1)
 201         del document.header[i]
 202     if font_tt_scale != "100":
 203         document.warning("Conversion of '\\font_tt_scale' not yet implemented.")
 204     for font_scheme in list(roman_fonts.keys()):
 205         if (
 206             roman_fonts[font_scheme] == fonts["roman"]
 207             and sans_fonts[font_scheme] == fonts["sans"]
 208             and typewriter_fonts[font_scheme] == fonts["typewriter"]
 209         ):
 210             document.header.insert(insert_line, "\\fontscheme %s" % font_scheme)
 211             if font_default_family != "default":
 212                 document.preamble.append(
 213                     "\\renewcommand{\\familydefault}{\\%s}" % font_default_family
 214                 )
 215             if font_osf == "true":
 216                 document.warning("Ignoring `\\font_osf = true'")
 217             return
 218     font_scheme = "default"
 219     document.header.insert(insert_line, "\\fontscheme %s" % font_scheme)
 220     if fonts["roman"] == "cmr":
 221         document.preamble.append("\\renewcommand{\\rmdefault}{cmr}")
 222         if font_osf == "true":
 223             document.preamble.append("\\usepackage{eco}")
 224             font_osf = "false"
 225     for font in "lmodern", "charter", "utopia", "beraserif", "ccfonts", "chancery":
 226         if fonts["roman"] == font:
 227             document.preamble.append("\\usepackage{%s}" % font)
 228     for font in "cmss", "lmss", "cmbr":
 229         if fonts["sans"] == font:
 230             document.preamble.append("\\renewcommand{\\sfdefault}{%s}" % font)
 231     for font in "berasans":
 232         if fonts["sans"] == font:
 233             document.preamble.append("\\usepackage{%s}" % font)
 234     for font in "cmtt", "lmtt", "cmtl":
 235         if fonts["typewriter"] == font:
 236             document.preamble.append("\\renewcommand{\\ttdefault}{%s}" % font)
 237     for font in "courier", "beramono", "luximono":
 238         if fonts["typewriter"] == font:
 239             document.preamble.append("\\usepackage{%s}" % font)
 240     if font_default_family != "default":
 241         document.preamble.append("\\renewcommand{\\familydefault}{\\%s}" % font_default_family)
 242     if font_osf == "true":
 243         document.warning("Ignoring `\\font_osf = true'")
 244
 245
 246 def revert_booktabs(document):
 247     "We remove the booktabs flag or everything else will become a mess."
 248     re_row = re.compile(r'^<row.*space="[^"]+".*>$')
 249     re_tspace = re.compile(r'\s+topspace="[^"]+"')
 250     re_bspace = re.compile(r'\s+bottomspace="[^"]+"')
 251     re_ispace = re.compile(r'\s+interlinespace="[^"]+"')
 252     i = 0
 253     while True:
 254         i = find_token(document.body, "\\begin_inset Tabular", i)
 255         if i == -1:
 256             return
 257         j = find_end_of_inset(document.body, i + 1)
 258         if j == -1:
 259             document.warning("Malformed LyX document: Could not find end of tabular.")
 260             continue
 261         for k in range(i, j):
 262             if re.search('^<features.* booktabs="true".*>$', document.body[k]):
 263                 document.warning("Converting 'booktabs' table to normal table.")
 264                 document.body[k] = document.body[k].replace(' booktabs="true"', "")
 265             if re.search(re_row, document.body[k]):
 266                 document.warning("Removing extra row space.")
 267                 document.body[k] = re_tspace.sub("", document.body[k])
 268                 document.body[k] = re_bspace.sub("", document.body[k])
 269                 document.body[k] = re_ispace.sub("", document.body[k])
 270         i = i + 1
 271
 272
def convert_multiencoding(document, forward):
    """Fix files with multiple encodings.
    Files with an inputencoding of "auto" or "default" and multiple languages
    where at least two languages have different default encodings are encoded
    in multiple encodings for file formats < 249. These files are incorrectly
    read and written (as if the whole file was in the encoding of the main
    language).
    This is not true for files written by CJK-LyX, they are always in the locale
    encoding.

    This function
    - converts from fake unicode values to true unicode if forward is true, and
    - converts from true unicode values to fake unicode if forward is false.
    document.encoding must be set to the old value (format 248) in both cases.

    We do this here and not in LyX.py because it is far easier to do the
    necessary parsing in modern formats than in ancient ones.

    The body is rewritten in place and nothing is returned. Nothing is done
    when document.cjk_encoding is non-empty, or when document.inputencoding
    is neither "auto" nor "default".
    """
    # Insets whose layouts start over with the encoding of the document
    # language instead of inheriting the encoding currently in effect.
    inset_types = ["Foot", "Note"]
    if document.cjk_encoding != "":
        return
    # Top of the stack is the encoding in effect for the current line; the
    # bottom entry is the document encoding and is never popped.
    encoding_stack = [document.encoding]
    # Types of the insets that are currently open, innermost last.
    insets = []
    lang_re = re.compile(r"^\\lang\s(\S+)")
    inset_re = re.compile(r"^\\begin_inset\s(\S+)")
    if not forward:  # no need to read file unless we are reverting
        spec_chars = read_unicodesymbols()

    if document.inputencoding == "auto" or document.inputencoding == "default":
        i = 0
        while i < len(document.body):
            result = lang_re.match(document.body[i])
            if result:
                # A language switch changes the encoding of the current level.
                language = result.group(1)
                if language == "default":
                    document.warning(
                        f"Resetting encoding from {encoding_stack[-1]} to {document.encoding}.",
                        3,
                    )
                    encoding_stack[-1] = document.encoding
                else:
                    from lyx2lyx_lang import lang

                    # NOTE(review): a language missing from `lang` raises
                    # KeyError here; entry [3] is used as the encoding name.
                    document.warning(
                        f"Setting encoding from {encoding_stack[-1]} to {lang[language][3]}.",
                        3,
                    )
                    encoding_stack[-1] = lang[language][3]
            elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
                # Entering a layout pushes a new encoding level.
                document.warning("Adding nested encoding %s." % encoding_stack[-1], 3)
                if len(insets) > 0 and insets[-1] in inset_types:
                    from lyx2lyx_lang import lang

                    encoding_stack.append(lang[document.language][3])
                else:
                    encoding_stack.append(encoding_stack[-1])
            elif find_token(document.body, "\\end_layout", i, i + 1) == i:
                # Leaving a layout pops the level pushed by its begin_layout.
                document.warning("Removing nested encoding %s." % encoding_stack[-1], 3)
                if len(encoding_stack) == 1:
                    # Don't remove the document encoding from the stack
                    document.warning("Malformed LyX document: Unexpected `\\end_layout'.")
                else:
                    del encoding_stack[-1]
            elif find_token(document.body, "\\begin_inset", i, i + 1) == i:
                inset_result = inset_re.match(document.body[i])
                if inset_result:
                    insets.append(inset_result.group(1))
                else:
                    # Keep the stack balanced even when no inset type follows.
                    insets.append("")
            elif find_token(document.body, "\\end_inset", i, i + 1) == i:
                # NOTE(review): raises IndexError if an \end_inset appears
                # while no inset is open.
                del insets[-1]
            # Only lines whose effective encoding differs from the document
            # encoding need to be re-coded.
            if encoding_stack[-1] != document.encoding:
                if forward:
                    # This line has been incorrectly interpreted as if it was
                    # encoded in 'encoding'.
                    # Convert back to the 8bit string that was in the file.
                    orig = document.body[i].encode(document.encoding)
                    # Convert the 8bit string that was in the file to unicode
                    # with the correct encoding.
                    document.body[i] = orig.decode(encoding_stack[-1])
                else:
                    try:
                        # Convert unicode to the 8bit string that will be written
                        # to the file with the correct encoding.
                        orig = document.body[i].encode(encoding_stack[-1])
                        # Convert the 8bit string that will be written to the
                        # file to fake unicode with the encoding that will later
                        # be used when writing to the file.
                        document.body[i] = orig.decode(document.encoding)
                    except:
                        # Any failure above (the bare except catches every
                        # exception) falls back to replacing the offending
                        # characters. The replacement may contain newlines,
                        # so it is spliced in as several body lines and i is
                        # moved to the last of them.
                        mod_line = revert_unicode_line(document, i, insets, spec_chars)
                        document.body[i : i + 1] = mod_line.split("\n")
                        i += len(mod_line.split("\n")) - 1
            i += 1
 365                         i += len(mod_line.split("\n")) - 1
 366             i += 1
 367
 368
 369 def convert_utf8(document):
 370     "Set document encoding to UTF-8."
 371     convert_multiencoding(document, True)
 372     document.encoding = "utf8"
 373
 374
 375 def revert_utf8(document):
 376     "Set document encoding to the value corresponding to inputencoding."
 377     i = find_token(document.header, "\\inputencoding", 0)
 378     if i == -1:
 379         document.header.append("\\inputencoding auto")
 380     elif get_value(document.header, "\\inputencoding", i) == "utf8":
 381         document.header[i] = "\\inputencoding auto"
 382     document.inputencoding = get_value(document.header, "\\inputencoding", 0)
 383     document.encoding = get_encoding(
 384         document.language, document.inputencoding, 248, document.cjk_encoding
 385     )
 386     convert_multiencoding(document, False)
 387
 388
 389 # FIXME: Use the version in unicode_symbols.py which has some bug fixes
 390 def read_unicodesymbols():
 391     "Read the unicodesymbols list of unicode characters and corresponding commands."
 392     pathname = os.path.abspath(os.path.dirname(sys.argv[0]))
 393     fp = open(os.path.join(pathname.strip("lyx2lyx"), "unicodesymbols"))
 394     spec_chars = {}
 395     for line in fp.readlines():
 396         if line[0] != "#":
 397             line = line.replace(' "', " ")  # remove all quotation marks with spaces before
 398             line = line.replace('" ', " ")  # remove all quotation marks with spaces after
 399             line = line.replace(r"\"", '"')  # replace \" by " (for characters with diaeresis)
 400             try:
 401                 # flag1 and flag2 are preamble and other flags
 402                 [ucs4, command, flag1, flag2] = line.split(None, 3)
 403                 spec_chars[chr(eval(ucs4))] = [command, flag1, flag2]
 404             except:
 405                 pass
 406     fp.close()
 407     return spec_chars
 408
 409
def revert_unicode_line(document, i, insets, spec_chars, replacement_character="???"):
    """Return body line i with the characters that cannot be encoded replaced.

    Each character of document.body[i] is test-encoded with
    document.encoding. Characters that fail are replaced by the command
    stored for them in spec_chars, wrapped in an ERT or Formula inset
    depending on the innermost entry of insets, or by replacement_character
    when spec_chars has no entry.

    The returned string may contain newlines; callers split it into several
    body lines. document.body[i] itself is not modified, but
    document.body[i - 1] loses its last character when a combining character
    at the start of the line absorbs it.
    """
    # Define strings to start and end ERT and math insets
    ert_intro = (
        "\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s" % document.default_layout
    )
    ert_outro = "\n\\end_layout\n\n\\end_inset\n"
    math_intro = "\n\\begin_inset Formula $"
    math_outro = "$\n\\end_inset"

    mod_line = ""
    # last_char is the character a combining command would be applied to. It
    # starts as the last character of the previous line, unless this is the
    # first line or is_inset_line() reports the previous line as an inset line.
    # NOTE(review): is_inset_line is defined outside this section - confirm
    # its exact meaning.
    if i and not is_inset_line(document, i - 1):
        last_char = document.body[i - 1][-1:]
    else:
        last_char = ""

    line = document.body[i]
    for character in line:
        try:
            # Try to write the character
            dummy = character.encode(document.encoding)
            mod_line += character
            last_char = character
        except:
            # The bare except treats every failure as "cannot be encoded".
            # Try to replace with ERT/math inset
            if character in spec_chars:
                command = spec_chars[character][0]  # the command to replace unicode
                flag1 = spec_chars[character][1]
                flag2 = spec_chars[character][2]
                if flag1.find("combining") > -1 or flag2.find("combining") > -1:
                    # We have a character that should be combined with the previous
                    command += "{" + last_char + "}"
                    # Remove the last character. Ignore if it is whitespace
                    if len(last_char.rstrip()):
                        # last_char was found and is not whitespace
                        if mod_line:
                            mod_line = mod_line[:-1]
                        else:  # last_char belongs to the last line
                            document.body[i - 1] = document.body[i - 1][:-1]
                    else:
                        # The last character was replaced by a command. For now it is
                        # ignored. This could be handled better.
                        pass
                # Only commands beginning with two backslash characters are
                # rewritten; anything else is appended to the line as is.
                if command[0:2] == "\\\\":
                    if command[2:12] == "ensuremath":
                        if insets and insets[-1] == "ERT":
                            # math in ERT
                            command = command.replace("\\\\ensuremath{\\\\", "$\n\\backslash\n")
                            command = command.replace("}", "$\n")
                        elif not insets or insets[-1] != "Formula":
                            # add a math inset with the replacement character
                            command = command.replace("\\\\ensuremath{\\", math_intro)
                            command = command.replace("}", math_outro)
                        else:
                            # we are already in a math inset
                            command = command.replace("\\\\ensuremath{\\", "")
                            command = command.replace("}", "")
                    else:
                        if insets and insets[-1] == "Formula":
                            # avoid putting an ERT in a math; instead put command as text
                            command = command.replace("\\\\", r"\mathrm{")
                            command = command + "}"
                        elif not insets or insets[-1] != "ERT":
                            # add an ERT inset with the replacement character
                            command = command.replace("\\\\", "\n\\backslash\n")
                            command = ert_intro + command + ert_outro
                        else:
                            # already inside an ERT: only rewrite the backslashes
                            command = command.replace("\\\\", "\n\\backslash\n")
                    last_char = ""  # indicate that the character should not be removed
                mod_line += command
            else:
                # Replace with replacement string
                mod_line += replacement_character
    return mod_line
 483
 484
 485 def revert_unicode(document):
 486     """Transform unicode characters that can not be written using the
 487     document encoding to commands according to the unicodesymbols
 488     file. Characters that can not be replaced by commands are replaced by
 489     an replacement string.  Flags other than 'combined' are currently not
 490     implemented."""
 491     spec_chars = read_unicodesymbols()
 492     insets = []  # list of active insets
 493
 494     # Go through the document to capture all combining characters
 495     i = 0
 496     while i < len(document.body):
 497         line = document.body[i]
 498         # Check for insets
 499         if line.find("\\begin_inset") > -1:
 500             insets.append(line[13:].split()[0])
 501         if line.find("\\end_inset") > -1:
 502             del insets[-1]
 503
 504         # Try to write the line
 505         try:
 506             # If all goes well the line is written here
 507             dummy = line.encode(document.encoding)
 508             i += 1
 509         except:
 510             # Error, some character(s) in the line need to be replaced
 511             mod_line = revert_unicode_line(document, i, insets, spec_chars)
 512             document.body[i : i + 1] = mod_line.split("\n")
 513             i += len(mod_line.split("\n"))
 514
 515
 516 def revert_cs_label(document):
 517     "Remove status flag of charstyle label."
 518     i = 0
 519     while True:
 520         i = find_token(document.body, "\\begin_inset CharStyle", i)
 521         if i == -1:
 522             return
 523         # Seach for a line starting 'show_label'
 524         # If it is not there, break with a warning message
 525         i = i + 1
 526         while True:
 527             if document.body[i][:10] == "show_label":
 528                 del document.body[i]
 529                 break
 530             elif document.body[i][:13] == "\\begin_layout":
 531                 document.warning("Malformed LyX document: Missing 'show_label'.")
 532                 break
 533             i = i + 1
 534
 535         i = i + 1
 536
 537
 538 def convert_bibitem(document):
 539     r"""Convert
 540     \bibitem [option]{argument}
 541
 542     to
 543
 544     \begin_inset LatexCommand bibitem
 545     label "option"
 546     key "argument"
 547
 548     \end_inset
 549
 550     This must be called after convert_commandparams.
 551     """
 552     i = 0
 553     while True:
 554         i = find_token(document.body, "\\bibitem", i)
 555         if i == -1:
 556             break
 557         j = document.body[i].find("[") + 1
 558         k = document.body[i].rfind("]")
 559         if j == 0:  # No optional argument found
 560             option = None
 561         else:
 562             option = document.body[i][j:k]
 563         j = document.body[i].rfind("{") + 1
 564         k = document.body[i].rfind("}")
 565         argument = document.body[i][j:k]
 566         lines = ["\\begin_inset LatexCommand bibitem"]
 567         if option != None:
 568             lines.append('label "%s"' % option.replace('"', '\\"'))
 569         lines.append('key "%s"' % argument.replace('"', '\\"'))
 570         lines.append("")
 571         lines.append("\\end_inset")
 572         document.body[i : i + 1] = lines
 573         i = i + 1
 574
 575
 576 commandparams_info = {
 577     # command : [option1, option2, argument]
 578     "bibitem": ["label", "", "key"],
 579     "bibtex": ["options", "btprint", "bibfiles"],
 580     "cite": ["after", "before", "key"],
 581     "citet": ["after", "before", "key"],
 582     "citep": ["after", "before", "key"],
 583     "citealt": ["after", "before", "key"],
 584     "citealp": ["after", "before", "key"],
 585     "citeauthor": ["after", "before", "key"],
 586     "citeyear": ["after", "before", "key"],
 587     "citeyearpar": ["after", "before", "key"],
 588     "citet*": ["after", "before", "key"],
 589     "citep*": ["after", "before", "key"],
 590     "citealt*": ["after", "before", "key"],
 591     "citealp*": ["after", "before", "key"],
 592     "citeauthor*": ["after", "before", "key"],
 593     "Citet": ["after", "before", "key"],
 594     "Citep": ["after", "before", "key"],
 595     "Citealt": ["after", "before", "key"],
 596     "Citealp": ["after", "before", "key"],
 597     "Citeauthor": ["after", "before", "key"],
 598     "Citet*": ["after", "before", "key"],
 599     "Citep*": ["after", "before", "key"],
 600     "Citealt*": ["after", "before", "key"],
 601     "Citealp*": ["after", "before", "key"],
 602     "Citeauthor*": ["after", "before", "key"],
 603     "citefield": ["after", "before", "key"],
 604     "citetitle": ["after", "before", "key"],
 605     "cite*": ["after", "before", "key"],
 606     "hfill": ["", "", ""],
 607     "index": ["", "", "name"],
 608     "printindex": ["", "", "name"],
 609     "label": ["", "", "name"],
 610     "eqref": ["name", "", "reference"],
 611     "pageref": ["name", "", "reference"],
 612     "prettyref": ["name", "", "reference"],
 613     "ref": ["name", "", "reference"],
 614     "vpageref": ["name", "", "reference"],
 615     "vref": ["name", "", "reference"],
 616     "tableofcontents": ["", "", "type"],
 617     "htmlurl": ["name", "", "target"],
 618     "url": ["name", "", "target"],
 619 }
 620
 621
 622 def convert_commandparams(document):
 623     """Convert
 624
 625     \\begin_inset LatexCommand \\cmdname[opt1][opt2]{arg}
 626     \\end_inset
 627
 628     to
 629
 630     \\begin_inset LatexCommand cmdname
 631     name1 "opt1"
 632     name2 "opt2"
 633     name3 "arg"
 634     \\end_inset
 635
 636     name1, name2 and name3 can be different for each command.
 637     """
 638     # \begin_inset LatexCommand bibitem was not the official version (see
 639     # convert_bibitem()), but could be read in, so we convert it here, too.
 640
 641     i = 0
 642     while True:
 643         i = find_token(document.body, "\\begin_inset LatexCommand", i)
 644         if i == -1:
 645             break
 646         command = document.body[i][26:].strip()
 647         if command == "":
 648             document.warning("Malformed LyX document: Missing LatexCommand name.")
 649             i = i + 1
 650             continue
 651
 652         j = find_token(document.body, "\\end_inset", i + 1)
 653         if j == -1:
 654             document.warning("Malformed document")
 655         else:
 656             command += "".join(document.body[i + 1 : j])
 657             document.body[i + 1 : j] = []
 658
 659         # The following parser is taken from the original InsetCommandParams::scanCommand
 660         name = ""
 661         option1 = ""
 662         option2 = ""
 663         argument = ""
 664         state = "WS"
 665         # Used to handle things like \command[foo[bar]]{foo{bar}}
 666         nestdepth = 0
 667         b = 0
 668         for c in command:
 669             if (
 670                 (state == "CMDNAME" and c == " ")
 671                 or (state == "CMDNAME" and c == "[")
 672                 or (state == "CMDNAME" and c == "{")
 673             ):
 674                 state = "WS"
 675             if (
 676                 (state == "OPTION" and c == "]")
 677                 or (state == "SECOPTION" and c == "]")
 678                 or (state == "CONTENT" and c == "}")
 679             ):
 680                 if nestdepth == 0:
 681                     state = "WS"
 682                 else:
 683                     nestdepth = nestdepth - 1
 684             if (
 685                 (state == "OPTION" and c == "[")
 686                 or (state == "SECOPTION" and c == "[")
 687                 or (state == "CONTENT" and c == "{")
 688             ):
 689                 nestdepth = nestdepth + 1
 690             if state == "CMDNAME":
 691                 name += c
 692             elif state == "OPTION":
 693                 option1 += c
 694             elif state == "SECOPTION":
 695                 option2 += c
 696             elif state == "CONTENT":
 697                 argument += c
 698             elif state == "WS":
 699                 if c == "\\":
 700                     state = "CMDNAME"
 701                 elif c == "[" and b != "]":
 702                     state = "OPTION"
 703                     nestdepth = 0  # Just to be sure
 704                 elif c == "[" and b == "]":
 705                     state = "SECOPTION"
 706                     nestdepth = 0  # Just to be sure
 707                 elif c == "{":
 708                     state = "CONTENT"
 709                     nestdepth = 0  # Just to be sure
 710             b = c
 711
 712         # Now we have parsed the command, output the parameters
 713         lines = ["\\begin_inset LatexCommand %s" % name]
 714         if option1 != "":
 715             if commandparams_info[name][0] == "":
 716                 document.warning(f"Ignoring invalid option `{option1}' of command `{name}'.")
 717             else:
 718                 lines.append(
 719                     '{} "{}"'.format(
 720                         commandparams_info[name][0],
 721                         option1.replace("\\", "\\\\").replace('"', '\\"'),
 722                     )
 723                 )
 724         if option2 != "":
 725             if commandparams_info[name][1] == "":
 726                 document.warning(
 727                     f"Ignoring invalid second option `{option2}' of command `{name}'."
 728                 )
 729             else:
 730                 lines.append(
 731                     '{} "{}"'.format(
 732                         commandparams_info[name][1],
 733                         option2.replace("\\", "\\\\").replace('"', '\\"'),
 734                     )
 735                 )
 736         if argument != "":
 737             if commandparams_info[name][2] == "":
 738                 document.warning(f"Ignoring invalid argument `{argument}' of command `{name}'.")
 739             else:
 740                 lines.append(
 741                     '{} "{}"'.format(
 742                         commandparams_info[name][2],
 743                         argument.replace("\\", "\\\\").replace('"', '\\"'),
 744                     )
 745                 )
 746         document.body[i : i + 1] = lines
 747         i = i + 1
 748
 749
 750 def revert_commandparams(document):
 751     regex = re.compile(r"(\S+)\s+(.+)")
 752     i = 0
 753     while True:
 754         i = find_token(document.body, "\\begin_inset LatexCommand", i)
 755         if i == -1:
 756             break
 757         name = document.body[i].split()[2]
 758         j = find_end_of_inset(document.body, i)
 759         preview_line = ""
 760         option1 = ""
 761         option2 = ""
 762         argument = ""
 763         for k in range(i + 1, j):
 764             match = re.match(regex, document.body[k])
 765             if match:
 766                 pname = match.group(1)
 767                 pvalue = match.group(2)
 768                 if pname == "preview":
 769                     preview_line = document.body[k]
 770                 elif commandparams_info[name][0] != "" and pname == commandparams_info[name][0]:
 771                     option1 = pvalue.strip('"').replace('\\"', '"').replace("\\\\", "\\")
 772                 elif commandparams_info[name][1] != "" and pname == commandparams_info[name][1]:
 773                     option2 = pvalue.strip('"').replace('\\"', '"').replace("\\\\", "\\")
 774                 elif commandparams_info[name][2] != "" and pname == commandparams_info[name][2]:
 775                     argument = pvalue.strip('"').replace('\\"', '"').replace("\\\\", "\\")
 776             elif document.body[k].strip() != "":
 777                 document.warning(
 778                     f"Ignoring unknown contents `{document.body[k]}' in command inset {name}."
 779                 )
 780         if name == "bibitem":
 781             if option1 == "":
 782                 lines = ["\\bibitem {%s}" % argument]
 783             else:
 784                 lines = [f"\\bibitem [{option1}]{{{argument}}}"]
 785         else:
 786             if option1 == "":
 787                 if option2 == "":
 788                     lines = [f"\\begin_inset LatexCommand \\{name}{{{argument}}}"]
 789                 else:
 790                     lines = [f"\\begin_inset LatexCommand \\{name}[][{option2}]{{{argument}}}"]
 791             else:
 792                 if option2 == "":
 793                     lines = [f"\\begin_inset LatexCommand \\{name}[{option1}]{{{argument}}}"]
 794                 else:
 795                     lines = [
 796                         f"\\begin_inset LatexCommand \\{name}[{option1}][{option2}]{{{argument}}}"
 797                     ]
 798         if name != "bibitem":
 799             if preview_line != "":
 800                 lines.append(preview_line)
 801             lines.append("")
 802             lines.append("\\end_inset")
 803         document.body[i : j + 1] = lines
 804         i += len(lines) + 1
 805
 806
 807 def revert_nomenclature(document):
 808     "Convert nomenclature entry to ERT."
 809     regex = re.compile(r"(\S+)\s+(.+)")
 810     i = 0
 811     use_nomencl = 0
 812     while True:
 813         i = find_token(document.body, "\\begin_inset LatexCommand nomenclature", i)
 814         if i == -1:
 815             break
 816         use_nomencl = 1
 817         j = find_end_of_inset(document.body, i + 1)
 818         preview_line = ""
 819         symbol = ""
 820         description = ""
 821         prefix = ""
 822         for k in range(i + 1, j):
 823             match = re.match(regex, document.body[k])
 824             if match:
 825                 name = match.group(1)
 826                 value = match.group(2)
 827                 if name == "preview":
 828                     preview_line = document.body[k]
 829                 elif name == "symbol":
 830                     symbol = value.strip('"').replace('\\"', '"')
 831                 elif name == "description":
 832                     description = value.strip('"').replace('\\"', '"')
 833                 elif name == "prefix":
 834                     prefix = value.strip('"').replace('\\"', '"')
 835             elif document.body[k].strip() != "":
 836                 document.warning(
 837                     "Ignoring unknown contents `%s' in nomenclature inset." % document.body[k]
 838                 )
 839         if prefix == "":
 840             command = f"nomenclature{{{symbol}}}{{{description}}}"
 841         else:
 842             command = f"nomenclature[{prefix}]{{{symbol}}}{{{description}}}"
 843         document.body[i : j + 1] = [
 844             "\\begin_inset ERT",
 845             "status collapsed",
 846             "",
 847             "\\begin_layout %s" % document.default_layout,
 848             "",
 849             "",
 850             "\\backslash",
 851             command,
 852             "\\end_layout",
 853             "",
 854             "\\end_inset",
 855         ]
 856         i = i + 11
 857     if (
 858         use_nomencl
 859         and find_token(document.preamble, "\\usepackage{nomencl}[2005/09/22]", 0) == -1
 860     ):
 861         document.preamble.append("\\usepackage{nomencl}[2005/09/22]")
 862         document.preamble.append("\\makenomenclature")
 863
 864
 865 def revert_printnomenclature(document):
 866     "Convert printnomenclature to ERT."
 867     regex = re.compile(r"(\S+)\s+(.+)")
 868     i = 0
 869     use_nomencl = 0
 870     while True:
 871         i = find_token(document.body, "\\begin_inset LatexCommand printnomenclature", i)
 872         if i == -1:
 873             break
 874         use_nomencl = 1
 875         j = find_end_of_inset(document.body, i + 1)
 876         preview_line = ""
 877         labelwidth = ""
 878         for k in range(i + 1, j):
 879             match = re.match(regex, document.body[k])
 880             if match:
 881                 name = match.group(1)
 882                 value = match.group(2)
 883                 if name == "preview":
 884                     preview_line = document.body[k]
 885                 elif name == "labelwidth":
 886                     labelwidth = value.strip('"').replace('\\"', '"')
 887             elif document.body[k].strip() != "":
 888                 document.warning(
 889                     "Ignoring unknown contents `%s' in printnomenclature inset."
 890                     % document.body[k]
 891                 )
 892         if labelwidth == "":
 893             command = "nomenclature{}"
 894         else:
 895             command = "nomenclature[%s]" % labelwidth
 896         document.body[i : j + 1] = [
 897             "\\begin_inset ERT",
 898             "status collapsed",
 899             "",
 900             "\\begin_layout %s" % document.default_layout,
 901             "",
 902             "",
 903             "\\backslash",
 904             command,
 905             "\\end_layout",
 906             "",
 907             "\\end_inset",
 908         ]
 909         i = i + 11
 910     if (
 911         use_nomencl
 912         and find_token(document.preamble, "\\usepackage{nomencl}[2005/09/22]", 0) == -1
 913     ):
 914         document.preamble.append("\\usepackage{nomencl}[2005/09/22]")
 915         document.preamble.append("\\makenomenclature")
 916
 917
 918 def convert_esint(document):
 919     "Add \\use_esint setting to header."
 920     i = find_token(document.header, "\\cite_engine", 0)
 921     if i == -1:
 922         document.warning("Malformed LyX document: Missing `\\cite_engine'.")
 923         return
 924     # 0 is off, 1 is auto, 2 is on.
 925     document.header.insert(i, "\\use_esint 0")
 926
 927
 928 def revert_esint(document):
 929     "Remove \\use_esint setting from header."
 930     i = find_token(document.header, "\\use_esint", 0)
 931     if i == -1:
 932         document.warning("Malformed LyX document: Missing `\\use_esint'.")
 933         return
 934     use_esint = document.header[i].split()[1]
 935     del document.header[i]
 936     # 0 is off, 1 is auto, 2 is on.
 937     if use_esint == 2:
 938         document.preamble.append("\\usepackage{esint}")
 939
 940
 941 def revert_clearpage(document):
 942     "clearpage -> ERT"
 943     i = 0
 944     while True:
 945         i = find_token(document.body, "\\clearpage", i)
 946         if i == -1:
 947             break
 948         document.body[i : i + 1] = [
 949             "\\begin_inset ERT",
 950             "status collapsed",
 951             "",
 952             "\\begin_layout %s" % document.default_layout,
 953             "",
 954             "",
 955             "\\backslash",
 956             "clearpage",
 957             "\\end_layout",
 958             "",
 959             "\\end_inset",
 960         ]
 961     i = i + 1
 962
 963
 964 def revert_cleardoublepage(document):
 965     "cleardoublepage -> ERT"
 966     i = 0
 967     while True:
 968         i = find_token(document.body, "\\cleardoublepage", i)
 969         if i == -1:
 970             break
 971         document.body[i : i + 1] = [
 972             "\\begin_inset ERT",
 973             "status collapsed",
 974             "",
 975             "\\begin_layout %s" % document.default_layout,
 976             "",
 977             "",
 978             "\\backslash",
 979             "cleardoublepage",
 980             "\\end_layout",
 981             "",
 982             "\\end_inset",
 983         ]
 984     i = i + 1
 985
 986
 987 def convert_lyxline(document):
 988     r"remove fontsize commands for \lyxline"
 989     # The problematic is: The old \lyxline definition doesn't handle the fontsize
 990     # to change the line thickness. The new definiton does this so that imported
 991     # \lyxlines would have a different line thickness. The eventual fontsize command
 992     # before \lyxline is therefore removed to get the same output.
 993     fontsizes = [
 994         "tiny",
 995         "scriptsize",
 996         "footnotesize",
 997         "small",
 998         "normalsize",
 999         "large",
1000         "Large",
1001         "LARGE",
1002         "huge",
1003         "Huge",
1004     ]
1005     for n in range(0, len(fontsizes)):
1006         i = 0
1007         k = 0
1008         while i < len(document.body):
1009             i = find_token(document.body, "\\size " + fontsizes[n], i)
1010             k = find_token(document.body, "\\lyxline", i)
1011             # the corresponding fontsize command is always 2 lines before the \lyxline
1012             if i != -1 and k == i + 2:
1013                 document.body[i : i + 1] = []
1014             else:
1015                 break
1016         i = i + 1
1017
1018
def revert_encodings(document):
    "Replace input encodings from the list below by auto."
    header = document.header
    new_encodings = (
        "8859-6",
        "8859-8",
        "cp437",
        "cp437de",
        "cp850",
        "cp852",
        "cp855",
        "cp858",
        "cp862",
        "cp865",
        "cp866",
        "cp1250",
        "cp1252",
        "cp1256",
        "cp1257",
        "latin10",
        "pt254",
        "tis620-0",
    )
    position = find_token(header, "\\inputencoding", 0)
    if position == -1:
        # No setting at all: add one
        header.append("\\inputencoding auto")
    elif get_value(header, "\\inputencoding", position) in new_encodings:
        header[position] = "\\inputencoding auto"
    # Keep the cached value in sync with the header
    document.inputencoding = get_value(header, "\\inputencoding", 0)
1049
1050
def convert_caption(document):
    "Wrap the contents of every Caption layout in a Caption inset."
    body = document.body
    start = 0
    while True:
        start = find_token(body, "\\begin_layout Caption", start)
        if start == -1:
            return
        end = find_end_of_layout(body, start)
        if end == -1:
            document.warning("Malformed LyX document: Missing `\\end_layout'.")
            return

        # Close the inner layout and the inset first: this happens behind
        # `start', so the index stays valid for the second replacement.
        body[end:end] = ["\\end_layout", "", "\\end_inset", "", ""]
        plain_layout = "\\begin_layout %s" % document.default_layout
        body[start : start + 1] = [
            plain_layout,
            "\\begin_inset Caption",
            "",
            plain_layout,
        ]
        start += 1
1071
1072
def revert_caption(document):
    """Convert caption insets to caption layouts.

    This assumes that the text class has a caption style.
    """
    body = document.body
    pos = 0
    while True:
        pos = find_token(body, "\\begin_inset Caption", pos)
        if pos == -1:
            return

        # If the inset is the first thing in its paragraph, the paragraph
        # start is removed; otherwise the paragraph is closed before it.
        para_start = find_token_backwards(body, "\\begin_layout", pos)
        if para_start == -1:
            document.warning("Malformed LyX document: Missing `\\begin_layout'.")
            return
        para_line = body[para_start]
        if any(line != "" for line in body[para_start + 1 : pos]):
            body[pos:pos] = ["\\end_layout", ""]
            pos += 2
        else:
            del body[para_start:pos]
            pos = para_start

        # Start of the layout inside the inset, and end of the inset
        inner_start = find_token(body, "\\begin_layout", pos)
        if inner_start == -1:
            document.warning("Malformed LyX document: Missing `\\begin_layout'.")
            return
        inset_end = find_end_of_inset(body, pos)
        if inset_end == -1:
            document.warning("Malformed LyX document: Missing `\\end_inset'.")
            return

        # If the inset is the last thing in its paragraph, the paragraph end
        # is removed; otherwise the old paragraph is reopened after it.
        para_end = find_token(body, "\\end_layout", inset_end)
        if para_end == -1:
            document.warning("Malformed LyX document: Missing `\\end_layout'.")
            return
        if any(line != "" for line in body[inset_end + 1 : para_end]):
            body[inset_end + 1 : inset_end + 1] = [para_line, ""]
        else:
            del body[inset_end + 1 : para_end + 1]

        # Drop \end_inset and the inner \begin_layout (each with one empty
        # line that may follow) and turn \begin_inset into the Caption
        # layout. The inset can hold only one paragraph, so its \end_layout
        # closes the new Caption layout. The higher index is handled first.
        for index in (inset_end, inner_start):
            del body[index]
            if body[index] == "":
                del body[index]
        body[pos] = "\\begin_layout Caption"
        if body[pos + 1] == "":
            del body[pos + 1]
        pos += 1
1145
1146
# Accents of InsetLaTeXAccent: LaTeX accent command -> combining character
accent_map = {
    "`": "\N{COMBINING GRAVE ACCENT}",
    "'": "\N{COMBINING ACUTE ACCENT}",
    "^": "\N{COMBINING CIRCUMFLEX ACCENT}",
    "~": "\N{COMBINING TILDE}",
    "=": "\N{COMBINING MACRON}",
    "u": "\N{COMBINING BREVE}",
    ".": "\N{COMBINING DOT ABOVE}",
    '"': "\N{COMBINING DIAERESIS}",
    "r": "\N{COMBINING RING ABOVE}",
    "H": "\N{COMBINING DOUBLE ACUTE ACCENT}",
    "v": "\N{COMBINING CARON}",
    "b": "\N{COMBINING MINUS SIGN BELOW}",
    "d": "\N{COMBINING DOT BELOW}",
    "c": "\N{COMBINING CEDILLA}",
    "k": "\N{COMBINING OGONEK}",
    # The tie is special: It spans two characters, but only one is given
    # as argument, so we don't need to treat it differently.
    "t": "\N{COMBINING DOUBLE INVERTED BREVE}",
}
1168
1169
# Accents of InsetLaTeXAccent that take no argument: command -> character
special_accent_map = {
    "i": "\N{LATIN SMALL LETTER DOTLESS I}",
    "j": "\N{LATIN SMALL LETTER DOTLESS J}",
    "l": "\N{LATIN SMALL LETTER L WITH STROKE}",
    "L": "\N{LATIN CAPITAL LETTER L WITH STROKE}",
}
1177
1178
# Commands that may occur as the accented argument of InsetLaTeXAccent
accented_map = {
    "\\i": "\N{LATIN SMALL LETTER DOTLESS I}",
    "\\j": "\N{LATIN SMALL LETTER DOTLESS J}",
}
1184
1185
def _convert_accent(accent, accented_char):
    """Return the unicode text for `accent' applied to `accented_char'.

    The result is normalized to NFC. An empty string is returned if the
    combination cannot be converted.
    """
    kind, base = accent, accented_char
    if base == "":
        if kind in special_accent_map:
            return special_accent_map[kind]
        # LyX treats a missing char as a space
        base = " "
    elif kind == "q" and base in ("t", "d", "l", "L"):
        # Special caron, only used with t, d, l and L.
        # There is no map entry for it, since it is converted to the same
        # unicode character as the normal caron: \q{} is only defined if
        # babel with the czech or slovak language is used, and the normal
        # caron produces the correct output if the T1 font encoding is used.
        # For the same reason the other direction never produces \q{}.
        kind = "v"
    elif base in accented_map:
        base = accented_map[base]
    elif len(base) > 1:
        # Only accents on a single char can be converted
        return ""
    combining = accent_map.get(kind)
    if not combining:
        return ""
    return unicodedata.normalize("NFC", f"{base}{combining}")
1211
1212
def convert_ertbackslash(body, i, ert, default_layout):
    """Append `ert' to body[i] as valid ERT code.

    A backslash becomes `\\backslash ' and the text continues on a newly
    inserted line. A '\\n' ends the current layout and starts a new one of
    type `default_layout'. All other characters are appended unchanged.
    Returns the index of the line the text ended on (maybe incremented).
    """
    line = i
    for char in ert:
        if char == "\n":
            paragraph_break = [
                "\\end_layout",
                "",
                "\\begin_layout %s" % default_layout,
                "",
            ]
            body[line + 1 : line + 1] = paragraph_break
            # Continue on the empty line that follows the new \begin_layout
            line += len(paragraph_break)
            continue
        if char != "\\":
            body[line] += char
            continue
        body[line] += "\\backslash "
        line += 1
        body.insert(line, "")
    return line
1234
1235
def convert_accent(document):
    """Convert InsetLaTeXAccent to unicode characters.

    An inset that cannot be converted is written as ERT instead, and a
    warning is issued.
    """
    # The following forms are supported by LyX:
    # '\i \"{a}' (standard form, as written by LyX)
    # '\i \"{}' (standard form, as written by LyX if the accented char is a space)
    # '\i \"{ }' (also accepted if the accented char is a space)
    # '\i \" a'  (also accepted)
    # '\i \"'    (also accepted)
    re_wholeinset = re.compile(r"^(.*)(\\i\s+)(.*)$")
    re_contents = re.compile(r"^([^\s{]+)(.*)$")
    re_accentedcontents = re.compile(r"^\s*{?([^{}]*)}?\s*$")
    i = 0
    while True:
        i = find_re(document.body, re_wholeinset, i)
        if i == -1:
            return
        match = re_wholeinset.match(document.body[i])
        prefix = match.group(1)
        contents = match.group(3).strip()
        match = re_contents.match(contents)
        if match:
            # Strip first char (always \)
            accent = match.group(1)[1:]
            accented_contents = match.group(2).strip()
            match = re_accentedcontents.match(accented_contents)
            if match:
                accented_char = match.group(1)
                converted = _convert_accent(accent, accented_char)
                if converted != "":
                    document.body[i] = f"{prefix}{converted}"
                    i += 1
                    continue
                # Normalize contents
                contents = f"{accent}{{{accented_char}}}"
            else:
                # The argument has a form we do not understand (e.g. nested
                # braces): keep it as it is. The leading backslash is
                # dropped here because it is added again for the ERT below.
                contents = f"{accent}{accented_contents}"
        document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents)
        document.body[i] = prefix
        document.body[i + 1 : i + 1] = [
            "\\begin_inset ERT",
            "status collapsed",
            "",
            "\\begin_layout %s" % document.default_layout,
            "",
            "",
            "",
        ]
        # The ERT text is appended to the last of the inserted (empty) lines
        i = convert_ertbackslash(
            document.body, i + 7, "\\%s" % contents, document.default_layout
        )
        document.body[i + 1 : i + 1] = ["\\end_layout", "", "\\end_inset"]
        i += 3
1285
1286
def is_inset_line(document, i):
    """Tell whether line i of the body starts with a backslash or has one
    in its last two words (taken as a sign of an inset)"""
    line = document.body[i]
    if line.startswith("\\"):
        return True
    return any("\\" in word for word in line.split()[-2:])
1293
1294
# Variant of unicodedata.normalize that leaves some characters alone
# (cf. bug 3313)
def normalize(form, text):
    "Normalize text to the given form, but keep OHM and ANGSTROM as they are."
    protected = (0x2126, 0x212B)
    pieces = []
    pending = []
    for char in text:
        if ord(char) not in protected:
            pending.append(char)
            continue
        # Normalize what was collected so far, then copy the protected char
        if pending:
            pieces.append(unicodedata.normalize(form, "".join(pending)))
            pending = []
        pieces.append(char)
    if pending:
        pieces.append(unicodedata.normalize(form, "".join(pending)))
    return "".join(pieces)
1312
1313
def revert_accent(document):
    """Convert accented characters to InsetLaTeXAccent.

    Only characters that cannot be encoded in the encoding in use are
    replaced; the rest of the body is left in NFC form.
    """
    inverse_accent_map = {value: key for key, value in accent_map.items()}
    inverse_special_accent_map = {value: key for key, value in special_accent_map.items()}
    inverse_accented_map = {value: key for key, value in accented_map.items()}

    # Since LyX may insert a line break within a word we must combine all
    # words before unicode normalization.
    # We do this only if the next line starts with an accent, otherwise we
    # would create things like '\begin_inset ERTstatus'.
    for i in range(len(document.body) - 1):
        if document.body[i] == "" or document.body[i + 1] == "" or document.body[i][-1] == " ":
            continue
        if document.body[i + 1][0] in inverse_accent_map and not is_inset_line(document, i):
            # the last character of this line and the first of the next line
            # probably belong together (a char and its combining accent),
            # inline insets are excluded (second part of the test)
            while len(document.body[i + 1]) > 0 and document.body[i + 1][0] != " ":
                document.body[i] += document.body[i + 1][0]
                document.body[i + 1] = document.body[i + 1][1:]

    # Normalize to "Normal form D" (NFD, also known as canonical decomposition).
    # This is needed to catch all accented characters.
    for i in range(len(document.body)):
        # A line may not be a text string yet (normalize raises TypeError
        # then), so we may have to try two times to normalize the data.
        try:
            document.body[i] = normalize("NFD", document.body[i])
        except TypeError:
            document.body[i] = normalize("NFD", str(document.body[i], "utf-8"))

    # Replace accented characters with InsetLaTeXAccent
    # Do not convert characters that can be represented in the chosen
    # encoding.
    encoding_stack = [
        get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)
    ]
    lang_re = re.compile(r"^\\lang\s(\S+)")
    track_encoding = (
        document.inputencoding == "auto" or document.inputencoding == "default"
    ) and document.cjk_encoding != ""

    i = 0
    while i < len(document.body):
        if track_encoding:
            # Track the encoding of the current line.
            # Each branch has to step to the next line itself before it
            # continues, otherwise the same line would be seen forever.
            result = lang_re.match(document.body[i])
            if result:
                language = result.group(1)
                if language == "default":
                    encoding_stack[-1] = document.encoding
                else:
                    from lyx2lyx_lang import lang

                    encoding_stack[-1] = lang[language][3]
                i += 1
                continue
            elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
                encoding_stack.append(encoding_stack[-1])
                i += 1
                continue
            elif find_token(document.body, "\\end_layout", i, i + 1) == i:
                del encoding_stack[-1]
                i += 1
                continue

        for j in range(len(document.body[i])):
            # dotless i and dotless j are both in special_accent_map and can
            # occur as an accented character, so we need to test that the
            # following character is no accent
            if document.body[i][j] in inverse_special_accent_map and (
                j == len(document.body[i]) - 1
                or document.body[i][j + 1] not in inverse_accent_map
            ):
                accent = document.body[i][j]
                try:
                    # Only used to test whether the character can be encoded
                    accent.encode(encoding_stack[-1])
                except UnicodeEncodeError:
                    # Insert the rest of the line as new line
                    if j < len(document.body[i]) - 1:
                        document.body.insert(i + 1, document.body[i][j + 1 :])
                    # Delete the accented character
                    document.body[i] = document.body[i][:j]
                    # Finally add the InsetLaTeXAccent
                    document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
                    break
            elif j > 0 and document.body[i][j] in inverse_accent_map:
                accented_char = document.body[i][j - 1]
                if accented_char == " ":
                    # Conform to LyX output
                    accented_char = ""
                elif accented_char in inverse_accented_map:
                    accented_char = inverse_accented_map[accented_char]
                accent = document.body[i][j]
                try:
                    # Only used to test whether the character can be encoded
                    normalize("NFC", accented_char + accent).encode(encoding_stack[-1])
                except UnicodeEncodeError:
                    # Insert the rest of the line as new line
                    if j < len(document.body[i]) - 1:
                        document.body.insert(i + 1, document.body[i][j + 1 :])
                    # Delete the accented characters
                    document.body[i] = document.body[i][: j - 1]
                    # Finally add the InsetLaTeXAccent
                    document.body[i] += f"\\i \\{inverse_accent_map[accent]}{{{accented_char}}}"
                    break
        i = i + 1

    # Normalize to "Normal form C" (NFC, pre-composed characters) again
    for i in range(len(document.body)):
        document.body[i] = normalize("NFC", document.body[i])
1425
1426
def normalize_font_whitespace_259(document):
    """Move whitespace at the start or end of a font change outside of it.

    Before format 259 a font change was ignored if a whitespace was the
    first or last character in the sequence. The properties handled here
    are listed below together with their default values."""

    defaults = dict(
        [
            ("\\series", "default"),
            ("\\emph", "default"),
            ("\\color", "none"),
            ("\\shape", "default"),
            ("\\bar", "default"),
            ("\\family", "default"),
        ]
    )
    return normalize_font_whitespace(document, defaults)
1441
1442
def normalize_font_whitespace_274(document):
    """Move whitespace at the start or end of a language change outside of it.

    Before format 259 (sic) the font changes were ignored if a whitespace
    was the first or last character in the sequence. Format 259 corrected
    this for most font properties, but the language was forgotten then.
    The same conversion is applied here to font language changes."""

    return normalize_font_whitespace(document, {"\\lang": "default"})
1453
1454
def get_paragraph_language(document, i):
    """Return the language of the paragraph that holds line i of the body.

    This is the argument of the \\lang command if the first nonempty line
    of the paragraph is such a command, and the language of the document
    otherwise."""

    body = document.body
    search_from = find_beginning_of_layout(body, i) + 1
    tokens = body[find_nonempty_line(body, search_from)].split()
    if len(tokens) > 1 and tokens[0] == "\\lang":
        return tokens[1]
    return document.language
1471
1472
def normalize_font_whitespace(document, char_properties):
    """Move whitespace at the edge of a font change outside of that change.

    Before format 259 the font changes were ignored if a
    whitespace was the first or last character in the sequence, this function
    transfers the whitespace outside. Only a change in one of the properties
    in the provided char_properties is handled by this function.

    Arguments:
      document -- the document being converted; ``document.body`` is edited
                  in place, and nothing is done unless ``document.backend``
                  equals "latex".
      char_properties -- dict mapping a font property token (the first word
                  of a body line) to its default value.  If it contains the
                  key "\\lang", that entry is overwritten with the result of
                  get_paragraph_language() at every "\\begin_layout" line.

    Returns None.
    """

    if document.backend != "latex":
        return

    lines = document.body

    # Properties that currently hold a non-default value in the paragraph
    # being scanned, keyed by property token.
    changes = {}

    i = 0
    while i < len(lines):
        words = lines[i].split()

        if len(words) > 0 and words[0] == "\\begin_layout":
            # a new paragraph resets all font changes
            changes.clear()
            # also reset the default language to be the paragraph's language
            if "\\lang" in list(char_properties.keys()):
                char_properties["\\lang"] = get_paragraph_language(document, i + 1)

        elif len(words) > 1 and words[0] in list(char_properties.keys()):
            # we have a font change
            if char_properties[words[0]] == words[1]:
                # property gets reset
                if words[0] in list(changes.keys()):
                    del changes[words[0]]
                defaultproperty = True
            else:
                # property gets set
                changes[words[0]] = words[1]
                defaultproperty = False

            # We need to explicitly reset all changed properties if we find
            # a space below, because LyX 1.4 would output the space after
            # closing the previous change and before starting the new one,
            # and closing a font change means to close all properties, not
            # just the changed one.

            # NOTE(review): when i == 0, lines[i - 1] is lines[-1] (the last
            # body line) -- presumably a font change is never the first body
            # line; confirm.
            if lines[i - 1] and lines[i - 1][-1] == " ":
                # Strip the trailing blank from the preceding line; it is
                # re-emitted below as a line of its own.
                lines[i - 1] = lines[i - 1][:-1]
                # a space before the font change
                added_lines = [" "]
                # Directly after the blank: set every other active property
                # again.
                for k in list(changes.keys()):
                    # exclude property k because that is already in lines[i]
                    if k != words[0]:
                        added_lines[1:1] = [f"{k} {changes[k]}"]
                # Directly before the blank: put every other active property
                # back to its default.
                for k in list(changes.keys()):
                    # exclude property k because that must be added below anyway
                    if k != words[0]:
                        added_lines[0:0] = [f"{k} {char_properties[k]}"]
                if defaultproperty:
                    # Property is reset in lines[i], so add the new stuff afterwards
                    lines[i + 1 : i + 1] = added_lines
                else:
                    # Reset property for the space
                    added_lines[0:0] = [f"{words[0]} {char_properties[words[0]]}"]
                    lines[i:i] = added_lines
                # Skip over the lines that were just inserted.
                i = i + len(added_lines)

            # NOTE(review): lines[i + 1] (and lines[i + 2] below) are read
            # without a bounds check -- presumably a font change is never
            # among the last body lines; confirm.
            elif (
                lines[i + 1]
                and lines[i + 1][0] == " "
                and (len(changes) > 0 or not defaultproperty)
            ):
                # a space after the font change
                if lines[i + 1] == " " and lines[i + 2]:
                    next_words = lines[i + 2].split()
                    if len(next_words) > 0 and next_words[0] == words[0]:
                        # a single blank with a property different from the
                        # previous and the next line must not be changed
                        i = i + 2
                        continue
                # Strip the leading blank from the following line; it is
                # re-emitted in front of the font change.
                lines[i + 1] = lines[i + 1][1:]
                added_lines = [" "]
                for k in list(changes.keys()):
                    # exclude property k because that is already in lines[i]
                    if k != words[0]:
                        added_lines[1:1] = [f"{k} {changes[k]}"]
                for k in list(changes.keys()):
                    # exclude property k because that must be added below anyway
                    if k != words[0]:
                        added_lines[0:0] = [f"{k} {char_properties[k]}"]
                # Reset property for the space
                added_lines[0:0] = [f"{words[0]} {char_properties[words[0]]}"]
                lines[i:i] = added_lines
                # Skip over the lines that were just inserted.
                i = i + len(added_lines)

        i = i + 1
1565
1566
def revert_utf8x(document):
    """Set utf8x encoding to utf8.

    If the header has no \\inputencoding line, "\\inputencoding auto" is
    appended instead.  In every case document.inputencoding is refreshed
    from the header afterwards.
    """
    header = document.header
    pos = find_token(header, "\\inputencoding", 0)
    if pos == -1:
        header.append("\\inputencoding auto")
    elif get_value(header, "\\inputencoding", pos) == "utf8x":
        header[pos] = "\\inputencoding utf8"
    # Keep the cached attribute in sync with whatever the header now says.
    document.inputencoding = get_value(header, "\\inputencoding", 0)
1577
1578
def revert_utf8plain(document):
    """Set utf8plain encoding to utf8.

    A header without an \\inputencoding line gets "\\inputencoding auto"
    appended; a header line whose value is "utf8-plain" is rewritten to
    "utf8".  document.inputencoding is then re-read from the header.
    """
    token = "\\inputencoding"
    header = document.header
    where = find_token(header, token, 0)
    if where == -1:
        header.append("\\inputencoding auto")
    else:
        current = get_value(header, token, where)
        if current == "utf8-plain":
            header[where] = "\\inputencoding utf8"
    document.inputencoding = get_value(header, token, 0)
1589
1590
def revert_beamer_alert(document):
    """Revert beamer's \\alert inset back to ERT.

    Every "\\begin_inset CharStyle Alert" line in document.body is replaced
    by "\\begin_inset ERT", and the line following the first "\\begin_layout"
    after it is wrapped in "\\alert{...}".

    If the body ends before such a layout (and its following line) is found,
    a warning is issued and processing stops instead of raising IndexError.
    """
    body = document.body
    i = 0
    while True:
        i = find_token(body, "\\begin_inset CharStyle Alert", i)
        if i == -1:
            return
        body[i] = "\\begin_inset ERT"
        i = i + 1
        # Advance to the first layout of the inset, but never past the end
        # of the body (truncated or malformed documents).
        while i < len(body) and not body[i].startswith("\\begin_layout"):
            i = i + 1
        if i + 1 >= len(body):
            document.warning(
                "Malformed LyX document: Could not find layout of Alert inset."
            )
            return
        # Insert the \alert command
        body[i + 1] = "\\alert{" + body[i + 1] + "}"

        i = i + 1
1608
1609
def revert_beamer_structure(document):
    """Revert beamer's \\structure inset back to ERT.

    Every "\\begin_inset CharStyle Structure" line in document.body is
    replaced by "\\begin_inset ERT", and the line following the first
    "\\begin_layout" after it is wrapped in "\\structure{...}".

    If the body ends before such a layout (and its following line) is found,
    a warning is issued and processing stops instead of raising IndexError.
    """
    body = document.body
    i = 0
    while True:
        i = find_token(body, "\\begin_inset CharStyle Structure", i)
        if i == -1:
            return
        body[i] = "\\begin_inset ERT"
        i = i + 1
        # Advance to the first layout of the inset, but never past the end
        # of the body (truncated or malformed documents).
        while i < len(body) and not body[i].startswith("\\begin_layout"):
            i = i + 1
        if i + 1 >= len(body):
            document.warning(
                "Malformed LyX document: Could not find layout of Structure inset."
            )
            return
        # Insert the \structure command
        body[i + 1] = "\\structure{" + body[i + 1] + "}"

        i = i + 1
1626
1627
def convert_changes(document):
    """Switch output_changes off if tracking_changes is off.

    Both header settings must be present; if either is missing a warning is
    issued and the header is left untouched.
    """
    header = document.header
    positions = {}
    # Look the two settings up in the same order as they are reported.
    for token in ("\\tracking_changes", "\\output_changes"):
        pos = find_token(header, token, 0)
        if pos == -1:
            document.warning("Malformed lyx document: Missing '%s'." % token)
            return
        positions[token] = pos

    tracking = get_value(header, "\\tracking_changes", positions["\\tracking_changes"])
    output = get_value(header, "\\output_changes", positions["\\output_changes"])
    if (tracking, output) == ("false", "true"):
        header[positions["\\output_changes"]] = "\\output_changes false"
1642
1643
def revert_ascii(document):
    """Set ascii encoding to auto.

    A header lacking an \\inputencoding line gets "\\inputencoding auto"
    appended; an existing value of "ascii" is replaced by "auto".  Finally
    document.inputencoding is re-read from the header.
    """
    header = document.header
    line_no = find_token(header, "\\inputencoding", 0)
    needs_auto = line_no != -1 and get_value(header, "\\inputencoding", line_no) == "ascii"
    if line_no == -1:
        header.append("\\inputencoding auto")
    elif needs_auto:
        header[line_no] = "\\inputencoding auto"
    document.inputencoding = get_value(header, "\\inputencoding", 0)
1654
1655
def normalize_language_name(document):
    """Rename the document languages "brazil" and "portuges".

    "brazil" becomes "brazilian" and "portuges" becomes "portuguese", both in
    document.language and in the header's \\language line.  Other languages
    are left alone.
    """
    lang = {"brazil": "brazilian", "portuges": "portuguese"}

    if document.language in lang:
        document.language = lang[document.language]
        i = find_token(document.header, "\\language", 0)
        # Without this guard a missing \language line (i == -1) would
        # silently overwrite the last header line.
        if i != -1:
            document.header[i] = "\\language %s" % document.language
1663
1664
def revert_language_name(document):
    """Restore the old document language names "brazil" and "portuges".

    "brazilian" becomes "brazil" and "portuguese" becomes "portuges", both in
    document.language and in the header's \\language line.  Other languages
    are left alone.
    """
    lang = {"brazilian": "brazil", "portuguese": "portuges"}

    if document.language in lang:
        document.language = lang[document.language]
        i = find_token(document.header, "\\language", 0)
        # Without this guard a missing \language line (i == -1) would
        # silently overwrite the last header line.
        if i != -1:
            document.header[i] = "\\language %s" % document.language
1672
1673
1674 #
1675 #  \textclass cv -> \textclass simplecv
def convert_cv_textclass(document):
    "Rename the text class cv to simplecv; any other class is left untouched."
    if document.textclass != "cv":
        return
    document.textclass = "simplecv"
1679
1680
def revert_cv_textclass(document):
    "Rename the text class simplecv back to cv; any other class is left untouched."
    if document.textclass != "simplecv":
        return
    document.textclass = "cv"
1684
1685
1686 #
1687 # add scaleBeforeRotation graphics param
def convert_graphics_rotation(document):
    """Add the scaleBeforeRotation graphics parameter.

    For every "\\begin_inset Graphics" in document.body that has a
    rotateAngle parameter together with a width, height or scale parameter,
    a "scaleBeforeRotation" line is inserted directly before the end of the
    inset.  The line is written with a leading tab, like the other
    parameters searched for here, so that revert_graphics_rotation (which
    looks for the tab-prefixed token) can find it again.

    Insets whose end cannot be found are reported and skipped.
    """
    i = 0
    while True:
        i = find_token(document.body, "\\begin_inset Graphics", i)
        if i == -1:
            return
        j = find_end_of_inset(document.body, i + 1)
        if j == -1:
            # should not happen
            document.warning("Malformed LyX document: Could not find end of graphics inset.")
            # Without a known end there is no range to search or insert into.
            i = i + 1
            continue
        # Search for rotateAngle and width or height or scale
        # If these params are not there, nothing needs to be done.
        rotated = find_token(document.body, "\trotateAngle", i + 1, j)
        scaled = find_tokens(document.body, ["\twidth", "\theight", "\tscale"], i + 1, j)
        if rotated != -1 and scaled != -1:
            document.body.insert(j, "\tscaleBeforeRotation")
        i = i + 1
1706
1707
1708 #
1709 # remove scaleBeforeRotation graphics param
def revert_graphics_rotation(document):
    """Remove the scaleBeforeRotation graphics parameter.

    For every "\\begin_inset Graphics" in document.body:

    * if the inset has a scaleBeforeRotation line (with or without a leading
      tab), that line is deleted;
    * otherwise, if it has a rotateAngle value together with a width, height
      or scale parameter, the angle is moved into the "special" parameter
      ("angle=<value>" is prepended to an existing special value, or a new
      special line is inserted) and the rotateAngle line is deleted;
    * otherwise the inset is left unchanged.

    Insets whose end cannot be found are reported and skipped.  An inset
    that needs no change no longer stops the processing of later insets.
    """
    i = 0
    while True:
        i = find_token(document.body, "\\begin_inset Graphics", i)
        if i == -1:
            return
        j = find_end_of_inset(document.body, i + 1)
        if j == -1:
            # should not happen
            document.warning("Malformed LyX document: Could not find end of graphics inset.")
            i = i + 1
            continue
        # If there's a scaleBeforeRotation param, just remove that
        k = find_tokens(
            document.body, ["\tscaleBeforeRotation", "scaleBeforeRotation"], i + 1, j
        )
        if k != -1:
            del document.body[k]
            i = i + 1
            continue
        # if not, and if we have rotateAngle and width or height or scale,
        # we have to put the rotateAngle value to special
        rotateAngle = get_value(document.body, "rotateAngle", i + 1, j)
        special = get_value(document.body, "special", i + 1, j)
        scaled = find_tokens(document.body, ["\twidth", "\theight", "\tscale"], i + 1, j)
        if rotateAngle != "" and scaled != -1:
            if special == "":
                document.body.insert(j - 1, "\tspecial angle=%s" % rotateAngle)
                # The inset grew by one line; keep the search range in step
                # so a rotateAngle on the former line j - 1 is still found.
                j = j + 1
            else:
                pos = find_token(document.body, "\tspecial", i + 1, j)
                document.body[pos] = document.body[pos].replace(
                    special, f"angle={rotateAngle},{special}"
                )
            k = find_token(document.body, "\trotateAngle", i + 1, j)
            if k != -1:
                del document.body[k]
        i = i + 1
1745
1746
def convert_tableborder(document):
    """Drop the "|" in front of ">{" on body lines that set leftline="true".

    The problem is: LyX doubles the table cell border as it ignores the "|"
    character in the cell arguments.  A fix takes care of this and therefore
    the "|" has to be removed.  Only the first "|>{" of a line is touched,
    and only if the same line also contains leftline="true".
    """
    for idx, line in enumerate(document.body):
        # the two tokens have to be in one line
        if 'leftline="true"' not in line:
            continue
        bar = line.find("|>{")
        if bar != -1:
            # delete the "|"
            document.body[idx] = line[:bar] + line[bar + 1 :]
1761
1762
def revert_tableborder(document):
    """Put a "|" in front of the first ">{" on body lines that set leftline="true".

    Lines lacking either leftline="true" or ">{" are left unchanged.
    """
    for idx, line in enumerate(document.body):
        # the two tokens have to be in one line
        if 'leftline="true"' not in line:
            continue
        spot = line.find(">{")
        if spot != -1:
            # add the "|"
            document.body[idx] = "|".join((line[:spot], line[spot:]))
1773
1774
def revert_armenian(document):
    """Revert the armscii8 encoding and the armenian document language.

    * An input encoding of "armscii8" is replaced by "auto" in the header.
    * For an armenian document "\\usepackage{armtex}" is added to the
      preamble and the document language is switched to english.
    """
    # set inputencoding from armscii8 to auto
    if document.inputencoding == "armscii8":
        enc_line = find_token(document.header, "\\inputencoding", 0)
        if enc_line != -1:
            document.header[enc_line] = "\\inputencoding auto"

    if document.language != "armenian":
        return

    # A preamble counts as existing when any of its lines holds a backslash
    # or a percent sign.
    preamble_exists = any("\\" in entry or "%" in entry for entry in document.preamble)
    if preamble_exists:
        # set the armtex entry as the first preamble line
        document.preamble.insert(0, "\\usepackage{armtex}")
    else:
        # create the preamble when it doesn't exist
        document.preamble.append("\\usepackage{armtex}")

    # Set document language from armenian to english
    document.language = "english"
    lang_line = find_token(document.header, "\\language", 0)
    if lang_line != -1:
        document.header[lang_line] = "\\language english"
1804
1805
def revert_CJK(document):
    """Set CJK encodings to default and languages chinese, japanese and korean to english.

    A header without an \\inputencoding line gets "\\inputencoding auto"
    appended.  document.inputencoding is re-read from the header before the
    language is examined.
    """
    cjk_encodings = (
        "Bg5",
        "Bg5+",
        "GB",
        "GBt",
        "GBK",
        "JIS",
        "KS",
        "SJIS",
        "UTF8",
        "EUC-TW",
        "EUC-JP",
    )
    cjk_languages = ("chinese-simplified", "chinese-traditional", "japanese", "korean")

    header = document.header
    enc_line = find_token(header, "\\inputencoding", 0)
    if enc_line == -1:
        header.append("\\inputencoding auto")
    elif get_value(header, "\\inputencoding", enc_line) in cjk_encodings:
        header[enc_line] = "\\inputencoding default"
    document.inputencoding = get_value(header, "\\inputencoding", 0)

    if document.language not in cjk_languages:
        return

    document.language = "english"
    lang_line = find_token(header, "\\language", 0)
    if lang_line != -1:
        header[lang_line] = "\\language english"
1840
1841
def revert_preamble_listings_params(document):
    r"""Revert preamble option \listings_params

    If the header has a \listings_params line, "\usepackage{listings}" and
    "\lstset{<params>}" are appended to the preamble and the header line is
    removed.  <params> is everything after the token with surrounding double
    quotes stripped, so values containing blanks are kept whole; a line
    without a value yields an empty \lstset{}.
    """
    i = find_token(document.header, "\\listings_params", 0)
    if i == -1:
        return
    # Split off the token only once: the value itself may contain blanks.
    fields = document.header[i].split(None, 1)
    params = fields[1].strip().strip('"') if len(fields) > 1 else ""
    document.preamble.append("\\usepackage{listings}")
    document.preamble.append("\\lstset{%s}" % params)
    document.header.pop(i)
1849
1850
def revert_listings_inset(document):
    r"""Revert listings inset to \lstinline or \begin, \end lstlisting, translate
    FROM

    \begin_inset listings
    lstparams "language=Delphi"
    inline true
    status open

    \begin_layout Standard
    var i = 10;
    \end_layout

    \end_inset

    TO

    \begin_inset ERT
    status open
    \begin_layout Standard


    \backslash
    lstinline[language=Delphi]{var i = 10;}
    \end_layout

    \end_inset

    There can be a caption inset in this inset

    \begin_layout Standard
    \begin_inset Caption

    \begin_layout Standard
    before label
    \begin_inset LatexCommand label
    name "lst:caption"

    \end_inset

    after label
    \end_layout

    \end_inset


    \end_layout

    The caption text and label name are folded into the listing parameters
    as caption={...} and label={...}.  The caption is only looked for inside
    the listings inset being reverted, and the label only inside that
    caption, so captions and labels of later content are never picked up.
    "\usepackage{listings}" is added to the preamble when a listings inset is
    found and it is not there yet.
    """
    i = 0
    while True:
        i = find_token(document.body, "\\begin_inset listings", i)
        if i == -1:
            break
        else:
            if "\\usepackage{listings}" not in document.preamble:
                document.preamble.append("\\usepackage{listings}")
        j = find_end_of_inset(document.body, i + 1)
        if j == -1:
            # this should not happen
            break
        inline = "false"
        params = ""
        status = "open"
        # first three lines
        # NOTE(review): k (start of the listing content) is only assigned
        # once a status line or a caption is seen -- presumably every
        # listings inset has a status line; confirm.
        for n in range(i + 1, i + 4):
            header_line = document.body[n]
            if header_line.startswith("inline"):
                inline = header_line.split()[1]
            if header_line.startswith("lstparams"):
                # Split off the keyword only once: the quoted value may
                # contain blanks (e.g. a caption text).
                fields = header_line.split(None, 1)
                params = fields[1].strip().strip('"') if len(fields) > 1 else ""
            if header_line.startswith("status"):
                status = header_line.split()[1].strip()
                k = n + 1
        # caption?
        caption = ""
        label = ""
        # Restrict the search to this inset (lines i..j-1).
        cap = find_token(document.body, "\\begin_inset Caption", i, j)
        if cap != -1:
            cap_end = find_end_of_inset(document.body, cap + 1)
            if cap_end == -1:
                # this should not happen
                break
            # label?  Restrict the search to the caption inset.
            lbl = find_token(
                document.body, "\\begin_inset LatexCommand label", cap + 1, cap_end
            )
            if lbl != -1:
                lbl_end = find_end_of_inset(document.body, lbl + 1)
                if lbl_end == -1:
                    # this should not happen
                    break
            else:
                lbl = cap_end
                lbl_end = cap_end
            for line in document.body[lbl : lbl_end + 1]:
                if line.startswith("name "):
                    label = line.split()[1].strip('"')
                    break
            # The caption text is every non-command line of the caption
            # inset outside of the label inset.
            for line in document.body[cap:lbl] + document.body[lbl_end + 1 : cap_end + 1]:
                if not line.startswith("\\"):
                    caption += line.strip()
            # The listing content starts after the caption inset.
            k = cap_end + 1
        # looking for the oneline code for lstinline
        inlinecode = document.body[
            find_end_of_layout(
                document.body,
                find_token(document.body, "\\begin_layout %s" % document.default_layout, i + 1)
                + 1,
            )
            - 1
        ]
        if len(caption) > 0:
            if len(params) == 0:
                params = "caption={%s}" % caption
            else:
                params += ",caption={%s}" % caption
        if len(label) > 0:
            if len(params) == 0:
                params = "label={%s}" % label
            else:
                params += ",label={%s}" % label
        if len(params) > 0:
            params = "[%s]" % params
            # Backslashes are written as \backslash followed by a line break.
            params = params.replace("\\", "\\backslash\n")
        if inline == "true":
            document.body[i : (j + 1)] = [
                r"\begin_inset ERT",
                "status %s" % status,
                r"\begin_layout %s" % document.default_layout,
                "",
                "",
                r"\backslash",
                f"lstinline{params}{{{inlinecode}}}",
                r"\end_layout",
                "",
                r"\end_inset",
            ]
        else:
            document.body[i : j + 1] = (
                [
                    r"\begin_inset ERT",
                    "status %s" % status,
                    "",
                    r"\begin_layout %s" % document.default_layout,
                    "",
                    "",
                    r"\backslash",
                    r"begin{lstlisting}%s" % params,
                    r"\end_layout",
                    "",
                    r"\begin_layout %s" % document.default_layout,
                ]
                + document.body[k : j - 1]
                + [
                    "",
                    r"\begin_layout %s" % document.default_layout,
                    "",
                    r"\backslash",
                    "end{lstlisting}",
                    r"\end_layout",
                    "",
                    r"\end_inset",
                ]
            )
2014
2015
def revert_include_listings(document):
    r"""Revert lstinputlisting Include option , translate
    \begin_inset Include \lstinputlisting{file}[opt]
    preview false

    \end_inset

    TO

    \begin_inset ERT
    status open

    \begin_layout Standard


    \backslash
    lstinputlisting{file}[opt]
    \end_layout

    \end_inset

    The file name may contain any character except "}" (so directory
    separators and hyphens are accepted), and the options may contain
    blanks.  The ERT line is assembled as command, options, then the braced
    file name.  "\usepackage{listings}" is added to the preamble when such
    an inset is found and it is not there yet.
    """
    # \lstinputlisting{file}[options]
    command_re = re.compile(r"\\(lstinputlisting){([^}]*)}(.*)")

    i = 0
    while True:
        i = find_token(document.body, r"\begin_inset Include \lstinputlisting", i)
        if i == -1:
            break
        else:
            if "\\usepackage{listings}" not in document.preamble:
                document.preamble.append("\\usepackage{listings}")
        j = find_end_of_inset(document.body, i + 1)
        if j == -1:
            # this should not happen
            break
        # find command line lstinputlisting{file}[options]
        cmd, filename, option = "", "", ""
        # Everything after "\begin_inset Include" belongs to the command,
        # including blanks inside the options.
        spec = document.body[i].split(None, 2)[2].rstrip()
        found = command_re.match(spec)
        if found:
            cmd, filename, option = found.groups()
        # NOTE(review): if the command cannot be parsed, an ERT inset
        # containing just "{}" is still written, as before.
        option = option.replace("\\", "\\backslash\n")
        document.body[i : j + 1] = [
            r"\begin_inset ERT",
            "status open",
            "",
            r"\begin_layout %s" % document.default_layout,
            "",
            "",
            r"\backslash",
            f"{cmd}{option}{{{filename}}}",
            r"\end_layout",
            "",
            r"\end_inset",
        ]
2070
2071
def revert_ext_font_sizes(document):
    """Turn a \\paperfontsize of 10, 11 or 12 into a document option.

    Only acts on latex-backend documents whose text class name starts with
    "ext".  The header's \\paperfontsize line is set to "default" and the
    size, with "pt" appended, is passed to insert_document_option().
    """
    if document.backend != "latex" or not document.textclass.startswith("ext"):
        return

    size = get_value(document.header, "\\paperfontsize", 0)
    if size not in ("10", "11", "12"):
        return

    size_line = find_token(document.header, "\\paperfontsize", 0)
    document.header[size_line] = "\\paperfontsize default"
    insert_document_option(document, size + "pt")
2086
2087
def convert_ext_font_sizes(document):
    """Turn a 10pt, 11pt or 12pt document option into \\paperfontsize.

    Only acts on latex-backend documents whose text class name starts with
    "ext", whose \\paperfontsize is "default" and whose \\options line has
    one of the three sizes as a comma-separated entry.  The first such entry
    is removed from the options (the \\options line is deleted when nothing
    else remains) and its number becomes the \\paperfontsize value.
    """
    if document.backend != "latex" or not document.textclass.startswith("ext"):
        return

    header = document.header
    if get_value(header, "\\paperfontsize", 0) != "default":
        return

    opt_line = find_token(header, "\\options", 0)
    if opt_line == -1:
        return

    known_sizes = ("10pt", "11pt", "12pt")
    options = get_value(header, "\\options", opt_line).split(",")
    # Position of the first option that is exactly one of the known sizes.
    hit = next((n for n, opt in enumerate(options) if opt in known_sizes), None)
    if hit is None:
        return

    # Drop the trailing "pt" to get the bare number.
    fontsize = options.pop(hit)[:-2]

    size_line = find_token(header, "\\paperfontsize", 0)
    header[size_line] = "\\paperfontsize %s" % fontsize

    if options:
        header[opt_line] = "\\options %s" % ",".join(options)
    else:
        del header[opt_line]
2127
2128
def revert_separator_layout(document):
    r"""Revert --Separator-- to a lyx note
    From

    \begin_layout --Separator--
    something
    \end_layout

    to

    \begin_layout Standard
    \begin_inset Note Note
    status open

    \begin_layout Standard
    Separate Environment
    \end_layout

    \end_inset
    something

    \end_layout

    ("Standard" stands for document.default_layout.)
    """

    body = document.body
    start = 0
    while True:
        start = find_token(body, r"\begin_layout --Separator--", start)
        if start == -1:
            break
        stop = find_end_of_layout(body, start + 1)
        if stop == -1:
            # this should not happen
            break
        opening = r"\begin_layout %s" % document.default_layout
        note = [
            opening,
            r"\begin_inset Note Note",
            "status open",
            "",
            opening,
            "Separate Environment",
            r"\end_layout",
            "",
            r"\end_inset",
        ]
        # Keep the former content of the separator paragraph after the note.
        content = body[start + 1 : stop]
        body[start : stop + 1] = note + content + ["", r"\end_layout"]
2178
2179
def convert_arabic(document):
    r"""Rename the language arabic to arabic_arabtex.

    The document language and the header's \language line are updated when
    the document language is "arabic".  Independently of that, every body
    line containing "\lang arabic" is replaced by "\lang arabic_arabtex".
    """
    if document.language == "arabic":
        document.language = "arabic_arabtex"
        lang_line = find_token(document.header, "\\language", 0)
        if lang_line != -1:
            document.header[lang_line] = "\\language arabic_arabtex"

    for idx, text in enumerate(document.body):
        if r"\lang arabic" in text:
            # change the language name
            document.body[idx] = r"\lang arabic_arabtex"
2193
2194
def revert_arabic(document):
    r"""Rename the language arabic_arabtex back to arabic.

    The document language and the header's \language line are updated when
    the document language is "arabic_arabtex".  Independently of that, every
    body line containing "\lang arabic_arabtex" is replaced by
    "\lang arabic".
    """
    if document.language == "arabic_arabtex":
        document.language = "arabic"
        lang_line = find_token(document.header, "\\language", 0)
        if lang_line != -1:
            document.header[lang_line] = "\\language arabic"

    for idx, text in enumerate(document.body):
        if r"\lang arabic_arabtex" in text:
            # change the language name
            document.body[idx] = r"\lang arabic"
2208
2209
2210 ##
2211 # Conversion hub
2212 #
2213
# Version strings associated with this module.
# NOTE(review): presumably these are the LyX releases that write the file
# formats handled here -- confirm against the code that reads this list.
supported_versions = ["1.5.0", "1.5"]

# Conversion table.  Every entry is a two-element list: a file-format number
# followed by the list of functions registered for that entry.  Entries are
# listed in ascending format order (246 ... 276); an empty list means that no
# function is registered for that format.
convert = [
    [246, []],
    [247, [convert_font_settings]],
    [248, []],
    [249, [convert_utf8]],
    [250, []],
    [251, []],
    [252, [convert_commandparams, convert_bibitem]],
    [253, []],
    [254, [convert_esint]],
    [255, []],
    [256, []],
    [257, [convert_caption]],
    [258, [convert_lyxline]],
    [259, [convert_accent, normalize_font_whitespace_259]],
    [260, []],
    [261, [convert_changes]],
    [262, []],
    [263, [normalize_language_name]],
    [264, [convert_cv_textclass]],
    [265, [convert_tableborder]],
    [266, []],
    [267, []],
    [268, []],
    [269, []],
    [270, []],
    [271, [convert_ext_font_sizes]],
    [272, []],
    [273, []],
    [274, [normalize_font_whitespace_274]],
    [275, [convert_graphics_rotation]],
    [276, [convert_arabic]],
]

# Reversion table, same entry layout as above, listed in descending format
# order (275 ... 245).
# NOTE(review): the same three listings reverters are registered for both
# 271 and 268 -- confirm that running them at both steps is intended.
revert = [
    [275, [revert_arabic]],
    [274, [revert_graphics_rotation]],
    [273, []],
    [272, [revert_separator_layout]],
    [
        271,
        [
            revert_preamble_listings_params,
            revert_listings_inset,
            revert_include_listings,
        ],
    ],
    [270, [revert_ext_font_sizes]],
    [269, [revert_beamer_alert, revert_beamer_structure]],
    [
        268,
        [
            revert_preamble_listings_params,
            revert_listings_inset,
            revert_include_listings,
        ],
    ],
    [267, [revert_CJK]],
    [266, [revert_utf8plain]],
    [265, [revert_armenian]],
    [264, [revert_tableborder]],
    [263, [revert_cv_textclass]],
    [262, [revert_language_name]],
    [261, [revert_ascii]],
    [260, []],
    [259, [revert_utf8x]],
    [258, []],
    [257, []],
    [256, [revert_caption]],
    [255, [revert_encodings]],
    [254, [revert_clearpage, revert_cleardoublepage]],
    [253, [revert_esint]],
    [252, [revert_nomenclature, revert_printnomenclature]],
    [251, [revert_commandparams]],
    [250, [revert_cs_label]],
    [249, []],
    [248, [revert_accent, revert_utf8, revert_unicode]],
    [247, [revert_booktabs]],
    [246, [revert_font_settings]],
    [245, [revert_framed]],
]


# Running this file as a script does nothing.
if __name__ == "__main__":
    pass