lib/lyx2lyx/parser_tools.py

   1 # This file is part of lyx2lyx
   2 # Copyright (C) 2002-2011 Dekel Tsur <dekel@lyx.org>,
   3 # José Matos <jamatos@lyx.org>, Richard Kimberly Heck <rikiheck@lyx.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  18
  19
  20 """
  21 This module offers several free functions to help parse lines.
More documentation is below, but here is a quick guide to what
  23 they do. Optional arguments are marked by brackets.
  24
  25 find_token(lines, token[, start[, end[, ignorews]]]):
  26   Returns the first line i, start <= i < end, on which
  27   token is found at the beginning. Returns -1 if not
  28   found.
  29   If ignorews is (given and) True, then differences
  30   in whitespace do not count, except that there must be no
  31   extra whitespace following token itself.
  32
find_token_exact(lines, token[, start[, end]]):
  34   As find_token, but with ignorews set to True.
  35
  36 find_tokens(lines, tokens[, start[, end[, ignorews]]]):
  37   Returns the first line i, start <= i < end, on which
  38   one of the tokens in tokens is found at the beginning.
  39   Returns -1 if not found.
  40   If ignorews is (given and) True, then differences
  41   in whitespace do not count, except that there must be no
  42   extra whitespace following token itself.
  43
  44 find_tokens_exact(lines, token[, start[, end]]):
  45   As find_tokens, but with ignorews True.
  46
  47 find_token_backwards(lines, token, start):
  48 find_tokens_backwards(lines, tokens, start):
  49   As before, but look backwards.
  50
  51 find_substring(lines, sub[, start[, end]]) -> int
  52   As find_token, but sub may be anywhere in the line.
  53
  54 find_re(lines, rexp, start[, end]):
  55   As find_token, but rexp is a regular expression object,
  56   so it has to be passed as e.g.: re.compile(r'...').
  57
  58 get_value(lines, token[, start[, end[, default[, delete]]]]):
  59   Similar to find_token, but it returns what follows the
  60   token on the found line. Example:
  61     get_value(document.header, "\\use_xetex", 0)
  62   will find a line like:
  63     \\use_xetex true
  64   and, in that case, return "true". (Note that whitespace
  65   is stripped.) The final argument, default, defaults to "",
  66   and is what is returned if we do not find anything. So you
  67   can use that to set a default.
  68   If delete is True, then delete the line if found.
  69
  70 get_quoted_value(lines, token[, start[, end[, default[, delete]]]]):
  71   Similar to get_value, but it will strip quotes off the
  72   value, if they are present. So use this one for cases
  73   where the value is normally quoted.
  74
  75 get_option_value(line, option):
  76   This assumes we have a line with something like:
  77       option="value"
  78   and returns value. Returns "" if not found.
  79
get_bool_value(lines, token[, start[, end[, default[, delete]]]]):
  81   Like get_value, but returns a boolean.
  82
  83 set_bool_value(lines, token, value[, start[, end]]):
  84   Find `token` in `lines[start:end]` and set to boolean value bool(`value`).
  85   Return old value. Raise ValueError if token is not in lines.
  86
  87 del_token(lines, token[, start[, end]]):
  88   Like find_token, but deletes the line if it finds one.
  89   Returns True if a line got deleted, otherwise False.
  90
  91   Use get_* with the optional argument "delete=True", if you want to
  92   get and delete a token.
  93
  94 find_beginning_of(lines, i, start_token, end_token):
  95   Here, start_token and end_token are meant to be a matching
  96   pair, like "\\begin_layout" and "\\end_layout". We look for
  97   the start_token that pairs with the end_token that occurs
  98   on or after line i. Returns -1 if not found.
  99   So, in the layout case, this would find the \\begin_layout
 100   for the layout line i is in.
 101   Example:
 102     ec = find_token(document.body, "</cell", i)
 103     bc = find_beginning_of(document.body, ec, \
 104         "<cell", "</cell")
 105   Now, assuming no -1s, bc-ec wraps the cell for line i.
 106
 107 find_end_of(lines, i, start_token, end_token):
 108   Like find_beginning_of, but looking for the matching
 109   end_token. This might look like:
    bc = find_token(document.body, "<cell", i)
 111     ec = find_end_of(document.body, bc,  "<cell", "</cell")
 112   Now, assuming no -1s, bc-ec wrap the next cell.
 113
 114 find_end_of_inset(lines, i):
 115   Specialization of find_end_of for insets.
 116
 117 find_end_of_layout(lines, i):
 118   Specialization of find_end_of for layouts.
 119
 120 find_end_of_sequence(lines, i):
 121   Find the end of the sequence of layouts of the same kind.
 122   Considers nesting. If the last paragraph in sequence is nested,
 123   the position of the last \\end_deeper is returned, else
 124   the position of the last \\end_layout.
 125
 126 is_in_inset(lines, i, inset, default=(-1,-1)):
 127   Check if line i is in an inset of the given type.
 128   If so, returns starting and ending lines. Otherwise,
 129   return default.
 130   Example:
 131     is_in_inset(document.body, i, "\\begin_inset Tabular")
 132   returns (-1,-1) unless i is within a table. If it is, then
 133   it returns the line on which the table begins and the one
  on which it ends. Note that this pair will evaluate to
 135   boolean True, so
 136     if is_in_inset(..., default=False):
 137   will do what you expect.
 138
 139 get_containing_inset(lines, i):
 140   Finds out what kind of inset line i is within. Returns a
 141   list containing what follows \\begin_inset on the line
 142   on which the inset begins, plus the starting and ending line.
 143   Returns False on any kind of error or if it isn't in an inset.
 144   So get_containing_inset(document.body, i) might return:
 145     ("CommandInset ref", 300, 306)
 146   if i is within an InsetRef beginning on line 300 and ending
 147   on line 306.
 148
 149 get_containing_layout(lines, i):
 150   As get_containing_inset, but for layout. Additionally returns the
 151   position of real paragraph start (after par params) as 4th value.
 152
find_nonempty_line(lines[, start[, end]]):
 154   Finds the next non-empty line.
 155
 156 check_token(line, token):
 157   Does line begin with token?
 158
 159 is_nonempty_line(line):
 160   Does line contain something besides whitespace?
 161
 162 count_pars_in_inset(lines, i):
 163   Counts the paragraphs inside an inset.
 164
 165 """
 166
 167 import re
 168
 169
 170 # Utilities for one line
 171 def check_token(line, token):
 172     """check_token(line, token) -> bool
 173
 174     Return True if token is present in line and is the first element
 175     else returns False.
 176
 177     Deprecated. Use line.startswith(token).
 178     """
 179     return line.startswith(token)
 180
 181
 182 def is_nonempty_line(line):
 183     """is_nonempty_line(line) -> bool
 184
 185     Return False if line is either empty or it has only whitespaces,
 186     else return True."""
 187     return bool(line.strip())
 188
 189
 190 # Utilities for a list of lines
 191 def find_token(lines, token, start=0, end=0, ignorews=False):
 192     """find_token(lines, token, start[[, end], ignorews]) -> int
 193
 194     Return the lowest line where token is found, and is the first
 195     element, in lines[start, end].
 196
 197     If ignorews is True (default is False), then differences in
 198     whitespace are ignored, but there must be whitespace following
 199     token itself.
 200
 201     Use find_substring(lines, sub) to find a substring anywhere in `lines`.
 202
 203     Return -1 on failure."""
 204
 205     if end == 0 or end > len(lines):
 206         end = len(lines)
 207     if ignorews:
 208         y = token.split()
 209     for i in range(start, end):
 210         if ignorews:
 211             x = lines[i].split()
 212             if len(x) < len(y):
 213                 continue
 214             if x[: len(y)] == y:
 215                 return i
 216         else:
 217             if lines[i].startswith(token):
 218                 return i
 219     return -1
 220
 221
 222 def find_token_exact(lines, token, start=0, end=0):
 223     return find_token(lines, token, start, end, True)
 224
 225
 226 def find_tokens(lines, tokens, start=0, end=0, ignorews=False):
 227     """find_tokens(lines, tokens, start[[, end], ignorews]) -> int
 228
 229     Return the lowest line where one token in tokens is found, and is
 230     the first element, in lines[start, end].
 231
 232     Return -1 on failure."""
 233
 234     if end == 0 or end > len(lines):
 235         end = len(lines)
 236
 237     for i in range(start, end):
 238         for token in tokens:
 239             if ignorews:
 240                 x = lines[i].split()
 241                 y = token.split()
 242                 if len(x) < len(y):
 243                     continue
 244                 if x[: len(y)] == y:
 245                     return i
 246             else:
 247                 if lines[i].startswith(token):
 248                     return i
 249     return -1
 250
 251
 252 def find_tokens_exact(lines, tokens, start=0, end=0):
 253     return find_tokens(lines, tokens, start, end, True)
 254
 255
 256 def find_substring(lines, sub, start=0, end=0):
 257     """find_substring(lines, sub[, start[, end]]) -> int
 258
 259     Return the lowest line number `i` in [start, end] where
 260     `sub` is a substring of line[i].
 261
 262     Return -1 on failure."""
 263
 264     if end == 0 or end > len(lines):
 265         end = len(lines)
 266     for i in range(start, end):
 267         if sub in lines[i]:
 268             return i
 269     return -1
 270
 271
 272 def find_re(lines, rexp, start=0, end=0):
 273     """find_re(lines, rexp[, start[, end]]) -> int
 274
 275     Return the lowest line number `i` in [start, end] where the regular
 276     expression object `rexp` matches at the beginning of line[i].
 277     Return -1 on failure.
 278
 279     Start your pattern with the wildcard ".*" to find a match anywhere in a
 280     line. Use find_substring() to find a substring anywhere in the lines.
 281     """
 282     if end == 0 or end > len(lines):
 283         end = len(lines)
 284     for i in range(start, end):
 285         if rexp.match(lines[i]):
 286             return i
 287     return -1
 288
 289
 290 def find_token_backwards(lines, token, start):
 291     """find_token_backwards(lines, token, start) -> int
 292
 293     Return the highest line where token is found, and is the first
 294     element, in lines[start, end].
 295
 296     Return -1 on failure."""
 297     for i in range(start, -1, -1):
 298         if lines[i].startswith(token):
 299             return i
 300     return -1
 301
 302
 303 def find_tokens_backwards(lines, tokens, start):
 304     """find_tokens_backwards(lines, token, start) -> int
 305
 306     Return the highest line where token is found, and is the first
 307     element, in lines[end, start].
 308
 309     Return -1 on failure."""
 310     for i in range(start, -1, -1):
 311         line = lines[i]
 312         for token in tokens:
 313             if line.startswith(token):
 314                 return i
 315     return -1
 316
 317
def find_complete_lines(lines, sublines, start=0, end=0):
    """Find first occurrence of sequence `sublines` in list `lines`.
    Return index of first line or -1 on failure.

    Efficient search for a sub-list in a large list. Works for any values.

    >>> find_complete_lines([1, 2, 3, 1, 1, 2], [1, 2])
    0

    The `start` and `end` arguments work similar to list.index()

    >>> find_complete_lines([1, 2, 3, 1, 1, 2], [1, 2], start=1)
    4
    >>> find_complete_lines([1, 2, 3, 1, 1, 2], [1, 2], start=1, end=4)
    -1

    The return value can be used to substitute the sub-list.
    Take care to check before use:

    >>> l = [1, 1, 2]
    >>> s = find_complete_lines(l, [1, 2])
    >>> if s != -1:
    ...     l[s : s + 2] = [3]
    ...     l
    [1, 3]

    An empty `sublines` is reported as found at `start`.
    An `end` of 0 means "up to the end of `lines`".

    See also del_complete_lines().
    """
    if not sublines:
        return start
    end = end or len(lines)
    N = len(sublines)
    try:
        while True:
            for j, value in enumerate(sublines):
                # Next position of the j-th element at or after `start`;
                # list.index() raises ValueError when there is none.
                i = lines.index(value, start, end)
                if j and i != start:
                    # A later element is not directly adjacent to its
                    # predecessor: restart the search from the position
                    # where the sequence would have to begin for this
                    # element to sit at `i`.
                    start = i - j
                    break
                start = i + 1
            else:
                # All elements were found on consecutive positions;
                # `i` is the position of the last one.
                return i + 1 - N
    except ValueError:  # `sublines` not found
        return -1
 362
 363
 364 def find_across_lines(lines, sub, start=0, end=0):
 365     sublines = sub.splitlines()
 366     if len(sublines) > 2:
 367         # at least 3 lines: the middle one(s) are complete -> use index search
 368         i = find_complete_lines(lines, sublines[1:-1], start + 1, end - 1)
 369         if i < start + 1:
 370             return -1
 371         try:
 372             if lines[i - 1].endswith(sublines[0]) and lines[i + len(sublines)].startswith(
 373                 sublines[-1]
 374             ):
 375                 return i - 1
 376         except IndexError:
 377             pass
 378     elif len(sublines) > 1:
 379         # last subline must start a line
 380         i = find_token(lines, sublines[-1], start, end)
 381         if i < start + 1:
 382             return -1
 383         if lines[i - 1].endswith(sublines[0]):
 384             return i - 1
 385     else:  # no line-break, may be in the middle of a line
 386         if end == 0 or end > len(lines):
 387             end = len(lines)
 388         for i in range(start, end):
 389             if sub in lines[i]:
 390                 return i
 391     return -1
 392
 393
 394 def get_value(lines, token, start=0, end=0, default="", delete=False):
 395     """Find `token` in `lines` and return part of line that follows it.
 396
 397     Find the next line that looks like:
 398       token followed by other stuff
 399
 400     If `delete` is True, delete the line (if found).
 401
 402     Return "followed by other stuff" with leading and trailing
 403     whitespace removed.
 404     """
 405     i = find_token_exact(lines, token, start, end)
 406     if i == -1:
 407         return default
 408     # TODO: establish desired behaviour, eventually change to
 409     #  return lines.pop(i)[len(token):].strip() # or default
 410     # see test_parser_tools.py
 411     l = lines[i].split(None, 1)
 412     if delete:
 413         del lines[i]
 414     if len(l) > 1:
 415         return l[1].strip()
 416     return default
 417
 418
 419 def get_quoted_value(lines, token, start=0, end=0, default="", delete=False):
 420     """get_quoted_value(lines, token, start[[, end], default]) -> string
 421
 422     Find the next line that looks like:
 423       token "followed by other stuff"
 424     Returns "followed by other stuff" with leading and trailing
 425     whitespace and quotes removed. If there are no quotes, that is OK too.
 426     So use get_value to preserve possible quotes, this one to remove them,
 427     if they are there.
 428     Note that we will NOT strip quotes from default!
 429     """
 430     val = get_value(lines, token, start, end, "", delete)
 431     if not val:
 432         return default
 433     return val.strip('"')
 434
 435
# Textual spellings of boolean values, as looked up by get_bool_value().
bool_values = {"true": True, "1": True, "false": False, "0": False}
 437
 438
 439 def get_bool_value(lines, token, start=0, end=0, default=None, delete=False):
 440     """get_bool_value(lines, token, start[[, end], default]) -> string
 441
 442     Find the next line that looks like:
 443       `token` <bool_value>
 444
 445     Return True if <bool_value> is 1 or "true", False if <bool_value>
 446     is 0 or "false", else `default`.
 447     """
 448     val = get_quoted_value(lines, token, start, end, default, delete)
 449     return bool_values.get(val, default)
 450
 451
 452 def set_bool_value(lines, token, value, start=0, end=0):
 453     """Find `token` in `lines` and set to bool(`value`).
 454
 455     Return previous value. Raise `ValueError` if `token` is not in lines.
 456
 457     Cf. find_token(), get_bool_value().
 458     """
 459     i = find_token(lines, token, start, end)
 460     if i == -1:
 461         raise ValueError
 462     oldvalue = get_bool_value(lines, token, i, i + 1)
 463     if oldvalue is value:
 464         return oldvalue
 465     # set to new value
 466     if get_quoted_value(lines, token, i, i + 1) in ("0", "1"):
 467         lines[i] = "%s %d" % (token, value)
 468     else:
 469         lines[i] = f"{token} {str(value).lower()}"
 470
 471     return oldvalue
 472
 473
 474 def get_option_value(line, option):
 475     rx = option + r'\s*=\s*"([^"]+)"'
 476     rx = re.compile(rx)
 477     m = rx.search(line)
 478     if not m:
 479         return ""
 480     return m.group(1)
 481
 482
 483 def set_option_value(line, option, value):
 484     rx = "(" + option + r'\s*=\s*")[^"]+"'
 485     rx = re.compile(rx)
 486     m = rx.search(line)
 487     if not m:
 488         return line
 489     return re.sub(rx, r"\g<1>" + value + '"', line)
 490
 491
 492 def del_token(lines, token, start=0, end=0):
 493     """del_token(lines, token, start, end) -> int
 494
 495     Find the first line in lines where token is the first element
 496     and delete that line. Returns True if we deleted a line, False
 497     if we did not."""
 498
 499     k = find_token_exact(lines, token, start, end)
 500     if k == -1:
 501         return False
 502     del lines[k]
 503     return True
 504
 505
 506 def del_complete_lines(lines, sublines, start=0, end=0):
 507     """Delete first occurence of `sublines` in list `lines`.
 508
 509     Efficient deletion of a sub-list in a list. Works for any values.
 510     The `start` and `end` arguments work similar to list.index()
 511
 512     Returns True if a deletion was done and False if not.
 513
 514     >>> l = [1, 0, 1, 1, 1, 2]
 515     >>> del_complete_lines(l, [0, 1, 1])
 516     True
 517     >>> l
 518     [1, 1, 2]
 519     """
 520     i = find_complete_lines(lines, sublines, start, end)
 521     if i == -1:
 522         return False
 523     del lines[i : i + len(sublines)]
 524     return True
 525
 526
 527 def del_value(lines, token, start=0, end=0, default=None):
 528     """
 529     Find the next line that looks like:
 530       token followed by other stuff
 531     Delete that line and return "followed by other stuff"
 532     with leading and trailing whitespace removed.
 533
 534     If token is not found, return `default`.
 535     """
 536     i = find_token_exact(lines, token, start, end)
 537     if i == -1:
 538         return default
 539     return lines.pop(i)[len(token) :].strip()
 540
 541
 542 def find_beginning_of(lines, i, start_token, end_token):
 543     count = 1
 544     while i > 0:
 545         i = find_tokens_backwards(lines, [start_token, end_token], i - 1)
 546         if i == -1:
 547             return -1
 548         if lines[i].startswith(end_token):
 549             count = count + 1
 550         else:
 551             count = count - 1
 552         if count == 0:
 553             return i
 554     return -1
 555
 556
 557 def find_end_of(lines, i, start_token, end_token):
 558     count = 1
 559     n = len(lines)
 560     while i < n:
 561         i = find_tokens(lines, [end_token, start_token], i + 1)
 562         if i == -1:
 563             return -1
 564         if lines[i].startswith(start_token):
 565             count = count + 1
 566         else:
 567             count = count - 1
 568         if count == 0:
 569             return i
 570     return -1
 571
 572
 573 def find_nonempty_line(lines, start=0, end=0):
 574     if end == 0:
 575         end = len(lines)
 576     for i in range(start, end):
 577         if lines[i].strip():
 578             return i
 579     return -1
 580
 581
 582 def find_end_of_inset(lines, i):
 583     "Find end of inset, where lines[i] is included."
 584     return find_end_of(lines, i, "\\begin_inset", "\\end_inset")
 585
 586
 587 def find_end_of_layout(lines, i):
 588     "Find end of layout, where lines[i] is included."
 589     return find_end_of(lines, i, "\\begin_layout", "\\end_layout")
 590
 591
 592 def is_in_inset(lines, i, inset, default=(-1, -1)):
 593     """
 594     Check if line i is in an inset of the given type.
 595     If so, return starting and ending lines, otherwise `default`.
 596     Example:
 597       is_in_inset(document.body, i, "\\begin_inset Tabular")
 598     returns (-1,-1) if `i` is not within a "Tabular" inset (i.e. a table).
 599     If it is, then it returns the line on which the table begins and the one
 600     on which it ends.
 601     Note that this pair will evaulate to boolean True, so (with the optional
 602     default value set to False)
 603       if is_in_inset(..., default=False):
 604     will do what you expect.
 605     """
 606     start = find_token_backwards(lines, inset, i)
 607     if start == -1:
 608         return default
 609     end = find_end_of_inset(lines, start)
 610     if end < i:  # this includes the notfound case.
 611         return default
 612     return (start, end)
 613
 614
 615 def get_containing_inset(lines, i):
 616     """
 617     Finds out what kind of inset line i is within. Returns a
 618     list containing (i) what follows \\begin_inset on the line
 619     on which the inset begins, plus the starting and ending line.
 620     Returns False on any kind of error or if it isn't in an inset.
 621     """
 622     j = i
 623     while True:
 624         stins = find_token_backwards(lines, "\\begin_inset", j)
 625         if stins == -1:
 626             return False
 627         endins = find_end_of_inset(lines, stins)
 628         if endins > j:
 629             break
 630         j = stins - 1
 631
 632     if endins < i:
 633         return False
 634
 635     inset = get_value(lines, "\\begin_inset", stins)
 636     if inset == "":
 637         # shouldn't happen
 638         return False
 639     return (inset, stins, endins)
 640
 641
 642 def get_containing_layout(lines, i):
 643     """
 644     Find out what kind of layout line `i` is within.
 645     Return a tuple
 646       (layoutname, layoutstart, layoutend, startofcontent)
 647     containing
 648       * layout style/name,
 649       * start line number,
 650       * end line number, and
 651       * number of first paragraph line (after all params).
 652     Return `False` on any kind of error.
 653     """
 654     j = i
 655     while True:
 656         stlay = find_token_backwards(lines, "\\begin_layout", j)
 657         if stlay == -1:
 658             return False
 659         endlay = find_end_of_layout(lines, stlay)
 660         if endlay > i:
 661             break
 662         j = stlay - 1
 663
 664     if endlay < i:
 665         return False
 666
 667     layoutname = get_value(lines, "\\begin_layout", stlay)
 668     if layoutname == "":  # layout style missing
 669         # TODO: What shall we do in this case?
 670         pass
 671         # layoutname == "Standard" # use same fallback as the LyX parser:
 672         # raise ValueError("Missing layout name on line %d"%stlay) # diagnosis
 673         # return False # generic error response
 674     par_params = [
 675         "\\noindent",
 676         "\\indent",
 677         "\\indent-toggle",
 678         "\\leftindent",
 679         "\\start_of_appendix",
 680         "\\paragraph_spacing",
 681         "\\align",
 682         "\\labelwidthstring",
 683     ]
 684     stpar = stlay
 685     while True:
 686         stpar += 1
 687         if lines[stpar].split(" ", 1)[0] not in par_params:
 688             break
 689     return (layoutname, stlay, endlay, stpar)
 690
 691
 692 def count_pars_in_inset(lines, i):
 693     """
 694     Counts the paragraphs within this inset
 695     """
 696     ins = get_containing_inset(lines, i)
 697     if ins == -1:
 698         return -1
 699     pars = 0
 700     for j in range(ins[1], ins[2]):
 701         m = re.match(r"\\begin_layout (.*)", lines[j])
 702         found_inset = get_containing_inset(lines, j)
 703         if m and found_inset and found_inset[1] == ins[1]:
 704             pars += 1
 705
 706     return pars
 707
 708
def find_end_of_sequence(lines, i):
    """
    Returns the end of a sequence of identical layouts.

    Starting with the layout containing line `i`, follow the subsequent
    layouts as long as they have the same name. Blocks enclosed in
    \\begin_deeper ... \\end_deeper are skipped as a whole.

    Returns the line number of the last \\end_layout or \\end_deeper
    seen in the sequence, or -1 if line `i` is not within a layout.
    """
    lay = get_containing_layout(lines, i)
    if lay == False:
        return -1
    layout = lay[0]
    endlay = lay[2]
    # Continue scanning at the end of the layout containing line `i`.
    i = endlay
    while True:
        m = re.match(r"\\begin_layout (.*)", lines[i])
        if m and m.group(1) != layout:
            # A layout of another kind begins: the sequence is over.
            return endlay
        elif lines[i] == "\\begin_deeper":
            # Jump over the nested block; its end is the new end of the
            # sequence found so far.
            j = find_end_of(lines, i, "\\begin_deeper", "\\end_deeper")
            if j != -1:
                i = j
                endlay = j
                continue
        if m and m.group(1) == layout:
            # Another layout of the same kind: extend the sequence to
            # its end.
            endlay = find_end_of_layout(lines, i)
            i = endlay
            continue
        if i == len(lines) - 1:
            # Last line reached.
            break
        i = i + 1

    return endlay