lib/lyx2lyx/parser_tools.py

   1 # This file is part of lyx2lyx
   2 # -*- coding: utf-8 -*-
   3 # Copyright (C) 2002-2011 Dekel Tsur <dekel@lyx.org>,
   4 # José Matos <jamatos@lyx.org>, Richard Kimberly Heck <rikiheck@lyx.org>
   5 #
   6 # This program is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU General Public License
   8 # as published by the Free Software Foundation; either version 2
   9 # of the License, or (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program; if not, write to the Free Software
  18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  19
  20
  21 """
  22 This module offers several free functions to help parse lines.
  23 More documentation is below, but here is a quick guide to what
  24 they do. Optional arguments are marked by brackets.
  25
  26 find_token(lines, token[, start[, end[, ignorews]]]):
  27   Returns the first line i, start <= i < end, on which
  28   token is found at the beginning. Returns -1 if not
  29   found.
  30   If ignorews is (given and) True, then differences
  31   in whitespace do not count, except that there must be no
  32   extra whitespace following token itself.
  33
  34 find_token_exact(lines, token[, start[, end]]):
  35   As find_token, but with ignorews set to True.
  36
  37 find_tokens(lines, tokens[, start[, end[, ignorews]]]):
  38   Returns the first line i, start <= i < end, on which
  39   one of the tokens in tokens is found at the beginning.
  40   Returns -1 if not found.
  41   If ignorews is (given and) True, then differences
  42   in whitespace do not count, except that there must be no
  43   extra whitespace following token itself.
  44
  45 find_tokens_exact(lines, token[, start[, end]]):
  46   As find_tokens, but with ignorews True.
  47
  48 find_token_backwards(lines, token, start):
  49 find_tokens_backwards(lines, tokens, start):
  50   As before, but look backwards.
  51
  52 find_substring(lines, sub[, start[, end]]) -> int
  53   As find_token, but sub may be anywhere in the line.
  54
  55 find_re(lines, rexp, start[, end]):
  56   As find_token, but rexp is a regular expression object,
  57   so it has to be passed as e.g.: re.compile(r'...').
  58
  59 get_value(lines, token[, start[, end[, default[, delete]]]]):
  60   Similar to find_token, but it returns what follows the
  61   token on the found line. Example:
  62     get_value(document.header, "\\use_xetex", 0)
  63   will find a line like:
  64     \\use_xetex true
  65   and, in that case, return "true". (Note that whitespace
  66   is stripped.) The final argument, default, defaults to "",
  67   and is what is returned if we do not find anything. So you
  68   can use that to set a default.
  69   If delete is True, then delete the line if found.
  70
  71 get_quoted_value(lines, token[, start[, end[, default[, delete]]]]):
  72   Similar to get_value, but it will strip quotes off the
  73   value, if they are present. So use this one for cases
  74   where the value is normally quoted.
  75
  76 get_option_value(line, option):
  77   This assumes we have a line with something like:
  78       option="value"
  79   and returns value. Returns "" if not found.
  80
  81 get_bool_value(lines, token[, start[, end[, default[, delete]]]]):
  82   Like get_value, but returns a boolean.
  83
  84 set_bool_value(lines, token, value[, start[, end]]):
  85   Find `token` in `lines[start:end]` and set to boolean value bool(`value`).
  86   Return old value. Raise ValueError if token is not in lines.
  87
  88 del_token(lines, token[, start[, end]]):
  89   Like find_token, but deletes the line if it finds one.
  90   Returns True if a line got deleted, otherwise False.
  91
  92   Use get_* with the optional argument "delete=True", if you want to
  93   get and delete a token.
  94
  95 find_beginning_of(lines, i, start_token, end_token):
  96   Here, start_token and end_token are meant to be a matching
  97   pair, like "\\begin_layout" and "\\end_layout". We look for
  98   the start_token that pairs with the end_token that occurs
  99   on or after line i. Returns -1 if not found.
 100   So, in the layout case, this would find the \\begin_layout
 101   for the layout line i is in.
 102   Example:
 103     ec = find_token(document.body, "</cell", i)
 104     bc = find_beginning_of(document.body, ec, \
 105         "<cell", "</cell")
 106   Now, assuming no -1s, bc-ec wraps the cell for line i.
 107
 108 find_end_of(lines, i, start_token, end_token):
 109   Like find_beginning_of, but looking for the matching
 110   end_token. This might look like:
 111     bc = find_token(document.body, "<cell", i)
 112     ec = find_end_of(document.body, bc,  "<cell", "</cell")
 113   Now, assuming no -1s, bc-ec wrap the next cell.
 114
 115 find_end_of_inset(lines, i):
 116   Specialization of find_end_of for insets.
 117
 118 find_end_of_layout(lines, i):
 119   Specialization of find_end_of for layouts.
 120
 121 find_end_of_sequence(lines, i):
 122   Find the end of the sequence of layouts of the same kind.
 123   Considers nesting. If the last paragraph in sequence is nested,
 124   the position of the last \\end_deeper is returned, else
 125   the position of the last \\end_layout.
 126
 127 is_in_inset(lines, i, inset, default=(-1,-1)):
 128   Check if line i is in an inset of the given type.
 129   If so, returns starting and ending lines. Otherwise,
 130   return default.
 131   Example:
 132     is_in_inset(document.body, i, "\\begin_inset Tabular")
 133   returns (-1,-1) unless i is within a table. If it is, then
 134   it returns the line on which the table begins and the one
 135   on which it ends. Note that this pair will evaluate to
 136   boolean True, so
 137     if is_in_inset(..., default=False):
 138   will do what you expect.
 139
 140 get_containing_inset(lines, i):
 141   Finds out what kind of inset line i is within. Returns a
 142   list containing what follows \\begin_inset on the line
 143   on which the inset begins, plus the starting and ending line.
 144   Returns False on any kind of error or if it isn't in an inset.
 145   So get_containing_inset(document.body, i) might return:
 146     ("CommandInset ref", 300, 306)
 147   if i is within an InsetRef beginning on line 300 and ending
 148   on line 306.
 149
 150 get_containing_layout(lines, i):
 151   As get_containing_inset, but for layout. Additionally returns the
 152   position of real paragraph start (after par params) as 4th value.
 153
 154 find_nonempty_line(lines[, start[, end]]):
 155   Finds the next non-empty line.
 156
 157 check_token(line, token):
 158   Does line begin with token?
 159
 160 is_nonempty_line(line):
 161   Does line contain something besides whitespace?
 162
 163 count_pars_in_inset(lines, i):
 164   Counts the paragraphs inside an inset.
 165
 166 """
 167
 168 import re
 169
 170 # Utilities for one line
 171 def check_token(line, token):
 172     """ check_token(line, token) -> bool
 173
 174     Return True if token is present in line and is the first element
 175     else returns False.
 176
 177     Deprecated. Use line.startswith(token).
 178     """
 179     return line.startswith(token)
 180
 181
 182 def is_nonempty_line(line):
 183     """ is_nonempty_line(line) -> bool
 184
 185     Return False if line is either empty or it has only whitespaces,
 186     else return True."""
 187     return bool(line.strip())
 188
 189
 190 # Utilities for a list of lines
 191 def find_token(lines, token, start=0, end=0, ignorews=False):
 192     """ find_token(lines, token, start[[, end], ignorews]) -> int
 193
 194     Return the lowest line where token is found, and is the first
 195     element, in lines[start, end].
 196
 197     If ignorews is True (default is False), then differences in
 198     whitespace are ignored, but there must be whitespace following
 199     token itself.
 200
 201     Use find_substring(lines, sub) to find a substring anywhere in `lines`.
 202
 203     Return -1 on failure."""
 204
 205     if end == 0 or end > len(lines):
 206         end = len(lines)
 207     if ignorews:
 208         y = token.split()
 209     for i in range(start, end):
 210         if ignorews:
 211             x = lines[i].split()
 212             if len(x) < len(y):
 213                 continue
 214             if x[:len(y)] == y:
 215                 return i
 216         else:
 217             if lines[i].startswith(token):
 218                 return i
 219     return -1
 220
 221
 222 def find_token_exact(lines, token, start=0, end=0):
 223     return find_token(lines, token, start, end, True)
 224
 225
 226 def find_tokens(lines, tokens, start=0, end=0, ignorews=False):
 227     """ find_tokens(lines, tokens, start[[, end], ignorews]) -> int
 228
 229     Return the lowest line where one token in tokens is found, and is
 230     the first element, in lines[start, end].
 231
 232     Return -1 on failure."""
 233
 234     if end == 0 or end > len(lines):
 235         end = len(lines)
 236
 237     for i in range(start, end):
 238         for token in tokens:
 239             if ignorews:
 240                 x = lines[i].split()
 241                 y = token.split()
 242                 if len(x) < len(y):
 243                     continue
 244                 if x[:len(y)] == y:
 245                     return i
 246             else:
 247                 if lines[i].startswith(token):
 248                     return i
 249     return -1
 250
 251
 252 def find_tokens_exact(lines, tokens, start=0, end=0):
 253     return find_tokens(lines, tokens, start, end, True)
 254
 255
 256 def find_substring(lines, sub, start=0, end=0):
 257     """ find_substring(lines, sub[, start[, end]]) -> int
 258
 259     Return the lowest line number `i` in [start, end] where
 260     `sub` is a substring of line[i].
 261
 262     Return -1 on failure."""
 263
 264     if end == 0 or end > len(lines):
 265         end = len(lines)
 266     for i in range(start, end):
 267         if sub in lines[i]:
 268                 return i
 269     return -1
 270
 271
 272 def find_re(lines, rexp, start=0, end=0):
 273     """ find_re(lines, rexp[, start[, end]]) -> int
 274
 275     Return the lowest line number `i` in [start, end] where the regular
 276     expression object `rexp` matches at the beginning of line[i].
 277     Return -1 on failure.
 278
 279     Start your pattern with the wildcard ".*" to find a match anywhere in a
 280     line. Use find_substring() to find a substring anywhere in the lines.
 281     """
 282     if end == 0 or end > len(lines):
 283         end = len(lines)
 284     for i in range(start, end):
 285         if rexp.match(lines[i]):
 286                 return i
 287     return -1
 288
 289
 290 def find_token_backwards(lines, token, start):
 291     """ find_token_backwards(lines, token, start) -> int
 292
 293     Return the highest line where token is found, and is the first
 294     element, in lines[start, end].
 295
 296     Return -1 on failure."""
 297     for i in range(start, -1, -1):
 298         if lines[i].startswith(token):
 299             return i
 300     return -1
 301
 302
 303 def find_tokens_backwards(lines, tokens, start):
 304     """ find_tokens_backwards(lines, token, start) -> int
 305
 306     Return the highest line where token is found, and is the first
 307     element, in lines[end, start].
 308
 309     Return -1 on failure."""
 310     for i in range(start, -1, -1):
 311         line = lines[i]
 312         for token in tokens:
 313             if line.startswith(token):
 314                 return i
 315     return -1
 316
 317
 318 def find_complete_lines(lines, sublines, start=0, end=0):
 319     """Find first occurence of sequence `sublines` in list `lines`.
 320     Return index of first line or -1 on failure.
 321
 322     Efficient search for a sub-list in a large list. Works for any values.
 323
 324     >>> find_complete_lines([1, 2, 3, 1, 1, 2], [1, 2])
 325     0
 326
 327     The `start` and `end` arguments work similar to list.index()
 328
 329     >>> find_complete_lines([1, 2, 3, 1, 1 ,2], [1, 2], start=1)
 330     4
 331     >>> find_complete_lines([1, 2, 3, 1, 1 ,2], [1, 2], start=1, end=4)
 332     -1
 333
 334     The return value can be used to substitute the sub-list.
 335     Take care to check before use:
 336
 337     >>> l = [1, 1, 2]
 338     >>> s = find_complete_lines(l, [1, 2])
 339     >>> if s != -1:
 340     ...     l[s:s+2] = [3]; l
 341     [1, 3]
 342
 343     See also del_complete_lines().
 344     """
 345     if not sublines:
 346         return start
 347     end = end or len(lines)
 348     N = len(sublines)
 349     try:
 350         while True:
 351             for j, value in enumerate(sublines):
 352                 i = lines.index(value, start, end)
 353                 if j and i != start:
 354                     start = i-j
 355                     break
 356                 start = i + 1
 357             else:
 358                 return i +1 - N
 359     except ValueError: # `sublines` not found
 360         return -1
 361
 362
 363 def find_across_lines(lines, sub, start=0, end=0):
 364     sublines = sub.splitlines()
 365     if len(sublines) > 2:
 366         # at least 3 lines: the middle one(s) are complete -> use index search
 367         i = find_complete_lines(lines, sublines[1:-1], start+1, end-1)
 368         if i < start+1:
 369             return -1
 370         try:
 371             if (lines[i-1].endswith(sublines[0]) and
 372                 lines[i+len(sublines)].startswith(sublines[-1])):
 373                 return i-1
 374         except IndexError:
 375             pass
 376     elif len(sublines) > 1:
 377         # last subline must start a line
 378         i = find_token(lines, sublines[-1], start, end)
 379         if i < start + 1:
 380             return -1
 381         if lines[i-1].endswith(sublines[0]):
 382             return i-1
 383     else: # no line-break, may be in the middle of a line
 384         if end == 0 or end > len(lines):
 385             end = len(lines)
 386         for i in range(start, end):
 387             if sub in lines[i]:
 388                 return i
 389     return -1
 390
 391
 392 def get_value(lines, token, start=0, end=0, default="", delete=False):
 393     """Find `token` in `lines` and return part of line that follows it.
 394
 395     Find the next line that looks like:
 396       token followed by other stuff
 397
 398     If `delete` is True, delete the line (if found).
 399
 400     Return "followed by other stuff" with leading and trailing
 401     whitespace removed.
 402     """
 403     i = find_token_exact(lines, token, start, end)
 404     if i == -1:
 405         return default
 406     # TODO: establish desired behaviour, eventually change to
 407     #  return lines.pop(i)[len(token):].strip() # or default
 408     # see test_parser_tools.py
 409     l = lines[i].split(None, 1)
 410     if delete:
 411         del(lines[i])
 412     if len(l) > 1:
 413         return l[1].strip()
 414     return default
 415
 416
 417 def get_quoted_value(lines, token, start=0, end=0, default="", delete=False):
 418     """ get_quoted_value(lines, token, start[[, end], default]) -> string
 419
 420     Find the next line that looks like:
 421       token "followed by other stuff"
 422     Returns "followed by other stuff" with leading and trailing
 423     whitespace and quotes removed. If there are no quotes, that is OK too.
 424     So use get_value to preserve possible quotes, this one to remove them,
 425     if they are there.
 426     Note that we will NOT strip quotes from default!
 427     """
 428     val = get_value(lines, token, start, end, "", delete)
 429     if not val:
 430       return default
 431     return val.strip('"')
 432
 433
 434 bool_values = {"true": True, "1": True,
 435                "false": False, "0": False}
 436
 437 def get_bool_value(lines, token, start=0, end=0, default=None, delete=False):
 438     """ get_bool_value(lines, token, start[[, end], default]) -> string
 439
 440     Find the next line that looks like:
 441       `token` <bool_value>
 442
 443     Return True if <bool_value> is 1 or "true", False if <bool_value>
 444     is 0 or "false", else `default`.
 445     """
 446     val = get_quoted_value(lines, token, start, end, default, delete)
 447     return bool_values.get(val, default)
 448
 449
 450 def set_bool_value(lines, token, value, start=0, end=0):
 451     """Find `token` in `lines` and set to bool(`value`).
 452
 453     Return previous value. Raise `ValueError` if `token` is not in lines.
 454
 455     Cf. find_token(), get_bool_value().
 456     """
 457     i = find_token(lines, token, start, end)
 458     if i == -1:
 459         raise ValueError
 460     oldvalue = get_bool_value(lines, token, i, i+1)
 461     if oldvalue is value:
 462         return oldvalue
 463     # set to new value
 464     if get_quoted_value(lines, token, i, i+1) in ('0', '1'):
 465         lines[i] = "%s %d" % (token, value)
 466     else:
 467         lines[i] = "%s %s" % (token, str(value).lower())
 468
 469     return oldvalue
 470
 471
 472 def get_option_value(line, option):
 473     rx = option + r'\s*=\s*"([^"]+)"'
 474     rx = re.compile(rx)
 475     m = rx.search(line)
 476     if not m:
 477       return ""
 478     return m.group(1)
 479
 480
 481 def set_option_value(line, option, value):
 482     rx = '(' + option + r'\s*=\s*")[^"]+"'
 483     rx = re.compile(rx)
 484     m = rx.search(line)
 485     if not m:
 486         return line
 487     return re.sub(rx, r'\g<1>' + value + '"', line)
 488
 489
 490 def del_token(lines, token, start=0, end=0):
 491     """ del_token(lines, token, start, end) -> int
 492
 493     Find the first line in lines where token is the first element
 494     and delete that line. Returns True if we deleted a line, False
 495     if we did not."""
 496
 497     k = find_token_exact(lines, token, start, end)
 498     if k == -1:
 499         return False
 500     del lines[k]
 501     return True
 502
 503 def del_complete_lines(lines, sublines, start=0, end=0):
 504     """Delete first occurence of `sublines` in list `lines`.
 505
 506     Efficient deletion of a sub-list in a list. Works for any values.
 507     The `start` and `end` arguments work similar to list.index()
 508
 509     Returns True if a deletion was done and False if not.
 510
 511     >>> l = [1, 0, 1, 1, 1, 2]
 512     >>> del_complete_lines(l, [0, 1, 1])
 513     True
 514     >>> l
 515     [1, 1, 2]
 516     """
 517     i = find_complete_lines(lines, sublines, start, end)
 518     if i == -1:
 519         return False
 520     del(lines[i:i+len(sublines)])
 521     return True
 522
 523
 524 def del_value(lines, token, start=0, end=0, default=None):
 525     """
 526     Find the next line that looks like:
 527       token followed by other stuff
 528     Delete that line and return "followed by other stuff"
 529     with leading and trailing whitespace removed.
 530
 531     If token is not found, return `default`.
 532     """
 533     i = find_token_exact(lines, token, start, end)
 534     if i == -1:
 535         return default
 536     return lines.pop(i)[len(token):].strip()
 537
 538
 539 def find_beginning_of(lines, i, start_token, end_token):
 540     count = 1
 541     while i > 0:
 542         i = find_tokens_backwards(lines, [start_token, end_token], i-1)
 543         if i == -1:
 544             return -1
 545         if lines[i].startswith(end_token):
 546             count = count+1
 547         else:
 548             count = count-1
 549         if count == 0:
 550             return i
 551     return -1
 552
 553
 554 def find_end_of(lines, i, start_token, end_token):
 555     count = 1
 556     n = len(lines)
 557     while i < n:
 558         i = find_tokens(lines, [end_token, start_token], i+1)
 559         if i == -1:
 560             return -1
 561         if lines[i].startswith(start_token):
 562             count = count+1
 563         else:
 564             count = count-1
 565         if count == 0:
 566             return i
 567     return -1
 568
 569
 570 def find_nonempty_line(lines, start=0, end=0):
 571     if end == 0:
 572         end = len(lines)
 573     for i in range(start, end):
 574         if lines[i].strip():
 575             return i
 576     return -1
 577
 578
 579 def find_end_of_inset(lines, i):
 580     " Find end of inset, where lines[i] is included."
 581     return find_end_of(lines, i, "\\begin_inset", "\\end_inset")
 582
 583
 584 def find_end_of_layout(lines, i):
 585     " Find end of layout, where lines[i] is included."
 586     return find_end_of(lines, i, "\\begin_layout", "\\end_layout")
 587
 588
 589 def is_in_inset(lines, i, inset, default=(-1,-1)):
 590     """
 591     Check if line i is in an inset of the given type.
 592     If so, return starting and ending lines, otherwise `default`.
 593     Example:
 594       is_in_inset(document.body, i, "\\begin_inset Tabular")
 595     returns (-1,-1) if `i` is not within a "Tabular" inset (i.e. a table).
 596     If it is, then it returns the line on which the table begins and the one
 597     on which it ends.
 598     Note that this pair will evaulate to boolean True, so (with the optional
 599     default value set to False)
 600       if is_in_inset(..., default=False):
 601     will do what you expect.
 602     """
 603     start = find_token_backwards(lines, inset, i)
 604     if start == -1:
 605       return default
 606     end = find_end_of_inset(lines, start)
 607     if end < i: # this includes the notfound case.
 608       return default
 609     return (start, end)
 610
 611
 612 def get_containing_inset(lines, i):
 613   '''
 614   Finds out what kind of inset line i is within. Returns a
 615   list containing (i) what follows \\begin_inset on the line
 616   on which the inset begins, plus the starting and ending line.
 617   Returns False on any kind of error or if it isn't in an inset.
 618   '''
 619   j = i
 620   while True:
 621       stins = find_token_backwards(lines, "\\begin_inset", j)
 622       if stins == -1:
 623           return False
 624       endins = find_end_of_inset(lines, stins)
 625       if endins > j:
 626           break
 627       j = stins - 1
 628
 629   if endins < i:
 630       return False
 631
 632   inset = get_value(lines, "\\begin_inset", stins)
 633   if inset == "":
 634       # shouldn't happen
 635       return False
 636   return (inset, stins, endins)
 637
 638
 639 def get_containing_layout(lines, i):
 640   '''
 641   Find out what kind of layout line `i` is within.
 642   Return a tuple
 643     (layoutname, layoutstart, layoutend, startofcontent)
 644   containing
 645     * layout style/name,
 646     * start line number,
 647     * end line number, and
 648     * number of first paragraph line (after all params).
 649   Return `False` on any kind of error.
 650   '''
 651   j = i
 652   while True:
 653       stlay = find_token_backwards(lines, "\\begin_layout", j)
 654       if stlay == -1:
 655           return False
 656       endlay = find_end_of_layout(lines, stlay)
 657       if endlay > i:
 658           break
 659       j = stlay - 1
 660
 661   if endlay < i:
 662       return False
 663
 664   layoutname = get_value(lines, "\\begin_layout", stlay)
 665   if layoutname == "": # layout style missing
 666       # TODO: What shall we do in this case?
 667       pass
 668       # layoutname == "Standard" # use same fallback as the LyX parser:
 669       # raise ValueError("Missing layout name on line %d"%stlay) # diagnosis
 670       # return False # generic error response
 671   par_params = ["\\noindent", "\\indent", "\\indent-toggle", "\\leftindent",
 672                 "\\start_of_appendix", "\\paragraph_spacing", "\\align",
 673                 "\\labelwidthstring"]
 674   stpar = stlay
 675   while True:
 676       stpar += 1
 677       if lines[stpar].split(' ', 1)[0] not in par_params:
 678           break
 679   return (layoutname, stlay, endlay, stpar)
 680
 681
 682 def count_pars_in_inset(lines, i):
 683   '''
 684   Counts the paragraphs within this inset
 685   '''
 686   ins = get_containing_inset(lines, i)
 687   if ins == -1:
 688       return -1
 689   pars = 0
 690   for j in range(ins[1], ins[2]):
 691       m = re.match(r'\\begin_layout (.*)', lines[j])
 692       if m and get_containing_inset(lines, j)[1] == ins[1]:
 693           pars += 1
 694
 695   return pars
 696
 697
 698 def find_end_of_sequence(lines, i):
 699   '''
 700   Returns the end of a sequence of identical layouts.
 701   '''
 702   lay = get_containing_layout(lines, i)
 703   if lay == False:
 704       return -1
 705   layout = lay[0]
 706   endlay = lay[2]
 707   i = endlay
 708   while True:
 709       m = re.match(r'\\begin_layout (.*)', lines[i])
 710       if m and m.group(1) != layout:
 711           return endlay
 712       elif lines[i] == "\\begin_deeper":
 713           j = find_end_of(lines, i, "\\begin_deeper", "\\end_deeper")
 714           if j != -1:
 715               i = j
 716               endlay = j
 717               continue
 718       if m and m.group(1) == layout:
 719           endlay = find_end_of_layout(lines, i)
 720           i = endlay
 721           continue
 722       if i == len(lines) - 1:
 723           break
 724       i = i + 1
 725
 726   return endlay