lib/lyx2lyx/parser_tools.py

   1 # This file is part of lyx2lyx
   2 # -*- coding: utf-8 -*-
   3 # Copyright (C) 2002-2011 Dekel Tsur <dekel@lyx.org>,
   4 # José Matos <jamatos@lyx.org>, Richard Heck <rgheck@comcast.net>
   5 #
   6 # This program is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU General Public License
   8 # as published by the Free Software Foundation; either version 2
   9 # of the License, or (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program; if not, write to the Free Software
  18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  19
  20
  21 """
  22 This module offers several free functions to help parse lines.
  23 More documentation is below, but here is a quick guide to what
  24 they do. Optional arguments are marked by brackets.
  25
  26 find_token(lines, token[, start[, end[, ignorews]]]):
  27   Returns the first line i, start <= i < end, on which
  28   token is found at the beginning. Returns -1 if not
  29   found.
  30   If ignorews is (given and) True, then differences
  31   in whitespace do not count, except that there must be no
  32   extra whitespace following token itself.
  33
  34 find_token_exact(lines, token[, start[, end]]):
  35   As find_token, but with ignorews set to True.
  36
  37 find_tokens(lines, tokens[, start[, end[, ignorews]]]):
  38   Returns the first line i, start <= i < end, on which
  39   one of the tokens in tokens is found at the beginning.
  40   Returns -1 if not found.
  41   If ignorews is (given and) True, then differences
  42   in whitespace do not count, except that there must be no
  43   extra whitespace following token itself.
  44
  45 find_tokens_exact(lines, token[, start[, end]]):
  46   As find_tokens, but with ignorews True.
  47
  48 find_token_backwards(lines, token, start):
  49 find_tokens_backwards(lines, tokens, start):
  50   As before, but look backwards.
  51
  52 find_substring(lines, sub[, start[, end]]) -> int
  53   As find_token, but sub may be anywhere in the line.
  54
  55 find_re(lines, rexp, start[, end]):
  56   As find_token, but rexp is a regular expression object,
  57   so it has to be passed as e.g.: re.compile(r'...').
  58
  59 get_value(lines, token[, start[, end[, default[, delete]]]]):
  60   Similar to find_token, but it returns what follows the
  61   token on the found line. Example:
  62     get_value(document.header, "\\use_xetex", 0)
  63   will find a line like:
  64     \\use_xetex true
  65   and, in that case, return "true". (Note that whitespace
  66   is stripped.) The final argument, default, defaults to "",
  67   and is what is returned if we do not find anything. So you
  68   can use that to set a default.
  69
  70 get_quoted_value(lines, token[, start[, end[, default[, delete]]]]):
  71   Similar to get_value, but it will strip quotes off the
  72   value, if they are present. So use this one for cases
  73   where the value is normally quoted.
  74
  75 get_option_value(line, option):
  76   This assumes we have a line with something like:
  77       option="value"
  78   and returns value. Returns "" if not found.
  79
  80 get_bool_value(lines, token[, start[, end[, default, delete]]]]):
  81   Like get_value, but returns a boolean.
  82
  83 set_bool_value(lines, token, value[, start[, end]]):
  84   Find `token` in `lines[start:end]` and set to boolean value bool(`value`).
  85   Return old value. Raise ValueError if token is not in lines.
  86
  87 del_token(lines, token[, start[, end]]):
  88   Like find_token, but deletes the line if it finds one.
  89   Returns True if a line got deleted, otherwise False.
  90
  91 find_beginning_of(lines, i, start_token, end_token):
  92   Here, start_token and end_token are meant to be a matching
  93   pair, like "\\begin_layout" and "\\end_layout". We look for
  94   the start_token that pairs with the end_token that occurs
  95   on or after line i. Returns -1 if not found.
  96   So, in the layout case, this would find the \\begin_layout
  97   for the layout line i is in.
  98   Example:
  99     ec = find_token(document.body, "</cell", i)
 100     bc = find_beginning_of(document.body, ec, \
 101         "<cell", "</cell")
 102   Now, assuming no -1s, bc-ec wraps the cell for line i.
 103
 104 find_end_of(lines, i, start_token, end_token):
 105   Like find_beginning_of, but looking for the matching
 106   end_token. This might look like:
 107     bc = find_token(document.body, "<cell", i)
 108     ec = find_end_of(document.body, bc,  "<cell", "</cell")
 109   Now, assuming no -1s, bc-ec wrap the next cell.
 110
 111 find_end_of_inset(lines, i):
 112   Specialization of find_end_of for insets.
 113
 114 find_end_of_layout(lines, i):
 115   Specialization of find_end_of for layouts.
 116
 117 find_end_of_sequence(lines, i):
 118   Find the end of the sequence of layouts of the same kind.
 119   Considers nesting. If the last paragraph in sequence is nested,
 120   the position of the last \end_deeper is returned, else
 121   the position of the last \end_layout.
 122
 123 is_in_inset(lines, i, inset, default=(-1,-1)):
 124   Check if line i is in an inset of the given type.
 125   If so, returns starting and ending lines. Otherwise,
 126   return default.
 127   Example:
 128     is_in_inset(document.body, i, "\\begin_inset Tabular")
 129   returns (-1,-1) unless i is within a table. If it is, then
 130   it returns the line on which the table begins and the one
 131   on which it ends. Note that this pair will evaluate to
 132   boolean True, so
 133     if is_in_inset(..., default=False):
 134   will do what you expect.
 135
 136 get_containing_inset(lines, i):
 137   Finds out what kind of inset line i is within. Returns a
 138   list containing what follows \begin_inset on the line
 139   on which the inset begins, plus the starting and ending line.
 140   Returns False on any kind of error or if it isn't in an inset.
 141   So get_containing_inset(document.body, i) might return:
 142     ("CommandInset ref", 300, 306)
 143   if i is within an InsetRef beginning on line 300 and ending
 144   on line 306.
 145
 146 get_containing_layout(lines, i):
 147   As get_containing_inset, but for layout. Additionally returns the
 148   position of real paragraph start (after par params) as 4th value.
 149
 150 find_nonempty_line(lines[, start[, end]]):
 151   Finds the next non-empty line.
 152
 153 check_token(line, token):
 154   Does line begin with token?
 155
 156 is_nonempty_line(line):
 157   Does line contain something besides whitespace?
 158
 159 count_pars_in_inset(lines, i):
 160   Counts the paragraphs inside an inset.
 161
 162 """
 163
 164 import re
 165
 166 # Utilities for one line
 167 def check_token(line, token):
 168     """ check_token(line, token) -> bool
 169
 170     Return True if token is present in line and is the first element
 171     else returns False.
 172
 173     Deprecated. Use line.startswith(token).
 174     """
 175     return line.startswith(token)
 176
 177
 178 def is_nonempty_line(line):
 179     """ is_nonempty_line(line) -> bool
 180
 181     Return False if line is either empty or it has only whitespaces,
 182     else return True."""
 183     return bool(line.strip())
 184
 185
 186 # Utilities for a list of lines
 187 def find_token(lines, token, start=0, end=0, ignorews=False):
 188     """ find_token(lines, token, start[[, end], ignorews]) -> int
 189
 190     Return the lowest line where token is found, and is the first
 191     element, in lines[start, end].
 192
 193     If ignorews is True (default is False), then differences in
 194     whitespace are ignored, but there must be whitespace following
 195     token itself.
 196
 197     Use find_substring(lines, sub) to find a substring anywhere in `lines`.
 198
 199     Return -1 on failure."""
 200
 201     if end == 0 or end > len(lines):
 202         end = len(lines)
 203     if ignorews:
 204         y = token.split()
 205     for i in range(start, end):
 206         if ignorews:
 207             x = lines[i].split()
 208             if len(x) < len(y):
 209                 continue
 210             if x[:len(y)] == y:
 211                 return i
 212         else:
 213             if lines[i].startswith(token):
 214                 return i
 215     return -1
 216
 217
 218 def find_token_exact(lines, token, start=0, end=0):
 219     return find_token(lines, token, start, end, True)
 220
 221
 222 def find_tokens(lines, tokens, start=0, end=0, ignorews=False):
 223     """ find_tokens(lines, tokens, start[[, end], ignorews]) -> int
 224
 225     Return the lowest line where one token in tokens is found, and is
 226     the first element, in lines[start, end].
 227
 228     Return -1 on failure."""
 229     if end == 0 or end > len(lines):
 230         end = len(lines)
 231
 232     for i in range(start, end):
 233         for token in tokens:
 234             if ignorews:
 235                 x = lines[i].split()
 236                 y = token.split()
 237                 if len(x) < len(y):
 238                     continue
 239                 if x[:len(y)] == y:
 240                     return i
 241             else:
 242                 if lines[i].startswith(token):
 243                     return i
 244     return -1
 245
 246
 247 def find_tokens_exact(lines, tokens, start=0, end=0):
 248     return find_tokens(lines, tokens, start, end, True)
 249
 250
 251 def find_substring(lines, sub, start=0, end=0):
 252     """ find_substring(lines, sub[, start[, end]]) -> int
 253
 254     Return the lowest line number `i` in [start, end] where
 255     `sub` is a substring of line[i].
 256
 257     Return -1 on failure."""
 258
 259     if end == 0 or end > len(lines):
 260         end = len(lines)
 261     for i in range(start, end):
 262         if sub in lines[i]:
 263                 return i
 264     return -1
 265
 266
 267 def find_re(lines, rexp, start=0, end=0):
 268     """ find_re(lines, rexp[, start[, end]]) -> int
 269
 270     Return the lowest line number `i` in [start, end] where the regular
 271     expression object `rexp` matches at the beginning of line[i].
 272     Return -1 on failure.
 273
 274     Start your pattern with the wildcard ".*" to find a match anywhere in a
 275     line. Use find_substring() to find a substring anywhere in the lines.
 276     """
 277     if end == 0 or end > len(lines):
 278         end = len(lines)
 279     for i in range(start, end):
 280         if rexp.match(lines[i]):
 281                 return i
 282     return -1
 283
 284
 285 def find_token_backwards(lines, token, start):
 286     """ find_token_backwards(lines, token, start) -> int
 287
 288     Return the highest line where token is found, and is the first
 289     element, in lines[start, end].
 290
 291     Return -1 on failure."""
 292     for i in range(start, -1, -1):
 293         if lines[i].startswith(token):
 294             return i
 295     return -1
 296
 297
 298 def find_tokens_backwards(lines, tokens, start):
 299     """ find_tokens_backwards(lines, token, start) -> int
 300
 301     Return the highest line where token is found, and is the first
 302     element, in lines[end, start].
 303
 304     Return -1 on failure."""
 305     for i in range(start, -1, -1):
 306         line = lines[i]
 307         for token in tokens:
 308             if line.startswith(token):
 309                 return i
 310     return -1
 311
 312
def find_complete_lines(lines, sublines, start=0, end=0):
    """Find first occurrence of sequence `sublines` in list `lines`.
    Return index of first line or -1 on failure.

    Efficient search for a sub-list in a large list. Works for any values.

    >>> find_complete_lines([1, 2, 3, 1, 1, 2], [1, 2])
    0

    The `start` and `end` arguments work similar to list.index()

    >>> find_complete_lines([1, 2, 3, 1, 1 ,2], [1, 2], start=1)
    4
    >>> find_complete_lines([1, 2, 3, 1, 1 ,2], [1, 2], start=1, end=4)
    -1

    The return value can be used to substitute the sub-list.
    Take care to check before use:

    >>> l = [1, 1, 2]
    >>> s = find_complete_lines(l, [1, 2])
    >>> if s != -1:
    ...     l[s:s+2] = [3]; l
    [1, 3]

    An empty `sublines` matches immediately and `start` is returned.
    An `end` of 0 means "up to the end of `lines`".

    See also del_complete_lines().
    """
    if not sublines:
        return start
    end = end or len(lines)
    N = len(sublines)
    try:
        # Each pass of the `while` loop tries one candidate position.
        while True:
            for j, value in enumerate(sublines):
                # Next occurrence of the j-th subline at or after `start`;
                # raises ValueError when there is none left before `end`.
                i = lines.index(value, start, end)
                if j and i != start:
                    # A later subline was not on the directly following
                    # line: give up this candidate and retry from the
                    # position where the sequence would have to begin for
                    # this occurrence to be its j-th element.
                    start = i-j
                    break
                # Matched so far; the next subline is expected right after.
                start = i + 1
            else:
                # No `break`: all sublines were found on consecutive lines
                # and `i` is the index of the last one.
                return i +1 - N
    except ValueError: # `sublines` not found
        return -1
 356
 357
 358 def find_across_lines(lines, sub, start=0, end=0):
 359     sublines = sub.splitlines()
 360     if len(sublines) > 2:
 361         # at least 3 lines: the middle one(s) are complete -> use index search
 362         i = find_complete_lines(lines, sublines[1:-1], start+1, end-1)
 363         if i < start+1:
 364             return -1
 365         try:
 366             if (lines[i-1].endswith(sublines[0]) and
 367                 lines[i+len(sublines)].startswith(sublines[-1])):
 368                 return i-1
 369         except IndexError:
 370             pass
 371     elif len(sublines) > 1:
 372         # last subline must start a line
 373         i = find_token(lines, sublines[-1], start, end)
 374         if i < start + 1:
 375             return -1
 376         if lines[i-1].endswith(sublines[0]):
 377             return i-1
 378     else: # no line-break, may be in the middle of a line
 379         if end == 0 or end > len(lines):
 380             end = len(lines)
 381         for i in range(start, end):
 382             if sub in lines[i]:
 383                 return i
 384     return -1
 385
 386
 387 def get_value(lines, token, start=0, end=0, default="", delete=False):
 388     """Find `token` in `lines` and return part of line that follows it.
 389
 390     Find the next line that looks like:
 391       token followed by other stuff
 392
 393     If `delete` is True, delete the line (if found).
 394
 395     Return "followed by other stuff" with leading and trailing
 396     whitespace removed.
 397     """
 398     i = find_token_exact(lines, token, start, end)
 399     if i == -1:
 400         return default
 401     # TODO: establish desired behaviour, eventually change to
 402     #  return lines.pop(i)[len(token):].strip() # or default
 403     # see test_parser_tools.py
 404     l = lines[i].split(None, 1)
 405     if delete:
 406         del(lines[i])
 407     if len(l) > 1:
 408         return l[1].strip()
 409     return default
 410
 411
 412 def get_quoted_value(lines, token, start=0, end=0, default="", delete=False):
 413     """ get_quoted_value(lines, token, start[[, end], default]) -> string
 414
 415     Find the next line that looks like:
 416       token "followed by other stuff"
 417     Returns "followed by other stuff" with leading and trailing
 418     whitespace and quotes removed. If there are no quotes, that is OK too.
 419     So use get_value to preserve possible quotes, this one to remove them,
 420     if they are there.
 421     Note that we will NOT strip quotes from default!
 422     """
 423     val = get_value(lines, token, start, end, "", delete)
 424     if not val:
 425       return default
 426     return val.strip('"')
 427
# Spellings of boolean values, keyed by the boolean they stand for.
# Index 0 of each tuple is the word form, index 1 the numeric form.
bool_values = {True:  ("true", "1"),
               False: ("false", "0")}
 430
 431 def get_bool_value(lines, token, start=0, end=0, default=None, delete=False):
 432     """ get_bool_value(lines, token, start[[, end], default]) -> string
 433
 434     Find the next line that looks like:
 435       token <bool_value>
 436
 437     Return True if <bool_value> is 1 or "true", False if bool_value
 438     is 0 or "false", else `default`.
 439     """
 440
 441     val = get_quoted_value(lines, token, start, end, default, delete)
 442     if val in bool_values[True]:
 443         return True
 444     if val in bool_values[False]:
 445         return False
 446     return default
 447
 448
 449 def set_bool_value(lines, token, value, start=0, end=0):
 450     """Find `token` in `lines` and set to bool(`value`).
 451
 452     Return previous value. Raise `ValueError` if `token` is not in lines.
 453
 454     Cf. find_token(), get_bool_value().
 455     """
 456     i = find_token(lines, token, start, end)
 457     if i == -1:
 458         raise ValueError
 459     oldvalue = get_bool_value(lines, token, i, i+1)
 460     if oldvalue is value:
 461         return oldvalue
 462     # Use 0/1 or true/false?
 463     if get_quoted_value(lines, token, i, i+1) in ('0', '1'):
 464         value_string = bool_values[value][1]
 465     else:
 466         value_string = bool_values[value][0]
 467     # set to new value
 468     lines[i] = "%s %s" % (token, value_string)
 469
 470     return oldvalue
 471
 472
 473 def get_option_value(line, option):
 474     rx = option + '\s*=\s*"([^"]+)"'
 475     rx = re.compile(rx)
 476     m = rx.search(line)
 477     if not m:
 478       return ""
 479     return m.group(1)
 480
 481
 482 def set_option_value(line, option, value):
 483     rx = '(' + option + '\s*=\s*")[^"]+"'
 484     rx = re.compile(rx)
 485     m = rx.search(line)
 486     if not m:
 487         return line
 488     return re.sub(rx, '\g<1>' + value + '"', line)
 489
 490
 491 def del_token(lines, token, start=0, end=0):
 492     """ del_token(lines, token, start, end) -> int
 493
 494     Find the first line in lines where token is the first element
 495     and delete that line. Returns True if we deleted a line, False
 496     if we did not."""
 497
 498     k = find_token_exact(lines, token, start, end)
 499     if k == -1:
 500         return False
 501     del lines[k]
 502     return True
 503
 504 def del_complete_lines(lines, sublines, start=0, end=0):
 505     """Delete first occurence of `sublines` in list `lines`.
 506
 507     Efficient deletion of a sub-list in a list. Works for any values.
 508     The `start` and `end` arguments work similar to list.index()
 509
 510     Returns True if a deletion was done and False if not.
 511
 512     >>> l = [1, 0, 1, 1, 1, 2]
 513     >>> del_complete_lines(l, [0, 1, 1])
 514     True
 515     >>> l
 516     [1, 1, 2]
 517     """
 518     i = find_complete_lines(lines, sublines, start, end)
 519     if i == -1:
 520         return False
 521     del(lines[i:i+len(sublines)])
 522     return True
 523
 524
 525 def del_value(lines, token, start=0, end=0, default=None):
 526     """
 527     Find the next line that looks like:
 528       token followed by other stuff
 529     Delete that line and return "followed by other stuff"
 530     with leading and trailing whitespace removed.
 531
 532     If token is not found, return `default`.
 533     """
 534     i = find_token_exact(lines, token, start, end)
 535     if i == -1:
 536         return default
 537     return lines.pop(i)[len(token):].strip()
 538
 539
 540 def find_beginning_of(lines, i, start_token, end_token):
 541     count = 1
 542     while i > 0:
 543         i = find_tokens_backwards(lines, [start_token, end_token], i-1)
 544         if i == -1:
 545             return -1
 546         if lines[i].startswith(end_token):
 547             count = count+1
 548         else:
 549             count = count-1
 550         if count == 0:
 551             return i
 552     return -1
 553
 554
 555 def find_end_of(lines, i, start_token, end_token):
 556     count = 1
 557     n = len(lines)
 558     while i < n:
 559         i = find_tokens(lines, [end_token, start_token], i+1)
 560         if i == -1:
 561             return -1
 562         if lines[i].startswith(start_token):
 563             count = count+1
 564         else:
 565             count = count-1
 566         if count == 0:
 567             return i
 568     return -1
 569
 570
 571 def find_nonempty_line(lines, start=0, end=0):
 572     if end == 0:
 573         end = len(lines)
 574     for i in range(start, end):
 575         if lines[i].strip():
 576             return i
 577     return -1
 578
 579
 580 def find_end_of_inset(lines, i):
 581     " Find end of inset, where lines[i] is included."
 582     return find_end_of(lines, i, "\\begin_inset", "\\end_inset")
 583
 584
 585 def find_end_of_layout(lines, i):
 586     " Find end of layout, where lines[i] is included."
 587     return find_end_of(lines, i, "\\begin_layout", "\\end_layout")
 588
 589
 590 def is_in_inset(lines, i, inset, default=(-1,-1)):
 591     """
 592     Check if line i is in an inset of the given type.
 593     If so, return starting and ending lines, otherwise `default`.
 594     Example:
 595       is_in_inset(document.body, i, "\\begin_inset Tabular")
 596     returns (-1,-1) if `i` is not within a "Tabular" inset (i.e. a table).
 597     If it is, then it returns the line on which the table begins and the one
 598     on which it ends.
 599     Note that this pair will evaulate to boolean True, so (with the optional
 600     default value set to False)
 601       if is_in_inset(..., default=False):
 602     will do what you expect.
 603     """
 604     start = find_token_backwards(lines, inset, i)
 605     if start == -1:
 606       return default
 607     end = find_end_of_inset(lines, start)
 608     if end < i: # this includes the notfound case.
 609       return default
 610     return (start, end)
 611
 612
 613 def get_containing_inset(lines, i):
 614   '''
 615   Finds out what kind of inset line i is within. Returns a
 616   list containing (i) what follows \begin_inset on the line
 617   on which the inset begins, plus the starting and ending line.
 618   Returns False on any kind of error or if it isn't in an inset.
 619   '''
 620   j = i
 621   while True:
 622       stins = find_token_backwards(lines, "\\begin_inset", j)
 623       if stins == -1:
 624           return False
 625       endins = find_end_of_inset(lines, stins)
 626       if endins > j:
 627           break
 628       j = stins - 1
 629
 630   if endins < i:
 631       return False
 632
 633   inset = get_value(lines, "\\begin_inset", stins)
 634   if inset == "":
 635       # shouldn't happen
 636       return False
 637   return (inset, stins, endins)
 638
 639
 640 def get_containing_layout(lines, i):
 641   '''
 642   Find out what kind of layout line `i` is within.
 643   Return a tuple
 644     (layoutname, layoutstart, layoutend, startofcontent)
 645   containing
 646     * layout style/name,
 647     * start line number,
 648     * end line number, and
 649     * number of first paragraph line (after all params).
 650   Return `False` on any kind of error.
 651   '''
 652   j = i
 653   while True:
 654       stlay = find_token_backwards(lines, "\\begin_layout", j)
 655       if stlay == -1:
 656           return False
 657       endlay = find_end_of_layout(lines, stlay)
 658       if endlay > i:
 659           break
 660       j = stlay - 1
 661
 662   if endlay < i:
 663       return False
 664
 665   layoutname = get_value(lines, "\\begin_layout", stlay)
 666   if layoutname == "": # layout style missing
 667       # TODO: What shall we do in this case?
 668       pass
 669       # layoutname == "Standard" # use same fallback as the LyX parser:
 670       # raise ValueError("Missing layout name on line %d"%stlay) # diagnosis
 671       # return False # generic error response
 672   par_params = ["\\noindent", "\\indent", "\\indent-toggle", "\\leftindent",
 673                 "\\start_of_appendix", "\\paragraph_spacing", "\\align",
 674                 "\\labelwidthstring"]
 675   stpar = stlay
 676   while True:
 677       stpar += 1
 678       if lines[stpar].split(' ', 1)[0] not in par_params:
 679           break
 680   return (layoutname, stlay, endlay, stpar)
 681
 682
 683 def count_pars_in_inset(lines, i):
 684   '''
 685   Counts the paragraphs within this inset
 686   '''
 687   ins = get_containing_inset(lines, i)
 688   if ins == -1:
 689       return -1
 690   pars = 0
 691   for j in range(ins[1], ins[2]):
 692       m = re.match(r'\\begin_layout (.*)', lines[j])
 693       if m and get_containing_inset(lines, j)[0] == ins[0]:
 694           pars += 1
 695
 696   return pars
 697
 698
def find_end_of_sequence(lines, i):
  '''
  Returns the end of a sequence of identical layouts.

  The layout containing line `i` is looked up with
  get_containing_layout(); -1 is returned if that fails.
  Starting at the end of that layout, the following lines are scanned.
  Layouts with the same name and begin_deeper ... end_deeper blocks
  extend the sequence; a layout with a different name, or the last line
  of `lines`, ends the scan.
  The returned line number is the end of the last layout or deeper block
  that was accepted.
  '''
  lay = get_containing_layout(lines, i)
  if lay == False:
      return -1
  layout = lay[0]   # name of the layout the sequence consists of
  endlay = lay[2]   # end of the sequence found so far
  i = endlay
  while True:
      m = re.match(r'\\begin_layout (.*)', lines[i])
      if m and m.group(1) != layout:
          # A layout of another kind starts here: the sequence is over.
          return endlay
      elif lines[i] == "\\begin_deeper":
          # Skip the nested block as a whole and count it as part of the
          # sequence (if its end can be found).
          j = find_end_of(lines, i, "\\begin_deeper", "\\end_deeper")
          if j != -1:
              i = j
              endlay = j
              continue
      if m and m.group(1) == layout:
          # Another layout of the same kind: jump to its end.
          endlay = find_end_of_layout(lines, i)
          i = endlay
          continue
      if i == len(lines) - 1:
          # Reached the last line.
          break
      i = i + 1

  return endlay