lib/lyx2lyx/parser_tools.py

   1 # This file is part of lyx2lyx
   2 # -*- coding: utf-8 -*-
   3 # Copyright (C) 2002-2011 Dekel Tsur <dekel@lyx.org>,
   4 # José Matos <jamatos@lyx.org>, Richard Heck <rgheck@comcast.net>
   5 #
   6 # This program is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU General Public License
   8 # as published by the Free Software Foundation; either version 2
   9 # of the License, or (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program; if not, write to the Free Software
  18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  19
  20
  21 '''
  22 This module offers several free functions to help parse lines.
  23 More documentation is below, but here is a quick guide to what
  24 they do. Optional arguments are marked by brackets.
  25
  26 find_token(lines, token, start[, end[, ignorews]]):
  27   Returns the first line i, start <= i < end, on which
  28   token is found at the beginning. Returns -1 if not
  29   found.
  30   If ignorews is (given and) True, then differences
  31   in whitespace do not count, except that there must be no
  32   extra whitespace following token itself.
  33
  34 find_token_exact(lines, token, start[, end]):
  35   As find_token, but with ignorews True.
  36
  37 find_tokens(lines, tokens, start[, end[, ignorews]]):
  38   Returns the first line i, start <= i < end, on which
  39   one of the tokens in tokens is found at the beginning.
  40   Returns -1 if not found.
  41   If ignorews is (given and) True, then differences
  42   in whitespace do not count, except that there must be no
  43   extra whitespace following token itself.
  44
  45 find_tokens_exact(lines, token, start[, end]):
  46   As find_tokens, but with ignorews True.
  47
  48 find_token_backwards(lines, token, start):
  49 find_tokens_backwards(lines, tokens, start):
  50   As before, but look backwards.
  51
  52 find_re(lines, rexp, start[, end]):
  53   As find_token, but rexp is a regular expression object,
  54   so it has to be passed as e.g.: re.compile(r'...').
  55
  56 get_value(lines, token, start[, end[, default]]):
  57   Similar to find_token, but it returns what follows the
  58   token on the found line. Example:
  59     get_value(document.header, "\\use_xetex", 0)
  60   will find a line like:
  61     \\use_xetex true
  62   and, in that case, return "true". (Note that whitespace
  63   is stripped.) The final argument, default, defaults to "",
  64   and is what is returned if we do not find anything. So you
  65   can use that to set a default.
  66
  67 get_quoted_value(lines, token, start[, end[, default]]):
  68   Similar to get_value, but it will strip quotes off the
  69   value, if they are present. So use this one for cases
  70   where the value is normally quoted.
  71
  72 get_option_value(line, option):
  73   This assumes we have a line with something like:
  74       option="value"
  75   and returns value. Returns "" if not found.
  76
  77 del_token(lines, token, start[, end]):
  78   Like find_token, but deletes the line if it finds one.
  79   Returns True if a line got deleted, otherwise False.
  80
  81 find_beginning_of(lines, i, start_token, end_token):
  82   Here, start_token and end_token are meant to be a matching
  83   pair, like "\\begin_layout" and "\\end_layout". We look for
  84   the start_token that pairs with the end_token that occurs
  85   on or after line i. Returns -1 if not found.
  86   So, in the layout case, this would find the \\begin_layout
  87   for the layout line i is in.
  88   Example:
  89     ec = find_token(document.body, "</cell", i)
  90     bc = find_beginning_of(document.body, ec, \
  91         "<cell", "</cell")
  92   Now, assuming no -1s, bc-ec wraps the cell for line i.
  93
  94 find_end_of(lines, i, start_token, end_token):
  95   Like find_beginning_of, but looking for the matching
  96   end_token. This might look like:
  97     bc = find_token(document.body, "<cell", i)
  98     ec = find_end_of(document.body, bc,  "<cell", "</cell")
  99   Now, assuming no -1s, bc-ec wrap the next cell.
 100
 101 find_end_of_inset(lines, i):
 102   Specialization of find_end_of for insets.
 103
 104 find_end_of_layout(lines, i):
 105   Specialization of find_end_of for layouts.
 106
 107 is_in_inset(lines, i, inset):
 108   Checks if line i is in an inset of the given type.
 109   If so, returns starting and ending lines. Otherwise,
 110   returns (-1, -1).
 111   Example:
 112     is_in_inset(document.body, i, "\\begin_inset Tabular")
 113   returns (-1, -1) unless i is within a table. If it is, then
 114   it returns the line on which the table begins and the one
 115   on which it ends. Note that both results are non-empty
 116   tuples and so evaluate to boolean True; compare the result
 117   with (-1, -1) instead of testing it directly with
 118     if is_in_inset(...):
 119
 120 get_containing_inset(lines, i):
 121   Finds out what kind of inset line i is within. Returns a
 122   tuple containing what follows \\begin_inset on the line
 123   on which the inset begins, plus the starting and ending line.
 124   Returns False on any kind of error or if it isn't in an inset.
 125   So get_containing_inset(document.body, i) might return:
 126     ("CommandInset ref", 300, 306)
 127   if i is within an InsetRef beginning on line 300 and ending
 128   on line 306.
 129
 130 get_containing_layout(lines, i):
 131   As get_containing_inset, but for layout. Additionally returns the
 132   position of real paragraph start (after par params) as 4th value.
 133
 134
 135 find_nonempty_line(lines, start[, end]):
 136   Finds the next non-empty line.
 137
 138 check_token(line, token):
 139   Does line begin with token?
 140
 141 is_nonempty_line(line):
 142   Does line contain something besides whitespace?
 143
 144 '''
 145
 146 import re
 147
 148 # Utilities for one line
 149 def check_token(line, token):
 150     """ check_token(line, token) -> bool
 151
 152     Return True if token is present in line and is the first element
 153     else returns False."""
 154
 155     return line[:len(token)] == token
 156
 157
 158 def is_nonempty_line(line):
 159     """ is_nonempty_line(line) -> bool
 160
 161     Return False if line is either empty or it has only whitespaces,
 162     else return True."""
 163     return line != " "*len(line)
 164
 165
 166 # Utilities for a list of lines
 167 def find_token(lines, token, start, end = 0, ignorews = False):
 168     """ find_token(lines, token, start[[, end], ignorews]) -> int
 169
 170     Return the lowest line where token is found, and is the first
 171     element, in lines[start, end].
 172
 173     If ignorews is True (default is False), then differences in
 174     whitespace are ignored, except that there must be no extra
 175     whitespace following token itself.
 176
 177     Return -1 on failure."""
 178
 179     if end == 0 or end > len(lines):
 180         end = len(lines)
 181     m = len(token)
 182     for i in xrange(start, end):
 183         if ignorews:
 184             x = lines[i].split()
 185             y = token.split()
 186             if len(x) < len(y):
 187                 continue
 188             if x[:len(y)] == y:
 189                 return i
 190         else:
 191             if lines[i][:m] == token:
 192                 return i
 193     return -1
 194
 195
 196 def find_token_exact(lines, token, start, end = 0):
 197     return find_token(lines, token, start, end, True)
 198
 199
 200 def find_tokens(lines, tokens, start, end = 0, ignorews = False):
 201     """ find_tokens(lines, tokens, start[[, end], ignorews]) -> int
 202
 203     Return the lowest line where one token in tokens is found, and is
 204     the first element, in lines[start, end].
 205
 206     Return -1 on failure."""
 207     if end == 0 or end > len(lines):
 208         end = len(lines)
 209
 210     for i in xrange(start, end):
 211         for token in tokens:
 212             if ignorews:
 213                 x = lines[i].split()
 214                 y = token.split()
 215                 if len(x) < len(y):
 216                     continue
 217                 if x[:len(y)] == y:
 218                     return i
 219             else:
 220                 if lines[i][:len(token)] == token:
 221                     return i
 222     return -1
 223
 224
 225 def find_tokens_exact(lines, tokens, start, end = 0):
 226     return find_tokens(lines, tokens, start, end, True)
 227
 228
 229 def find_re(lines, rexp, start, end = 0):
 230     """ find_token_re(lines, rexp, start[, end]) -> int
 231
 232     Return the lowest line where rexp, a regular expression, is found
 233     in lines[start, end].
 234
 235     Return -1 on failure."""
 236
 237     if end == 0 or end > len(lines):
 238         end = len(lines)
 239     for i in xrange(start, end):
 240         if rexp.match(lines[i]):
 241                 return i
 242     return -1
 243
 244
 245 def find_token_backwards(lines, token, start):
 246     """ find_token_backwards(lines, token, start) -> int
 247
 248     Return the highest line where token is found, and is the first
 249     element, in lines[start, end].
 250
 251     Return -1 on failure."""
 252     m = len(token)
 253     for i in xrange(start, -1, -1):
 254         line = lines[i]
 255         if line[:m] == token:
 256             return i
 257     return -1
 258
 259
 260 def find_tokens_backwards(lines, tokens, start):
 261     """ find_tokens_backwards(lines, token, start) -> int
 262
 263     Return the highest line where token is found, and is the first
 264     element, in lines[end, start].
 265
 266     Return -1 on failure."""
 267     for i in xrange(start, -1, -1):
 268         line = lines[i]
 269         for token in tokens:
 270             if line[:len(token)] == token:
 271                 return i
 272     return -1
 273
 274
 275 def get_value(lines, token, start, end = 0, default = ""):
 276     """ get_value(lines, token, start[[, end], default]) -> string
 277
 278     Find the next line that looks like:
 279       token followed by other stuff
 280     Returns "followed by other stuff" with leading and trailing
 281     whitespace removed.
 282     """
 283
 284     i = find_token_exact(lines, token, start, end)
 285     if i == -1:
 286         return default
 287     l = lines[i].split(None, 1)
 288     if len(l) > 1:
 289         return l[1].strip()
 290     return default
 291
 292
 293 def get_quoted_value(lines, token, start, end = 0, default = ""):
 294     """ get_quoted_value(lines, token, start[[, end], default]) -> string
 295
 296     Find the next line that looks like:
 297       token "followed by other stuff"
 298     Returns "followed by other stuff" with leading and trailing
 299     whitespace and quotes removed. If there are no quotes, that is OK too.
 300     So use get_value to preserve possible quotes, this one to remove them,
 301     if they are there.
 302     Note that we will NOT strip quotes from default!
 303     """
 304     val = get_value(lines, token, start, end, "")
 305     if not val:
 306       return default
 307     return val.strip('"')
 308
 309
 310 def get_option_value(line, option):
 311     rx = option + '\s*=\s*"([^"]+)"'
 312     rx = re.compile(rx)
 313     m = rx.search(line)
 314     if not m:
 315       return ""
 316     return m.group(1)
 317
 318
 319 def set_option_value(line, option, value):
 320     rx = '(' + option + '\s*=\s*")[^"]+"'
 321     rx = re.compile(rx)
 322     m = rx.search(line)
 323     if not m:
 324         return line
 325     return re.sub(rx, '\g<1>' + value + '"', line)
 326
 327
 328 def del_token(lines, token, start, end = 0):
 329     """ del_token(lines, token, start, end) -> int
 330
 331     Find the first line in lines where token is the first element
 332     and delete that line. Returns True if we deleted a line, False
 333     if we did not."""
 334
 335     k = find_token_exact(lines, token, start, end)
 336     if k == -1:
 337         return False
 338     del lines[k]
 339     return True
 340
 341
 342 def find_beginning_of(lines, i, start_token, end_token):
 343     count = 1
 344     while i > 0:
 345         i = find_tokens_backwards(lines, [start_token, end_token], i-1)
 346         if i == -1:
 347             return -1
 348         if check_token(lines[i], end_token):
 349             count = count+1
 350         else:
 351             count = count-1
 352         if count == 0:
 353             return i
 354     return -1
 355
 356
 357 def find_end_of(lines, i, start_token, end_token):
 358     count = 1
 359     n = len(lines)
 360     while i < n:
 361         i = find_tokens(lines, [end_token, start_token], i+1)
 362         if i == -1:
 363             return -1
 364         if check_token(lines[i], start_token):
 365             count = count+1
 366         else:
 367             count = count-1
 368         if count == 0:
 369             return i
 370     return -1
 371
 372
 373 def find_nonempty_line(lines, start, end = 0):
 374     if end == 0:
 375         end = len(lines)
 376     for i in xrange(start, end):
 377         if is_nonempty_line(lines[i]):
 378             return i
 379     return -1
 380
 381
 382 def find_end_of_inset(lines, i):
 383     " Find end of inset, where lines[i] is included."
 384     return find_end_of(lines, i, "\\begin_inset", "\\end_inset")
 385
 386
 387 def find_end_of_layout(lines, i):
 388     " Find end of layout, where lines[i] is included."
 389     return find_end_of(lines, i, "\\begin_layout", "\\end_layout")
 390
 391
 392 def is_in_inset(lines, i, inset):
 393     '''
 394     Checks if line i is in an inset of the given type.
 395     If so, returns starting and ending lines.
 396     Otherwise, returns False.
 397     Example:
 398       is_in_inset(document.body, i, "\\begin_inset Tabular")
 399     returns False unless i is within a table. If it is, then
 400     it returns the line on which the table begins and the one
 401     on which it ends. Note that this pair will evaulate to
 402     boolean True, so
 403       if is_in_inset(...):
 404     will do what you expect.
 405     '''
 406     defval = (-1, -1)
 407     stins = find_token_backwards(lines, inset, i)
 408     if stins == -1:
 409       return defval
 410     endins = find_end_of_inset(lines, stins)
 411     # note that this includes the notfound case.
 412     if endins < i:
 413       return defval
 414     return (stins, endins)
 415
 416
 417 def get_containing_inset(lines, i):
 418   '''
 419   Finds out what kind of inset line i is within. Returns a
 420   list containing (i) what follows \begin_inset on the the line
 421   on which the inset begins, plus the starting and ending line.
 422   Returns False on any kind of error or if it isn't in an inset.
 423   '''
 424   j = i
 425   while True:
 426       stins = find_token_backwards(lines, "\\begin_inset", j)
 427       if stins == -1:
 428           return False
 429       endins = find_end_of_inset(lines, stins)
 430       if endins > j:
 431           break
 432       j = stins - 1
 433
 434   inset = get_value(lines, "\\begin_inset", stins)
 435   if inset == "":
 436       # shouldn't happen
 437       return False
 438   return (inset, stins, endins)
 439
 440
 441 def get_containing_layout(lines, i):
 442   '''
 443   Finds out what kind of layout line i is within. Returns a
 444   list containing (i) what follows \begin_layout on the the line
 445   on which the layout begins, plus the starting and ending line
 446   and the start of the apargraph (after all params).
 447   Returns False on any kind of error.
 448   '''
 449   j = i
 450   while True:
 451       stlay = find_token_backwards(lines, "\\begin_layout", j)
 452       if stlay == -1:
 453           return False
 454       endlay = find_end_of_layout(lines, stlay)
 455       if endlay > i:
 456           break
 457       j = stlay - 1
 458
 459   lay = get_value(lines, "\\begin_layout", stlay)
 460   if lay == "":
 461       # shouldn't happen
 462       return False
 463   par_params = ["\\noindent", "\\indent", "\\indent-toggle", "\\leftindent",
 464                 "\\start_of_appendix", "\\paragraph_spacing single",
 465                 "\\paragraph_spacing onehalf", "\\paragraph_spacing double",
 466                 "\\paragraph_spacing other", "\\align", "\\labelwidthstring"]
 467   stpar = stlay
 468   while True:
 469       stpar += 1
 470       if lines[stpar] not in par_params:
 471           break
 472   return (lay, stlay, endlay, stpar)
 473
 474
 475 def count_pars_in_inset(lines, i):
 476   '''
 477   Counts the paragraphs within this inset
 478   '''
 479   ins = get_containing_inset(lines, i)
 480   if ins == -1:
 481       return -1
 482   pars = 0
 483   for j in range(ins[1], ins[2]):
 484       m = re.match(r'\\begin_layout (.*)', lines[j])
 485       if m and get_containing_inset(lines, j)[0] == ins[0]:
 486           pars += 1
 487
 488   return pars