lib/lyx2lyx/parser_tools.py

   1 # This file is part of lyx2lyx
   2 # -*- coding: utf-8 -*-
   3 # Copyright (C) 2002-2010 Dekel Tsur <dekel@lyx.org>,
   4 # José Matos <jamatos@lyx.org>, Richard Heck <rgheck@comcast.net>
   5 #
   6 # This program is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU General Public License
   8 # as published by the Free Software Foundation; either version 2
   9 # of the License, or (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program; if not, write to the Free Software
  18 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  19
  20
r'''
This module offers several free functions to help parse lines.
More documentation is below, but here is a quick guide to what
they do. Optional arguments are marked by brackets.
  25
  26 find_token(lines, token, start[, end[, ignorews]]):
  27   Returns the first line i, start <= i < end, on which
  28   token is found at the beginning. Returns -1 if not
  29   found.
  30   If ignorews is (given and) True, then differences
  31   in whitespace do not count, except that there must be no
  32   extra whitespace following token itself.
  33
  34 find_token_exact(lines, token, start[, end]):
  35   As find_token, but with ignorews True.
  36
  37 find_tokens(lines, tokens, start[, end[, ignorews]]):
  38   Returns the first line i, start <= i < end, on which
  one of the tokens in tokens is found at the beginning.
  40   Returns -1 if not found.
  41   If ignorews is (given and) True, then differences
  42   in whitespace do not count, except that there must be no
  43   extra whitespace following token itself.
  44
  45 find_tokens_exact(lines, token, start[, end]):
  46   As find_tokens, but with ignorews True.
  47
  48 find_token_backwards(lines, token, start):
  49 find_tokens_backwards(lines, tokens, start):
  50   As before, but look backwards.
  51
  52 find_re(lines, rexp, start[, end]):
  53   As find_token, but rexp is a regular expression object,
  54   so it has to be passed as e.g.: re.compile(r'...').
  55
get_value(lines, token, start[, end[, default]]):
  57   Similar to find_token, but it returns what follows the
  58   token on the found line. Example:
  59     get_value(document.header, "\use_xetex", 0)
  60   will find a line like:
  61     \use_xetex true
  62   and, in that case, return "true". (Note that whitespace
  63   is stripped.) The final argument, default, defaults to "",
  64   and is what is returned if we do not find anything. So you
  65   can use that to set a default.
  66
get_quoted_value(lines, token, start[, end[, default]]):
  68   Similar to get_value, but it will strip quotes off the
  69   value, if they are present. So use this one for cases
  70   where the value is normally quoted.
  71
  72 del_token(lines, token, start[, end]):
  73   Like find_token, but deletes the line if it finds one.
  74   Returns True if a line got deleted, otherwise False.
  75
  76 find_beginning_of(lines, i, start_token, end_token):
  77   Here, start_token and end_token are meant to be a matching
  78   pair, like "\begin_layout" and "\end_layout". We look for
  79   the start_token that pairs with the end_token that occurs
  80   on or after line i. Returns -1 if not found.
  81   So, in the layout case, this would find the \begin_layout
  82   for the layout line i is in.
  83   Example:
  84     ec = find_token(document.body, "</cell", i)
  85     bc = find_beginning_of(document.body, ec, \
  86         "<cell", "</cell")
  87   Now, assuming no -1s, bc-ec wraps the cell for line i.
  88
  89 find_end_of(lines, i, start_token, end_token):
  90   Like find_beginning_of, but looking for the matching
  91   end_token. This might look like:
    bc = find_token(document.body, "<cell", i)
  93     ec = find_end_of(document.body, bc,  "<cell", "</cell")
  94   Now, assuming no -1s, bc-ec wrap the next cell.
  95
  96 find_end_of_inset(lines, i):
  97   Specialization of find_end_of for insets.
  98
  99 find_end_of_layout(lines, i):
 100   Specialization of find_end_of for layouts.
 101
is_in_inset(lines, i, inset):
  Checks if line i is in an inset of the given type.
  If so, returns starting and ending lines. Otherwise,
  returns (-1, -1).
  Example:
    is_in_inset(document.body, i, "\\begin_inset Tabular")
  returns (-1, -1) unless i is within a table. If it is, then
  it returns the line on which the table begins and the one
  on which it ends. Note that both kinds of result are
  non-empty tuples and so evaluate to boolean True; compare
  the result explicitly instead:
    if is_in_inset(...) != (-1, -1):
  will do what you expect.
 114
 115 get_containing_inset(lines, i):
  Finds out what kind of inset line i is within. Returns a
  tuple containing what follows \begin_inset on the line
 118   on which the inset begins, plus the starting and ending line.
 119   Returns False on any kind of error or if it isn't in an inset.
 120   So get_containing_inset(document.body, i) might return:
 121     ("CommandInset ref", 300, 306)
 122   if i is within an InsetRef beginning on line 300 and ending
 123   on line 306.
 124
 125 get_containing_layout(lines, i):
 126   As get_containing_inset, but for layout.
 127
 128
find_nonempty_line(lines, start[, end]):
 130   Finds the next non-empty line.
 131
 132 check_token(line, token):
 133   Does line begin with token?
 134
 135 is_nonempty_line(line):
 136   Does line contain something besides whitespace?
 137
 138 '''
 139
 140 # Utilities for one line
 141 def check_token(line, token):
 142     """ check_token(line, token) -> bool
 143
 144     Return True if token is present in line and is the first element
 145     else returns False."""
 146
 147     return line[:len(token)] == token
 148
 149
 150 def is_nonempty_line(line):
 151     """ is_nonempty_line(line) -> bool
 152
 153     Return False if line is either empty or it has only whitespaces,
 154     else return True."""
 155     return line != " "*len(line)
 156
 157
 158 # Utilities for a list of lines
 159 def find_token(lines, token, start, end = 0, ignorews = False):
 160     """ find_token(lines, token, start[[, end], ignorews]) -> int
 161
 162     Return the lowest line where token is found, and is the first
 163     element, in lines[start, end].
 164
 165     If ignorews is True (default is False), then differences in
 166     whitespace are ignored, except that there must be no extra
 167     whitespace following token itself.
 168
 169     Return -1 on failure."""
 170
 171     if end == 0 or end > len(lines):
 172         end = len(lines)
 173     m = len(token)
 174     for i in xrange(start, end):
 175         if ignorews:
 176             x = lines[i].split()
 177             y = token.split()
 178             if len(x) < len(y):
 179                 continue
 180             if x[:len(y)] == y:
 181                 return i
 182         else:
 183             if lines[i][:m] == token:
 184                 return i
 185     return -1
 186
 187
 188 def find_token_exact(lines, token, start, end = 0):
 189     return find_token(lines, token, start, end, True)
 190
 191
 192 def find_tokens(lines, tokens, start, end = 0, ignorews = False):
 193     """ find_tokens(lines, tokens, start[[, end], ignorews]) -> int
 194
 195     Return the lowest line where one token in tokens is found, and is
 196     the first element, in lines[start, end].
 197
 198     Return -1 on failure."""
 199     if end == 0 or end > len(lines):
 200         end = len(lines)
 201
 202     for i in xrange(start, end):
 203         for token in tokens:
 204             if ignorews:
 205                 x = lines[i].split()
 206                 y = token.split()
 207                 if len(x) < len(y):
 208                     continue
 209                 if x[:len(y)] == y:
 210                     return i
 211             else:
 212                 if lines[i][:len(token)] == token:
 213                     return i
 214     return -1
 215
 216
 217 def find_tokens_exact(lines, tokens, start, end = 0):
 218     return find_tokens(lines, tokens, start, end, True)
 219
 220
 221 def find_re(lines, rexp, start, end = 0):
 222     """ find_token_re(lines, rexp, start[, end]) -> int
 223
 224     Return the lowest line where rexp, a regular expression, is found
 225     in lines[start, end].
 226
 227     Return -1 on failure."""
 228
 229     if end == 0 or end > len(lines):
 230         end = len(lines)
 231     for i in xrange(start, end):
 232         if rexp.match(lines[i]):
 233                 return i
 234     return -1
 235
 236
 237 def find_token_backwards(lines, token, start):
 238     """ find_token_backwards(lines, token, start) -> int
 239
 240     Return the highest line where token is found, and is the first
 241     element, in lines[start, end].
 242
 243     Return -1 on failure."""
 244     m = len(token)
 245     for i in xrange(start, -1, -1):
 246         line = lines[i]
 247         if line[:m] == token:
 248             return i
 249     return -1
 250
 251
 252 def find_tokens_backwards(lines, tokens, start):
 253     """ find_tokens_backwards(lines, token, start) -> int
 254
 255     Return the highest line where token is found, and is the first
 256     element, in lines[end, start].
 257
 258     Return -1 on failure."""
 259     for i in xrange(start, -1, -1):
 260         line = lines[i]
 261         for token in tokens:
 262             if line[:len(token)] == token:
 263                 return i
 264     return -1
 265
 266
 267 def get_value(lines, token, start, end = 0, default = ""):
 268     """ get_value(lines, token, start[[, end], default]) -> string
 269
 270     Find the next line that looks like:
 271       token followed by other stuff
 272     Returns "followed by other stuff" with leading and trailing
 273     whitespace removed.
 274     """
 275
 276     i = find_token_exact(lines, token, start, end)
 277     if i == -1:
 278         return default
 279     l = lines[i].split(None, 1)
 280     if len(l) > 1:
 281         return l[1].strip()
 282     return default
 283
 284
 285 def get_quoted_value(lines, token, start, end = 0, default = ""):
 286     """ get_quoted_value(lines, token, start[[, end], default]) -> string
 287
 288     Find the next line that looks like:
 289       token "followed by other stuff"
 290     Returns "followed by other stuff" with leading and trailing
 291     whitespace and quotes removed. If there are no quotes, that is OK too.
 292     So use get_value to preserve possible quotes, this one to remove them,
 293     if they are there.
 294     Note that we will NOT strip quotes from default!
 295     """
 296     val = get_value(lines, token, start, end, "")
 297     if not val:
 298       return default
 299     return val.strip('"')
 300
 301
 302 def del_token(lines, token, start, end = 0):
 303     """ del_token(lines, token, start, end) -> int
 304
 305     Find the first line in lines where token is the first element
 306     and delete that line. Returns True if we deleted a line, False
 307     if we did not."""
 308
 309     k = find_token_exact(lines, token, start, end)
 310     if k == -1:
 311         return False
 312     del lines[k]
 313     return True
 314
 315
 316 def find_beginning_of(lines, i, start_token, end_token):
 317     count = 1
 318     while i > 0:
 319         i = find_tokens_backwards(lines, [start_token, end_token], i-1)
 320         if i == -1:
 321             return -1
 322         if check_token(lines[i], end_token):
 323             count = count+1
 324         else:
 325             count = count-1
 326         if count == 0:
 327             return i
 328     return -1
 329
 330
 331 def find_end_of(lines, i, start_token, end_token):
 332     count = 1
 333     n = len(lines)
 334     while i < n:
 335         i = find_tokens(lines, [end_token, start_token], i+1)
 336         if i == -1:
 337             return -1
 338         if check_token(lines[i], start_token):
 339             count = count+1
 340         else:
 341             count = count-1
 342         if count == 0:
 343             return i
 344     return -1
 345
 346
 347 def find_nonempty_line(lines, start, end = 0):
 348     if end == 0:
 349         end = len(lines)
 350     for i in xrange(start, end):
 351         if is_nonempty_line(lines[i]):
 352             return i
 353     return -1
 354
 355
 356 def find_end_of_inset(lines, i):
 357     " Find end of inset, where lines[i] is included."
 358     return find_end_of(lines, i, "\\begin_inset", "\\end_inset")
 359
 360
 361 def find_end_of_layout(lines, i):
 362     " Find end of layout, where lines[i] is included."
 363     return find_end_of(lines, i, "\\begin_layout", "\\end_layout")
 364
 365
 366 def is_in_inset(lines, i, inset):
 367     '''
 368     Checks if line i is in an inset of the given type.
 369     If so, returns starting and ending lines.
 370     Otherwise, returns False.
 371     Example:
 372       is_in_inset(document.body, i, "\\begin_inset Tabular")
 373     returns False unless i is within a table. If it is, then
 374     it returns the line on which the table begins and the one
 375     on which it ends. Note that this pair will evaulate to
 376     boolean True, so
 377       if is_in_inset(...):
 378     will do what you expect.
 379     '''
 380     defval = (-1, -1)
 381     stins = find_token_backwards(lines, inset, i)
 382     if stins == -1:
 383       return defval
 384     endins = find_end_of_inset(lines, stins)
 385     # note that this includes the notfound case.
 386     if endins < i:
 387       return defval
 388     return (stins, endins)
 389
 390
 391 def get_containing_inset(lines, i):
 392   '''
 393   Finds out what kind of inset line i is within. Returns a
 394   list containing (i) what follows \begin_inset on the the line
 395   on which the inset begins, plus the starting and ending line.
 396   Returns False on any kind of error or if it isn't in an inset.
 397   '''
 398   stins = find_token_backwards(lines, i, "\\begin_inset")
 399   if stins == -1:
 400       return False
 401   endins = find_end_of_inset(lines, stins)
 402   if endins < i:
 403       return False
 404   inset = get_value(lines, "\\begin_inset", stins)
 405   if inset == "":
 406       # shouldn't happen
 407       return False
 408   return (inset, stins, endins)
 409
 410
 411 def get_containing_layout(lines, i):
 412   '''
 413   Finds out what kind of layout line i is within. Returns a
 414   list containing (i) what follows \begin_layout on the the line
 415   on which the layout begins, plus the starting and ending line.
 416   Returns False on any kind of error.
 417   '''
 418   stins = find_token_backwards(lines, i, "\\begin_layout")
 419   if stins == -1:
 420       return False
 421   endins = find_end_of_layout(lines, stins)
 422   if endins < i:
 423       return False
 424   lay = get_value(lines, "\\begin_layout", stins)
 425   if lay == "":
 426       # shouldn't happen
 427       return False
 428   return (lay, stins, endins)