lib/lyx2lyx/parser_tools.py

   1 # This file is part of lyx2lyx
   2 # -*- coding: utf-8 -*-
   3 # Copyright (C) 2002-2010 Dekel Tsur <dekel@lyx.org>,
   4 # José Matos <jamatos@lyx.org>, Richard Heck <rgheck@comcast.net>
   5 #
   6 # This program is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU General Public License
   8 # as published by the Free Software Foundation; either version 2
   9 # of the License, or (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program; if not, write to the Free Software
  18 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  19
  20
  21 '''
  22 This module offers several free functions to help parse lines.
  23 More documentation is below, but here is a quick guide to what
  24 they do. Optional arguments are marked by brackets.
  25
  26 find_token(lines, token, start[, end[, exact]]):
  27   Returns the first line i, start <= i < end, on which
  28   token is found at the beginning. Returns -1 if not
  29   found. If exact is (given and) True, then differences
  30   in whitespace do not count.
  31
  32 find_token_exact(lines, token, start[, end]):
  33   Badly named. As find_token, but with exact set to True.
  34
  35 find_tokens(lines, tokens, start[, end[, exact]]):
  36   Returns the first line i, start <= i < end, on which
  37   one of the tokens in tokens is found at the beginning.
  38   Returns -1 if not found. If exact is (given and) True,
  39   then differences in whitespace do not count.
  40
  41 find_tokens_exact(lines, tokens, start[, end]):
  42   Badly named. As find_tokens, but with exact set to True.
  43
  44 find_token_backwards(lines, token, start):
  45 find_tokens_backwards(lines, tokens, start):
  46   As before, but look backwards.
  47
  48 find_re(lines, rexp, start[, end]):
  49   As find_token, but rexp is a regular expression object,
  50   so it has to be passed as e.g.: re.compile(r'...').
  51
  52 get_value(lines, token, start[, end[, default]]):
  53   Similar to find_token, but it returns what follows the
  54   token on the found line. Example:
  55     get_value(document.header, "\\use_xetex", 0)
  56   will find a line like:
  57     \\use_xetex true
  58   and, in that case, return "true". (Note that whitespace
  59   is stripped.) The final argument, default, defaults to "",
  60   and is what is returned if we do not find anything. So you
  61   can use that to set a default.
  62
  63 get_quoted_value(lines, token, start[, end[, default]]):
  64   Similar to get_value, but it will strip quotes off the
  65   value, if they are present. So use this one for cases
  66   where the value is normally quoted.
  67
  68 del_token(lines, token, start[, end]):
  69   Like find_token, but deletes the line if it finds one.
  70   Returns True if a line got deleted, otherwise False.
  71
  72 find_beginning_of(lines, i, start_token, end_token):
  73   Here, start_token and end_token are meant to be a matching
  74   pair, like "\\begin_layout" and "\\end_layout". We look for
  75   the start_token that pairs with the end_token that occurs
  76   on or after line i. Returns -1 if not found.
  77   So, in the layout case, this would find the \\begin_layout
  78   for the layout line i is in.
  79   Example:
  80     ec = find_token(document.body, "</cell", i)
  81     bc = find_beginning_of(document.body, ec,
  82         "<cell", "</cell")
  83   Now, assuming no -1s, bc-ec wraps the cell for line i.
  84
  85 find_end_of(lines, i, start_token, end_token):
  86   Like find_beginning_of, but looking for the matching
  87   end_token. This might look like:
  88     bc = find_token(document.body, "<cell", i)
  89     ec = find_end_of(document.body, bc,  "<cell", "</cell")
  90   Now, assuming no -1s, bc-ec wrap the next cell.
  91
  92 find_end_of_inset(lines, i):
  93   Specialization of find_end_of for insets.
  94
  95 find_end_of_layout(lines, i):
  96   Specialization of find_end_of for layouts.
  97
  98 is_in_inset(lines, i, inset):
  99   Checks if line i is in an inset of the given type.
 100   If so, returns starting and ending lines. Otherwise,
 101   returns (-1, -1).
 102   Example:
 103     is_in_inset(document.body, i, "\\begin_inset Tabular")
 104   returns (-1, -1) unless i is within a table. If it is, then
 105   it returns the line on which the table begins and the one
 106   on which it ends. Note that both kinds of result evaluate
 107   to boolean True, so do not write
 108     if is_in_inset(...):
 109   but compare the first element of the result with -1.
 110
 111 get_containing_inset(lines, i):
 112   Finds out what kind of inset line i is within. Returns a
 113   tuple containing what follows \\begin_inset on the line
 114   on which the inset begins, plus the starting and ending line.
 115   Returns False on any kind of error or if it isn't in an inset.
 116   So get_containing_inset(document.body, i) might return:
 117     ("CommandInset ref", 300, 306)
 118   if i is within an InsetRef beginning on line 300 and ending
 119   on line 306.
 120
 121 get_containing_layout(lines, i):
 122   As get_containing_inset, but for layout.
 123
 124
 125 find_nonempty_line(lines, start[, end]):
 126   Finds the next non-empty line.
 127
 128 check_token(line, token):
 129   Does line begin with token?
 130
 131 is_nonempty_line(line):
 132   Does line contain something besides whitespace?
 133
 134 '''
 135
 136 # Utilities for one line
 137 def check_token(line, token):
 138     """ check_token(line, token) -> bool
 139
 140     Return True if token is present in line and is the first element
 141     else returns False."""
 142
 143     return line[:len(token)] == token
 144
 145
 146 def is_nonempty_line(line):
 147     """ is_nonempty_line(line) -> bool
 148
 149     Return False if line is either empty or it has only whitespaces,
 150     else return True."""
 151     return line != " "*len(line)
 152
 153
 154 # Utilities for a list of lines
 155 def find_token(lines, token, start, end = 0, exact = False):
 156     """ find_token(lines, token, start[[, end], exact]) -> int
 157
 158     Return the lowest line where token is found, and is the first
 159     element, in lines[start, end].
 160
 161     If exact is True (default is False), then differences in
 162     whitespace are ignored.
 163
 164     Return -1 on failure."""
 165
 166     if end == 0 or end > len(lines):
 167         end = len(lines)
 168     m = len(token)
 169     for i in xrange(start, end):
 170         if exact:
 171             x = lines[i].split()
 172             y = token.split()
 173             if len(x) < len(y):
 174                 continue
 175             if x[:len(y)] == y:
 176                 return i
 177         else:
 178             if lines[i][:m] == token:
 179                 return i
 180     return -1
 181
 182
 183 def find_token_exact(lines, token, start, end = 0):
 184     return find_token(lines, token, start, end, True)
 185
 186
 187 def find_tokens(lines, tokens, start, end = 0, exact = False):
 188     """ find_tokens(lines, tokens, start[[, end], exact]) -> int
 189
 190     Return the lowest line where one token in tokens is found, and is
 191     the first element, in lines[start, end].
 192
 193     Return -1 on failure."""
 194     if end == 0 or end > len(lines):
 195         end = len(lines)
 196
 197     for i in xrange(start, end):
 198         for token in tokens:
 199             if exact:
 200                 x = lines[i].split()
 201                 y = token.split()
 202                 if len(x) < len(y):
 203                     continue
 204                 if x[:len(y)] == y:
 205                     return i
 206             else:
 207                 if lines[i][:len(token)] == token:
 208                     return i
 209     return -1
 210
 211
 212 def find_tokens_exact(lines, tokens, start, end = 0):
 213     return find_tokens(lines, tokens, start, end, True)
 214
 215
 216 def find_re(lines, rexp, start, end = 0):
 217     """ find_token_re(lines, rexp, start[, end]) -> int
 218
 219     Return the lowest line where rexp, a regular expression, is found
 220     in lines[start, end].
 221
 222     Return -1 on failure."""
 223
 224     if end == 0 or end > len(lines):
 225         end = len(lines)
 226     for i in xrange(start, end):
 227         if rexp.match(lines[i]):
 228                 return i
 229     return -1
 230
 231
 232 def find_token_backwards(lines, token, start):
 233     """ find_token_backwards(lines, token, start) -> int
 234
 235     Return the highest line where token is found, and is the first
 236     element, in lines[start, end].
 237
 238     Return -1 on failure."""
 239     m = len(token)
 240     for i in xrange(start, -1, -1):
 241         line = lines[i]
 242         if line[:m] == token:
 243             return i
 244     return -1
 245
 246
 247 def find_tokens_backwards(lines, tokens, start):
 248     """ find_tokens_backwards(lines, token, start) -> int
 249
 250     Return the highest line where token is found, and is the first
 251     element, in lines[end, start].
 252
 253     Return -1 on failure."""
 254     for i in xrange(start, -1, -1):
 255         line = lines[i]
 256         for token in tokens:
 257             if line[:len(token)] == token:
 258                 return i
 259     return -1
 260
 261
 262 def get_value(lines, token, start, end = 0, default = ""):
 263     """ get_value(lines, token, start[[, end], default]) -> string
 264
 265     Find the next line that looks like:
 266       token followed by other stuff
 267     Returns "followed by other stuff" with leading and trailing
 268     whitespace removed.
 269     """
 270
 271     i = find_token_exact(lines, token, start, end)
 272     if i == -1:
 273         return default
 274     l = lines[i].split(None, 1)
 275     if len(l) > 1:
 276         return l[1].strip()
 277     return default
 278
 279
 280 def get_quoted_value(lines, token, start, end = 0, default = ""):
 281     """ get_quoted_value(lines, token, start[[, end], default]) -> string
 282
 283     Find the next line that looks like:
 284       token "followed by other stuff"
 285     Returns "followed by other stuff" with leading and trailing
 286     whitespace and quotes removed. If there are no quotes, that is OK too.
 287     So use get_value to preserve possible quotes, this one to remove them,
 288     if they are there.
 289     Note that we will NOT strip quotes from default!
 290     """
 291     val = get_value(lines, token, start, end, "")
 292     if not val:
 293       return default
 294     return val.strip('"')
 295
 296
 297 def del_token(lines, token, start, end = 0):
 298     """ del_token(lines, token, start, end) -> int
 299
 300     Find the first line in lines where token is the first element
 301     and delete that line. Returns True if we deleted a line, False
 302     if we did not."""
 303
 304     k = find_token_exact(lines, token, start, end)
 305     if k == -1:
 306         return False
 307     del lines[k]
 308     return True
 309
 310
 311 def find_beginning_of(lines, i, start_token, end_token):
 312     count = 1
 313     while i > 0:
 314         i = find_tokens_backwards(lines, [start_token, end_token], i-1)
 315         if i == -1:
 316             return -1
 317         if check_token(lines[i], end_token):
 318             count = count+1
 319         else:
 320             count = count-1
 321         if count == 0:
 322             return i
 323     return -1
 324
 325
 326 def find_end_of(lines, i, start_token, end_token):
 327     count = 1
 328     n = len(lines)
 329     while i < n:
 330         i = find_tokens(lines, [end_token, start_token], i+1)
 331         if i == -1:
 332             return -1
 333         if check_token(lines[i], start_token):
 334             count = count+1
 335         else:
 336             count = count-1
 337         if count == 0:
 338             return i
 339     return -1
 340
 341
 342 def find_nonempty_line(lines, start, end = 0):
 343     if end == 0:
 344         end = len(lines)
 345     for i in xrange(start, end):
 346         if is_nonempty_line(lines[i]):
 347             return i
 348     return -1
 349
 350
 351 def find_end_of_inset(lines, i):
 352     " Find end of inset, where lines[i] is included."
 353     return find_end_of(lines, i, "\\begin_inset", "\\end_inset")
 354
 355
 356 def find_end_of_layout(lines, i):
 357     " Find end of layout, where lines[i] is included."
 358     return find_end_of(lines, i, "\\begin_layout", "\\end_layout")
 359
 360
 361 def is_in_inset(lines, i, inset):
 362     '''
 363     Checks if line i is in an inset of the given type.
 364     If so, returns starting and ending lines.
 365     Otherwise, returns False.
 366     Example:
 367       is_in_inset(document.body, i, "\\begin_inset Tabular")
 368     returns False unless i is within a table. If it is, then
 369     it returns the line on which the table begins and the one
 370     on which it ends. Note that this pair will evaulate to
 371     boolean True, so
 372       if is_in_inset(...):
 373     will do what you expect.
 374     '''
 375     defval = (-1, -1)
 376     stins = find_token_backwards(lines, inset, i)
 377     if stins == -1:
 378       return defval
 379     endins = find_end_of_inset(lines, stins)
 380     # note that this includes the notfound case.
 381     if endins < i:
 382       return defval
 383     return (stins, endins)
 384
 385
 386 def get_containing_inset(lines, i):
 387   '''
 388   Finds out what kind of inset line i is within. Returns a
 389   list containing (i) what follows \begin_inset on the the line
 390   on which the inset begins, plus the starting and ending line.
 391   Returns False on any kind of error or if it isn't in an inset.
 392   '''
 393   stins = find_token_backwards(lines, i, "\\begin_inset")
 394   if stins == -1:
 395       return False
 396   endins = find_end_of_inset(lines, stins)
 397   if endins < i:
 398       return False
 399   inset = get_value(lines, "\\begin_inset", stins)
 400   if inset == "":
 401       # shouldn't happen
 402       return False
 403   return (inset, stins, endins)
 404
 405
 406 def get_containing_layout(lines, i):
 407   '''
 408   Finds out what kind of layout line i is within. Returns a
 409   list containing (i) what follows \begin_layout on the the line
 410   on which the layout begins, plus the starting and ending line.
 411   Returns False on any kind of error.
 412   '''
 413   stins = find_token_backwards(lines, i, "\\begin_layout")
 414   if stins == -1:
 415       return False
 416   endins = find_end_of_layout(lines, stins)
 417   if endins < i:
 418       return False
 419   lay = get_value(lines, "\\begin_layout", stins)
 420   if lay == "":
 421       # shouldn't happen
 422       return False
 423   return (lay, stins, endins)