1 # This file is part of lyx2lyx
2 # -*- coding: utf-8 -*-
3 # Copyright (C) 2002-2011 Dekel Tsur <dekel@lyx.org>,
4 # José Matos <jamatos@lyx.org>, Richard Heck <rgheck@comcast.net>
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; either version 2
9 # of the License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 This module offers several free functions to help parse lines.
23 More documentation is below, but here is a quick guide to what
24 they do. Optional arguments are marked by brackets.
26 find_token(lines, token[, start[, end[, ignorews]]]):
27 Returns the first line i, start <= i < end, on which
28 token is found at the beginning. Returns -1 if not
30 If ignorews is (given and) True, then differences
31 in whitespace do not count, except that there must be no
32 extra whitespace following token itself.
34 find_token_exact(lines, token[, start[, end]]):
35 As find_token, but with ignorews set to True.
37 find_tokens(lines, tokens[, start[, end[, ignorews]]]):
38 Returns the first line i, start <= i < end, on which
39 one of the tokens in tokens is found at the beginning.
40 Returns -1 if not found.
41 If ignorews is (given and) True, then differences
42 in whitespace do not count, except that there must be no
43 extra whitespace following token itself.
45 find_tokens_exact(lines, token[, start[, end]]):
46 As find_tokens, but with ignorews True.
48 find_token_backwards(lines, token, start):
49 find_tokens_backwards(lines, tokens, start):
50 As before, but look backwards.
52 find_substring(lines, sub[, start[, end]]) -> int
53 As find_token, but sub may be anywhere in the line.
55 find_re(lines, rexp, start[, end]):
56 As find_token, but rexp is a regular expression object,
57 so it has to be passed as e.g.: re.compile(r'...').
59 get_value(lines, token[, start[, end[, default[, delete]]]]):
60 Similar to find_token, but it returns what follows the
61 token on the found line. Example:
62 get_value(document.header, "\\use_xetex", 0)
63 will find a line like:
65 and, in that case, return "true". (Note that whitespace
66 is stripped.) The final argument, default, defaults to "",
67 and is what is returned if we do not find anything. So you
68 can use that to set a default.
70 get_quoted_value(lines, token[, start[, end[, default[, delete]]]]):
71 Similar to get_value, but it will strip quotes off the
72 value, if they are present. So use this one for cases
73 where the value is normally quoted.
75 get_option_value(line, option):
76 This assumes we have a line with something like:
78 and returns value. Returns "" if not found.
80 get_bool_value(lines, token[, start[, end[, default[, delete]]]]):
81 Like get_value, but returns a boolean.
83 set_bool_value(lines, token, value[, start[, end]]):
84 Find `token` in `lines[start:end]` and set to boolean value bool(`value`).
85 Return old value. Raise ValueError if token is not in lines.
87 del_token(lines, token[, start[, end]]):
88 Like find_token, but deletes the line if it finds one.
89 Returns True if a line got deleted, otherwise False.
91 Use get_* with the optional argument "delete=True", if you want to
92 get and delete a token.
94 find_beginning_of(lines, i, start_token, end_token):
95 Here, start_token and end_token are meant to be a matching
96 pair, like "\\begin_layout" and "\\end_layout". We look for
97 the start_token that pairs with the end_token that occurs
98 on or after line i. Returns -1 if not found.
99 So, in the layout case, this would find the \\begin_layout
100 for the layout line i is in.
102 ec = find_token(document.body, "</cell", i)
103 bc = find_beginning_of(document.body, ec, \
105 Now, assuming no -1s, bc-ec wraps the cell for line i.
107 find_end_of(lines, i, start_token, end_token):
108 Like find_beginning_of, but looking for the matching
109 end_token. This might look like:
110 bc = find_token(document.body, "<cell", i)
111 ec = find_end_of(document.body, bc, "<cell", "</cell")
112 Now, assuming no -1s, bc-ec wrap the next cell.
114 find_end_of_inset(lines, i):
115 Specialization of find_end_of for insets.
117 find_end_of_layout(lines, i):
118 Specialization of find_end_of for layouts.
120 find_end_of_sequence(lines, i):
121 Find the end of the sequence of layouts of the same kind.
122 Considers nesting. If the last paragraph in sequence is nested,
123 the position of the last \end_deeper is returned, else
124 the position of the last \end_layout.
126 is_in_inset(lines, i, inset, default=(-1,-1)):
127 Check if line i is in an inset of the given type.
128 If so, returns starting and ending lines. Otherwise,
131 is_in_inset(document.body, i, "\\begin_inset Tabular")
132 returns (-1,-1) unless i is within a table. If it is, then
133 it returns the line on which the table begins and the one
134 on which it ends. Note that this pair will evaluate to
136 if is_in_inset(..., default=False):
137 will do what you expect.
139 get_containing_inset(lines, i):
140 Finds out what kind of inset line i is within. Returns a
141 list containing what follows \begin_inset on the line
142 on which the inset begins, plus the starting and ending line.
143 Returns False on any kind of error or if it isn't in an inset.
144 So get_containing_inset(document.body, i) might return:
145 ("CommandInset ref", 300, 306)
146 if i is within an InsetRef beginning on line 300 and ending
149 get_containing_layout(lines, i):
150 As get_containing_inset, but for layout. Additionally returns the
151 position of real paragraph start (after par params) as 4th value.
153 find_nonempty_line(lines, start[, end]):
154 Finds the next non-empty line.
156 check_token(line, token):
157 Does line begin with token?
159 is_nonempty_line(line):
160 Does line contain something besides whitespace?
162 count_pars_in_inset(lines, i):
163 Counts the paragraphs inside an inset.
169 # Utilities for one line
def check_token(line, token):
    """check_token(line, token) -> bool

    Report whether `line` begins with `token`.

    Deprecated. Use line.startswith(token).
    """
    begins_with_token = line.startswith(token)
    return begins_with_token
def is_nonempty_line(line):
    """is_nonempty_line(line) -> bool

    Return True if `line` holds anything besides whitespace,
    False if it is empty or consists of whitespace only.
    """
    if line.strip():
        return True
    return False
189 # Utilities for a list of lines
190 def find_token(lines, token, start=0, end=0, ignorews=False):
191 """ find_token(lines, token, start[[, end], ignorews]) -> int
193 Return the lowest line where token is found, and is the first
194 element, in lines[start, end].
196 If ignorews is True (default is False), then differences in
197 whitespace are ignored, but there must be whitespace following
200 Use find_substring(lines, sub) to find a substring anywhere in `lines`.
202 Return -1 on failure."""
204 if end == 0 or end > len(lines):
208 for i in range(start, end):
216 if lines[i].startswith(token):
def find_token_exact(lines, token, start=0, end=0):
    """find_token_exact(lines, token[, start[, end]]) -> int

    Shorthand for find_token() with `ignorews` switched on; the result
    is whatever find_token() returns for these arguments.
    """
    return find_token(lines, token, start, end, ignorews=True)
225 def find_tokens(lines, tokens, start=0, end=0, ignorews=False):
226 """ find_tokens(lines, tokens, start[[, end], ignorews]) -> int
228 Return the lowest line where one token in tokens is found, and is
229 the first element, in lines[start, end].
231 Return -1 on failure."""
232 if end == 0 or end > len(lines):
235 for i in range(start, end):
245 if lines[i].startswith(token):
def find_tokens_exact(lines, tokens, start=0, end=0):
    """find_tokens_exact(lines, tokens[, start[, end]]) -> int

    Shorthand for find_tokens() with `ignorews` switched on; the result
    is whatever find_tokens() returns for these arguments.
    """
    return find_tokens(lines, tokens, start, end, ignorews=True)
254 def find_substring(lines, sub, start=0, end=0):
255 """ find_substring(lines, sub[, start[, end]]) -> int
257 Return the lowest line number `i` in [start, end] where
258 `sub` is a substring of line[i].
260 Return -1 on failure."""
262 if end == 0 or end > len(lines):
264 for i in range(start, end):
270 def find_re(lines, rexp, start=0, end=0):
271 """ find_re(lines, rexp[, start[, end]]) -> int
273 Return the lowest line number `i` in [start, end] where the regular
274 expression object `rexp` matches at the beginning of line[i].
275 Return -1 on failure.
277 Start your pattern with the wildcard ".*" to find a match anywhere in a
278 line. Use find_substring() to find a substring anywhere in the lines.
280 if end == 0 or end > len(lines):
282 for i in range(start, end):
283 if rexp.match(lines[i]):
288 def find_token_backwards(lines, token, start):
289 """ find_token_backwards(lines, token, start) -> int
291 Return the highest line where token is found, and is the first
292 element, in lines[0, start].
294 Return -1 on failure."""
295 for i in range(start, -1, -1):
296 if lines[i].startswith(token):
301 def find_tokens_backwards(lines, tokens, start):
302 """ find_tokens_backwards(lines, token, start) -> int
304 Return the highest line where token is found, and is the first
305 element, in lines[0, start].
307 Return -1 on failure."""
308 for i in range(start, -1, -1):
311 if line.startswith(token):
316 def find_complete_lines(lines, sublines, start=0, end=0):
317 """Find first occurrence of sequence `sublines` in list `lines`.
318 Return index of first line or -1 on failure.
320 Efficient search for a sub-list in a large list. Works for any values.
322 >>> find_complete_lines([1, 2, 3, 1, 1, 2], [1, 2])
325 The `start` and `end` arguments work similar to list.index()
327 >>> find_complete_lines([1, 2, 3, 1, 1 ,2], [1, 2], start=1)
329 >>> find_complete_lines([1, 2, 3, 1, 1 ,2], [1, 2], start=1, end=4)
332 The return value can be used to substitute the sub-list.
333 Take care to check before use:
336 >>> s = find_complete_lines(l, [1, 2])
338 ... l[s:s+2] = [3]; l
341 See also del_complete_lines().
345 end = end or len(lines)
349 for j, value in enumerate(sublines):
350 i = lines.index(value, start, end)
357 except ValueError: # `sublines` not found
361 def find_across_lines(lines, sub, start=0, end=0):
362 sublines = sub.splitlines()
363 if len(sublines) > 2:
364 # at least 3 lines: the middle one(s) are complete -> use index search
365 i = find_complete_lines(lines, sublines[1:-1], start+1, end-1)
369 if (lines[i-1].endswith(sublines[0]) and
370 lines[i+len(sublines)].startswith(sublines[-1])):
374 elif len(sublines) > 1:
375 # last subline must start a line
376 i = find_token(lines, sublines[-1], start, end)
379 if lines[i-1].endswith(sublines[0]):
381 else: # no line-break, may be in the middle of a line
382 if end == 0 or end > len(lines):
384 for i in range(start, end):
390 def get_value(lines, token, start=0, end=0, default="", delete=False):
391 """Find `token` in `lines` and return part of line that follows it.
393 Find the next line that looks like:
394 token followed by other stuff
396 If `delete` is True, delete the line (if found).
398 Return "followed by other stuff" with leading and trailing
401 i = find_token_exact(lines, token, start, end)
404 # TODO: establish desired behaviour, eventually change to
405 # return lines.pop(i)[len(token):].strip() # or default
406 # see test_parser_tools.py
407 l = lines[i].split(None, 1)
415 def get_quoted_value(lines, token, start=0, end=0, default="", delete=False):
416 """ get_quoted_value(lines, token, start[[, end], default]) -> string
418 Find the next line that looks like:
419 token "followed by other stuff"
420 Returns "followed by other stuff" with leading and trailing
421 whitespace and quotes removed. If there are no quotes, that is OK too.
422 So use get_value to preserve possible quotes, this one to remove them,
424 Note that we will NOT strip quotes from default!
426 val = get_value(lines, token, start, end, "", delete)
429 return val.strip('"')
# Textual spellings that are read as booleans, mapped to their Python value.
bool_values = {
    "true": True,
    "1": True,
    "false": False,
    "0": False,
}
def get_bool_value(lines, token, start=0, end=0, default=None, delete=False):
    """get_bool_value(lines, token[, start[, end[, default[, delete]]]]) -> bool

    Look up the value following `token` via get_quoted_value() (all
    arguments are passed through unchanged) and translate it with the
    `bool_values` table:

    Return True if the value is 1 or "true", False if it is 0 or
    "false", else `default`.
    """
    raw = get_quoted_value(lines, token, start, end, default, delete)
    if raw in bool_values:
        return bool_values[raw]
    return default
448 def set_bool_value(lines, token, value, start=0, end=0):
449 """Find `token` in `lines` and set to bool(`value`).
451 Return previous value. Raise `ValueError` if `token` is not in lines.
453 Cf. find_token(), get_bool_value().
455 i = find_token(lines, token, start, end)
458 oldvalue = get_bool_value(lines, token, i, i+1)
459 if oldvalue is value:
462 if get_quoted_value(lines, token, i, i+1) in ('0', '1'):
463 lines[i] = "%s %d" % (token, value)
465 lines[i] = "%s %s" % (token, str(value).lower())
470 def get_option_value(line, option):
471 rx = option + '\s*=\s*"([^"]+)"'
479 def set_option_value(line, option, value):
480 rx = '(' + option + '\s*=\s*")[^"]+"'
485 return re.sub(rx, '\g<1>' + value + '"', line)
488 def del_token(lines, token, start=0, end=0):
489 """ del_token(lines, token, start, end) -> bool
491 Find the first line in lines where token is the first element
492 and delete that line. Returns True if we deleted a line, False
495 k = find_token_exact(lines, token, start, end)
501 def del_complete_lines(lines, sublines, start=0, end=0):
502 """Delete first occurrence of `sublines` in list `lines`.
504 Efficient deletion of a sub-list in a list. Works for any values.
505 The `start` and `end` arguments work similar to list.index()
507 Returns True if a deletion was done and False if not.
509 >>> l = [1, 0, 1, 1, 1, 2]
510 >>> del_complete_lines(l, [0, 1, 1])
515 i = find_complete_lines(lines, sublines, start, end)
518 del(lines[i:i+len(sublines)])
522 def del_value(lines, token, start=0, end=0, default=None):
524 Find the next line that looks like:
525 token followed by other stuff
526 Delete that line and return "followed by other stuff"
527 with leading and trailing whitespace removed.
529 If token is not found, return `default`.
531 i = find_token_exact(lines, token, start, end)
534 return lines.pop(i)[len(token):].strip()
537 def find_beginning_of(lines, i, start_token, end_token):
540 i = find_tokens_backwards(lines, [start_token, end_token], i-1)
543 if lines[i].startswith(end_token):
552 def find_end_of(lines, i, start_token, end_token):
556 i = find_tokens(lines, [end_token, start_token], i+1)
559 if lines[i].startswith(start_token):
568 def find_nonempty_line(lines, start=0, end=0):
571 for i in range(start, end):
def find_end_of_inset(lines, i):
    """Find end of inset, where lines[i] is included.

    Delegates to find_end_of() with the "\\begin_inset" / "\\end_inset"
    token pair and returns its result.
    """
    opening, closing = "\\begin_inset", "\\end_inset"
    return find_end_of(lines, i, opening, closing)
def find_end_of_layout(lines, i):
    """Find end of layout, where lines[i] is included.

    Delegates to find_end_of() with the "\\begin_layout" / "\\end_layout"
    token pair and returns its result.
    """
    opening, closing = "\\begin_layout", "\\end_layout"
    return find_end_of(lines, i, opening, closing)
587 def is_in_inset(lines, i, inset, default=(-1,-1)):
589 Check if line i is in an inset of the given type.
590 If so, return starting and ending lines, otherwise `default`.
592 is_in_inset(document.body, i, "\\begin_inset Tabular")
593 returns (-1,-1) if `i` is not within a "Tabular" inset (i.e. a table).
594 If it is, then it returns the line on which the table begins and the one
596 Note that this pair will evaluate to boolean True, so (with the optional
597 default value set to False)
598 if is_in_inset(..., default=False):
599 will do what you expect.
601 start = find_token_backwards(lines, inset, i)
604 end = find_end_of_inset(lines, start)
605 if end < i: # this includes the notfound case.
610 def get_containing_inset(lines, i):
612 Finds out what kind of inset line i is within. Returns a
613 list containing (i) what follows \begin_inset on the line
614 on which the inset begins, plus the starting and ending line.
615 Returns False on any kind of error or if it isn't in an inset.
619 stins = find_token_backwards(lines, "\\begin_inset", j)
622 endins = find_end_of_inset(lines, stins)
630 inset = get_value(lines, "\\begin_inset", stins)
634 return (inset, stins, endins)
637 def get_containing_layout(lines, i):
639 Find out what kind of layout line `i` is within.
641 (layoutname, layoutstart, layoutend, startofcontent)
645 * end line number, and
646 * number of first paragraph line (after all params).
647 Return `False` on any kind of error.
651 stlay = find_token_backwards(lines, "\\begin_layout", j)
654 endlay = find_end_of_layout(lines, stlay)
662 layoutname = get_value(lines, "\\begin_layout", stlay)
663 if layoutname == "": # layout style missing
664 # TODO: What shall we do in this case?
666 # layoutname == "Standard" # use same fallback as the LyX parser:
667 # raise ValueError("Missing layout name on line %d"%stlay) # diagnosis
668 # return False # generic error response
669 par_params = ["\\noindent", "\\indent", "\\indent-toggle", "\\leftindent",
670 "\\start_of_appendix", "\\paragraph_spacing", "\\align",
671 "\\labelwidthstring"]
675 if lines[stpar].split(' ', 1)[0] not in par_params:
677 return (layoutname, stlay, endlay, stpar)
680 def count_pars_in_inset(lines, i):
682 Counts the paragraphs within this inset
684 ins = get_containing_inset(lines, i)
688 for j in range(ins[1], ins[2]):
689 m = re.match(r'\\begin_layout (.*)', lines[j])
690 if m and get_containing_inset(lines, j)[0] == ins[0]:
696 def find_end_of_sequence(lines, i):
698 Returns the end of a sequence of identical layouts.
700 lay = get_containing_layout(lines, i)
707 m = re.match(r'\\begin_layout (.*)', lines[i])
708 if m and m.group(1) != layout:
710 elif lines[i] == "\\begin_deeper":
711 j = find_end_of(lines, i, "\\begin_deeper", "\\end_deeper")
716 if m and m.group(1) == layout:
717 endlay = find_end_of_layout(lines, i)
720 if i == len(lines) - 1: