6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
13 * LOCATION: see http://www.boost.org for most recent version.
14 * FILE perl_matcher_common.cpp
15 * VERSION see <boost/version.hpp>
16 * DESCRIPTION: Definitions of perl_matcher member functions that are
17 * common to both the recursive and non-recursive versions.
20 #ifndef BOOST_REGEX_V4_PERL_MATCHER_COMMON_HPP
21 #define BOOST_REGEX_V4_PERL_MATCHER_COMMON_HPP
25 #pragma warning(disable: 4103)
27 #ifdef BOOST_HAS_ABI_HEADERS
28 # include BOOST_ABI_PREFIX
35 # pragma option push -w-8008 -w-8066
38 # pragma warning(push)
39 # pragma warning(disable: 4800)
45 template <class BidiIterator, class Allocator, class traits>
46 void perl_matcher<BidiIterator, Allocator, traits>::construct_init(const basic_regex<char_type, traits>& e, match_flag_type f)
48 typedef typename regex_iterator_traits<BidiIterator>::iterator_category category;
49 typedef typename basic_regex<char_type, traits>::flag_type expression_flag_type;
53 // precondition failure: e is not a valid regex.
54 std::invalid_argument ex("Invalid regular expression object");
55 boost::throw_exception(ex);
59 estimate_max_state_count(static_cast<category*>(0));
60 expression_flag_type re_f = re.flags();
61 icase = re_f & regex_constants::icase;
62 if(!(m_match_flags & (match_perl|match_posix)))
64 if((re_f & (regbase::main_option_type|regbase::no_perl_ex)) == 0)
65 m_match_flags |= match_perl;
66 else if((re_f & (regbase::main_option_type|regbase::emacs_ex)) == (regbase::basic_syntax_group|regbase::emacs_ex))
67 m_match_flags |= match_perl;
69 m_match_flags |= match_posix;
71 if(m_match_flags & match_posix)
73 m_temp_match.reset(new match_results<BidiIterator, Allocator>());
74 m_presult = m_temp_match.get();
77 m_presult = &m_result;
78 #ifdef BOOST_REGEX_NON_RECURSIVE
82 // find the value to use for matching word boundaries:
83 m_word_mask = re.get_data().m_word_mask;
84 // find bitmask to use for matching '.':
85 match_any_mask = static_cast<unsigned char>((f & match_not_dot_newline) ? re_detail::test_not_newline : re_detail::test_newline);
88 template <class BidiIterator, class Allocator, class traits>
89 void perl_matcher<BidiIterator, Allocator, traits>::estimate_max_state_count(std::random_access_iterator_tag*)
92 // How many states should we allow our machine to visit before giving up?
93 // This is a heuristic: it takes the greater of O(N^2) and O(NS^2)
94 // where N is the length of the string, and S is the number of states
95 // in the machine. It's tempting to up this to O(N^2S) or even O(N^2S^2)
96 // but these take unreasonably amounts of time to bale out in pathological
99 // Calculate NS^2 first:
101 static const boost::uintmax_t k = 100000;
102 boost::uintmax_t dist = boost::re_detail::distance(base, last);
105 boost::uintmax_t states = re.size();
109 if((std::numeric_limits<boost::uintmax_t>::max)() / dist < states)
111 max_state_count = (std::numeric_limits<boost::uintmax_t>::max)() - 2;
115 if((std::numeric_limits<boost::uintmax_t>::max)() - k < states)
117 max_state_count = (std::numeric_limits<boost::uintmax_t>::max)() - 2;
122 max_state_count = states;
125 // Now calculate N^2:
128 if((std::numeric_limits<boost::uintmax_t>::max)() / dist < states)
130 max_state_count = (std::numeric_limits<boost::uintmax_t>::max)() - 2;
134 if((std::numeric_limits<boost::uintmax_t>::max)() - k < states)
136 max_state_count = (std::numeric_limits<boost::uintmax_t>::max)() - 2;
141 // N^2 can be a very large number indeed, to prevent things getting out
142 // of control, cap the max states:
144 if(states > BOOST_REGEX_MAX_STATE_COUNT)
145 states = BOOST_REGEX_MAX_STATE_COUNT;
147 // If (the possibly capped) N^2 is larger than our first estimate,
150 if(states > max_state_count)
151 max_state_count = states;
154 template <class BidiIterator, class Allocator, class traits>
155 inline void perl_matcher<BidiIterator, Allocator, traits>::estimate_max_state_count(void*)
157 // we don't know how long the sequence is:
158 max_state_count = BOOST_REGEX_MAX_STATE_COUNT;
161 #ifdef BOOST_REGEX_HAS_MS_STACK_GUARD
162 template <class BidiIterator, class Allocator, class traits>
163 inline bool perl_matcher<BidiIterator, Allocator, traits>::protected_call(
164 protected_proc_type proc)
166 ::boost::re_detail::concrete_protected_call
167 <perl_matcher<BidiIterator, Allocator, traits> >
169 return obj.execute();
174 template <class BidiIterator, class Allocator, class traits>
175 inline bool perl_matcher<BidiIterator, Allocator, traits>::match()
177 #ifdef BOOST_REGEX_HAS_MS_STACK_GUARD
178 return protected_call(&perl_matcher<BidiIterator, Allocator, traits>::match_imp);
184 template <class BidiIterator, class Allocator, class traits>
185 bool perl_matcher<BidiIterator, Allocator, traits>::match_imp()
187 // initialise our stack if we are non-recursive:
188 #ifdef BOOST_REGEX_NON_RECURSIVE
189 save_state_init init(&m_stack_base, &m_backup_state);
190 used_block_count = BOOST_REGEX_MAX_BLOCKS;
191 #if !defined(BOOST_NO_EXCEPTIONS)
196 // reset our state machine:
200 m_match_flags |= regex_constants::match_all;
201 m_presult->set_size((m_match_flags & match_nosubs) ? 1 : re.mark_count(), search_base, last);
202 m_presult->set_base(base);
203 m_presult->set_named_subs(re_detail::convert_to_named_subs<typename match_results<BidiIterator>::char_type>(this->re.get_named_subs()));
204 if(m_match_flags & match_posix)
205 m_result = *m_presult;
206 verify_options(re.flags(), m_match_flags);
207 if(0 == match_prefix())
209 return (m_result[0].second == last) && (m_result[0].first == base);
211 #if defined(BOOST_REGEX_NON_RECURSIVE) && !defined(BOOST_NO_EXCEPTIONS)
215 // unwind all pushed states, apart from anything else this
216 // ensures that all the states are correctly destructed
217 // not just the memory freed.
218 while(unwind(true)){}
224 template <class BidiIterator, class Allocator, class traits>
225 inline bool perl_matcher<BidiIterator, Allocator, traits>::find()
227 #ifdef BOOST_REGEX_HAS_MS_STACK_GUARD
228 return protected_call(&perl_matcher<BidiIterator, Allocator, traits>::find_imp);
234 template <class BidiIterator, class Allocator, class traits>
235 bool perl_matcher<BidiIterator, Allocator, traits>::find_imp()
237 static matcher_proc_type const s_find_vtable[7] =
239 &perl_matcher<BidiIterator, Allocator, traits>::find_restart_any,
240 &perl_matcher<BidiIterator, Allocator, traits>::find_restart_word,
241 &perl_matcher<BidiIterator, Allocator, traits>::find_restart_line,
242 &perl_matcher<BidiIterator, Allocator, traits>::find_restart_buf,
243 &perl_matcher<BidiIterator, Allocator, traits>::match_prefix,
244 &perl_matcher<BidiIterator, Allocator, traits>::find_restart_lit,
245 &perl_matcher<BidiIterator, Allocator, traits>::find_restart_lit,
248 // initialise our stack if we are non-recursive:
249 #ifdef BOOST_REGEX_NON_RECURSIVE
250 save_state_init init(&m_stack_base, &m_backup_state);
251 used_block_count = BOOST_REGEX_MAX_BLOCKS;
252 #if !defined(BOOST_NO_EXCEPTIONS)
258 if((m_match_flags & regex_constants::match_init) == 0)
260 // reset our state machine:
261 search_base = position = base;
262 pstate = re.get_first_state();
263 m_presult->set_size((m_match_flags & match_nosubs) ? 1 : re.mark_count(), base, last);
264 m_presult->set_base(base);
265 m_presult->set_named_subs(re_detail::convert_to_named_subs<typename match_results<BidiIterator>::char_type>(this->re.get_named_subs()));
266 m_match_flags |= regex_constants::match_init;
271 search_base = position = m_result[0].second;
272 // If last match was null and match_not_null was not set then increment
273 // our start position, otherwise we go into an infinite loop:
274 if(((m_match_flags & match_not_null) == 0) && (m_result.length() == 0))
282 m_presult->set_size((m_match_flags & match_nosubs) ? 1 : re.mark_count(), search_base, last);
283 //if((base != search_base) && (base == backstop))
284 // m_match_flags |= match_prev_avail;
286 if(m_match_flags & match_posix)
288 m_result.set_size(re.mark_count(), base, last);
289 m_result.set_base(base);
292 verify_options(re.flags(), m_match_flags);
293 // find out what kind of expression we have:
294 unsigned type = (m_match_flags & match_continuous) ?
295 static_cast<unsigned int>(regbase::restart_continue)
296 : static_cast<unsigned int>(re.get_restart_type());
298 // call the appropriate search routine:
299 matcher_proc_type proc = s_find_vtable[type];
300 return (this->*proc)();
302 #if defined(BOOST_REGEX_NON_RECURSIVE) && !defined(BOOST_NO_EXCEPTIONS)
306 // unwind all pushed states, apart from anything else this
307 // ensures that all the states are correctly destructed
308 // not just the memory freed.
309 while(unwind(true)){}
315 template <class BidiIterator, class Allocator, class traits>
316 bool perl_matcher<BidiIterator, Allocator, traits>::match_prefix()
318 m_has_partial_match = false;
319 m_has_found_match = false;
320 pstate = re.get_first_state();
321 m_presult->set_first(position);
324 if(!m_has_found_match && m_has_partial_match && (m_match_flags & match_partial))
326 m_has_found_match = true;
327 m_presult->set_second(last, 0, false);
330 #ifdef BOOST_REGEX_MATCH_EXTRA
331 if(m_has_found_match && (match_extra & m_match_flags))
334 // we have a match, reverse the capture information:
336 for(unsigned i = 0; i < m_presult->size(); ++i)
338 typename sub_match<BidiIterator>::capture_sequence_type & seq = ((*m_presult)[i]).get_captures();
339 std::reverse(seq.begin(), seq.end());
343 if(!m_has_found_match)
344 position = restart; // reset search postion
345 return m_has_found_match;
348 template <class BidiIterator, class Allocator, class traits>
349 bool perl_matcher<BidiIterator, Allocator, traits>::match_literal()
351 unsigned int len = static_cast<const re_literal*>(pstate)->length;
352 const char_type* what = reinterpret_cast<const char_type*>(static_cast<const re_literal*>(pstate) + 1);
354 // compare string with what we stored in
356 for(unsigned int i = 0; i < len; ++i, ++position)
358 if((position == last) || (traits_inst.translate(*position, icase) != what[i]))
361 pstate = pstate->next.p;
365 template <class BidiIterator, class Allocator, class traits>
366 bool perl_matcher<BidiIterator, Allocator, traits>::match_start_line()
368 if(position == backstop)
370 if((m_match_flags & match_prev_avail) == 0)
372 if((m_match_flags & match_not_bol) == 0)
374 pstate = pstate->next.p;
380 else if(m_match_flags & match_single_line)
383 // check the previous value character:
384 BidiIterator t(position);
388 if(is_separator(*t) && !((*t == static_cast<char_type>('\r')) && (*position == static_cast<char_type>('\n'))) )
390 pstate = pstate->next.p;
394 else if(is_separator(*t))
396 pstate = pstate->next.p;
402 template <class BidiIterator, class Allocator, class traits>
403 bool perl_matcher<BidiIterator, Allocator, traits>::match_end_line()
407 if(m_match_flags & match_single_line)
409 // we're not yet at the end so *first is always valid:
410 if(is_separator(*position))
412 if((position != backstop) || (m_match_flags & match_prev_avail))
414 // check that we're not in the middle of \r\n sequence
415 BidiIterator t(position);
417 if((*t == static_cast<char_type>('\r')) && (*position == static_cast<char_type>('\n')))
422 pstate = pstate->next.p;
426 else if((m_match_flags & match_not_eol) == 0)
428 pstate = pstate->next.p;
434 template <class BidiIterator, class Allocator, class traits>
435 bool perl_matcher<BidiIterator, Allocator, traits>::match_wild()
439 if(is_separator(*position) && ((match_any_mask & static_cast<const re_dot*>(pstate)->mask) == 0))
441 if((*position == char_type(0)) && (m_match_flags & match_not_dot_null))
443 pstate = pstate->next.p;
448 template <class BidiIterator, class Allocator, class traits>
449 bool perl_matcher<BidiIterator, Allocator, traits>::match_word_boundary()
451 bool b; // indcates whether next character is a word character
454 // prev and this character must be opposites:
455 #if defined(BOOST_REGEX_USE_C_LOCALE) && defined(__GNUC__) && (__GNUC__ == 2) && (__GNUC_MINOR__ < 95)
456 b = traits::isctype(*position, m_word_mask);
458 b = traits_inst.isctype(*position, m_word_mask);
463 b = (m_match_flags & match_not_eow) ? true : false;
465 if((position == backstop) && ((m_match_flags & match_prev_avail) == 0))
467 if(m_match_flags & match_not_bow)
475 b ^= traits_inst.isctype(*position, m_word_mask);
480 pstate = pstate->next.p;
483 return false; // no match if we get to here...
486 template <class BidiIterator, class Allocator, class traits>
487 bool perl_matcher<BidiIterator, Allocator, traits>::match_within_word()
491 // both prev and this character must be m_word_mask:
492 bool prev = traits_inst.isctype(*position, m_word_mask);
495 if((position == backstop) && ((m_match_flags & match_prev_avail) == 0))
500 b = traits_inst.isctype(*position, m_word_mask);
505 pstate = pstate->next.p;
512 template <class BidiIterator, class Allocator, class traits>
513 bool perl_matcher<BidiIterator, Allocator, traits>::match_word_start()
516 return false; // can't be starting a word if we're already at the end of input
517 if(!traits_inst.isctype(*position, m_word_mask))
518 return false; // next character isn't a word character
519 if((position == backstop) && ((m_match_flags & match_prev_avail) == 0))
521 if(m_match_flags & match_not_bow)
522 return false; // no previous input
526 // otherwise inside buffer:
527 BidiIterator t(position);
529 if(traits_inst.isctype(*t, m_word_mask))
530 return false; // previous character not non-word
532 // OK we have a match:
533 pstate = pstate->next.p;
537 template <class BidiIterator, class Allocator, class traits>
538 bool perl_matcher<BidiIterator, Allocator, traits>::match_word_end()
540 if((position == backstop) && ((m_match_flags & match_prev_avail) == 0))
541 return false; // start of buffer can't be end of word
542 BidiIterator t(position);
544 if(traits_inst.isctype(*t, m_word_mask) == false)
545 return false; // previous character wasn't a word character
549 if(m_match_flags & match_not_eow)
550 return false; // end of buffer but not end of word
554 // otherwise inside buffer:
555 if(traits_inst.isctype(*position, m_word_mask))
556 return false; // next character is a word character
558 pstate = pstate->next.p;
559 return true; // if we fall through to here then we've succeeded
562 template <class BidiIterator, class Allocator, class traits>
563 bool perl_matcher<BidiIterator, Allocator, traits>::match_buffer_start()
565 if((position != backstop) || (m_match_flags & match_not_bob))
568 pstate = pstate->next.p;
572 template <class BidiIterator, class Allocator, class traits>
573 bool perl_matcher<BidiIterator, Allocator, traits>::match_buffer_end()
575 if((position != last) || (m_match_flags & match_not_eob))
578 pstate = pstate->next.p;
582 template <class BidiIterator, class Allocator, class traits>
583 bool perl_matcher<BidiIterator, Allocator, traits>::match_backref()
586 // Compare with what we previously matched.
587 // Note that this succeeds if the backref did not partisipate
588 // in the match, this is in line with ECMAScript, but not Perl
591 BidiIterator i = (*m_presult)[static_cast<const re_brace*>(pstate)->index].first;
592 BidiIterator j = (*m_presult)[static_cast<const re_brace*>(pstate)->index].second;
595 if((position == last) || (traits_inst.translate(*position, icase) != traits_inst.translate(*i, icase)))
600 pstate = pstate->next.p;
604 template <class BidiIterator, class Allocator, class traits>
605 bool perl_matcher<BidiIterator, Allocator, traits>::match_long_set()
607 typedef typename traits::char_class_type char_class_type;
608 // let the traits class do the work:
611 BidiIterator t = re_is_set_member(position, last, static_cast<const re_set_long<char_class_type>*>(pstate), re.get_data(), icase);
614 pstate = pstate->next.p;
621 template <class BidiIterator, class Allocator, class traits>
622 bool perl_matcher<BidiIterator, Allocator, traits>::match_set()
626 if(static_cast<const re_set*>(pstate)->_map[static_cast<unsigned char>(traits_inst.translate(*position, icase))])
628 pstate = pstate->next.p;
635 template <class BidiIterator, class Allocator, class traits>
636 bool perl_matcher<BidiIterator, Allocator, traits>::match_jump()
638 pstate = static_cast<const re_jump*>(pstate)->alt.p;
642 template <class BidiIterator, class Allocator, class traits>
643 bool perl_matcher<BidiIterator, Allocator, traits>::match_combining()
647 if(is_combining(traits_inst.translate(*position, icase)))
650 while((position != last) && is_combining(traits_inst.translate(*position, icase)))
652 pstate = pstate->next.p;
656 template <class BidiIterator, class Allocator, class traits>
657 bool perl_matcher<BidiIterator, Allocator, traits>::match_soft_buffer_end()
659 if(m_match_flags & match_not_eob)
661 BidiIterator p(position);
662 while((p != last) && is_separator(traits_inst.translate(*p, icase)))++p;
665 pstate = pstate->next.p;
669 template <class BidiIterator, class Allocator, class traits>
670 bool perl_matcher<BidiIterator, Allocator, traits>::match_restart_continue()
672 if(position == search_base)
674 pstate = pstate->next.p;
680 template <class BidiIterator, class Allocator, class traits>
681 bool perl_matcher<BidiIterator, Allocator, traits>::match_backstep()
684 #pragma warning(push)
685 #pragma warning(disable:4127)
687 if( ::boost::is_random_access_iterator<BidiIterator>::value)
689 std::ptrdiff_t maxlen = ::boost::re_detail::distance(backstop, position);
690 if(maxlen < static_cast<const re_brace*>(pstate)->index)
692 std::advance(position, -static_cast<const re_brace*>(pstate)->index);
696 int c = static_cast<const re_brace*>(pstate)->index;
699 if(position == backstop)
704 pstate = pstate->next.p;
711 template <class BidiIterator, class Allocator, class traits>
712 inline bool perl_matcher<BidiIterator, Allocator, traits>::match_assert_backref()
714 // return true if marked sub-expression N has been matched:
715 int index = static_cast<const re_brace*>(pstate)->index;
719 // Magic value for a (DEFINE) block:
724 // Check if index is a hash value:
726 index = re.get_data().get_id(index);
727 // Have we matched subexpression "index"?
728 result = (*m_presult)[index].matched;
729 pstate = pstate->next.p;
733 // Have we recursed into subexpression "index"?
734 // If index == 0 then check for any recursion at all, otherwise for recursion to -index-1.
737 id = re.get_data().get_id(id);
738 result = recursion_stack_position && ((recursion_stack[recursion_stack_position-1].id == id) || (index == 0));
739 pstate = pstate->next.p;
744 template <class BidiIterator, class Allocator, class traits>
745 bool perl_matcher<BidiIterator, Allocator, traits>::match_toggle_case()
747 // change our case sensitivity:
748 this->icase = static_cast<const re_case*>(pstate)->icase;
749 pstate = pstate->next.p;
754 template <class BidiIterator, class Allocator, class traits>
755 bool perl_matcher<BidiIterator, Allocator, traits>::find_restart_any()
758 #pragma warning(push)
759 #pragma warning(disable:4127)
761 const unsigned char* _map = re.get_map();
764 // skip everything we can't match:
765 while((position != last) && !can_start(*position, _map, (unsigned char)mask_any) )
769 // run out of characters, try a null match if possible:
771 return match_prefix();
774 // now try and obtain a match:
787 template <class BidiIterator, class Allocator, class traits>
788 bool perl_matcher<BidiIterator, Allocator, traits>::find_restart_word()
791 #pragma warning(push)
792 #pragma warning(disable:4127)
794 // do search optimised for word starts:
795 const unsigned char* _map = re.get_map();
796 if((m_match_flags & match_prev_avail) || (position != base))
798 else if(match_prefix())
802 while((position != last) && traits_inst.isctype(*position, m_word_mask))
804 while((position != last) && !traits_inst.isctype(*position, m_word_mask))
809 if(can_start(*position, _map, (unsigned char)mask_any) )
823 template <class BidiIterator, class Allocator, class traits>
824 bool perl_matcher<BidiIterator, Allocator, traits>::find_restart_line()
826 // do search optimised for line starts:
827 const unsigned char* _map = re.get_map();
830 while(position != last)
832 while((position != last) && !is_separator(*position))
839 if(re.can_be_null() && match_prefix())
844 if( can_start(*position, _map, (unsigned char)mask_any) )
856 template <class BidiIterator, class Allocator, class traits>
857 bool perl_matcher<BidiIterator, Allocator, traits>::find_restart_buf()
859 if((position == base) && ((m_match_flags & match_not_bob) == 0))
860 return match_prefix();
864 template <class BidiIterator, class Allocator, class traits>
865 bool perl_matcher<BidiIterator, Allocator, traits>::find_restart_lit()
869 return false; // can't possibly match if we're at the end already
871 unsigned type = (m_match_flags & match_continuous) ?
872 static_cast<unsigned int>(regbase::restart_continue)
873 : static_cast<unsigned int>(re.get_restart_type());
875 const kmp_info<char_type>* info = access::get_kmp(re);
877 const char_type* x = info->pstr;
879 while (position != last)
881 while((j > -1) && (x[j] != traits_inst.translate(*position, icase)))
882 j = info->kmp_next[j];
887 if(type == regbase::restart_fixed_lit)
889 std::advance(position, -j);
891 std::advance(restart, len);
892 m_result.set_first(position);
893 m_result.set_second(restart);
900 std::advance(position, -j);
905 for(int k = 0; (restart != position) && (k < j); ++k, --restart)
906 {} // dwa 10/20/2000 - warning suppression for MWCW
910 j = 0; //we could do better than this...
915 if((m_match_flags & match_partial) && (position == last) && j)
917 // we need to check for a partial match:
919 std::advance(position, -j);
920 return match_prefix();
926 } // namespace re_detail
931 # pragma warning(pop)
938 #pragma warning(push)
939 #pragma warning(disable: 4103)
941 #ifdef BOOST_HAS_ABI_HEADERS
942 # include BOOST_ABI_SUFFIX