3 * Copyright (c) 1998-2002
6 * Permission to use, copy, modify, distribute and sell this software
7 * and its documentation for any purpose is hereby granted without fee,
8 * provided that the above copyright notice appear in all copies and
9 * that both that copyright notice and this permission notice appear
10 * in supporting documentation. Dr John Maddock makes no representations
11 * about the suitability of this software for any purpose.
12 * It is provided "as is" without express or implied warranty.
17 * LOCATION: see http://www.boost.org for most recent version.
18 * FILE: cpp_regex_traits.cpp
19 * VERSION: see <boost/version.hpp>
20 * DESCRIPTION: Implements the cpp_regex_traits<charT> traits class
24 #define BOOST_REGEX_SOURCE
26 #include <boost/regex/config.hpp>
28 #if !defined(BOOST_NO_STD_LOCALE) && !defined(BOOST_NO_STD_WSTREAMBUF)
31 # pragma warning(disable:4786 4702 4127 4244)
41 #include <boost/regex/regex_traits.hpp>
42 #include <boost/cregex.hpp>
43 #include <boost/scoped_array.hpp>
44 #include "primary_transform.hpp"
47 # pragma warning(disable:4786 4702 4127 4244)
// Loop bound used when scanning re_char_class_names and re_char_class_id
// (the id table below has 14 entries).
51 const unsigned int re_classes_max = 14;
// Count of distinct plain-char values (CHAR_MIN through CHAR_MAX inclusive);
// used to size and initialise the lower-case mapping tables.
52 const unsigned int char_set_size = CHAR_MAX - CHAR_MIN + 1;
// Character-class identifiers, one per supported class name. The array is
// indexed in step with re_char_class_names: lookup_classname() returns
// re_char_class_id[i] for the i-th name, and also indexes it with the
// value stored in the per-locale classes map.
54 boost::uint_fast32_t re_char_class_id[] = {
55 boost::re_detail::cpp_regex_traits_base::char_class_alnum,
56 boost::re_detail::cpp_regex_traits_base::char_class_alpha,
57 boost::re_detail::cpp_regex_traits_base::char_class_cntrl,
58 boost::re_detail::cpp_regex_traits_base::char_class_digit,
59 boost::re_detail::cpp_regex_traits_base::char_class_graph,
60 boost::re_detail::cpp_regex_traits_base::char_class_lower,
61 boost::re_detail::cpp_regex_traits_base::char_class_print,
62 boost::re_detail::cpp_regex_traits_base::char_class_punct,
63 boost::re_detail::cpp_regex_traits_base::char_class_space,
64 boost::re_detail::cpp_regex_traits_base::char_class_upper,
65 boost::re_detail::cpp_regex_traits_base::char_class_xdigit,
66 boost::re_detail::cpp_regex_traits_base::char_class_blank,
67 boost::re_detail::cpp_regex_traits_base::char_class_word,
68 boost::re_detail::cpp_regex_traits_base::char_class_unicode,
71 const char* re_char_class_names[] = {
//
// parser_buf:
// A read-only stream buffer over a caller-supplied character range.
// pubsetbuf(s, n) makes [s, s+n) the get area; nothing is copied, so the
// range must outlive every read. Output-mode seeks are always rejected.
// Copying is disabled (copy operations are declared private and left
// undefined).
//
template <class charT,
          class traits = ::std::char_traits<charT> >
class parser_buf : public ::std::basic_streambuf<charT, traits>
{
   typedef ::std::basic_streambuf<charT, traits> base_type;
   typedef typename base_type::int_type int_type;
   typedef typename base_type::char_type char_type;
   typedef typename base_type::pos_type pos_type;
   typedef ::std::streamsize streamsize;
   typedef typename base_type::off_type off_type;
public:
   // Starts with an empty get area.
   parser_buf() : base_type() { setbuf(0, 0); }
   // Returns the current read position inside the attached range.
   const charT* getnext() { return this->gptr(); }
protected:
   std::basic_streambuf<charT, traits>* setbuf(char_type* s, streamsize n);
   typename parser_buf<charT, traits>::pos_type seekpos(pos_type sp, ::std::ios_base::openmode which);
   typename parser_buf<charT, traits>::pos_type seekoff(off_type off, ::std::ios_base::seekdir way, ::std::ios_base::openmode which);
private:
   parser_buf& operator=(const parser_buf&);
   parser_buf(const parser_buf&);
};

//
// Attaches the range [s, s+n) as the get area and rewinds to its start.
// Returns this.
//
template<class charT, class traits>
std::basic_streambuf<charT, traits>*
parser_buf<charT, traits>::setbuf(char_type* s, streamsize n)
{
   this->setg(s, s, s + n);
   return this;
}

//
// Relative seek within the attached range.
//   off   - offset to apply; for ios_base::beg and ios_base::end it must lie
//           in [0, size]. Note that for ios_base::end the offset is measured
//           backwards from the end of the range.
//   way   - origin of the seek.
//   which - must not include ios_base::out.
// Returns the new position, or pos_type(off_type(-1)) if the request is
// rejected (in which case the read position is left unchanged).
//
template<class charT, class traits>
typename parser_buf<charT, traits>::pos_type
parser_buf<charT, traits>::seekoff(off_type off, ::std::ios_base::seekdir way, ::std::ios_base::openmode which)
{
   typedef typename parser_buf<charT, traits>::pos_type pos_type;
   if(which & ::std::ios_base::out)
      return pos_type(off_type(-1));
   std::ptrdiff_t size = this->egptr() - this->eback();
   std::ptrdiff_t pos = this->gptr() - this->eback();
   charT* g = this->eback();
   switch(way)
   {
   case ::std::ios_base::beg:
      if((off < 0) || (off > size))
         return pos_type(off_type(-1));
      this->setg(g, g + off, g + size);
      // each origin is handled on its own: without this break the seek
      // would fall through and be re-applied relative to end and cur.
      break;
   case ::std::ios_base::end:
      if((off < 0) || (off > size))
         return pos_type(off_type(-1));
      this->setg(g, g + size - off, g + size);
      break;
   case ::std::ios_base::cur:
   {
      std::ptrdiff_t newpos = pos + off;
      if((newpos < 0) || (newpos > size))
         return pos_type(off_type(-1));
      this->setg(g, g + newpos, g + size);
      break;
   }
   default:
      // unknown origin: leave the position alone and report it below.
      break;
   }
   return static_cast<pos_type>(this->gptr() - this->eback());
}

//
// Absolute seek within the attached range.
//   sp    - target position, must lie in [0, size].
//   which - must not include ios_base::out.
// Returns sp on success, as the std::basic_streambuf::seekpos contract
// requires, or pos_type(off_type(-1)) if the request is rejected (the read
// position is then left unchanged).
//
template<class charT, class traits>
typename parser_buf<charT, traits>::pos_type
parser_buf<charT, traits>::seekpos(pos_type sp, ::std::ios_base::openmode which)
{
   if(which & ::std::ios_base::out)
      return pos_type(off_type(-1));
   std::ptrdiff_t size = this->egptr() - this->eback();
   charT* g = this->eback();
   const ::std::streamsize target = ::std::streamsize(sp);
   // a negative target would place the read pointer before the buffer.
   if((target < 0) || (target > size))
      return pos_type(off_type(-1));
   this->setg(g, g + target, g + size);
   return sp;
}
174 struct message_data<char>
// Syntax type for every possible character value. The constructor fills a
// fixed 256 entries, so the table needs CHAR_MAX-CHAR_MIN+1 elements (the
// same size the wchar_t specialization uses for its syntax_ table).
unsigned char syntax_map[CHAR_MAX-CHAR_MIN+1];
177 std::map<std::string, std::string, std::less<std::string> > collating_elements;
178 std::map<std::string, std::size_t, std::less<std::string> > classes;
181 parser_buf<char> sbuf;
183 std::string error_strings[boost::REG_E_UNKNOWN+1];
185 message_data(const std::locale& l, const std::string& regex_message_catalogue);
187 message_data(const message_data&);
188 message_data& operator=(const message_data&);
// Builds the per-locale data for narrow characters: the syntax map, any
// custom collating-element names and the error strings. When
// regex_message_catalogue is non-empty the texts are looked up in that
// message catalogue through the std::messages facet of locale l.
192 message_data<char>::message_data(const std::locale& l, const std::string& regex_message_catalogue)
196 #ifndef BOOST_NO_STD_MESSAGES
198 const std::messages<char>* pm = 0;
199 std::messages<char>::catalog cat = -1;
// Only try to open a catalogue when a name was actually supplied.
200 if(regex_message_catalogue.size())
202 pm = &BOOST_USE_FACET(std::messages<char>, l);
203 cat = pm->open(regex_message_catalogue, l);
204 #ifndef BOOST_NO_EXCEPTIONS
// With exceptions enabled a failure is reported as std::runtime_error
// (presumably when open() returned a negative handle - the guarding
// condition is outside the lines shown here).
207 std::string m("Unable to open message catalog: ");
208 throw std::runtime_error(m + regex_message_catalogue);
// Without exceptions the same condition is asserted instead.
211 BOOST_REGEX_NOEH_ASSERT(cat >= 0);
// Default every entry to syntax_char. This writes a fixed 256 bytes, so
// syntax_map must hold at least 256 elements.
215 std::memset(syntax_map, cpp_regex_traits<char>::syntax_char, 256);
// Scratch buffer for default messages; it is only re-allocated when a
// message needs more room than any previous one.
217 scoped_array<char> a;
218 std::size_t array_size = 0;
219 std::size_t new_size;
// For each syntax type i the message with id i+100 lists the characters
// that have that type.
220 for(i = 1; i < cpp_regex_traits<char>::syntax_max; ++i)
// First call (null buffer) only asks for the required size.
222 new_size = re_get_default_message(0, 0, i+100);
223 if(new_size > array_size)
225 a.reset(new char[new_size]);
226 array_size = new_size;
// Second call copies the default text into the buffer.
228 re_get_default_message(a.get(), array_size, i+100);
229 std::string s = a.get();
230 #ifndef BOOST_NO_STD_MESSAGES
// The catalogue may override the default; s is passed as the fallback.
232 s = pm->get(cat, 0, i+100, s);
234 for(std::size_t j = 0; j < s.size(); ++j)
// NOTE(review): s[j] is a plain char used directly as the index. Where
// char is signed, a value above 127 would index before the start of the
// table. The wchar_t specialization range-checks before indexing;
// consider converting to unsigned char here - confirm.
236 syntax_map[s[j]] = (unsigned char)(i);
240 #ifndef BOOST_NO_STD_MESSAGES
241 // load any custom collate names:
243 // for some reason Borland C++ Builder 6 won't let us use
244 // std::isspace(char, std::locale) unless we call it
245 // unqualified - weird. This seems to be affecting other
246 // STLport users as well (gcc3.1+STLport5), so enable the
247 // workaround for all STLport users...
249 #if defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)
251 # define BOOST_REGEX_STD
253 # define BOOST_REGEX_STD std::
260 c2 = pm->get(cat, 0, i, c1);
// Each entry is split into two whitespace-separated tokens:
// [p1, p2) becomes the key and [p3, p4) the mapped value. The statements
// that give p1..p4 their starting points are outside the lines shown here.
263 const char* p1, *p2, *p3, *p4;;
// skip leading whitespace
265 while(*p1 && BOOST_REGEX_STD isspace((char)*p1, l))++p1;
// run to the end of the first token
267 while(*p2 && !BOOST_REGEX_STD isspace((char)*p2, l))++p2;
// skip the separator
269 while(*p3 && BOOST_REGEX_STD isspace((char)*p3, l))++p3;
// run to the end of the second token
271 while(*p4 && !BOOST_REGEX_STD isspace((char)*p4, l))++p4;
272 collating_elements[std::string(p1, p2)] = std::string(p3, p4);
// fetch the next entry
275 c2 = pm->get(cat, 0, i, c1);
281 #ifndef BOOST_NO_STD_MESSAGES
// Message ids 300.. are read once per character class (what is done with
// the text is outside the lines shown here).
284 for(i = 0; i < re_classes_max; ++i)
286 s = pm->get(cat, 0, i+300, m);
// Message ids 200.. hold the error strings, one per error code.
290 for(i = 0; i <= boost::REG_E_UNKNOWN ; ++i)
292 s = pm->get(cat, 0, i+200, m);
293 error_strings[i] = s;
// Replaces the message-catalogue name held in the static regex_message_cat
// buffer with l.
302 std::string BOOST_REGEX_CALL cpp_regex_traits_base::set_message_catalogue(const std::string& l)
// A name that would not fit in the buffer together with its terminating
// NUL is not copied (the statement taken in that case is outside the
// lines shown here).
304 if(sizeof(regex_message_cat) <= l.size())
// Remember the previous name before overwriting it - presumably this is
// the value handed back to the caller (the return is not visible here).
306 std::string old(regex_message_cat);
// Cannot overflow: the check above guarantees l.size() < sizeof(regex_message_cat).
307 std::strcpy(regex_message_cat, l.c_str());
311 char cpp_regex_traits_base::regex_message_cat[BOOST_REGEX_MAX_PATH] = {0};
314 } // namespace re_detail
// Default constructor: builds every locale-dependent table for the
// current locale_inst.
317 cpp_regex_traits<char>::cpp_regex_traits()
// Per-locale messages and syntax data, using the currently configured
// message catalogue name.
319 pmd = new re_detail::message_data<char>(locale_inst, regex_message_cat);
// psyntax points into the message data, it does not own the table.
320 psyntax = pmd->syntax_map;
321 #ifndef BOOST_NO_EXCEPTIONS
324 lower_map = new char[char_set_size];
325 BOOST_REGEX_NOEH_ASSERT(lower_map)
326 #ifndef BOOST_NO_EXCEPTIONS
// Start from the identity mapping ...
334 for(unsigned int i = 0; i < char_set_size; ++i)
335 lower_map[i] = static_cast<char>(i);
336 pctype = &BOOST_USE_FACET(std::ctype<char>, locale_inst);
// ... then let the ctype facet lower-case the whole table in place.
337 pctype->tolower(&lower_map[0], &lower_map[char_set_size]);
338 pcollate = &BOOST_USE_FACET(std::collate<char>, locale_inst);
// Determine the sort-key syntax for this locale; sort_delim is filled in
// through the pointer argument.
339 sort_type = re_detail::find_sort_syntax(this, &(this->sort_delim));
342 cpp_regex_traits<char>::~cpp_regex_traits()
348 int BOOST_REGEX_CALL cpp_regex_traits<char>::toi(char c)const
350 pmd->sbuf.pubsetbuf(&c, 1);
// Reads a number from [first, last) through the shared stream pmd->is and
// advances first past the characters that were consumed. The extraction
// and the return statement are outside the lines shown here.
362 int BOOST_REGEX_CALL cpp_regex_traits<char>::toi(const char*& first, const char* last, int radix)const
// Point the shared stream buffer at the caller's range (nothing is copied).
364 pmd->sbuf.pubsetbuf(const_cast<char*>(first), static_cast<std::streamsize>(last-first));
// Only the magnitude of radix selects the base: 16 is hex, 8 is octal and
// anything else is treated as decimal.
366 if(std::abs(radix) == 16) pmd->is >> std::hex;
367 else if(std::abs(radix) == 8) pmd->is >> std::oct;
368 else pmd->is >> std::dec;
// in_avail() is what is left unread, so the difference from the range
// length is the number of characters consumed.
372 first = first + ((last - first) - pmd->sbuf.in_avail());
// Maps the class name in [first, last) to its character-class identifier.
// Names loaded for the current locale are tried first, then the built-in
// names. The result for an unknown name is outside the lines shown here.
379 boost::uint_fast32_t BOOST_REGEX_CALL cpp_regex_traits<char>::lookup_classname(const char* first, const char* last)const
383 std::string s(first, last);
// Locale-specific names: the map stores an index into re_char_class_id.
385 std::map<std::string, std::size_t, std::less<std::string> >::const_iterator pos = pmd->classes.find(s);
386 if(pos != pmd->classes.end())
387 return re_char_class_id[(*pos).second];
// Built-in names: linear scan of the table that parallels re_char_class_id.
389 for(i = 0; i < re_classes_max; ++i)
391 if(s == re_char_class_names[i])
392 return re_char_class_id[i];
// Looks up the collating element named by [first, last). Names loaded for
// the current locale take precedence; otherwise the default table is
// consulted and its result written to s.
397 bool BOOST_REGEX_CALL cpp_regex_traits<char>::lookup_collatename(std::string& s, const char* first, const char* last)const
400 std::string name(first, last);
401 std::map<std::string, std::string, std::less<std::string > >::const_iterator pos = pmd->collating_elements.find(name);
// Found in the locale-specific map (the handling of a hit is outside the
// lines shown here).
402 if(pos != pmd->collating_elements.end())
// Not a locale-specific name: fall back to the built-in defaults.
407 return re_detail::re_lookup_def_collate_name(s, name.c_str());
410 void BOOST_REGEX_CALL cpp_regex_traits<char>::transform_primary(std::string& out, const std::string& in)const
415 case re_detail::sort_C:
416 case re_detail::sort_unknown:
418 case re_detail::sort_fixed:
419 if((unsigned)sort_delim < out.size())
420 out.erase((int)sort_delim);
422 case re_detail::sort_delim:
423 for(unsigned int i = 0; i < out.size(); ++i)
425 if((out[i] == sort_delim) && (i+1 < out.size()))
435 std::string BOOST_REGEX_CALL cpp_regex_traits<char>::error_string(unsigned id)const
437 if((id <= boost::REG_E_UNKNOWN) && (pmd->error_strings[id].size()))
438 return pmd->error_strings[id];
439 return boost::re_detail::re_default_error_messages[id];
// Switches the traits object to locale l and rebuilds every
// locale-dependent table. Several statements (the update of locale_inst,
// the hand-over from npmd to pmd and the return) are outside the lines
// shown here.
442 cpp_regex_traits<char>::locale_type BOOST_REGEX_CALL cpp_regex_traits<char>::imbue(locale_type l)
// Keep a copy of the locale in use on entry - presumably the return value.
444 locale_type old_l(locale_inst);
// The replacement message data is built before anything else is touched.
// NOTE(review): this reads locale_inst rather than l, so locale_inst is
// presumably assigned from l on a line not shown - confirm.
446 re_detail::message_data<char>* npmd = new re_detail::message_data<char>(locale_inst, regex_message_cat);
449 psyntax = pmd->syntax_map;
// Rebuild the lower-case table: identity first, then lower-cased in place
// by the ctype facet of the new locale.
450 for(unsigned int i = 0; i < char_set_size; ++i)
451 lower_map[i] = static_cast<char>(i);
452 pctype = &BOOST_USE_FACET(std::ctype<char>, locale_inst);
453 pctype->tolower(&lower_map[0], &lower_map[char_set_size]);
454 pcollate = &BOOST_USE_FACET(std::collate<char>, locale_inst);
455 sort_type = re_detail::find_sort_syntax(this, &(this->sort_delim));
459 #ifndef BOOST_NO_WREGEX
// Converts the wide string is to a narrow string using the codecvt facet
// cvt. If the facet reports an error or performs no conversion, each wide
// character is simply cast to char.
463 std::string BOOST_REGEX_CALL to_narrow(const std::basic_string<wchar_t>& is, const std::codecvt<wchar_t, char, std::mbstate_t>& cvt)
// Initial guess: two output bytes per input character.
466 std::basic_string<wchar_t>::size_type bufsize = is.size() * 2;
468 // declare buffer first as VC6 workaround for internal compiler error!
469 char* pc = new char[bufsize];
// t owns the buffer from here on.
470 scoped_array<char> t(pc);
471 #if defined(BOOST_MSVC) && !defined(DINKUMWARE_CE)
472 std::mbstate_t state = 0;
474 std::mbstate_t state = std::mbstate_t();
477 const wchar_t* next_in;
481 switch(cvt.out(state, is.c_str(), is.c_str() + is.size(), next_in, t.get(), t.get() + bufsize, next_out))
483 case std::codecvt_base::ok:
// next_out marks the end of the converted text inside the buffer.
484 return std::string(t.get(), next_out);
485 case std::codecvt_base::partial:
// Output buffer was too small: replace it (the statement that changes
// bufsize is outside the lines shown here) and convert again.
487 t.reset(new char[bufsize]);
489 case std::codecvt_base::error:
490 // not much we can do here but guess:
491 case std::codecvt_base::noconv:
// Fallback: narrow each character with a plain cast.
493 for(unsigned i = 0; i < is.size(); ++i)
495 out.append(1, (char)is[i]);
// Converts the narrow string is to a wide string using the codecvt facet
// cvt. If the facet reports an error, performs no conversion, or the
// output buffer has already reached maxsize, each char is widened by plain
// assignment.
502 std::wstring BOOST_REGEX_CALL to_wide(const std::string& is, const std::codecvt<wchar_t, char, std::mbstate_t>& cvt)
// Initial output capacity, and the ceiling beyond which the buffer is no
// longer enlarged.
505 std::string::size_type bufsize = is.size() + 2;
506 std::string::size_type maxsize = is.size() * 100;
508 // declare buffer first as VC6 workaround for internal compiler error!
509 wchar_t* pc = new wchar_t[bufsize];
// t owns the buffer from here on.
510 scoped_array<wchar_t> t(pc);
511 #if defined(BOOST_MSVC) && !defined(DINKUMWARE_CE)
512 std::mbstate_t state = 0;
514 std::mbstate_t state = std::mbstate_t();
522 switch(cvt.in(state, is.c_str(), is.c_str() + is.size(), next_in, t.get(), t.get() + bufsize, next_out))
524 case std::codecvt_base::ok:
// next_out marks the end of the converted text inside the buffer.
525 return std::wstring(t.get(), next_out);
526 case std::codecvt_base::partial:
// Output buffer was too small: replace it and retry, but only while
// bufsize is still below maxsize (the statement that changes bufsize is
// outside the lines shown here).
528 if(bufsize < maxsize)
530 t.reset(new wchar_t[bufsize]);
534 // error fall through:
535 case std::codecvt_base::error:
536 // not much we can do here but guess:
537 case std::codecvt_base::noconv:
// Fallback: widen each character by plain assignment.
539 for(unsigned i = 0; i < is.size(); ++i)
541 out.append(1, is[i]);
551 struct message_data<wchar_t>
553 #ifndef BOOST_NO_STD_MESSAGES
554 typedef std::messages<wchar_t>::string_type string_type;
556 typedef std::wstring string_type;
567 std::list<syntax_map> syntax;
568 std::map<string_type, std::size_t> classes;
569 std::map<string_type, string_type> collating_elements;
570 unsigned char syntax_[CHAR_MAX-CHAR_MIN+1];
572 parser_buf<wchar_t> sbuf;
574 std::string error_strings[boost::REG_E_UNKNOWN+1];
576 message_data(const std::locale& l, const std::string& regex_message_catalogue);
578 message_data(const message_data&);
579 message_data& operator=(const message_data&);
// Builds the per-locale data for wide characters: the syntax table, any
// custom collating-element names and the error strings. When
// regex_message_catalogue is non-empty the texts are looked up in that
// message catalogue through the std::messages facet of locale l.
582 message_data<wchar_t>::message_data(const std::locale& l, const std::string& regex_message_catalogue)
// Facet used to convert between the narrow default messages and wide
// strings.
587 typedef std::codecvt<wchar_t, char, std::mbstate_t> cvt_type;
588 const cvt_type& cvt = BOOST_USE_FACET(cvt_type, l);
589 #ifndef BOOST_NO_STD_MESSAGES
590 const std::messages<wchar_t>& msgs = BOOST_USE_FACET(std::messages<wchar_t>, l);
591 std::messages<wchar_t>::catalog cat = -1;
// Only try to open a catalogue when a name was actually supplied.
592 if(regex_message_catalogue.size())
594 cat = msgs.open(regex_message_catalogue, l);
595 #ifndef BOOST_NO_EXCEPTIONS
// With exceptions enabled a failure is reported as std::runtime_error
// (presumably when open() returned a negative handle - the guarding
// condition is outside the lines shown here).
598 std::string m("Unable to open message catalog: ");
599 throw std::runtime_error(m + regex_message_catalogue);
// Without exceptions the same condition is asserted instead.
602 BOOST_REGEX_NOEH_ASSERT(cat >= 0);
// Scratch buffer for default messages; it is only re-allocated when a
// message needs more room than any previous one.
606 scoped_array<char> a;
607 std::size_t array_size = 0;
608 std::size_t new_size;
// Default every entry of the table to syntax_char.
610 std::memset(syntax_, cpp_regex_traits<wchar_t>::syntax_char, sizeof(syntax_));
// For each syntax type i the message with id i+100 lists the characters
// that have that type.
611 for(i = 1; i < cpp_regex_traits<wchar_t>::syntax_max; ++i)
// First call (null buffer) only asks for the required size.
613 new_size = re_get_default_message(0, 0, i+100);
614 if(new_size > array_size)
616 a.reset(new char[new_size]);
617 array_size = new_size;
// Second call copies the default text into the buffer.
619 re_get_default_message(a.get(), array_size, i+100);
620 std::string ns = a.get();
// The default text is narrow, so widen it before use.
621 string_type s = to_wide(ns, cvt);
622 #ifndef BOOST_NO_STD_MESSAGES
// The catalogue may override the default; s is passed as the fallback.
624 s = BOOST_USE_FACET(std::messages<wchar_t>, l).get(cat, 0, (int)i+100, s);
626 for(unsigned int j = 0; j < s.size(); ++j)
// Only characters in the range 0..UCHAR_MAX go into the flat table.
628 if((s[j] <= UCHAR_MAX) && (s[j] >= 0))
629 syntax_[s[j]] = static_cast<unsigned char>(i);
// m is presumably an entry for the syntax list used for characters
// outside that range - its declaration is outside the lines shown here.
633 m.type = static_cast<unsigned int>(i);
639 #ifndef BOOST_NO_STD_MESSAGES
640 // load any custom collate names:
645 c2 = msgs.get(cat, 0, (int)i, c1);
// Each entry is split into two whitespace-separated tokens:
// [p1, p2) becomes the key and [p3, p4) the mapped value. The statements
// that give p1..p4 their starting points are outside the lines shown here.
648 const wchar_t* p1, *p2, *p3, *p4;;
// skip leading whitespace
650 while(*p1 && BOOST_REGEX_STD isspace((wchar_t)*p1, l))++p1;
// run to the end of the first token
652 while(*p2 && !BOOST_REGEX_STD isspace((wchar_t)*p2, l))++p2;
// skip the separator
654 while(*p3 && BOOST_REGEX_STD isspace((wchar_t)*p3, l))++p3;
// run to the end of the second token
656 while(*p4 && !BOOST_REGEX_STD isspace((wchar_t)*p4, l))++p4;
657 collating_elements[std::basic_string<wchar_t>(p1, p2)] = std::basic_string<wchar_t>(p3, p4);
// fetch the next entry
660 c2 = msgs.get(cat, 0, (int)i, c1);
// Message ids 300.. are read once per character class (what is done with
// the text is outside the lines shown here).
667 for(i = 0; i < re_classes_max; ++i)
669 c1 = msgs.get(cat, 0, static_cast<int>(i+300), c2);
// Message ids 200.. hold the error strings; they are stored narrow.
673 for(i = 0; i <= boost::REG_E_UNKNOWN ; ++i)
675 c1 = msgs.get(cat, 0, static_cast<int>(i+200), c2);
676 error_strings[i] = to_narrow(c1, cvt);
685 } // namespace re_detail
687 unsigned int BOOST_REGEX_CALL cpp_regex_traits<wchar_t>::do_syntax_type(size_type c)const
689 std::list<re_detail::message_data<wchar_t>::syntax_map>::const_iterator i, j;
690 i = pmd->syntax.begin();
691 j = pmd->syntax.end();
694 if(((uchar_type)(*i).c) == c)
701 void BOOST_REGEX_CALL cpp_regex_traits<wchar_t>::transform_primary(std::basic_string<wchar_t>& out, const std::basic_string<wchar_t>& in)const
706 case re_detail::sort_C:
707 case re_detail::sort_unknown:
709 case re_detail::sort_fixed:
710 if((unsigned)sort_delim < out.size())
711 out.erase((int)sort_delim);
713 case re_detail::sort_delim:
714 for(unsigned int i = 0; i < out.size(); ++i)
716 if((out[i] == sort_delim) && (i+1 < out.size()))
725 int BOOST_REGEX_CALL cpp_regex_traits<wchar_t>::toi(wchar_t c)const
727 pmd->sbuf.pubsetbuf(&c, 1);
// Reads a number from [first, last) through the shared stream pmd->is and
// advances first past the characters that were consumed. The extraction
// and the return statement are outside the lines shown here.
739 int BOOST_REGEX_CALL cpp_regex_traits<wchar_t>::toi(const wchar_t*& first, const wchar_t* last, int radix)const
// Point the shared stream buffer at the caller's range (nothing is copied).
741 pmd->sbuf.pubsetbuf(const_cast<wchar_t*>(first), static_cast<std::streamsize>(last-first));
// Only the magnitude of radix selects the base: 16 is hex, 8 is octal and
// anything else is treated as decimal.
743 if(std::abs(radix) == 16) pmd->is >> std::hex;
744 else if(std::abs(radix) == 8) pmd->is >> std::oct;
745 else pmd->is >> std::dec;
// in_avail() is what is left unread, so the difference from the range
// length is the number of characters consumed.
749 first = first + ((last - first) - pmd->sbuf.in_avail());
// Maps the class name in [first, last) to its character-class identifier.
// Names loaded for the current locale are tried first, then the built-in
// (narrow) names. The result for an unknown name is outside the lines
// shown here.
756 boost::uint_fast32_t BOOST_REGEX_CALL cpp_regex_traits<wchar_t>::lookup_classname(const wchar_t* first, const wchar_t* last)const
760 std::wstring s(first, last);
// Locale-specific names: the map stores an index into re_char_class_id.
762 std::map<std::wstring, std::size_t>::const_iterator pos = pmd->classes.find(s);
763 if(pos != pmd->classes.end())
764 return re_char_class_id[(*pos).second];
// The built-in names are narrow strings, so narrow the query before
// comparing.
766 std::string ns = re_detail::to_narrow(s, *pcdv);
768 for(i = 0; i < re_classes_max; ++i)
770 if(ns == re_char_class_names[i])
771 return re_char_class_id[i];
// Looks up the collating element named by [first, last). Names loaded for
// the current locale take precedence; otherwise the narrow default table
// is consulted and its result widened into s.
776 bool BOOST_REGEX_CALL cpp_regex_traits<wchar_t>::lookup_collatename(std::basic_string<wchar_t>& s, const wchar_t* first, const wchar_t* last)const
779 std::wstring name(first, last);
780 std::map<std::wstring, std::wstring>::const_iterator pos = pmd->collating_elements.find(name);
// Found in the locale-specific map (the handling of a hit is outside the
// lines shown here).
781 if(pos != pmd->collating_elements.end())
// The default table is keyed by narrow names: narrow the query, look it
// up, then widen whatever was found.
786 std::string ns = re_detail::to_narrow(name, *pcdv);
788 bool result = re_detail::re_lookup_def_collate_name(ns2, ns.c_str());
789 s = re_detail::to_wide(ns2, *pcdv);
793 std::string BOOST_REGEX_CALL cpp_regex_traits<wchar_t>::error_string(unsigned id)const
795 if((id <= boost::REG_E_UNKNOWN) && (pmd->error_strings[id].size()))
796 return pmd->error_strings[id];
797 return boost::re_detail::re_default_error_messages[id];
// Default constructor: builds every locale-dependent table for the
// current locale_inst.
800 cpp_regex_traits<wchar_t>::cpp_regex_traits()
// Per-locale messages and syntax data, using the currently configured
// message catalogue name.
802 pmd = new re_detail::message_data<wchar_t>(locale_inst, std::string(regex_message_cat));
// psyntax points into the message data, it does not own the table.
803 psyntax = pmd->syntax_;
804 #ifndef BOOST_NO_EXCEPTIONS
// The lower-case table only covers the first char_set_size wide
// character values.
807 lower_map = new wchar_t[char_set_size];
808 BOOST_REGEX_NOEH_ASSERT(lower_map)
809 #ifndef BOOST_NO_EXCEPTIONS
// Start from the identity mapping ...
817 for(unsigned int i = 0; i < char_set_size; ++i)
818 lower_map[i] = static_cast<wchar_t>(i);
819 pctype = &BOOST_USE_FACET(std::ctype<wchar_t>, locale_inst);
// ... then let the ctype facet lower-case the whole table in place.
820 pctype->tolower(&lower_map[0], &lower_map[char_set_size]);
821 pcollate = &BOOST_USE_FACET(std::collate<wchar_t>, locale_inst);
// Conversion facet used whenever narrow and wide strings must be bridged.
822 typedef std::codecvt<wchar_t, char, std::mbstate_t> cvt_t;
823 pcdv = &BOOST_USE_FACET(cvt_t, locale_inst);
// Determine the sort-key syntax for this locale; sort_delim is filled in
// through the pointer argument.
824 sort_type = re_detail::find_sort_syntax(this, &(this->sort_delim));
827 cpp_regex_traits<wchar_t>::~cpp_regex_traits()
// Switches the traits object to locale l and rebuilds every
// locale-dependent table. Several statements (the update of locale_inst,
// the hand-over from npmd to pmd and the return) are outside the lines
// shown here.
833 cpp_regex_traits<wchar_t>::locale_type BOOST_REGEX_CALL cpp_regex_traits<wchar_t>::imbue(locale_type l)
// Keep a copy of the locale in use on entry - presumably the return value.
835 locale_type old_l(locale_inst);
// The replacement message data is built before anything else is touched.
// NOTE(review): this reads locale_inst rather than l, so locale_inst is
// presumably assigned from l on a line not shown - confirm.
837 re_detail::message_data<wchar_t>* npmd = new re_detail::message_data<wchar_t>(locale_inst, std::string(regex_message_cat));
840 psyntax = pmd->syntax_;
// Rebuild the lower-case table: identity first, then lower-cased in place
// by the ctype facet of the new locale.
841 for(unsigned int i = 0; i < char_set_size; ++i)
842 lower_map[i] = static_cast<wchar_t>(i);
843 pctype = &BOOST_USE_FACET(std::ctype<wchar_t>, locale_inst);
844 pctype->tolower(&lower_map[0], &lower_map[char_set_size]);
845 pcollate = &BOOST_USE_FACET(std::collate<wchar_t>, locale_inst);
846 typedef std::codecvt<wchar_t, char, std::mbstate_t> cvt_t;
847 pcdv = &BOOST_USE_FACET(cvt_t, locale_inst);
848 sort_type = re_detail::find_sort_syntax(this, &(this->sort_delim));
// Widens the narrow string s2 with the codecvt facet and copies the result
// into s1, a caller-supplied buffer of len wide characters. The return
// statement is outside the lines shown here.
852 std::size_t BOOST_REGEX_CALL cpp_regex_traits<wchar_t>::strwiden(wchar_t *s1, std::size_t len, const char *s2)const
855 std::wstring ws = re_detail::to_wide(s2, *pcdv);
// NOTE(review): wcscpy writes ws.size()+1 wide characters, so this copy
// must only run when len is large enough. The guard is presumably on the
// line not shown just above - confirm.
857 std::wcscpy(s1, ws.c_str());
861 #endif // BOOST_NO_WREGEX