boost/boost/token_functions.hpp

   1 // Boost token_functions.hpp  ------------------------------------------------//
   2
   3 // Copyright John R. Bandela 2001.
   4
   5 // Permission to copy, use, modify, sell and distribute this software
   6 // is granted provided this copyright notice appears in all
   7 // copies. This software is provided "as is" without express or
   8 // implied warranty, and with no claim as to its suitability for any
   9 // purpose.
  10
  11 // See http://www.boost.org/libs/tokenizer for documentation.
  12
  13 // Revision History:
  14
  15 // 20 Feb 2002   John Maddock
  16 //      Removed using namespace std declarations and added
  17 //      workaround for BOOST_NO_STDC_NAMESPACE (the library
  18 //      can be safely mixed with regex).
  19 // 06 Feb 2002   Jeremy Siek
  20 //      Added char_separator.
  21 // 02 Feb 2002   Jeremy Siek
  22 //      Removed tabs and a little cleanup.
  23
  24
  25 #ifndef BOOST_TOKEN_FUNCTIONS_JRB051801_HPP_
  26 #define BOOST_TOKEN_FUNCTIONS_JRB051801_HPP_
  27
  28 #include <vector>
  29 #include <stdexcept>
  30 #include <cassert>
  31 #include <string>
  32 #include <cctype>
  33
  34 //
  35 // the following must not be macros if we are to prefix them
  36 // with std:: (they shouldn't be macros anyway...)
  37 //
  38 #ifdef ispunct
  39 #  undef ispunct
  40 #endif
  41 #ifdef isspace
  42 #  undef isspace
  43 #endif
  44 //
  45 // fix namespace problems:
  46 //
  47 #ifdef BOOST_NO_STDC_NAMESPACE
  48 namespace std{
  49  using ::ispunct;
  50  using ::isspace;
  51 }
  52 #endif
  53
  54 namespace boost{
  55
  56   //===========================================================================
  57   // The escaped_list_separator class. Which is a model of TokenizerFunction
  58   // An escaped list is a super-set of what is commonly known as a comma
  59   // separated value (csv) list. It is separated into fields by a comma or
  60   // other character. If the delimiting character is inside quotes, then it is
  61   // counted as a regular character. To allow for embedded quotes in a field,
  62   // there can be escape sequences using the \ much like C.
  63   // The role of the comma, the quotation mark, and the escape
  64   // character (backslash \), can be assigned to other characters.
  65
  // Exception thrown by escaped_list_separator when the input contains a
  // malformed escape sequence (see escaped_list_separator::operator()).
  struct escaped_list_error : public std::runtime_error{
    escaped_list_error(const std::string& what_arg)
      : std::runtime_error(what_arg) { }
  };


// The out of the box GCC 2.95 on cygwin does not have a char_traits class.
// MSVC does not like the following typename
#if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
  template <class Char,
    class Traits = typename std::basic_string<Char>::traits_type >
#else
  template <class Char,
    class Traits = std::basic_string<Char>::traits_type >
#endif
  class escaped_list_separator {

  private:
    typedef std::basic_string<Char,Traits> string_type;

    string_type  escape_;  // characters that start an escape sequence
    string_type  c_;       // characters that separate fields
    string_type  quote_;   // characters that toggle quoting
    bool last_;            // set when the previous token ended on a separator,
                           // i.e. one more (possibly empty) field is pending

    // True if ch occurs in chars.  basic_string::find compares through
    // Traits, and needs nothing beyond <string>.
    static bool contains(const string_type& chars, Char ch) {
      return chars.find(ch) != string_type::npos;
    }
    bool is_escape(Char e) const { return contains(escape_, e); }
    bool is_c(Char e)      const { return contains(c_, e); }
    bool is_quote(Char e)  const { return contains(quote_, e); }

    // Handles the character following an escape character.  On entry next
    // points at the escape character; on exit it points at the escaped
    // character (the caller's loop steps past it).
    //   escape + 'n'                           -> appends '\n'
    //   escape + quote / separator / escape    -> appends that character
    // Throws escaped_list_error if the input ends right after the escape
    // character or the escaped character is none of the above.
    template <typename iterator, typename Token>
    void do_escape(iterator& next,iterator end,Token& tok) const {
      if (++next == end)
        throw escaped_list_error(std::string("cannot end with escape"));

      if (Traits::eq(*next,'n')) {
        tok+='\n';
      }
      else if (is_quote(*next) || is_c(*next) || is_escape(*next)) {
        tok+=*next;
      }
      else {
        throw escaped_list_error(std::string("unknown escape sequence"));
      }
    }

    public:

    // Single-character form: e is the escape character, c the field
    // separator and q the quote character.
    explicit escaped_list_separator(Char  e = '\\',
                                    Char c = ',',Char  q = '\"')
      : escape_(1,e), c_(1,c), quote_(1,q), last_(false) { }

    // Multi-character form: every character of e acts as an escape
    // character, every character of c as a separator and every character
    // of q as a quote.
    escaped_list_separator(string_type e, string_type c, string_type q)
      : escape_(e), c_(c), quote_(q), last_(false) { }

    // Forgets the pending "trailing empty field" state.
    void reset() {last_=false;}

    // Extracts the next field from [next,end) into tok and advances next
    // past the field and its terminating separator.
    // Returns false only when next == end and no empty field is pending
    // from a trailing separator; otherwise returns true (tok may be empty).
    // Escape sequences are processed both inside and outside quotes; quote
    // characters themselves are not copied into tok.
    // Throws escaped_list_error on a malformed escape sequence.
    template <typename InputIterator, typename Token>
    bool operator()(InputIterator& next,InputIterator end,Token& tok) {
      bool bInQuote = false;
      tok = Token();

      if (next == end) {
        // The previous token ended on a separator: report the final
        // empty field exactly once.
        if (last_) {
          last_ = false;
          return true;
        }
        return false;
      }

      last_ = false;
      for (;next != end;++next) {
        if (is_escape(*next)) {
          do_escape(next,end,tok);
        }
        else if (is_c(*next)) {
          if (!bInQuote) {
            // If we are not in quote, then we are done
            ++next;
            // The last character was a c, that means there is
            // 1 more blank field
            last_ = true;
            return true;
          }
          // A separator inside quotes is an ordinary character.
          tok+=*next;
        }
        else if (is_quote(*next)) {
          bInQuote=!bInQuote;
        }
        else {
          tok += *next;
        }
      }
      return true;
    }
  };
 182
 183
 184   //===========================================================================
 185   // The offset_separator class, which is a model of TokenizerFunction.
 186   // Offset breaks a string into tokens based on a range of offsets
 187
 188   class offset_separator {
 189   private:
 190
 191     std::vector<int> offsets_;
 192     unsigned int current_offset_;
 193     bool wrap_offsets_;
 194     bool return_partial_last_;
 195
 196   public:
 197     template <typename Iter>
 198     offset_separator(Iter begin, Iter end, bool wrap_offsets = true,
 199                      bool return_partial_last = true)
 200       : offsets_(begin,end), current_offset_(0),
 201         wrap_offsets_(wrap_offsets),
 202         return_partial_last_(return_partial_last) { }
 203
 204     offset_separator()
 205       : offsets_(1,1), current_offset_(),
 206         wrap_offsets_(true), return_partial_last_(true) { }
 207
 208     void reset() {
 209       current_offset_ = 0;
 210     }
 211
 212     template <typename InputIterator, typename Token>
 213     bool operator()(InputIterator& next, InputIterator end, Token& tok)
 214     {
 215       assert(!offsets_.empty());
 216
 217       tok = Token();
 218
 219       if (next == end)
 220         return false;
 221
 222       if (current_offset_ == offsets_.size())
 223         if (wrap_offsets_)
 224           current_offset_=0;
 225         else
 226           return false;
 227
 228       int c = offsets_[current_offset_];
 229       int i = 0;
 230       for (; i < c; ++i) {
 231         if (next == end)break;
 232         tok+=*next++;
 233       }
 234
 235       if (!return_partial_last_)
 236         if (i < (c-1) )
 237           return false;
 238
 239       ++current_offset_;
 240       return true;
 241     }
 242   };
 243
 244
 245   //===========================================================================
 246   // The char_separator class breaks a sequence of characters into
 247   // tokens based on the character delimiters (very much like bad old
 248   // strtok). A delimiter character can either be kept or dropped. A
 249   // kept delimiter shows up as an output token, whereas a dropped
 250   // delimiter does not.
 251
 252   // This class replaces the char_delimiters_separator class. The
 253   // constructor for the char_delimiters_separator class was too
 254   // confusing and needed to be deprecated. However, because of the
 255   // default arguments to the constructor, adding the new constructor
 256   // would cause ambiguity, so instead I deprecated the whole class.
 257   // The implementation of the class was also simplified considerably.
 258
 259   enum empty_token_policy { drop_empty_tokens, keep_empty_tokens };
 260
 261   // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
 262 #if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
 263   template <typename Char,
 264     typename Traits = typename std::basic_string<Char>::traits_type >
 265 #else
 266   template <typename Char,
 267     typename Traits = std::basic_string<Char>::traits_type >
 268 #endif
 269   class char_separator
 270   {
 271     typedef std::basic_string<Char,Traits> string_type;
 272   public:
 273     explicit
 274     char_separator(const Char* dropped_delims,
 275                    const Char* kept_delims = 0,
 276                    empty_token_policy empty_tokens = drop_empty_tokens)
 277       : m_dropped_delims(dropped_delims),
 278         m_use_ispunct(false),
 279         m_use_isspace(false),
 280         m_empty_tokens(empty_tokens),
 281         m_output_done(false)
 282     {
 283       // Borland workaround
 284       if (kept_delims)
 285         m_kept_delims = kept_delims;
 286     }
 287
 288                 // use ispunct() for kept delimiters and isspace for dropped.
 289     explicit
 290     char_separator()
 291       : m_use_ispunct(true),
 292         m_use_isspace(true),
 293         m_empty_tokens(drop_empty_tokens) { }
 294
 295     void reset() { }
 296
 297     template <typename InputIterator, typename Token>
 298     bool operator()(InputIterator& next, InputIterator end, Token& tok)
 299     {
 300       tok = Token();
 301
 302       // skip past all dropped_delims
 303       if (m_empty_tokens == drop_empty_tokens)
 304         for (; next != end  && is_dropped(*next); ++next)
 305           { }
 306
 307       if (m_empty_tokens == drop_empty_tokens) {
 308
 309         if (next == end)
 310           return false;
 311
 312         // if we are on a kept_delims move past it and stop
 313         if (is_kept(*next)) {
 314           tok += *next;
 315           ++next;
 316         } else
 317           // append all the non delim characters
 318           for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next)
 319             tok += *next;
 320       }
 321       else { // m_empty_tokens == keep_empty_tokens
 322
 323         // Handle empty token at the end
 324         if (next == end)
 325           if (m_output_done == false) {
 326             m_output_done = true;
 327             return true;
 328           } else
 329             return false;
 330
 331         if (is_kept(*next)) {
 332           if (m_output_done == false)
 333             m_output_done = true;
 334           else {
 335             tok += *next;
 336             ++next;
 337             m_output_done = false;
 338           }
 339         }
 340         else if (m_output_done == false && is_dropped(*next)) {
 341           m_output_done = true;
 342         }
 343         else {
 344           if (is_dropped(*next))
 345             ++next;
 346           for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next)
 347             tok += *next;
 348           m_output_done = true;
 349         }
 350       }
 351       return true;
 352     }
 353
 354   private:
 355     string_type m_kept_delims;
 356     string_type m_dropped_delims;
 357     bool m_use_ispunct;
 358     bool m_use_isspace;
 359     empty_token_policy m_empty_tokens;
 360     bool m_output_done;
 361
 362     bool is_kept(Char E) const
 363     {
 364       if (m_kept_delims.length())
 365         return m_kept_delims.find(E) != string_type::npos;
 366       else if (m_use_ispunct) {
 367         return std::ispunct(E) != 0;
 368       } else
 369         return false;
 370     }
 371     bool is_dropped(Char E) const
 372     {
 373       if (m_dropped_delims.length())
 374         return m_dropped_delims.find(E) != string_type::npos;
 375       else if (m_use_isspace) {
 376         return std::isspace(E) != 0;
 377       } else
 378         return false;
 379     }
 380   };
 381
 382   //===========================================================================
 383   // The following class is DEPRECATED, use class char_separator instead.
 384   //
 385   // The char_delimiters_separator class, which is a model of
 386   // TokenizerFunction.  char_delimiters_separator breaks a string
 387   // into tokens based on character delimiters. There are 2 types of
 388   // delimiters. returnable delimiters can be returned as
 389   // tokens. These are often punctuation. nonreturnable delimiters
 390   // cannot be returned as tokens. These are often whitespace
 391
 392   // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
 393 #if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
 394   template <class Char,
 395     class Traits = typename std::basic_string<Char>::traits_type >
 396 #else
 397   template <class Char,
 398     class Traits = std::basic_string<Char>::traits_type >
 399 #endif
 400   class char_delimiters_separator {
 401   private:
 402
 403     typedef std::basic_string<Char,Traits> string_type;
 404     string_type returnable_;
 405     string_type nonreturnable_;
 406     bool return_delims_;
 407     bool no_ispunct_;
 408     bool no_isspace_;
 409
 410     bool is_ret(Char E)const
 411     {
 412       if (returnable_.length())
 413         return  returnable_.find(E) != string_type::npos;
 414       else{
 415         if (no_ispunct_) {return false;}
 416         else{
 417           int r = std::ispunct(E);
 418           return r != 0;
 419         }
 420       }
 421     }
 422     bool is_nonret(Char E)const
 423     {
 424       if (nonreturnable_.length())
 425         return  nonreturnable_.find(E) != string_type::npos;
 426       else{
 427         if (no_isspace_) {return false;}
 428         else{
 429           int r = std::isspace(E);
 430           return r != 0;
 431         }
 432       }
 433     }
 434
 435   public:
 436     explicit char_delimiters_separator(bool return_delims = false,
 437                                        const Char* returnable = 0,
 438                                        const Char* nonreturnable = 0)
 439       : returnable_(returnable ? returnable : string_type().c_str()),
 440         nonreturnable_(nonreturnable ? nonreturnable:string_type().c_str()),
 441         return_delims_(return_delims), no_ispunct_(returnable!=0),
 442         no_isspace_(nonreturnable!=0) { }
 443
 444     void reset() { }
 445
 446   public:
 447
 448      template <typename InputIterator, typename Token>
 449      bool operator()(InputIterator& next, InputIterator end,Token& tok) {
 450      tok = Token();
 451
 452      // skip past all nonreturnable delims
 453      // skip past the returnable only if we are not returning delims
 454      for (;next!=end && ( is_nonret(*next) || (is_ret(*next)
 455        && !return_delims_ ) );++next) { }
 456
 457      if (next == end) {
 458        return false;
 459      }
 460
 461      // if we are to return delims and we are one a returnable one
 462      // move past it and stop
 463      if (is_ret(*next) && return_delims_) {
 464        tok+=*next;
 465        ++next;
 466      }
 467      else
 468        // append all the non delim characters
 469        for (;next!=end && !is_nonret(*next) && !is_ret(*next);++next)
 470          tok+=*next;
 471
 472
 473      return true;
 474    }
 475   };
 476
 477
 478 } //namespace boost
 479
 480
 481 #endif
 482
 483
 484
 485
 486