boost/boost/token_functions.hpp

   1 // Boost token_functions.hpp  ------------------------------------------------//
   2
   3 // Copyright John R. Bandela 2001.
   4
   5 // Permission to copy, use, modify, sell and distribute this software
   6 // is granted provided this copyright notice appears in all
   7 // copies. This software is provided "as is" without express or
   8 // implied warranty, and with no claim as to its suitability for any
   9 // purpose.
  10
  11 // See http://www.boost.org/libs/tokenizer for documentation.
  12
  13 // Revision History:
  14
  15 // 20 Feb 2002   John Maddock
  16 //      Removed using namespace std declarations and added
  17 //      workaround for BOOST_NO_STDC_NAMESPACE (the library
  18 //      can be safely mixed with regex).
  19 // 06 Feb 2002   Jeremy Siek
  20 //      Added char_separator.
  21 // 02 Feb 2002   Jeremy Siek
  22 //      Removed tabs and a little cleanup.
  23
  24
  25 #ifndef BOOST_TOKEN_FUNCTIONS_JRB051801_HPP_
  26 #define BOOST_TOKEN_FUNCTIONS_JRB051801_HPP_
  27
  28 #include <vector>
  29 #include <stdexcept>
  30 #include <cassert>
  31 #include <string>
  32 #include <cctype>
  33 #include <algorithm> // for find_if
  34
//
// the following must not be macros if we are to prefix them
// with std:: (they shouldn't be macros anyway...)
//
// If the included headers left ispunct / isspace defined as macros, drop
// the macro definitions so that the std::-qualified spellings used in this
// header are not rewritten by the preprocessor.
#ifdef ispunct
#  undef ispunct
#endif
#ifdef isspace
#  undef isspace
#endif
//
// fix namespace problems:
//
// When BOOST_NO_STDC_NAMESPACE is defined, pull the global-namespace
// ispunct / isspace into namespace std so that std::ispunct and
// std::isspace resolve.
#ifdef BOOST_NO_STDC_NAMESPACE
namespace std{
 using ::ispunct;
 using ::isspace;
}
#endif
  54
  55 namespace boost{
  56
  57   //===========================================================================
  58   // The escaped_list_separator class, which is a model of TokenizerFunction.
  59   // An escaped list is a super-set of what is commonly known as a comma
  60   // separated value (csv) list. It is separated into fields by a comma or
  61   // other character. If the delimiting character is inside quotes, then it is
  62   // counted as a regular character. To allow for embedded quotes in a field,
  63   // there can be escape sequences using the \ much like C.
  64   // The role of the comma, the quotation mark, and the escape
  65   // character (backslash \), can be assigned to other characters.
  66
  // Exception type for escaped-list parsing failures. It adds nothing to
  // std::runtime_error beyond a distinct type that callers can catch.
  struct escaped_list_error : public std::runtime_error
  {
    // what: human readable description, forwarded to std::runtime_error.
    escaped_list_error(const std::string& what)
      : std::runtime_error(what)
    {
    }
  };
  70
  71
  72 // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
  73 // MSVC does not like the following typename
  74 #if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
  75   template <class Char,
  76     class Traits = typename std::basic_string<Char>::traits_type >
  77 #else
  78   template <class Char,
  79     class Traits = std::basic_string<Char>::traits_type >
  80 #endif
  81   class escaped_list_separator {
  82
  83   private:
  84     typedef std::basic_string<Char,Traits> string_type;
  85     struct char_eq {
  86       Char e_;
  87       char_eq(Char e):e_(e) { }
  88       bool operator()(Char c) {
  89         return Traits::eq(e_,c);
  90       }
  91     };
  92     string_type  escape_;
  93     string_type  c_;
  94     string_type  quote_;
  95     bool last_;
  96
  97     bool is_escape(Char e) {
  98       char_eq f(e);
  99       return std::find_if(escape_.begin(),escape_.end(),f)!=escape_.end();
 100     }
 101     bool is_c(Char e) {
 102       char_eq f(e);
 103       return std::find_if(c_.begin(),c_.end(),f)!=c_.end();
 104     }
 105     bool is_quote(Char e) {
 106       char_eq f(e);
 107       return std::find_if(quote_.begin(),quote_.end(),f)!=quote_.end();
 108     }
 109     template <typename iterator, typename Token>
 110     void do_escape(iterator& next,iterator end,Token& tok) {
 111       if (++next == end)
 112         throw escaped_list_error(std::string("cannot end with escape"));
 113       if (Traits::eq(*next,'n')) {
 114         tok+='\n';
 115         return;
 116       }
 117       else if (is_quote(*next)) {
 118         tok+=*next;
 119         return;
 120       }
 121       else if (is_c(*next)) {
 122         tok+=*next;
 123         return;
 124       }
 125       else if (is_escape(*next)) {
 126         tok+=*next;
 127         return;
 128       }
 129       else
 130         throw escaped_list_error(std::string("unknown escape sequence"));
 131     }
 132
 133     public:
 134
 135     explicit escaped_list_separator(Char  e = '\\',
 136                                     Char c = ',',Char  q = '\"')
 137       : escape_(1,e), c_(1,c), quote_(1,q), last_(false) { }
 138
 139     escaped_list_separator(string_type e, string_type c, string_type q)
 140       : escape_(e), c_(c), quote_(q), last_(false) { }
 141
 142     void reset() {last_=false;}
 143
 144     template <typename InputIterator, typename Token>
 145     bool operator()(InputIterator& next,InputIterator end,Token& tok) {
 146       bool bInQuote = false;
 147       tok = Token();
 148
 149       if (next == end) {
 150         if (last_) {
 151           last_ = false;
 152           return true;
 153         }
 154         else
 155           return false;
 156       }
 157       last_ = false;
 158       for (;next != end;++next) {
 159         if (is_escape(*next)) {
 160           do_escape(next,end,tok);
 161         }
 162         else if (is_c(*next)) {
 163           if (!bInQuote) {
 164             // If we are not in quote, then we are done
 165             ++next;
 166             // The last character was a c, that means there is
 167             // 1 more blank field
 168             last_ = true;
 169             return true;
 170           }
 171           else tok+=*next;
 172         }
 173         else if (is_quote(*next)) {
 174           bInQuote=!bInQuote;
 175         }
 176         else {
 177           tok += *next;
 178         }
 179       }
 180       return true;
 181     }
 182   };
 183
 184
 185   //===========================================================================
 186   // The offset_separator class, which is a model of TokenizerFunction.
 187   // Offset breaks a string into tokens based on a range of offsets
 188
 189   class offset_separator {
 190   private:
 191
 192     std::vector<int> offsets_;
 193     unsigned int current_offset_;
 194     bool wrap_offsets_;
 195     bool return_partial_last_;
 196
 197   public:
 198     template <typename Iter>
 199     offset_separator(Iter begin, Iter end, bool wrap_offsets = true,
 200                      bool return_partial_last = true)
 201       : offsets_(begin,end), current_offset_(0),
 202         wrap_offsets_(wrap_offsets),
 203         return_partial_last_(return_partial_last) { }
 204
 205     offset_separator()
 206       : offsets_(1,1), current_offset_(),
 207         wrap_offsets_(true), return_partial_last_(true) { }
 208
 209     void reset() {
 210       current_offset_ = 0;
 211     }
 212
 213     template <typename InputIterator, typename Token>
 214     bool operator()(InputIterator& next, InputIterator end, Token& tok)
 215     {
 216       assert(!offsets_.empty());
 217
 218       tok = Token();
 219
 220       if (next == end)
 221         return false;
 222
 223       if (current_offset_ == offsets_.size())
 224         if (wrap_offsets_)
 225           current_offset_=0;
 226         else
 227           return false;
 228
 229       int c = offsets_[current_offset_];
 230       int i = 0;
 231       for (; i < c; ++i) {
 232         if (next == end)break;
 233         tok+=*next++;
 234       }
 235
 236       if (!return_partial_last_)
 237         if (i < (c-1) )
 238           return false;
 239
 240       ++current_offset_;
 241       return true;
 242     }
 243   };
 244
 245
 246   //===========================================================================
 247   // The char_separator class breaks a sequence of characters into
 248   // tokens based on the character delimiters (very much like bad old
 249   // strtok). A delimiter character can either be kept or dropped. A
 250   // kept delimiter shows up as an output token, whereas a dropped
 251   // delimiter does not.
 252
 253   // This class replaces the char_delimiters_separator class. The
 254   // constructor for the char_delimiters_separator class was too
 255   // confusing and needed to be deprecated. However, because of the
 256   // default arguments to the constructor, adding the new constructor
 257   // would cause ambiguity, so instead I deprecated the whole class.
 258   // The implementation of the class was also simplified considerably.
 259
 260   enum empty_token_policy { drop_empty_tokens, keep_empty_tokens };
 261
 262   // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
 263 #if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
 264   template <typename Char,
 265     typename Traits = typename std::basic_string<Char>::traits_type >
 266 #else
 267   template <typename Char,
 268     typename Traits = std::basic_string<Char>::traits_type >
 269 #endif
 270   class char_separator
 271   {
 272     typedef std::basic_string<Char,Traits> string_type;
 273   public:
 274     explicit
 275     char_separator(const Char* dropped_delims,
 276                    const Char* kept_delims = 0,
 277                    empty_token_policy empty_tokens = drop_empty_tokens)
 278       : m_dropped_delims(dropped_delims),
 279         m_use_ispunct(false),
 280         m_use_isspace(false),
 281         m_empty_tokens(empty_tokens),
 282         m_output_done(false)
 283     {
 284       // Borland workaround
 285       if (kept_delims)
 286         m_kept_delims = kept_delims;
 287     }
 288
 289                 // use ispunct() for kept delimiters and isspace for dropped.
 290     explicit
 291     char_separator()
 292       : m_use_ispunct(true),
 293         m_use_isspace(true),
 294         m_empty_tokens(drop_empty_tokens) { }
 295
 296     void reset() { }
 297
 298     template <typename InputIterator, typename Token>
 299     bool operator()(InputIterator& next, InputIterator end, Token& tok)
 300     {
 301       tok = Token();
 302
 303       // skip past all dropped_delims
 304       if (m_empty_tokens == drop_empty_tokens)
 305         for (; next != end  && is_dropped(*next); ++next)
 306           { }
 307
 308       if (m_empty_tokens == drop_empty_tokens) {
 309
 310         if (next == end)
 311           return false;
 312
 313         // if we are on a kept_delims move past it and stop
 314         if (is_kept(*next)) {
 315           tok += *next;
 316           ++next;
 317         } else
 318           // append all the non delim characters
 319           for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next)
 320             tok += *next;
 321       }
 322       else { // m_empty_tokens == keep_empty_tokens
 323
 324         // Handle empty token at the end
 325         if (next == end)
 326           if (m_output_done == false) {
 327             m_output_done = true;
 328             return true;
 329           } else
 330             return false;
 331
 332         if (is_kept(*next)) {
 333           if (m_output_done == false)
 334             m_output_done = true;
 335           else {
 336             tok += *next;
 337             ++next;
 338             m_output_done = false;
 339           }
 340         }
 341         else if (m_output_done == false && is_dropped(*next)) {
 342           m_output_done = true;
 343         }
 344         else {
 345           if (is_dropped(*next))
 346             ++next;
 347           for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next)
 348             tok += *next;
 349           m_output_done = true;
 350         }
 351       }
 352       return true;
 353     }
 354
 355   private:
 356     string_type m_kept_delims;
 357     string_type m_dropped_delims;
 358     bool m_use_ispunct;
 359     bool m_use_isspace;
 360     empty_token_policy m_empty_tokens;
 361     bool m_output_done;
 362
 363     bool is_kept(Char E) const
 364     {
 365       if (m_kept_delims.length())
 366         return m_kept_delims.find(E) != string_type::npos;
 367       else if (m_use_ispunct) {
 368         return std::ispunct(E) != 0;
 369       } else
 370         return false;
 371     }
 372     bool is_dropped(Char E) const
 373     {
 374       if (m_dropped_delims.length())
 375         return m_dropped_delims.find(E) != string_type::npos;
 376       else if (m_use_isspace) {
 377         return std::isspace(E) != 0;
 378       } else
 379         return false;
 380     }
 381   };
 382
 383   //===========================================================================
 384   // The following class is DEPRECATED, use class char_separator instead.
 385   //
 386   // The char_delimiters_separator class, which is a model of
 387   // TokenizerFunction.  char_delimiters_separator breaks a string
 388   // into tokens based on character delimiters. There are 2 types of
 389   // delimiters. Returnable delimiters can be returned as
 390   // tokens. These are often punctuation. Nonreturnable delimiters
 391   // cannot be returned as tokens. These are often whitespace.
 392
 393   // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
 394 #if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
 395   template <class Char,
 396     class Traits = typename std::basic_string<Char>::traits_type >
 397 #else
 398   template <class Char,
 399     class Traits = std::basic_string<Char>::traits_type >
 400 #endif
 401   class char_delimiters_separator {
 402   private:
 403
 404     typedef std::basic_string<Char,Traits> string_type;
 405     string_type returnable_;
 406     string_type nonreturnable_;
 407     bool return_delims_;
 408     bool no_ispunct_;
 409     bool no_isspace_;
 410
 411     bool is_ret(Char E)const
 412     {
 413       if (returnable_.length())
 414         return  returnable_.find(E) != string_type::npos;
 415       else{
 416         if (no_ispunct_) {return false;}
 417         else{
 418           int r = std::ispunct(E);
 419           return r != 0;
 420         }
 421       }
 422     }
 423     bool is_nonret(Char E)const
 424     {
 425       if (nonreturnable_.length())
 426         return  nonreturnable_.find(E) != string_type::npos;
 427       else{
 428         if (no_isspace_) {return false;}
 429         else{
 430           int r = std::isspace(E);
 431           return r != 0;
 432         }
 433       }
 434     }
 435
 436   public:
 437     explicit char_delimiters_separator(bool return_delims = false,
 438                                        const Char* returnable = 0,
 439                                        const Char* nonreturnable = 0)
 440       : returnable_(returnable ? returnable : string_type().c_str()),
 441         nonreturnable_(nonreturnable ? nonreturnable:string_type().c_str()),
 442         return_delims_(return_delims), no_ispunct_(returnable!=0),
 443         no_isspace_(nonreturnable!=0) { }
 444
 445     void reset() { }
 446
 447   public:
 448
 449      template <typename InputIterator, typename Token>
 450      bool operator()(InputIterator& next, InputIterator end,Token& tok) {
 451      tok = Token();
 452
 453      // skip past all nonreturnable delims
 454      // skip past the returnable only if we are not returning delims
 455      for (;next!=end && ( is_nonret(*next) || (is_ret(*next)
 456        && !return_delims_ ) );++next) { }
 457
 458      if (next == end) {
 459        return false;
 460      }
 461
 462      // if we are to return delims and we are one a returnable one
 463      // move past it and stop
 464      if (is_ret(*next) && return_delims_) {
 465        tok+=*next;
 466        ++next;
 467      }
 468      else
 469        // append all the non delim characters
 470        for (;next!=end && !is_nonret(*next) && !is_ret(*next);++next)
 471          tok+=*next;
 472
 473
 474      return true;
 475    }
 476   };
 477
 478
 479 } //namespace boost
 480
 481
 482 #endif
 483
 484
 485
 486
 487