diff options
Diffstat (limited to 'ext/boost/token_functions.hpp')
-rw-r--r-- | ext/boost/token_functions.hpp | 621 |
1 files changed, 621 insertions, 0 deletions
diff --git a/ext/boost/token_functions.hpp b/ext/boost/token_functions.hpp new file mode 100644 index 0000000000..ef7ba73066 --- /dev/null +++ b/ext/boost/token_functions.hpp @@ -0,0 +1,621 @@ +// Boost token_functions.hpp ------------------------------------------------// + +// Copyright John R. Bandela 2001. + +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +// See http://www.boost.org/libs/tokenizer/ for documentation. + +// Revision History: +// 01 Oct 2004 Joaquin M Lopez Munoz +// Workaround for a problem with string::assign in msvc-stlport +// 06 Apr 2004 John Bandela +// Fixed a bug involving using char_delimiter with a true input iterator +// 28 Nov 2003 Robert Zeh and John Bandela +// Converted into "fast" functions that avoid using += when +// the supplied iterator isn't an input_iterator; based on +// some work done at Archelon and a version that was checked into +// the boost CVS for a short period of time. +// 20 Feb 2002 John Maddock +// Removed using namespace std declarations and added +// workaround for BOOST_NO_STDC_NAMESPACE (the library +// can be safely mixed with regex). +// 06 Feb 2002 Jeremy Siek +// Added char_separator. +// 02 Feb 2002 Jeremy Siek +// Removed tabs and a little cleanup. + + +#ifndef BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_ +#define BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_ + +#include <vector> +#include <stdexcept> +#include <string> +#include <cctype> +#include <algorithm> // for find_if +#include <boost/config.hpp> +#include <boost/assert.hpp> +#include <boost/detail/workaround.hpp> +#include <boost/mpl/if.hpp> + +// +// the following must not be macros if we are to prefix them +// with std:: (they shouldn't be macros anyway...) +// +#ifdef ispunct +# undef ispunct +#endif +#ifdef isspace +# undef isspace +#endif +// +// fix namespace problems: +// +#ifdef BOOST_NO_STDC_NAMESPACE +namespace std{ + using ::ispunct; + using ::isspace; +} +#endif + +namespace boost{ + + //=========================================================================== + // The escaped_list_separator class. Which is a model of TokenizerFunction + // An escaped list is a super-set of what is commonly known as a comma + // separated value (csv) list.It is separated into fields by a comma or + // other character. If the delimiting character is inside quotes, then it is + // counted as a regular character.To allow for embedded quotes in a field, + // there can be escape sequences using the \ much like C. + // The role of the comma, the quotation mark, and the escape + // character (backslash \), can be assigned to other characters. + + struct escaped_list_error : public std::runtime_error{ + escaped_list_error(const std::string& what_arg):std::runtime_error(what_arg) { } + }; + + +// The out of the box GCC 2.95 on cygwin does not have a char_traits class. +// MSVC does not like the following typename +#if !defined(BOOST_MSVC) || BOOST_MSVC > 1300 + template <class Char, + class Traits = typename std::basic_string<Char>::traits_type > +#else + template <class Char, + class Traits = std::basic_string<Char>::traits_type > +#endif + class escaped_list_separator { + + private: + typedef std::basic_string<Char,Traits> string_type; + struct char_eq { + Char e_; + char_eq(Char e):e_(e) { } + bool operator()(Char c) { + return Traits::eq(e_,c); + } + }; + string_type escape_; + string_type c_; + string_type quote_; + bool last_; + + bool is_escape(Char e) { + char_eq f(e); + return std::find_if(escape_.begin(),escape_.end(),f)!=escape_.end(); + } + bool is_c(Char e) { + char_eq f(e); + return std::find_if(c_.begin(),c_.end(),f)!=c_.end(); + } + bool is_quote(Char e) { + char_eq f(e); + return std::find_if(quote_.begin(),quote_.end(),f)!=quote_.end(); + } + template <typename iterator, typename Token> + void do_escape(iterator& next,iterator end,Token& tok) { + if (++next == end) + throw escaped_list_error(std::string("cannot end with escape")); + if (Traits::eq(*next,'n')) { + tok+='\n'; + return; + } + else if (is_quote(*next)) { + tok+=*next; + return; + } + else if (is_c(*next)) { + tok+=*next; + return; + } + else if (is_escape(*next)) { + tok+=*next; + return; + } + else + throw escaped_list_error(std::string("unknown escape sequence")); + } + + public: + + explicit escaped_list_separator(Char e = '\\', + Char c = ',',Char q = '\"') + : escape_(1,e), c_(1,c), quote_(1,q), last_(false) { } + + escaped_list_separator(string_type e, string_type c, string_type q) + : escape_(e), c_(c), quote_(q), last_(false) { } + + void reset() {last_=false;} + + template <typename InputIterator, typename Token> + bool operator()(InputIterator& next,InputIterator end,Token& tok) { + bool bInQuote = false; + tok = Token(); + + if (next == end) { + if (last_) { + last_ = false; + return true; + } + else + return false; + } + last_ = false; + for (;next != end;++next) { + if (is_escape(*next)) { + do_escape(next,end,tok); + } + else if (is_c(*next)) { + if (!bInQuote) { + // If we are not in quote, then we are done + ++next; + // The last character was a c, that means there is + // 1 more blank field + last_ = true; + return true; + } + else tok+=*next; + } + else if (is_quote(*next)) { + bInQuote=!bInQuote; + } + else { + tok += *next; + } + } + return true; + } + }; + + //=========================================================================== + // The classes here are used by offset_separator and char_separator to implement + // faster assigning of tokens using assign instead of += + + namespace tokenizer_detail { + + // The assign_or_plus_equal struct contains functions that implement + // assign, +=, and clearing based on the iterator type. The + // generic case does nothing for plus_equal and clearing, while + // passing through the call for assign. + // + // When an input iterator is being used, the situation is reversed. + // The assign method does nothing, plus_equal invokes operator +=, + // and the clearing method sets the supplied token to the default + // token constructor's result. + // + + template<class IteratorTag> + struct assign_or_plus_equal { + template<class Iterator, class Token> + static void assign(Iterator b, Iterator e, Token &t) { + +#if BOOST_WORKAROUND(BOOST_MSVC, < 1300) &&\ + BOOST_WORKAROUND(__SGI_STL_PORT, < 0x500) &&\ + defined(_STLP_DEBUG) &&\ + (defined(_STLP_USE_DYNAMIC_LIB) || defined(_DLL)) + // Problem with string::assign for msvc-stlport in debug mode: the + // linker tries to import the templatized version of this memfun, + // which is obviously not exported. + // See http://www.stlport.com/dcforum/DCForumID6/1763.html for details. + + t = Token(); + while(b != e) t += *b++; +#else + t.assign(b, e); +#endif + + } + + template<class Token, class Value> + static void plus_equal(Token &, const Value &) { + + } + + // If we are doing an assign, there is no need for the + // the clear. + // + template<class Token> + static void clear(Token &) { + + } + }; + + template <> + struct assign_or_plus_equal<std::input_iterator_tag> { + template<class Iterator, class Token> + static void assign(Iterator b, Iterator e, Token &t) { + + } + template<class Token, class Value> + static void plus_equal(Token &t, const Value &v) { + t += v; + } + template<class Token> + static void clear(Token &t) { + t = Token(); + } + }; + + + template<class Iterator> + struct pointer_iterator_category{ + typedef std::random_access_iterator_tag type; + }; + + + template<class Iterator> + struct class_iterator_category{ + typedef typename Iterator::iterator_category type; + }; + + + + // This portably gets the iterator_tag without partial template specialization + template<class Iterator> + struct get_iterator_category{ + typedef typename mpl::if_<is_pointer<Iterator>, + pointer_iterator_category<Iterator>, + class_iterator_category<Iterator> + >::type cat; + + typedef typename cat::type iterator_category; + }; + + +} + + + //=========================================================================== + // The offset_separator class, which is a model of TokenizerFunction. + // Offset breaks a string into tokens based on a range of offsets + + class offset_separator { + private: + + std::vector<int> offsets_; + unsigned int current_offset_; + bool wrap_offsets_; + bool return_partial_last_; + + public: + template <typename Iter> + offset_separator(Iter begin, Iter end, bool wrap_offsets = true, + bool return_partial_last = true) + : offsets_(begin,end), current_offset_(0), + wrap_offsets_(wrap_offsets), + return_partial_last_(return_partial_last) { } + + offset_separator() + : offsets_(1,1), current_offset_(), + wrap_offsets_(true), return_partial_last_(true) { } + + void reset() { + current_offset_ = 0; + } + + template <typename InputIterator, typename Token> + bool operator()(InputIterator& next, InputIterator end, Token& tok) + { + typedef tokenizer_detail::assign_or_plus_equal< +#if !defined(BOOST_MSVC) || BOOST_MSVC > 1300 + typename +#endif + tokenizer_detail::get_iterator_category< + InputIterator>::iterator_category> assigner; + + + BOOST_ASSERT(!offsets_.empty()); + + assigner::clear(tok); + InputIterator start(next); + + if (next == end) + return false; + + if (current_offset_ == offsets_.size()) + { + if (wrap_offsets_) + current_offset_=0; + else + return false; + } + + int c = offsets_[current_offset_]; + int i = 0; + for (; i < c; ++i) { + if (next == end)break; + assigner::plus_equal(tok,*next++); + } + assigner::assign(start,next,tok); + + if (!return_partial_last_) + if (i < (c-1) ) + return false; + + ++current_offset_; + return true; + } + }; + + + //=========================================================================== + // The char_separator class breaks a sequence of characters into + // tokens based on the character delimiters (very much like bad old + // strtok). A delimiter character can either be kept or dropped. A + // kept delimiter shows up as an output token, whereas a dropped + // delimiter does not. + + // This class replaces the char_delimiters_separator class. The + // constructor for the char_delimiters_separator class was too + // confusing and needed to be deprecated. However, because of the + // default arguments to the constructor, adding the new constructor + // would cause ambiguity, so instead I deprecated the whole class. + // The implementation of the class was also simplified considerably. + + enum empty_token_policy { drop_empty_tokens, keep_empty_tokens }; + + // The out of the box GCC 2.95 on cygwin does not have a char_traits class. +#if !defined(BOOST_MSVC) || BOOST_MSVC > 1300 + template <typename Char, + typename Traits = typename std::basic_string<Char>::traits_type > +#else + template <typename Char, + typename Traits = std::basic_string<Char>::traits_type > +#endif + class char_separator + { + typedef std::basic_string<Char,Traits> string_type; + public: + explicit + char_separator(const Char* dropped_delims, + const Char* kept_delims = 0, + empty_token_policy empty_tokens = drop_empty_tokens) + : m_dropped_delims(dropped_delims), + m_use_ispunct(false), + m_use_isspace(false), + m_empty_tokens(empty_tokens), + m_output_done(false) + { + // Borland workaround + if (kept_delims) + m_kept_delims = kept_delims; + } + + // use ispunct() for kept delimiters and isspace for dropped. + explicit + char_separator() + : m_use_ispunct(true), + m_use_isspace(true), + m_empty_tokens(drop_empty_tokens) { } + + void reset() { } + + template <typename InputIterator, typename Token> + bool operator()(InputIterator& next, InputIterator end, Token& tok) + { + typedef tokenizer_detail::assign_or_plus_equal< +#if !defined(BOOST_MSVC) || BOOST_MSVC > 1300 + typename +#endif + tokenizer_detail::get_iterator_category< + InputIterator>::iterator_category> assigner; + + assigner::clear(tok); + + // skip past all dropped_delims + if (m_empty_tokens == drop_empty_tokens) + for (; next != end && is_dropped(*next); ++next) + { } + + InputIterator start(next); + + if (m_empty_tokens == drop_empty_tokens) { + + if (next == end) + return false; + + + // if we are on a kept_delims move past it and stop + if (is_kept(*next)) { + assigner::plus_equal(tok,*next); + ++next; + } else + // append all the non delim characters + for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next) + assigner::plus_equal(tok,*next); + } + else { // m_empty_tokens == keep_empty_tokens + + // Handle empty token at the end + if (next == end) + { + if (m_output_done == false) + { + m_output_done = true; + assigner::assign(start,next,tok); + return true; + } + else + return false; + } + + if (is_kept(*next)) { + if (m_output_done == false) + m_output_done = true; + else { + assigner::plus_equal(tok,*next); + ++next; + m_output_done = false; + } + } + else if (m_output_done == false && is_dropped(*next)) { + m_output_done = true; + } + else { + if (is_dropped(*next)) + start=++next; + for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next) + assigner::plus_equal(tok,*next); + m_output_done = true; + } + } + assigner::assign(start,next,tok); + return true; + } + + private: + string_type m_kept_delims; + string_type m_dropped_delims; + bool m_use_ispunct; + bool m_use_isspace; + empty_token_policy m_empty_tokens; + bool m_output_done; + + bool is_kept(Char E) const + { + if (m_kept_delims.length()) + return m_kept_delims.find(E) != string_type::npos; + else if (m_use_ispunct) { + return std::ispunct(E) != 0; + } else + return false; + } + bool is_dropped(Char E) const + { + if (m_dropped_delims.length()) + return m_dropped_delims.find(E) != string_type::npos; + else if (m_use_isspace) { + return std::isspace(E) != 0; + } else + return false; + } + }; + + //=========================================================================== + // The following class is DEPRECATED, use class char_separators instead. + // + // The char_delimiters_separator class, which is a model of + // TokenizerFunction. char_delimiters_separator breaks a string + // into tokens based on character delimiters. There are 2 types of + // delimiters. returnable delimiters can be returned as + // tokens. These are often punctuation. nonreturnable delimiters + // cannot be returned as tokens. These are often whitespace + + // The out of the box GCC 2.95 on cygwin does not have a char_traits class. +#if !defined(BOOST_MSVC) || BOOST_MSVC > 1300 + template <class Char, + class Traits = typename std::basic_string<Char>::traits_type > +#else + template <class Char, + class Traits = std::basic_string<Char>::traits_type > +#endif + class char_delimiters_separator { + private: + + typedef std::basic_string<Char,Traits> string_type; + string_type returnable_; + string_type nonreturnable_; + bool return_delims_; + bool no_ispunct_; + bool no_isspace_; + + bool is_ret(Char E)const + { + if (returnable_.length()) + return returnable_.find(E) != string_type::npos; + else{ + if (no_ispunct_) {return false;} + else{ + int r = std::ispunct(E); + return r != 0; + } + } + } + bool is_nonret(Char E)const + { + if (nonreturnable_.length()) + return nonreturnable_.find(E) != string_type::npos; + else{ + if (no_isspace_) {return false;} + else{ + int r = std::isspace(E); + return r != 0; + } + } + } + + public: + explicit char_delimiters_separator(bool return_delims = false, + const Char* returnable = 0, + const Char* nonreturnable = 0) + : returnable_(returnable ? returnable : string_type().c_str()), + nonreturnable_(nonreturnable ? nonreturnable:string_type().c_str()), + return_delims_(return_delims), no_ispunct_(returnable!=0), + no_isspace_(nonreturnable!=0) { } + + void reset() { } + + public: + + template <typename InputIterator, typename Token> + bool operator()(InputIterator& next, InputIterator end,Token& tok) { + tok = Token(); + + // skip past all nonreturnable delims + // skip past the returnable only if we are not returning delims + for (;next!=end && ( is_nonret(*next) || (is_ret(*next) + && !return_delims_ ) );++next) { } + + if (next == end) { + return false; + } + + // if we are to return delims and we are one a returnable one + // move past it and stop + if (is_ret(*next) && return_delims_) { + tok+=*next; + ++next; + } + else + // append all the non delim characters + for (;next!=end && !is_nonret(*next) && !is_ret(*next);++next) + tok+=*next; + + + return true; + } + }; + + +} //namespace boost + + +#endif + + + + + |