Main Page   Class Hierarchy   Alphabetical List   Compound List   Examples  

tokenizer.h

00001 /***************************************************************************
00002     copyright            : (C) 2002-2005 by Stefano Barbato
00003     email                : [email protected]
00004 
00005     $Id: tokenizer_8h-source.html,v 1.4 2006-03-12 12:28:32 tat Exp $
00006  ***************************************************************************/
00007 
00008 /***************************************************************************
00009  *                                                                         *
00010  *   This program is free software; you can redistribute it and/or modify  *
00011  *   it under the terms of the GNU General Public License as published by  *
00012  *   the Free Software Foundation; either version 2 of the License, or     *
00013  *   (at your option) any later version.                                   *
00014  *                                                                         *
00015  ***************************************************************************/
00016 #ifndef _MIMETIC_TOKENIZER_H_
00017 #define _MIMETIC_TOKENIZER_H_
00018 #include <iterator>
00019 #include <algorithm>
00020 #include <set>
00021 #include <string>
00022 
namespace mimetic
{

/// Delimiter predicate.
///
/// Holds a set of delimiter values and, when called, reports whether the
/// given value is one of them.
///
/// Note: both setDelimList() overloads ADD to the current set; they do not
/// clear previously registered delimiters (unlike the \c char specialization
/// below, which replaces its table).
template<typename value_type>
struct IsDelim
{
    // These typedefs used to be inherited from std::unary_function, which was
    // deprecated in C++11 and removed in C++17; they are provided directly so
    // code relying on them keeps working on every standard.
    typedef value_type argument_type;
    typedef bool result_type;

    /// Returns true if \p val is a registered delimiter.
    bool operator()(const value_type& val) const
    {
        return m_delims.count(val) != 0;
    }
    /// Registers every element of \p cont as a delimiter.
    template<typename Container>
    void setDelimList(const Container& cont)
    {
        setDelimList(cont.begin(), cont.end());
    }
    /// Registers every element of the range [\p bit, \p eit) as a delimiter.
    template<typename Iterator>
    void setDelimList(Iterator bit, Iterator eit)
    {
        m_delims.insert(bit, eit);
    }
    /// Registers a single delimiter.
    void addDelim(const value_type& value)
    {
        m_delims.insert(value);
    }
    /// Unregisters a single delimiter (no-op if it was not registered).
    void removeDelim(const value_type& value)
    {
        m_delims.erase(value);
    }
private:
    std::set<value_type> m_delims;
};

/// Delimiter predicate specialized for \c char.
///
/// Uses a 256-entry lookup table indexed by the character's
/// <tt>unsigned char</tt> value instead of a std::set.
/// setDelimList() REPLACES the current delimiters.
template<>
struct IsDelim<char>
{
    typedef char argument_type;
    typedef bool result_type;

    /// Starts with an empty delimiter table so the predicate is usable even
    /// when no delimiter list is ever set.
    IsDelim()
    {
        reset();
    }
    /// Replaces the delimiters with the characters of \p delims.
    void setDelimList(const std::string& delims)
    {
        setDelimList(delims.begin(), delims.end());
    }
    /// Replaces the delimiters with the characters in [\p bit, \p eit).
    template<typename Iterator>
    void setDelimList(Iterator bit, Iterator eit)
    {
        reset();
        // index through unsigned char: a plain (possibly signed) char with
        // the high bit set would otherwise yield a negative index
        for(; bit != eit; ++bit)
            m_lookup[static_cast<unsigned char>(*bit)] = 1;
    }
    /// Registers a single delimiter.
    void addDelim(char value)
    {
        m_lookup[static_cast<unsigned char>(value)] = 1;
    }
    /// Unregisters a single delimiter.
    void removeDelim(char value)
    {
        m_lookup[static_cast<unsigned char>(value)] = 0;
    }
    /// Returns true if \p val is a registered delimiter.
    bool operator()(unsigned char val) const
    {
        return m_lookup[val] != 0;
    }
private:
    /// Marks every character as "not a delimiter".
    void reset()
    {
        std::fill(m_lookup, m_lookup + sizeof(m_lookup), static_cast<char>(0));
    }
    char m_lookup[256];
};


/// Iterator tokenizer template class
///
/// Splits the range [bit, eit) into tokens separated by single delimiter
/// values. Consecutive delimiters produce empty tokens; a delimiter that is
/// the very last element of the input does not produce a trailing empty
/// token.
template<class Iterator,typename value_type>
class ItTokenizer
{
public:
    /// Tokenizes the range [\p bit, \p eit); the range must stay valid for
    /// as long as next() is called.
    ItTokenizer(Iterator bit, Iterator eit)
    : m_bit(bit), m_eit(eit), m_tok_eit(bit), m_matched()
    {
    }
    /// Restarts tokenization on a new range, keeping the delimiter set.
    void setSource(Iterator bit, Iterator eit)
    {
        m_bit = bit;
        m_eit = eit;
        m_tok_eit = bit;
    }
    /// Forwards the elements of \p cont to the delimiter predicate.
    template<typename DelimCont>
    void setDelimList(const DelimCont& cont)
    {
        m_delimPred.setDelimList(cont);
    }
    /// Forwards the range [\p bit, \p eit) to the delimiter predicate.
    template<typename It>
    void setDelimList(It bit, It eit)
    {
        m_delimPred.setDelimList(bit, eit);
    }
    /// Extracts the next token into \p dst (which is emptied first).
    ///
    /// \return false when the input is exhausted (\p dst is left empty),
    ///         true when a token, possibly empty, was stored in \p dst.
    /// After a successful call matched() returns the delimiter that ended
    /// the token, or 0 if the token was ended by the end of the input.
    template<typename DestCont>
    bool next(DestCont& dst)
    {
        dst.erase(dst.begin(), dst.end());
        if(m_tok_eit == m_eit)
            return false;
        m_tok_eit = std::find_if(m_bit, m_eit, m_delimPred);
        std::copy(m_bit, m_tok_eit, std::back_inserter(dst));
        if(m_tok_eit != m_eit)
        {
            // only dereference when a delimiter was really found
            m_matched = *m_tok_eit;
            ++m_tok_eit; // the next token starts right after the delimiter
        }
        else
            m_matched = 0; // end of input
        m_bit = m_tok_eit;
        return true;
    }
    /// Delimiter that terminated the last token returned by next(),
    /// or 0 if that token was terminated by the end of the input.
    const value_type& matched() const
    {
        return m_matched;
    }
    /// Registers a single delimiter.
    void addDelim(const value_type& value)
    {
        m_delimPred.addDelim(value);
    }
    /// Unregisters a single delimiter.
    void removeDelim(const value_type& value)
    {
        m_delimPred.removeDelim(value);
    }
private:
    Iterator m_bit, m_eit, m_tok_eit;
    IsDelim<value_type> m_delimPred;
    value_type m_matched;
};


/// char container tokenizer template class
///
/// The container is NOT copied (only its iterators are stored), so the
/// object pointed to by \c cont must stay alive and unmodified for all
/// following calls to next().
template<typename Container>
struct ContTokenizer: public ItTokenizer<typename Container::const_iterator,typename Container::value_type>
{
    typedef typename Container::value_type value_type;
    typedef typename Container::iterator iterator;
    typedef typename Container::const_iterator const_iterator;

    /// Tokenizes \p *cont with an initially empty delimiter set.
    ContTokenizer(const Container* cont)
    : ItTokenizer<const_iterator, value_type>(cont->begin(), cont->end())
    {
    }
    /// Tokenizes \p *cont using the elements of \p delims as delimiters.
    template<typename DelimCont>
    ContTokenizer(const Container* cont, const DelimCont& delims)
    : ItTokenizer<const_iterator,value_type>(cont->begin(), cont->end())
    {
        // this-> is required: the base class is a dependent type, so an
        // unqualified name would not be looked up in it
        this->setDelimList(delims);
    }
    /// Restarts tokenization on \p *cont, keeping the delimiter set.
    void setSource(const Container* cont)
    {
        ItTokenizer<const_iterator,value_type>::setSource(cont->begin(), cont->end());
    }
private:
    // non-copyable: declared but intentionally left undefined
    ContTokenizer(const ContTokenizer&);
    ContTokenizer& operator=(const ContTokenizer&);
};

/// std::string tokenizer
typedef ContTokenizer<std::string> StringTokenizer;

}
00173 
00174 #endif
00175