Main Page   Class Hierarchy   Alphabetical List   Compound List   Examples  

itparser.h

00001 #ifndef _MIMETIC_PARSER_ITPARSER_H_
00002 #define _MIMETIC_PARSER_ITPARSER_H_
00003 #include <iterator>
00004 #include <algorithm>
00005 #include <stack>
00006 #include <iostream>
00007 #include <mimetic/tree.h>
00008 #include <mimetic/utils.h>
00009 #include <mimetic/mimeentity.h>
00010 
00011 
00012 // FIXME: handle HigherLevelClosingBoundary
00013 
00014 namespace mimetic
00015 {
00016 
/**
 * Parse the input reading from an iterator.
 *
 * Primary template: intentionally empty. The iterator category (taken by
 * default from std::iterator_traits<Iterator>) selects one of the partial
 * specializations of this template, which contain the actual parser.
 */
template<
    typename Iterator,
    typename ItCategory =
        typename std::iterator_traits<Iterator>::iterator_category
>
struct IteratorParser
{
};
00023 
00024 /*
00025  * Input Iterator
00026  */
00027 template<typename Iterator>
00028 struct IteratorParser<Iterator, std::input_iterator_tag>
00029 {
00030 
00031     IteratorParser(MimeEntity& me)
00032     : m_me(me), m_iMask(imNone), m_lastBoundary(NoBoundary)
00033     {
00034         m_entityStack.push(&m_me);
00035     }
00036     virtual ~IteratorParser()
00037     {
00038     }
00039     /**
00040      * set the Ignore Mask to \p mask
00041      */
00042     void iMask(size_t mask)    {    m_iMask = mask;        }
00043     /**
00044      * get the Ignore Mask 
00045      */
00046     size_t iMask() const    {    return m_iMask;        }
00047     /**
00048      * start parsing
00049      */
00050     void run(Iterator bit, Iterator eit)
00051     {
00052         m_bit = bit;
00053         m_eit = eit;
00054         doLoad();
00055     }
00056 protected:
00057     typedef std::list<std::string> BoundaryList;
00058     enum { 
00059         CR = 0xD, 
00060         LF = 0xA, 
00061         NL = '\n' 
00062     };
00063     enum /* ParsingElem */ { 
00064         peIgnore, 
00065         pePreamble, 
00066         peBody, 
00067         peEpilogue 
00068     };
00069     enum BoundaryType {
00070         NoBoundary = 0,
00071         Boundary,
00072         ClosingBoundary,
00073         HigherLevelBoundary
00074         //, HigherLevelClosingBoundary
00075     };
00076     enum EntityType { 
00077         etRfc822, 
00078         etMsgRfc822, 
00079         etMultipart 
00080     };
00081     // vars
00082     MimeEntity& m_me;
00083     Iterator m_bit, m_eit;
00084     size_t m_iMask; // ignore mask
00085     BoundaryList m_boundaryList;
00086     BoundaryType m_lastBoundary;
00087     std::stack<MimeEntity*> m_entityStack;
00088 
00089 protected:
00090     void appendPreambleBlock(const char* buf, int sz)
00091     {
00092         MimeEntity* pMe = m_entityStack.top();
00093         pMe->body().preamble().append(buf,sz);
00094     }
00095     
00096     void appendEpilogueBlock(const char* buf, int sz)
00097     {
00098         MimeEntity* pMe = m_entityStack.top();
00099         pMe->body().epilogue().append(buf,sz);
00100     }
00101     
00102     void appendBodyBlock(const char* buf, int sz)
00103     {
00104         MimeEntity* pMe = m_entityStack.top();
00105         pMe->body().append(buf, sz);
00106     }
00107     
00108     std::string getBoundary()
00109     {
00110         const MimeEntity* pMe = m_entityStack.top();
00111         const ContentType& ct = pMe->header().contentType();
00112         return std::string("--") + ct.param("boundary");
00113     }
00114     
00115     void popChild()
00116     {
00117         m_entityStack.pop();
00118     }
00119     
00120     void pushNewChild()
00121     {
00122         MimeEntity* pMe = m_entityStack.top();
00123         MimeEntity* pChild = new MimeEntity;
00124         pMe->body().parts().push_back(pChild);
00125         m_entityStack.push(pChild);
00126     }
00127     
00128     EntityType getType()
00129     {
00130         MimeEntity* pMe = m_entityStack.top();
00131         const Header& h = pMe->header();
00132         // will NOT be automatically created if it doesn't exists;
00133         // null ContentType will be returned
00134         const ContentType& ct = h.contentType();
00135         if(ct.isMultipart())
00136             return etMultipart;
00137         else if    (ct.type() == "message" && ct.subtype() == "rfc822") 
00138             return etMsgRfc822;
00139         else
00140             return etRfc822;
00141     }
00142     
00143     void addField(const std::string& name, const std::string& value)
00144     {
00145         MimeEntity* pMe = m_entityStack.top();
00146         Header& h = pMe->header();
00147         Header::iterator it = h.insert(h.end(), Field());
00148         it->name(name);
00149         it->value(value);
00150     }
00151 
00152     BoundaryType isBoundary(const std::string& line) 
00153     {
00154         if(line.length() == 0 || line[0] != '-')
00155             return m_lastBoundary = NoBoundary;
00156 
00157         int level = 0; // multipart nesting level
00158         int lineLen = line.length();
00159         BoundaryList::const_iterator bit,eit;
00160         bit = m_boundaryList.begin(), eit = m_boundaryList.end();
00161         for(;bit != eit; ++bit, ++level)
00162         {
00163             const std::string& b = *bit;
00164             int bLen = b.length();
00165             if(line.compare(0, bLen, b) == 0)
00166             { 
00167                 // not the expected boundary, malformed msg
00168                 if(level > 0)
00169                     return m_lastBoundary=HigherLevelBoundary;
00170                 // plain boundary or closing boundary?
00171                 if(lineLen > bLen && line.compare(bLen,2,"--") == 0)
00172                     return m_lastBoundary = ClosingBoundary;
00173                 else
00174                     return m_lastBoundary = Boundary;
00175             }
00176         }
00177         return m_lastBoundary = NoBoundary;
00178     }
00179     // is new line
00180     inline bool isnl(char c) const
00181     {
00182         return (c == CR || c == LF);
00183     }
00184     // is a two char newline
00185     inline bool isnl(char a, char b) const
00186     {
00187         if(a == CR || a == LF)
00188             if(b == (a == CR ? LF : CR))
00189                 return true;
00190         return false;
00191     }
00192     void doLoad()
00193     {
00194         loadHeader();
00195         loadBody();
00196     }
00197     bool valid() const
00198     {
00199         return m_bit != m_eit;
00200     }
00201     void append(char*& buf, size_t& bufsz, char c, size_t& pos)
00202     {
00203         enum { alloc_block = 128};
00204         if(pos == bufsz) 
00205         {
00206             // allocate and init buffer
00207             char* tmp = buf;
00208             int oldBufsz = bufsz;
00209             while(pos >= bufsz)
00210                 bufsz = bufsz + alloc_block;
00211             buf = new char[bufsz+1];    
00212             if(tmp != 0)
00213             {
00214                 assert(oldBufsz > 0);
00215                 memset(buf, 0, bufsz);
00216                 memcpy(buf, tmp, oldBufsz);
00217                 delete[] tmp;
00218             }
00219         }
00220         buf[pos++] = c;
00221     }
00222     // parses the header and calls addField and pushChild
00223     // to add fields and nested entities
00224     void loadHeader()
00225     {
00226         enum { 
00227             sInit,
00228             sIgnoreLine,
00229             sNewline,
00230             sWaitingName, 
00231             sWaitingValue, 
00232             sWaitingFoldedValue,
00233             sName, 
00234             sValue,
00235             sIgnoreHeader
00236         };
00237         register int status;
00238         int pos;
00239         char *name, *value;
00240         size_t nBufSz, vBufSz, nPos, vPos;
00241         char prev, c = 0;
00242 
00243         name = value = 0;
00244         pos = nBufSz = vBufSz = nPos = vPos = 0;
00245         status = (m_iMask & imHeader ? sIgnoreHeader : sInit);
00246         //status = sInit;
00247         while(m_bit != m_eit)
00248         {
00249             c = *m_bit;
00250             switch(status)
00251             {
00252             case sInit:
00253                 if(isnl(c))
00254                     status = sNewline;
00255                 else
00256                     status = sName;
00257                 continue;
00258             case sIgnoreLine:
00259                 if(!isnl(c))
00260                     break;
00261                 status = sNewline;
00262                 continue;
00263             case sNewline:
00264                 status = sWaitingName;
00265                 if(pos > 0)
00266                 {
00267                     pos = 0;
00268                     prev = c;
00269                     if(++m_bit == m_eit) goto out; //eof
00270                     c = *m_bit;
00271                     if(c == (prev == CR ? LF : CR))
00272                     {
00273                         --pos;
00274                         break;
00275                     } else 
00276                         continue;
00277                 } else {
00278                     // empty line, end of header
00279                     prev = c;
00280                     if(++m_bit == m_eit) goto out; //eof
00281                     c = *m_bit;
00282                     if(c == (prev == CR ? LF : CR))
00283                         ++m_bit;    
00284                     goto out;
00285                 }
00286             case sWaitingName:
00287                 if(isblank(c))
00288                 {
00289                     // folded value
00290                     status = sWaitingFoldedValue;
00291                     continue;
00292                 } 
00293                 // not blank, new field or empty line 
00294                 if(nPos)
00295                 {
00296                     name[nPos] = 0;
00297                     // is not an empty field (name: \n)
00298                     if(vPos) 
00299                     {
00300                         value[vPos] = 0;
00301                         addField(name,value);
00302                     } else
00303                         addField(name,"");
00304                     nPos = vPos = 0;
00305                 }
00306                 status = (isnl(c) ? sNewline : sName);
00307                 continue;
00308             case sWaitingValue:
00309                 if(isblank(c))
00310                     break; // eat leading blanks
00311                 status = sValue;
00312                 continue;
00313             case sWaitingFoldedValue:
00314                 if(isblank(c))
00315                     break; // eat leading blanks
00316                 append(value, vBufSz, ' ', vPos);
00317                 status = sValue;
00318                 continue;
00319             case sName:
00320                 if(c > 32 && c < 127 && c != ':') {
00321                     append(name, nBufSz, c, nPos);
00322                 } else if(c == ':') {
00323                     status = sWaitingValue;
00324                 } else {
00325                     nPos = 0;
00326                     status = sIgnoreLine;
00327                     continue;
00328                 }
00329                 break;
00330             case sValue:
00331                 if(isnl(c))
00332                 {
00333                     status = sNewline;
00334                     continue;
00335                 }
00336                 append(value, vBufSz, c, vPos);
00337                 break;
00338             case sIgnoreHeader:
00339                 if(isnl(c))
00340                 {
00341                     prev = c;
00342                     if(++m_bit == m_eit) goto out; //eof
00343                     c = *m_bit;
00344                     if(c == (prev == CR ? LF : CR))
00345                         ++m_bit;    
00346                     if(pos == 0)    
00347                         goto out; //empty line, eoh
00348                     pos = 0;
00349                     continue;
00350                 } 
00351                 break;
00352             }
00353             ++m_bit; ++pos;
00354         }
00355     out:
00356         if(name)
00357             delete[] name;
00358         if(value)
00359             delete[] value;
00360         return;
00361     }
00362     void loadBody()
00363     {
00364         switch(getType())
00365         {
00366         case etRfc822:
00367             if(m_iMask & imBody)
00368                 jump_to_next_boundary();
00369             else
00370                 copy_until_boundary(peBody);
00371             break;
00372         case etMultipart:
00373             loadMultipart();
00374             break;
00375         case etMsgRfc822:
00376             if(m_iMask & imChildParts)
00377                 jump_to_next_boundary();
00378             else {
00379                 pushNewChild();
00380                 doLoad(); // load child entities
00381                 popChild();
00382             }
00383             break;
00384         }
00385     }
00386     void loadMultipart()
00387     {
00388         std::string boundary = getBoundary();
00389         m_boundaryList.push_front(boundary);
00390         ParsingElem pe;
00391         // preamble
00392         pe = (m_iMask & imPreamble ? peIgnore : pePreamble );
00393         copy_until_boundary(pe);
00394         while(m_bit != m_eit)
00395         {
00396             switch(m_lastBoundary)
00397             {
00398             case NoBoundary:
00399                 return; // eof
00400             case Boundary:
00401                 if(m_iMask & imChildParts)
00402                     jump_to_next_boundary();
00403                 else {
00404                     pushNewChild();
00405                     doLoad();
00406                     popChild();
00407                 }
00408                 break;
00409             case ClosingBoundary:
00410                 m_boundaryList.erase(m_boundaryList.begin());
00411                 // epilogue
00412                 pe=(m_iMask & imEpilogue? peIgnore: peEpilogue);
00413                 copy_until_boundary(pe);
00414                 return;
00415             case HigherLevelBoundary:
00416                 m_boundaryList.erase(m_boundaryList.begin());
00417                 return;
00418             }
00419         }
00420     }
00421     inline void onBlock(const char* block, int sz, ParsingElem pe)
00422     {
00423         switch(pe)
00424         {
00425         case peIgnore:
00426             return;
00427         case pePreamble:
00428             appendPreambleBlock(block, sz);
00429             break;
00430         case peEpilogue:
00431             appendEpilogueBlock(block, sz);
00432             break;
00433         case peBody:
00434             appendBodyBlock(block, sz);
00435             break;
00436         }
00437     }
00438     void jump_to_next_boundary()
00439     {
00440         copy_until_boundary(peIgnore);
00441     }
00442     // this is where most of execution time is spent when parsing
00443     // large messages; I'm using a plain char[] buffer instead of
00444     // std::string because I want to be as fast as possible here
00445     virtual void copy_until_boundary(ParsingElem pe)
00446     {
00447         size_t pos, lines, eomsz;
00448         register char c;
00449         enum { nlsz = 1 };
00450         char nl[2] = { NL, 0 };
00451         const char *eom = 0;
00452 
00453         enum { blksz = 4096 };
00454         char block[blksz];
00455         size_t blkpos = 0;
00456         size_t sl_off = 0; // start of line offset into *block
00457 
00458         pos = lines = 0;
00459         while(m_bit != m_eit)
00460         {
00461             // if buffer is full
00462             if(blkpos >= blksz - 2 - nlsz)
00463             {
00464                 if(sl_off == 0)
00465                 { 
00466                     // very long line found, assume it 
00467                     // can't be a boundary and flush the buf
00468                     // with the partial line
00469                     block[blkpos] = 0;
00470                     onBlock(block, blkpos, pe);
00471                     blkpos = sl_off = 0;
00472                 } else {
00473                     // flush the buffer except the last
00474                     // (probably incomplete) line
00475                     size_t llen = blkpos - sl_off;
00476                     onBlock(block, sl_off, pe);
00477                     memmove(block, block + sl_off, llen);
00478                     sl_off = 0;
00479                     blkpos = llen;
00480                 }
00481             }
00482             c = *m_bit;
00483             if(isnl(c))
00484             {
00485                 char nlbuf[3] = { 0, 0, 0 };
00486 
00487                 nlbuf[0] = c; // save the current NL char in nlbuf
00488 
00489                 // save the second char of the NL sequence (if any) in nlbuf
00490                 if(++m_bit != m_eit) 
00491                 {
00492                     char next = *m_bit;
00493                     if(next == (c == CR ? LF : CR))
00494                     {
00495                         nlbuf[1] = next; // save the next char in the NL seq
00496                         ++m_bit;
00497                     }
00498                 }
00499 
00500                 if(pos)
00501                 {
00502                     // not an empty row, is this a boundary?
00503                     block[blkpos] = 0;
00504                     if(block[sl_off] == '-' &&
00505                         sl_off < blkpos &&
00506                          block[sl_off+1] == '-')
00507                     {
00508                         std::string Line(block+sl_off, blkpos-sl_off);
00509                         if(isBoundary(Line))
00510                         {
00511                             // trim last newline
00512                             int i = sl_off;
00513                             char a = block[--i];
00514                             char b = block[--i];
00515                             if(isnl(a,b))
00516                                 sl_off -= 2;
00517                             else if(isnl(a))
00518                                 sl_off--;
00519                             onBlock(block, sl_off,
00520                                 pe);
00521                             return;
00522                         }
00523                     }
00524                     // exit if this is the end of message 
00525                     // marker
00526                     if(eom && pos >= eomsz)
00527                     {
00528                         char *line = block + sl_off;
00529                         size_t i = 0;
00530                         for(; i < eomsz; i++)
00531                             if(eom[i] != line[i])
00532                                 break;
00533                         if(i==eomsz) // if eom found
00534                         {
00535                             onBlock(block, sl_off,
00536                                 pe);
00537                             return; 
00538                         }
00539                     }
00540                 }
00541                 // append the saved NL sequence
00542                 for(int i = 0; nlbuf[i] != 0; i++)
00543                     block[blkpos++] = nlbuf[i];
00544                 block[blkpos] = 0;
00545                 sl_off = blkpos;
00546                 pos = 0;
00547             } else {
00548                 pos++; // line pos
00549                 block[blkpos++] = c;
00550                 ++m_bit; 
00551             }
00552         }
00553         // eof
00554         block[blkpos] = 0;
00555         onBlock(block, blkpos, pe);
00556     }
00557 };
00558 
00559 
00560 /*
00561  * Forward Iterator
00562  */
00563 template<typename Iterator>
00564 struct IteratorParser<Iterator, std::forward_iterator_tag>: 
00565     public IteratorParser<Iterator, std::input_iterator_tag>
00566 {
00567     /* input_iterator ops
00568      * *it = xxx
00569      * X& op++
00570      * X& op++(int)
00571      */
00572     typedef IteratorParser<Iterator, std::input_iterator_tag> base_type;
00573     IteratorParser(MimeEntity& me)
00574     : base_type(me)
00575     {
00576     }
00577 };
00578 
00579 /*
00580  * Bidirectional Iterator
00581  */
00582 template<typename Iterator>
00583 struct IteratorParser<Iterator, std::bidirectional_iterator_tag>:
00584     public IteratorParser<Iterator, std::forward_iterator_tag>
00585 {
00586     typedef IteratorParser<Iterator, std::forward_iterator_tag> base_type;
00587     IteratorParser(MimeEntity& me)
00588     : base_type(me)
00589     {
00590     }
00591 };
00592 
00593 /*
00594  * Random Access Iterator
00595  */
00596 template<typename Iterator>
00597 struct IteratorParser<Iterator, std::random_access_iterator_tag>:
00598     public IteratorParser<Iterator, std::bidirectional_iterator_tag>
00599 {
00600     typedef IteratorParser<Iterator, std::bidirectional_iterator_tag> base_type;
00601     IteratorParser(MimeEntity& me)
00602     : base_type(me)
00603     {
00604     }
00605 private:
00606     using base_type::peIgnore;
00607     using base_type::pePreamble;
00608     using base_type::peBody;
00609     using base_type::peEpilogue;
00610     
00611     using base_type::NoBoundary;
00612     using base_type::Boundary;
00613     using base_type::ClosingBoundary;
00614     using base_type::HigherLevelBoundary;
00615     
00616     using base_type::m_boundaryList;
00617     using base_type::m_lastBoundary;
00618     using base_type::m_entityStack;
00619     using base_type::m_me;
00620     using base_type::m_iMask;
00621     using base_type::m_bit;
00622     using base_type::m_eit;
00623     using base_type::isnl;
00624     
00625     typedef TreeNode<char> BoundaryTree;
00626     inline void onBlock(Iterator bit, int size, ParsingElem pe)
00627     {
00628         if(pe == peIgnore)
00629             return;
00630         Iterator eit = bit + size;
00631         MimeEntity* pMe = m_entityStack.top();
00632         switch(pe)
00633         {
00634         case pePreamble:
00635             pMe->body().preamble().append(bit, eit);
00636             break;
00637         case peEpilogue:
00638             pMe->body().epilogue().append(bit, eit);
00639             break;
00640         case peBody:
00641             pMe->body().append(bit, eit);
00642             break;
00643         }
00644     }
00645     void copy_until_boundary(ParsingElem pe)
00646     {
00647         // if we don't have any boundary copy until m_eit and return
00648         if(m_boundaryList.empty())
00649         {
00650             onBlock(m_bit, m_eit-m_bit, pe);
00651             m_bit = m_eit;
00652             return;
00653         }
00654         // search for current boundary; if not found (i.e. malformed
00655         // message) repeat the search for higher level boundary
00656         // (slow just for malformed msg, very fast otherwise)
00657         typename base_type::BoundaryList::const_iterator 
00658             bBit = m_boundaryList.begin(), bEit = m_boundaryList.end();
00659         m_lastBoundary = NoBoundary;
00660         int depth = 0;
00661         for( ;bBit != bEit; ++bBit, ++depth)
00662         {
00663             const std::string& boundary = *bBit;
00664             Iterator off;
00665             if( (off=utils::find_bm(m_bit,m_eit,boundary)) != m_eit)
00666             {
00667                 Iterator base = m_bit;
00668                 size_t block_sz = off - base;
00669                 m_lastBoundary = 
00670                     (depth ? HigherLevelBoundary: Boundary);
00671                 off += boundary.length();
00672                 m_bit = off;
00673                 if(off<m_eit-1 && *off =='-' && *(off+1) == '-')
00674                 {
00675                     m_lastBoundary = ClosingBoundary;
00676                     m_bit = off + 2;
00677                 }
00678                 if(m_bit < m_eit && isnl(*m_bit)) 
00679                 {
00680                     char c = *m_bit++;
00681                     char next = *m_bit;
00682                     if(isnl(next) && next != c)
00683                         ++m_bit;
00684                 }
00685 
00686                 // trim last newline
00687                 if(block_sz)
00688                 {
00689                     Iterator p = base + block_sz;
00690                     char a = *--p, b = *--p;
00691                     if(isnl(a,b))
00692                         block_sz -= 2;
00693                     else if(isnl(a))
00694                         block_sz--;
00695                 }
00696                 onBlock(base, block_sz, pe);
00697                 return;
00698             } else {
00699                 onBlock(m_bit, m_eit-m_bit, pe);
00700                 m_bit = m_eit;
00701             }
00702         }
00703     }
00704     BoundaryTree m_boundaryTree;
00705     void buildBoundaryTree()
00706     {
00707         m_boundaryTree = BoundaryTree(); // clear
00708         typename base_type::BoundaryList::const_iterator 
00709             bit = m_boundaryList.begin(), eit = m_boundaryList.end();
00710         BoundaryTree::NodeList *pChilds;
00711         BoundaryTree::NodeList::iterator it;
00712         int depth = 0;
00713         for( ; bit != eit; ++bit)
00714         {
00715             pChilds = &m_boundaryTree.childList();
00716             it = pChilds->begin();
00717             const char *w = bit->c_str();
00718             do
00719             {
00720                 it = find_if(pChilds->begin(), pChilds->end(), 
00721                         FindNodePred<char>(*w));
00722                 if( it == pChilds->end() )
00723                     it = pChilds->insert(pChilds->end(),*w);
00724                 pChilds = &it->childList();
00725                 depth++;
00726             } while(*(++w));
00727         }
00728     }
00729 
00730 };
00731 
00732 }
00733 
00734 #endif