Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

WARCDocumentIterator.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2009 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // WARCDocumentIterator
00015 //
00016 // 03 Mar 2009 -- dmf
00017 //
00018 
00019 #ifndef INDRI_WARCDOCUMENTITERATOR_HPP
00020 #define INDRI_WARCDOCUMENTITERATOR_HPP
00021 #include <string>
00022 #include <fstream>
00023 #include "zlib.h"
00024 #include "indri/DocumentIterator.hpp"
00025 #include "indri/Buffer.hpp"
00026 #include "indri/UnparsedDocument.hpp"
00027 #include "indri/HashTable.hpp"
00028 
00029 namespace indri
00030 {
00031   namespace parse
00032   {
00033     class WARCRecord { // Holds the header fields and the body pointer of a single WARC record read from a gzFile.
00034       private:
00035       // Header fields. Each comment below names the WARC header the member corresponds to.
00036       // WARC-TYPE header value
00037       std::string warcType;
00038       // WARC-Record-ID header value
00039       std::string uuid;
00040       // WARC-TREC-ID header value (ClueWeb-specific)
00041       std::string trecID;
00042       // WARC-Target-URI header value
00043       std::string targetURI;
00044       // Content-Length header value. NOTE(review): not initialized by the inline constructor below.
00045       int contentLength;
00046       // Other metadata headers, keyed by header name (see getMetadata()).
00047       indri::utility::HashTable< std::string, std::string > metadata;
00048       // The header of the record (exposed through getHeader()).
00049       std::string header;
00050       // The body of the record. NOTE(review): not initialized by the inline constructor; storage owner is not visible here (presumably _buffer -- confirm).
00051       const char *content;
00052       bool _readLine( char*& beginLine, size_t& lineLength ); // Out-params receive the line start and its length; exact bool meaning is defined in the .cpp -- TODO confirm.
00053       bool readHeader(); // Presumably fills the header fields above -- confirm against the implementation.
00054       bool readContent(); // Presumably sets `content` -- confirm against the implementation.
00055       gzFile &_gzin; // Reference to the caller's zlib stream handle; the referenced gzFile must outlive this record.
00056       indri::utility::Buffer & _buffer; // Reference to the caller's buffer; the referenced Buffer must outlive this record.
00057       public:
00058       WARCRecord(gzFile &in, indri::utility::Buffer &buf) : _gzin(in), 
00059                                                             _buffer(buf) { } // Only binds the two references; strings/metadata are default-constructed.
00060       // Destructor is defined out of line; what it releases is not visible in this header.
00061       ~WARCRecord();
00062       // Accessors. The std::string getters return copies; none of the accessors is const-qualified.
00063       std::string getWarcType() { return warcType ; }
00064       std::string getUUID() { return uuid; }
00065       std::string getTrecID() { return trecID; }
00066       std::string getTargetURI() { return targetURI; }
00067       const char *getHeader() { return header.c_str(); } // Pointer into `header`; valid only while this record lives and `header` is unmodified.
00068       const char *getContent(){ return content; } // Returns the raw `content` pointer as-is (indeterminate if nothing has set it yet).
00069       // Looks up a metadata header by key; behaviour for a missing key is defined in the .cpp -- TODO confirm.
00070       std::string getMetadata(const char *key);
00071       // Reads one record; presumably via readHeader()/readContent() -- confirm. Return value semantics are defined in the .cpp.
00072       bool readRecord();
00073       // Header string constants (declared here, defined out of line).
00074       static const char * WARCTYPE;
00075       static const char * WARCRECORDID;
00076       static const char * CONTENTLENGTH;
00077       static const char * WARCTARGETURI;
00078       static const char * WARCTRECID;
00079     };
00080     // DocumentIterator implementation for WARC files: open() a file, call nextDocument() repeatedly, then close().
00081     class WARCDocumentIterator : public DocumentIterator {
00082     private:
00083       WARCRecord *_record; // Raw pointer to the current record helper; allocation/ownership happens in the .cpp -- TODO confirm who deletes it.
00084       UnparsedDocument _document; // Document object stored by value; presumably what nextDocument() returns a pointer to -- confirm.
00085       gzFile _gzin; // zlib stream handle for the input file.
00086       indri::utility::Buffer _buffer; // NOTE(review): presumably the buffer handed to WARCRecord (which keeps a Buffer&) -- confirm.
00087       indri::utility::Buffer _metaBuffer; // Second buffer; by its name used for metadata -- confirm against the implementation.
00088       std::string _warcUUID;
00089       const char * _warcMeta; // Non-owning pointer as far as this header shows; target storage is not visible here.
00090       const char * _dochdr; // Non-owning pointer as far as this header shows; target storage is not visible here.
00091       const char * _docnoString; // Non-owning pointer as far as this header shows; target storage is not visible here.
00092       char _docno[512]; // Fixed-size 512-char buffer; anything written here must fit, including any terminator.
00093       // Public interface; all member functions are defined out of line.
00094     public:
00095       WARCDocumentIterator();
00096       ~WARCDocumentIterator();
00097       void open( const std::string& filename ); // Takes the path of the file to iterate over.
00098       void close();
00099       UnparsedDocument* nextDocument(); // Returns a pointer to the next document; end-of-input convention is defined by DocumentIterator -- TODO confirm.
00100     };
00101   }
00102 }
00103 
00104 #endif // INDRI_WARCDOCUMENTITERATOR_HPP

Generated on Tue Jun 15 11:02:56 2010 for Lemur by doxygen 1.3.4