Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

TaggedDocumentIterator.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // TaggedDocumentIterator
00015 //
00016 // 14 May 2004 -- tds
00017 //
00018 
00019 #ifndef INDRI_TRECDOCUMENTITERATOR_HPP
00020 #define INDRI_TRECDOCUMENTITERATOR_HPP
00021 #include "zlib.h"
00022 #include "indri/DocumentIterator.hpp"
00023 #include "indri/Buffer.hpp"
00024 #include "indri/UnparsedDocument.hpp"
00025 #include <string>
00026 #include <fstream>
00027 namespace indri
00028 {
00029   namespace parse
00030   {
00031     
00032     class TaggedDocumentIterator : public DocumentIterator { // DocumentIterator subclass holding a zlib gzFile handle and three configurable tag strings (see setTags)
00033     private: // NOTE(review): all semantics below are declared here only; definitions live outside this header
00034       UnparsedDocument _document; // document object owned by the iterator; NOTE(review): presumably what nextDocument() returns a pointer to -- confirm in the .cpp
00035       gzFile _in; // zlib file handle (type comes from zlib.h)
00036       indri::utility::Buffer _buffer; // NOTE(review): presumably holds document text read from _in -- confirm
00037       indri::utility::Buffer _metaBuffer; // NOTE(review): presumably holds metadata text -- confirm
00038       std::string _lastMetadataTag; // NOTE(review): by name, the most recently seen metadata tag -- confirm
00039       std::string _fileName; // NOTE(review): presumably the name passed to open() -- confirm
00040       // Line-level helper: outputs are passed back through the two reference parameters; meaning of the bool result is not visible here (presumably false at end of input -- confirm).
00041       bool _readLine( char*& beginLine, size_t& lineLength );
00042       // Tag strings held as raw pointers. NOTE(review): whether setTags copies the strings or only stores the caller's pointers is not visible here -- confirm lifetime requirements.
00043       const char* _startDocTag;
00044       const char* _endDocTag;
00045       const char* _endMetadataTag;
00046       // Lengths paired by name with the three tag strings above; NOTE(review): presumably cached strlen values -- confirm.
00047       int _startDocTagLength;
00048       int _endDocTagLength;
00049       int _endMetadataTagLength;
00050 
00051     public:
00052       TaggedDocumentIterator(); // default constructor; NOTE(review): initial values of _in and the tag pointers are set (or not) in the .cpp -- confirm
00053       ~TaggedDocumentIterator(); // NOTE(review): presumably releases _in -- confirm
00054       // Supplies the three tag strings: start-of-document, end-of-document, end-of-metadata.
00055       void setTags( const char* startDoc, const char* endDoc, const char* endMetadata );
00056   
00057       void open( const std::string& filename ); // takes the path of the file to iterate over
00058       void close(); // counterpart to open()
00059       // Returns a pointer to an UnparsedDocument; NOTE(review): presumably null/0 when no documents remain, and presumably valid only until the next call -- confirm.
00060       UnparsedDocument* nextDocument();
00061     };
00062   }
00063 }
00064 
00065 #endif // INDRI_TRECDOCUMENTITERATOR_HPP

Generated on Tue Jun 15 11:02:56 2010 for Lemur by doxygen 1.3.4