Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

PDFDocumentExtractor.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // PDFDocumentExtractor
00015 //
00016 // 25 June 2004 -- tds
00017 //
00018 
00019 #ifndef INDRI_PDFDOCUMENTEXTRACTOR_HPP
00020 #define INDRI_PDFDOCUMENTEXTRACTOR_HPP
00021 
00022 #include "lemur-compat.hpp"
00023 #include "indri/Buffer.hpp"
00024 #include "indri/UnparsedDocument.hpp"
00025 #include "indri/DocumentIterator.hpp"
00026 #include "indri/XMLReader.hpp"
00027 #include "indri/XMLNode.hpp"
00028 #include "indri/XMLWriter.hpp"
00029 #include <string>
00030 namespace indri
00031 {
00032   namespace parse
00033   {
00034     
00035     class PDFDocumentExtractor : public DocumentIterator { // DocumentIterator subclass, presumably for PDF input (per the name); only declarations appear in this header
00036       indri::utility::Buffer _documentTextBuffer; // presumably holds the extracted document text -- TODO confirm against the definition
00037       UnparsedDocument _unparsedDocument; // presumably the object nextDocument() hands out a pointer to -- TODO confirm
00038       std::string _documentPath; // presumably the file name given to open() -- TODO confirm
00039   
00040     public:
00041       PDFDocumentExtractor(); // default constructor; defined outside this header
00042       ~PDFDocumentExtractor(); // not marked virtual here; whether the base class declares a virtual destructor is not visible in this header
00043 
00044       void open( const std::string& filename ); // takes the file name by const reference; presumably selects the PDF to read -- TODO confirm
00045       UnparsedDocument* nextDocument(); // returns a raw pointer; ownership and the end-of-input value are not visible here -- verify before storing or deleting it
00046           void appendPdfMetaData(indri::xml::XMLNode* node); // public helper taking an XML node; presumably adds PDF metadata to the document -- TODO confirm
00047           void seekValue(indri::xml::XMLNode* node, std::string &metaTag); // metaTag is a non-const reference, so the definition may write to it -- TODO confirm
00048       void close(); // presumably releases whatever open() acquired -- TODO confirm
00049         private: // the class already defaults to private; this label restores it after the public section
00050           std::string _title; // presumably the PDF title metadata -- TODO confirm
00051           std::string _author; // presumably the PDF author metadata -- TODO confirm
00052 
00053     };
00054   }
00055 }
00056 
00057 #endif // INDRI_PDFDOCUMENTEXTRACTOR_HPP

Generated on Tue Jun 15 11:02:55 2010 for Lemur by doxygen 1.3.4